From 832da018d205bf24f7d8233158160a43ee514fcc Mon Sep 17 00:00:00 2001
From: Kibum Kim <kb0929.kim@samsung.com>
Date: Sat, 7 Jan 2012 00:50:43 +0900
Subject: [PATCH] Git init

---
 AUTHORS                               |    0
 CODING_STYLE                          |  199 ++
 COPYING                               |   42 +
 ChangeLog                             |    0
 Makefile.am                           |  131 +
 Makefile.win32                        |   25 +
 Makefile.win32.common                 |   54 +
 NEWS                                  |    0
 README                                |   22 +
 RELEASING                             |   57 +
 TODO                                  |  271 ++
 autogen.sh                            |   12 +
 configure.ac                          |  895 +++++
 debian/README.source                  |    3 +
 debian/changelog                      |    7 +
 debian/compat                         |    1 +
 debian/control                        |   48 +
 debian/copyright                      |  114 +
 debian/libpixman-1-0-udeb.install     |    1 +
 debian/libpixman-1-0.install          |    1 +
 debian/libpixman-1-0.symbols          |  117 +
 debian/libpixman-1-dev.install        |    5 +
 debian/rules                          |  110 +
 debian/watch                          |    2 +
 demos/Makefile.am                     |   36 +
 demos/alpha-test.c                    |  119 +
 demos/clip-in.c                       |   50 +
 demos/clip-test.c                     |   97 +
 demos/composite-test.c                |  191 ++
 demos/convolution-test.c              |   47 +
 demos/gradient-test.c                 |   93 +
 demos/gtk-utils.c                     |  115 +
 demos/gtk-utils.h                     |   13 +
 demos/radial-test.c                   |  198 ++
 demos/screen-test.c                   |   44 +
 demos/trap-test.c                     |   49 +
 demos/tri-test.c                      |   48 +
 packaging/pixman.spec                 |   62 +
 pixman-1-uninstalled.pc.in            |    5 +
 pixman-1.pc.in                        |   11 +
 pixman/Makefile.am                    |  106 +
 pixman/Makefile.sources               |   55 +
 pixman/Makefile.win32                 |   66 +
 pixman/make-combine.pl                |   86 +
 pixman/pixman-access-accessors.c      |    3 +
 pixman/pixman-access.c                | 1226 +++++++
 pixman/pixman-accessor.h              |   40 +
 pixman/pixman-arm-common.h            |  416 +++
 pixman/pixman-arm-detect-win32.asm    |   21 +
 pixman/pixman-arm-neon-asm-bilinear.S | 1367 ++++++++
 pixman/pixman-arm-neon-asm.S          | 3636 ++++++++++++++++++++
 pixman/pixman-arm-neon-asm.h          | 1177 +++++++
 pixman/pixman-arm-neon.c              |  517 +++
 pixman/pixman-arm-simd-asm.S          |  439 +++
 pixman/pixman-arm-simd.c              |  432 +++
 pixman/pixman-bits-image.c            | 1511 ++++++++
 pixman/pixman-combine.c.template      | 2461 +++++++++++++
 pixman/pixman-combine.h.template      |  226 ++
 pixman/pixman-compiler.h              |  209 ++
 pixman/pixman-conical-gradient.c      |  211 ++
 pixman/pixman-cpu.c                   |  631 ++++
 pixman/pixman-edge-accessors.c        |    4 +
 pixman/pixman-edge-imp.h              |  182 +
 pixman/pixman-edge.c                  |  384 +++
 pixman/pixman-fast-path.c             | 2166 ++++++++++++
 pixman/pixman-general.c               |  264 ++
 pixman/pixman-gradient-walker.c       |  254 ++
 pixman/pixman-image.c                 |  837 +++++
 pixman/pixman-implementation.c        |  286 ++
 pixman/pixman-inlines.h               | 1280 +++++++
 pixman/pixman-linear-gradient.c       |  286 ++
 pixman/pixman-matrix.c                |  766 +++++
 pixman/pixman-mmx.c                   | 3237 ++++++++++++++++++
 pixman/pixman-noop.c                  |  137 +
 pixman/pixman-private.h               | 1001 ++++++
 pixman/pixman-radial-gradient.c       |  470 +++
 pixman/pixman-region.c                | 2810 +++++++++++++++
 pixman/pixman-region16.c              |   67 +
 pixman/pixman-region32.c              |   47 +
 pixman/pixman-solid-fill.c            |   89 +
 pixman/pixman-sse2.c                  | 6071 +++++++++++++++++++++++++++++++++
 pixman/pixman-timer.c                 |   66 +
 pixman/pixman-trap.c                  |  668 ++++
 pixman/pixman-utils.c                 |  356 ++
 pixman/pixman-version.h.in            |   50 +
 pixman/pixman-vmx.c                   | 1647 +++++++++
 pixman/pixman.c                       | 1140 +++++++
 pixman/pixman.h                       |  990 ++++++
 pixman/refactor                       |  478 +++
 pixman/solaris-hwcap.mapfile          |   30 +
 test/Makefile.am                      |   13 +
 test/Makefile.sources                 |   36 +
 test/Makefile.win32                   |   31 +
 test/a1-trap-test.c                   |   50 +
 test/affine-test.c                    |  311 ++
 test/alpha-loop.c                     |   29 +
 test/alphamap.c                       |  256 ++
 test/blitters-test.c                  |  430 +++
 test/composite-traps-test.c           |  257 ++
 test/composite.c                      |  920 +++++
 test/fetch-test.c                     |  209 ++
 test/fuzzer-find-diff.pl              |   68 +
 test/gradient-crash-test.c            |  158 +
 test/lowlevel-blt-bench.c             |  727 ++++
 test/oob-test.c                       |  101 +
 test/pdf-op-test.c                    |   83 +
 test/region-contains-test.c           |  170 +
 test/region-test.c                    |  123 +
 test/region-translate-test.c          |   30 +
 test/scaling-crash-test.c             |  217 ++
 test/scaling-helpers-test.c           |   91 +
 test/scaling-test.c                   |  368 ++
 test/stress-test.c                    |  872 +++++
 test/trap-crasher.c                   |   27 +
 test/utils.c                          |  704 ++++
 test/utils.h                          |  154 +
 116 files changed, 50629 insertions(+)
 create mode 100644 AUTHORS
 create mode 100644 CODING_STYLE
 create mode 100644 COPYING
 create mode 100644 ChangeLog
 create mode 100644 Makefile.am
 create mode 100644 Makefile.win32
 create mode 100644 Makefile.win32.common
 create mode 100644 NEWS
 create mode 100644 README
 create mode 100644 RELEASING
 create mode 100644 TODO
 create mode 100755 autogen.sh
 create mode 100755 configure.ac
 create mode 100755 debian/README.source
 create mode 100755 debian/changelog
 create mode 100755 debian/compat
 create mode 100755 debian/control
 create mode 100755 debian/copyright
 create mode 100755 debian/libpixman-1-0-udeb.install
 create mode 100755 debian/libpixman-1-0.install
 create mode 100755 debian/libpixman-1-0.symbols
 create mode 100755 debian/libpixman-1-dev.install
 create mode 100755 debian/rules
 create mode 100755 debian/watch
 create mode 100644 demos/Makefile.am
 create mode 100644 demos/alpha-test.c
 create mode 100644 demos/clip-in.c
 create mode 100644 demos/clip-test.c
 create mode 100644 demos/composite-test.c
 create mode 100644 demos/convolution-test.c
 create mode 100644 demos/gradient-test.c
 create mode 100644 demos/gtk-utils.c
 create mode 100644 demos/gtk-utils.h
 create mode 100644 demos/radial-test.c
 create mode 100644 demos/screen-test.c
 create mode 100644 demos/trap-test.c
 create mode 100644 demos/tri-test.c
 create mode 100644 packaging/pixman.spec
 create mode 100644 pixman-1-uninstalled.pc.in
 create mode 100644 pixman-1.pc.in
 create mode 100644 pixman/Makefile.am
 create mode 100644 pixman/Makefile.sources
 create mode 100644 pixman/Makefile.win32
 create mode 100644 pixman/make-combine.pl
 create mode 100644 pixman/pixman-access-accessors.c
 create mode 100644 pixman/pixman-access.c
 create mode 100644 pixman/pixman-accessor.h
 create mode 100644 pixman/pixman-arm-common.h
 create mode 100644 pixman/pixman-arm-detect-win32.asm
 create mode 100644 pixman/pixman-arm-neon-asm-bilinear.S
 create mode 100644 pixman/pixman-arm-neon-asm.S
 create mode 100644 pixman/pixman-arm-neon-asm.h
 create mode 100644 pixman/pixman-arm-neon.c
 create mode 100644 pixman/pixman-arm-simd-asm.S
 create mode 100644 pixman/pixman-arm-simd.c
 create mode 100644 pixman/pixman-bits-image.c
 create mode 100644 pixman/pixman-combine.c.template
 create mode 100644 pixman/pixman-combine.h.template
 create mode 100644 pixman/pixman-compiler.h
 create mode 100644 pixman/pixman-conical-gradient.c
 create mode 100644 pixman/pixman-cpu.c
 create mode 100644 pixman/pixman-edge-accessors.c
 create mode 100644 pixman/pixman-edge-imp.h
 create mode 100644 pixman/pixman-edge.c
 create mode 100644 pixman/pixman-fast-path.c
 create mode 100644 pixman/pixman-general.c
 create mode 100644 pixman/pixman-gradient-walker.c
 create mode 100644 pixman/pixman-image.c
 create mode 100644 pixman/pixman-implementation.c
 create mode 100644 pixman/pixman-inlines.h
 create mode 100644 pixman/pixman-linear-gradient.c
 create mode 100644 pixman/pixman-matrix.c
 create mode 100644 pixman/pixman-mmx.c
 create mode 100644 pixman/pixman-noop.c
 create mode 100644 pixman/pixman-private.h
 create mode 100644 pixman/pixman-radial-gradient.c
 create mode 100644 pixman/pixman-region.c
 create mode 100644 pixman/pixman-region16.c
 create mode 100644 pixman/pixman-region32.c
 create mode 100644 pixman/pixman-solid-fill.c
 create mode 100644 pixman/pixman-sse2.c
 create mode 100644 pixman/pixman-timer.c
 create mode 100644 pixman/pixman-trap.c
 create mode 100644 pixman/pixman-utils.c
 create mode 100644 pixman/pixman-version.h.in
 create mode 100644 pixman/pixman-vmx.c
 create mode 100644 pixman/pixman.c
 create mode 100644 pixman/pixman.h
 create mode 100644 pixman/refactor
 create mode 100644 pixman/solaris-hwcap.mapfile
 create mode 100755 test/Makefile.am
 create mode 100644 test/Makefile.sources
 create mode 100755 test/Makefile.win32
 create mode 100644 test/a1-trap-test.c
 create mode 100755 test/affine-test.c
 create mode 100644 test/alpha-loop.c
 create mode 100644 test/alphamap.c
 create mode 100755 test/blitters-test.c
 create mode 100755 test/composite-traps-test.c
 create mode 100755 test/composite.c
 create mode 100755 test/fetch-test.c
 create mode 100644 test/fuzzer-find-diff.pl
 create mode 100644 test/gradient-crash-test.c
 create mode 100644 test/lowlevel-blt-bench.c
 create mode 100644 test/oob-test.c
 create mode 100644 test/pdf-op-test.c
 create mode 100644 test/region-contains-test.c
 create mode 100644 test/region-test.c
 create mode 100644 test/region-translate-test.c
 create mode 100644 test/scaling-crash-test.c
 create mode 100755 test/scaling-helpers-test.c
 create mode 100755 test/scaling-test.c
 create mode 100755 test/stress-test.c
 create mode 100755 test/trap-crasher.c
 create mode 100755 test/utils.c
 create mode 100755 test/utils.h

diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..e69de29
diff --git a/CODING_STYLE b/CODING_STYLE
new file mode 100644
index 0000000..9f5171d
--- /dev/null
+++ b/CODING_STYLE
@@ -0,0 +1,199 @@
+Pixman coding style.
+====================
+
+The pixman coding style is close to cairo's with one exception: braces
+go on their own line, rather than on the line of the if/while/for:
+
+	if (condition)
+	{
+	    do_something();
+	    do_something_else();
+	}
+
+not
+
+	if (condition) {
+	    do_something();
+	    do_something_else();
+        }
+
+
+
+Indentation
+===========
+
+Each new level is indented four spaces:
+
+	if (condition)
+	    do_something();
+
+This may be achieved with space characters or with a combination of
+tab characters and space characters. Tab characters are interpreted as
+
+	Advance to the next column which is a multiple of 8.
+
+
+Names
+=====
+
+In all names, words are separated with underscores. Do not use
+CamelCase for any names.
+
+Macros have ALL_CAPITAL_NAMES
+
+Type names are in lower case and end with "_t". For example
+pixman_image_t.
+
+Labels, functions and variables have lower case names.
+
+
+Braces
+======
+
+Braces always go on their own line:
+
+	if (condition)
+	{
+	    do_this ();
+	    do_that ();
+	}
+	else
+	{
+	    do_the_other ();
+	}
+
+Rules for braces and substatements of if/while/for/do:
+
+* If a substatement spans multiple lines, then there must be braces
+  around it.
+
+* If the condition of an if/while/for spans multiple lines, then 
+  braces must be used for the substatements.
+
+* If one substatement of an if statement has braces, then the other
+  must too.
+
+* Otherwise, don't add braces.
+
+
+Comments
+========
+
+Format comments either like this:
+
+        /* One line comment */
+
+or like this:
+
+	/* This is a multi-line comment
+	 *
+         * It extends over multiple lines
+	 */
+
+Generally comments should say things that aren't clear from the code
+itself. If too many comments say obvious things, then people will just
+stop reading all comments, including the good ones.
+
+
+Whitespace
+==========
+
+* Put a single space after commas
+
+* Put spaces around arithmetic operators such as +, -, *, /:
+
+        y * stride + x
+
+        x / unit_x
+
+* Do not put spaces after the address-of operator, the * when used as
+  a pointer dereference or the ! and ~ operators:
+
+     &foo;
+
+     ~0x00000000
+
+     !condition
+
+     *result = 100
+
+* Break up long lines (> ~80 characters) and use whitespace to align
+  things nicely. This is one way:
+
+  	 some_very_long_function_name (
+	 	implementation, op, src, mask, dest, 
+		src_x, src_y, mask_x, mask_y, dest_x, dest_y,
+		width, height);
+
+  This is another:
+
+        some_very_long_function_name (implementation, op,
+                                      src, mask, dest,
+				      src_x, src_y,
+				      mask_x, mask_y,
+				      dest_x, dest_y,
+				      width, height);
+
+* Separate logically distinct chunks with a single newline. This
+  obviously applies between functions, but also applies within a
+  function or block or structure definition.
+
+* Use a newline after a block of variable declarations.
+
+* Use a single space before a left parenthesis, except where the
+  standard will not allow it (e.g. when defining a parameterized macro).
+
+* Don't eliminate newlines just because things would still fit on one
+  line. This breaks the expected visual structure of the code making
+  it much harder to read and understand:
+
+	if (condition) foo (); else bar ();	/* Yuck! */
+
+
+Function Definitions
+====================
+
+Function definitions should take the following form:
+
+	void
+	my_function (int argument)
+	{
+	    do_my_things ();
+	}
+
+If all the parameters to a function fit naturally on one line, format
+them that way. Otherwise, put one argument on each line, adding
+whitespace so that the parameter names are aligned with each other.
+
+I.e., do either this:
+
+        void
+        short_arguments (const char *str, int x, int y, int z)
+        {
+        }
+
+or this:
+
+	void
+	long_arguments (const char *char_star_arg,
+			int	    int_arg,
+			double	   *double_star_arg,
+			double	    double_arg)
+	{
+	}
+
+
+Mode lines
+==========
+
+Given the rules above, what is the best way to simplify one's life as
+a code monkey? Get your editor to do most of the tedious work of
+beautifying your code!
+
+As a reward for reading this far, here are some mode lines for the more
+popular editors:
+/*
+ * vim:sw=4:sts=4:ts=8:tw=78:fo=tcroq:cindent:cino=\:0,(0
+ * vim:isk=a-z,A-Z,48-57,_,.,-,>
+ */
+
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..6168dea
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,42 @@
+The following is the MIT license, agreed upon by most contributors.
+Copyright holders of new code should use this license statement where
+possible. They may also add themselves to the list below.
+
+/*
+ * Copyright 1987, 1988, 1989, 1998  The Open Group
+ * Copyright 1987, 1988, 1989 Digital Equipment Corporation
+ * Copyright 1999, 2004, 2008 Keith Packard
+ * Copyright 2000 SuSE, Inc.
+ * Copyright 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright 2004, 2005, 2007, 2008, 2009, 2010 Red Hat, Inc.
+ * Copyright 2004 Nicholas Miell
+ * Copyright 2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright 2005 Trolltech AS
+ * Copyright 2007 Luca Barbato
+ * Copyright 2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright 2008 Rodrigo Kumpera
+ * Copyright 2008 André Tupinambá
+ * Copyright 2008 Mozilla Corporation
+ * Copyright 2008 Frederic Plourde
+ * Copyright 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2009, 2010 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..e69de29
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..ff87e26
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,131 @@
+SUBDIRS = pixman demos test
+
+pkgconfigdir=$(libdir)/pkgconfig
+pkgconfig_DATA=pixman-1.pc
+
+$(pkgconfig_DATA): pixman-1.pc.in
+
+snapshot:
+	distdir="$(distdir)-`date '+%Y%m%d'`"; \
+	test -d "$(srcdir)/.git" && distdir=$$distdir-`cd "$(srcdir)" && git rev-parse HEAD | cut -c 1-6`; \
+	$(MAKE) $(AM_MAKEFLAGS) distdir="$$distdir" dist
+
+GPGKEY=6FF7C1A8
+USERNAME=$$USER
+RELEASE_OR_SNAPSHOT = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo release; else echo snapshot; fi)
+RELEASE_CAIRO_HOST =	$(USERNAME)@cairographics.org
+RELEASE_CAIRO_DIR =	/srv/cairo.freedesktop.org/www/$(RELEASE_OR_SNAPSHOT)s
+RELEASE_CAIRO_URL = 	http://cairographics.org/$(RELEASE_OR_SNAPSHOT)s
+RELEASE_XORG_URL =	http://xorg.freedesktop.org/archive/individual/lib
+RELEASE_XORG_HOST =	$(USERNAME)@xorg.freedesktop.org
+RELEASE_XORG_DIR =	/srv/xorg.freedesktop.org/archive/individual/lib
+RELEASE_ANNOUNCE_LIST = cairo-announce@cairographics.org, xorg-announce@lists.freedesktop.org, pixman@lists.freedesktop.org
+
+tar_gz = $(PACKAGE)-$(VERSION).tar.gz
+tar_bz2 = $(PACKAGE)-$(VERSION).tar.bz2
+
+sha1_tgz = $(tar_gz).sha1
+md5_tgz = $(tar_gz).md5
+
+sha1_tbz2 = $(tar_bz2).sha1
+md5_tbz2 = $(tar_bz2).md5
+
+gpg_file = $(sha1_tgz).asc
+
+$(sha1_tgz): $(tar_gz)
+	sha1sum $^ > $@
+
+$(md5_tgz): $(tar_gz)
+	md5sum $^ > $@
+
+$(sha1_tbz2): $(tar_bz2)
+	sha1sum $^ > $@
+
+$(md5_tbz2): $(tar_bz2)
+	md5sum $^ > $@
+
+$(gpg_file): $(sha1_tgz)
+	@echo "Please enter your GPG password to sign the checksum."
+	gpg --armor --sign $^ 
+
+HASHFILES = $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(md5_tbz2)
+
+release-verify-newer:
+	@echo -n "Checking that no $(VERSION) release already exists at $(RELEASE_XORG_HOST)..."
+	@ssh $(RELEASE_XORG_HOST) test ! -e $(RELEASE_XORG_DIR)/$(tar_gz) \
+		|| (echo "Ouch." && echo "Found: $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)/$(tar_gz)" \
+		&& echo "Refusing to try to generate a new release of the same name." \
+		&& false)
+	@ssh $(RELEASE_CAIRO_HOST) test ! -e $(RELEASE_CAIRO_DIR)/$(tar_gz) \
+		|| (echo "Ouch." && echo "Found: $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)/$(tar_gz)" \
+		&& echo "Refusing to try to generate a new release of the same name." \
+		&& false)
+	@echo "Good."
+
+release-remove-old:
+	$(RM) $(tar_gz) $(tar_bz2) $(HASHFILES) $(gpg_file)
+
+ensure-prev:
+	@if test -z "$(PREV)"; then							\
+		echo ""							          &&	\
+		echo "You must set the PREV variable on the make command line to" &&	\
+		echo "the last version."				  	  &&	\
+		echo ""								  &&	\
+		echo "For example:"						  &&	\
+		echo "      make PREV=0.7.3"				  	  &&	\
+		echo ""								  &&	\
+		false;									\
+	fi
+
+release-check: ensure-prev release-verify-newer release-remove-old distcheck
+
+release-tag:
+	git tag -u $(GPGKEY) -m "$(PACKAGE) $(VERSION) release" $(PACKAGE)-$(VERSION)
+
+release-upload: release-check $(tar_gz) $(tar_bz2) $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(gpg_file)
+	scp $(tar_gz) $(sha1_tgz) $(gpg_file) $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)
+	scp $(tar_gz) $(tar_bz2) $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)
+	ssh $(RELEASE_CAIRO_HOST) "rm -f $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-[0-9]* && ln -s $(tar_gz) $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-$(VERSION)"
+
+release-publish-message: $(HASHFILES) ensure-prev
+	@echo "Please follow the instructions in RELEASING to push stuff out and"
+	@echo "send out the announcement mails.  Here is the excerpt you need:"
+	@echo ""
+	@echo "Lists:  $(RELEASE_ANNOUNCE_LIST)"
+	@echo "Subject: [ANNOUNCE] $(PACKAGE) release $(VERSION) now available"
+	@echo "============================== CUT HERE =============================="
+	@echo "A new $(PACKAGE) release $(VERSION) is now available"
+	@echo ""
+	@echo "tar.gz:"
+	@echo "	$(RELEASE_CAIRO_URL)/$(tar_gz)"
+	@echo "	$(RELEASE_XORG_URL)/$(tar_gz)"
+	@echo ""
+	@echo "tar.bz2:"
+	@echo "	$(RELEASE_XORG_URL)/$(tar_bz2)"
+	@echo ""
+	@echo "Hashes:"
+	@echo -n "	MD5:  "
+	@cat $(md5_tgz)
+	@echo -n "	MD5:  "
+	@cat $(md5_tbz2)
+	@echo -n "	SHA1: "
+	@cat $(sha1_tgz)
+	@echo -n "	SHA1: "
+	@cat $(sha1_tbz2)
+	@echo ""
+	@echo "GPG signature:"
+	@echo "	$(RELEASE_CAIRO_URL)/$(gpg_file)"
+	@echo "	(signed by `git config --get user.name` <`git config --get user.email`>)"
+	@echo ""
+	@echo "Git:"
+	@echo "	git://git.freedesktop.org/git/pixman"
+	@echo "	tag: $(PACKAGE)-$(VERSION)"
+	@echo ""
+	@echo "Log:"
+	@git log --no-merges "$(PACKAGE)-$(PREV)".."$(PACKAGE)-$(VERSION)" | git shortlog | awk '{ printf "\t"; print ; }' | cut -b1-80
+	@echo "============================== CUT HERE =============================="
+	@echo ""
+
+release-publish: release-upload release-tag release-publish-message
+
+.PHONY: snapshot ensure-prev release-check release-verify-newer release-remove-old release-upload release-publish release-publish-message release-tag
diff --git a/Makefile.win32 b/Makefile.win32
new file mode 100644
index 0000000..91cd12a
--- /dev/null
+++ b/Makefile.win32
@@ -0,0 +1,25 @@
+default: all
+
+top_srcdir = .
+include $(top_srcdir)/Makefile.win32.common
+
+# Recursive targets
+pixman_r:
+	@$(MAKE) -C pixman -f Makefile.win32
+
+test_r:
+	@$(MAKE) -C test -f Makefile.win32
+
+clean_r:
+	@$(MAKE) -C pixman -f Makefile.win32 clean
+	@$(MAKE) -C test   -f Makefile.win32 clean
+
+check_r:
+	@$(MAKE) -C test -f Makefile.win32 check
+
+# Base targets
+all: test_r
+
+clean: clean_r
+
+check: check_r
diff --git a/Makefile.win32.common b/Makefile.win32.common
new file mode 100644
index 0000000..56c3593
--- /dev/null
+++ b/Makefile.win32.common
@@ -0,0 +1,54 @@
+LIBRARY = pixman-1
+
+CC = cl
+LD = link
+AR = lib
+PERL = perl
+
+ifeq ($(top_builddir),)
+top_builddir = $(top_srcdir)
+endif
+
+CFG_VAR = $(CFG)
+ifeq ($(CFG_VAR),)
+CFG_VAR = release
+endif
+
+ifeq ($(CFG_VAR),debug)
+CFG_CFLAGS  = -MDd -Od -Zi
+CFG_LDFLAGS = -DEBUG
+else
+CFG_CFLAGS  = -MD -O2
+CFG_LDFLAGS =
+endif
+
+# Package definitions, to be used instead of those provided in config.h
+PKG_CFLAGS  = -DPACKAGE=$(LIBRARY) -DPACKAGE_VERSION="" -DPACKAGE_BUGREPORT=""
+
+BASE_CFLAGS = -nologo -I. -I$(top_srcdir) -I$(top_srcdir)/pixman
+
+PIXMAN_CFLAGS  = $(BASE_CFLAGS) $(PKG_CFLAGS) $(CFG_CFLAGS) $(CFLAGS)
+PIXMAN_LDFLAGS = -nologo $(CFG_LDFLAGS) $(LDFLAGS)
+PIXMAN_ARFLAGS = -nologo $(LDFLAGS)
+
+
+inform:
+ifneq ($(CFG),release)
+ifneq ($(CFG),debug)
+ifneq ($(CFG),)
+	@echo "Invalid specified configuration option: "$(CFG)"."
+	@echo
+	@echo "Possible choices for configuration are 'release' and 'debug'"
+	@exit 1
+endif
+	@echo "Using default RELEASE configuration... (use CFG=release or CFG=debug)"
+endif
+endif
+
+
+$(CFG_VAR)/%.obj: %.c $(BUILT_SOURCES)
+	@mkdir -p $(CFG_VAR)
+	@$(CC) -c $(PIXMAN_CFLAGS) -Fo"$@" $<
+
+clean: inform
+	@$(RM) $(CFG_VAR)/*.{exe,ilk,lib,obj,pdb} $(BUILT_SOURCES) || exit 0
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..e69de29
diff --git a/README b/README
new file mode 100644
index 0000000..3cfbc50
--- /dev/null
+++ b/README
@@ -0,0 +1,22 @@
+pixman is a library that provides low-level pixel manipulation
+features such as image compositing and trapezoid rasterization.
+
+All questions regarding this software should be directed to the pixman
+mailing list:
+
+        http://lists.freedesktop.org/mailman/listinfo/pixman
+
+Please send patches and bug reports either to the mailing list above,
+or file them at the freedesktop bug tracker:
+
+        https://bugs.freedesktop.org/enter_bug.cgi?product=pixman
+
+The master development code repository can be found at:
+
+	git://anongit.freedesktop.org/git/pixman
+
+	http://gitweb.freedesktop.org/?p=pixman;a=summary
+
+For more information on the git code manager, see:
+
+	http://wiki.x.org/wiki/GitPage
diff --git a/RELEASING b/RELEASING
new file mode 100644
index 0000000..fbe1581
--- /dev/null
+++ b/RELEASING
@@ -0,0 +1,57 @@
+Here are the steps to follow to create a new pixman release:
+
+1) Ensure that there are no uncommitted changes or unpushed commits,
+   and that you are up to date with the latest commits in the central
+   repository. Here are a couple of useful commands:
+
+	git diff			(no output)
+	
+	git status			(should report "nothing to commit")
+
+	git log master...origin		(no output; note: *3* dots)
+
+2) Increment pixman_(major|minor|micro) in configure.ac according to
+   the directions in that file.
+
+3) Make sure that new version works, including
+
+	- make distcheck passes
+
+	- the X server still works with the new pixman version
+	  installed
+
+	- the cairo test suite hasn't gained any new failures compared
+	  to last pixman version.
+
+4) Use "git commit" to record the changes made in steps 2 and 3.
+
+5) Generate and publish the tar files by running 
+
+	make PREV=<last version> GPGKEY=<your gpg key id> release-publish
+
+   If your freedesktop user name is different from your local one,
+   then also set the variable USER to your freedesktop user name.
+
+6) Run 
+
+	make release-publish-message
+
+   to generate a draft release announcement. Edit it as appropriate and
+   send it to 
+
+	cairo-announce@cairographics.org
+
+	pixman@lists.freedesktop.org
+
+	xorg-announce@lists.freedesktop.org
+
+7) Increment pixman_micro to the next larger (odd) number in
+   configure.ac. Commit this change, and push all commits created
+   during this process using
+
+	git push
+	git push --tags
+
+   You must use "--tags" here; otherwise the new tag will not
+   be pushed out.
+
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..4434ec7
--- /dev/null
+++ b/TODO
@@ -0,0 +1,271 @@
+  - Testing
+    - Test implementations against each other
+    - Test both with and without the operator strength reduction.
+      They should be identical.
+
+  - SSE 2 issues:
+
+      - Use MM_HINT_NTA instead of MM_HINT_T0
+
+      - Use of fbCompositeOver_x888x8x8888sse2()
+
+  - Update the RELEASING file
+
+  - Things to keep in mind if breaking ABI:
+
+      - There should be a guard #ifndef I_AM_EITHER_CAIRO_OR_THE_X_SERVER
+
+      - X server will require 16.16 essentially forever. Can we get
+        the required precision by simply adding offset_x/y to the
+        relevant rendering API?
+
+      - Get rid of workaround for X server bug.
+
+      - pixman_image_set_indexed() should copy its argument, and X
+        should be ported over to use a pixman_image as the
+        representation of a Picture, rather than creating one on each
+        operation.
+
+      - We should get rid of pixman_set_static_pointers()
+
+      - We should get rid of the various trapezoid helper functions().
+        (They only exist because they are theoretically available to
+        drivers).
+
+      - 16 bit regions should be deleted
+
+      - There should only be one trap rasterization API.
+
+      - The PIXMAN_g8/c8/etc formats should use the A channel
+        to indicate the actual depth. That way PIXMAN_x4c4 and PIXMAN_c8
+	won't collide.
+
+  - Maybe bite the bullet and make configure.ac generate a pixman-types.h
+    file that can be included from pixman.h to avoid the #ifdef magic
+    in pixman.h
+
+  - Make pixman_region_point_in() survive a NULL box, then fix up
+    pixman-compose.c
+
+      - Possibly look into inlining the fetch functions
+
+  - There is a bug with source clipping demonstrated by clip-test in the
+    test directory. If we interpret source clipping as given in
+    destination coordinates, which is probably the only sane choice,
+    then the result should have two red bars down the sides.
+    
+  - Test suite
+
+  - Add a general way of dealing with architecture specific
+    fast-paths.  The current idea is to have each operation that can
+    be optimized called through a function pointer that is
+    initially set to an initialization function that is responsible for
+    setting the function pointer to the appropriate fast-path.
+
+  - Go through things marked FIXME
+
+  - Add calls to prepare and finish access where necessary.  grep for
+    ACCESS_MEM, and make sure they are correctly wrapped in prepare
+    and finish.
+
+  - restore READ/WRITE in the fbcompose combiners since they sometimes
+    store directly to destination drawables.
+
+  - It probably makes sense to move the more strange X region API
+    into pixman as well, but guarded with PIXMAN_XORG_COMPATIBILITY
+
+  - Reinstate the FbBits typedef? At the moment we don't
+    even have the FbBits type; we just use uint32_t everywhere.
+
+    Keith says in bug 2335:
+
+        The 64-bit code in fb (pixman) is probably broken; it hasn't been
+        used in quite some time as PCI (and AGP) is 32-bits wide, so
+        doing things 64-bits at a time is a net loss.  To quickly fix
+        this, I suggest just using 32-bit datatypes by setting
+        IC_SHIFT to 5 for all machines.
+
+  - Consider optimizing the 8/16 bit solid fills in pixman-util.c by
+    storing more than one value at a time.
+
+  - Add an image cache to prevent excessive malloc/free. Note that pixman
+    needs to be thread safe when used from cairo.
+
+  - Moving to 24.8 coordinates. This is tricky because X is still
+    defined as 16.16 and will be basically forever. It's possible we
+    could do this by adding extra offset_x/y parameters to the
+    trapezoid calls. The X server could then just call the API with
+    (0, 0). Cairo would have to make sure that the delta *within* a
+    batch of trapezoids does not exceed 16 bit.
+
+  - Consider adding actual backends. Brain dump:
+
+    A backend is something that knows how to
+
+      - Create images
+      - Composite three images
+      - Rasterize trapezoids
+      - Do solid fills and blits
+
+    These operations are provided by a vtable that the backend will
+    create when it is initialized. Initial backends:
+
+      - VMX
+      - SSE2
+      - MMX
+      - Plain Old C
+
+    When the SIMD backends are initialized, they will be passed a
+    pointer to the Plain Old C backend that they can use for fallback
+    purposes.
+
+    Images would gain a vtable as well that would contain things like
+
+      - Read scanline
+      - Write scanline
+
+    (Or even read_patch/write_patch as suggested by Keith a while
+    back).
+
+    This could simplify the compositing code considerably.
+
+  - Review the pixman_format_code_t enum to make sure it will support
+    future formats. Some formats we will probably need:
+
+    	   ARGB/ABGR with 16/32/64 bit integer/floating channels
+	   YUV2,
+	   YV12
+
+    Also we may need the ability to distinguish between PICT_c8 and
+    PICT_x4c4. (This could be done by interpreting the A channel as
+    the depth for TYPE_COLOR and TYPE_GRAY formats).
+
+    A possibility may be to reserve the two top bits and make them
+    encode "number of places to shift the channel widths given" Since
+    these bits are 00 at the moment everything will continue to work,
+    but these additional widths will be allowed:
+
+    	     All even widths between 18-32
+	     All multiples of four widths between 33 and 64
+	     All multiples of eight between 64 and 128
+
+    This means things like r21g22b21 won't work - is that worth
+    worrying about? I don't think so. And of course the bpp field
+    can't handle a depth of over 256, so > 64 bit channels aren't
+    really all that useful.
+
+    We could reserve one extra bit to indicate floating point, but
+    we may also just add 
+
+       	   PIXMAN_TYPE_ARGB_FLOAT
+	   PIXMAN_TYPE_BGRA_FLOAT
+	   PIXMAN_TYPE_A_FLOAT
+    
+    image types. With five bits we can support up to 32 different
+    format types, which should be enough for everybody, even if we
+    decide to support all the various video formats here:
+
+    	        http://www.fourcc.org/yuv.php
+
+    It may make sense to have a PIXMAN_TYPE_YUV, and then use the
+    channel bits to specify the exact subtype.
+
+    Another possibility is to add 
+
+      	  PIXMAN_TYPE_ARGB_W
+	  PIXMAN_TYPE_ARGB_WW
+    
+    where the channel widths would get 16 and 32 added to them,
+    respectively.
+
+    What about color spaces such as linear vs. sRGB etc.?
+
+
+done:
+
+- Use pixmanFillsse2 and pixmanBltsse2
+
+- Be consistent about calling sse2 sse2
+
+- Rename "SSE" to "MMX_EXTENSIONS". (Deleted mmx extensions).
+
+- Commented-out uses of fbCompositeCopyAreasse2()
+
+- Consider whether calling regions region16 is really such a great
+  idea. Vlad wants 32 bit regions for Cairo. This will break X server
+  ABI, but should otherwise be mostly harmless, though a
+  pixman_region_get_boxes16() may be useful.
+
+- Altivec signal issue (Company has fix, there is also a patch by
+  dwmw2 in rawhide).
+
+- Behdad's MMX issue - see list
+
+- SSE2 issues:
+    - Crashes in Mozilla because of unaligned stack. Possible fixes
+        - Make use of gcc 4.2 feature to align the stack
+        - Write some sort of trampoline that aligns the stack
+          before calling SSE functions.
+
+- Get rid of the switch-of-doom; replace it with a big table
+  describing the various fast paths.
+
+- Make source clipping optional.
+    - done: source clipping happens through an indirection.
+        still needs to make the indirection settable. (And call it
+        from X)
+
+- Run cairo test suite; fix bugs
+	- one bug in source-scale-clip
+
+ - Remove the warning suppression in the ACCESS_MEM macro and fix the
+    warnings that are real
+	- irrelevant now.
+
+- make the wrapper functions global instead of image specific
+	- this won't work since pixman is linked to both fb and wfb
+
+- Add non-mmx solid fill
+
+- Make sure the endian-ness macros are defined correctly.
+
+- The rectangles in a region probably shouldn't be returned const as
+  the X server will be changing them.
+
+- Right now we _always_ have a clip region, which is empty by default.
+  Why does this work at all? It probably doesn't. The server
+  distinguishes two cases, one where nothing is clipped (CT_NONE), and
+  one where there is a clip region (CT_REGION).
+
+- Default clip region should be the full image
+
+  - Test if pseudo color still works. It does, but it also shows that
+    copying a pixman_indexed_t on every composite operation is not
+    going to fly. So, for now set_indexed() does not copy the 
+    indexed table. 
+
+    Also just the malloc() to allocate a pixman image shows up pretty
+    high.
+
+    Options include
+
+      - Make all the setters not copy their arguments
+
+      - Possibly combined with going back to the stack allocated 
+        approach that we already use for regions.
+
+      - Keep a cached pixman_image_t around for every picture. It would
+        have to be kept up to date every time something changes about the
+        picture.
+
+      - Break the X server ABI and simply have the relevant parameter
+        stored in the pixman image. This would have the additional benefits
+        that:
+
+          - We can get rid of the annoying repeat field which is duplicated
+            elsewhere.
+
+          - We can use pixman_color_t and pixman_gradient_stop_t
+            etc. instead of the types that are defined in
+            renderproto.h
+
diff --git a/autogen.sh b/autogen.sh
new file mode 100755
index 0000000..354f254
--- /dev/null
+++ b/autogen.sh
@@ -0,0 +1,12 @@
+#! /bin/sh
+
+srcdir=`dirname $0`
+test -z "$srcdir" && srcdir=.
+
+ORIGDIR=`pwd`
+cd $srcdir
+
+autoreconf -v --install || exit 1
+cd $ORIGDIR || exit $?
+
+$srcdir/configure "$@"
diff --git a/configure.ac b/configure.ac
new file mode 100755
index 0000000..6c88c84
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,895 @@
+dnl  Copyright 2005 Red Hat, Inc.
+dnl 
+dnl  Permission to use, copy, modify, distribute, and sell this software and its
+dnl  documentation for any purpose is hereby granted without fee, provided that
+dnl  the above copyright notice appear in all copies and that both that
+dnl  copyright notice and this permission notice appear in supporting
+dnl  documentation, and that the name of Red Hat not be used in
+dnl  advertising or publicity pertaining to distribution of the software without
+dnl  specific, written prior permission.  Red Hat makes no
+dnl  representations about the suitability of this software for any purpose.  It
+dnl  is provided "as is" without express or implied warranty.
+dnl 
+dnl  RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+dnl  INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+dnl  EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+dnl  CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+dnl  DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+dnl  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+dnl  PERFORMANCE OF THIS SOFTWARE.
+dnl
+dnl Process this file with autoconf to create configure.
+
+AC_PREREQ([2.57])
+
+#   Pixman versioning scheme
+#
+#   - The version in git has an odd MICRO version number
+#
+#   - Released versions, both development and stable, have an
+#     even MICRO version number
+#
+#   - Released development versions have an odd MINOR number
+#
+#   - Released stable versions have an even MINOR number
+#
+#   - Versions that break ABI must have a new MAJOR number
+#
+#   - If you break the ABI, then at least this must be done:
+#
+#        - increment MAJOR
+#
+#        - In the first development release where you break ABI, find
+#          all instances of "pixman-n" and change them to pixman-(n+1)
+#
+#          This needs to be done at least in 
+#                    configure.ac
+#                    all Makefile.am's
+#                    pixman-n.pc.in
+#
+#      This ensures that binary incompatible versions can be installed
+#      in parallel.  See http://www106.pair.com/rhp/parallel.html for
+#      more information
+#
+
+m4_define([pixman_major], 0)
+m4_define([pixman_minor], 23)
+m4_define([pixman_micro], 7)
+
+m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
+
+AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
+AM_INIT_AUTOMAKE([foreign dist-bzip2])
+
+# Suppress verbose compile lines
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+AM_CONFIG_HEADER(config.h)
+
+AC_CANONICAL_HOST
+
+test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
+
+AC_PROG_CC
+AM_PROG_AS
+AC_PROG_LIBTOOL
+AC_CHECK_FUNCS([getisax])
+AC_C_BIGENDIAN
+AC_C_INLINE
+
+dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action)
+dnl
+dnl Clears CFLAGS/LDFLAGS/LIBS, runs env-setup, then links the given program. Runs true-action on a
+dnl clean link, false-action on failure or any output in conftest.err; the saved flags are then restored.
+AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl
+	save_CFLAGS="$CFLAGS"
+	save_LDFLAGS="$LDFLAGS"
+	save_LIBS="$LIBS"
+	CFLAGS=""
+	LDFLAGS=""
+	LIBS=""
+	$1
+	AC_LINK_IFELSE(
+		[AC_LANG_SOURCE([$2])],
+		[pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+		 pixman_cc_flag=yes],
+		[pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+		 pixman_cc_flag=no])
+
+	if test "x$pixman_cc_stderr" != "x"; then
+		pixman_cc_flag=no
+	fi
+
+	if test "x$pixman_cc_flag" = "xyes"; then
+		ifelse([$3], , :, [$3])
+	else
+		ifelse([$4], , :, [$4])
+	fi
+	CFLAGS="$save_CFLAGS"
+	LDFLAGS="$save_LDFLAGS"
+	LIBS="$save_LIBS"
+])
+
+dnl Find a -Werror for catching warnings.
+WERROR=
+for w in -Werror -errwarn; do
+    if test "z$WERROR" = "z"; then
+        AC_MSG_CHECKING([whether the compiler supports $w])
+        PIXMAN_LINK_WITH_ENV(
+		[CFLAGS=$w],
+		[int main(int c, char **v) { (void)c; (void)v; return 0; }],
+		[WERROR=$w; yesno=yes], [yesno=no])
+	AC_MSG_RESULT($yesno)
+    fi
+done
+
+dnl PIXMAN_CHECK_CFLAG(flag, [program])
+dnl  Adds flag to CFLAGS if the given program links without warnings or errors.
+AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl
+	AC_MSG_CHECKING([whether the compiler supports $1])
+	PIXMAN_LINK_WITH_ENV(
+		[CFLAGS="$WERROR $1"],
+		[$2
+		 int main(int c, char **v) { (void)c; (void)v; return 0; }
+		],
+		[_yesno=yes],
+		[_yesno=no])
+	if test "x$_yesno" = xyes; then
+	   CFLAGS="$CFLAGS $1"
+	fi
+	AC_MSG_RESULT($_yesno)
+])
+
+AC_CHECK_SIZEOF(long)
+
+# Checks for Sun Studio compilers
+AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
+AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
+
+# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
+# if we're using Sun Studio and neither the user nor a config.site
+# has set CFLAGS.  (Use "=" here: POSIX test does not define "==".)
+if test "$SUNCC" = yes &&		\
+   test "$test_CFLAGS" = "" &&		\
+   test "$CFLAGS" = "-g"
+then
+  CFLAGS="-O -g"
+fi
+
+# 
+# We ignore pixman_major in the version here because the major version should
+# always be encoded in the actual library name. Ie., the soname is:
+#
+#      pixman-$(pixman_major).0.minor.micro
+#
+m4_define([lt_current], [pixman_minor])
+m4_define([lt_revision], [pixman_micro])
+m4_define([lt_age], [pixman_minor])
+
+LT_VERSION_INFO="lt_current:lt_revision:lt_age"
+
+PIXMAN_VERSION_MAJOR=pixman_major()
+AC_SUBST(PIXMAN_VERSION_MAJOR)
+PIXMAN_VERSION_MINOR=pixman_minor()
+AC_SUBST(PIXMAN_VERSION_MINOR)
+PIXMAN_VERSION_MICRO=pixman_micro()
+AC_SUBST(PIXMAN_VERSION_MICRO)
+
+AC_SUBST(LT_VERSION_INFO)
+
+# Check for dependencies
+
+PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
+
+AC_PATH_PROG(PERL, perl, no)
+if test "x$PERL" = xno; then
+    AC_MSG_ERROR([Perl is required to build pixman.])
+fi
+AC_SUBST(PERL)
+
+dnl =========================================================================
+dnl OpenMP for the test suite?
+dnl
+
+# Check for OpenMP support only when autoconf supports it (requires autoconf >= 2.62)
+OPENMP_CFLAGS=
+m4_ifdef([AC_OPENMP], [AC_OPENMP])
+
+if test "x$enable_openmp" = "xyes" && test "x$ac_cv_prog_c_openmp" = "xunsupported" ; then
+  AC_MSG_WARN([OpenMP support requested but found unsupported])
+fi
+
+dnl May not fail to link without -Wall -Werror added
+dnl So try to link only when openmp is supported
+dnl ac_cv_prog_c_openmp is not defined when --disable-openmp is used
+if test "x$ac_cv_prog_c_openmp" != "xunsupported" && test "x$ac_cv_prog_c_openmp" != "x"; then
+  m4_define([openmp_test_program],[dnl
+  #include <stdio.h>
+
+  extern unsigned int lcg_seed;
+  #pragma omp threadprivate(lcg_seed)
+  unsigned int lcg_seed;
+
+  unsigned function(unsigned a, unsigned b)
+  {
+	lcg_seed ^= b;
+	return ((a + b) ^ a ) + lcg_seed;
+  }
+
+  int main(int argc, char **argv)
+  {
+	int i;
+	int n1 = 0, n2 = argc;
+	unsigned checksum = 0;
+	int verbose = argv != NULL;
+	unsigned (*test_function)(unsigned, unsigned);
+	test_function = function;
+	#pragma omp parallel for reduction(+:checksum) default(none) \
+					shared(n1, n2, test_function, verbose)
+	for (i = n1; i < n2; i++)
+	{
+		unsigned crc = test_function (i, 0);
+		if (verbose)
+			printf ("%d: %08X\n", i, crc);
+		checksum += crc;
+	}
+	printf("%u\n", checksum);
+	return 0;
+  }
+  ])
+
+  PIXMAN_LINK_WITH_ENV(
+	[CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
+	[openmp_test_program],
+	[have_openmp=yes],
+	[have_openmp=no])
+  if test "x$have_openmp" = "xyes" ; then
+    AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
+  fi
+fi
+AC_SUBST(OPENMP_CFLAGS)
+
+dnl =========================================================================
+dnl -fvisibility stuff
+
+PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#ifdef _WIN32
+#error Have -fvisibility but it is ignored and generates a warning
+#endif
+#else
+error Need GCC 4.0 for visibility
+#endif
+])
+
+PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
+#else
+error Need Sun Studio 8 for visibility
+#endif
+])
+
+dnl ===========================================================================
+dnl Check for MMX
+
+if test "x$MMX_CFLAGS" = "x" ; then
+   if test "x$SUNCC" = "xyes"; then
+      # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse
+      # but if we're building 64-bit, mmx & sse support is on by default and
+      # -xarch=sse throws an error instead
+      if test "$AMD64_ABI" = "no" ; then
+         MMX_CFLAGS="-xarch=sse"
+      fi
+   else
+      MMX_CFLAGS="-mmmx -Winline"
+   fi
+fi
+
+have_mmx_intrinsics=no
+AC_MSG_CHECKING(whether to use MMX intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$MMX_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+error "Need GCC >= 3.4 for MMX intrinsics"
+#endif
+#include <mmintrin.h>
+int main () {
+    __m64 v = _mm_cvtsi32_si64 (1);
+    return _mm_cvtsi64_si32 (v);
+}]])], have_mmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(mmx,
+   [AC_HELP_STRING([--disable-mmx],
+                   [disable x86 MMX fast paths])],
+   [enable_mmx=$enableval], [enable_mmx=auto])
+
+if test $enable_mmx = no ; then
+   have_mmx_intrinsics=disabled
+fi
+
+if test $have_mmx_intrinsics = yes ; then
+   AC_DEFINE(USE_X86_MMX, 1, [use x86 MMX compiler intrinsics])
+else
+   MMX_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_mmx_intrinsics)
+if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then
+   AC_MSG_ERROR([x86 MMX intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_X86_MMX, test $have_mmx_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Check for SSE2
+
+if test "x$SSE2_CFLAGS" = "x" ; then
+   if test "x$SUNCC" = "xyes"; then
+      # SSE2 is enabled by default in the Sun Studio 64-bit environment
+      if test "$AMD64_ABI" = "no" ; then
+         SSE2_CFLAGS="-xarch=sse2"
+      fi
+   else
+      SSE2_CFLAGS="-msse2 -Winline"
+   fi
+fi
+
+have_sse2_intrinsics=no
+AC_MSG_CHECKING(whether to use SSE2 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSE2_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
+#   if !defined(__amd64__) && !defined(__x86_64__)
+#      error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
+#   endif
+#endif
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+int main () {
+    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+	c = _mm_xor_si128 (a, b);
+    return 0;
+}]])], have_sse2_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(sse2,
+   [AC_HELP_STRING([--disable-sse2],
+                   [disable SSE2 fast paths])],
+   [enable_sse2=$enableval], [enable_sse2=auto])
+
+if test $enable_sse2 = no ; then
+   have_sse2_intrinsics=disabled
+fi
+
+if test $have_sse2_intrinsics = yes ; then
+   AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_sse2_intrinsics)
+if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then
+   AC_MSG_ERROR([SSE2 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Other special flags needed when building code using MMX or SSE instructions
+case $host_os in
+   solaris*)
+      # When building 32-bit binaries, apply a mapfile to ensure that the
+      # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
+      # since they check at runtime before using those instructions.
+      # Not all linkers grok the mapfile format so we check for that first.
+      if test "$AMD64_ABI" = "no" ; then
+	 use_hwcap_mapfile=no
+	 AC_MSG_CHECKING(whether to use a hardware capability map file)
+	 hwcap_save_LDFLAGS="$LDFLAGS"
+	 HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
+	 LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
+	 AC_LINK_IFELSE([AC_LANG_SOURCE([[int main() { return 0; }]])],
+			use_hwcap_mapfile=yes,
+			HWCAP_LDFLAGS="")
+	 LDFLAGS="$hwcap_save_LDFLAGS"
+	 AC_MSG_RESULT($use_hwcap_mapfile)
+      fi
+      if test "x$MMX_LDFLAGS" = "x" ; then
+         MMX_LDFLAGS="$HWCAP_LDFLAGS"
+      fi
+      if test "x$SSE2_LDFLAGS" = "x" ; then
+	 SSE2_LDFLAGS="$HWCAP_LDFLAGS"
+      fi
+      ;;
+esac
+
+AC_SUBST(MMX_CFLAGS)
+AC_SUBST(MMX_LDFLAGS)
+AC_SUBST(SSE2_CFLAGS)
+AC_SUBST(SSE2_LDFLAGS)
+
+dnl ===========================================================================
+dnl Check for VMX/Altivec
+if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
+    VMX_CFLAGS="-faltivec"
+else
+    VMX_CFLAGS="-maltivec -mabi=altivec"
+fi
+
+have_vmx_intrinsics=no
+AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$VMX_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+error "Need GCC >= 3.4 for sane altivec support"
+#endif
+#include <altivec.h>
+int main () {
+    vector unsigned int v = vec_splat_u32 (1);
+    v = vec_sub (v, v);
+    return 0;
+}]])], have_vmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(vmx,
+   [AC_HELP_STRING([--disable-vmx],
+                   [disable VMX fast paths])],
+   [enable_vmx=$enableval], [enable_vmx=auto])
+
+if test $enable_vmx = no ; then
+   have_vmx_intrinsics=disabled
+fi
+
+if test $have_vmx_intrinsics = yes ; then
+   AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
+else
+   VMX_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_vmx_intrinsics)
+if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then
+   AC_MSG_ERROR([VMX intrinsics not detected])
+fi
+
+AC_SUBST(VMX_CFLAGS)
+
+AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports ARM SIMD instructions
+have_arm_simd=no
+AC_MSG_CHECKING(whether to use ARM SIMD assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+.text
+.arch armv6
+.object_arch armv4
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+uqadd8 r0, r0, r0]])], have_arm_simd=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-simd,
+   [AC_HELP_STRING([--disable-arm-simd],
+                   [disable ARM SIMD fast paths])],
+   [enable_arm_simd=$enableval], [enable_arm_simd=auto])
+
+if test $enable_arm_simd = no ; then
+   have_arm_simd=disabled
+fi
+
+if test $have_arm_simd = yes ; then
+   AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
+
+AC_MSG_RESULT($have_arm_simd)
+if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
+   AC_MSG_ERROR([ARM SIMD intrinsics not detected])
+fi
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports NEON instructions
+have_arm_neon=no
+AC_MSG_CHECKING(whether to use ARM NEON assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+vmovn.u16 d0, q0]])], have_arm_neon=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-neon,
+   [AC_HELP_STRING([--disable-arm-neon],
+                   [disable ARM NEON fast paths])],
+   [enable_arm_neon=$enableval], [enable_arm_neon=auto])
+
+if test $enable_arm_neon = no ; then
+   have_arm_neon=disabled
+fi
+
+if test $have_arm_neon = yes ; then
+   AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
+
+AC_MSG_RESULT($have_arm_neon)
+if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
+   AC_MSG_ERROR([ARM NEON intrinsics not detected])
+fi
+
+dnl ===========================================================================
+dnl Check for IWMMXT (ARM only; the compile probe needs <mmintrin.h> and GCC >= 4.6)
+
+if test "x$IWMMXT_CFLAGS" = "x" ; then
+   IWMMXT_CFLAGS="-march=iwmmxt -flax-vector-conversions -Winline"
+fi
+dnl The probe goes through AC_LANG_SOURCE with double quoting, like the other intrinsics checks.
+have_iwmmxt_intrinsics=no
+AC_MSG_CHECKING(whether to use ARM IWMMXT intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$IWMMXT_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#ifndef __arm__
+#error "IWMMXT is only available on ARM"
+#endif
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
+#error "Need GCC >= 4.6 for IWMMXT intrinsics"
+#endif
+#include <mmintrin.h>
+int main () {
+	union {
+		__m64 v;
+		char c[8];
+	} a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+	int b = 4;
+	__m64 c = _mm_srli_si64 (a.v, b);
+}]])], have_iwmmxt_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-iwmmxt,
+   [AC_HELP_STRING([--disable-arm-iwmmxt],
+                   [disable ARM IWMMXT fast paths])],
+   [enable_iwmmxt=$enableval], [enable_iwmmxt=auto])
+
+if test $enable_iwmmxt = no ; then
+   have_iwmmxt_intrinsics=disabled
+fi
+
+if test $have_iwmmxt_intrinsics = yes ; then
+   AC_DEFINE(USE_ARM_IWMMXT, 1, [use ARM IWMMXT compiler intrinsics])
+else
+   IWMMXT_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_iwmmxt_intrinsics)
+if test $enable_iwmmxt = yes && test $have_iwmmxt_intrinsics = no ; then
+   AC_MSG_ERROR([IWMMXT intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_ARM_IWMMXT, test $have_iwmmxt_intrinsics = yes)
+
+dnl =========================================================================================
+dnl Check for GNU-style inline assembly support
+
+have_gcc_inline_asm=no
+AC_MSG_CHECKING(whether to use GNU-style inline assembler)
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+int main () {
+    /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
+	asm volatile ( "\tnop\n" : : : "cc", "memory" );
+    return 0;
+}]])], have_gcc_inline_asm=yes)
+
+AC_ARG_ENABLE(gcc-inline-asm,
+   [AC_HELP_STRING([--disable-gcc-inline-asm],
+                   [disable GNU-style inline assembler])],
+   [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto])
+
+if test $enable_gcc_inline_asm = no ; then
+   have_gcc_inline_asm=disabled
+fi
+
+if test $have_gcc_inline_asm = yes ; then
+   AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler])
+fi
+
+AC_MSG_RESULT($have_gcc_inline_asm)
+if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
+   AC_MSG_ERROR([GNU-style inline assembler not detected])
+fi
+
+AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
+
+dnl ==============================================
+dnl Static test programs
+
+AC_ARG_ENABLE(static-testprogs,
+   [AC_HELP_STRING([--enable-static-testprogs],
+		   [build test programs as static binaries [default=no]])],
+   [enable_static_testprogs=$enableval], [enable_static_testprogs=no])
+
+TESTPROGS_EXTRA_LDFLAGS=
+if test "x$enable_static_testprogs" = "xyes" ; then
+   TESTPROGS_EXTRA_LDFLAGS="-all-static"
+fi
+AC_SUBST(TESTPROGS_EXTRA_LDFLAGS)
+
+dnl ==============================================
+dnl Timers
+
+AC_ARG_ENABLE(timers,
+   [AC_HELP_STRING([--enable-timers],
+		   [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
+   [enable_timers=$enableval], [enable_timers=no])
+
+if test $enable_timers = yes ; then 
+   AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
+fi
+AC_SUBST(PIXMAN_TIMERS)
+
+dnl ===================================
+dnl GTK+
+
+AC_ARG_ENABLE(gtk,
+   [AC_HELP_STRING([--enable-gtk],
+                   [enable tests using GTK+ [default=auto]])],
+   [enable_gtk=$enableval], [enable_gtk=auto])
+
+PKG_PROG_PKG_CONFIG
+
+if test $enable_gtk = yes ; then
+   AC_CHECK_LIB([pixman-1], [pixman_version_string])
+   PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1])
+fi
+
+if test $enable_gtk = auto ; then
+   AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no])
+fi
+
+if test $enable_gtk = auto ; then
+   PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1], [enable_gtk=yes], [enable_gtk=no])
+fi
+
+AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes])
+
+AC_SUBST(GTK_CFLAGS)
+AC_SUBST(GTK_LIBS)
+AC_SUBST(DEP_CFLAGS)
+AC_SUBST(DEP_LIBS)
+
+dnl =====================================
+dnl posix_memalign, sigaction, alarm, gettimeofday
+
+AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
+if test x$have_posix_memalign = xyes; then
+   AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
+fi
+
+AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no)
+if test x$have_sigaction = xyes; then
+   AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()])
+fi
+
+AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no)
+if test x$have_alarm = xyes; then
+   AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()])
+fi
+
+AC_CHECK_HEADER([sys/mman.h],
+   [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])])
+
+AC_CHECK_FUNC(mmap, have_mmap=yes, have_mmap=no)
+if test x$have_mmap = xyes; then
+   AC_DEFINE(HAVE_MMAP, 1, [Whether we have mmap()])
+fi
+
+AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no)
+if test x$have_mprotect = xyes; then
+   AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()])
+fi
+
+AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no)
+if test x$have_getpagesize = xyes; then
+   AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have getpagesize()])
+fi
+
+AC_CHECK_HEADER([fenv.h],
+   [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])])
+
+AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no)
+if test x$have_feenableexcept = xyes; then
+   AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
+fi
+
+AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
+AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
+if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
+   AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
+fi
+
+dnl =====================================
+dnl Thread local storage
+
+support_for__thread=no
+
+AC_MSG_CHECKING(for __thread)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+#error This MinGW version has broken __thread support
+#endif
+#ifdef __OpenBSD__
+#error OpenBSD has broken __thread support
+#endif
+static __thread int x ;
+int main () { x = 123; return x; }
+]])], support_for__thread=yes)
+
+if test $support_for__thread = yes; then 
+   AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread])
+fi
+
+AC_MSG_RESULT($support_for__thread)
+
+dnl
+dnl posix tls
+dnl
+
+m4_define([pthread_test_program],AC_LANG_SOURCE([[dnl
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+    pthread_key_create (&key, NULL);
+}
+
+int
+main ()
+{
+    void *value = NULL;
+
+    if (pthread_once (&once_control, make_key) != 0)
+    {
+	value = NULL;
+    }
+    else
+    {
+	value = pthread_getspecific (key);
+	if (!value)
+	{
+	    value = malloc (100);
+	    pthread_setspecific (key, value);
+	}
+    }
+    return 0;
+}
+]]))
+
+AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
+    if test "z$support_for_pthread_setspecific" != "zyes"; then
+	PIXMAN_LINK_WITH_ENV(
+		[$1], [pthread_test_program],
+		[PTHREAD_CFLAGS="$CFLAGS"
+		 PTHREAD_LIBS="$LIBS"
+		 PTHREAD_LDFLAGS="$LDFLAGS"
+		 support_for_pthread_setspecific=yes])
+    fi
+])
+
+if test $support_for__thread = no; then
+    support_for_pthread_setspecific=no
+
+    AC_MSG_CHECKING(for pthread_setspecific)
+
+    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
+    PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
+    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
+    
+    if test $support_for_pthread_setspecific = yes; then
+	CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+	AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
+    fi
+
+    AC_MSG_RESULT($support_for_pthread_setspecific);
+fi
+
+AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
+AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
+AC_SUBST(PTHREAD_LDFLAGS)
+AC_SUBST(PTHREAD_LIBS)
+
+dnl =====================================
+dnl __attribute__((constructor))
+
+support_for_attribute_constructor=no
+
+AC_MSG_CHECKING(for __attribute__((constructor)))
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
+/* attribute 'constructor' is supported since gcc 2.7, but some compilers
+ * may only pretend to be gcc, so let's try to actually use it
+ */
+static int x = 1;
+static void __attribute__((constructor)) constructor_function () { x = 0; }
+int main (void) { return x; }
+#else
+#error not gcc or gcc version is older than 2.7
+#endif
+]])], support_for_attribute_constructor=yes)
+
+if test x$support_for_attribute_constructor = xyes; then
+   AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR],
+             [],[Whether the tool chain supports __attribute__((constructor))])
+fi
+
+AC_MSG_RESULT($support_for_attribute_constructor)
+AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
+
+dnl ==================
+dnl libpng
+
+PKG_CHECK_MODULES(PNG, [libpng], have_libpng=yes, have_libpng=no)
+
+if test x$have_libpng = xyes; then
+    AC_DEFINE([HAVE_LIBPNG], [1], [Whether we have libpng])
+fi
+
+AC_SUBST(HAVE_LIBPNG)
+
+AC_OUTPUT([pixman-1.pc
+           pixman-1-uninstalled.pc
+           Makefile
+	   pixman/Makefile
+	   pixman/pixman-version.h
+	   demos/Makefile
+	   test/Makefile])
+
+m4_if(m4_eval(pixman_minor % 2), [1], [
+   echo
+   echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+   echo
+   echo "      Thanks for testing this development snapshot of pixman. Please"
+   echo "      report any problems you find, either by sending email to "
+   echo
+   echo "          pixman@lists.freedesktop.org"
+   echo
+   echo "      or by filing a bug at "
+   echo
+   echo "          https://bugs.freedesktop.org/enter_bug.cgi?product=pixman "
+   echo
+   echo "      If you are looking for a stable release of pixman, please note "
+   echo "      that stable releases have _even_ minor version numbers. Ie., "
+   echo "      pixman-0.]m4_eval(pixman_minor & ~1)[.x are stable releases, whereas pixman-$PIXMAN_VERSION_MAJOR.$PIXMAN_VERSION_MINOR.$PIXMAN_VERSION_MICRO is a "
+   echo "      development snapshot that may contain bugs and experimental "
+   echo "      features. "
+   echo 
+   echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+   echo
+])
diff --git a/debian/README.source b/debian/README.source
new file mode 100755
index 0000000..e9078df
--- /dev/null
+++ b/debian/README.source
@@ -0,0 +1,3 @@
+This package uses quilt to manage modifications to the upstream source.  See
+/usr/share/doc/quilt/README.source for details.
+
diff --git a/debian/changelog b/debian/changelog
new file mode 100755
index 0000000..7593eeb
--- /dev/null
+++ b/debian/changelog
@@ -0,0 +1,7 @@
+pixman (0.23.7-1slp2+1) unstable; urgency=low
+
+  * Initial version
+  * Git: pkgs/p/pixman
+  * Tag: pixman_0.23.7-1slp2+1
+
+ -- Seongwon Cho <seongwon1.cho@samsung.com>  Thu, 08 Dec 2011 13:45:27 +0900
diff --git a/debian/compat b/debian/compat
new file mode 100755
index 0000000..7ed6ff8
--- /dev/null
+++ b/debian/compat
@@ -0,0 +1 @@
+5
diff --git a/debian/control b/debian/control
new file mode 100755
index 0000000..5169912
--- /dev/null
+++ b/debian/control
@@ -0,0 +1,48 @@
+Source: pixman
+Section: devel
+Priority: optional
+Maintainer: Debian X Strike Force <debian-x@lists.debian.org>, Seongwon Cho <seongwon1.cho@samsung.com> 
+Uploaders: Julien Cristau <jcristau@debian.org>, David Nusinow <dnusinow@debian.org>, Seongwon Cho <seongwon1.cho@samsung.com> 
+Build-Depends: debhelper (>= 5), automake, autoconf, libtool, pkg-config, quilt, libpng12-dev
+Standards-Version: 3.8.3
+Vcs-Git: git://git.debian.org/git/pkg-xorg/lib/pixman
+Vcs-Browser: http://git.debian.org/?p=pkg-xorg/lib/pixman.git
+
+Package: libpixman-1-0
+Section: libs
+Architecture: any
+Depends:  ${shlibs:Depends}, ${misc:Depends}, libpng12-0
+Description: pixel-manipulation library for X and cairo
+ A library for manipulating pixel regions -- a set of Y-X banded
+ rectangles, image compositing using the Porter/Duff model
+ and implicit mask generation for geometric primitives including
+ trapezoids, triangles, and rectangles.
+
+#Package: libpixman-1-0-udeb
+#Section: debian-installer
+#XC-Package-Type: udeb
+#Architecture: any
+#Depends:
+# ${shlibs:Depends},
+# ${misc:Depends},
+#Description: pixel-manipulation library for X and cairo
+# This package contains a minimal set of libraries needed for the Debian
+# installer.  Do not install it on a normal system.
+
+Package: libpixman-1-0-dbg
+Section: debug
+Priority: extra
+Architecture: any
+Depends: libpixman-1-0 (= ${binary:Version}), ${misc:Depends},
+Description: pixel-manipulation library for X and cairo (debugging symbols)
+ Debugging symbols for the Cairo/X pixel manipulation library.  This is
+ needed to debug programs linked against libpixman-1-0.
+
+Package: libpixman-1-dev
+Section: libdevel
+Architecture: any
+Depends: libpixman-1-0 (= ${binary:Version}), ${misc:Depends},libpng12-dev
+Conflicts: libpixman1-dev
+Description: pixel-manipulation library for X and cairo (development files)
+ Development libraries, header files and documentation needed by
+ programs that want to compile with the Cairo/X pixman library.
diff --git a/debian/copyright b/debian/copyright
new file mode 100755
index 0000000..93ed0b7
--- /dev/null
+++ b/debian/copyright
@@ -0,0 +1,114 @@
+This package was downloaded from
+http://xorg.freedesktop.org/releases/individual/lib/
+
+Debian packaging by Julien Cristau <jcristau@debian.org>, 18 May 2007.
+
+The following is the 'standard copyright' agreed upon by most contributors,
+and is currently the canonical license, though a modification is currently
+under discussion.  Copyright holders of new code should use this license
+statement where possible, and append their name to this list.  
+
+Copyright 1987, 1988, 1989, 1998  The Open Group
+Copyright 1987, 1988, 1989 Digital Equipment Corporation
+Copyright 1999, 2004, 2008 Keith Packard
+Copyright 2000 SuSE, Inc.
+Copyright 2000 Keith Packard, member of The XFree86 Project, Inc.
+Copyright 2004, 2005, 2007, 2008 Red Hat, Inc.
+Copyright 2004 Nicholas Miell
+Copyright 2005 Lars Knoll & Zack Rusin, Trolltech
+Copyright 2005 Trolltech AS
+Copyright 2007 Luca Barbato
+Copyright 2008 Aaron Plattner, NVIDIA Corporation
+Copyright 2008 Rodrigo Kumpera
+Copyright 2008 André Tupinambá
+Copyright 2008 Mozilla Corporation
+Copyright 2008 Frederic Plourde
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+Other licenses:
+
+Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+            2005 Lars Knoll & Zack Rusin, Trolltech
+Copyright © 2000 SuSE, Inc.
+Copyright © 2007 Red Hat, Inc.
+Copyright © 1998 Keith Packard
+
+Permission to use, copy, modify, distribute, and sell this software and its
+documentation for any purpose is hereby granted without fee, provided that
+the above copyright notice appear in all copies and that both that
+copyright notice and this permission notice appear in supporting
+documentation, and that the name of the copyright holders not be used in
+advertising or publicity pertaining to distribution of the software without
+specific, written prior permission.  The copyright holders make no
+representations about the suitability of this software for any purpose.  It
+is provided "as is" without express or implied warranty.
+
+THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+SOFTWARE.
+
+Copyright 1987, 1988, 1989, 1998  The Open Group
+
+Permission to use, copy, modify, distribute, and sell this software and its
+documentation for any purpose is hereby granted without fee, provided that
+the above copyright notice appear in all copies and that both that
+copyright notice and this permission notice appear in supporting
+documentation.
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of The Open Group shall not be
+used in advertising or otherwise to promote the sale, use or other dealings
+in this Software without prior written authorization from The Open Group.
+
+Copyright 1987, 1988, 1989 by
+Digital Equipment Corporation, Maynard, Massachusetts.
+
+                        All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of Digital not be
+used in advertising or publicity pertaining to distribution of the
+software without specific, written prior permission.
+
+DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
+DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
+ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+SOFTWARE.
diff --git a/debian/libpixman-1-0-udeb.install b/debian/libpixman-1-0-udeb.install
new file mode 100755
index 0000000..44c3b82
--- /dev/null
+++ b/debian/libpixman-1-0-udeb.install
@@ -0,0 +1 @@
+usr/lib/libpixman-1.so.*
diff --git a/debian/libpixman-1-0.install b/debian/libpixman-1-0.install
new file mode 100755
index 0000000..44c3b82
--- /dev/null
+++ b/debian/libpixman-1-0.install
@@ -0,0 +1 @@
+usr/lib/libpixman-1.so.*
diff --git a/debian/libpixman-1-0.symbols b/debian/libpixman-1-0.symbols
new file mode 100755
index 0000000..db16c97
--- /dev/null
+++ b/debian/libpixman-1-0.symbols
@@ -0,0 +1,117 @@
+libpixman-1.so.0 libpixman-1-0 #MINVER#
+ pixman_add_trapezoids@Base 0
+ pixman_add_traps@Base 0
+ pixman_blt@Base 0
+ pixman_compute_composite_region@Base 0
+ pixman_disable_out_of_bounds_workaround@Base 0.15.16
+ pixman_edge_init@Base 0
+ pixman_edge_step@Base 0
+ pixman_f_transform_bounds@Base 0.13.2
+ pixman_f_transform_from_pixman_transform@Base 0.13.2
+ pixman_f_transform_init_identity@Base 0.13.2
+ pixman_f_transform_init_rotate@Base 0.13.2
+ pixman_f_transform_init_scale@Base 0.13.2
+ pixman_f_transform_init_translate@Base 0.13.2
+ pixman_f_transform_invert@Base 0.13.2
+ pixman_f_transform_multiply@Base 0.13.2
+ pixman_f_transform_point@Base 0.13.2
+ pixman_f_transform_point_3d@Base 0.13.2
+ pixman_f_transform_rotate@Base 0.13.2
+ pixman_f_transform_scale@Base 0.13.2
+ pixman_f_transform_translate@Base 0.13.2
+ pixman_fill@Base 0
+ pixman_image_composite@Base 0.15.14
+ pixman_image_create_bits@Base 0.15.12
+ pixman_image_create_conical_gradient@Base 0
+ pixman_image_create_linear_gradient@Base 0
+ pixman_image_create_radial_gradient@Base 0
+ pixman_image_create_solid_fill@Base 0
+ pixman_image_fill_rectangles@Base 0.15.14
+ pixman_image_get_data@Base 0
+ pixman_image_get_depth@Base 0
+ pixman_image_get_height@Base 0
+ pixman_image_get_stride@Base 0
+ pixman_image_get_width@Base 0
+ pixman_image_ref@Base 0
+ pixman_image_set_accessors@Base 0
+ pixman_image_set_alpha_map@Base 0
+ pixman_image_set_clip_region32@Base 0.11.2
+ pixman_image_set_clip_region@Base 0
+ pixman_image_set_component_alpha@Base 0
+ pixman_image_set_destroy_function@Base 0.15.12
+ pixman_image_set_filter@Base 0
+ pixman_image_set_has_client_clip@Base 0
+ pixman_image_set_indexed@Base 0
+ pixman_image_set_repeat@Base 0
+ pixman_image_set_source_clipping@Base 0.9.4-2~
+ pixman_image_set_transform@Base 0
+ pixman_image_unref@Base 0
+ pixman_line_fixed_edge_init@Base 0
+ pixman_rasterize_edges@Base 0
+ pixman_rasterize_trapezoid@Base 0
+ pixman_region32_contains_point@Base 0.11.2
+ pixman_region32_contains_rectangle@Base 0.11.2
+ pixman_region32_copy@Base 0.11.2
+ pixman_region32_equal@Base 0.11.2
+ pixman_region32_extents@Base 0.11.2
+ pixman_region32_fini@Base 0.11.2
+ pixman_region32_init@Base 0.11.2
+ pixman_region32_init_rect@Base 0.11.2
+ pixman_region32_init_rects@Base 0.11.2
+ pixman_region32_init_with_extents@Base 0.11.2
+ pixman_region32_intersect@Base 0.11.2
+ pixman_region32_inverse@Base 0.11.2
+ pixman_region32_n_rects@Base 0.11.2
+ pixman_region32_not_empty@Base 0.11.2
+ pixman_region32_rectangles@Base 0.11.2
+ pixman_region32_reset@Base 0.11.2
+ pixman_region32_selfcheck@Base 0.11.2
+ pixman_region32_subtract@Base 0.11.2
+ pixman_region32_translate@Base 0.11.2
+ pixman_region32_union@Base 0.11.2
+ pixman_region32_union_rect@Base 0.11.2
+ pixman_region_contains_point@Base 0
+ pixman_region_contains_rectangle@Base 0
+ pixman_region_copy@Base 0
+ pixman_region_equal@Base 0
+ pixman_region_extents@Base 0
+ pixman_region_fini@Base 0
+ pixman_region_init@Base 0
+ pixman_region_init_rect@Base 0
+ pixman_region_init_rects@Base 0
+ pixman_region_init_with_extents@Base 0
+ pixman_region_intersect@Base 0
+ pixman_region_inverse@Base 0
+ pixman_region_n_rects@Base 0
+ pixman_region_not_empty@Base 0
+ pixman_region_rectangles@Base 0
+ pixman_region_reset@Base 0
+ pixman_region_selfcheck@Base 0
+ pixman_region_set_static_pointers@Base 0
+ pixman_region_subtract@Base 0
+ pixman_region_translate@Base 0
+ pixman_region_union@Base 0
+ pixman_region_union_rect@Base 0
+ pixman_sample_ceil_y@Base 0
+ pixman_sample_floor_y@Base 0
+ pixman_transform_bounds@Base 0.13.2
+ pixman_transform_from_pixman_f_transform@Base 0.13.2
+ pixman_transform_init_identity@Base 0.13.2
+ pixman_transform_init_rotate@Base 0.13.2
+ pixman_transform_init_scale@Base 0.13.2
+ pixman_transform_init_translate@Base 0.13.2
+ pixman_transform_invert@Base 0.13.2
+ pixman_transform_is_identity@Base 0.13.2
+ pixman_transform_is_int_translate@Base 0.13.2
+ pixman_transform_is_inverse@Base 0.13.2
+ pixman_transform_is_scale@Base 0.13.2
+ pixman_transform_multiply@Base 0.13.2
+ pixman_transform_point@Base 0.13.2
+ pixman_transform_rotate@Base 0.13.2
+ pixman_transform_scale@Base 0.13.2
+ pixman_transform_translate@Base 0.13.2
+ pixman_transform_point_3d@Base 0
+ pixman_version@Base 0.10.0
+ pixman_version_string@Base 0.10.0
+ pixman_format_supported_destination@Base 0.15.16
+ pixman_format_supported_source@Base 0.15.16
diff --git a/debian/libpixman-1-dev.install b/debian/libpixman-1-dev.install
new file mode 100755
index 0000000..7f75e79
--- /dev/null
+++ b/debian/libpixman-1-dev.install
@@ -0,0 +1,5 @@
+usr/lib/libpixman-1.la
+usr/lib/libpixman-1.so
+usr/lib/libpixman-1.a
+usr/lib/pkgconfig
+usr/include/pixman-1
diff --git a/debian/rules b/debian/rules
new file mode 100755
index 0000000..11a0b83
--- /dev/null
+++ b/debian/rules
@@ -0,0 +1,110 @@
+#!/usr/bin/make -f
+
+#include /usr/share/quilt/quilt.make
+
+PACKAGE = libpixman-1-0
+SHLIBS_VERSION = 0.15.16
+
+CFLAGS = -Wall -g
+ifneq (,$(filter noopt,$(DEB_BUILD_OPTIONS)))
+	CFLAGS += -O0
+else
+	CFLAGS += -O2
+endif
+ifneq (,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))
+	NUMJOBS = $(patsubst parallel=%,%,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))
+	MAKEFLAGS += -j$(NUMJOBS)
+endif
+
+DEB_HOST_ARCH      ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
+DEB_HOST_GNU_TYPE  ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)
+DEB_BUILD_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE)
+ifeq ($(DEB_BUILD_GNU_TYPE), $(DEB_HOST_GNU_TYPE))
+	confflags += --build=$(DEB_HOST_GNU_TYPE)
+else
+	confflags += --build=$(DEB_HOST_GNU_TYPE) --host=$(DEB_HOST_GNU_TYPE)
+#	confflags += --build=$(DEB_BUILD_GNU_TYPE) --host=$(DEB_HOST_GNU_TYPE)
+endif
+
+ifeq (armel, $(DEB_HOST_ARCH))
+	CFLAGS += -mfpu=vfp -mfloat-abi=softfp
+endif
+
+autogen: autogen-stamp
+autogen-stamp: $(QUILT_STAMPFN)
+	dh_testdir
+	autoreconf -vfi
+	touch $@
+
+config: config-stamp
+config-stamp: autogen-stamp
+	dh_testdir
+	test -d obj-$(DEB_BUILD_GNU_TYPE) || mkdir obj-$(DEB_BUILD_GNU_TYPE)
+	cd obj-$(DEB_BUILD_GNU_TYPE) && \
+	../configure \
+	  --prefix=/usr \
+	  --mandir=\$${prefix}/share/man \
+	  --infodir=\$${prefix}/share/info \
+	  $(confflags) \
+	  CFLAGS="$(CFLAGS)"
+	touch $@
+
+
+build: build-stamp
+build-stamp: config-stamp
+	dh_testdir
+	cd obj-$(DEB_BUILD_GNU_TYPE) && $(MAKE)
+	
+	touch $@
+
+clean: 
+	#unpatch
+	dh_testdir
+	dh_testroot
+	rm -f autogen-stamp config-stamp build-stamp install-stamp
+	
+	rm -f config.cache config.log config.status
+	rm -f */config.cache */config.log */config.status
+	rm -f conftest* */conftest*
+	rm -rf autom4te.cache */autom4te.cache
+	rm -rf obj-*
+	rm -f $$(find -name Makefile.in)
+	rm -f compile config.guess config.sub configure depcomp install-sh
+	rm -f ltmain.sh missing INSTALL aclocal.m4 config.h.in
+	
+	dh_clean
+
+install: install-stamp
+install-stamp: build-stamp
+	dh_testdir
+	dh_testroot
+	dh_clean -k
+	dh_installdirs
+
+	cd obj-$(DEB_BUILD_GNU_TYPE) && $(MAKE) DESTDIR=$(CURDIR)/debian/tmp install
+	touch $@
+
+# Install architecture-dependent files here.
+binary-arch: install
+	dh_testdir
+	dh_testroot
+
+	dh_installdocs
+	dh_install --sourcedir=debian/tmp --list-missing
+	dh_installchangelogs ChangeLog
+	dh_link
+	dh_strip --dbg-package=$(PACKAGE)-dbg
+	dh_compress
+	dh_fixperms
+	dh_makeshlibs -p$(PACKAGE) --add-udeb $(PACKAGE)-udeb -V"$(PACKAGE) (>= $(SHLIBS_VERSION))"
+	dh_installdeb
+	dh_shlibdeps
+	dh_gencontrol
+	dh_md5sums
+	dh_builddeb
+
+binary-indep: install
+# Nothing to do
+
+binary: binary-indep binary-arch
+.PHONY: autogen config build clean binary-indep binary-arch binary install
diff --git a/debian/watch b/debian/watch
new file mode 100755
index 0000000..b83209f
--- /dev/null
+++ b/debian/watch
@@ -0,0 +1,2 @@
+version=3
+http://xorg.freedesktop.org/releases/individual/lib/ pixman-(.*)\.tar\.gz
diff --git a/demos/Makefile.am b/demos/Makefile.am
new file mode 100644
index 0000000..070c2d7
--- /dev/null
+++ b/demos/Makefile.am
@@ -0,0 +1,36 @@
+if HAVE_GTK
+
+AM_CFLAGS = $(OPENMP_CFLAGS)
+AM_LDFLAGS = $(OPENMP_CFLAGS)
+
+LDADD = $(top_builddir)/pixman/libpixman-1.la -lm $(GTK_LIBS) $(PNG_LIBS)
+INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(GTK_CFLAGS) $(PNG_CFLAGS)
+
+GTK_UTILS = gtk-utils.c gtk-utils.h
+
+DEMOS =				\
+	clip-test		\
+	clip-in			\
+	composite-test		\
+	gradient-test		\
+	radial-test		\
+	alpha-test		\
+	screen-test		\
+	convolution-test	\
+	trap-test		\
+	tri-test
+
+gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
+alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
+composite_test_SOURCES = composite-test.c $(GTK_UTILS)
+clip_test_SOURCES = clip-test.c $(GTK_UTILS)
+clip_in_SOURCES = clip-in.c $(GTK_UTILS)
+trap_test_SOURCES = trap-test.c $(GTK_UTILS)
+screen_test_SOURCES = screen-test.c $(GTK_UTILS)
+convolution_test_SOURCES = convolution-test.c $(GTK_UTILS)
+radial_test_SOURCES = radial-test.c ../test/utils.c ../test/utils.h $(GTK_UTILS)
+tri_test_SOURCES = tri-test.c ../test/utils.c ../test/utils.h $(GTK_UTILS)
+
+noinst_PROGRAMS = $(DEMOS)
+
+endif
diff --git a/demos/alpha-test.c b/demos/alpha-test.c
new file mode 100644
index 0000000..54e30fa
--- /dev/null
+++ b/demos/alpha-test.c
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+/* Demo: composite a solid source through a gradient alpha map and show the result. */
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 200
+
+    uint32_t *alpha = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *src = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *grad_img;
+    pixman_image_t *alpha_img;
+    pixman_image_t *dest_img;
+    pixman_image_t *src_img;
+    int i;
+    pixman_gradient_stop_t stops[2] =
+	{
+	    { pixman_int_to_fixed (0), { 0x0000, 0x0000, 0x0000, 0x0000 } },
+	    { pixman_int_to_fixed (1), { 0xffff, 0x0000, 0x1111, 0xffff } }
+	};
+    pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 };
+    pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH),
+				pixman_int_to_fixed (0) };
+#if 0
+    pixman_transform_t trans = {
+	{ { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) }
+	}
+    };
+#else
+    pixman_transform_t trans = {
+	{ { pixman_fixed_1, 0, 0 },
+	  { 0, pixman_fixed_1, 0 },
+	  { 0, 0, pixman_fixed_1 } }
+    };
+#endif
+
+#if 0
+    pixman_point_fixed_t c_inner;
+    pixman_point_fixed_t c_outer;
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+#endif
+
+    if (!alpha || !dest || !src)
+	return 1;
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	alpha[i] = 0x4f00004f; /* pale blue */
+
+    alpha_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT,
+					  alpha,
+					 WIDTH * 4);
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	dest[i] = 0xffffff00;		/* yellow */
+
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT,
+					 dest,
+					 WIDTH * 4);
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	src[i] = 0xffff0000;
+
+    src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					WIDTH, HEIGHT,
+					src,
+					WIDTH * 4);
+
+#if 0
+    c_inner.x = pixman_double_to_fixed (50.0);
+    c_inner.y = pixman_double_to_fixed (50.0);
+    c_outer.x = pixman_double_to_fixed (50.0);
+    c_outer.y = pixman_double_to_fixed (50.0);
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (50.0);
+
+    grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+						    stops, 2);
+#endif
+
+    grad_img = pixman_image_create_linear_gradient (&p1, &p2, stops, 2);
+
+    pixman_image_set_transform (grad_img, &trans);
+    pixman_image_set_repeat (grad_img, PIXMAN_REPEAT_PAD);
+
+    pixman_image_composite (PIXMAN_OP_OVER, grad_img, NULL, alpha_img,
+			    0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+
+    pixman_image_set_alpha_map (src_img, alpha_img, 10, 10);
+
+    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img,
+			    0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+
+    /* dest holds WIDTH pixels per row, so pixel (x, y) is dest[y * WIDTH + x]. */
+    printf ("0, 0: %x\n", dest[0]);
+    printf ("10, 10: %x\n", dest[10 * WIDTH + 10]);
+    printf ("w, h: %x\n", dest[(HEIGHT - 1) * WIDTH + (WIDTH - 1)]);
+
+    show_image (dest_img);
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (grad_img);
+    pixman_image_unref (alpha_img);
+    pixman_image_unref (dest_img);
+    free (alpha);
+    free (src);
+    free (dest);
+
+    return 0;
+}
diff --git a/demos/clip-in.c b/demos/clip-in.c
new file mode 100644
index 0000000..5157981
--- /dev/null
+++ b/demos/clip-in.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+/* This test demonstrates that clipping is done totally differently depending
+ * on whether the source is transformed or not.
+ */
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+#define SMALL 25
+
+    uint32_t *sbits = malloc (SMALL * SMALL * 4);
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    pixman_transform_t trans = {
+    {
+	{ pixman_double_to_fixed (1.0), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.1), },
+	{ pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.1), },
+	{ pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) }
+    } };
+
+    pixman_image_t *src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, SMALL, SMALL, sbits, 4 * SMALL);
+    pixman_image_t *dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, 4 * WIDTH);
+
+    memset (bits, 0xff, WIDTH * HEIGHT * 4);
+    memset (sbits, 0x00, SMALL * SMALL * 4);
+
+    pixman_image_composite (PIXMAN_OP_IN,
+			    src_img, NULL, dest_img,
+			    0, 0, 0, 0, SMALL, SMALL, 200, 200);
+
+    pixman_image_set_transform (src_img, &trans);
+    pixman_image_composite (PIXMAN_OP_IN,
+			    src_img, NULL, dest_img,
+			    0, 0, 0, 0, SMALL * 2, SMALL * 2, 200, 200);
+
+    show_image (dest_img);
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (sbits);
+    free (bits);
+
+    return 0;
+}
diff --git a/demos/clip-test.c b/demos/clip-test.c
new file mode 100644
index 0000000..aa0df44
--- /dev/null
+++ b/demos/clip-test.c
@@ -0,0 +1,97 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+#define WIDTH 200
+#define HEIGHT 200
+    
+/* Build a WIDTH x HEIGHT a8r8g8b8 image in which every pixel equals `pixel`. */
+static pixman_image_t *
+create_solid_bits (uint32_t pixel)
+{
+    const int n_pixels = WIDTH * HEIGHT;
+    uint32_t *buf = malloc (n_pixels * 4);
+    int idx = 0;
+
+    while (idx < n_pixels)
+	buf[idx++] = pixel;
+
+    return pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT,
+				     buf, WIDTH * 4);
+}
+
+/* Demo: composite a clipped, transformed, repeating source onto a solid image. */
+int
+main (int argc, char **argv)
+{
+    pixman_image_t *gradient_img;
+    pixman_image_t *src_img, *dst_img;
+    pixman_gradient_stop_t stops[2] =
+	{
+	    { pixman_int_to_fixed (0), { 0xffff, 0x0000, 0x0000, 0xffff } },
+	    { pixman_int_to_fixed (1), { 0xffff, 0xffff, 0x0000, 0xffff } }
+	};
+#if 0
+    pixman_point_fixed_t p1 = { 0, 0 };
+    pixman_point_fixed_t p2 = { pixman_int_to_fixed (WIDTH),
+				pixman_int_to_fixed (HEIGHT) };
+#endif
+    pixman_point_fixed_t c_inner;
+    pixman_point_fixed_t c_outer;
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+    pixman_region32_t clip_region;
+    pixman_transform_t trans = {
+	{ { pixman_double_to_fixed (1.3), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.5), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.5), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) }
+	}
+    };
+
+    src_img = create_solid_bits (0xff0000ff);
+
+    c_inner.x = pixman_double_to_fixed (100.0);
+    c_inner.y = pixman_double_to_fixed (100.0);
+    c_outer.x = pixman_double_to_fixed (100.0);
+    c_outer.y = pixman_double_to_fixed (100.0);
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (100.0);
+
+    gradient_img = pixman_image_create_radial_gradient (&c_inner, &c_outer,
+							r_inner, r_outer,
+							stops, 2);
+
+#if 0
+    gradient_img = pixman_image_create_linear_gradient  (&p1, &p2,
+							 stops, 2);
+
+#endif
+
+    pixman_image_composite (PIXMAN_OP_OVER, gradient_img, NULL, src_img,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    pixman_region32_init_rect (&clip_region, 50, 0, 100, 200);
+    pixman_image_set_clip_region32 (src_img, &clip_region);
+    pixman_image_set_source_clipping (src_img, TRUE);
+    pixman_image_set_has_client_clip (src_img, TRUE);
+    pixman_image_set_transform (src_img, &trans);
+    pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+
+    dst_img = create_solid_bits (0xffff0000);
+    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dst_img,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    show_image (dst_img);
+
+    /* create_solid_bits() malloc()s the pixel storage it hands to
+     * pixman_image_create_bits(), so release it by hand before the images go.
+     */
+    free (pixman_image_get_data (src_img));
+    free (pixman_image_get_data (dst_img));
+    pixman_image_unref (gradient_img);
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+
+    return 0;
+}
diff --git a/demos/composite-test.c b/demos/composite-test.c
new file mode 100644
index 0000000..f5f352f
--- /dev/null
+++ b/demos/composite-test.c
@@ -0,0 +1,191 @@
+#include <gtk/gtk.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+#define WIDTH	60
+#define HEIGHT	60
+
+typedef struct {	/* one compositing operator to demonstrate */
+    const char *name;	/* text of the label packed above the tile */
+    pixman_op_t op;	/* operator handed to pixman_image_composite() */
+} operator_t;
+
+static const operator_t operators[] = {	/* rendered by main() in this order, six per table row */
+    { "CLEAR",		PIXMAN_OP_CLEAR },
+    { "SRC",		PIXMAN_OP_SRC },
+    { "DST",		PIXMAN_OP_DST },
+    { "OVER",		PIXMAN_OP_OVER },
+    { "OVER_REVERSE",	PIXMAN_OP_OVER_REVERSE },
+    { "IN",		PIXMAN_OP_IN },
+    { "IN_REVERSE",	PIXMAN_OP_IN_REVERSE },
+    { "OUT",		PIXMAN_OP_OUT },
+    { "OUT_REVERSE",	PIXMAN_OP_OUT_REVERSE },
+    { "ATOP",		PIXMAN_OP_ATOP },
+    { "ATOP_REVERSE",	PIXMAN_OP_ATOP_REVERSE },
+    { "XOR",		PIXMAN_OP_XOR },
+    { "ADD",		PIXMAN_OP_ADD },
+    { "SATURATE",	PIXMAN_OP_SATURATE },
+
+    { "MULTIPLY",	PIXMAN_OP_MULTIPLY },
+    { "SCREEN",		PIXMAN_OP_SCREEN },
+    { "OVERLAY",	PIXMAN_OP_OVERLAY },
+    { "DARKEN",		PIXMAN_OP_DARKEN },
+    { "LIGHTEN",	PIXMAN_OP_LIGHTEN },
+    { "COLOR_DODGE",	PIXMAN_OP_COLOR_DODGE },
+    { "COLOR_BURN",	PIXMAN_OP_COLOR_BURN },
+    { "HARD_LIGHT",	PIXMAN_OP_HARD_LIGHT },
+    { "SOFT_LIGHT",	PIXMAN_OP_SOFT_LIGHT },
+    { "DIFFERENCE",	PIXMAN_OP_DIFFERENCE },
+    { "EXCLUSION",	PIXMAN_OP_EXCLUSION },
+    { "HSL_HUE",	PIXMAN_OP_HSL_HUE },
+    { "HSL_SATURATION",	PIXMAN_OP_HSL_SATURATION },
+    { "HSL_COLOR",	PIXMAN_OP_HSL_COLOR },
+    { "HSL_LUMINOSITY",	PIXMAN_OP_HSL_LUMINOSITY },
+};
+/* Read accessor for pixman_image_set_accessors(): loads a `size`-byte value from `src`. */
+static uint32_t
+reader (const void *src, int size)
+{
+    switch (size)
+    {
+    case 1:
+	return *(const uint8_t *)src;
+    case 2:
+	return *(const uint16_t *)src;
+    case 4:
+	return *(const uint32_t *)src;
+    }
+    g_assert_not_reached();	/* any other width is a caller bug */
+    return 0;	/* keeps the result defined if the assertion is compiled out */
+}
+/* Write accessor for pixman_image_set_accessors(): stores the low `size`
+ * bytes of `value` at `src`; widths other than 1, 2 or 4 are ignored. */
+static void
+writer (void *src, uint32_t value, int size)
+{
+    if (size == 1)
+    {
+	uint8_t *byte = src;
+	*byte = value;
+    }
+    else if (size == 2)
+    {
+	uint16_t *half = src;
+	*half = value;
+    }
+    else if (size == 4)
+    {
+	uint32_t *word = src;
+	*word = value;
+    }
+    /* nothing to do for any other size */
+}
+/* Renders a gradient through every entry of operators[] and shows the tiles in a GTK table. */
+int
+main (int argc, char **argv)
+{
+#define d2f pixman_double_to_fixed
+    
+    GtkWidget *window, *swindow;
+    GtkWidget *table;
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *src_img;
+    pixman_image_t *dest_img;
+    pixman_point_fixed_t p1 = { -10, 0 };	/* x is the raw 16.16 value -10, not -10 pixels */
+    pixman_point_fixed_t p2 = { WIDTH << 16, (HEIGHT - 10) << 16 };
+    uint16_t full = 0xcfff;
+    uint16_t low  = 0x5000;
+    uint16_t alpha = 0xffff;
+    pixman_gradient_stop_t stops[6] =
+    {
+	{ d2f (0.0), { full, low, low, alpha } },
+	{ d2f (0.25), { full, full, low, alpha } },
+	{ d2f (0.4), { low, full, low, alpha } },
+	{ d2f (0.6), { low, full, full, alpha } },
+	{ d2f (0.8), { low, low, full, alpha } },
+	{ d2f (1.0), { full, low, full, alpha } },
+    };
+
+    guint i;
+
+    if (dest == NULL)	/* no pixel buffer to composite into */
+	return 1;
+    gtk_init (&argc, &argv);
+
+    window = gtk_window_new (GTK_WINDOW_TOPLEVEL);
+
+    gtk_window_set_default_size (GTK_WINDOW (window), 800, 600);
+    
+    g_signal_connect (window, "delete-event",
+		      G_CALLBACK (gtk_main_quit),
+		      NULL);
+    table = gtk_table_new ((G_N_ELEMENTS (operators) + 5) / 6, 6, TRUE);	/* rows rounded up */
+
+    src_img = pixman_image_create_linear_gradient (&p1, &p2, stops,
+						   G_N_ELEMENTS (stops));
+
+    pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD);
+    
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT,
+					 dest,
+					 WIDTH * 4);
+    pixman_image_set_accessors (dest_img, reader, writer);
+
+    for (i = 0; i < G_N_ELEMENTS (operators); ++i)
+    {
+	GtkWidget *image;
+	GdkPixbuf *pixbuf;
+	GtkWidget *vbox;
+	GtkWidget *label;
+	int j, k;
+
+	vbox = gtk_vbox_new (FALSE, 0);
+
+	label = gtk_label_new (operators[i].name);
+	gtk_box_pack_start (GTK_BOX (vbox), label, FALSE, FALSE, 6);
+	gtk_widget_show (label);
+
+	for (j = 0; j < HEIGHT; ++j)
+	{
+	    for (k = 0; k < WIDTH; ++k)
+		dest[j * WIDTH + k] = 0x7f6f6f00;	/* same background before every operator */
+	}
+	pixman_image_composite (operators[i].op, src_img, NULL, dest_img,
+				0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+	pixbuf = pixbuf_from_argb32 (pixman_image_get_data (dest_img), TRUE,
+				     WIDTH, HEIGHT, WIDTH * 4);
+	image = gtk_image_new_from_pixbuf (pixbuf);
+	gtk_box_pack_start (GTK_BOX (vbox), image, FALSE, FALSE, 0);
+	gtk_widget_show (image);
+
+	gtk_table_attach_defaults (GTK_TABLE (table), vbox,
+				   i % 6, (i % 6) + 1, i / 6, (i / 6) + 1);
+	gtk_widget_show (vbox);
+
+	g_object_unref (pixbuf);
+    }
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (dest);
+
+    swindow = gtk_scrolled_window_new (NULL, NULL);
+    gtk_scrolled_window_set_policy (GTK_SCROLLED_WINDOW (swindow),
+				    GTK_POLICY_AUTOMATIC,
+				    GTK_POLICY_AUTOMATIC);
+    
+    gtk_scrolled_window_add_with_viewport (GTK_SCROLLED_WINDOW (swindow), table);
+    gtk_widget_show (table);
+
+    gtk_container_add (GTK_CONTAINER (window), swindow);
+    gtk_widget_show (swindow);
+
+    gtk_widget_show (window);
+
+    gtk_main ();
+
+    return 0;
+}
diff --git a/demos/convolution-test.c b/demos/convolution-test.c
new file mode 100644
index 0000000..da284af
--- /dev/null
+++ b/demos/convolution-test.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+/* Composites a flat source through a convolution-filtered alpha-ramp mask and displays the result. */
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+#define d2f pixman_double_to_fixed
+    
+    uint32_t *src = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *mask = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_fixed_t convolution[] =
+    {
+	d2f (3), d2f (3),	/* presumably the kernel width and height -- confirm against pixman */
+	d2f (0.5), d2f (0.5), d2f (0.5),
+	d2f (0.5), d2f (0.5), d2f (0.5),
+	d2f (0.5), d2f (0.5), d2f (0.5),
+    };
+    pixman_image_t *simg, *mimg, *dimg;
+
+    int i;
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+    {
+	src[i] = 0x7f007f00;
+	mask[i] = (i % 256) * 0x01000000u;	/* unsigned: 128 * 0x01000000 and up overflows int */
+	dest[i] = 0;
+    }
+
+    simg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src, WIDTH * 4);
+    mimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, mask, WIDTH * 4);
+    dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4);
+
+    pixman_image_set_filter (mimg, PIXMAN_FILTER_CONVOLUTION,
+			     convolution, 11);	/* all 11 entries of convolution[] */
+
+    pixman_image_composite (PIXMAN_OP_OVER, simg, mimg, dimg, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    show_image (dimg);	/* images and buffers are not released; the process exits right after */
+    
+    return 0;
+}
diff --git a/demos/gradient-test.c b/demos/gradient-test.c
new file mode 100644
index 0000000..20f78a6
--- /dev/null
+++ b/demos/gradient-test.c
@@ -0,0 +1,93 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+/* Draws a padded horizontal two-stop linear gradient over a blue background and displays it. */
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 200
+    
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *src_img;
+    pixman_image_t *dest_img;
+    int i;
+    pixman_gradient_stop_t stops[2] =
+	{
+	    { pixman_int_to_fixed (0), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+	    { pixman_int_to_fixed (1), { 0xffff, 0x1111, 0x1111, 0x1111 } }
+	};
+    pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 };
+    pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH / 8.),
+				pixman_int_to_fixed (0) };
+#if 0
+    pixman_transform_t trans = {
+	{ { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
+	}
+    };
+#else
+    pixman_transform_t trans = {	/* identity transform */
+	{ { pixman_fixed_1, 0, 0 },
+	  { 0, pixman_fixed_1, 0 },
+	  { 0, 0, pixman_fixed_1 } }
+    };
+#endif
+
+#if 0
+    pixman_point_fixed_t c_inner;
+    pixman_point_fixed_t c_outer;
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+#endif
+    
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	dest[i] = 0x4f00004f; /* pale blue */
+    
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT, 
+					 dest,
+					 WIDTH * 4);
+
+#if 0
+    c_inner.x = pixman_double_to_fixed (50.0);
+    c_inner.y = pixman_double_to_fixed (50.0);
+    c_outer.x = pixman_double_to_fixed (50.0);
+    c_outer.y = pixman_double_to_fixed (50.0);
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (50.0);
+    
+    src_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+						    stops, 2);
+#endif
+#if 0
+    src_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+						    stops, 2);
+    src_img = pixman_image_create_linear_gradient (&c_inner, &c_outer,
+						   r_inner, r_outer,
+						   stops, 2);
+#endif
+    
+    src_img = pixman_image_create_linear_gradient  (&p1, &p2,
+						    stops, 2);
+    
+    pixman_image_set_transform (src_img, &trans);
+    pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD);
+    
+    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img,
+			    0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+    
+    printf ("0, 0: %x\n", dest[0]);
+    printf ("10, 10: %x\n", dest[10 * WIDTH + 10]);	/* consecutive rows are WIDTH pixels apart */
+    printf ("w, h: %x\n", dest[(HEIGHT - 1) * WIDTH + (WIDTH - 1)]);
+    
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (dest);
+    
+    return 0;
+}
diff --git a/demos/gtk-utils.c b/demos/gtk-utils.c
new file mode 100644
index 0000000..0e7cb5c
--- /dev/null
+++ b/demos/gtk-utils.c
@@ -0,0 +1,115 @@
+#include <gtk/gtk.h>
+#include <config.h>
+#include "pixman-private.h"	/* For image->bits.format
+				 * FIXME: there should probably be public API for this
+				 */
+#include "gtk-utils.h"
+/* Builds a new 8-bit RGBA GdkPixbuf from 32-bit ARGB rows; `stride` is in bytes. */
+GdkPixbuf *
+pixbuf_from_argb32 (uint32_t *bits,
+		    gboolean has_alpha,
+		    int width,
+		    int height,
+		    int stride)
+{
+    GdkPixbuf *pixbuf = gdk_pixbuf_new (GDK_COLORSPACE_RGB, TRUE,
+					8, width, height);
+    int p_stride = gdk_pixbuf_get_rowstride (pixbuf);
+    guchar *p_bits = gdk_pixbuf_get_pixels (pixbuf);
+    int w, h;
+    
+    for (h = 0; h < height; ++h)
+    {
+	for (w = 0; w < width; ++w)
+	{
+	    uint32_t argb = bits[h * (stride / 4) + w];
+	    guchar *pb = p_bits + h * p_stride + w * 4;	/* unsigned bytes: values reach 255 */
+	    guint r, g, b, a;
+
+	    r = (argb & 0x00ff0000) >> 16;
+	    g = (argb & 0x0000ff00) >> 8;
+	    b = (argb & 0x000000ff) >> 0;
+	    a = has_alpha? (argb & 0xff000000) >> 24 : 0xff;
+
+	    /* colours are divided by alpha; a fully transparent pixel is left as is */
+	    if (a)
+	    {
+		r = (r * 255) / a;
+		g = (g * 255) / a;
+		b = (b * 255) / a;
+	    }
+
+	    /* a colour larger than its alpha would otherwise exceed 255 */
+	    if (r > 255) r = 255;
+	    if (g > 255) g = 255;
+	    if (b > 255) b = 255;
+
+	    pb[0] = r;	/* stored in R, G, B, A byte order */
+	    pb[1] = g;
+	    pb[2] = b;
+	    pb[3] = a;
+	}
+    }
+    
+    return pixbuf;
+}
+
+/* "expose_event" handler: draws the GdkPixbuf passed as user data onto the widget's window. */
+static gboolean
+on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data)
+{
+    GdkPixbuf *image = data;
+    int image_w = gdk_pixbuf_get_width (image);
+    int image_h = gdk_pixbuf_get_height (image);
+
+    /* the whole pixbuf goes to the window origin, without dithering */
+    gdk_draw_pixbuf (widget->window, NULL, image,
+		     0, 0, 0, 0, image_w, image_h,
+		     GDK_RGB_DITHER_NONE, 0, 0);
+
+    return TRUE;
+}
+/* Opens a GTK window showing `image` (a8r8g8b8 or x8r8g8b8 only) and runs the GTK main loop. */
+void
+show_image (pixman_image_t *image)
+{
+    char *prog_name = g_strdup ("pixman-test-program");
+    char **fake_argv = (char **)&prog_name;
+    int fake_argc = 1;
+    GtkWidget *toplevel;
+    GdkPixbuf *rgba;
+    int img_w, img_h, row_bytes;
+    gboolean with_alpha;
+    pixman_format_code_t fmt;
+
+    /* gtk_init() wants a command line; give it a single made-up argument */
+    gtk_init (&fake_argc, &fake_argv);
+
+    toplevel = gtk_window_new (GTK_WINDOW_TOPLEVEL);
+    img_w = pixman_image_get_width (image);
+    img_h = pixman_image_get_height (image);
+    row_bytes = pixman_image_get_stride (image);
+    gtk_window_set_default_size (GTK_WINDOW (toplevel), img_w, img_h);
+
+    /* any format other than the two below is reported through g_error() */
+    fmt = image->bits.format;
+    switch (fmt)
+    {
+    case PIXMAN_a8r8g8b8:
+	with_alpha = TRUE;
+	break;
+    case PIXMAN_x8r8g8b8:
+	with_alpha = FALSE;
+	break;
+    default:
+	g_error ("Can't deal with this format: %x\n", fmt);
+    }
+
+    rgba = pixbuf_from_argb32 (pixman_image_get_data (image), with_alpha,
+			       img_w, img_h, row_bytes);
+
+    g_signal_connect (toplevel, "expose_event", G_CALLBACK (on_expose), rgba);
+    g_signal_connect (toplevel, "delete_event", G_CALLBACK (gtk_main_quit), NULL);
+    gtk_widget_show (toplevel);
+    gtk_main ();
+}
diff --git a/demos/gtk-utils.h b/demos/gtk-utils.h
new file mode 100644
index 0000000..2cb13bc
--- /dev/null
+++ b/demos/gtk-utils.h
@@ -0,0 +1,13 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <glib.h>
+#include <gtk/gtk.h>
+#include "pixman.h"
+/* Shows `image` in a GTK window and runs the GTK main loop. */
+void show_image (pixman_image_t *image);
+/* Returns a new RGBA GdkPixbuf converted from 32-bit ARGB pixels; `stride` is in bytes. */
+GdkPixbuf *pixbuf_from_argb32 (uint32_t *bits,
+		               gboolean has_alpha,
+                               int width,
+                               int height,
+                               int stride);
diff --git a/demos/radial-test.c b/demos/radial-test.c
new file mode 100644
index 0000000..35e90d7
--- /dev/null
+++ b/demos/radial-test.c
@@ -0,0 +1,198 @@
+#include "../test/utils.h"
+#include "gtk-utils.h"
+
+#define NUM_GRADIENTS 7
+#define NUM_STOPS 3
+#define NUM_REPEAT 4
+#define SIZE 128
+#define WIDTH (SIZE * NUM_GRADIENTS)
+#define HEIGHT (SIZE * NUM_REPEAT)
+
+/*
+ * We want to test all the possible relative positions of the start
+ * and end circle:
+ *
+ *  - The start circle can be smaller/equal/bigger than the end
+ *    circle. A radial gradient can be classified in one of these
+ *    three cases depending on the sign of dr.
+ *
+ *  - The smaller circle can be completely inside/internally
+ *    tangent/outside (at least in part) of the bigger circle. This
+ *    classification is the same as the one which can be computed by
+ *    examining the sign of a = (dx^2 + dy^2 - dr^2).
+ *
+ *  - If the two circles have the same size, neither can be inside or
+ *    internally tangent
+ *
+ * This test draws radial gradients whose circles always have the same
+ * centers (0, 0) and (1, 0), but with different radiuses. From left
+ * to right:
+ *
+ * - Small start circle completely inside the end circle
+ *     0.25 -> 1.75; dr =  1.5 > 0; a = 1 - 1.50^2 < 0
+ *
+ * - Small start circle internally tangent to the end circle
+ *     0.50 -> 1.50; dr =  1.0 > 0; a = 1 - 1.00^2 = 0
+ *
+ * - Small start circle outside of the end circle
+ *     0.50 -> 1.00; dr =  0.5 > 0; a = 1 - 0.50^2 > 0
+ *
+ * - Start circle with the same size as the end circle
+ *     1.00 -> 1.00; dr =  0.0 = 0; a = 1 - 0.00^2 > 0
+ *
+ * - Small end circle outside of the start circle
+ *     1.00 -> 0.50; dr = -0.5 < 0; a = 1 - 0.50^2 > 0
+ *
+ * - Small end circle internally tangent to the start circle
+ *     1.50 -> 0.50; dr = -1.0 < 0; a = 1 - 1.00^2 = 0
+ *
+ * - Small end circle completely inside the start circle
+ *     1.75 -> 0.25; dr = -1.5 < 0; a = 1 - 1.50^2 < 0
+ *
+ */
+/* Circle radii: gradient i starts at radiuses[i] and ends at radiuses[NUM_GRADIENTS - 1 - i]. */
+static const double radiuses[NUM_GRADIENTS] = {
+    0.25,
+    0.50,
+    0.50,
+    1.00,
+    1.00,
+    1.50,
+    1.75
+};
+/* Scales x by 65536 and subtracts the bits above 16 so that 1.0 yields 0xffff, not 0x10000. */
+#define double_to_color(x)					\
+    (((uint32_t) ((x)*65536)) - (((uint32_t) ((x)*65536)) >> 16))
+/* Initializer for one gradient stop: a fixed-point offset plus four double_to_color() components. */
+#define PIXMAN_STOP(offset,r,g,b,a)		\
+    { pixman_double_to_fixed (offset),		\
+	{					\
+	double_to_color (r),			\
+	double_to_color (g),			\
+	double_to_color (b),			\
+	double_to_color (a)			\
+	}					\
+    }
+/* Colour stops used by every gradient from create_radial(); arguments are offset, r, g, b, a. */
+static const pixman_gradient_stop_t stops[NUM_STOPS] = {
+    PIXMAN_STOP (0.0,        1, 0, 0, 0.75),
+    PIXMAN_STOP (0.70710678, 0, 1, 0, 0),
+    PIXMAN_STOP (1.0,        0, 0, 1, 1)
+};
+/* Builds gradient number `index`: both circles sit on the x axis, radii come from radiuses[]. */
+static pixman_image_t *
+create_radial (int index)
+{
+    const double shrink = 0.25;
+    double start_r = radiuses[index];
+    double end_r = radiuses[NUM_GRADIENTS - index - 1];
+    double start_x = 0;
+    double end_x = 1;
+    double lo, hi, mid;
+    pixman_point_fixed_t c_start, c_end;
+    pixman_fixed_t r_start, r_end;
+
+    /*
+     * Horizontal extent covered by the two circles; moving its
+     * midpoint to x = 0 centres the gradient.
+     */
+    lo = MIN (start_x - start_r, end_x - end_r);
+    hi = MAX (start_x + start_r, end_x + end_r);
+    mid = (lo + hi) * 0.5;
+
+    /* recentre, then scale to make it fit within a 1x1 rect centred in (0,0) */
+    start_x = (start_x - mid) * shrink;
+    end_x = (end_x - mid) * shrink;
+    start_r *= shrink;
+    end_r *= shrink;
+
+    /* both centres keep y = 0 */
+    c_start.x = pixman_double_to_fixed (start_x);
+    c_start.y = pixman_double_to_fixed (0);
+    c_end.x = pixman_double_to_fixed (end_x);
+    c_end.y = pixman_double_to_fixed (0);
+    r_start = pixman_double_to_fixed (start_r);
+    r_end = pixman_double_to_fixed (end_r);
+
+    return pixman_image_create_radial_gradient (&c_start, &c_end,
+						r_start, r_end,
+						stops, NUM_STOPS);
+}
+/* Repeat modes applied to each gradient by main(), one per row of SIZE pixels. */
+static const pixman_repeat_t repeat[NUM_REPEAT] = {
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_REFLECT,
+    PIXMAN_REPEAT_PAD
+};
+/* Paints every gradient from create_radial() with every repeat mode into one grid and shows it. */
+int
+main (int argc, char **argv)
+{
+    const pixman_fixed_t half = pixman_double_to_fixed (0.5);
+    const pixman_fixed_t cell = pixman_double_to_fixed (SIZE);
+    pixman_transform_t xform;
+    pixman_image_t *canvas;
+    int col, row;
+
+    enable_fp_exceptions ();
+
+    /* NULL bits and a zero stride: no pixel storage is supplied here */
+    canvas = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+				       WIDTH, HEIGHT,
+				       NULL, 0);
+
+    pixman_transform_init_identity (&xform);
+
+    /*
+     * create_radial() hands back gradients centred on the origin whose
+     * interesting part fits a 1x1 square.  Each one is painted onto a
+     * SIZExSIZE cell, and it is simplest to have the origin at the
+     * top-left corner of the visible square, hence the shift by half
+     * a unit followed by the scale to SIZE.
+     */
+    pixman_transform_translate (NULL, &xform, half, half);
+    pixman_transform_scale (NULL, &xform, cell, cell);
+
+    /*
+     * Gradients are evaluated at pixel centres, so a further
+     * half-pixel translation is applied to trigger some interesting
+     * corner cases.  Notably, the original implementation of PDF
+     * radial gradients tried to divide by 0 with this transform in
+     * the "tangent circles" cases.
+     */
+    pixman_transform_translate (NULL, &xform, half, half);
+
+    /*
+     * Grid layout: one column per gradient, one row per repeat mode.
+     */
+    for (col = 0; col < NUM_GRADIENTS; col++)
+    {
+	pixman_image_t *gradient = create_radial (col);
+
+	pixman_image_set_transform (gradient, &xform);
+
+	/* the same gradient image is reused for every repeat mode */
+	for (row = 0; row < NUM_REPEAT; row++)
+	{
+	    pixman_image_set_repeat (gradient, repeat[row]);
+
+	    pixman_image_composite32 (PIXMAN_OP_OVER,
+				      gradient,
+				      NULL,
+				      canvas,
+				      0, 0,
+				      0, 0,
+				      col * SIZE, row * SIZE,
+				      SIZE, SIZE);
+	}
+
+	pixman_image_unref (gradient);
+    }
+
+    show_image (canvas);
+
+    pixman_image_unref (canvas);
+
+    return 0;
+}
diff --git a/demos/screen-test.c b/demos/screen-test.c
new file mode 100644
index 0000000..e69dba3
--- /dev/null
+++ b/demos/screen-test.c
@@ -0,0 +1,44 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+/* Screens three translucent flat-colour tiles onto an all-zero canvas and displays it. */
+int
+main (int argc, char **argv)
+{
+#define WIDTH 40
+#define HEIGHT 40
+    
+    const int tile_len = WIDTH * HEIGHT;		/* pixels per source tile */
+    const int canvas_len = 3 * WIDTH * 2 * HEIGHT;	/* canvas is 3 tiles wide, 2 tall */
+    uint32_t *tile_a = malloc (tile_len * 4);
+    uint32_t *tile_b = malloc (tile_len * 4);
+    uint32_t *tile_c = malloc (tile_len * 4);
+    uint32_t *canvas = malloc (canvas_len * 4);
+    pixman_image_t *img_a, *img_b, *img_c, *out;
+    int px;
+
+    /* every tile is filled with a single a8r8g8b8 value */
+    for (px = 0; px < tile_len; ++px)
+    {
+	tile_a[px] = 0x7ff00000;
+	tile_b[px] = 0x7f00ff00;
+	tile_c[px] = 0x7f0000ff;
+    }
+
+    for (px = 0; px < canvas_len; ++px)
+	canvas[px] = 0x0;
+
+    img_a = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, tile_a, WIDTH * 4);
+    img_b = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, tile_b, WIDTH * 4);
+    img_c = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, tile_c, WIDTH * 4);
+    out = pixman_image_create_bits (PIXMAN_a8r8g8b8, 3 * WIDTH, 2 * HEIGHT, canvas, 3 * WIDTH * 4);
+
+    /* the destination offsets make the three tiles partly overlap */
+    pixman_image_composite (PIXMAN_OP_SCREEN, img_a, NULL, out, 0, 0, 0, 0, WIDTH, HEIGHT / 4, WIDTH, HEIGHT);
+    pixman_image_composite (PIXMAN_OP_SCREEN, img_b, NULL, out, 0, 0, 0, 0, (WIDTH/2), HEIGHT / 4 + HEIGHT / 2, WIDTH, HEIGHT);
+    pixman_image_composite (PIXMAN_OP_SCREEN, img_c, NULL, out, 0, 0, 0, 0, (4 * WIDTH) / 3, HEIGHT, WIDTH, HEIGHT);
+
+    show_image (out);
+    return 0;
+}
diff --git a/demos/trap-test.c b/demos/trap-test.c
new file mode 100644
index 0000000..19295e7
--- /dev/null
+++ b/demos/trap-test.c
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+/* Rasterizes one trapezoid into an a8 mask and uses it to paint green over a white image. */
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+    pixman_image_t *src_img, *mask_img, *dest_img;
+    pixman_trap_t trap;
+    pixman_color_t green = { 0x0000, 0xffff, 0x0000, 0xffff };	/* red, green, blue, alpha */
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *mbits = malloc (WIDTH * HEIGHT);	/* one byte per a8 mask pixel */
+
+    memset (mbits, 0, WIDTH * HEIGHT);
+    memset (bits, 0xff, WIDTH * HEIGHT * 4);
+    
+    trap.top.l = pixman_int_to_fixed (50) + 0x8000;
+    trap.top.r = pixman_int_to_fixed (150) + 0x8000;
+    trap.top.y = pixman_int_to_fixed (30);
+
+    trap.bot.l = pixman_int_to_fixed (50) + 0x8000;
+    trap.bot.r = pixman_int_to_fixed (150) + 0x8000;
+    trap.bot.y = pixman_int_to_fixed (150);
+
+    mask_img = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, mbits, WIDTH);
+    src_img = pixman_image_create_solid_fill (&green);
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    
+    pixman_add_traps (mask_img, 0, 0, 1, &trap);
+
+    pixman_image_composite (PIXMAN_OP_OVER,
+			    src_img, mask_img, dest_img,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (mask_img);	/* the mask and its buffer are released like the others */
+    pixman_image_unref (dest_img);
+    free (mbits);
+    free (bits);
+    
+    return 0;
+}
diff --git a/demos/tri-test.c b/demos/tri-test.c
new file mode 100644
index 0000000..a71869a
--- /dev/null
+++ b/demos/tri-test.c
@@ -0,0 +1,48 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../test/utils.h"
+#include "gtk-utils.h"
+/* Composites four triangles sharing the vertex (100, 100) onto a row-ramp background and shows it. */
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+#define POINT(x,y)							\
+    { pixman_double_to_fixed ((x)), pixman_double_to_fixed ((y)) }
+    
+    pixman_image_t *src_img, *dest_img;
+    pixman_triangle_t tris[4] =
+    {
+	{ POINT (100, 100), POINT (10, 50), POINT (110, 10) },
+	{ POINT (100, 100), POINT (150, 10), POINT (200, 50) },
+	{ POINT (100, 100), POINT (10, 170), POINT (90, 175) },
+	{ POINT (100, 100), POINT (170, 150), POINT (120, 190) },
+    };
+    pixman_color_t color = { 0x4444, 0x4444, 0xffff, 0xffff };
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    int i;
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	bits[i] = (uint32_t) (i / HEIGHT) * 0x01010000;	/* unsigned: 128 and up would overflow int */
+    
+    src_img = pixman_image_create_solid_fill (&color);
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    
+    pixman_composite_triangles (PIXMAN_OP_ATOP_REVERSE,
+				src_img,
+				dest_img,
+				PIXMAN_a8,
+				200, 200,
+				-5, 5,
+				ARRAY_LENGTH (tris), tris);
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (bits);
+    
+    return 0;
+}
diff --git a/packaging/pixman.spec b/packaging/pixman.spec
new file mode 100644
index 0000000..b10c4dd
--- /dev/null
+++ b/packaging/pixman.spec
@@ -0,0 +1,62 @@
+
+Name:       pixman
+Summary:    Pixel manipulation library
+Version:    0.21.6
+Release:    1
+Group:      System/Libraries
+License:    MIT
+URL:        http://www.x.org/
+Source0:    http://xorg.freedesktop.org/archive/individual/lib/%{name}-%{version}.tar.gz
+Requires(post): /sbin/ldconfig
+Requires(postun): /sbin/ldconfig
+
+
+%description
+Description: %{summary}
+
+
+%package devel
+Summary:    Development components for the pixman library
+Group:      Development/Libraries
+Requires:   %{name} = %{version}-%{release}
+
+%description devel
+Description: %{summary}
+
+
+%prep
+%setup -q -n %{name}-%{version}
+
+%build
+
+%reconfigure
+make %{?jobs:-j%jobs}
+
+%install
+rm -rf %{buildroot}
+%make_install
+
+
+
+
+%post -p /sbin/ldconfig
+
+%postun -p /sbin/ldconfig
+
+
+
+
+
+%files
+%defattr(-,root,root,-)
+%{_libdir}/libpixman-1*.so.*
+
+
+%files devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/pixman-1
+%{_includedir}/pixman-1/pixman.h
+%{_includedir}/pixman-1/pixman-version.h
+%{_libdir}/libpixman-1*.so
+%{_libdir}/pkgconfig/pixman-1.pc
+
diff --git a/pixman-1-uninstalled.pc.in b/pixman-1-uninstalled.pc.in
new file mode 100644
index 0000000..e0347d0
--- /dev/null
+++ b/pixman-1-uninstalled.pc.in
@@ -0,0 +1,5 @@
+Name: Pixman
+Description: The pixman library (version 1)
+Version: @PACKAGE_VERSION@
+Cflags: -I${pc_top_builddir}/${pcfiledir}/pixman
+Libs: ${pc_top_builddir}/${pcfiledir}/pixman/libpixman-1.la
diff --git a/pixman-1.pc.in b/pixman-1.pc.in
new file mode 100644
index 0000000..936d95d
--- /dev/null
+++ b/pixman-1.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: Pixman
+Description: The pixman library (version 1)
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/pixman-1 @DEP_CFLAGS@
+Libs: -L${libdir} -lpixman-1 @DEP_LIBS@
+
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
new file mode 100644
index 0000000..286b7cf
--- /dev/null
+++ b/pixman/Makefile.am
@@ -0,0 +1,106 @@
+include $(top_srcdir)/pixman/Makefile.sources
+
+lib_LTLIBRARIES = libpixman-1.la
+
+libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined @PTHREAD_LDFLAGS@ 
+libpixman_1_la_LIBADD = @PTHREAD_LIBS@ @DEP_LIBS@ -lm
+libpixman_1_la_SOURCES = $(libpixman_sources) $(libpixman_headers)
+
+libpixmanincludedir = $(includedir)/pixman-1
+libpixmaninclude_HEADERS = pixman.h pixman-version.h
+noinst_LTLIBRARIES = 
+
+EXTRA_DIST =				\
+	Makefile.win32			\
+	make-combine.pl			\
+	pixman-combine.c.template	\
+	pixman-combine.h.template	\
+	pixman-region.c			\
+	solaris-hwcap.mapfile		\
+	$(NULL)
+
+DISTCLEANFILES = $(BUILT_SOURCES)
+
+# mmx code
+if USE_X86_MMX
+noinst_LTLIBRARIES += libpixman-mmx.la
+libpixman_mmx_la_SOURCES = \
+	pixman-mmx.c
+libpixman_mmx_la_CFLAGS = $(DEP_CFLAGS) $(MMX_CFLAGS)
+libpixman_mmx_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(MMX_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-mmx.la
+
+ASM_CFLAGS_mmx=$(MMX_CFLAGS)
+endif
+
+# vmx code
+if USE_VMX
+noinst_LTLIBRARIES += libpixman-vmx.la
+libpixman_vmx_la_SOURCES = \
+	pixman-vmx.c \
+	pixman-combine32.h
+libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
+libpixman_vmx_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-vmx.la
+
+ASM_CFLAGS_vmx=$(VMX_CFLAGS)
+endif
+
+# sse2 code
+if USE_SSE2
+noinst_LTLIBRARIES += libpixman-sse2.la
+libpixman_sse2_la_SOURCES = \
+	pixman-sse2.c
+libpixman_sse2_la_CFLAGS = $(DEP_CFLAGS) $(SSE2_CFLAGS)
+libpixman_sse2_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-sse2.la
+
+ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
+endif
+
+# arm simd code
+if USE_ARM_SIMD
+noinst_LTLIBRARIES += libpixman-arm-simd.la
+libpixman_arm_simd_la_SOURCES = \
+	pixman-arm-simd.c	\
+	pixman-arm-common.h	\
+	pixman-arm-simd-asm.S
+libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS)
+libpixman_arm_simd_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-arm-simd.la
+
+ASM_CFLAGS_arm_simd=
+endif
+
+# arm neon code
+if USE_ARM_NEON
+noinst_LTLIBRARIES += libpixman-arm-neon.la
+libpixman_arm_neon_la_SOURCES = \
+        pixman-arm-neon.c	\
+        pixman-arm-common.h	\
+        pixman-arm-neon-asm.S	\
+		pixman-arm-neon-asm-bilinear.S \
+        pixman-arm-neon-asm.h
+libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS)
+libpixman_arm_neon_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-arm-neon.la
+
+ASM_CFLAGS_arm_neon=
+endif
+
+# iwmmxt code
+if USE_ARM_IWMMXT
+noinst_LTLIBRARIES += libpixman-iwmmxt.la
+libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
+libpixman_iwmmxt_la_CFLAGS = $(DEP_CFLAGS) $(IWMMXT_CFLAGS)
+libpixman_iwmmxt_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(IWMMXT_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-iwmmxt.la
+
+ASM_CFLAGS_IWMMXT=$(IWMMXT_CFLAGS)
+endif
+
+.c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES)
+	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/pixman/Makefile.sources b/pixman/Makefile.sources
new file mode 100644
index 0000000..ca3f001
--- /dev/null
+++ b/pixman/Makefile.sources
@@ -0,0 +1,55 @@
+libpixman_sources =			\
+	pixman.c			\
+	pixman-access.c			\
+	pixman-access-accessors.c	\
+	pixman-bits-image.c		\
+	pixman-combine32.c		\
+	pixman-combine64.c		\
+	pixman-conical-gradient.c	\
+	pixman-cpu.c			\
+	pixman-edge.c			\
+	pixman-edge-accessors.c		\
+	pixman-fast-path.c		\
+	pixman-general.c		\
+	pixman-gradient-walker.c	\
+	pixman-image.c			\
+	pixman-implementation.c		\
+	pixman-linear-gradient.c	\
+	pixman-matrix.c			\
+	pixman-noop.c			\
+	pixman-radial-gradient.c	\
+	pixman-region16.c		\
+	pixman-region32.c		\
+	pixman-solid-fill.c		\
+	pixman-timer.c			\
+	pixman-trap.c			\
+	pixman-utils.c			\
+	$(NULL)
+
+libpixman_headers =			\
+	pixman.h			\
+	pixman-accessor.h		\
+	pixman-combine32.h		\
+	pixman-combine64.h		\
+	pixman-compiler.h		\
+	pixman-edge-imp.h		\
+	pixman-inlines.h		\
+	pixman-private.h		\
+	$(NULL)
+
+BUILT_SOURCES =				\
+	pixman-combine32.c		\
+	pixman-combine32.h		\
+	pixman-combine64.c		\
+	pixman-combine64.h		\
+	$(NULL)
+
+pixman-combine32.c: pixman-combine.c.template make-combine.pl
+	$(PERL) $(lastword $+) 8 < $< > $@ || ($(RM) $@; exit 1)
+pixman-combine32.h: pixman-combine.h.template make-combine.pl
+	$(PERL) $(lastword $+) 8 < $< > $@ || ($(RM) $@; exit 1)
+
+pixman-combine64.c: pixman-combine.c.template make-combine.pl
+	$(PERL) $(lastword $+) 16 < $< > $@ || ($(RM) $@; exit 1)
+pixman-combine64.h: pixman-combine.h.template make-combine.pl
+	$(PERL) $(lastword $+) 16 < $< > $@ || ($(RM) $@; exit 1)
diff --git a/pixman/Makefile.win32 b/pixman/Makefile.win32
new file mode 100644
index 0000000..381f2cd
--- /dev/null
+++ b/pixman/Makefile.win32
@@ -0,0 +1,66 @@
+default: all
+
+top_srcdir = ..
+include $(top_srcdir)/pixman/Makefile.sources
+include $(top_srcdir)/Makefile.win32.common
+
+MMX_VAR = $(MMX)
+ifeq ($(MMX_VAR),)
+MMX_VAR=on
+endif
+
+SSE2_VAR = $(SSE2)
+ifeq ($(SSE2_VAR),)
+SSE2_VAR=on
+endif
+
+MMX_CFLAGS = -DUSE_X86_MMX -w14710 -w14714
+SSE2_CFLAGS = -DUSE_SSE2
+
+# MMX compilation flags
+ifeq ($(MMX_VAR),on)
+PIXMAN_CFLAGS += $(MMX_CFLAGS)
+libpixman_sources += pixman-mmx.c
+endif
+
+# SSE2 compilation flags
+ifeq ($(SSE2_VAR),on)
+PIXMAN_CFLAGS += $(SSE2_CFLAGS)
+libpixman_sources += pixman-sse2.c
+endif
+
+OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libpixman_sources))
+
+# targets
+all: inform informMMX informSSE2 $(CFG_VAR)/$(LIBRARY).lib
+
+informMMX:
+ifneq ($(MMX),off)
+ifneq ($(MMX),on)
+ifneq ($(MMX),)
+	@echo "Invalid specified MMX option : "$(MMX_VAR)"."
+	@echo
+	@echo "Possible choices for MMX are 'on' or 'off'"
+	@exit 1
+endif
+	@echo "Setting MMX flag to default value 'on'... (use MMX=on or MMX=off)"
+endif
+endif
+
+informSSE2:
+ifneq ($(SSE2),off)
+ifneq ($(SSE2),on)
+ifneq ($(SSE2),)
+	@echo "Invalid specified SSE option : "$(SSE2)"."
+	@echo
+	@echo "Possible choices for SSE2 are 'on' or 'off'"
+	@exit 1
+endif
+	@echo "Setting SSE2 flag to default value 'on'... (use SSE2=on or SSE2=off)"
+endif
+endif
+
+
+# pixman linking
+$(CFG_VAR)/$(LIBRARY).lib: $(OBJECTS)
+	@$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
diff --git a/pixman/make-combine.pl b/pixman/make-combine.pl
new file mode 100644
index 0000000..210a5da
--- /dev/null
+++ b/pixman/make-combine.pl
@@ -0,0 +1,86 @@
+$usage = "Usage: make-combine.pl { 8 | 16 } < pixman-combine.c.template";
+# Exactly one argument is accepted: the component size in bits.
+$#ARGV == 0 or die $usage;
+
+# Get the component size; only 8 and 16 are supported.
+$size = int($ARGV[0]);
+$size == 8 or $size == 16 or die $usage;
+
+$pixel_size = $size * 4;
+$half_pixel_size = $size * 2;
+
+sub mask {
+    # Turn a string of hex digits into a C literal.  Component sizes
+    # above 8 get the ULL suffix appended to the literal.
+    my ($digits) = @_;
+    my $suffix = $size > 8 ? "ULL" : "";
+    return "0x" . $digits . $suffix;
+}
+
+# Generate mask strings.
+$nibbles = $size / 4;
+$mask = "f" x $nibbles;
+$zero_mask = "0" x $nibbles;
+$one_half = "8" . "0" x ($nibbles - 1);
+
+print "/* WARNING: This file is generated by make-combine.pl from a template.\n";
+print "   Please edit one of those files rather than this one. */\n";
+print "\n";
+
+print "#line 1 \"pixman-combine.c.template\"\n";
+
+$mask_ = mask($mask);
+$one_half_ = mask($one_half);
+$g_mask = mask($mask . $zero_mask);
+$b_mask = mask($mask . $zero_mask x 2);
+$a_mask = mask($mask . $zero_mask x 3);
+$rb_mask = mask($mask . $zero_mask . $mask);
+$ag_mask = mask($mask . $zero_mask . $mask . $zero_mask);
+$rb_one_half = mask($one_half . $zero_mask . $one_half);
+$rb_mask_plus_one = mask("1" . $zero_mask x 2 . "1" .  $zero_mask);
+
+while (<STDIN>) {
+    # Mask and 1/2 value for a single component.
+    s/#define COMPONENT_SIZE\b/$& $size/;
+    s/#define MASK\b/$& $mask_/;
+    s/#define ONE_HALF\b/$& $one_half_/;
+
+    # Shifts and masks for green, red, and alpha; products are parenthesized.
+    s/#define G_SHIFT\b/$& $size/;
+    s/#define R_SHIFT\b/$& ($size * 2)/;
+    s/#define A_SHIFT\b/$& ($size * 3)/;
+    s/#define G_MASK\b/$& $g_mask/;
+    s/#define R_MASK\b/$& $b_mask/;  # despite its name, $b_mask is the mask at R_SHIFT
+    s/#define A_MASK\b/$& $a_mask/;
+
+    # Special values for dealing with red + blue at the same time.
+    s/#define RB_MASK\b/$& $rb_mask/;
+    s/#define AG_MASK\b/$& $ag_mask/;
+    s/#define RB_ONE_HALF\b/$& $rb_one_half/;
+    s/#define RB_MASK_PLUS_ONE\b/$& $rb_mask_plus_one/;
+
+    # Add 32/64 suffix to combining function types.
+    s/\bCombineFunc\b/CombineFunc$pixel_size/;
+    s/\bFbComposeFunctions\b/FbComposeFunctions$pixel_size/;
+    s/combine_width/combine_$pixel_size/;
+    s/_pixman_setup_combiner_functions_width/_pixman_setup_combiner_functions_$pixel_size/;
+    s/UNc/UN$size/g;
+    s/ALPHA_c/ALPHA_$size/g;
+    s/RED_c/RED_$size/g;
+    s/GREEN_c/GREEN_$size/g;
+    s/BLUE_c/BLUE_$size/g;
+
+    # Convert comp*_t values into the appropriate real types.
+    s/comp1_t/uint${size}_t/g;
+    s/comp2_t/uint${half_pixel_size}_t/g;
+    s/comp4_t/uint${pixel_size}_t/g;
+
+    # Change the function table name for the 64-bit version.
+    s/pixman_composeFunctions/pixman_composeFunctions64/ if $size == 16;
+
+    # Point header references at the 32- or 64-bit header (literal dot).
+    s/pixman-combine\.h/pixman-combine64.h/ if $size == 16;
+    s/pixman-combine\.h/pixman-combine32.h/ if $size == 8;
+
+    print;
+}
diff --git a/pixman/pixman-access-accessors.c b/pixman/pixman-access-accessors.c
new file mode 100644
index 0000000..3263582
--- /dev/null
+++ b/pixman/pixman-access-accessors.c
@@ -0,0 +1,3 @@
+#define PIXMAN_FB_ACCESSORS
+
+#include "pixman-access.c"
diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c
new file mode 100644
index 0000000..189b191
--- /dev/null
+++ b/pixman/pixman-access.c
@@ -0,0 +1,1226 @@
+/*
+ *
+ * Copyright Â© 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pixman-private.h"
+#include "pixman-accessor.h"
+
+#define CONVERT_RGB24_TO_Y15(s)						\
+    (((((s) >> 16) & 0xff) * 153 +					\
+      (((s) >>  8) & 0xff) * 301 +					\
+      (((s)      ) & 0xff) * 58) >> 2)
+
+#define CONVERT_RGB24_TO_RGB15(s)                                       \
+    ((((s) >> 3) & 0x001f) |                                            \
+     (((s) >> 6) & 0x03e0) |                                            \
+     (((s) >> 9) & 0x7c00))
+
+#define RGB15_TO_ENTRY(mif,rgb15)					\
+    ((mif)->ent[rgb15])
+
+#define RGB24_TO_ENTRY(mif,rgb24)					\
+    RGB15_TO_ENTRY (mif,CONVERT_RGB24_TO_RGB15 (rgb24))
+
+#define RGB24_TO_ENTRY_Y(mif,rgb24)					\
+    ((mif)->ent[CONVERT_RGB24_TO_Y15 (rgb24)])
+
+/* Fetch macros */
+
+#ifdef WORDS_BIGENDIAN
+#define FETCH_1(img,l,o)						\
+    (((READ ((img), ((uint32_t *)(l)) + ((o) >> 5))) >> (0x1f - ((o) & 0x1f))) & 0x1)
+#else
+#define FETCH_1(img,l,o)						\
+    ((((READ ((img), ((uint32_t *)(l)) + ((o) >> 5))) >> ((o) & 0x1f))) & 0x1)
+#endif
+
+#define FETCH_8(img,l,o)    (READ (img, (((uint8_t *)(l)) + ((o) >> 3))))
+
+#ifdef WORDS_BIGENDIAN
+#define FETCH_4(img,l,o)						\
+    (((4 * (o)) & 4) ? (FETCH_8 (img,l, 4 * (o)) & 0xf) : (FETCH_8 (img,l,(4 * (o))) >> 4))
+#else
+#define FETCH_4(img,l,o)						\
+    (((4 * (o)) & 4) ? (FETCH_8 (img, l, 4 * (o)) >> 4) : (FETCH_8 (img, l, (4 * (o))) & 0xf))
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define FETCH_24(img,l,o)                                              \
+    ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16)    |       \
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)     |       \
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0))
+#else
+#define FETCH_24(img,l,o)						\
+    ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0)	|	\
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)	|	\
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16))
+#endif
+
+/* Store macros */
+
+#ifdef WORDS_BIGENDIAN
+#define STORE_1(img,l,o,v)						\
+    do									\
+    {									\
+	uint32_t  *__d = ((uint32_t *)(l)) + ((o) >> 5);		\
+	uint32_t __m, __v;						\
+	/* 1U: shifting a signed 1 into bit 31 would be undefined */	\
+	__m = 1U << (0x1f - ((o) & 0x1f));				\
+	__v = (v)? __m : 0;						\
+									\
+	WRITE((img), __d, (READ((img), __d) & ~__m) | __v);		\
+    }									\
+    while (0)
+#else
+#define STORE_1(img,l,o,v)						\
+    do									\
+    {									\
+	uint32_t  *__d = ((uint32_t *)(l)) + ((o) >> 5);		\
+	uint32_t __m, __v;						\
+	/* 1U: shifting a signed 1 into bit 31 would be undefined */	\
+	__m = 1U << ((o) & 0x1f);					\
+	__v = (v)? __m : 0;						\
+									\
+	WRITE((img), __d, (READ((img), __d) & ~__m) | __v);		\
+    }									\
+    while (0)
+#endif
+
+#define STORE_8(img,l,o,v)  (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v)))
+
+#ifdef WORDS_BIGENDIAN
+#define STORE_4(img,l,o,v)						\
+    do									\
+    {									\
+	int bo = 4 * (o);						\
+	int v4 = (v) & 0x0f;						\
+									\
+	STORE_8 (img, l, bo, (						\
+		     bo & 4 ?						\
+		     (FETCH_8 (img, l, bo) & 0xf0) | (v4) :		\
+		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4)));	\
+    } while (0)
+#else
+#define STORE_4(img,l,o,v)						\
+    do									\
+    {									\
+	int bo = 4 * (o);						\
+	int v4 = (v) & 0x0f;						\
+									\
+	STORE_8 (img, l, bo, (						\
+		     bo & 4 ?						\
+		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4) :	\
+		     (FETCH_8 (img, l, bo) & 0xf0) | (v4)));		\
+    } while (0)
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define STORE_24(img,l,o,v)                                            \
+    do                                                                 \
+    {                                                                  \
+	uint8_t *__tmp = (l) + 3 * (o);				       \
+        							       \
+	WRITE ((img), __tmp++, ((v) & 0x00ff0000) >> 16);	       \
+	WRITE ((img), __tmp++, ((v) & 0x0000ff00) >>  8);	       \
+	WRITE ((img), __tmp++, ((v) & 0x000000ff) >>  0);	       \
+    }                                                                  \
+    while (0)
+#else
+#define STORE_24(img,l,o,v)                                            \
+    do                                                                 \
+    {                                                                  \
+	uint8_t *__tmp = (l) + 3 * (o);				       \
+        							       \
+	WRITE ((img), __tmp++, ((v) & 0x000000ff) >>  0);	       \
+	WRITE ((img), __tmp++, ((v) & 0x0000ff00) >>  8);	       \
+	WRITE ((img), __tmp++, ((v) & 0x00ff0000) >> 16);	       \
+    }								       \
+    while (0)
+#endif
+
+/*
+ * YV12 setup and access macros
+ */
+
+#define YV12_SETUP(image)                                               \
+    bits_image_t *__bits_image = (bits_image_t *)image;                 \
+    uint32_t *bits = __bits_image->bits;                                \
+    int stride = __bits_image->rowstride;                               \
+    int offset0 = stride < 0 ?                                          \
+    ((-stride) >> 1) * ((__bits_image->height - 1) >> 1) - stride :	\
+    stride * __bits_image->height;					\
+    int offset1 = stride < 0 ?                                          \
+    offset0 + ((-stride) >> 1) * ((__bits_image->height) >> 1) :	\
+	offset0 + (offset0 >> 2)
+
+/* Note no trailing semicolon on the above macro; if it's there, then
+ * the typical usage of YV12_SETUP(image); will have an extra trailing ;
+ * that some compilers will interpret as a statement -- and then any further
+ * variable declarations will cause an error.
+ */
+
+#define YV12_Y(line)                                                    \
+    ((uint8_t *) ((bits) + (stride) * (line)))
+
+#define YV12_U(line)                                                    \
+    ((uint8_t *) ((bits) + offset1 +                                    \
+                  ((stride) >> 1) * ((line) >> 1)))
+
+#define YV12_V(line)                                                    \
+    ((uint8_t *) ((bits) + offset0 +                                    \
+                  ((stride) >> 1) * ((line) >> 1)))
+
+/* Misc. helpers */
+/* get_shifts: store in *a, *r, *g, *b each channel's bit offset for format. */
+static force_inline void
+get_shifts (pixman_format_code_t  format,
+	    int			 *a,
+	    int			 *r,
+	    int                  *g,
+	    int                  *b)
+{
+    switch (PIXMAN_FORMAT_TYPE (format))
+    {
+    case PIXMAN_TYPE_A:
+	*b = 0;
+	*g = 0;
+	*r = 0;
+	*a = 0;
+	break;
+
+    case PIXMAN_TYPE_ARGB:
+	*b = 0;
+	*g = *b + PIXMAN_FORMAT_B (format);
+	*r = *g + PIXMAN_FORMAT_G (format);
+	*a = *r + PIXMAN_FORMAT_R (format);
+	break;
+
+    case PIXMAN_TYPE_ABGR:
+	*r = 0;
+	*g = *r + PIXMAN_FORMAT_R (format);
+	*b = *g + PIXMAN_FORMAT_G (format);
+	*a = *b + PIXMAN_FORMAT_B (format);
+	break;
+
+    case PIXMAN_TYPE_BGRA:
+	/* Count down from the high end: each channel sits below the previous */
+	*b = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_B (format);
+	*g = *b - PIXMAN_FORMAT_G (format);
+	*r = *g - PIXMAN_FORMAT_R (format);
+	*a = *r - PIXMAN_FORMAT_A (format);
+	break;
+
+    case PIXMAN_TYPE_RGBA:
+	/* With RGBA formats we also start counting at the high end of the pixel */
+	*r = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_R (format);
+	*g = *r - PIXMAN_FORMAT_G (format);
+	*b = *g - PIXMAN_FORMAT_B (format);
+	*a = *b - PIXMAN_FORMAT_A (format);
+	break;
+
+    default:
+	assert (0);
+	break;
+    }
+}
+
+static force_inline uint32_t
+convert_channel (uint32_t pixel, uint32_t def_value,
+		 int n_from_bits, int from_shift,
+		 int n_to_bits, int to_shift)
+{
+    uint32_t channel = 0;
+
+    /* No destination bits: stays 0.  No source bits: use def_value.
+     * Otherwise convert the shifted source via unorm_to_unorm. */
+    if (n_to_bits && n_from_bits)
+	channel = unorm_to_unorm (pixel >> from_shift, n_from_bits, n_to_bits);
+    else if (n_to_bits)
+	channel = def_value;
+
+    return (channel & ((1 << n_to_bits) - 1)) << to_shift;
+}
+
+static force_inline uint32_t
+convert_pixel (pixman_format_code_t from, pixman_format_code_t to, uint32_t pixel)
+{
+    int a_from_shift, r_from_shift, g_from_shift, b_from_shift;
+    int a_to_shift, r_to_shift, g_to_shift, b_to_shift;
+    uint32_t a, r, g, b;
+
+    get_shifts (from, &a_from_shift, &r_from_shift, &g_from_shift, &b_from_shift);
+    get_shifts (to, &a_to_shift, &r_to_shift, &g_to_shift, &b_to_shift);
+
+    a = convert_channel (pixel, ~0,
+			 PIXMAN_FORMAT_A (from), a_from_shift,
+			 PIXMAN_FORMAT_A (to), a_to_shift);
+
+    r = convert_channel (pixel, 0,
+			 PIXMAN_FORMAT_R (from), r_from_shift,
+			 PIXMAN_FORMAT_R (to), r_to_shift);
+
+    g = convert_channel (pixel, 0,
+			 PIXMAN_FORMAT_G (from), g_from_shift,
+			 PIXMAN_FORMAT_G (to), g_to_shift);
+
+    b = convert_channel (pixel, 0,
+			 PIXMAN_FORMAT_B (from), b_from_shift,
+			 PIXMAN_FORMAT_B (to), b_to_shift);
+
+    return a | r | g | b;
+}
+
+static force_inline uint32_t
+convert_pixel_to_a8r8g8b8 (pixman_image_t *image,
+			   pixman_format_code_t format,
+			   uint32_t pixel)
+{
+    int type = PIXMAN_FORMAT_TYPE (format);
+
+    /* Gray and color (indexed) pixels are looked up in the image's
+     * indexed->rgba table; all other types go through convert_pixel.
+     */
+    if (type == PIXMAN_TYPE_COLOR || type == PIXMAN_TYPE_GRAY)
+	return image->bits.indexed->rgba[pixel];
+
+    return convert_pixel (format, PIXMAN_a8r8g8b8, pixel);
+}
+
+static force_inline uint32_t
+convert_pixel_from_a8r8g8b8 (pixman_image_t *image,
+			     pixman_format_code_t format, uint32_t pixel)
+{
+    /* Gray and color types map through the image's indexed->ent table,
+     * using a 15-bit index; other types are converted by convert_pixel.
+     */
+    switch (PIXMAN_FORMAT_TYPE (format))
+    {
+    case PIXMAN_TYPE_GRAY:
+	pixel = CONVERT_RGB24_TO_Y15 (pixel);
+	return image->bits.indexed->ent[pixel & 0x7fff];
+
+    case PIXMAN_TYPE_COLOR:
+	pixel = convert_pixel (PIXMAN_a8r8g8b8, PIXMAN_x1r5g5b5, pixel);
+	return image->bits.indexed->ent[pixel & 0x7fff];
+
+    default:
+	return convert_pixel (PIXMAN_a8r8g8b8, format, pixel);
+    }
+}
+
+static force_inline uint32_t
+fetch_and_convert_pixel (pixman_image_t	*	image,
+			 const uint8_t *	bits,
+			 int			offset,
+			 pixman_format_code_t	format)
+{
+    uint32_t pixel;
+
+    switch (PIXMAN_FORMAT_BPP (format))
+    {
+    case 1:
+	pixel = FETCH_1 (image, bits, offset);
+	break;
+
+    case 4:
+	pixel = FETCH_4 (image, bits, offset);
+	break;
+
+    case 8:
+	pixel = READ (image, bits + offset);
+	break;
+
+    case 16:
+	pixel = READ (image, ((uint16_t *)bits + offset));
+	break;
+
+    case 24:
+	pixel = FETCH_24 (image, bits, offset);
+	break;
+
+    case 32:
+	pixel = READ (image, ((uint32_t *)bits + offset));
+	break;
+
+    default:
+	pixel = 0xffff00ff; /* As ugly as possible to detect the bug */
+	break;
+    }
+
+    return convert_pixel_to_a8r8g8b8 (image, format, pixel);
+}
+
+static force_inline void
+convert_and_store_pixel (bits_image_t *		image,
+			 uint8_t *		dest,
+			 int                    offset,
+			 pixman_format_code_t	format,
+			 uint32_t		pixel)
+{
+    uint32_t converted = convert_pixel_from_a8r8g8b8 (
+	(pixman_image_t *)image, format, pixel);
+
+    switch (PIXMAN_FORMAT_BPP (format))
+    {
+    case 1:
+	STORE_1 (image, dest, offset, converted & 0x01);
+	break;
+
+    case 4:
+	STORE_4 (image, dest, offset, converted & 0xf);
+	break;
+
+    case 8:
+	WRITE (image, (dest + offset), converted & 0xff);
+	break;
+
+    case 16:
+	WRITE (image, ((uint16_t *)dest + offset), converted & 0xffff);
+	break;
+
+    case 24:
+	STORE_24 (image, dest, offset, converted);
+	break;
+
+    case 32:
+	WRITE (image, ((uint32_t *)dest + offset), converted);
+	break;
+
+    default:
+	*dest = 0x0;
+	break;
+    }
+}
+
+#define MAKE_ACCESSORS(format)						\
+    static void								\
+    fetch_scanline_ ## format (pixman_image_t *image,			\
+			       int	       x,			\
+			       int             y,			\
+			       int             width,			\
+			       uint32_t *      buffer,			\
+			       const uint32_t *mask)			\
+    {									\
+	uint8_t *bits =							\
+	    (uint8_t *)(image->bits.bits + y * image->bits.rowstride);	\
+	int i;								\
+									\
+	for (i = 0; i < width; ++i)					\
+	{								\
+	    *buffer++ =							\
+		fetch_and_convert_pixel (image, bits, x + i, PIXMAN_ ## format); \
+	}								\
+    }									\
+									\
+    static void								\
+    store_scanline_ ## format (bits_image_t *  image,			\
+			       int             x,			\
+			       int             y,			\
+			       int             width,			\
+			       const uint32_t *values)			\
+    {									\
+	uint8_t *dest =							\
+	    (uint8_t *)(image->bits + y * image->rowstride);		\
+	int i;								\
+									\
+	for (i = 0; i < width; ++i)					\
+	{								\
+	    convert_and_store_pixel (					\
+		image, dest, i + x, PIXMAN_ ## format, values[i]);	\
+	}								\
+    }									\
+									\
+    static uint32_t							\
+    fetch_pixel_ ## format (bits_image_t *image,			\
+			    int		offset,				\
+			    int		line)				\
+    {									\
+	uint8_t *bits =							\
+	    (uint8_t *)(image->bits + line * image->rowstride);		\
+									\
+	return fetch_and_convert_pixel ((pixman_image_t *)image,	\
+					bits, offset, PIXMAN_ ## format); \
+    }									\
+									\
+    static const void *const __dummy__ ## format
+
+MAKE_ACCESSORS(a8r8g8b8);
+MAKE_ACCESSORS(x8r8g8b8);
+MAKE_ACCESSORS(a8b8g8r8);
+MAKE_ACCESSORS(x8b8g8r8);
+MAKE_ACCESSORS(x14r6g6b6);
+MAKE_ACCESSORS(b8g8r8a8);
+MAKE_ACCESSORS(b8g8r8x8);
+MAKE_ACCESSORS(r8g8b8x8);
+MAKE_ACCESSORS(r8g8b8a8);
+MAKE_ACCESSORS(r8g8b8);
+MAKE_ACCESSORS(b8g8r8);
+MAKE_ACCESSORS(r5g6b5);
+MAKE_ACCESSORS(b5g6r5);
+MAKE_ACCESSORS(a1r5g5b5);
+MAKE_ACCESSORS(x1r5g5b5);
+MAKE_ACCESSORS(a1b5g5r5);
+MAKE_ACCESSORS(x1b5g5r5);
+MAKE_ACCESSORS(a4r4g4b4);
+MAKE_ACCESSORS(x4r4g4b4);
+MAKE_ACCESSORS(a4b4g4r4);
+MAKE_ACCESSORS(x4b4g4r4);
+MAKE_ACCESSORS(a8);
+MAKE_ACCESSORS(c8);
+MAKE_ACCESSORS(g8);
+MAKE_ACCESSORS(r3g3b2);
+MAKE_ACCESSORS(b2g3r3);
+MAKE_ACCESSORS(a2r2g2b2);
+MAKE_ACCESSORS(a2b2g2r2);
+MAKE_ACCESSORS(x4a4);
+MAKE_ACCESSORS(a4);
+MAKE_ACCESSORS(g4);
+MAKE_ACCESSORS(c4);
+MAKE_ACCESSORS(r1g2b1);
+MAKE_ACCESSORS(b1g2r1);
+MAKE_ACCESSORS(a1r1g1b1);
+MAKE_ACCESSORS(a1b1g1r1);
+MAKE_ACCESSORS(a1);
+MAKE_ACCESSORS(g1);
+
+/********************************** Fetch ************************************/
+
+/* Expects a uint64_t buffer */
+static void
+fetch_scanline_a2r10g10b10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t a = p >> 30;
+	uint64_t r = (p >> 20) & 0x3ff;
+	uint64_t g = (p >> 10) & 0x3ff;
+	uint64_t b = p & 0x3ff;
+
+	r = r << 6 | r >> 4;
+	g = g << 6 | g >> 4;
+	b = b << 6 | b >> 4;
+
+	a <<= 14;
+	a |= a >> 2;
+	a |= a >> 4;
+	a |= a >> 8;
+
+	*buffer++ = a << 48 | r << 32 | g << 16 | b;
+    }
+}
+
+/* Expects a uint64_t buffer */
+static void
+fetch_scanline_x2r10g10b10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t r = (p >> 20) & 0x3ff;
+	uint64_t g = (p >> 10) & 0x3ff;
+	uint64_t b = p & 0x3ff;
+	
+	r = r << 6 | r >> 4;
+	g = g << 6 | g >> 4;
+	b = b << 6 | b >> 4;
+	
+	*buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b;
+    }
+}
+
+/* Expects a uint64_t buffer */
+static void
+fetch_scanline_a2b10g10r10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t a = p >> 30;
+	uint64_t b = (p >> 20) & 0x3ff;
+	uint64_t g = (p >> 10) & 0x3ff;
+	uint64_t r = p & 0x3ff;
+	
+	r = r << 6 | r >> 4;
+	g = g << 6 | g >> 4;
+	b = b << 6 | b >> 4;
+	
+	a <<= 14;
+	a |= a >> 2;
+	a |= a >> 4;
+	a |= a >> 8;
+
+	*buffer++ = a << 48 | r << 32 | g << 16 | b;
+    }
+}
+
+/* Expects a uint64_t buffer */
+static void
+fetch_scanline_x2b10g10r10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+    
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t b = (p >> 20) & 0x3ff;
+	uint64_t g = (p >> 10) & 0x3ff;
+	uint64_t r = p & 0x3ff;
+	
+	r = r << 6 | r >> 4;
+	g = g << 6 | g >> 4;
+	b = b << 6 | b >> 4;
+	
+	*buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b;
+    }
+}
+
+static void
+fetch_scanline_yuy2 (pixman_image_t *image,
+                     int             x,
+                     int             line,
+                     int             width,
+                     uint32_t *      buffer,
+                     const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + image->bits.rowstride * line;
+    int i;
+    
+    for (i = 0; i < width; i++)
+    {
+	int16_t y, u, v;
+	int32_t r, g, b;
+	
+	y = ((uint8_t *) bits)[(x + i) << 1] - 16;
+	u = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 1] - 128;
+	v = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 3] - 128;
+	
+	/* R = 1.164(Y - 16) + 1.596(V - 128) */
+	r = 0x012b27 * y + 0x019a2e * v;
+	/* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+	g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+	/* B = 1.164(Y - 16) + 2.018(U - 128) */
+	b = 0x012b27 * y + 0x0206a2 * u;
+	
+	*buffer++ = 0xff000000 |
+	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+    }
+}
+
+static void
+fetch_scanline_yv12 (pixman_image_t *image,
+                     int             x,
+                     int             line,
+                     int             width,
+                     uint32_t *      buffer,
+                     const uint32_t *mask)
+{
+    YV12_SETUP (image);
+    uint8_t *y_line = YV12_Y (line);
+    uint8_t *u_line = YV12_U (line);
+    uint8_t *v_line = YV12_V (line);
+    int i;
+    
+    for (i = 0; i < width; i++)
+    {
+	int16_t y, u, v;
+	int32_t r, g, b;
+
+	y = y_line[x + i] - 16;
+	u = u_line[(x + i) >> 1] - 128;
+	v = v_line[(x + i) >> 1] - 128;
+
+	/* R = 1.164(Y - 16) + 1.596(V - 128) */
+	r = 0x012b27 * y + 0x019a2e * v;
+	/* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+	g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+	/* B = 1.164(Y - 16) + 2.018(U - 128) */
+	b = 0x012b27 * y + 0x0206a2 * u;
+
+	*buffer++ = 0xff000000 |
+	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+    }
+}
+
+/**************************** Pixel wise fetching *****************************/
+
+/* Returns the pixel expanded to 16 bits per channel, packed as a:r:g:b */
+static uint64_t
+fetch_pixel_a2r10g10b10 (bits_image_t *image,
+			 int		  offset,
+			 int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t a = p >> 30;
+    uint64_t r = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t b = p & 0x3ff;
+
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+
+    a <<= 14;
+    a |= a >> 2;
+    a |= a >> 4;
+    a |= a >> 8;
+
+    return a << 48 | r << 32 | g << 16 | b;
+}
+
+/* Returns the pixel at 16 bits per channel, with alpha forced to 0xffff */
+static uint64_t
+fetch_pixel_x2r10g10b10 (bits_image_t *image,
+			 int	   offset,
+			 int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t r = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t b = p & 0x3ff;
+    
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+    
+    return 0xffffULL << 48 | r << 32 | g << 16 | b;
+}
+
+/* Returns the pixel expanded to 16 bits per channel, packed as a:r:g:b */
+static uint64_t
+fetch_pixel_a2b10g10r10 (bits_image_t *image,
+			 int           offset,
+			 int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t a = p >> 30;
+    uint64_t b = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t r = p & 0x3ff;
+    
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+    
+    a <<= 14;
+    a |= a >> 2;
+    a |= a >> 4;
+    a |= a >> 8;
+    
+    return a << 48 | r << 32 | g << 16 | b;
+}
+
+/* Returns the pixel at 16 bits per channel, with alpha forced to 0xffff */
+static uint64_t
+fetch_pixel_x2b10g10r10 (bits_image_t *image,
+			 int           offset,
+			 int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t b = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t r = p & 0x3ff;
+    
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+    
+    return 0xffffULL << 48 | r << 32 | g << 16 | b;
+}
+
+static uint32_t
+fetch_pixel_yuy2 (bits_image_t *image,
+		  int           offset,
+		  int           line)
+{
+    const uint32_t *bits = image->bits + image->rowstride * line;
+    
+    int16_t y, u, v;
+    int32_t r, g, b;
+    
+    y = ((uint8_t *) bits)[offset << 1] - 16;
+    u = ((uint8_t *) bits)[((offset << 1) & - 4) + 1] - 128;
+    v = ((uint8_t *) bits)[((offset << 1) & - 4) + 3] - 128;
+    
+    /* R = 1.164(Y - 16) + 1.596(V - 128) */
+    r = 0x012b27 * y + 0x019a2e * v;
+    
+    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+    
+    /* B = 1.164(Y - 16) + 2.018(U - 128) */
+    b = 0x012b27 * y + 0x0206a2 * u;
+    
+    return 0xff000000 |
+	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+}
+
+static uint32_t
+fetch_pixel_yv12 (bits_image_t *image,
+		  int           offset,
+		  int           line)
+{
+    YV12_SETUP (image);
+    int16_t y = YV12_Y (line)[offset] - 16;
+    int16_t u = YV12_U (line)[offset >> 1] - 128;
+    int16_t v = YV12_V (line)[offset >> 1] - 128;
+    int32_t r, g, b;
+    
+    /* R = 1.164(Y - 16) + 1.596(V - 128) */
+    r = 0x012b27 * y + 0x019a2e * v;
+    
+    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+    
+    /* B = 1.164(Y - 16) + 2.018(U - 128) */
+    b = 0x012b27 * y + 0x0206a2 * u;
+    
+    return 0xff000000 |
+	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+}
+
+/*********************************** Store ************************************/
+
+static void
+store_scanline_a2r10g10b10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint32_t *pixel = bits + x;
+    uint64_t *values = (uint64_t *)v;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+	WRITE (image, pixel++,
+	       ((values[i] >> 32) & 0xc0000000) |
+	       ((values[i] >> 18) & 0x3ff00000) |
+	       ((values[i] >> 12) & 0xffc00) | 
+	       ((values[i] >> 6) & 0x3ff));    
+    }
+}
+
+static void
+store_scanline_x2r10g10b10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint64_t *values = (uint64_t *)v;
+    uint32_t *pixel = bits + x;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+	WRITE (image, pixel++,
+	       ((values[i] >> 18) & 0x3ff00000) | 
+	       ((values[i] >> 12) & 0xffc00) |
+	       ((values[i] >> 6) & 0x3ff));
+    }
+}
+
+static void
+store_scanline_a2b10g10r10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint32_t *pixel = bits + x;
+    uint64_t *values = (uint64_t *)v;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+	WRITE (image, pixel++,
+	       ((values[i] >> 32) & 0xc0000000) |
+	       ((values[i] >> 38) & 0x3ff) |
+	       ((values[i] >> 12) & 0xffc00) |
+	       ((values[i] << 14) & 0x3ff00000));
+    }
+}
+
+static void
+store_scanline_x2b10g10r10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint64_t *values = (uint64_t *)v;
+    uint32_t *pixel = bits + x;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+	WRITE (image, pixel++,
+	       ((values[i] >> 38) & 0x3ff) |
+	       ((values[i] >> 12) & 0xffc00) |
+	       ((values[i] << 14) & 0x3ff00000));
+    }
+}
+
+/*
+ * Store a scanline of 64-bit pixels through the image's 32-bit store proc:
+ * the input is first contracted into a temporary 32bpp buffer.  Despite the
+ * declared type, 'values' must point to a uint64_t buffer.
+ */
+static void
+store_scanline_generic_64 (bits_image_t *  image,
+                           int             x,
+                           int             y,
+                           int             width,
+                           const uint32_t *values)
+{
+    uint32_t *narrow;
+
+    assert (image->common.type == BITS);
+
+    narrow = pixman_malloc_ab (width, sizeof(uint32_t));
+    if (narrow)
+    {
+	/* Contract into scratch space: values is const, so this cannot be
+	 * done in place.
+	 */
+	pixman_contract (narrow, (uint64_t *)values, width);
+	image->store_scanline_32 (image, x, y, width, narrow);
+	free (narrow);
+    }
+    /* On allocation failure the scanline is silently left unstored. */
+}
+
+/* Fetch a scanline with the 32-bit fetcher, then widen it to 64 bits per
+ * pixel.  Despite the types, buffer and mask are expected to be uint64_t.
+ */
+static void
+fetch_scanline_generic_64 (pixman_image_t *image,
+                           int             x,
+                           int             y,
+                           int             width,
+                           uint32_t *      buffer,
+                           const uint32_t *mask)
+{
+    pixman_format_code_t expand_as;
+
+    /* The 32-bit pixels land in the first half of buffer and are
+     * expanded in place further down.  The mask argument is not
+     * consulted by this function; NULL is passed to the 32-bit fetcher.
+     */
+    image->bits.fetch_scanline_32 (image, x, y, width, buffer, NULL);
+
+    expand_as = image->bits.format;
+
+    /* Indexed formats are mapped to a8r8g8b8 with full precision, so
+     * the expansion must not correct for the width of the channels.
+     */
+    if (PIXMAN_FORMAT_TYPE (expand_as) == PIXMAN_TYPE_COLOR)
+	expand_as = PIXMAN_a8r8g8b8;
+    else if (PIXMAN_FORMAT_TYPE (expand_as) == PIXMAN_TYPE_GRAY)
+	expand_as = PIXMAN_a8r8g8b8;
+
+    /* Source and destination are the same buffer. */
+    pixman_expand ((uint64_t *)buffer, buffer, expand_as, width);
+}
+
+/* Fetch one pixel with the 32-bit pixel fetcher and expand it to 64 bits */
+static uint64_t
+fetch_pixel_generic_64 (bits_image_t *image,
+			int	      offset,
+			int           line)
+{
+    uint32_t narrow;
+    uint64_t wide;
+    pixman_format_code_t expand_as;
+
+    narrow = image->fetch_pixel_32 (image, offset, line);
+    expand_as = image->format;
+
+    /* Indexed formats are mapped to a8r8g8b8 with full precision, so
+     * the expansion must not correct for the width of the channels.
+     */
+    if (PIXMAN_FORMAT_TYPE (expand_as) == PIXMAN_TYPE_COLOR)
+	expand_as = PIXMAN_a8r8g8b8;
+    else if (PIXMAN_FORMAT_TYPE (expand_as) == PIXMAN_TYPE_GRAY)
+	expand_as = PIXMAN_a8r8g8b8;
+
+    /* Widen the single pixel; the count argument is 1. */
+    pixman_expand (&wide, &narrow, expand_as, 1);
+
+    return wide;
+}
+
+/*
+ * Fetch one pixel with the 64-bit fetcher and contract it to 32 bits.
+ * Needed because the transformed fetch path only works at 32 bpp so far;
+ * it can be removed once all paths have wide versions.
+ * WARNING: This function loses precision!
+ */
+static uint32_t
+fetch_pixel_generic_lossy_32 (bits_image_t *image,
+			      int           offset,
+			      int           line)
+{
+    uint32_t narrow;
+    uint64_t wide;
+
+    wide = image->fetch_pixel_64 (image, offset, line);
+    pixman_contract (&narrow, &wide, 1);
+    return narrow;
+}
+
+typedef struct	/* one row of the accessors[] table; filled positionally */
+{
+    pixman_format_code_t	format;	/* key compared with image->format */
+    fetch_scanline_t		fetch_scanline_32; /* may be NULL in the table */
+    fetch_scanline_t		fetch_scanline_64;
+    fetch_pixel_32_t		fetch_pixel_32;
+    fetch_pixel_64_t		fetch_pixel_64;
+    store_scanline_t		store_scanline_32; /* may be NULL in the table */
+    store_scanline_t		store_scanline_64; /* may be NULL in the table */
+} format_info_t;
+
+#define FORMAT_INFO(format) 						\
+    {	/* format-specific 32-bit accessors + generic 64-bit ones */	\
+	PIXMAN_ ## format,						\
+	    fetch_scanline_ ## format,					\
+	    fetch_scanline_generic_64,					\
+	    fetch_pixel_ ## format, fetch_pixel_generic_64,		\
+	    store_scanline_ ## format, store_scanline_generic_64	\
+    }
+
+static const format_info_t accessors[] =
+{
+/* 32 bpp formats */
+    FORMAT_INFO (a8r8g8b8),
+    FORMAT_INFO (x8r8g8b8),
+    FORMAT_INFO (a8b8g8r8),
+    FORMAT_INFO (x8b8g8r8),
+    FORMAT_INFO (b8g8r8a8),
+    FORMAT_INFO (b8g8r8x8),
+    FORMAT_INFO (r8g8b8a8),
+    FORMAT_INFO (r8g8b8x8),
+    FORMAT_INFO (x14r6g6b6),
+
+/* 24bpp formats */
+    FORMAT_INFO (r8g8b8),
+    FORMAT_INFO (b8g8r8),
+    
+/* 16bpp formats */
+    FORMAT_INFO (r5g6b5),
+    FORMAT_INFO (b5g6r5),
+    
+    FORMAT_INFO (a1r5g5b5),
+    FORMAT_INFO (x1r5g5b5),
+    FORMAT_INFO (a1b5g5r5),
+    FORMAT_INFO (x1b5g5r5),
+    FORMAT_INFO (a4r4g4b4),
+    FORMAT_INFO (x4r4g4b4),
+    FORMAT_INFO (a4b4g4r4),
+    FORMAT_INFO (x4b4g4r4),
+    
+/* 8bpp formats */
+    FORMAT_INFO (a8),
+    FORMAT_INFO (r3g3b2),
+    FORMAT_INFO (b2g3r3),
+    FORMAT_INFO (a2r2g2b2),
+    FORMAT_INFO (a2b2g2r2),
+    
+    FORMAT_INFO (c8),
+    
+    FORMAT_INFO (g8),
+    /* x4c4 reuses the c8 accessors; x4g4 below reuses the g8 ones */
+#define fetch_scanline_x4c4 fetch_scanline_c8
+#define fetch_pixel_x4c4 fetch_pixel_c8
+#define store_scanline_x4c4 store_scanline_c8
+    FORMAT_INFO (x4c4),
+    
+#define fetch_scanline_x4g4 fetch_scanline_g8
+#define fetch_pixel_x4g4 fetch_pixel_g8
+#define store_scanline_x4g4 store_scanline_g8
+    FORMAT_INFO (x4g4),
+    
+    FORMAT_INFO (x4a4),
+    
+/* 4bpp formats */
+    FORMAT_INFO (a4),
+    FORMAT_INFO (r1g2b1),
+    FORMAT_INFO (b1g2r1),
+    FORMAT_INFO (a1r1g1b1),
+    FORMAT_INFO (a1b1g1r1),
+    
+    FORMAT_INFO (c4),
+    
+    FORMAT_INFO (g4),
+    
+/* 1bpp formats */
+    FORMAT_INFO (a1),
+    FORMAT_INFO (g1),
+    
+/* Wide formats */
+    /* 32-bit scanline slots are NULL; 32-bit pixel fetch is the lossy one */
+    { PIXMAN_a2r10g10b10,
+      NULL, fetch_scanline_a2r10g10b10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10,
+      NULL, store_scanline_a2r10g10b10 },
+    
+    { PIXMAN_x2r10g10b10,
+      NULL, fetch_scanline_x2r10g10b10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_x2r10g10b10,
+      NULL, store_scanline_x2r10g10b10 },
+    
+    { PIXMAN_a2b10g10r10,
+      NULL, fetch_scanline_a2b10g10r10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_a2b10g10r10,
+      NULL, store_scanline_a2b10g10r10 },
+    
+    { PIXMAN_x2b10g10r10,
+      NULL, fetch_scanline_x2b10g10r10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_x2b10g10r10,
+      NULL, store_scanline_x2b10g10r10 },
+    
+/* YUV formats (fetch only: both store slots are NULL) */
+    { PIXMAN_yuy2,
+      fetch_scanline_yuy2, fetch_scanline_generic_64,
+      fetch_pixel_yuy2, fetch_pixel_generic_64,
+      NULL, NULL },
+    
+    { PIXMAN_yv12,
+      fetch_scanline_yv12, fetch_scanline_generic_64,
+      fetch_pixel_yv12, fetch_pixel_generic_64,
+      NULL, NULL },
+    /* Terminator: setup_accessors stops at the PIXMAN_null format */
+    { PIXMAN_null },
+};
+
+/* Copy the accessors[] row whose format equals image->format into image. */
+static void
+setup_accessors (bits_image_t *image)
+{
+    const format_info_t *entry;
+
+    for (entry = accessors; entry->format != PIXMAN_null; entry++)
+    {
+	if (entry->format != image->format)
+	    continue;
+
+	image->fetch_scanline_32 = entry->fetch_scanline_32;
+	image->fetch_scanline_64 = entry->fetch_scanline_64;
+	image->fetch_pixel_32 = entry->fetch_pixel_32;
+	image->fetch_pixel_64 = entry->fetch_pixel_64;
+	image->store_scanline_32 = entry->store_scanline_32;
+	image->store_scanline_64 = entry->store_scanline_64;
+
+	return;
+    }
+    /* No matching row: the image's accessors are left untouched. */
+}
+
+#ifndef PIXMAN_FB_ACCESSORS
+void
+_pixman_bits_image_setup_accessors_accessors (bits_image_t *image);
+/* Images with a read or write callback get the _accessors variant. */
+void
+_pixman_bits_image_setup_accessors (bits_image_t *image)
+{
+    if (image->read_func || image->write_func)
+	_pixman_bits_image_setup_accessors_accessors (image);
+    else
+	setup_accessors (image);
+}
+
+#else /* PIXMAN_FB_ACCESSORS */
+/* Accessor build of this file: defines the function declared above. */
+void
+_pixman_bits_image_setup_accessors_accessors (bits_image_t *image)
+{
+    setup_accessors (image);
+}
+
+#endif /* PIXMAN_FB_ACCESSORS */
diff --git a/pixman/pixman-accessor.h b/pixman/pixman-accessor.h
new file mode 100644
index 0000000..90c8ea7
--- /dev/null
+++ b/pixman/pixman-accessor.h
@@ -0,0 +1,40 @@
+#ifdef PIXMAN_FB_ACCESSORS
+/* Accessor build: memory is touched only through the image's callbacks. */
+#define ACCESS(sym) sym##_accessors
+/* READ/WRITE hand the callback the pointer together with sizeof (*ptr). */
+#define READ(img, ptr)							\
+    (((bits_image_t *)(img))->read_func ((ptr), sizeof(*(ptr))))
+#define WRITE(img, ptr,val)						\
+    (((bits_image_t *)(img))->write_func ((ptr), (val), sizeof (*(ptr))))
+/* Byte-wise copy via READ/WRITE; 'size' is re-evaluated on every pass. */
+#define MEMCPY_WRAPPED(img, dst, src, size)				\
+    do {								\
+	size_t _i;							\
+	uint8_t *_dst = (uint8_t*)(dst), *_src = (uint8_t*)(src);	\
+	for(_i = 0; _i < (size); _i++) {				\
+	    WRITE((img), _dst +_i, READ((img), _src + _i));		\
+	}								\
+    } while (0)
+/* Byte-wise fill via WRITE; 'size' is parenthesized before the cast. */
+#define MEMSET_WRAPPED(img, dst, val, size)				\
+    do {								\
+	size_t _i;							\
+	uint8_t *_dst = (uint8_t*)(dst);				\
+	for(_i = 0; _i < (size_t) (size); _i++) {			\
+	    WRITE((img), _dst +_i, (val));				\
+	}								\
+    } while (0)
+
+#else
+/* Direct build: plain loads/stores and the libc memcpy/memset. */
+#define ACCESS(sym) sym
+
+#define READ(img, ptr)		(*(ptr))
+#define WRITE(img, ptr, val)	(*(ptr) = (val))
+#define MEMCPY_WRAPPED(img, dst, src, size)				\
+    memcpy(dst, src, size)
+#define MEMSET_WRAPPED(img, dst, val, size)				\
+    memset(dst, val, size)
+
+#endif
+
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
new file mode 100644
index 0000000..f56264e
--- /dev/null
+++ b/pixman/pixman-arm-common.h
@@ -0,0 +1,416 @@
+/*
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+#ifndef PIXMAN_ARM_COMMON_H
+#define PIXMAN_ARM_COMMON_H
+
+#include "pixman-inlines.h"
+
+/* Define some macros which can expand into proxy functions between
+ * ARM assembly optimized functions and the rest of pixman fast path API.
+ *
+ * All the low level ARM assembly functions have to use ARM EABI
+ * calling convention and take up to 8 arguments:
+ *    width, height, dst, dst_stride, src, src_stride, mask, mask_stride
+ *
+ * The arguments are ordered with the most important coming first (the
+ * first 4 arguments are passed to function in registers, the rest are
+ * on stack). The last arguments are optional, for example if the
+ * function is not using mask, then 'mask' and 'mask_stride' can be
+ * omitted when doing a function call.
+ *
+ * Arguments 'src' and 'mask' contain either a pointer to the top left
+ * pixel of the composited rectangle or a pixel color value depending
+ * on the function type. In the case of just a color value (solid source
+ * or mask), the corresponding stride argument is unused.
+ */
+
+#define SKIP_ZERO_SRC  1 /* flag bit: wrapper returns early on a zero source */
+#define SKIP_ZERO_MASK 2 /* flag bit: wrapper returns early on a zero mask */
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name,                \
+                                          src_type, src_cnt,            \
+                                          dst_type, dst_cnt)            \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t   w,                   \
+                                         int32_t   h,                   \
+                                         dst_type *dst,                 \
+                                         int32_t   dst_stride,          \
+                                         src_type *src,                 \
+                                         int32_t   src_stride);         \
+/* C wrapper taking (imp, info) around the routine declared above. */   \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type *dst_line;							\
+    src_type *src_line;                                                 \
+    int32_t dst_stride, src_stride;                                     \
+    /* presumably sets each stride and first-line pointer - confirm */  \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    /* one call covers the whole width x height rectangle */            \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride);     \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(flags, cputype, name,           \
+                                        dst_type, dst_cnt)              \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         uint32_t   src);               \
+/* C wrapper: solid source value, destination image only. */            \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+			    pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);					\
+    dst_type  *dst_line;                                                \
+    int32_t    dst_stride;                                              \
+    uint32_t   src;                                                     \
+                                                                        \
+    src = _pixman_image_get_solid (					\
+	imp, src_image, dest_image->bits.format);			\
+    /* (flags) is parenthesized so expression arguments bind right */   \
+    if (((flags) & SKIP_ZERO_SRC) && src == 0)                          \
+	return;                                                         \
+    /* presumably sets dst_stride and dst_line - confirm */             \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src);                      \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(flags, cputype, name,      \
+                                             mask_type, mask_cnt,       \
+                                             dst_type, dst_cnt)         \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         uint32_t   src,                \
+                                         int32_t    unused,             \
+                                         mask_type *mask,               \
+                                         int32_t    mask_stride);       \
+/* C wrapper: solid source value, mask image, destination image. */     \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;						\
+    mask_type *mask_line;                                               \
+    int32_t    dst_stride, mask_stride;                                 \
+    uint32_t   src;                                                     \
+                                                                        \
+    src = _pixman_image_get_solid (					\
+	imp, src_image, dest_image->bits.format);			\
+    /* (flags) is parenthesized so expression arguments bind right */   \
+    if (((flags) & SKIP_ZERO_SRC) && src == 0)                          \
+	return;                                                         \
+    /* presumably sets each stride and first-line pointer - confirm */  \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+    /* the 0 fills the 'unused' slot of the prototype above */          \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src, 0,                    \
+                                             mask_line, mask_stride);   \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(flags, cputype, name,       \
+                                            src_type, src_cnt,          \
+                                            dst_type, dst_cnt)          \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         src_type  *src,                \
+                                         int32_t    src_stride,         \
+                                         uint32_t   mask);              \
+/* C wrapper: source image, solid mask value, destination image. */     \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;						\
+    src_type  *src_line;                                                \
+    int32_t    dst_stride, src_stride;                                  \
+    uint32_t   mask;                                                    \
+                                                                        \
+    mask = _pixman_image_get_solid (					\
+	imp, mask_image, dest_image->bits.format);			\
+    /* (flags) is parenthesized so expression arguments bind right */   \
+    if (((flags) & SKIP_ZERO_MASK) && mask == 0)                        \
+	return;                                                         \
+    /* presumably sets each stride and first-line pointer - confirm */  \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride,      \
+                                             mask);                     \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST(cputype, name,           \
+                                               src_type, src_cnt,       \
+                                               mask_type, mask_cnt,     \
+                                               dst_type, dst_cnt)       \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         src_type  *src,                \
+                                         int32_t    src_stride,         \
+                                         mask_type *mask,               \
+                                         int32_t    mask_stride);       \
+/* C wrapper: source, mask and destination are all images. */           \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;						\
+    src_type  *src_line;                                                \
+    mask_type *mask_line;                                               \
+    int32_t    dst_stride, src_stride, mask_stride;                     \
+    /* presumably sets each stride and first-line pointer - confirm */  \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+    /* one call covers the whole width x height rectangle */            \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride,      \
+                                             mask_line, mask_stride);   \
+}
+
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op,             \
+                                               src_type, dst_type)            \
+void                                                                          \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
+                                                   int32_t          w,        \
+                                                   dst_type *       dst,      \
+                                                   const src_type * src,      \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x);  \
+/* Inline adapter: reorders the arguments, ignores max_vx and zero_src. */    \
+static force_inline void                                                      \
+scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
+                                                   const src_type * ps,       \
+                                                   int32_t          w,        \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
+                                                   pixman_bool_t    zero_src) \
+{                                                                             \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
+                                                                  vx, unit_x);\
+}                                                                             \
+/* FAST_NEAREST_MAINLOOP is expanded once each for COVER, NONE and PAD. */    \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, COVER)                             \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, NONE)                              \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, PAD)
+
+/* Fast path table entries: one each for the COVER, NONE and PAD variants */
+#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \
+                                                  src_type, dst_type)         \
+void                                                                          \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
+                                                   int32_t          w,        \
+                                                   dst_type *       dst,      \
+                                                   const src_type * src,      \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   const uint8_t *  mask);    \
+/* Inline adapter: mask moves to the last argument, max_vx is ignored. */     \
+static force_inline void                                                      \
+scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t *  mask,     \
+                                                   dst_type *       pd,       \
+                                                   const src_type * ps,       \
+                                                   int32_t          w,        \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
+                                                   pixman_bool_t    zero_src) \
+{                                                                             \
+    if (((flags) & SKIP_ZERO_SRC) && zero_src)                                \
+	return;                                                               \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
+                                                                  vx, unit_x, \
+                                                                  mask);      \
+}                                                                             \
+/* FAST_NEAREST_MAINLOOP_COMMON is expanded for COVER, NONE and PAD. */       \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                  \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op,                   \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+
+/* Fast path table entries (a8 mask): one each for COVER, NONE and PAD */
+#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+/*****************************************************************************/
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op,     \
+                                                src_type, dst_type)           \
+void                                                                          \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
+                                                dst_type *       dst,         \
+                                                const src_type * top,         \
+                                                const src_type * bottom,      \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   x,           \
+                                                pixman_fixed_t   ux,          \
+                                                int              width);      \
+/* Inline adapter: mask and max_vx are accepted but not forwarded. */         \
+static force_inline void                                                      \
+scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
+                                                dst_type *       dst,         \
+                                                const uint32_t * mask,        \
+                                                const src_type * src_top,     \
+                                                const src_type * src_bottom,  \
+                                                int32_t          w,           \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   vx,          \
+                                                pixman_fixed_t   unit_x,      \
+                                                pixman_fixed_t   max_vx,      \
+                                                pixman_bool_t    zero_src)    \
+{                                                                             \
+    if (((flags) & SKIP_ZERO_SRC) && zero_src)                                \
+	return;                                                               \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
+                            dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+/* FAST_BILINEAR_MAINLOOP_COMMON: COVER, NONE, PAD and NORMAL variants. */    \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)        \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)         \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)          \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, NORMAL,                  \
+                       FLAG_NONE)
+
+/* Bind an asm bilinear scanline routine that takes an 8-bit (uint8_t) mask. */
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op,  \
+                                                src_type, dst_type)           \
+void                                                                          \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
+                                                dst_type *       dst,         \
+                                                const uint8_t *  mask,        \
+                                                const src_type * top,         \
+                                                const src_type * bottom,      \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   x,           \
+                                                pixman_fixed_t   ux,          \
+                                                int              width);      \
+/* Wrapper: honours SKIP_ZERO_SRC, drops max_vx, then calls the asm. */       \
+static force_inline void                                                      \
+scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
+                                                dst_type *       dst,         \
+                                                const uint8_t *  mask,        \
+                                                const src_type * src_top,     \
+                                                const src_type * src_bottom,  \
+                                                int32_t          w,           \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   vx,          \
+                                                pixman_fixed_t   unit_x,      \
+                                                pixman_fixed_t   max_vx,      \
+                                                pixman_bool_t    zero_src)    \
+{                                                                             \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
+	return;                                                                   \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
+                      dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+/* One main loop per COVER/NONE/PAD/NORMAL variant, all with a mask. */       \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, COVER,                    \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, NONE,                     \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, PAD,                      \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, NORMAL,                   \
+                       FLAG_HAVE_NON_SOLID_MASK)
+
+
+#endif
diff --git a/pixman/pixman-arm-detect-win32.asm b/pixman/pixman-arm-detect-win32.asm
new file mode 100644
index 0000000..8f5d5eb
--- /dev/null
+++ b/pixman/pixman-arm-detect-win32.asm
@@ -0,0 +1,21 @@
+    area pixman_msvc, code, readonly
+
+    export  pixman_msvc_try_arm_simd_op
+
+pixman_msvc_try_arm_simd_op     ; executes a single uqadd8 instruction and returns
+    ;; I don't think the msvc arm asm knows how to do SIMD insns
+    ;; uqadd8 r3,r3,r3
+    dcd 0xe6633f93              ; machine word for the uqadd8 above, emitted as data
+    mov pc,lr                   ; return to the caller
+    endp
+
+    export  pixman_msvc_try_arm_neon_op
+
+pixman_msvc_try_arm_neon_op     ; executes a single NEON veor instruction and returns
+    ;; I don't think the msvc arm asm knows how to do NEON insns
+    ;; veor d0,d0,d0
+    dcd 0xf3000110              ; machine word for the veor above, emitted as data
+    mov pc,lr                   ; return to the caller
+    endp
+
+    end
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
new file mode 100644
index 0000000..f7913ad
--- /dev/null
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -0,0 +1,1367 @@
+/*
+ * Copyright © 2011 SCore Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ * Author:  Taekyun Kim (tkq.kim@samsung.com)
+ */
+
+/*
+ * This file contains scaled bilinear scanline functions implemented
+ * using Siarhei's older bilinear macro template.
+ *
+ * << General scanline function procedures >>
+ *  1. bilinear interpolate source pixels
+ *  2. load mask pixels
+ *  3. load destination pixels
+ *  4. duplicate mask to fill whole register
+ *  5. interleave source & destination pixels
+ *  6. apply mask to source pixels
+ *  7. combine source & destination pixels
+ *  8. deinterleave final result
+ *  9. store destination pixels
+ *
+ * All registers with a single number (e.g. src0, tmp0) are 64-bit registers.
+ * Registers with double numbers (src01, dst01) are 128-bit registers.
+ * All temp registers can be used freely outside the code block.
+ * The symbols (register .req aliases) OUT and MASK are assumed to be defined by the caller of these macro blocks.
+ *
+ * Remarks
+ *  There can be lots of pipeline stalls inside code block and between code blocks.
+ *  Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined (__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.eabi_attribute 12, 0
+.arm
+.altmacro
+.p2align 2
+
+#include "pixman-arm-neon-asm.h"
+
+/*
+ * Bilinear macros from pixman-arm-neon-asm.S
+ */
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname    /* opens function fname; the matching .endfunc is emitted by the user */
+    .func fname
+    .global fname
+#ifdef __ELF__
+    .hidden fname                   /* ELF only: hidden visibility and function symbol type */
+    .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp    /* fetch the top and bottom source data at X, advance X; tmp is unused */
+    mov       TMP1, X, asr #16               /* TMP1 = X >> 16, used as the source pixel index */
+    add       X, X, UX                       /* step X by UX for the next destination pixel */
+    add       TMP1, TOP, TMP1, asl #2        /* address in the TOP row, 4 bytes per pixel */
+    vld1.32   {reg1}, [TMP1], STRIDE         /* load from the top row, then TMP1 += STRIDE */
+    vld1.32   {reg2}, [TMP1]                 /* load the data STRIDE bytes further on */
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp    /* same addressing as bilinear_load_8888, with 2-byte pixels */
+    mov       TMP1, X, asr #16               /* TMP1 = X >> 16, used as the source pixel index */
+    add       X, X, UX                       /* step X by UX for the next destination pixel */
+    add       TMP1, TOP, TMP1, asl #1        /* address in the TOP row, 2 bytes per pixel */
+    vld1.32   {reg2[0]}, [TMP1], STRIDE      /* lane 0: 32 bits from the top row, then TMP1 += STRIDE */
+    vld1.32   {reg2[1]}, [TMP1]              /* lane 1: 32 bits from STRIDE bytes further on */
+    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp   /* macro defined outside this chunk; presumably widens to x888 in reg1/reg2 */
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+    /* For two successive X positions: acc = first_load * d28 + second_load * d29 (widened to 16 bits). */
+    bilinear_load_8888 reg1, reg2, tmp1
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    bilinear_load_8888 reg3, reg4, tmp2
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+    /* Four pixels = the two-pixel macro expanded twice: first with the x* set, then with the y* set. */
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+    /* 0565 counterpart of the two-pixel 8888 macro; acc2lo/acc2hi double as the raw load buffer. */
+    mov       TMP1, X, asr #16              /* first pixel: index = X >> 16 */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1       /* 2 bytes per pixel */
+    mov       TMP2, X, asr #16              /* second pixel */
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {acc2lo[0]}, [TMP1], STRIDE   /* lane 0 of each half: data from the TOP row */
+    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {acc2lo[1]}, [TMP1]           /* lane 1 of each half: data STRIDE bytes further on */
+    vld1.32   {acc2hi[1]}, [TMP2]
+    convert_0565_to_x888 acc2, reg3, reg2, reg1   /* macro defined outside this chunk */
+    vzip.u8   reg1, reg3                    /* four zips regroup the converted bytes; the exact layout */
+    vzip.u8   reg2, reg4                    /* depends on convert_0565_to_x888 - NOTE(review): confirm there */
+    vzip.u8   reg3, reg4
+    vzip.u8   reg1, reg2
+    vmull.u8  acc1, reg1, d28               /* acc1 = reg1 * d28 + reg2 * d29 */
+    vmlal.u8  acc1, reg2, d29
+    vmull.u8  acc2, reg3, d28               /* acc2 = reg3 * d28 + reg4 * d29 */
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+    /* Same steps as the two-pixel 0565 macro for an x* and a y* pair, with the two instruction streams interleaved. */
+    mov       TMP1, X, asr #16              /* x pair: addresses of pixels 0 and 1 */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE  /* x pair: raw loads, top data then STRIDE bytes further on */
+    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {xacc2lo[1]}, [TMP1]
+    vld1.32   {xacc2hi[1]}, [TMP2]
+    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1   /* macro defined outside this chunk */
+    mov       TMP1, X, asr #16              /* y pair: addresses of pixels 2 and 3 */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE  /* y-pair loads alternate with the x-pair zips */
+    vzip.u8   xreg1, xreg3
+    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
+    vzip.u8   xreg2, xreg4
+    vld1.32   {yacc2lo[1]}, [TMP1]
+    vzip.u8   xreg3, xreg4
+    vld1.32   {yacc2hi[1]}, [TMP2]
+    vzip.u8   xreg1, xreg2
+    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+    vmull.u8  xacc1, xreg1, d28             /* x-pair weighting alternates with the y-pair zips */
+    vzip.u8   yreg1, yreg3
+    vmlal.u8  xacc1, xreg2, d29
+    vzip.u8   yreg2, yreg4
+    vmull.u8  xacc2, xreg3, d28
+    vzip.u8   yreg3, yreg4
+    vmlal.u8  xacc2, xreg4, d29
+    vzip.u8   yreg1, yreg2
+    vmull.u8  yacc1, yreg1, d28             /* y pair: acc = reg * d28 + next reg * d29 */
+    vmlal.u8  yacc1, yreg2, d29
+    vmull.u8  yacc2, yreg3, d28
+    vmlal.u8  yacc2, yreg4, d29
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2    /* store numpix 32-bit pixels from d0/d1 to OUT; tmp1/tmp2 unused */
+.if numpix == 4
+    vst1.32   {d0, d1}, [OUT]!                   /* 16 bytes, OUT post-incremented */
+.elseif numpix == 2
+    vst1.32   {d0}, [OUT]!                       /* 8 bytes */
+.elseif numpix == 1
+    vst1.32   {d0[0]}, [OUT, :32]!               /* one 32-bit lane, with a 32-bit alignment hint */
+.else
+    .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2    /* convert to 0565 and store numpix 16-bit pixels to OUT */
+    vuzp.u8 d0, d1                               /* four unzips regroup d0-d3 for the conversion macro below; */
+    vuzp.u8 d2, d3                               /* NOTE(review): the required layout is set by */
+    vuzp.u8 d1, d3                               /* convert_8888_to_0565, defined outside this chunk */
+    vuzp.u8 d0, d2
+    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+    vst1.16   {d2}, [OUT]!                       /* 8 bytes from d2, OUT post-incremented */
+.elseif numpix == 2
+    vst1.32   {d2[0]}, [OUT]!                    /* one 32-bit lane = 4 bytes */
+.elseif numpix == 1
+    vst1.16   {d2[0]}, [OUT]!                    /* one 16-bit lane = 2 bytes */
+.else
+    .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+
+/*
+ * Macros for loading mask pixels into register 'mask'.
+ * vdup must be done somewhere else.
+ */
+.macro bilinear_load_mask_x numpix, mask         /* mask_fmt x: emits nothing */
+.endm
+
+.macro bilinear_load_mask_8 numpix, mask         /* mask_fmt 8: load numpix mask bytes into lane 0 of 'mask' */
+.if numpix == 4
+    vld1.32     {mask[0]}, [MASK]!               /* 4 bytes, MASK post-incremented */
+.elseif numpix == 2
+    vld1.16     {mask[0]}, [MASK]!               /* 2 bytes */
+.elseif numpix == 1
+    vld1.8      {mask[0]}, [MASK]!               /* 1 byte */
+.else
+    .error bilinear_load_mask_8 numpix is unsupported
+.endif
+    pld         [MASK, #prefetch_offset]         /* prefetch hint prefetch_offset bytes ahead in the mask */
+.endm
+
+.macro bilinear_load_mask mask_fmt, numpix, mask /* dispatcher: pastes mask_fmt onto the macro name */
+    bilinear_load_mask_&mask_fmt numpix, mask
+.endm
+
+
+/*
+ * Macros for loading destination pixels into register 'dst0' and 'dst1'.
+ * Interleave should be done somewhere else.
+ */
+.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01     /* op src: emits nothing */
+.endm
+
+.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01     /* op src: emits nothing */
+.endm
+
+.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01         /* read numpix 32-bit pixels at OUT; OUT is not advanced */
+.if numpix == 4
+    vld1.32     {dst0, dst1}, [OUT]
+.elseif numpix == 2
+    vld1.32     {dst0}, [OUT]
+.elseif numpix == 1
+    vld1.32     {dst0[0]}, [OUT]
+.else
+    .error bilinear_load_dst_8888 numpix is unsupported
+.endif
+    pld         [OUT, #(prefetch_offset * 4)]   /* NOTE(review): prefetch_offset is .set only in the masked branch of the template; confirm its value for unmasked users */
+.endm
+
+.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01    /* over needs the destination: plain 8888 load */
+    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01     /* add needs the destination: plain 8888 load */
+    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 /* dispatcher: pastes dst_fmt and op onto the macro name */
+    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
+.endm
+
+/*
+ * Macros for duplicating partially loaded mask to fill entire register.
+ * We will apply mask to interleaved source pixels, that is
+ *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * So, we need to duplicate loaded mask into whole register.
+ *
+ * For two pixel case
+ *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * We can do some optimizations for this including last pixel cases.
+ */
+.macro bilinear_duplicate_mask_x numpix, mask            /* mask_fmt x: emits nothing */
+.endm
+
+.macro bilinear_duplicate_mask_8 numpix, mask            /* replicate the numpix loaded mask bytes across 'mask' */
+.if numpix == 4
+    vdup.32     mask, mask[0]                            /* repeat the 4-byte group */
+.elseif numpix == 2
+    vdup.16     mask, mask[0]                            /* repeat the 2-byte group */
+.elseif numpix == 1
+    vdup.8      mask, mask[0]                            /* repeat the single byte */
+.else
+    .error bilinear_duplicate_mask_8 is unsupported
+.endif
+.endm
+
+.macro bilinear_duplicate_mask mask_fmt, numpix, mask    /* dispatcher: pastes mask_fmt onto the macro name */
+    bilinear_duplicate_mask_&mask_fmt numpix, mask
+.endm
+
+/*
+ * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
+ * Interleave should be done when mask is enabled or operator is 'over'.
+ */
+.macro bilinear_interleave src0, src1, dst0, dst1    /* two byte-wise unzips on the src pair and on the dst pair */
+    vuzp.8      src0, src1
+    vuzp.8      dst0, dst1
+    vuzp.8      src0, src1
+    vuzp.8      dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_x_src \
+                numpix, src0, src1, src01, dst0, dst1, dst01   /* no mask, op src: emits nothing */
+.endm
+
+.macro bilinear_interleave_src_dst_x_over \
+                numpix, src0, src1, src01, dst0, dst1, dst01   /* no mask, op over: interleave */
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_x_add \
+                numpix, src0, src1, src01, dst0, dst1, dst01   /* no mask, op add: emits nothing */
+.endm
+
+.macro bilinear_interleave_src_dst_8_src \
+                numpix, src0, src1, src01, dst0, dst1, dst01   /* 8-bit mask: always interleave */
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_8_over \
+                numpix, src0, src1, src01, dst0, dst1, dst01   /* 8-bit mask: always interleave */
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_8_add \
+                numpix, src0, src1, src01, dst0, dst1, dst01   /* 8-bit mask: always interleave */
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst \
+                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
+    /* dispatcher: pastes mask_fmt and op onto the macro name */
+    bilinear_interleave_src_dst_&mask_fmt&_&op \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+
+/*
+ * Macros for applying masks to src pixels. (see combine_mask_u() function)
+ * src, dst should be in interleaved form.
+ * mask register should be in form (m0, m1, m2, m3).
+ */
+.macro bilinear_apply_mask_to_src_x \
+                numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67       /* mask_fmt x: emits nothing */
+.endm
+
+.macro bilinear_apply_mask_to_src_8 \
+                numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+    /* Per byte, with t = src * mask: src = (t + ((t + 128) >> 8) + 128) >> 8. */
+    vmull.u8        tmp01, src0, mask            /* t = src * mask, widened to 16 bits */
+    vmull.u8        tmp23, src1, mask
+    /* bubbles */
+    vrshr.u16       tmp45, tmp01, #8             /* (t + 128) >> 8 */
+    vrshr.u16       tmp67, tmp23, #8
+    /* bubbles */
+    vraddhn.u16     src0, tmp45, tmp01           /* rounded high half of the sum, narrowed back to 8 bits */
+    vraddhn.u16     src1, tmp67, tmp23
+.endm
+
+.macro bilinear_apply_mask_to_src \
+                mask_fmt, numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+    /* dispatcher: pastes mask_fmt onto the macro name */
+    bilinear_apply_mask_to_src_&mask_fmt \
+                numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+.endm
+
+
+/*
+ * Macros for combining src and destination pixels.
+ * Interleave or not is depending on operator 'op'.
+ */
+.macro bilinear_combine_src \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8     /* op src: emits nothing, src01 is already the result */
+.endm
+
+.macro bilinear_combine_over \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+    /* op over: src01 = saturating (src01 + dst01 scaled by the inverted src1[1] bytes); expects interleaved data */
+    vdup.32     tmp8, src1[1]                /* upper 32 bits of src1 (alpha bytes in the interleaved layout) */
+    /* bubbles */
+    vmvn.8      tmp8, tmp8                   /* bitwise NOT, i.e. 255 - value per byte */
+    /* bubbles */
+    vmull.u8    tmp01, dst0, tmp8            /* t = dst * inverted value, widened to 16 bits */
+    /* bubbles */
+    vmull.u8    tmp23, dst1, tmp8
+    /* bubbles */
+    vrshr.u16   tmp45, tmp01, #8             /* (t + 128) >> 8 */
+    vrshr.u16   tmp67, tmp23, #8
+    /* bubbles */
+    vraddhn.u16 dst0, tmp45, tmp01           /* dst = (t + ((t + 128) >> 8) + 128) >> 8 */
+    vraddhn.u16 dst1, tmp67, tmp23
+    /* bubbles */
+    vqadd.u8    src01, dst01, src01          /* saturating add, result left in src01 */
+.endm
+
+.macro bilinear_combine_add \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+    /* op add: per-byte saturating add of destination and source, result left in src01 */
+    vqadd.u8    src01, dst01, src01
+.endm
+
+.macro bilinear_combine \
+                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+    /* dispatcher: pastes op onto the macro name */
+    bilinear_combine_&op \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+.endm
+
+/*
+ * Macros for final deinterleaving of destination pixels if needed.
+ */
+.macro bilinear_deinterleave numpix, dst0, dst1, dst01              /* two byte-wise unzips of the register pair */
+    vuzp.8      dst0, dst1
+    /* bubbles */
+    vuzp.8      dst0, dst1
+.endm
+
+.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01    /* emits nothing: matches the empty x_src interleave */
+.endm
+
+.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01   /* x_over interleaved, so deinterleave */
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01    /* emits nothing: matches the empty x_add interleave */
+.endm
+
+.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01    /* masked variants always deinterleave */
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01   /* masked variants always deinterleave */
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01    /* masked variants always deinterleave */
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01   /* dispatcher: pastes mask_fmt and op */
+    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
+.endm
+
+
+.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op   /* one pixel; uses d30 as weight, leaves q12/q15 untouched */
+    bilinear_load_&src_fmt d0, d1, d2           /* d0/d1 = the two rows' data at X; X is advanced */
+    bilinear_load_mask mask_fmt, 1, d4
+    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
+    vmull.u8  q1, d0, d28                       /* vertical pass: q1 = d0 * d28 ... */
+    vmlal.u8  q1, d1, d29                       /*   ... + d1 * d29; d2 = first pixel, d3 = its right neighbour */
+    /* 5 cycles bubble */
+    vshll.u16 q0, d2, #8                        /* horizontal pass: q0 = d2 * 256 ... */
+    vmlsl.u16 q0, d2, d30                       /*   ... - d2 * d30 ... */
+    vmlal.u16 q0, d3, d30                       /*   ... + d3 * d30 */
+    /* 5 cycles bubble */
+    bilinear_duplicate_mask mask_fmt, 1, d4
+    vshrn.u32 d0, q0, #16                       /* drop the low 16 bits of each 32-bit sum */
+    /* 3 cycles bubble */
+    vmovn.u16 d0, q0                            /* narrow to 8 bits per channel */
+    /* 1 cycle bubble */
+    bilinear_interleave_src_dst \
+                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
+    bilinear_apply_mask_to_src \
+                mask_fmt, 1, d0, d1, q0, d4, \
+                q3, q8, q10, q11
+    bilinear_combine \
+                op, 1, d0, d1, q0, d18, d19, q9, \
+                q3, q8, q10, q11, d5
+    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0   /* the result lives in the src registers d0/d1 */
+    bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op   /* two pixels; also advances q12 and refreshes q15 */
+    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23
+    bilinear_load_mask mask_fmt, 2, d4
+    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
+    vshll.u16 q0, d2, #8                        /* pixel 0: q0 = d2 * 256 - d2 * d30 + d3 * d30 */
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8                      /* pixel 1: same with d22/d23 and weight d31 */
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshrn.u32 d0, q0, #16                       /* drop the low 16 bits of each 32-bit sum */
+    vshrn.u32 d1, q10, #16
+    bilinear_duplicate_mask mask_fmt, 2, d4
+    vshr.u16  q15, q12, #8                      /* weights for the next pixels: q15 = q12 >> 8 */
+    vadd.u16  q12, q12, q13                     /* advance q12 by the step held in q13 */
+    vmovn.u16 d0, q0                            /* narrow both pixels to 8 bits per channel */
+    bilinear_interleave_src_dst \
+                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
+    bilinear_apply_mask_to_src \
+                mask_fmt, 2, d0, d1, q0, d4, \
+                q3, q8, q10, q11
+    bilinear_combine \
+                op, 2, d0, d1, q0, d18, d19, q9, \
+                q3, q8, q10, q11, d5
+    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0   /* the result lives in the src registers d0/d1 */
+    bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op  /* four pixels; advances q12 twice, refreshes q15 */
+    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23 \
+                q3, q9,  d4, d5, d16, d17, d18, d19
+    pld       [TMP1, PF_OFFS]                   /* prefetch hint PF_OFFS past the last address loaded */
+    sub       TMP1, TMP1, STRIDE                /* step back by STRIDE for the second hint below */
+    vshll.u16 q0, d2, #8                        /* pixel 0: q0 = d2 * 256 - d2 * d30 + d3 * d30 */
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8                      /* pixel 1: d22/d23 with weight d31 */
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshr.u16  q15, q12, #8                      /* refresh the weights for pixels 2 and 3 */
+    vshll.u16 q2, d6, #8                        /* pixel 2: d6/d7 with the new d30 */
+    vmlsl.u16 q2, d6, d30
+    vmlal.u16 q2, d7, d30
+    vshll.u16 q8, d18, #8                       /* pixel 3: d18/d19 with the new d31 (continued below) */
+    bilinear_load_mask mask_fmt, 4, d22
+    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
+    pld       [TMP1, PF_OFFS]
+    vmlsl.u16 q8, d18, d31
+    vmlal.u16 q8, d19, d31
+    vadd.u16  q12, q12, q13                     /* advance q12 by the step held in q13 */
+    vshrn.u32 d0, q0, #16                       /* drop the low 16 bits of each 32-bit sum */
+    vshrn.u32 d1, q10, #16
+    vshrn.u32 d4, q2, #16
+    vshrn.u32 d5, q8, #16
+    bilinear_duplicate_mask mask_fmt, 4, d22
+    vshr.u16  q15, q12, #8                      /* weights for the pixels after this block */
+    vmovn.u16 d0, q0                            /* narrow: pixels 0-1 into d0, pixels 2-3 into d1 */
+    vmovn.u16 d1, q2
+    vadd.u16  q12, q12, q13                     /* second advance of q12 */
+    bilinear_interleave_src_dst \
+                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
+    bilinear_apply_mask_to_src \
+                mask_fmt, 4, d0, d1, q0, d22, \
+                q3, q8, q9, q10
+    bilinear_combine \
+                op, 4, d0, d1, q0, d2, d3, q1, \
+                q3, q8, q9, q10, d23
+    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0   /* the result lives in the src registers d0/d1 */
+    bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+.set BILINEAR_FLAG_USE_MASK,		1	/* template takes a MASK argument (masked register layout) */
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS,	2	/* template saves and restores d8-d15 around the body */
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline functions.
+ *
+ * The bilinear scanline generator macro takes the following arguments:
+ *  fname			- name of the function to generate
+ *  src_fmt			- source color format (8888 or 0565)
+ *  dst_fmt			- destination color format (8888 or 0565)
+ *  src/dst_bpp_shift		- (1 << bpp_shift) is the size of src/dst pixel in bytes
+ *  process_last_pixel		- code block that interpolate one pixel and does not
+ *				  update horizontal weight
+ *  process_two_pixels		- code block that interpolate two pixels and update
+ *				  horizontal weight
+ *  process_four_pixels		- code block that interpolate four pixels and update
+ *				  horizontal weight
+ *  process_pixblock_head	- head part of middle loop
+ *  process_pixblock_tail	- tail part of middle loop
+ *  process_pixblock_tail_head	- tail_head of middle loop
+ *  pixblock_size		- number of pixels processed in a single middle loop
+ *  prefetch_distance		- prefetch in the source image by that many pixels ahead
+ */
+
+.macro generate_bilinear_scanline_func \
+	fname, \
+	src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
+	bilinear_process_last_pixel, \
+	bilinear_process_two_pixels, \
+	bilinear_process_four_pixels, \
+	bilinear_process_pixblock_head, \
+	bilinear_process_pixblock_tail, \
+	bilinear_process_pixblock_tail_head, \
+	pixblock_size, \
+	prefetch_distance, \
+	flags
+/* flags is tested against BILINEAR_FLAG_USE_MASK and BILINEAR_FLAG_USE_ALL_NEON_REGS. */
+pixman_asm_function fname
+.if pixblock_size == 8
+.elseif pixblock_size == 4
+.else
+    .error unsupported pixblock size
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0     /* unmasked layout: OUT, TOP, BOTTOM, WT in r0-r3 */
+    OUT       .req    r0
+    TOP       .req    r1
+    BOTTOM    .req    r2
+    WT        .req    r3
+    WB        .req    r4
+    X         .req    r5
+    UX        .req    r6
+    WIDTH     .req    ip
+    TMP1      .req    r3    /* shares r3 with WT, which is copied to d28 before TMP1 is written */
+    TMP2      .req    r4    /* shares r4 with WB, which is copied to d29 before TMP2 is written */
+    PF_OFFS   .req    r7
+    TMP3      .req    r8
+    TMP4      .req    r9
+    STRIDE    .req    r2    /* shares r2 with BOTTOM; becomes BOTTOM - TOP below */
+
+    mov		ip, sp                          /* ip = sp on entry, before the push */
+    push	{r4, r5, r6, r7, r8, r9}
+    mov		PF_OFFS, #prefetch_distance
+    ldmia	ip, {WB, X, UX, WIDTH}          /* remaining arguments are read from the caller's stack */
+.else                                           /* masked layout: OUT, MASK, TOP, BOTTOM in r0-r3 */
+    OUT       .req      r0
+    MASK      .req      r1
+    TOP       .req      r2
+    BOTTOM    .req      r3
+    WT        .req      r4
+    WB        .req      r5
+    X         .req      r6
+    UX        .req      r7
+    WIDTH     .req      ip
+    TMP1      .req      r4  /* shares r4 with WT (see above) */
+    TMP2      .req      r5  /* shares r5 with WB (see above) */
+    PF_OFFS   .req      r8
+    TMP3      .req      r9
+    TMP4      .req      r10
+    STRIDE    .req      r3  /* shares r3 with BOTTOM */
+
+    .set prefetch_offset, prefetch_distance     /* used by the mask and destination load macros */
+
+    mov       ip, sp                            /* ip = sp on entry, before the push */
+    push      {r4, r5, r6, r7, r8, r9, r10, ip} /* ip included too - presumably to keep the register count even; TODO confirm */
+    mov       PF_OFFS, #prefetch_distance
+    ldmia     ip, {WT, WB, X, UX, WIDTH}        /* remaining arguments are read from the caller's stack */
+.endif
+
+    mul       PF_OFFS, PF_OFFS, UX              /* PF_OFFS = prefetch_distance * UX, still in X units */
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpush     {d8-d15}
+.endif
+
+    sub	      STRIDE, BOTTOM, TOP               /* byte distance from the TOP row to the BOTTOM row */
+    .unreq    BOTTOM
+
+    cmp       WIDTH, #0
+    ble       3f                                /* nothing to do for WIDTH <= 0 */
+
+    vdup.u16  q12, X                            /* q12 = low 16 bits of X in every lane */
+    vdup.u16  q13, UX                           /* q13 = low 16 bits of UX in every lane */
+    vdup.u8   d28, WT                           /* d28/d29 = row weights used by the vertical pass */
+    vdup.u8   d29, WB
+    vadd.u16  d25, d25, d26                     /* upper half of q12 runs one UX step ahead of the lower half */
+
+    /* ensure good destination alignment  */
+    cmp       WIDTH, #1
+    blt       0f
+    tst       OUT, #(1 << dst_bpp_shift)        /* OUT on an odd pixel boundary? */
+    beq       0f
+    vshr.u16  q15, q12, #8                      /* q15 = horizontal weights (q12 >> 8) */
+    vadd.u16  q12, q12, q13
+    bilinear_process_last_pixel
+    sub       WIDTH, WIDTH, #1
+0:
+    vadd.u16  q13, q13, q13                     /* from here on q12 advances two pixels per add */
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+
+    cmp       WIDTH, #2
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 1))  /* OUT not yet aligned to two pixels? */
+    beq       0f
+    bilinear_process_two_pixels
+    sub       WIDTH, WIDTH, #2
+0:
+.if pixblock_size == 8
+    cmp       WIDTH, #4
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 2))  /* OUT not yet aligned to four pixels? */
+    beq       0f
+    bilinear_process_four_pixels                /* NOTE(review): runs before PF_OFFS is scaled below */
+    sub       WIDTH, WIDTH, #4
+0:
+.endif
+    subs      WIDTH, WIDTH, #pixblock_size
+    blt       1f                                /* fewer than one full block left */
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)  /* scale PF_OFFS to a byte offset in the source */
+    bilinear_process_pixblock_head
+    subs      WIDTH, WIDTH, #pixblock_size
+    blt       5f
+0:
+    bilinear_process_pixblock_tail_head         /* main loop: finish one block and start the next */
+    subs      WIDTH, WIDTH, #pixblock_size
+    bge       0b
+5:
+    bilinear_process_pixblock_tail
+1:                                              /* WIDTH is negative here; its low bits still count the leftovers */
+.if pixblock_size == 8
+    tst       WIDTH, #4
+    beq       2f
+    bilinear_process_four_pixels
+2:
+.endif
+    /* handle the remaining trailing pixels */
+    tst       WIDTH, #2
+    beq       2f
+    bilinear_process_two_pixels
+2:
+    tst       WIDTH, #1
+    beq       3f
+    bilinear_process_last_pixel
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpop      {d8-d15}
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0     /* pop exactly what the matching prologue pushed */
+    pop       {r4, r5, r6, r7, r8, r9}
+.else
+    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
+.endif
+    bx        lr
+
+    .unreq    OUT                               /* drop the aliases so the next expansion can redefine them */
+    .unreq    TOP
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+    .unreq    PF_OFFS
+    .unreq    TMP3
+    .unreq    TMP4
+    .unreq    STRIDE
+.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+    .unreq    MASK
+.endif
+
+.endfunc
+
+.endm
+
+/* src_8888_8_8888: src_fmt 8888, mask_fmt 8, dst_fmt 8888, op src */
+.macro bilinear_src_8888_8_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_head        /* one block = one four-pixel step */
+    bilinear_src_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail        /* empty: head does all the work */
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail_head   /* tail followed by head */
+    bilinear_src_8888_8_8888_process_pixblock_tail
+    bilinear_src_8888_8_8888_process_pixblock_head
+.endm
+
+/* src_8888_8_0565: src_fmt 8888, mask_fmt 8, dst_fmt 0565, op src */
+.macro bilinear_src_8888_8_0565_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_head        /* one block = one four-pixel step */
+    bilinear_src_8888_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail        /* empty: head does all the work */
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail_head   /* tail followed by head */
+    bilinear_src_8888_8_0565_process_pixblock_tail
+    bilinear_src_8888_8_0565_process_pixblock_head
+.endm
+
+/* src_0565_8_x888: src_fmt 0565, mask_fmt 8, dst_fmt 8888 (used for x888), op src */
+.macro bilinear_src_0565_8_x888_process_last_pixel
+    bilinear_interpolate_last_pixel 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_two_pixels
+    bilinear_interpolate_two_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_four_pixels
+    bilinear_interpolate_four_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_head        /* one block = one four-pixel step */
+    bilinear_src_0565_8_x888_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail        /* empty: head does all the work */
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail_head   /* tail followed by head */
+    bilinear_src_0565_8_x888_process_pixblock_tail
+    bilinear_src_0565_8_x888_process_pixblock_head
+.endm
+
+/* src_0565_8_0565 */
+.macro bilinear_src_0565_8_0565_process_last_pixel
+    bilinear_interpolate_last_pixel 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_two_pixels
+    bilinear_interpolate_two_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_four_pixels
+    bilinear_interpolate_four_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_head  /* no hand-scheduled split for this family: the head is the whole four-pixel helper */
+    bilinear_src_0565_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail  /* intentionally empty: the matching head macro expands the whole four-pixel helper */
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail_head  /* unscheduled loop body: (empty) tail followed by head */
+    bilinear_src_0565_8_0565_process_pixblock_tail
+    bilinear_src_0565_8_0565_process_pixblock_head
+.endm
+
+/* over_8888_8888 */
+.macro bilinear_over_8888_8888_process_last_pixel  /* one-pixel OVER step via the generic helper; mask argument "x" presumably means no mask (this family is generated with flags 0) */
+    bilinear_interpolate_last_pixel 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_two_pixels  /* two-pixel OVER step via the generic helper, no mask ("x") */
+    bilinear_interpolate_two_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_four_pixels  /* four-pixel OVER step via the generic helper; the pixblock head/tail of this family are hand-written instead of reusing it */
+    bilinear_interpolate_four_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_head  /* starts a 4-pixel block: loads and vertically blends all four pixels, horizontally blends the first two into q0/q1 */
+    mov         TMP1, X, asr #16  /* TMP1 = X >> 16 - presumably the integer column of a 16.16 fixed-point X */
+    add         X, X, UX  /* advance X by UX for the next output pixel */
+    add         TMP1, TOP, TMP1, asl #2  /* TMP1 = TOP + 4 * column (4 bytes per source pixel) */
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+
+    vld1.32     {d22}, [TMP1], STRIDE  /* 8 bytes = two adjacent source pixels, then TMP1 += STRIDE */
+    vld1.32     {d23}, [TMP1]  /* same two columns STRIDE bytes further on - presumably the bottom row */
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    vmull.u8    q8, d22, d28  /* q8 = d22 * d28 + d23 * d29: vertical blend; d28/d29 presumably hold the two row weights */
+    vmlal.u8    q8, d23, d29
+
+    vld1.32     {d22}, [TMP2], STRIDE  /* pixel 1: same load/blend pattern, result in q9 */
+    vld1.32     {d23}, [TMP2]
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vmull.u8    q9, d22, d28
+    vmlal.u8    q9, d23, d29
+
+    vld1.32     {d22}, [TMP3], STRIDE  /* pixel 2: vertical blend only, left in q10 for the tail */
+    vld1.32     {d23}, [TMP3]
+    vmull.u8    q10, d22, d28
+    vmlal.u8    q10, d23, d29
+
+    vshll.u16   q0, d16, #8  /* q0 = d16 * 256 - d16 * d30 + d17 * d30: horizontal blend of pixel 0 (d16 = left, d17 = right column) */
+    vmlsl.u16   q0, d16, d30
+    vmlal.u16   q0, d17, d30
+
+    pld         [TMP4, PF_OFFS]  /* prefetch hint ahead of the first row of pixel 3 */
+    vld1.32     {d16}, [TMP4], STRIDE  /* pixel 3 reuses d16/d17, free now that q0 has been computed */
+    vld1.32     {d17}, [TMP4]
+    pld         [TMP4, PF_OFFS]  /* prefetch hint for the second row */
+    vmull.u8    q11, d16, d28  /* pixel 3: vertical blend only, left in q11 for the tail */
+    vmlal.u8    q11, d17, d29
+
+    vshll.u16   q1, d18, #8  /* horizontal blend of pixel 1 with weight d31 */
+    vmlsl.u16   q1, d18, d31
+    vmlal.u16   q1, d19, d31
+    vshr.u16    q15, q12, #8  /* q15 = q12 >> 8: horizontal weights consumed by the tail for pixels 2 and 3 */
+    vadd.u16    q12, q12, q13  /* q12 += q13 - presumably the fractional-X accumulator and its per-pair step */
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail  /* finishes a 4-pixel block: horizontal blend of pixels 2-3, OVER with the destination, store */
+    vshll.u16   q2, d20, #8  /* q2 = horizontal blend of pixel 2 (q10) with weight d30 */
+    vmlsl.u16   q2, d20, d30
+    vmlal.u16   q2, d21, d30
+    vshll.u16   q3, d22, #8  /* q3 = horizontal blend of pixel 3 (q11) with weight d31 */
+    vmlsl.u16   q3, d22, d31
+    vmlal.u16   q3, d23, d31
+    vshrn.u32   d0, q0, #16  /* keep bits 31..16 of each 32-bit lane: q0 = pixels 0-1 as 16-bit channels */
+    vshrn.u32   d1, q1, #16
+    vld1.32     {d2, d3}, [OUT, :128]  /* four destination pixels (16 bytes, address hinted 128-bit aligned) */
+    pld         [OUT, #(prefetch_offset * 4)]  /* NOTE(review): the tail_head variant of this family uses PF_OFFS here; prefetch hint only */
+    vshrn.u32   d4, q2, #16
+    vshr.u16    q15, q12, #8  /* weights for pixels 0-1 of the next block */
+    vshrn.u32   d5, q3, #16
+    vmovn.u16   d6, q0  /* narrow to 8 bits: q3 = four interpolated source pixels, packed */
+    vmovn.u16   d7, q2
+    vuzp.8      d6, d7  /* two unzips make each of source and destination planar: d6 = bytes 0|1, d7 = bytes 2|3 of the four pixels */
+    vuzp.8      d2, d3
+    vuzp.8      d6, d7
+    vuzp.8      d2, d3
+    vdup.32     d4, d7[1]  /* byte 3 of the four source pixels (presumably alpha), replicated to both halves */
+    vmvn.8      d4, d4  /* 255 - alpha */
+    vmull.u8    q11, d2, d4  /* destination channels * (255 - alpha) */
+    vmull.u8    q2, d3, d4
+    vrshr.u16   q1, q11, #8  /* vrshr + vraddhn: rounded division of the 16-bit products by 255 */
+    vrshr.u16   q10, q2, #8
+    vraddhn.u16 d2, q1, q11
+    vraddhn.u16 d3, q10, q2
+    vqadd.u8    q3, q1, q3  /* saturating add: source + scaled destination */
+    vuzp.8      d6, d7  /* two more unzips restore the packed pixel layout */
+    vuzp.8      d6, d7
+    vadd.u16    q12, q12, q13
+    vst1.32     {d6, d7}, [OUT, :128]!  /* store four pixels and advance OUT by 16 bytes */
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail_head  /* software-pipelined loop body: deeply indented lines finish the previous block (tail stream), normally indented lines start the next one (head stream) */
+                                            vshll.u16   q2, d20, #8
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+                                            vmlsl.u16   q2, d20, d30
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+                                            vmlal.u16   q2, d21, d30
+                                            vshll.u16   q3, d22, #8
+    vld1.32     {d20}, [TMP1], STRIDE  /* pixel 0 is loaded into d20/d21 here (the stand-alone head uses d22/d23): d22/d23 are still read by the tail stream just below */
+                                            vmlsl.u16   q3, d22, d31
+                                            vmlal.u16   q3, d23, d31
+    vld1.32     {d21}, [TMP1]
+    vmull.u8    q8, d20, d28
+    vmlal.u8    q8, d21, d29
+                                            vshrn.u32   d0, q0, #16
+                                            vshrn.u32   d1, q1, #16
+                                            vld1.32     {d2, d3}, [OUT, :128]
+                                            pld         [OUT, PF_OFFS]  /* NOTE(review): the stand-alone tail uses #(prefetch_offset * 4) here; prefetch hint only */
+                                            vshrn.u32   d4, q2, #16
+                                            vshr.u16    q15, q12, #8  /* weights for pixels 0-1 of the block being started */
+    vld1.32     {d22}, [TMP2], STRIDE
+                                            vshrn.u32   d5, q3, #16
+                                            vmovn.u16   d6, q0
+    vld1.32     {d23}, [TMP2]
+    vmull.u8    q9, d22, d28
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vmlal.u8    q9, d23, d29
+                                            vmovn.u16   d7, q2
+    vld1.32     {d22}, [TMP3], STRIDE
+                                            vuzp.8      d6, d7
+                                            vuzp.8      d2, d3
+                                            vuzp.8      d6, d7
+                                            vuzp.8      d2, d3
+                                            vdup.32     d4, d7[1]
+    vld1.32     {d23}, [TMP3]
+                                            vmvn.8      d4, d4
+    vmull.u8    q10, d22, d28
+    vmlal.u8    q10, d23, d29
+                                            vmull.u8    q11, d2, d4
+                                            vmull.u8    q2, d3, d4
+    vshll.u16   q0, d16, #8
+    vmlsl.u16   q0, d16, d30
+                                            vrshr.u16   q1, q11, #8
+    vmlal.u16   q0, d17, d30
+                                            vrshr.u16   q8, q2, #8  /* q8 as scratch (the stand-alone tail uses q10): q10 already holds next-block data, q8 was last read on the previous line */
+                                            vraddhn.u16 d2, q1, q11
+                                            vraddhn.u16 d3, q8, q2
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d16}, [TMP4], STRIDE
+                                            vqadd.u8    q3, q1, q3
+    vld1.32     {d17}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q11, d16, d28
+    vmlal.u8    q11, d17, d29
+                                            vuzp.8      d6, d7
+    vshll.u16   q1, d18, #8
+                                            vuzp.8      d6, d7
+    vmlsl.u16   q1, d18, d31
+                                            vadd.u16    q12, q12, q13
+    vmlal.u16   q1, d19, d31
+    vshr.u16    q15, q12, #8  /* weights for pixels 2-3, consumed by the next tail */
+    vadd.u16    q12, q12, q13
+                                            vst1.32     {d6, d7}, [OUT, :128]!  /* store the finished block and advance OUT by 16 bytes */
+.endm
+
+/* over_8888_8_8888 */
+.macro bilinear_over_8888_8_8888_process_last_pixel  /* one-pixel masked OVER step via the generic helper; args presumably src fmt, mask fmt, dst fmt, operator - confirm against the helper */
+    bilinear_interpolate_last_pixel 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_two_pixels  /* two-pixel masked OVER step via the generic helper */
+    bilinear_interpolate_two_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_four_pixels  /* four-pixel masked OVER step via the generic helper; the pixblock head/tail of this family are hand-written instead of reusing it */
+    bilinear_interpolate_four_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_head  /* starts a 4-pixel block: fully interpolates pixels 0-1 into d16, vertically blends pixels 2-3, loads 4 mask bytes */
+    mov         TMP1, X, asr #16  /* TMP1 = X >> 16 - presumably the integer column of a 16.16 fixed-point X */
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2  /* TMP1 = TOP + 4 * column (4 bytes per source pixel) */
+    vld1.32     {d0}, [TMP1], STRIDE  /* pixel 0: two adjacent source pixels, then TMP1 += STRIDE */
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+    vld1.32     {d1}, [TMP1]  /* pixel 0: same columns STRIDE bytes further on - presumably the bottom row */
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    vld1.32     {d2}, [TMP2], STRIDE  /* pixel 1 rows go to d2/d3 */
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vld1.32     {d3}, [TMP2]
+    vmull.u8    q2, d0, d28  /* vertical blends: q2 = pixel 0, q3 = pixel 1 (d28/d29 presumably the row weights) */
+    vmull.u8    q3, d2, d28
+    vmlal.u8    q2, d1, d29
+    vmlal.u8    q3, d3, d29
+    vshll.u16   q0, d4, #8  /* horizontal blends: q = left * 256 - left * w + right * w, with w = d30 for pixel 0 and d31 for pixel 1 */
+    vshll.u16   q1, d6, #8
+    vmlsl.u16   q0, d4, d30
+    vmlsl.u16   q1, d6, d31
+    vmlal.u16   q0, d5, d30
+    vmlal.u16   q1, d7, d31
+    vshrn.u32   d0, q0, #16  /* keep bits 31..16: q0 = pixels 0-1 as 16-bit channels */
+    vshrn.u32   d1, q1, #16
+    vld1.32     {d2}, [TMP3], STRIDE  /* pixel 2 rows */
+    vld1.32     {d3}, [TMP3]
+    pld         [TMP4, PF_OFFS]  /* prefetch hints around the pixel 3 loads */
+    vld1.32     {d4}, [TMP4], STRIDE  /* pixel 3 rows */
+    vld1.32     {d5}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q3, d2, d28  /* vertical blends left for the tail: q3 = pixel 2, q1 = pixel 3 */
+    vmlal.u8    q3, d3, d29
+    vmull.u8    q1, d4, d28
+    vmlal.u8    q1, d5, d29
+    vshr.u16    q15, q12, #8  /* horizontal weights for pixels 2-3, consumed by the tail */
+    vld1.32     {d22[0]}, [MASK]!  /* four mask bytes into lane 0 of d22; MASK += 4 */
+    pld         [MASK, #prefetch_offset]  /* prefetch hint for upcoming mask bytes */
+    vadd.u16    q12, q12, q13  /* q12 += q13 - presumably the fractional-X accumulator and its step */
+    vmovn.u16   d16, q0  /* d16 = pixels 0-1 narrowed to packed 8-bit channels */
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail  /* finishes a 4-pixel block: horizontal blend of pixels 2-3, multiply by mask, OVER with the destination, store */
+    vshll.u16   q9, d6, #8  /* q9 = horizontal blend of pixel 2 (q3, weight d30); q10 = pixel 3 (q1, weight d31) */
+    vshll.u16   q10, d2, #8
+    vmlsl.u16   q9, d6, d30
+    vmlsl.u16   q10, d2, d31
+    vmlal.u16   q9, d7, d30
+    vmlal.u16   q10, d3, d31
+    vshr.u16    q15, q12, #8  /* weights for pixels 0-1 of the next block */
+    vadd.u16    q12, q12, q13
+    vdup.32     d22, d22[0]  /* replicate the four mask bytes into both halves of d22 */
+    vshrn.u32   d18, q9, #16  /* keep bits 31..16: q9 = pixels 2-3 as 16-bit channels */
+    vshrn.u32   d19, q10, #16
+    vmovn.u16   d17, q9  /* q8 = four interpolated source pixels, packed 8-bit */
+    vld1.32     {d18, d19}, [OUT, :128]  /* four destination pixels */
+    pld         [OUT, PF_OFFS]  /* NOTE(review): the tail_head variant of this family uses #(prefetch_offset * 4) here; prefetch hint only */
+    vuzp.8      d16, d17  /* two unzips make source and destination planar, so each 4-byte group lines up with the 4 mask bytes */
+    vuzp.8      d18, d19
+    vuzp.8      d16, d17
+    vuzp.8      d18, d19
+    vmull.u8    q10, d16, d22  /* source channels * mask */
+    vmull.u8    q11, d17, d22
+    vrsra.u16   q10, q10, #8  /* vrsra + vrshrn: rounded division of the products by 255 */
+    vrsra.u16   q11, q11, #8
+    vrshrn.u16  d16, q10, #8
+    vrshrn.u16  d17, q11, #8
+    vdup.32     d22, d17[1]  /* byte 3 of the four masked source pixels (presumably alpha), replicated */
+    vmvn.8      d22, d22  /* 255 - alpha */
+    vmull.u8    q10, d18, d22  /* destination channels * (255 - alpha) */
+    vmull.u8    q11, d19, d22
+    vrshr.u16   q9, q10, #8  /* vrshr + vraddhn: rounded division by 255 */
+    vrshr.u16   q0, q11, #8
+    vraddhn.u16 d18, q9, q10
+    vraddhn.u16 d19, q0, q11
+    vqadd.u8    q9, q8, q9  /* saturating add: masked source + scaled destination */
+    vuzp.8      d18, d19  /* two more unzips restore the packed pixel layout */
+    vuzp.8      d18, d19
+    vst1.32     {d18, d19}, [OUT, :128]!  /* store four pixels and advance OUT by 16 bytes */
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail_head  /* software-pipelined loop body: deeply indented lines finish the previous block (tail stream), normally indented lines start the next one (head stream) */
+                                            vshll.u16   q9, d6, #8
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+                                            vshll.u16   q10, d2, #8
+    vld1.32     {d0}, [TMP1], STRIDE
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+                                            vmlsl.u16   q9, d6, d30
+                                            vmlsl.u16   q10, d2, d31
+    vld1.32     {d1}, [TMP1]
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+                                            vmlal.u16   q9, d7, d30
+                                            vmlal.u16   q10, d3, d31  /* last tail read of d2/d3; the head stream reloads them right after */
+    vld1.32     {d2}, [TMP2], STRIDE
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+                                            vshr.u16    q15, q12, #8  /* weights for pixels 0-1 of the block being started */
+                                            vadd.u16    q12, q12, q13
+    vld1.32     {d3}, [TMP2]
+                                            vdup.32     d22, d22[0]
+                                            vshrn.u32   d18, q9, #16
+                                            vshrn.u32   d19, q10, #16
+    vmull.u8    q2, d0, d28
+    vmull.u8    q3, d2, d28
+                                            vmovn.u16   d17, q9
+                                            vld1.32     {d18, d19}, [OUT, :128]
+                                            pld         [OUT, #(prefetch_offset * 4)]  /* NOTE(review): the stand-alone tail uses PF_OFFS here; prefetch hint only */
+    vmlal.u8    q2, d1, d29
+    vmlal.u8    q3, d3, d29
+                                            vuzp.8      d16, d17
+                                            vuzp.8      d18, d19
+    vshll.u16   q0, d4, #8
+    vshll.u16   q1, d6, #8
+                                            vuzp.8      d16, d17
+                                            vuzp.8      d18, d19
+    vmlsl.u16   q0, d4, d30
+    vmlsl.u16   q1, d6, d31
+                                            vmull.u8    q10, d16, d22
+                                            vmull.u8    q11, d17, d22
+    vmlal.u16   q0, d5, d30
+    vmlal.u16   q1, d7, d31
+                                            vrsra.u16   q10, q10, #8
+                                            vrsra.u16   q11, q11, #8
+    vshrn.u32   d0, q0, #16
+    vshrn.u32   d1, q1, #16
+                                            vrshrn.u16  d16, q10, #8
+                                            vrshrn.u16  d17, q11, #8
+    vld1.32     {d2}, [TMP3], STRIDE
+                                            vdup.32     d22, d17[1]
+    vld1.32     {d3}, [TMP3]
+                                            vmvn.8      d22, d22
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d4}, [TMP4], STRIDE
+                                            vmull.u8    q10, d18, d22
+                                            vmull.u8    q11, d19, d22
+    vld1.32     {d5}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q3, d2, d28
+                                            vrshr.u16   q9, q10, #8
+                                            vrshr.u16   q15, q11, #8  /* q15 as scratch (the stand-alone tail uses q0): q0 still holds next-block pixels until the vmovn below, and q15 is recomputed shortly */
+    vmlal.u8    q3, d3, d29
+    vmull.u8    q1, d4, d28
+                                            vraddhn.u16 d18, q9, q10
+                                            vraddhn.u16 d19, q15, q11
+    vmlal.u8    q1, d5, d29
+    vshr.u16    q15, q12, #8  /* weights for pixels 2-3, consumed by the next tail */
+                                            vqadd.u8    q9, q8, q9  /* last tail read of q8 (d16/d17) before the head stream overwrites d16 */
+    vld1.32     {d22[0]}, [MASK]!  /* next four mask bytes; NOTE(review): unlike the stand-alone head, no pld of MASK follows */
+                                            vuzp.8      d18, d19
+    vadd.u16    q12, q12, q13
+                                            vuzp.8      d18, d19
+    vmovn.u16   d16, q0
+                                            vst1.32     {d18, d19}, [OUT, :128]!  /* store the finished block and advance OUT by 16 bytes */
+.endm
+
+/* add_8888_8888 */
+.macro bilinear_add_8888_8888_process_last_pixel  /* one-pixel ADD step via the generic helper; mask argument "x" presumably means no mask (this family is generated with flags 0) */
+    bilinear_interpolate_last_pixel 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_two_pixels  /* two-pixel ADD step via the generic helper, no mask ("x") */
+    bilinear_interpolate_two_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_four_pixels  /* four-pixel ADD step via the generic helper; also reused as the pixblock head of this family */
+    bilinear_interpolate_four_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_head  /* no hand-scheduled split for this family: the head is the whole four-pixel helper */
+    bilinear_add_8888_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail  /* intentionally empty: the matching head macro expands the whole four-pixel helper */
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail_head  /* unscheduled loop body: (empty) tail followed by head */
+    bilinear_add_8888_8888_process_pixblock_tail
+    bilinear_add_8888_8888_process_pixblock_head
+.endm
+
+/* add_8888_8_8888 */
+.macro bilinear_add_8888_8_8888_process_last_pixel  /* one-pixel masked ADD step via the generic helper; args presumably src fmt, mask fmt, dst fmt, operator - confirm against the helper */
+    bilinear_interpolate_last_pixel 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_two_pixels  /* two-pixel masked ADD step via the generic helper */
+    bilinear_interpolate_two_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_four_pixels  /* four-pixel masked ADD step via the generic helper; also reused as the pixblock head of this family */
+    bilinear_interpolate_four_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_head  /* no hand-scheduled split for this family: the head is the whole four-pixel helper */
+    bilinear_add_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail  /* intentionally empty: the matching head macro expands the whole four-pixel helper */
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail_head  /* unscheduled loop body: (empty) tail followed by head */
+    bilinear_add_8888_8_8888_process_pixblock_tail
+    bilinear_add_8888_8_8888_process_pixblock_head
+.endm
+
+
+/* Bilinear scanline functions */
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
+    8888, 8888, 2, 2, /* presumably src fmt, dst fmt, then log2(bytes per pixel) of each - confirm against the macro definition */ \
+    bilinear_src_8888_8_8888_process_last_pixel, \
+    bilinear_src_8888_8_8888_process_two_pixels, \
+    bilinear_src_8888_8_8888_process_four_pixels, \
+    bilinear_src_8888_8_8888_process_pixblock_head, \
+    bilinear_src_8888_8_8888_process_pixblock_tail, \
+    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK /* NOTE(review): 4 and 28 are presumably pixels per block and prefetch distance - confirm; the flag selects the masked variant */
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
+    8888, 0565, 2, 1, /* presumably src fmt, dst fmt, then log2(bytes per pixel) of each (4-byte source, 2-byte destination) - confirm against the macro definition */ \
+    bilinear_src_8888_8_0565_process_last_pixel, \
+    bilinear_src_8888_8_0565_process_two_pixels, \
+    bilinear_src_8888_8_0565_process_four_pixels, \
+    bilinear_src_8888_8_0565_process_pixblock_head, \
+    bilinear_src_8888_8_0565_process_pixblock_tail, \
+    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK /* NOTE(review): 4 and 28 are presumably pixels per block and prefetch distance - confirm; the flag selects the masked variant */
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
+    0565, 8888, 1, 2, /* x888 destination is declared as 8888 here; presumably src fmt, dst fmt, then log2(bytes per pixel) of each - confirm */ \
+    bilinear_src_0565_8_x888_process_last_pixel, \
+    bilinear_src_0565_8_x888_process_two_pixels, \
+    bilinear_src_0565_8_x888_process_four_pixels, \
+    bilinear_src_0565_8_x888_process_pixblock_head, \
+    bilinear_src_0565_8_x888_process_pixblock_tail, \
+    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK /* NOTE(review): 4 and 28 are presumably pixels per block and prefetch distance - confirm; the flag selects the masked variant */
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
+    0565, 0565, 1, 1, /* presumably src fmt, dst fmt, then log2(bytes per pixel) of each - confirm against the macro definition */ \
+    bilinear_src_0565_8_0565_process_last_pixel, \
+    bilinear_src_0565_8_0565_process_two_pixels, \
+    bilinear_src_0565_8_0565_process_four_pixels, \
+    bilinear_src_0565_8_0565_process_pixblock_head, \
+    bilinear_src_0565_8_0565_process_pixblock_tail, \
+    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK /* NOTE(review): 4 and 28 are presumably pixels per block and prefetch distance - confirm; the flag selects the masked variant */
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
+    8888, 8888, 2, 2, /* presumably src fmt, dst fmt, then log2(bytes per pixel) of each - confirm against the macro definition */ \
+    bilinear_over_8888_8888_process_last_pixel, \
+    bilinear_over_8888_8888_process_two_pixels, \
+    bilinear_over_8888_8888_process_four_pixels, \
+    bilinear_over_8888_8888_process_pixblock_head, \
+    bilinear_over_8888_8888_process_pixblock_tail, \
+    bilinear_over_8888_8888_process_pixblock_tail_head, \
+    4, 28, 0 /* NOTE(review): 4 and 28 are presumably pixels per block and prefetch distance - confirm; flags 0 = no mask */
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
+    8888, 8888, 2, 2, /* presumably src fmt, dst fmt, then log2(bytes per pixel) of each - confirm against the macro definition */ \
+    bilinear_over_8888_8_8888_process_last_pixel, \
+    bilinear_over_8888_8_8888_process_two_pixels, \
+    bilinear_over_8888_8_8888_process_four_pixels, \
+    bilinear_over_8888_8_8888_process_pixblock_head, \
+    bilinear_over_8888_8_8888_process_pixblock_tail, \
+    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK /* NOTE(review): 4 and 28 are presumably pixels per block and prefetch distance - confirm; the flag selects the masked variant */
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
+    8888, 8888, 2, 2, /* presumably src fmt, dst fmt, then log2(bytes per pixel) of each - confirm against the macro definition */ \
+    bilinear_add_8888_8888_process_last_pixel, \
+    bilinear_add_8888_8888_process_two_pixels, \
+    bilinear_add_8888_8888_process_four_pixels, \
+    bilinear_add_8888_8888_process_pixblock_head, \
+    bilinear_add_8888_8888_process_pixblock_tail, \
+    bilinear_add_8888_8888_process_pixblock_tail_head, \
+    4, 28, 0 /* NOTE(review): 4 and 28 are presumably pixels per block and prefetch distance - confirm; flags 0 = no mask */
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
+    8888, 8888, 2, 2, /* presumably src fmt, dst fmt, then log2(bytes per pixel) of each - confirm against the macro definition */ \
+    bilinear_add_8888_8_8888_process_last_pixel, \
+    bilinear_add_8888_8_8888_process_two_pixels, \
+    bilinear_add_8888_8_8888_process_four_pixels, \
+    bilinear_add_8888_8_8888_process_pixblock_head, \
+    bilinear_add_8888_8_8888_process_pixblock_tail, \
+    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK /* NOTE(review): 4 and 28 are presumably pixels per block and prefetch distance - confirm; the flag selects the masked variant */
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
new file mode 100644
index 0000000..87aae1d
--- /dev/null
+++ b/pixman/pixman-arm-neon-asm.S
@@ -0,0 +1,3636 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for following functions:
+ *  - pixman_composite_over_8888_0565_asm_neon
+ *  - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .fpu neon  /* accept NEON instructions */
+    .arch armv7a
+    .object_arch armv4  /* record a lower architecture in the object attributes than the one assembled for */
+    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
+    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
+    .arm  /* ARM (not Thumb) instruction encoding */
+    .altmacro  /* alternate macro syntax - NOTE(review): presumably required by the templates in pixman-arm-neon-asm.h */
+    .p2align 2  /* align to 4 bytes */
+
+#include "pixman-arm-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance of handling leading/trailing pixels for each scanline.
+ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
+ * example in linux if unaligned memory accesses are not configured to
+ * generate exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1  /* 1 = conservative default: do not rely on unaligned memory accesses */
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to workaround some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch introduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of function can't support advanced prefetch and fall back
+ *       to the simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED  /* NOTE(review): the PREFETCH_TYPE_* values and the consumer of this default are defined outside this view */
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64  /* in pixels; NOTE(review): the consumer is outside this view */
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
+ * performs OVER compositing operation. Function fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * Template macro quite conveniently takes care of emitting all the necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * NEON registers allocation in general is recommended to be the following:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
+ * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
+ * d28, d29, d30, d31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following registers allocation:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5            - contain loaded destination pixels (they are needed)
+ * d28, d29           - place for storing the result (destination pixels)
+ */
+
+/*
+ * Step one. We need to have some code to do some arithmetic on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
+ * perform all the needed calculations and write the result to {d28, d29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolithic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. Common template macro can optionally
+ * make our life a bit easier by doing R, G, B, A color components
+ * deinterleaving for 32bpp pixel formats (and this feature is used in
+ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
+ * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
+ * actually use d0 register for blue channel (a vector of eight 8-bit
+ * values), d1 register for green, d2 for red and d3 for alpha. This
+ * simple conversion can be also done with a few NEON instructions:
+ *
+ * Packed to planar conversion:
+ *  vuzp.8 d0, d1
+ *  vuzp.8 d2, d3
+ *  vuzp.8 d1, d3
+ *  vuzp.8 d0, d2
+ *
+ * Planar to packed conversion:
+ *  vzip.8 d0, d2
+ *  vzip.8 d1, d3
+ *  vzip.8 d2, d3
+ *  vzip.8 d0, d1
+ *
+ * But pixel can be loaded directly in planar format using VLD4.8 NEON
+ * instruction. It is 1 cycle slower than VLD1.32, so this is not always
+ * desirable, that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8  /* top byte of each pixel: rrrrrggg */
+    vshrn.u16   d7, q2, #3  /* bits 10..3 of each pixel: ggggggbb */
+    vsli.u16    q2, q2, #5  /* shift left by 5 keeping the low 5 bits: blue now also occupies bits 9..5 */
+    vsri.u8     d6, d6, #5  /* overwrite the ggg bits with the top 3 red bits -> 8-bit red */
+    vmvn.8      d3, d3      /* invert source alpha */
+    vsri.u8     d7, d7, #6  /* overwrite the bb bits with the top 2 green bits -> 8-bit green */
+    vshrn.u16   d30, q2, #2  /* bits 9..2: blue with its top bits replicated -> 8-bit blue */
+    /* now do alpha blending: this macro leaves dst * (255 - alpha) / 255 in d20 - red, d23 - green,
+       d22 - blue; the tail macro then adds the source, producing d16 - red, d19 - green, d18 - blue */
+    vmull.u8    q10, d3, d6  /* 16-bit products of inverted alpha with red, green, blue */
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8  /* vrshr + vraddhn: rounded division of each product by 255 */
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20  /* red: source (d2) + scaled destination (d20), saturating */
+    vqadd.u8    q9, q0, q11  /* d18 = d0 + d22 (blue), d19 = d1 + d23 (green), saturating */
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8  /* move each 8-bit channel to the top byte of a 16-bit lane */
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5  /* insert green below the top 5 (red) bits */
+    vsri.u16    q14, q9, #11  /* insert blue into the low 5 bits */
+.endm
+
+/*
+ * OK, now we got almost everything that we need. Using the above two
+ * macros, the work can be done right. But now we want to optimize
+ * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
+ * a lot from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ *   head
+ *   while (...) {
+ *     tail
+ *     head
+ *   }
+ *   tail
+ *
+ * It may look a bit weird, but this setup makes it possible to hide instruction
+ * latencies better and also utilize dual-issue capability more
+ * efficiently (make pairs of load-store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ *   pixman_composite_over_8888_0565_process_pixblock_tail
+ *   vst1.16     {d28, d29}, [DST_W, :128]!
+ *   vld1.16     {d4, d5}, [DST_R, :128]!
+ *   vld4.32     {d0, d1, d2, d3}, [SRC]!
+ *   pixman_composite_over_8888_0565_process_pixblock_head
+ *   cache_preload 8, 8
+ *
+ * Now it also got some VLD/VST instructions. We simply can't move from
+ * processing one block of pixels to the other one with just arithmetic.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in a bulk. Additionally, destination buffer is already
+ * 16 bytes aligned here (which is good for performance).
+ *
+ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
+ * are the aliases for ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is 'cache_preload' macro. It is used for prefetching
+ * data into CPU L2 cache and improve performance when dealing with large
+ * images which are far larger than cache size. It uses one argument
+ * (actually two, but they need to be the same here) - number of pixels
+ * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
+ * details about this macro. Moreover, if good performance is needed
+ * the code from this macro needs to be copied into '*_tail_head' macro
+ * and mixed with the rest of code for optimal instructions scheduling.
+ * We are actually doing it below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originating from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few VLD/VST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head  /* scheduled loop body: 8-space indent = tail stream plus store, 4-space indent = head stream plus loads, far-right PF lines = prefetch stream */
+        vqadd.u8    d16, d2, d20
+    vld1.16     {d4, d5}, [DST_R, :128]!  /* next 8 destination pixels */
+        vqadd.u8    q9, q0, q11  /* last tail read of the source registers, so the next source block may be fetched after it */
+    vshrn.u16   d6, q2, #8
+    fetch_src_pixblock  /* next source block; NOTE(review): macro defined outside this view, presumably reloads d0-d3 */
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+        vshll.u8    q14, d16, #8
+                                    PF add PF_X, PF_X, #8
+        vshll.u8    q8, d19, #8
+                                    PF tst PF_CTL, #0xF
+    vsri.u8     d6, d6, #5
+                                    PF addne PF_X, PF_X, #8
+    vmvn.8      d3, d3  /* invert the newly fetched source alpha */
+                                    PF subne PF_CTL, PF_CTL, #1
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    vmull.u8    q10, d3, d6
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vsri.u16    q14, q8, #5
+                                    PF cmp PF_X, ORIG_W
+        vshll.u8    q9, d18, #8
+    vrshr.u16   q13, q10, #8
+                                    PF subge PF_X, PF_X, ORIG_W
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+                                    PF subges PF_CTL, PF_CTL, #0x10
+        vsri.u16    q14, q9, #11
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vraddhn.u16 d22, q12, q15
+        vst1.16     {d28, d29}, [DST_W, :128]!  /* store the 8 finished r5g6b5 pixels of the previous block */
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head  /* unscheduled reference version, compiled only when the "#if 1" above is changed to 0 */
+    pixman_composite_over_8888_0565_process_pixblock_tail  /* finish the previous block into {d28, d29} */
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* store the 8 finished pixels */
+    vld1.16     {d4, d5}, [DST_R, :128]!  /* load the next 8 destination pixels */
+    fetch_src_pixblock  /* load the next source block; NOTE(review): macro defined outside this view */
+    pixman_composite_over_8888_0565_process_pixblock_head  /* start processing the newly loaded block */
+    cache_preload 8, 8  /* prefetch; both arguments are the block size in pixels */
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using 'generate_composite_function' macro
+ * to put all the stuff together. We are specifying the name of the function
+ * which we want to get, number of bits per pixel for the source, mask and
+ * destination (0 if unused, like mask in this case). Next come some bit
+ * flags:
+ *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
+ *                             and written, for write-only buffer we would use
+ *                             FLAG_DST_WRITEONLY flag instead
+ *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ *                             and separate color channels for 32bpp format.
+ * The next things are:
+ *  - the number of pixels processed per iteration (8 in this case, because
+ *    that's the maximum that can fit into four 64-bit NEON registers).
+ *  - prefetch distance, measured in pixel blocks. In this case it is 5 blocks
+ *    of 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
+ *    prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros, these are 'default_init',
+ * 'default_cleanup' here which are empty (but it is possible to have custom
+ * init/cleanup macros to be able to save/restore some extra NEON registers
+ * like d8-d15 or do anything else) followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON registers allocation scheme.
+ */
+generate_composite_function \
+    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, /* name; src, mask, dst bits per pixel (0 = unused) */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance, in pixel blocks */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg: the result is stored from {d28, d29} */ \
+    4,  /* dst_r_basereg: the destination is loaded into {d4, d5} */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8      /* bits 15..8: r5 in the top of each byte */
+    vshrn.u16   d7, q2, #3      /* bits 10..3: g6 in the top of each byte */
+    vsli.u16    q2, q2, #5      /* copy b5 up to bits 9..5; the low 5 bits are kept */
+    vsri.u8     d6, d6, #5      /* fill the low 3 bits from the top 3 red bits */
+    vsri.u8     d7, d7, #6      /* fill the low 2 bits from the top 2 green bits */
+    vshrn.u16   d30, q2, #2     /* bits 9..2: b5 followed by its own top 3 bits */
+    /* now scale the destination by d3 (inverted source alpha); this macro leaves
+       d20 - red, d23 - green, d22 - blue, the tail then forms d16, d19, d18 */
+    vmull.u8    q10, d3, d6     /* 16-bit products x, one q register per channel */
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8    /* t = (x + 128) >> 8 */
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13   /* (x + t + 128) >> 8, narrowed to 8 bits */
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20    /* red: saturating add of d2 and the scaled destination */
+    vqadd.u8    q9, q0, q11     /* {d18, d19} = {d0, d1} + {d22, d23}: blue and green */
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8    /* red into the top byte of each 16-bit lane */
+    vshll.u8    q8, d19, #8     /* green, same position, in a scratch register */
+    vshll.u8    q9, d18, #8     /* blue, same position, in a scratch register */
+    vsri.u16    q14, q8, #5     /* insert green below the top 5 (red) bits */
+    vsri.u16    q14, q9, #11    /* insert blue into the low 5 bits */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+    pixman_composite_over_n_0565_process_pixblock_tail    /* finish the block in flight */
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* read the next 8 destination pixels */
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* write the 8 finished pixels */
+    pixman_composite_over_n_0565_process_pixblock_head    /* start work on the next block */
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* address of the stack argument to load */
+    vld1.32     {d3[0]}, [DUMMY]    /* load one 32-bit value into lane 0 of d3 */
+    vdup.8      d0, d3[0]           /* replicate byte 0 across d0 */
+    vdup.8      d1, d3[1]           /* replicate byte 1 across d1 */
+    vdup.8      d2, d3[2]           /* replicate byte 2 across d2 */
+    vdup.8      d3, d3[3]           /* byte 3 goes last: this overwrites the loaded value */
+    vmvn.8      d3, d3      /* invert source alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, /* src bpp 0: the init macro loads the colour from the stack instead */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_0565_init, \
+    default_cleanup, \
+    pixman_composite_over_n_0565_process_pixblock_head, \
+    pixman_composite_over_n_0565_process_pixblock_tail, \
+    pixman_composite_over_n_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg: the result is stored from {d28, d29} */ \
+    4,  /* dst_r_basereg: the destination is loaded into {d4, d5} */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+    vshll.u8    q8, d1, #8      /* d1 widened into the top byte of each 16-bit lane */
+    vshll.u8    q14, d2, #8     /* d2 likewise; q14 becomes the packed result */
+    vshll.u8    q9, d0, #8      /* d0 likewise */
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+    vsri.u16    q14, q8, #5     /* keep the top 5 bits of q14, insert q8 below them */
+    vsri.u16    q14, q9, #11    /* keep the top 11 bits, insert q9 into the low 5 */
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+        vsri.u16    q14, q8, #5     /* 8-space indent: same instructions as the tail macro */
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    fetch_src_pixblock              /* next source block; macro defined outside this chunk */
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vsri.u16    q14, q9, #11
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vshll.u8    q8, d1, #8          /* 4-space indent: same instructions as the head macro */
+        vst1.16     {d28, d29}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vshll.u8    q14, d2, #8         /* q14 is rewritten only after the store above */
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vshll.u8    q9, d0, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, /* name; src, mask, dst bits per pixel */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head /* no register base arguments are passed here */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+    vshrn.u16   d30, q0, #8     /* bits 15..8 of each 16-bit pixel */
+    vshrn.u16   d29, q0, #3     /* bits 10..3 of each 16-bit pixel */
+    vsli.u16    q0, q0, #5      /* copy the low 5 bits up to bits 9..5 */
+    vmov.u8     d31, #255       /* constant 255 in every byte of d31 */
+    vsri.u8     d30, d30, #5    /* fill the low 3 bits from the top 3 bits */
+    vsri.u8     d29, d29, #6    /* fill the low 2 bits from the top 2 bits */
+    vshrn.u16   d28, q0, #2     /* bits 9..2: the 5-bit field plus its top 3 bits */
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail   /* empty: the head does all the work */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+    pixman_composite_src_0565_8888_process_pixblock_tail       /* expands to nothing */
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!    /* interleaving store of the four planes */
+    fetch_src_pixblock                                 /* next source block; macro defined outside this chunk */
+    pixman_composite_src_0565_8888_process_pixblock_head       /* convert the next block */
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, /* name; src, mask, dst bits per pixel */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head /* no register base arguments are passed here */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_process_pixblock_head
+    vqadd.u8    q14, q0, q2     /* per-byte saturating add, first 16 bytes */
+    vqadd.u8    q15, q1, q3     /* per-byte saturating add, second 16 bytes */
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail   /* empty: the head does all the work */
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
+    fetch_src_pixblock              /* next source block; macro defined outside this chunk */
+                                    PF add PF_X, PF_X, #32
+                                    PF tst PF_CTL, #0xF
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* next 32 destination bytes */
+                                    PF addne PF_X, PF_X, #32
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the previous block's sums */
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2         /* same two adds as the head macro, after the store */
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_asm_neon, 8, 0, 8, /* name; src, mask, dst bits per pixel */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_tail_head /* no register base arguments are passed here */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+    fetch_src_pixblock              /* next source block; macro defined outside this chunk */
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!    /* next 32 destination bytes */
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!    /* store the previous block's sums */
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2         /* per-byte saturating adds, same as the add_8_8 head */
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, /* name; src, mask, dst bits per pixel */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, /* head and tail are shared with add_8_8 */ \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_asm_neon, 32, 0, 32, /* same macros as add_8888_8888; no prefetch distance argument */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* do alpha blending */
+    vmull.u8    q8, d24, d4     /* 16-bit products of d24 with each of d4..d7 */
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8     /* t = (x + 128) >> 8 for each product x */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8    /* q12 is {d24, d25}: this overwrites the inverted alpha */
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8    /* (x + t + 128) >> 8, narrowed into d28..d31 */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* de-interleaving load of the next destination block */
+        vrshr.u16   q14, q8, #8     /* 8-space indent: same instructions as the tail macro */
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+    fetch_src_pixblock              /* next source block; macro defined outside this chunk */
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmvn.8      d22, d3             /* inverted alpha goes to d22 here; the head macro uses d24 */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7        /* q11 is {d22, d23}: last use of d22, so overwriting it is fine */
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, /* name; src, mask, dst bits per pixel */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+    pixman_composite_out_reverse_8888_8888_process_pixblock_head   /* same first stage as out_reverse */
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail   /* leaves the scaled values in d28..d31 */
+    vqadd.u8    q14, q0, q14    /* saturating add of {d0, d1} onto {d28, d29} */
+    vqadd.u8    q15, q1, q15    /* saturating add of {d2, d3} onto {d30, d31} */
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* de-interleaving load of the next destination block */
+        vrshr.u16   q14, q8, #8     /* 8-space indent: same instructions as the tail macro */
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14    /* q0/q1 are read here, before fetch_src_pixblock below */
+        vqadd.u8    q15, q1, q15
+    fetch_src_pixblock              /* next source block; macro defined outside this chunk */
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmvn.8      d22, d3             /* inverted alpha goes to d22 here; the head macro uses d24 */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7        /* q11 is {d22, d23}: last use of d22, so overwriting it is fine */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, /* name; src, mask, dst bits per pixel */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_asm_neon, 32, 0, 32, /* same macros as over_8888_8888; no prefetch distance argument */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_process_pixblock_head
+    /* deinterleaved source pixels in {d0, d1, d2, d3} */
+    /* inverted alpha in {d24} */
+    /* destination pixels in {d4, d5, d6, d7} */
+    vmull.u8    q8, d24, d4     /* 16-bit products of d24 with each destination plane */
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8     /* t = (x + 128) >> 8 for each product x */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q2, q10, #8     /* q2/q3 are the scratch registers here, so d24 is left intact */
+    vrshr.u16   q3, q11, #8
+    vraddhn.u16 d28, q14, q8    /* (x + t + 128) >> 8, narrowed into d28..d31 */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q2, q10
+    vraddhn.u16 d31, q3, q11
+    vqadd.u8    q14, q0, q14    /* saturating add of {d0, d1} onto {d28, d29} */
+    vqadd.u8    q15, q1, q15    /* saturating add of {d2, d3} onto {d30, d31} */
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8     /* 8-space indent: finish the block in flight */
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q2, q10, #8     /* q2/q3 used as scratch before the reload below */
+        vrshr.u16   q3, q11, #8
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+        vraddhn.u16 d30, q2, q10
+        vraddhn.u16 d31, q3, q11
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* reload d4..d7 with the next destination block */
+        vqadd.u8    q14, q0, q14
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0x0F
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vqadd.u8    q15, q1, q15
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q8, d24, d4         /* 4-space indent: products for the next block, d24 unchanged */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+    vmull.u8    q9, d24, d5
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q10, d24, d6
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q11, d24, d7
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* address of the stack argument to load */
+    vld1.32     {d3[0]}, [DUMMY]    /* load one 32-bit value into lane 0 of d3 */
+    vdup.8      d0, d3[0]           /* replicate byte 0 across d0 */
+    vdup.8      d1, d3[1]           /* replicate byte 1 across d1 */
+    vdup.8      d2, d3[2]           /* replicate byte 2 across d2 */
+    vdup.8      d3, d3[3]           /* byte 3 goes last: this overwrites the loaded value */
+    vmvn.8      d24, d3  /* get inverted alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, /* src bpp 0: the init macro loads the colour from the stack instead */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, /* NOTE(review): over_8888_8888 head/tail, not the over_n_8888 head/tail defined above - confirm intended */ \
+    pixman_composite_over_8888_8888_process_pixblock_tail, /* this tail overwrites d24; that head recomputes d24 from d3 */ \
+    pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8     /* 8-space indent: finish the block in flight */
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14    /* q0/q1 are read here, before the reload below */
+        vqadd.u8    q15, q1, q15
+    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!    /* destination pixels go into d0..d3 in this function */
+    vmvn.8      d22, d3             /* inverted alpha of the block just loaded */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4         /* d4..d7 were filled by the init macro and are not reloaded */
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7        /* q11 is {d22, d23}: last use of d22, so overwriting it is fine */
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* address of the stack argument to load */
+    vld1.32     {d7[0]}, [DUMMY]    /* load one 32-bit value into lane 0 of d7 */
+    vdup.8      d4, d7[0]           /* replicate byte 0 across d4 */
+    vdup.8      d5, d7[1]           /* replicate byte 1 across d5 */
+    vdup.8      d6, d7[2]           /* replicate byte 2 across d6 */
+    vdup.8      d7, d7[3]           /* byte 3 goes last: this overwrites the loaded value */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, /* src bpp 0: the init macro loads the colour from the stack instead */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_reverse_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0,  /* dst_r_basereg: the destination is read into d0..d3 */ \
+    4,  /* src_basereg: the init macro places the colour in d4..d7 */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
+    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
+    vmull.u8    q1,  d24, d9
+    vmull.u8    q6,  d24, d10
+    vmull.u8    q7,  d24, d11
+        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
+        vshrn.u16   d7,  q2, #3
+        vsli.u16    q2,  q2, #5
+    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
+    vrshr.u16   q9,  q1,  #8
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q11, q7,  #8
+    vraddhn.u16 d0,  q0,  q8    /* narrowed results land in d0..d3 */
+    vraddhn.u16 d1,  q1,  q9
+    vraddhn.u16 d2,  q6,  q10
+    vraddhn.u16 d3,  q7,  q11
+        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
+        vsri.u8     d7,  d7, #6
+    vmvn.8      d3,  d3         /* invert the alpha produced by the IN step */
+        vshrn.u16   d30, q2, #2
+    vmull.u8    q8,  d3, d6     /* now do alpha blending */
+    vmull.u8    q9,  d3, d7
+    vmull.u8    q10, d3, d30
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
+    /* 3 cycle bubble (after vmull.u8) */
+    vrshr.u16   q13, q8,  #8    /* t = (x + 128) >> 8 for each product x */
+    vrshr.u16   q11, q9,  #8
+    vrshr.u16   q15, q10, #8
+    vraddhn.u16 d16, q8,  q13   /* (x + t + 128) >> 8, narrowed to 8 bits */
+    vraddhn.u16 d27, q9,  q11
+    vraddhn.u16 d26, q10, q15
+    vqadd.u8    d16, d2,  d16   /* saturating add of d2 onto the scaled destination */
+    /* 1 cycle bubble */
+    vqadd.u8    q9,  q0,  q13   /* {d18, d19} = {d0, d1} + {d26, d27} */
+    vshll.u8    q14, d16, #8    /* convert to 16bpp */
+    vshll.u8    q8,  d19, #8
+    vshll.u8    q9,  d18, #8
+    vsri.u16    q14, q8,  #5    /* keep the top 5 bits of q14, insert q8 below them */
+    /* 1 cycle bubble */
+    vsri.u16    q14, q9,  #11   /* keep the top 11 bits, insert q9 into the low 5 */
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* next 8 destination pixels into q2 */
+    vshrn.u16   d6,  q2,  #8
+    fetch_mask_pixblock         /* macro defined outside this chunk */
+    vshrn.u16   d7,  q2,  #3
+    fetch_src_pixblock          /* macro defined outside this chunk */
+    vmull.u8    q6,  d24, d10
+        vrshr.u16   q13, q8,  #8    /* 8-space indent: same instructions as the tail macro */
+        vrshr.u16   q11, q9,  #8
+        vrshr.u16   q15, q10, #8
+        vraddhn.u16 d16, q8,  q13
+        vraddhn.u16 d27, q9,  q11
+        vraddhn.u16 d26, q10, q15
+        vqadd.u8    d16, d2,  d16
+    vmull.u8    q1,  d24, d9        /* q1 = {d2, d3}: written only after d2 was read just above */
+        vqadd.u8    q9,  q0,  q13
+        vshll.u8    q14, d16, #8
+    vmull.u8    q0,  d24, d8        /* q0 is written only after the vqadd above has read it */
+        vshll.u8    q8,  d19, #8
+        vshll.u8    q9,  d18, #8
+        vsri.u16    q14, q8,  #5
+    vmull.u8    q7,  d24, d11
+        vsri.u16    q14, q9,  #11
+
+    cache_preload 8, 8
+
+    vsli.u16    q2,  q2,  #5        /* from here on: remainder of the head macro's work */
+    vrshr.u16   q8,  q0,  #8
+    vrshr.u16   q9,  q1,  #8
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q11, q7,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q9
+    vraddhn.u16 d2,  q6,  q10
+    vraddhn.u16 d3,  q7,  q11
+    vsri.u8     d6,  d6,  #5
+    vsri.u8     d7,  d7,  #6
+    vmvn.8      d3,  d3
+    vshrn.u16   d30, q2,  #2
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* store the block finished in the first half of this macro */
+    vmull.u8    q8,  d3,  d6
+    vmull.u8    q9,  d3,  d7
+    vmull.u8    q10, d3,  d30
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, /* name; src, mask, dst bits per pixel */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg: the head multiplies d8..d11 by the mask in d24 */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * This function needs a special initialization of the solid source.
+ * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
+ * offset, split into color components and replicated in d8-d11
+ * registers. Additionally, this function needs all the NEON registers,
+ * so it has to save d8-d15 registers which are callee saved according
+ * to ABI. These registers are restored from 'cleanup' macro. All the
+ * other NEON registers are caller saved, so can be clobbered freely
+ * without introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* taken before vpush moves sp */
+    vpush       {d8-d15}            /* save d8-d15; restored by the cleanup macro */
+    vld1.32     {d11[0]}, [DUMMY]   /* load one 32-bit value into lane 0 of d11 */
+    vdup.8      d8, d11[0]          /* replicate byte 0 across d8 */
+    vdup.8      d9, d11[1]          /* replicate byte 1 across d9 */
+    vdup.8      d10, d11[2]         /* replicate byte 2 across d10 */
+    vdup.8      d11, d11[3]         /* byte 3 goes last: this overwrites the loaded value */
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+    vpop        {d8-d15}            /* undo the vpush done in the init macro */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, /* src bpp 0: the init macro places the colour in d8..d11 */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_0565_init, \
+    pixman_composite_over_n_8_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, /* pixel macros are shared with over_8888_8_0565 */ \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)  /* taken before vpush moves sp */
+    vpush       {d8-d15}            /* save d8-d15; restored by the cleanup macro */
+    vld1.32     {d24[0]}, [DUMMY]   /* load one 32-bit value into lane 0 of d24 */
+    vdup.8      d24, d24[3]         /* keep only byte 3, replicated across d24 */
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+    vpop        {d8-d15}            /* undo the vpush done in the init macro */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, /* mask bpp 0: the init macro fills d24 from the stack instead */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_0565_init, \
+    pixman_composite_over_8888_n_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, /* pixel macros are shared with over_8888_8_0565 */ \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head   /* empty: nothing to compute */
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail   /* empty: nothing to compute */
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!   /* store d0..d3 unmodified */
+    fetch_src_pixblock                         /* next source block; macro defined outside this chunk */
+    cache_preload 16, 16
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, /* name; src, mask, dst bits per pixel */ \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_0565_process_pixblock_head, \
+    pixman_composite_src_0565_0565_process_pixblock_tail, \
+    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg: same base as src, the data is stored from d0..d3 */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head   /* empty: nothing to compute */
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail   /* empty: nothing to compute */
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head
+    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!   /* store the 32 bytes held in d0..d3 */
+.endm
+
+.macro pixman_composite_src_n_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* address of the stack argument to load */
+    vld1.32     {d0[0]}, [DUMMY]    /* load one 32-bit value into lane 0 of d0 */
+    vsli.u64    d0, d0, #8          /* byte 0 copied into byte 1 */
+    vsli.u64    d0, d0, #16         /* bytes 0..1 copied into bytes 2..3 */
+    vsli.u64    d0, d0, #32         /* low word copied into the high word */
+    vorr        d1, d0, d0          /* d1 = d0 */
+    vorr        q1, q0, q0          /* q1 = q0, so d0..d3 all hold the fill byte */
+.endm
+
+.macro pixman_composite_src_n_8_cleanup   /* empty: init saved nothing */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_asm_neon, 0, 0, 8, /* src bpp 0: the init macro fills d0..d3 from the stack instead */ \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_8_init, \
+    pixman_composite_src_n_8_cleanup, \
+    pixman_composite_src_n_8_process_pixblock_head, \
+    pixman_composite_src_n_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_process_pixblock_tail_head, \
+    0, /* dst_w_basereg: the fill pattern is stored from d0..d3 */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head   /* empty: nothing to compute */
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail   /* empty: nothing to compute */
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!   /* store the 16 halfwords held in d0..d3 */
+.endm
+
+.macro pixman_composite_src_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* address of the stack argument to load */
+    vld1.32     {d0[0]}, [DUMMY]    /* load one 32-bit value into lane 0 of d0 */
+    vsli.u64    d0, d0, #16         /* low halfword copied into bits 16..31 */
+    vsli.u64    d0, d0, #32         /* low word copied into the high word */
+    vorr        d1, d0, d0          /* d1 = d0 */
+    vorr        q1, q0, q0          /* q1 = q0, so d0..d3 all hold the fill halfword */
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup   /* empty: init saved nothing */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, /* src bpp 0: the init macro fills d0..d3 from the stack instead */ \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_0565_init, \
+    pixman_composite_src_n_0565_cleanup, \
+    pixman_composite_src_n_0565_process_pixblock_head, \
+    pixman_composite_src_n_0565_process_pixblock_tail, \
+    pixman_composite_src_n_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg: the fill pattern is stored from d0..d3 */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head   /* empty: nothing to compute */
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail   /* empty: nothing to compute */
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!   /* store the 8 words held in d0..d3 */
+.endm
+
+.macro pixman_composite_src_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* address of the stack argument to load */
+    vld1.32     {d0[0]}, [DUMMY]    /* load one 32-bit value into lane 0 of d0 */
+    vsli.u64    d0, d0, #32         /* low word copied into the high word */
+    vorr        d1, d0, d0          /* d1 = d0 */
+    vorr        q1, q0, q0          /* q1 = q0, so d0..d3 all hold the fill word */
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup   /* empty: init saved nothing */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, /* src bpp 0: the init macro fills d0..d3 from the stack instead */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    0, /* prefetch distance */ \
+    pixman_composite_src_n_8888_init, \
+    pixman_composite_src_n_8888_cleanup, \
+    pixman_composite_src_n_8888_process_pixblock_head, \
+    pixman_composite_src_n_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg: the fill pattern is stored from d0..d3 */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head  /* empty: a plain copy needs no arithmetic */
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail  /* empty: a plain copy needs no arithmetic */
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head  /* store the current block, then fetch the next one */
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!  /* store d0-d3 (32 bytes) at DST_W, then advance DST_W */
+    fetch_src_pixblock  /* helper defined outside this chunk; presumably reloads the source registers (d0-d3 here, src_basereg = 0) */
+    cache_preload 8, 8  /* prefetch helper defined outside this chunk */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_8888_process_pixblock_head, \
+    pixman_composite_src_8888_8888_process_pixblock_tail, \
+    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg: the store above writes from d0 */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_x888_8888_process_pixblock_head  /* force the top byte of every 32-bit pixel in q0-q1 to 0xFF */
+    vorr     q0, q0, q2  /* q2 holds 0xFF000000 in each 32-bit lane (set up by the init macro) */
+    vorr     q1, q1, q2
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail  /* empty: head already produced the result in q0-q1 */
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head  /* store the current block, fetch and convert the next one */
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!  /* store d0-d3 (32 bytes) at DST_W, then advance DST_W */
+    fetch_src_pixblock  /* helper defined outside this chunk; presumably reloads d0-d3 from the source */
+    vorr     q0, q0, q2  /* same operation as the head macro, applied to the new block */
+    vorr     q1, q1, q2
+    cache_preload 8, 8  /* prefetch helper defined outside this chunk */
+.endm
+
+.macro pixman_composite_src_x888_8888_init  /* build the constant 0xFF000000 in every 32-bit lane of q2 */
+    vmov.u8  q2, #0xFF  /* all 16 bytes = 0xFF */
+    vshl.u32 q2, q2, #24  /* each 32-bit lane becomes 0xFF000000 */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_x888_8888_init, \
+    default_cleanup, \
+    pixman_composite_src_x888_8888_process_pixblock_head, \
+    pixman_composite_src_x888_8888_process_pixblock_tail, \
+    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg: the store above writes from d0 */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_head  /* first half of: result channel = solid channel * mask / 255 */
+    /* expecting solid source in {d0, d1, d2, d3}, one channel per register (see the init macro) */
+    /* mask is in d24 (d25, d26, d27 are unused) */
+
+    /* in */
+    vmull.u8    q8, d24, d0  /* 16-bit products x = mask * channel */
+    vmull.u8    q9, d24, d1
+    vmull.u8    q10, d24, d2
+    vmull.u8    q11, d24, d3
+    vrsra.u16   q8, q8, #8  /* x += (x + 128) >> 8; the tail macro finishes the rounded divide by 255 */
+    vrsra.u16   q9, q9, #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail  /* second half: round and narrow to 8 bits in d28-d31 */
+    vrshrn.u16  d28, q8, #8  /* (x + 128) >> 8, narrowed to bytes */
+    vrshrn.u16  d29, q9, #8
+    vrshrn.u16  d30, q10, #8
+    vrshrn.u16  d31, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head  /* tail of block N interleaved with head of block N+1 */
+    fetch_mask_pixblock  /* helper defined outside this chunk; presumably loads the next mask bytes into d24 */
+                                    PF add PF_X, PF_X, #8
+        vrshrn.u16  d28, q8, #8  /* deeper-indented instructions repeat the tail macro for the previous block */
+                                    PF tst PF_CTL, #0x0F
+        vrshrn.u16  d29, q9, #8
+                                    PF addne PF_X, PF_X, #8
+        vrshrn.u16  d30, q10, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vrshrn.u16  d31, q11, #8
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q8, d24, d0  /* 4-space-indented instructions repeat the head macro for the new block */
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q9, d24, d1
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q10, d24, d2
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q11, d24, d3
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!  /* interleave the four planes back into pixels and store 32 bytes */
+    vrsra.u16   q8, q8, #8
+    vrsra.u16   q9, q9, #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init  /* splat each byte of the 32-bit stack argument into its own register */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* address of the stack argument holding the solid source */
+    vld1.32     {d3[0]}, [DUMMY]  /* load 32 bits into lane 0 of d3 */
+    vdup.8      d0, d3[0]  /* d0 = byte 0 replicated 8 times */
+    vdup.8      d1, d3[1]  /* d1 = byte 1 replicated */
+    vdup.8      d2, d3[2]  /* d2 = byte 2 replicated */
+    vdup.8      d3, d3[3]  /* d3 = byte 3 replicated; done last because it overwrites the loaded value */
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup  /* empty: init pushed nothing that needs restoring */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_src_n_8_8888_init, \
+    pixman_composite_src_n_8_8888_cleanup, \
+    pixman_composite_src_n_8_8888_process_pixblock_head, \
+    pixman_composite_src_n_8_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8_8888_process_pixblock_tail_head /* last argument: no trailing comma or continuation, base registers keep the generator defaults */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8_process_pixblock_head  /* first half of: result = mask * d16 / 255, mask bytes in d24-d27 */
+    vmull.u8    q0, d24, d16  /* 16-bit products x = mask * d16 (d16 is set up by the init macro) */
+    vmull.u8    q1, d25, d16
+    vmull.u8    q2, d26, d16
+    vmull.u8    q3, d27, d16
+    vrsra.u16   q0, q0,  #8  /* x += (x + 128) >> 8; the tail macro finishes the rounded divide by 255 */
+    vrsra.u16   q1, q1,  #8
+    vrsra.u16   q2, q2,  #8
+    vrsra.u16   q3, q3,  #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail  /* second half: round and narrow to 8 bits in d28-d31 */
+    vrshrn.u16  d28, q0, #8  /* (x + 128) >> 8, narrowed to bytes */
+    vrshrn.u16  d29, q1, #8
+    vrshrn.u16  d30, q2, #8
+    vrshrn.u16  d31, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head  /* tail of block N interleaved with head of block N+1 */
+    fetch_mask_pixblock  /* helper defined outside this chunk; presumably loads the next mask bytes into d24-d27 */
+                                    PF add PF_X, PF_X, #8
+        vrshrn.u16  d28, q0, #8  /* deeper-indented instructions repeat the tail macro for the previous block */
+                                    PF tst PF_CTL, #0x0F
+        vrshrn.u16  d29, q1, #8
+                                    PF addne PF_X, PF_X, #8
+        vrshrn.u16  d30, q2, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vrshrn.u16  d31, q3, #8
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q0,  d24, d16  /* 4-space-indented instructions repeat the head macro for the new block */
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q1,  d25, d16
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q2,  d26, d16
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q3,  d27, d16
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!  /* store the 32 result bytes of the previous block */
+    vrsra.u16   q0, q0,  #8
+    vrsra.u16   q1, q1,  #8
+    vrsra.u16   q2, q2,  #8
+    vrsra.u16   q3, q3,  #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init  /* d16 = byte 3 of the 32-bit stack argument, replicated 8 times */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* address of a stack argument; presumably the solid source color - confirm against the C prototype */
+    vld1.32     {d16[0]}, [DUMMY]  /* load 32 bits into lane 0 of d16 */
+    vdup.8      d16, d16[3]  /* keep only byte 3 (the alpha byte, going by the d11[3] usage documented elsewhere in this file) */
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup  /* empty: init pushed nothing that needs restoring */
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_src_n_8_8_init, \
+    pixman_composite_src_n_8_8_cleanup, \
+    pixman_composite_src_n_8_8_process_pixblock_head, \
+    pixman_composite_src_n_8_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head  /* src = solid * mask / 255, then start dst * (255 - src alpha) */
+    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
+    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24 (d25, d26, d27 are unused) */
+
+    /* in */
+    vmull.u8    q6, d24, d8  /* 16-bit products x = mask * source channel */
+    vmull.u8    q7, d24, d9
+    vmull.u8    q8, d24, d10
+    vmull.u8    q9, d24, d11
+    vrshr.u16   q10, q6, #8  /* t = (x + 128) >> 8 */
+    vrshr.u16   q11, q7, #8
+    vrshr.u16   q12, q8, #8
+    vrshr.u16   q13, q9, #8
+    vraddhn.u16 d0, q6, q10  /* (x + t + 128) >> 8, i.e. rounded x / 255, narrowed to bytes */
+    vraddhn.u16 d1, q7, q11
+    vraddhn.u16 d2, q8, q12
+    vraddhn.u16 d3, q9, q13
+    vmvn.8      d25, d3  /* get inverted alpha */
+    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
+    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d25, d4  /* 16-bit products y = inverted alpha * destination channel */
+    vmull.u8    q9, d25, d5
+    vmull.u8    q10, d25, d6
+    vmull.u8    q11, d25, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail  /* finish y / 255 and add the masked source */
+    vrshr.u16   q14, q8, #8  /* t = (y + 128) >> 8 */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q6, q10, #8
+    vrshr.u16   q7, q11, #8
+    vraddhn.u16 d28, q14, q8  /* rounded y / 255, narrowed to bytes in d28-d31 */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q6, q10
+    vraddhn.u16 d31, q7, q11
+    vqadd.u8    q14, q0, q14  /* saturating add of the masked source (d0-d3) gives the result in d28-d31 */
+    vqadd.u8    q15, q1, q15
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head  /* tail of block N interleaved with head of block N+1 */
+        vrshr.u16   q14, q8, #8  /* deeper-indented instructions repeat the tail macro for the previous block */
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!  /* load the next 8 destination pixels, split into four channel planes */
+        vrshr.u16   q15, q9, #8
+    fetch_mask_pixblock  /* helper defined outside this chunk; presumably loads the next mask bytes into d24 */
+        vrshr.u16   q6, q10, #8
+                                    PF add PF_X, PF_X, #8
+        vrshr.u16   q7, q11, #8
+                                    PF tst PF_CTL, #0x0F
+        vraddhn.u16 d28, q14, q8
+                                    PF addne PF_X, PF_X, #8
+        vraddhn.u16 d29, q15, q9
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d30, q6, q10
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d31, q7, q11
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+    vmull.u8    q6, d24, d8  /* 4-space-indented instructions repeat the head macro for the new block */
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q7, d24, d9
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d24, d10
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d24, d11
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+        vqadd.u8    q14, q0, q14  /* uses the previous block's d0-d3, which are overwritten further down */
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vqadd.u8    q15, q1, q15
+    vrshr.u16   q10, q6, #8
+    vrshr.u16   q11, q7, #8
+    vrshr.u16   q12, q8, #8
+    vrshr.u16   q13, q9, #8
+    vraddhn.u16 d0, q6, q10
+    vraddhn.u16 d1, q7, q11
+    vraddhn.u16 d2, q8, q12
+    vraddhn.u16 d3, q9, q13
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!  /* interleave the four planes back into pixels and store the previous block */
+    vmvn.8      d25, d3  /* inverted alpha of the new block */
+    vmull.u8    q8, d25, d4
+    vmull.u8    q9, d25, d5
+    vmull.u8    q10, d25, d6
+    vmull.u8    q11, d25, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_init  /* save d8-d15 and splat the solid source bytes into d8-d11 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* computed before vpush, so the offset is relative to the unmodified sp */
+    vpush       {d8-d15}  /* the macros above overwrite q4-q7 (d8-d15); saved here, restored in cleanup */
+    vld1.32     {d11[0]}, [DUMMY]  /* load 32 bits into lane 0 of d11 */
+    vdup.8      d8, d11[0]  /* d8 = byte 0 replicated (blue) */
+    vdup.8      d9, d11[1]  /* d9 = byte 1 replicated (green) */
+    vdup.8      d10, d11[2]  /* d10 = byte 2 replicated (red) */
+    vdup.8      d11, d11[3]  /* d11 = byte 3 replicated (alpha); done last because it overwrites the loaded value */
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup  /* undo the vpush done by the init macro */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8888_init, \
+    pixman_composite_over_n_8_8888_cleanup, \
+    pixman_composite_over_n_8_8888_process_pixblock_head, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8_process_pixblock_head  /* src = mask * d8 / 255, then start dst * (255 - src); mask in d24-d27, dst in d4-d7 */
+    vmull.u8    q0,  d24, d8  /* 16-bit products x = mask * d8 (d8 is set up by the init macro) */
+    vmull.u8    q1,  d25, d8
+    vmull.u8    q6,  d26, d8
+    vmull.u8    q7,  d27, d8
+    vrshr.u16   q10, q0,  #8  /* t = (x + 128) >> 8 */
+    vrshr.u16   q11, q1,  #8
+    vrshr.u16   q12, q6,  #8
+    vrshr.u16   q13, q7,  #8
+    vraddhn.u16 d0,  q0,  q10  /* (x + t + 128) >> 8, i.e. rounded x / 255, narrowed to bytes in d0-d3 */
+    vraddhn.u16 d1,  q1,  q11
+    vraddhn.u16 d2,  q6,  q12
+    vraddhn.u16 d3,  q7,  q13
+    vmvn.8      q12, q0  /* d24-d27 = bitwise inverse of d0-d3 (the mask registers are reused) */
+    vmvn.8      q13, q1
+    vmull.u8    q8,  d24, d4  /* 16-bit products y = inverted src * destination */
+    vmull.u8    q9,  d25, d5
+    vmull.u8    q10, d26, d6
+    vmull.u8    q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail  /* finish y / 255 and add the masked source */
+    vrshr.u16   q14, q8,  #8  /* t = (y + 128) >> 8 */
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8  /* rounded y / 255, narrowed to bytes in d28-d31 */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    vqadd.u8    q14, q0,  q14  /* saturating add of d0-d3 gives the result in d28-d31 */
+    vqadd.u8    q15, q1,  q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head  /* unscheduled: simply tail, loads, store, head in sequence */
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!  /* next 32 destination bytes; the tail macro does not read d4-d7 */
+    pixman_composite_over_n_8_8_process_pixblock_tail
+    fetch_mask_pixblock  /* helper defined outside this chunk; presumably loads the next mask bytes into d24-d27 */
+    cache_preload 32, 32  /* prefetch helper defined outside this chunk */
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!  /* store the previous block's 32 result bytes */
+    pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init  /* save d8-d15; d8 = byte 3 of the 32-bit stack argument, replicated */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* computed before vpush, so the offset is relative to the unmodified sp */
+    vpush       {d8-d15}  /* the macros above overwrite d8 and q6-q7; saved here, restored in cleanup */
+    vld1.32     {d8[0]}, [DUMMY]  /* load 32 bits into lane 0 of d8 */
+    vdup.8      d8, d8[3]  /* keep only byte 3 (the alpha byte, going by the d11[3] usage documented elsewhere in this file) */
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup  /* undo the vpush done by the init macro */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8_init, \
+    pixman_composite_over_n_8_8_cleanup, \
+    pixman_composite_over_n_8_8_process_pixblock_head, \
+    pixman_composite_over_n_8_8_process_pixblock_tail, \
+    pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head  /* per-channel (component alpha) mask applied to a solid source, then start the OVER blend */
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}
+     *         dest in          {d4,  d5,  d6,  d7 }
+     *         mask in          {d24, d25, d26, d27}
+     * output: updated src in   {d0,  d1,  d2,  d3 }
+     *         updated mask in  {d24, d25, d26, d3 }
+     */
+    vmull.u8    q0,  d24, d8  /* source channel * matching mask channel (16-bit products) */
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q7,  d27, d11
+    vmull.u8    q9,  d11, d25  /* source alpha (d11) * each mask colour channel */
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8  /* t = (x + 128) >> 8 */
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8  /* (x + t + 128) >> 8, i.e. rounded x / 255, narrowed to bytes */
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vrshr.u16   q10, q7,  #8
+    vraddhn.u16 d24, q12, q11  /* updated mask channels overwrite d24-d26 */
+    vraddhn.u16 d25, q9,  q8
+    vraddhn.u16 d26, q13, q6
+    vraddhn.u16 d3,  q7,  q10  /* d3 = source alpha * mask alpha / 255 */
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in {d28, d29, d30, d31}
+     */
+    vmvn.8      q12, q12  /* invert d24 and d25 */
+    vmvn.8      d26, d26
+    vmull.u8    q8,  d24, d4  /* inverted mask channel * destination channel (16-bit products) */
+    vmull.u8    q9,  d25, d5
+    vmvn.8      d27, d3  /* the alpha channel uses the inverse of d3 */
+    vmull.u8    q10, d26, d6
+    vmull.u8    q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail  /* finish the divide by 255 and add the masked source */
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q14, q8,  #8  /* t = (y + 128) >> 8 */
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q6,  q10, #8
+    vrshr.u16   q7,  q11, #8
+    vraddhn.u16 d28, q14, q8  /* rounded y / 255, narrowed to bytes in d28-d31 */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q6,  q10
+    vraddhn.u16 d31, q7,  q11
+    vqadd.u8    q14, q0,  q14  /* saturating add of the masked source (d0-d3) */
+    vqadd.u8    q15, q1,  q15
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head  /* tail of block N, loads for block N+1, head, then store of block N */
+        vrshr.u16   q14, q8, #8  /* deeper-indented instructions repeat the tail macro for the previous block */
+        vrshr.u16   q15, q9, #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!  /* load the next 8 destination pixels, split into four channel planes */
+        vrshr.u16   q6, q10, #8
+        vrshr.u16   q7, q11, #8
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+        vraddhn.u16 d30, q6, q10
+        vraddhn.u16 d31, q7, q11
+    fetch_mask_pixblock  /* helper defined outside this chunk; presumably loads the next mask pixels into d24-d27 */
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15
+    cache_preload 8, 8  /* prefetch helper defined outside this chunk */
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head  /* does not write d28-d31, so the store may follow it */
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!  /* interleave the four planes back into pixels and store the previous block */
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init  /* save d8-d15 and splat the solid source bytes into d8-d11 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* computed before vpush, so the offset is relative to the unmodified sp */
+    vpush       {d8-d15}  /* the macros above overwrite q4-q7 (d8-d15); saved here, restored in cleanup */
+    vld1.32     {d11[0]}, [DUMMY]  /* load 32 bits into lane 0 of d11 */
+    vdup.8      d8, d11[0]  /* d8 = byte 0 replicated */
+    vdup.8      d9, d11[1]  /* d9 = byte 1 replicated */
+    vdup.8      d10, d11[2]  /* d10 = byte 2 replicated */
+    vdup.8      d11, d11[3]  /* d11 = byte 3 replicated; done last because it overwrites the loaded value */
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup  /* undo the vpush done by the init macro */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_8888_ca_init, \
+    pixman_composite_over_n_8888_8888_ca_cleanup, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head  /* component-alpha mask on a solid source, r5g6b5 destination unpack, start of the OVER blend */
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
+     *         mask in          {d24, d25, d26}       [B, G, R]
+     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
+     *         updated mask in  {d24, d25, d26}       [B, G, R]
+     */
+    vmull.u8    q0,  d24, d8  /* source channel * matching mask channel (16-bit products) */
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q9,  d11, d25  /* source alpha (d11) * each mask channel */
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8  /* t = (x + 128) >> 8 */
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8  /* (x + t + 128) >> 8, i.e. rounded x / 255, narrowed to bytes */
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vraddhn.u16 d24, q12, q11  /* updated mask channels overwrite d24-d26 */
+    vraddhn.u16 d25, q9,  q8
+    /*
+     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+     * and put data into d16 - blue, d17 - green, d18 - red
+     * (the 3-space-indented instructions below belong to this conversion)
+     */
+       vshrn.u16   d17, q2,  #3
+       vshrn.u16   d18, q2,  #8
+    vraddhn.u16 d26, q13, q6
+       vsli.u16    q2,  q2,  #5
+       vsri.u8     d18, d18, #5
+       vsri.u8     d17, d17, #6
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in d16 - blue, d17 - green, d18 - red
+     */
+    vmvn.8      q12, q12  /* invert d24 and d25 */
+       vshrn.u16   d16, q2,  #2
+    vmvn.8      d26, d26
+    vmull.u8    q6,  d16, d24  /* destination channel * inverted mask channel (16-bit products) */
+    vmull.u8    q7,  d17, d25
+    vmull.u8    q11, d18, d26
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail  /* finish the blend and repack to r5g6b5 in {d28, d29} */
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q10, q6,  #8  /* t = (y + 128) >> 8 */
+    vrshr.u16   q14, q7,  #8
+    vrshr.u16   q15, q11, #8
+    vraddhn.u16 d16, q10, q6  /* rounded y / 255, narrowed to bytes */
+    vraddhn.u16 d17, q14, q7
+    vraddhn.u16 d18, q15, q11
+    vqadd.u8    q8,  q0,  q8  /* saturating add of the masked source: blue and green */
+    vqadd.u8    d18, d2,  d18  /* and red */
+    /*
+     * convert the results in d16, d17, d18 to r5g6b5 and store
+     * them into {d28, d29}
+     */
+    vshll.u8    q14, d18, #8  /* each channel moved to the top byte of a 16-bit lane */
+    vshll.u8    q10, d17, #8
+    vshll.u8    q15, d16, #8
+    vsri.u16    q14, q10, #5  /* insert green below the top 5 bits */
+    vsri.u16    q14, q15, #11  /* insert blue below the top 11 bits */
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head  /* tail of block N hand-interleaved with head of block N+1; indentation marks which macro each line comes from */
+    fetch_mask_pixblock  /* helper defined outside this chunk; presumably loads the next mask pixels into d24-d27 */
+        vrshr.u16   q10, q6, #8
+        vrshr.u16   q14, q7, #8
+    vld1.16     {d4, d5}, [DST_R, :128]!  /* load the next 8 r5g6b5 destination pixels */
+        vrshr.u16   q15, q11, #8
+        vraddhn.u16 d16, q10, q6
+        vraddhn.u16 d17, q14, q7
+        vraddhn.u16 d22, q15, q11  /* red goes to d22 here, where the standalone tail macro uses d18 */
+            /* process_pixblock_head */
+            /*
+             * 'combine_mask_ca' replacement
+             *
+             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
+             *         mask in          {d24, d25, d26}       [B, G, R]
+             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
+             *         updated mask in  {d24, d25, d26}       [B, G, R]
+             */
+            vmull.u8    q6,  d26, d10
+        vqadd.u8    q8,  q0, q8  /* reads the previous block's q0 before the next instruction overwrites it */
+            vmull.u8    q0,  d24, d8
+        vqadd.u8    d22, d2, d22  /* reads the previous block's d2 before the next instruction overwrites q1 */
+            vmull.u8    q1,  d25, d9
+        /*
+         * convert the result in d16, d17, d22 to r5g6b5 and store
+         * it into {d28, d29}
+         */
+        vshll.u8    q14, d22, #8
+        vshll.u8    q10, d17, #8
+        vshll.u8    q15, d16, #8
+            vmull.u8    q9,  d11, d25
+        vsri.u16    q14, q10, #5
+            vmull.u8    q12, d11, d24
+            vmull.u8    q13, d11, d26
+        vsri.u16    q14, q15, #11
+    cache_preload 8, 8  /* prefetch helper defined outside this chunk */
+            vrshr.u16   q8,  q0,  #8
+            vrshr.u16   q10, q1,  #8
+            vrshr.u16   q11, q6,  #8
+            vraddhn.u16 d0,  q0,  q8
+            vraddhn.u16 d1,  q1,  q10
+            vraddhn.u16 d2,  q6,  q11
+            vrshr.u16   q11, q12, #8
+            vrshr.u16   q8,  q9,  #8
+            vrshr.u16   q6,  q13, #8
+            vraddhn.u16 d24, q12, q11
+            vraddhn.u16 d25, q9,  q8
+                /*
+                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
+                 * 8-bit format and put data into d16 - blue, d17 - green,
+                 * d18 - red
+                 */
+                vshrn.u16   d17, q2,  #3
+                vshrn.u16   d18, q2,  #8
+            vraddhn.u16 d26, q13, q6
+                vsli.u16    q2,  q2,  #5
+                vsri.u8     d17, d17, #6
+                vsri.u8     d18, d18, #5
+            /*
+             * 'combine_over_ca' replacement
+             *
+             * output: updated dest in d16 - blue, d17 - green, d18 - red
+             */
+            vmvn.8      q12, q12
+                vshrn.u16   d16, q2,  #2
+            vmvn.8      d26, d26
+            vmull.u8    q7,  d17, d25
+            vmull.u8    q6,  d16, d24
+            vmull.u8    q11, d18, d26
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* store the previous block's 8 r5g6b5 pixels */
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_init  /* save d8-d15 and splat the solid source bytes into d8-d11 */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* computed before vpush, so the offset is relative to the unmodified sp */
+    vpush       {d8-d15}  /* the macros above overwrite d8-d15 registers (q6, q7); saved here, restored in cleanup */
+    vld1.32     {d11[0]}, [DUMMY]  /* load 32 bits into lane 0 of d11 */
+    vdup.8      d8, d11[0]  /* d8 = byte 0 replicated (B) */
+    vdup.8      d9, d11[1]  /* d9 = byte 1 replicated (G) */
+    vdup.8      d10, d11[2]  /* d10 = byte 2 replicated (R) */
+    vdup.8      d11, d11[3]  /* d11 = byte 3 replicated (A); done last because it overwrites the loaded value */
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_cleanup  /* undo the vpush done by the init macro */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_0565_ca_init, \
+    pixman_composite_over_n_8888_0565_ca_cleanup, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_in_n_8_process_pixblock_head  /* first half of: dst = dst * d3 / 255 */
+    /* expecting the solid source byte replicated in d3 (see the init macro; d0-d2 are not read) */
+    /* and destination data in {d4, d5, d6, d7} */
+    vmull.u8    q8,  d4,  d3  /* 16-bit products x = destination * d3 */
+    vmull.u8    q9,  d5,  d3
+    vmull.u8    q10, d6,  d3
+    vmull.u8    q11, d7,  d3
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail  /* second half: rounded x / 255 into d28-d31 */
+    vrshr.u16   q14, q8,  #8  /* t = (x + 128) >> 8 */
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q8,  q14  /* (x + t + 128) >> 8, narrowed to bytes */
+    vraddhn.u16 d29, q9,  q15
+    vraddhn.u16 d30, q10, q12
+    vraddhn.u16 d31, q11, q13
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail_head  /* unscheduled: tail, load, head, store in sequence */
+    pixman_composite_in_n_8_process_pixblock_tail
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!  /* next 32 destination bytes */
+    cache_preload 32, 32  /* prefetch helper defined outside this chunk */
+    pixman_composite_in_n_8_process_pixblock_head  /* writes only q8-q11, so d28-d31 still hold the previous result */
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!  /* store the previous block's 32 result bytes */
+.endm
+
+.macro pixman_composite_in_n_8_init  /* d3 = byte 3 of the 32-bit stack argument, replicated 8 times */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* address of a stack argument; presumably the solid source color - confirm against the C prototype */
+    vld1.32     {d3[0]}, [DUMMY]  /* load 32 bits into lane 0 of d3 */
+    vdup.8      d3, d3[3]  /* keep only byte 3 (the alpha byte, going by the d11[3] usage documented elsewhere in this file) */
+.endm
+
+.macro pixman_composite_in_n_8_cleanup  /* empty: init pushed nothing that needs restoring */
+.endm
+
+generate_composite_function \
+    pixman_composite_in_n_8_asm_neon, 0, 0, 8, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_in_n_8_init, \
+    pixman_composite_in_n_8_cleanup, \
+    pixman_composite_in_n_8_process_pixblock_head, \
+    pixman_composite_in_n_8_process_pixblock_tail, \
+    pixman_composite_in_n_8_process_pixblock_tail_head, \
+    28, /* dst_w_basereg: results are produced in d28-d31 */ \
+    4,  /* dst_r_basereg: destination is read into d4-d7 */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+.macro pixman_composite_add_n_8_8_process_pixblock_head  /* result = saturate(dst + mask * d11 / 255), in d28-d31 */
+    /* expecting the solid source byte replicated in d11 (see the init macro) */
+    /* d8, d9 and d10 are not read by this macro */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24, d25, d26, d27 */
+    vmull.u8    q0, d24, d11  /* 16-bit products x = mask * d11 */
+    vmull.u8    q1, d25, d11
+    vmull.u8    q6, d26, d11
+    vmull.u8    q7, d27, d11
+    vrshr.u16   q10, q0, #8  /* t = (x + 128) >> 8 */
+    vrshr.u16   q11, q1, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q13, q7, #8
+    vraddhn.u16 d0, q0, q10  /* (x + t + 128) >> 8, i.e. rounded x / 255, narrowed to bytes in d0-d3 */
+    vraddhn.u16 d1, q1, q11
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d3, q7, q13
+    vqadd.u8    q14, q0, q2  /* saturating add with the destination (q2-q3 = d4-d7) */
+    vqadd.u8    q15, q1, q3
+.endm
+
+.macro pixman_composite_add_n_8_8_process_pixblock_tail  /* empty: the head macro already produces the final result */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head  /* unscheduled: tail, store, loads, head in sequence */
+    pixman_composite_add_n_8_8_process_pixblock_tail
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!  /* store the previous block's 32 result bytes */
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!  /* next 32 destination bytes */
+    fetch_mask_pixblock  /* helper defined outside this chunk; presumably loads the next mask bytes into d24-d27 */
+    cache_preload 32, 32  /* prefetch helper defined outside this chunk */
+    pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_n_8_8_init  /* save d8-d15; d11 = byte 3 of the 32-bit stack argument, replicated */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET  /* computed before vpush, so the offset is relative to the unmodified sp */
+    vpush       {d8-d15}  /* the head macro overwrites d11 and q6-q7; saved here, restored in cleanup */
+    vld1.32     {d11[0]}, [DUMMY]  /* load 32 bits into lane 0 of d11 */
+    vdup.8      d11, d11[3]  /* keep only byte 3 (the alpha byte, going by the d11[3] usage documented elsewhere in this file) */
+.endm
+
+.macro pixman_composite_add_n_8_8_cleanup  /* undo the vpush done by the init macro */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8_init, \
+    pixman_composite_add_n_8_8_cleanup, \
+    pixman_composite_add_n_8_8_process_pixblock_head, \
+    pixman_composite_add_n_8_8_process_pixblock_tail, \
+    pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head  /* result = saturate(dst + src * mask / 255), in d28-d31 */
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27} */
+    vmull.u8    q8, d24, d0  /* 16-bit products x = mask * source */
+    vmull.u8    q9, d25, d1
+    vmull.u8    q10, d26, d2
+    vmull.u8    q11, d27, d3
+    vrshr.u16   q0, q8, #8  /* t = (x + 128) >> 8 */
+    vrshr.u16   q1, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d0, q0, q8  /* (x + t + 128) >> 8, i.e. rounded x / 255, narrowed to bytes in d0-d3 */
+    vraddhn.u16 d1, q1, q9
+    vraddhn.u16 d2, q12, q10
+    vraddhn.u16 d3, q13, q11
+    vqadd.u8    q14, q0, q2  /* saturating add with the destination (q2-q3 = d4-d7) */
+    vqadd.u8    q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_8_process_pixblock_tail  /* empty: the head macro already produces the final result */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head  /* unscheduled: tail, store, loads, head in sequence */
+    pixman_composite_add_8_8_8_process_pixblock_tail
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!  /* store the previous block's 32 result bytes */
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!  /* next 32 destination bytes */
+    fetch_mask_pixblock  /* helper defined outside this chunk; presumably loads the next mask bytes into d24-d27 */
+    fetch_src_pixblock  /* helper defined outside this chunk; presumably loads the next source bytes into d0-d3 */
+    cache_preload 32, 32  /* prefetch helper defined outside this chunk */
+    pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_8_8_8_init  /* empty: no constants to set up */
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup  /* empty: nothing to restore */
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8_8_8_init, \
+    pixman_composite_add_8_8_8_cleanup, \
+    pixman_composite_add_8_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head  /* first half of: src * d27 / 255 for each of the four source planes */
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27}; only d27 is read here */
+    vmull.u8    q8,  d27, d0  /* 16-bit products x = d27 * source plane */
+    vmull.u8    q9,  d27, d1
+    vmull.u8    q10, d27, d2
+    vmull.u8    q11, d27, d3
+    /* 1 cycle bubble */
+    vrsra.u16   q8,  q8,  #8  /* x += (x + 128) >> 8; the tail macro finishes the rounded divide by 255 */
+    vrsra.u16   q9,  q9,  #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail  /* narrow to bytes and add the destination with saturation */
+    /* 2 cycle bubble */
+    vrshrn.u16  d28, q8,  #8  /* (x + 128) >> 8, narrowed to bytes in d28-d31 */
+    vrshrn.u16  d29, q9,  #8
+    vrshrn.u16  d30, q10, #8
+    vrshrn.u16  d31, q11, #8
+    vqadd.u8    q14, q2,  q14  /* saturating add with the destination (q2-q3 = d4-d7) */
+    /* 1 cycle bubble */
+    vqadd.u8    q15, q3,  q15
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head  /* tail of block N interleaved with loads and head of block N+1 */
+    fetch_src_pixblock  /* helper defined outside this chunk; presumably loads the next source pixels into d0-d3 */
+        vrshrn.u16  d28, q8,  #8  /* deeper-indented instructions repeat the tail macro for the previous block */
+    fetch_mask_pixblock  /* helper defined outside this chunk; presumably loads the next mask data (d27 is what the head reads) */
+        vrshrn.u16  d29, q9,  #8
+    vmull.u8    q8,  d27, d0  /* 4-space-indented instructions repeat the head macro for the new block */
+        vrshrn.u16  d30, q10, #8
+    vmull.u8    q9,  d27, d1
+        vrshrn.u16  d31, q11, #8
+    vmull.u8    q10, d27, d2
+        vqadd.u8    q14, q2,  q14  /* still the previous block's destination: d4-d7 are reloaded only further down */
+    vmull.u8    q11, d27, d3
+        vqadd.u8    q15, q3,  q15
+    vrsra.u16   q8,  q8,  #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!  /* load the next 8 destination pixels, split into four planes */
+    vrsra.u16   q9,  q9,  #8
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!  /* interleave the four planes back into pixels and store the previous block */
+    vrsra.u16   q10, q10, #8
+
+    cache_preload 8, 8  /* prefetch helper defined outside this chunk */
+
+    vrsra.u16   q11, q11, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, /* name; presumably src, mask, dst bits per pixel (generator is defined outside this chunk) */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, /* same pixel macros as above; this generator takes no prefetch distance argument */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+generate_composite_function \
+    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, /* reuses the add_8888_8888_8888 pixel macros; presumably src, mask, dst bits per pixel */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg: d27 is the only mask register the shared head macro reads */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8888_init /* load one 32-bit stack word (presumably the solid source colour) and splat each of its bytes */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY] /* the word goes into 32-bit lane 0 of d3 */
+    vdup.8      d0, d3[0] /* d0 = byte 0 replicated into all eight lanes */
+    vdup.8      d1, d3[1] /* d1 = byte 1 replicated */
+    vdup.8      d2, d3[2] /* d2 = byte 2 replicated */
+    vdup.8      d3, d3[3] /* done last because it overwrites d3, the source of the lanes above */
+.endm
+
+.macro pixman_composite_add_n_8_8888_cleanup /* intentionally empty: the matching init pushes nothing on the stack */
+.endm
+
+generate_composite_function /* ADD with source bpp 0 and an 8bpp mask; d0-d3 are filled once by the init macro */ \
+    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8888_init, \
+    pixman_composite_add_n_8_8888_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_n_8888_init /* load the 32-bit word at ARGS_STACK_OFFSET + 8 (presumably the solid mask) into d27 */
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+    vld1.32     {d27[0]}, [DUMMY] /* the word goes into 32-bit lane 0 of d27 */
+    vdup.8      d27, d27[3] /* keep only byte 3 (presumably alpha), replicated into all eight lanes */
+.endm
+
+.macro pixman_composite_add_8888_n_8888_cleanup /* intentionally empty: the matching init pushes nothing on the stack */
+.endm
+
+generate_composite_function /* ADD with mask bpp 0; the mask value is placed in d27 once by the init macro */ \
+    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8888_n_8888_init, \
+    pixman_composite_add_8888_n_8888_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg: matches the d27 written by the init macro */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* solid mask is in d15 */
+
+    /* 'in': every source plane is multiplied by the mask and divided by 255 */
+    vmull.u8    q8, d15, d3 /* widening 8x8 -> 16 bit products */
+    vmull.u8    q6, d15, d2
+    vmull.u8    q5, d15, d1
+    vmull.u8    q4, d15, d0
+    vrshr.u16   q13, q8, #8 /* vrshr + vraddhn together give a rounded division of the product by 255 */
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q11, q5, #8
+    vrshr.u16   q10, q4, #8
+    vraddhn.u16 d3, q8, q13 /* masked source overwrites d0-d3 in place */
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d1, q5, q11
+    vraddhn.u16 d0, q4, q10
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* now do alpha blending: 16-bit products destination * inverted alpha, finished by the tail macro */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail /* finish the q8-q11 products left by the head */
+    vrshr.u16   q14, q8, #8 /* first half of the rounded division by 255 */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8 /* second half: narrow to 8 bits; result planes land in d28-d31 */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+.endm
+
+/* TODO: expand macros and improve instruction scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* destination for the next block; the tail below does not read d4-d7 */
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock /* per-pixel mask variant: a new mask block is fetched for every iteration */
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* stores the result of the tail; the head above does not write d28-d31 */
+.endm
+
+generate_composite_function_single_scanline /* single-scanline OUT_REVERSE with a 32bpp per-pixel mask (per the function name) */ \
+    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head /* OVER shares its head with OUT_REVERSE; the two differ only in the tail */
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail /* OUT_REVERSE result plus the masked source */
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    vqadd.u8    q14, q0, q14 /* saturating add of d0-d1 (masked source planes) onto d28-d29 */
+    vqadd.u8    q15, q1, q15 /* saturating add of d2-d3 onto d30-d31 */
+.endm
+
+/* TODO: expand macros and improve instruction scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* destination for the next block */
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    pixman_composite_over_8888_n_8888_process_pixblock_head /* no mask fetch: d15 is set once by the init macro */
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* stores the result produced by the tail above */
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init /* save d8-d15, then splat byte 3 of the solid mask word into d15 */
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8) /* same mask slot as the other solid-mask inits; taken before vpush moves sp */
+    vpush       {d8-d15}
+    vld1.32     {d15[0]}, [DUMMY] /* the word goes into 32-bit lane 0 of d15 */
+    vdup.8      d15, d15[3] /* keep only byte 3 (presumably alpha), replicated into all eight lanes */
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup /* undo the vpush done by the matching init macro */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function /* OVER with mask bpp 0; the mask value lives in d15, set up by the init macro */ \
+    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_8888_init, \
+    pixman_composite_over_8888_n_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail_head /* no basereg overrides are passed here */
+
+/******************************************************************************/
+
+/* TODO: expand macros and improve instruction scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* destination for the next block */
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock /* the only difference from the solid-mask tail_head: the mask is fetched per block */
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* stores the result produced by the tail above */
+.endm
+
+generate_composite_function /* OVER with a 32bpp per-pixel mask (per the function name) */ \
+    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+generate_composite_function_single_scanline /* single-scanline OVER with a 32bpp per-pixel mask (per the function name) */ \
+    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
+/* TODO: expand macros and improve instruction scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head /* same body as the over_8888_8888_8888 tail_head */
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]! /* destination for the next block */
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock /* mask is fetched per block */
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]! /* stores the result produced by the tail above */
+.endm
+
+generate_composite_function /* OVER with an 8bpp per-pixel mask (per the function name) */ \
+    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    15  /* mask_basereg: d15 is the register the shared head reads as the mask */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head /* intentionally empty: the pixels are copied unmodified */
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail /* intentionally empty: the pixels are copied unmodified */
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head /* store the current block, then fetch the next one */
+    vst3.8 {d0, d1, d2}, [DST_W]! /* three interleaved byte planes; no alignment hint on this store */
+    fetch_src_pixblock
+    cache_preload 8, 8
+.endm
+
+generate_composite_function /* 24bpp to 24bpp copy; head and tail are empty macros */ \
+    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0888_process_pixblock_head, \
+    pixman_composite_src_0888_0888_process_pixblock_tail, \
+    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head /* reverse the channel order of the block */
+    vswp   d0, d2 /* exchange the first and third byte planes; d1 stays in place */
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail /* intentionally empty: all the work is done in the head */
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head /* store the swapped block, fetch and swap the next one */
+    vst4.8 {d0, d1, d2, d3}, [DST_W]! /* d3 is the fourth plane; the init macro of this function zeroes it */
+    fetch_src_pixblock
+    vswp   d0, d2 /* same swap as the head macro */
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_init /* prepare a constant fourth byte plane for the 32bpp stores */
+    veor   d3, d3, d3 /* d3 = 0 */
+.endm
+
+generate_composite_function /* 24bpp to 32bpp conversion with the channel order reversed */ \
+    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_0888_8888_rev_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head /* pre-position two of the three planes for the tail */
+    vshll.u8    q8, d1, #8 /* widen d1 to 16 bits with each byte in the top half of its lane */
+    vshll.u8    q9, d2, #8 /* same for d2 */
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail /* pack the three planes into 16-bit pixels in q14 */
+    vshll.u8    q14, d0, #8 /* d0 occupies the top bits of every 16-bit lane */
+    vsri.u16    q14, q8, #5 /* insert the d1 plane below the top 5 bits */
+    vsri.u16    q14, q9, #11 /* insert the d2 plane into the low 5 bits */
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head /* deeper-indented lines are the tail of the current block, the rest the head of the next */
+        vshll.u8    q14, d0, #8 /* must read d0 before the fetch below replaces it */
+    fetch_src_pixblock
+        vsri.u16    q14, q8, #5
+        vsri.u16    q14, q9, #11
+    vshll.u8    q8, d1, #8 /* q8 is free again once the vsri above has consumed it */
+        vst1.16 {d28, d29}, [DST_W, :128]! /* d28-d29 are the two halves of q14 */
+    vshll.u8    q9, d2, #8
+.endm
+
+generate_composite_function /* 24bpp to 16bpp conversion with the channel order reversed */ \
+    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+    28, /* dst_w_basereg: the packed result is in q14, i.e. d28-d29 */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head /* start multiplying the three colour planes by the d3 plane (presumably alpha) */
+    vmull.u8    q8, d3, d0 /* widening 8x8 -> 16 bit products, finished by the tail macro */
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail /* finish the division by 255 and write the planes out in reversed order */
+    vrshr.u16   q11, q8, #8 /* vrshr + vraddhn together give a rounded division of the product by 255 */
+    vswp        d3, d31 /* the unmultiplied d3 plane becomes output plane d31 */
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d30, q11, q8 /* the d0 product goes to d30 ... */
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d28, q13, q10 /* ... and the d2 product to d28, so planes 0 and 2 are exchanged */
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head /* tail of the current block interleaved with the head of the next */
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31 /* must happen before the fetch below overwrites d3 */
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    fetch_src_pixblock
+        vraddhn.u16 d30, q11, q8
+                                    PF add PF_X, PF_X, #8 /* PF-prefixed lines: prefetch bookkeeping (presumably only emitted when prefetch is enabled) */
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d28, q13, q10
+    vmull.u8    q8, d3, d0 /* head for the next block starts here, after q8-q10 have been consumed */
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function /* 32bpp to 32bpp: colour planes multiplied by the d3 plane, planes 0 and 2 exchanged */ \
+    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head /* start multiplying the three colour planes by the d3 plane (presumably alpha) */
+    vmull.u8    q8, d3, d0 /* widening 8x8 -> 16 bit products, finished by the tail macro */
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail /* finish the division by 255, keeping the plane order */
+    vrshr.u16   q11, q8, #8 /* vrshr + vraddhn together give a rounded division of the product by 255 */
+    vswp        d3, d31 /* the unmultiplied d3 plane becomes output plane d31 */
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d28, q11, q8 /* the d0 product stays first (d28), unlike the pixbuf variant */
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d30, q13, q10
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head /* tail of the current block interleaved with the head of the next */
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31 /* must happen before the fetch below overwrites d3 */
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    fetch_src_pixblock
+        vraddhn.u16 d28, q11, q8
+                                    PF add PF_X, PF_X, #8 /* PF-prefixed lines: prefetch bookkeeping (presumably only emitted when prefetch is enabled) */
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d30, q13, q10
+    vmull.u8    q8, d3, d0 /* head for the next block starts here, after q8-q10 have been consumed */
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function /* 32bpp to 32bpp: colour planes multiplied by the d3 plane, plane order kept */ \
+    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0 /* q4: 16bpp source block (src_basereg 8 in the generators) */
+    convert_0565_to_x888 q5, d6, d5, d4 /* q5: 16bpp destination block (dst_r_basereg 10) */
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmvn.8      d7,  d15 /* d7 = inverted mask */
+    vmull.u8    q6,  d15, d2 /* source planes * mask */
+    vmull.u8    q5,  d15, d1
+    vmull.u8    q4,  d15, d0
+    vmull.u8    q8,  d7,  d4 /* destination planes * inverted mask; finished by the tail macro */
+    vmull.u8    q9,  d7,  d5
+    vmull.u8    q13, d7,  d6
+    vrshr.u16   q12, q6,  #8 /* vrshr + vraddhn together give a rounded division of the product by 255 */
+    vrshr.u16   q11, q5,  #8
+    vrshr.u16   q10, q4,  #8
+    vraddhn.u16 d2,  q6,  q12 /* masked source overwrites d0-d2 in place */
+    vraddhn.u16 d1,  q5,  q11
+    vraddhn.u16 d0,  q4,  q10
+.endm
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail /* finish destination * inverted mask, add the masked source, repack */
+    vrshr.u16   q14, q8,  #8 /* vrshr + vraddhn together give a rounded division of the product by 255 */
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q13, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q13
+    vqadd.u8    q0,  q0,  q14 /* saturating add: d0-d1 += d28-d29 */
+    vqadd.u8    q1,  q1,  q15 /* d2 += d30; the d3/d31 halves are the unused XX plane */
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3 /* packed 16bpp output presumably lands in q14 (dst_w_basereg 28) */
+.endm
+
+/* TODO: expand macros and improve instruction scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock /* the tail below does not read the mask register, so it can be reloaded first */
+    pixman_composite_over_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    vld1.16    {d10, d11}, [DST_R, :128]! /* next destination block into q5 */
+    cache_preload 8, 8
+    pixman_composite_over_0565_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]! /* stores the q14 result of the tail; the head above does not write q14 */
+.endm
+
+generate_composite_function /* 16bpp source, 8bpp mask, 16bpp destination OVER (per the function name) */ \
+    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init /* save d8-d15, then splat byte 3 of the word at ARGS_STACK_OFFSET + 8 into d15 */
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8) /* address is taken before vpush moves sp */
+    vpush       {d8-d15}
+    vld1.32     {d15[0]}, [DUMMY] /* the word goes into 32-bit lane 0 of d15 */
+    vdup.8      d15, d15[3] /* keep only byte 3 (presumably alpha), replicated into all eight lanes */
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup /* undo the vpush done by the matching init macro */
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function /* mask bpp 0: d15 is set once by the init macro; reuses the over_0565_8_0565 pixblock macros */ \
+    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_0565_n_0565_init, \
+    pixman_composite_over_0565_n_0565_cleanup, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0 /* q4: 16bpp source block (src_basereg 8 in the generator) */
+    convert_0565_to_x888 q5, d6, d5, d4 /* q5: 16bpp destination block (dst_r_basereg 10) */
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmull.u8    q6,  d15, d2 /* source planes * mask */
+    vmull.u8    q5,  d15, d1
+    vmull.u8    q4,  d15, d0
+    vrshr.u16   q12, q6,  #8 /* vrshr + vraddhn together give a rounded division of the product by 255 */
+    vrshr.u16   q11, q5,  #8
+    vrshr.u16   q10, q4,  #8
+    vraddhn.u16 d2,  q6,  q12 /* masked source overwrites d0-d2 in place */
+    vraddhn.u16 d1,  q5,  q11
+    vraddhn.u16 d0,  q4,  q10
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail /* add the unmodified destination to the masked source and repack */
+    vqadd.u8    q0,  q0,  q2 /* saturating add: d0-d1 += d4-d5 (destination planes) */
+    vqadd.u8    q1,  q1,  q3 /* d2 += d6; the d3/d7 halves are the unused XX plane */
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3 /* packed 16bpp output presumably lands in q14 (dst_w_basereg 28) */
+.endm
+
+/* TODO: expand macros and improve instruction scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock /* the tail below does not read the mask register, so it can be reloaded first */
+    pixman_composite_add_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    vld1.16    {d10, d11}, [DST_R, :128]! /* next destination block into q5 */
+    cache_preload 8, 8
+    pixman_composite_add_0565_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]! /* stores the q14 result of the tail; the head above does not write q14 */
+.endm
+
+generate_composite_function /* 16bpp source, 8bpp mask, 16bpp destination ADD (per the function name) */ \
+    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_add_0565_8_0565_process_pixblock_head, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+    /* the 8bpp source is in d15 (src_basereg 15 in the generator); it is used as the alpha */
+    convert_0565_to_x888 q5, d6, d5, d4 /* q5: 16bpp destination block (dst_r_basereg 10) */
+    /* destination pixel data is in {d4, d5, d6, xx} */
+    vmvn.8      d24, d15 /* get inverted alpha */
+    /* now do alpha blending: 16-bit products destination * inverted alpha, finished by the tail macro */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail /* finish the q8-q10 products left by the head and repack */
+    vrshr.u16   q14, q8, #8 /* vrshr + vraddhn together give a rounded division of the product by 255 */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vraddhn.u16 d0, q14, q8
+    vraddhn.u16 d1, q15, q9
+    vraddhn.u16 d2, q12, q10
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3 /* packed 16bpp output presumably lands in q14 (dst_w_basereg 28) */
+.endm
+
+/* TODO: expand macros and improve instruction scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+    fetch_src_pixblock /* the tail below does not read the source register, so it can be reloaded first */
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vld1.16    {d10, d11}, [DST_R, :128]! /* next destination block into q5 */
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]! /* stores the q14 result of the tail; the head above does not write q14 */
+.endm
+
+generate_composite_function /* 8bpp source, no mask, 16bpp destination OUT_REVERSE (per the function name) */ \
+    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    15, /* src_basereg: the 8bpp source is loaded into d15, which the head reads */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
+    /* src is in d0 */
+    /* destination pixel data is in {d4, d5, d6, d7} */
+    vmvn.8      d1, d0 /* get inverted alpha */
+    /* now do alpha blending: 16-bit products destination * inverted alpha, finished by the tail macro */
+    vmull.u8    q8, d1, d4
+    vmull.u8    q9, d1, d5
+    vmull.u8    q10, d1, d6
+    vmull.u8    q11, d1, d7
+.endm
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail /* finish the q8-q11 products left by the head */
+    vrshr.u16   q14, q8, #8 /* vrshr + vraddhn together give a rounded division of the product by 255 */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    /* 32bpp result is in {d28, d29, d30, d31} */
+.endm
+
+/* TODO: expand macros and improve instruction scheduling */
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
+    fetch_src_pixblock /* the tail below does not read the source register, so it can be reloaded first */
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail
+    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]! /* destination for the next block */
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_8888_process_pixblock_head
+    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]! /* stores the result of the tail; the head above does not write d28-d31 */
+.endm
+
+generate_composite_function /* 8bpp source, no mask, 32bpp destination OUT_REVERSE (per the function name) */ \
+    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline /* nearest-neighbour scaled scanline built from the over_8888_8888 pixblock macros */ \
+    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head /* no basereg overrides are passed here */
+
+generate_composite_function_nearest_scanline /* nearest-neighbour scaled scanline built from the over_8888_0565 pixblock macros */ \
+    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline /* nearest-neighbour scaled scanline built from the src_8888_0565 pixblock macros */ \
+    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head /* no basereg overrides are passed here */
+
+generate_composite_function_nearest_scanline /* nearest-neighbour scaled scanline built from the src_0565_8888 pixblock macros */ \
+    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head /* no basereg overrides are passed here */
+
+generate_composite_function_nearest_scanline /* nearest-neighbour scaled scanline built from the over_8888_8_0565 pixblock macros */ \
+    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline /* nearest-neighbour scaled scanline built from the over_0565_8_0565 pixblock macros */ \
+    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg: same register assignment as the unscaled over_0565_8_0565 generator */
+
+/******************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname /* emits the symbol directives and the entry label for function fname */
+    .func fname /* opens a debug-info function scope; no matching .endfunc appears inside this macro */
+    .global fname
+#ifdef __ELF__
+    .hidden fname /* ELF only: hidden visibility */
+    .type fname, %function /* ELF only: mark the symbol as a function */
+#endif
+fname:
+.endm
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp /* tmp is not used by this macro; it mirrors the bilinear_load_0565 parameter list */
+    mov       TMP1, X, asr #16 /* TMP1 = X >> 16 (presumably the integer part of a 16.16 fixed-point coordinate) */
+    add       X, X, UX /* advance X by the step UX for the next pixel */
+    add       TMP1, TOP, TMP1, asl #2 /* address = TOP + index * 4 bytes per pixel */
+    vld1.32   {reg1}, [TMP1], STRIDE /* load from the row at TOP, then post-increment the address by STRIDE */
+    vld1.32   {reg2}, [TMP1] /* same columns from the row STRIDE bytes further on */
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp /* 16bpp counterpart of bilinear_load_8888 */
+    mov       TMP1, X, asr #16 /* TMP1 = X >> 16 (presumably the integer part of a 16.16 fixed-point coordinate) */
+    add       X, X, UX /* advance X by the step UX for the next pixel */
+    add       TMP1, TOP, TMP1, asl #1 /* address = TOP + index * 2 bytes per pixel */
+    vld1.32   {reg2[0]}, [TMP1], STRIDE /* 32 bits (two 16bpp pixels) from the row at TOP into lane 0 */
+    vld1.32   {reg2[1]}, [TMP1] /* 32 bits from the row STRIDE bytes further on into lane 1 */
+    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp /* expand the four loaded pixels; outputs presumably reg1 and reg2, tmp is scratch */
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 /* two output pixels: load both rows and blend them with weights d28 and d29 */ \
+                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+    bilinear_load_8888 reg1, reg2, tmp1
+    vmull.u8  acc1, reg1, d28 /* acc1 = first-row pixels * d28 (16-bit lanes) */
+    vmlal.u8  acc1, reg2, d29 /* acc1 += second-row pixels * d29 */
+    bilinear_load_8888 reg3, reg4, tmp2
+    vmull.u8  acc2, reg3, d28 /* same weighting for the second output pixel */
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 /* four output pixels: two invocations of the two-pixel macro */ \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    bilinear_load_and_vertical_interpolate_two_8888 /* first pair; xacc2lo and xacc2hi land in the tmp1/tmp2 slots */ \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+    bilinear_load_and_vertical_interpolate_two_8888 /* second pair */ \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 /* 16bpp source: load, expand and vertically blend two output pixels */ \
+                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+    mov       TMP1, X, asr #16 /* index of the first output pixel */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1 /* 2 bytes per source pixel */
+    mov       TMP2, X, asr #16 /* index of the second output pixel */
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {acc2lo[0]}, [TMP1], STRIDE /* lane 0: row at TOP; acc2lo/acc2hi are presumably the two halves of acc2 */
+    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {acc2lo[1]}, [TMP1] /* lane 1: row STRIDE bytes further on */
+    vld1.32   {acc2hi[1]}, [TMP2]
+    convert_0565_to_x888 acc2, reg3, reg2, reg1 /* expand the eight loaded 16bpp pixels to separate byte planes */
+    vzip.u8   reg1, reg3 /* the four vzip steps interleave the planes back into packed pixels */
+    vzip.u8   reg2, reg4
+    vzip.u8   reg3, reg4
+    vzip.u8   reg1, reg2
+    vmull.u8  acc1, reg1, d28 /* same d28/d29 weighting as the 8888 variant */
+    vmlal.u8  acc1, reg2, d29
+    vmull.u8  acc2, reg3, d28 /* acc2 is overwritten only after convert_0565_to_x888 has consumed it */
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 /* two copies of the two-pixel 0565 sequence (x* then y*), interleaved */ \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    mov       TMP1, X, asr #16 /* addresses for output pixels 1 and 2 */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE /* lane 0: row at TOP */
+    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {xacc2lo[1]}, [TMP1] /* lane 1: row STRIDE bytes further on */
+    vld1.32   {xacc2hi[1]}, [TMP2]
+    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    mov       TMP1, X, asr #16 /* addresses for output pixels 3 and 4 */
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE /* from here on the y* loads alternate with the x* interleaving steps */
+    vzip.u8   xreg1, xreg3
+    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
+    vzip.u8   xreg2, xreg4
+    vld1.32   {yacc2lo[1]}, [TMP1]
+    vzip.u8   xreg3, xreg4
+    vld1.32   {yacc2hi[1]}, [TMP2]
+    vzip.u8   xreg1, xreg2
+    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+    vmull.u8  xacc1, xreg1, d28 /* x* vertical blend alternates with the y* interleaving steps */
+    vzip.u8   yreg1, yreg3
+    vmlal.u8  xacc1, xreg2, d29
+    vzip.u8   yreg2, yreg4
+    vmull.u8  xacc2, xreg3, d28
+    vzip.u8   yreg3, yreg4
+    vmlal.u8  xacc2, xreg4, d29
+    vzip.u8   yreg1, yreg2
+    vmull.u8  yacc1, yreg1, d28 /* y* vertical blend */
+    vmlal.u8  yacc1, yreg2, d29
+    vmull.u8  yacc2, yreg3, d28
+    vmlal.u8  yacc2, yreg4, d29
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2   /* stores numpix (4, 2 or 1) 32-bit pixels from d0/d1 to OUT; tmp1 and tmp2 are not used here */
+.if numpix == 4
+    vst1.32   {d0, d1}, [OUT, :128]!    /* 16 bytes, 128-bit alignment hint, OUT is advanced */
+.elseif numpix == 2
+    vst1.32   {d0}, [OUT, :64]!         /* 8 bytes, 64-bit alignment hint */
+.elseif numpix == 1
+    vst1.32   {d0[0]}, [OUT, :32]!      /* lane 0 of d0 only (4 bytes) */
+.else
+    .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2   /* converts the result in d0-d3 to 16-bit pixels and stores numpix (4, 2 or 1) of them to OUT */
+    vuzp.u8 d0, d1                      /* four byte de-interleaves prepare the operands of the conversion helper */
+    vuzp.u8 d2, d3
+    vuzp.u8 d1, d3
+    vuzp.u8 d0, d2
+    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+    vst1.16   {d2}, [OUT, :64]!         /* 8 bytes from d2, 64-bit alignment hint, OUT is advanced */
+.elseif numpix == 2
+    vst1.32   {d2[0]}, [OUT, :32]!      /* low 32 bits of d2 */
+.elseif numpix == 1
+    vst1.16   {d2[0]}, [OUT, :16]!      /* low 16 bits of d2 */
+.else
+    .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt   /* interpolates and stores a single pixel; expects weights in d28, d29 and d30 */
+    bilinear_load_&src_fmt d0, d1, d2
+    vmull.u8  q1, d0, d28               /* q1  = d0 * d28 */
+    vmlal.u8  q1, d1, d29               /* q1 += d1 * d29 */
+    /* 5 cycles bubble */
+    vshll.u16 q0, d2, #8                /* q0  = d2 * 256 */
+    vmlsl.u16 q0, d2, d30               /* q0 -= d2 * d30 */
+    vmlal.u16 q0, d3, d30               /* q0 += d3 * d30, i.e. d2 * (256 - d30) + d3 * d30 */
+    /* 5 cycles bubble */
+    vshrn.u32 d0, q0, #16               /* drop the low 16 bits, narrowing to 16-bit lanes */
+    /* 3 cycles bubble */
+    vmovn.u16 d0, q0                    /* narrow to 8-bit lanes; the store below uses the low 32 bits */
+    /* 1 cycle bubble */
+    bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt   /* interpolates and stores two pixels, then refreshes q15 and advances q12 */
+    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23
+    vshll.u16 q0, d2, #8                /* first pixel:  q0  = d2 * (256 - d30) + d3 * d30 */
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8              /* second pixel: q10 = d22 * (256 - d31) + d23 * d31 */
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshrn.u32 d0, q0, #16               /* keep the upper 16 bits of each 32-bit lane */
+    vshrn.u32 d1, q10, #16
+    vshr.u16  q15, q12, #8              /* q15 = q12 >> 8: weights for the pixels that follow */
+    vadd.u16  q12, q12, q13             /* q12 += q13 */
+    vmovn.u16 d0, q0                    /* narrow d0/d1 to 8-bit lanes: two pixels in d0 */
+    bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt   /* generic (non-pipelined) path: interpolates and stores four pixels */
+    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23 \
+                q3, q9,  d4, d5, d16, d17, d18, d19
+    pld       [TMP1, PF_OFFS]           /* prefetch hint at TMP1 + PF_OFFS */
+    sub       TMP1, TMP1, STRIDE        /* NOTE(review): TMP1 is not read again inside this macro (the later pld uses TMP2) - confirm intent */
+    vshll.u16 q0, d2, #8                /* pixel 0: q0  = d2 * (256 - d30) + d3 * d30 */
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8              /* pixel 1: q10 = d22 * (256 - d31) + d23 * d31 */
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshr.u16  q15, q12, #8              /* refresh the weights in q15 for pixels 2 and 3 */
+    vshll.u16 q2, d6, #8                /* pixel 2: q2  = d6 * (256 - d30) + d7 * d30 */
+    vmlsl.u16 q2, d6, d30
+    vmlal.u16 q2, d7, d30
+    vshll.u16 q8, d18, #8               /* pixel 3: q8  = d18 * (256 - d31) + d19 * d31 */
+    pld       [TMP2, PF_OFFS]           /* prefetch hint at TMP2 + PF_OFFS */
+    vmlsl.u16 q8, d18, d31
+    vmlal.u16 q8, d19, d31
+    vadd.u16  q12, q12, q13             /* q12 += q13 */
+    vshrn.u32 d0, q0, #16               /* keep the upper 16 bits of each 32-bit lane */
+    vshrn.u32 d1, q10, #16
+    vshrn.u32 d4, q2, #16
+    vshrn.u32 d5, q8, #16
+    vshr.u16  q15, q12, #8              /* weights for the pixels handled after this macro */
+    vmovn.u16 d0, q0                    /* narrow to 8-bit lanes: pixels 0-1 in d0 */
+    vmovn.u16 d1, q2                    /* pixels 2-3 in d1 */
+    vadd.u16  q12, q12, q13
+    bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt   /* pipeline prologue for the 4-pixel loop */
+.ifndef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.else   /* a format-specific pipelined implementation has been declared */
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt   /* pipeline epilogue; expands to nothing unless a format-specific variant is declared */
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt   /* steady-state body of the 4-pixel loop */
+.ifndef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.else   /* a format-specific pipelined implementation has been declared */
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt   /* pipeline prologue for the 8-pixel loop */
+.ifndef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.else   /* a format-specific 8-pixel implementation has been declared */
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt   /* pipeline epilogue for the 8-pixel loop */
+.ifndef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.else   /* a format-specific 8-pixel implementation has been declared */
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt   /* steady-state body of the 8-pixel loop */
+.ifndef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.else   /* a format-specific 8-pixel implementation has been declared */
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.endif
+.endm
+
+.set BILINEAR_FLAG_UNROLL_4,          0   /* value 0: the 4-pixel loop is what is generated when the UNROLL_8 bit is clear */
+.set BILINEAR_FLAG_UNROLL_8,          1   /* bit tested by generate_bilinear_scanline_func to emit the 8-pixel loop */
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2   /* bit that makes the generated function save and restore d8-d15 */
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions. It uses the following arguments:
+ *  fname             - name of the function to generate
+ *  src_fmt           - source color format (8888 or 0565)
+ *  dst_fmt           - destination color format (8888 or 0565)
+ *  src_bpp_shift     - (1 << src_bpp_shift) is the source pixel size in bytes
+ *  dst_bpp_shift     - (1 << dst_bpp_shift) is the destination pixel size
+ *                      in bytes
+ *  prefetch_distance - prefetch in the source image by that many pixels ahead
+ *  flags             - bitwise OR of the BILINEAR_FLAG_* constants
+ */
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+                                       src_bpp_shift, dst_bpp_shift, \
+                                       prefetch_distance, flags
+    /* Emits function fname; it takes OUT, TOP, BOTTOM, WT in r0-r3 and loads WB, X, UX, WIDTH from the stack. */
+pixman_asm_function fname
+    OUT       .req      r0
+    TOP       .req      r1
+    BOTTOM    .req      r2
+    WT        .req      r3
+    WB        .req      r4
+    X         .req      r5
+    UX        .req      r6
+    WIDTH     .req      ip
+    TMP1      .req      r3      /* same register as WT */
+    TMP2      .req      r4      /* same register as WB */
+    PF_OFFS   .req      r7
+    TMP3      .req      r8
+    TMP4      .req      r9
+    STRIDE    .req      r2      /* same register as BOTTOM */
+
+    mov       ip, sp                        /* remember sp as it was before the push below */
+    push      {r4, r5, r6, r7, r8, r9}
+    mov       PF_OFFS, #prefetch_distance
+    ldmia     ip, {WB, X, UX, WIDTH}        /* four words starting at the entry-time sp */
+    mul       PF_OFFS, PF_OFFS, UX          /* PF_OFFS = prefetch_distance * UX */
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpush     {d8-d15}
+.endif
+
+    sub       STRIDE, BOTTOM, TOP           /* STRIDE = BOTTOM - TOP; r2 no longer holds BOTTOM */
+    .unreq    BOTTOM
+
+    cmp       WIDTH, #0
+    ble       3f                            /* WIDTH <= 0: go straight to the epilogue */
+
+    vdup.u16  q12, X                        /* low 16 bits of X in every 16-bit lane of q12 */
+    vdup.u16  q13, UX                       /* low 16 bits of UX in every 16-bit lane of q13 */
+    vdup.u8   d28, WT                       /* WT in every byte of d28; WT is not referenced after this point */
+    vdup.u8   d29, WB                       /* WB in every byte of d29 */
+    vadd.u16  d25, d25, d26                 /* upper half of q12 += UX, so d24 holds X and d25 holds X + UX */
+
+    /* ensure good destination alignment  */
+    cmp       WIDTH, #1
+    blt       0f
+    tst       OUT, #(1 << dst_bpp_shift)    /* is the bit just above the pixel-size bits set in OUT? */
+    beq       0f
+    vshr.u16  q15, q12, #8                  /* q15 = q12 >> 8: weights read by the interpolate macros */
+    vadd.u16  q12, q12, q13                 /* advance q12 by UX */
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #1
+0:
+    vadd.u16  q13, q13, q13                 /* q13 = 2 * UX from here on */
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+
+    cmp       WIDTH, #2
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 1))   /* next alignment bit: emit two pixels if it is set */
+    beq       0f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #2
+0:
+.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+/*********** 8 pixels per iteration *****************/
+    cmp       WIDTH, #4
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 2))   /* next alignment bit: emit four pixels if it is set */
+    beq       0f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #4
+0:
+    subs      WIDTH, WIDTH, #8
+    blt       1f                            /* fewer than 8 pixels left: skip the pipelined loop */
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)   /* scale PF_OFFS down by 2^(16 - src_bpp_shift) */
+    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    blt       5f
+0:
+    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    bge       0b
+5:
+    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+1:
+    tst       WIDTH, #4                     /* bit 2 of WIDTH is unaffected by the subtractions of 8 */
+    beq       2f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+2:
+.else
+/*********** 4 pixels per iteration *****************/
+    subs      WIDTH, WIDTH, #4
+    blt       1f                            /* fewer than 4 pixels left: skip the pipelined loop */
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)   /* scale PF_OFFS down by 2^(16 - src_bpp_shift) */
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    blt       5f
+0:
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    bge       0b
+5:
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+1:
+/****************************************************/
+.endif
+    /* handle the remaining trailing pixels */
+    tst       WIDTH, #2                     /* bits 0-1 of WIDTH are unaffected by the subtractions of 4 or 8 */
+    beq       2f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+2:
+    tst       WIDTH, #1
+    beq       3f
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpop      {d8-d15}
+.endif
+    pop       {r4, r5, r6, r7, r8, r9}
+    bx        lr
+
+    .unreq    OUT
+    .unreq    TOP
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+    .unreq    PF_OFFS
+    .unreq    TMP3
+    .unreq    TMP4
+    .unreq    STRIDE
+.endfunc
+
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_four_pixels_8888_8888, 1
+    /* The symbol above makes the bilinear_interpolate_four_pixels_{head,tail,tail_head} wrappers use the three macros below. */
+.macro bilinear_interpolate_four_pixels_8888_8888_head
+    mov       TMP1, X, asr #16          /* TMP1 = X >> 16 (pixel index) */
+    add       X, X, UX                  /* step X by UX */
+    add       TMP1, TOP, TMP1, asl #2   /* TMP1 = TOP + index * 4 bytes */
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+    /* pixel 0: 8 bytes at TMP1 and 8 bytes STRIDE further on, blended with d28/d29 into q8 */
+    vld1.32   {d22}, [TMP1], STRIDE
+    vld1.32   {d23}, [TMP1]
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    vmull.u8  q8, d22, d28
+    vmlal.u8  q8, d23, d29
+    /* pixel 1: same steps through TMP2, result in q9 */
+    vld1.32   {d22}, [TMP2], STRIDE
+    vld1.32   {d23}, [TMP2]
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmull.u8  q9, d22, d28
+    vmlal.u8  q9, d23, d29
+    /* pixel 2: same steps through TMP3, result in q10 */
+    vld1.32   {d22}, [TMP3], STRIDE
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+    /* horizontal step for pixel 0: q0 = d16 * (256 - d30) + d17 * d30 */
+    vshll.u16 q0, d16, #8
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    /* pixel 3: d16/d17 are free again and are reused for its loads; result in q11 */
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]           /* second prefetch hint, issued after TMP4 has advanced by STRIDE */
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    /* horizontal step for pixel 1 is started here and completed by _tail or _tail_head */
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail   /* finishes the four pixels started by _head or _tail_head and stores them */
+    vmlal.u16 q1, d19, d31              /* completes pixel 1: q1 = d18 * (256 - d31) + d19 * d31 */
+    vshr.u16  q15, q12, #8              /* refresh the weights in q15 */
+    vshll.u16 q2, d20, #8               /* pixel 2: q2 = d20 * (256 - d30) + d21 * d30 */
+    vmlsl.u16 q2, d20, d30
+    vmlal.u16 q2, d21, d30
+    vshll.u16 q3, d22, #8               /* pixel 3: q3 = d22 * (256 - d31) + d23 * d31 */
+    vmlsl.u16 q3, d22, d31
+    vmlal.u16 q3, d23, d31
+    vadd.u16  q12, q12, q13             /* q12 += q13 */
+    vshrn.u32 d0, q0, #16               /* keep the upper 16 bits of each 32-bit lane */
+    vshrn.u32 d1, q1, #16
+    vshrn.u32 d4, q2, #16
+    vshr.u16  q15, q12, #8
+    vshrn.u32 d5, q3, #16
+    vmovn.u16 d6, q0                    /* narrow to 8-bit lanes: pixels 0-1 in d6 */
+    vmovn.u16 d7, q2                    /* pixels 2-3 in d7 */
+    vadd.u16  q12, q12, q13
+    vst1.32   {d6, d7}, [OUT, :128]!    /* 16 bytes, 128-bit alignment hint, OUT is advanced */
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail_head   /* loop body: 8-space-indented lines repeat the _tail operations for the previous four pixels, the others start the next four as in _head */
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+        vshr.u16  q15, q12, #8
+        vshll.u16 q2, d20, #8
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE     /* d20/d21 are reloaded only after q2 has consumed their previous contents */
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #16
+        vshrn.u32 d1, q1, #16
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE     /* d22/d23 are reloaded only after q3 has consumed their previous contents */
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d6, q0                /* previous q0 is narrowed before q0 is overwritten on the next line */
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d7, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+        vst1.32   {d6, d7}, [OUT, :128]!    /* store of the previous four pixels */
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
+    /* The symbol above makes the bilinear_interpolate_eight_pixels_{head,tail,tail_head} wrappers use the three macros below. */
+.macro bilinear_interpolate_eight_pixels_8888_0565_head
+    mov       TMP1, X, asr #16          /* TMP1 = X >> 16 (pixel index) */
+    add       X, X, UX                  /* step X by UX */
+    add       TMP1, TOP, TMP1, asl #2   /* TMP1 = TOP + index * 4 bytes */
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+    vld1.32   {d20}, [TMP1], STRIDE     /* 8 bytes at TMP1, then TMP1 += STRIDE */
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28              /* q8 = d20 * d28 + d21 * d29 */
+    vmlal.u8  q8, d21, d29
+    vld1.32   {d22}, [TMP2], STRIDE
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+    vshll.u16 q0, d16, #8               /* q0 = d16 * (256 - d30) + d17 * d30 */
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+    /* second half: 8-space-indented lines finish the first four pixels into d8/d9 while the loads for the next four are issued */
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+        vshr.u16  q15, q12, #8
+        vshll.u16 q2, d20, #8
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #16
+        vshrn.u32 d1, q1, #16
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d8, q0                /* first four finished pixels are kept in d8/d9 until the 0565 packing */
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d9, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+.endm
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_tail   /* finishes the pending four pixels, packs all eight to 16 bits each and stores them */
+    vmlal.u16 q1, d19, d31              /* completes q1 = d18 * (256 - d31) + d19 * d31 */
+    vshr.u16  q15, q12, #8              /* refresh the weights in q15 */
+    vshll.u16 q2, d20, #8               /* q2 = d20 * (256 - d30) + d21 * d30 */
+    vmlsl.u16 q2, d20, d30
+    vmlal.u16 q2, d21, d30
+    vshll.u16 q3, d22, #8               /* q3 = d22 * (256 - d31) + d23 * d31 */
+    vmlsl.u16 q3, d22, d31
+    vmlal.u16 q3, d23, d31
+    vadd.u16  q12, q12, q13             /* q12 += q13 */
+    vshrn.u32 d0, q0, #16               /* keep the upper 16 bits of each 32-bit lane */
+    vshrn.u32 d1, q1, #16
+    vshrn.u32 d4, q2, #16
+    vshr.u16  q15, q12, #8
+    vshrn.u32 d5, q3, #16
+    vmovn.u16 d10, q0                   /* last four pixels, 8 bits per channel, in d10/d11 */
+    vmovn.u16 d11, q2
+    vadd.u16  q12, q12, q13
+    /* de-interleave the bytes of the eight pixels held in d8-d11 into one register per channel */
+    vuzp.u8   d8, d9
+    vuzp.u8   d10, d11
+    vuzp.u8   d9, d11
+    vuzp.u8   d8, d10
+    vshll.u8  q6, d9, #8                /* widen each channel to 16 bits, value in the top byte */
+    vshll.u8  q5, d10, #8
+    vshll.u8  q7, d8, #8
+    vsri.u16  q5, q6, #5                /* insert q6 >> 5 below the top 5 bits of q5 */
+    vsri.u16  q5, q7, #11               /* insert q7 >> 11 below the top 11 bits of q5 */
+    vst1.32   {d10, d11}, [OUT, :128]!  /* 16 bytes (eight 16-bit pixels), OUT is advanced */
+.endm
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head   /* loop body; indentation marks the stage: 4 spaces = addresses, loads, vertical blend and start of horizontal step, 8 = rest of horizontal step and narrowing, 12 = 16-bit packing and store of the previous eight pixels (as at the end of _tail) */
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+        vshr.u16  q15, q12, #8
+            vuzp.u8 d8, d9
+        vshll.u16 q2, d20, #8
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #16
+        vshrn.u32 d1, q1, #16
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d10, q0               /* second four pixels of the previous group land in d10/d11 */
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d11, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+            vuzp.u8 d10, d11
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+    /* second half of the iteration: same interleaving, now also packing d8-d11 into q5 */
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+            vuzp.u8 d9, d11
+        vshr.u16  q15, q12, #8
+        vshll.u16 q2, d20, #8
+            vuzp.u8 d8, d10
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+            vshll.u8  q6, d9, #8
+            vshll.u8  q5, d10, #8
+            vshll.u8  q7, d8, #8
+        vshrn.u32 d0, q0, #16
+            vsri.u16  q5, q6, #5
+        vshrn.u32 d1, q1, #16
+            vsri.u16  q5, q7, #11
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d8, q0                /* d8/d9 are overwritten only after q6/q7 were derived from them above */
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d9, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #8
+            vst1.32   {d10, d11}, [OUT, :128]!   /* store of the previous eight 16-bit pixels */
+    vmlsl.u16 q1, d18, d31
+.endm
+/*****************************************************************************/
+/* 8888 -> 8888: 4-byte source and destination pixels, prefetch distance 28, 4-pixel loop */
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+    2, 2, 28, BILINEAR_FLAG_UNROLL_4
+/* 8888 -> 0565: 4-byte source, 2-byte destination, 8-pixel loop that also uses (and saves) d8-d15 */
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
+    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
+/* 0565 -> x888: 2-byte source, 4-byte destination, 4-pixel loop (no have_* symbol for this pair is set in this chunk) */
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
+    1, 2, 28, BILINEAR_FLAG_UNROLL_4
+/* 0565 -> 0565: 2-byte source and destination, 4-pixel loop (no have_* symbol for this pair is set in this chunk) */
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
+    1, 1, 28, BILINEAR_FLAG_UNROLL_4
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
new file mode 100644
index 0000000..97adc6a
--- /dev/null
+++ b/pixman/pixman-arm-neon-asm.h
@@ -0,0 +1,1177 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions, based on a common template.
+ * Any combinations of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp, 32bpp color formats are supported.
+ *
+ * This macro takes care of:
+ *  - handling of leading and trailing unaligned pixels
+ *  - doing most of the work related to L2 cache preload
+ *  - encourages the use of software pipelining for better instructions
+ *    scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros, which should implement basic code chunks responsible for
+ * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
+ * examples.
+ *
+ * TODO:
+ *  - try overlapped pixel method (from Ian Rickards) when processing
+ *    exactly two blocks of pixels
+ *  - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY,       0   /* value 0: sets no bit in the flags word */
+.set FLAG_DST_READWRITE,       1   /* bit 0 */
+.set FLAG_DEINTERLEAVE_32BPP,  2   /* bit 1 */
+
+/*
+ * Offset in stack where mask and source pointer/stride can be accessed
+ * from 'init' macro. This is useful for doing special handling for solid mask.
+ */
+.set ARGS_STACK_OFFSET,        40
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits   /* op.elem_size on one register d<reg1>, post-incrementing mem_operand */
+.if abits > 0
+    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!   /* with an abits-bit alignment hint */
+.else
+    op&.&elem_size {d&reg1}, [&mem_operand&]!             /* no alignment hint */
+.endif
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits   /* op.elem_size on two d registers, post-incrementing mem_operand */
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!   /* with an abits-bit alignment hint */
+.else
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!             /* no alignment hint */
+.endif
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits   /* op.elem_size on four d registers, post-incrementing mem_operand */
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!   /* with an abits-bit alignment hint */
+.else
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!             /* no alignment hint */
+.endif
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits   /* op.elem_size on lane idx of d<reg1>; abits is accepted but not used */
+    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand   /* op.elem_size on three d registers, post-incrementing mem_operand; no alignment hint */
+    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand   /* op.elem_size on lane idx of three d registers, post-incrementing mem_operand */
+    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits   /* transfers numbytes (32, 16, 8, 4, 2 or 1) bytes; the d registers and lanes used are fixed offsets from basereg */
+.if numbytes == 32                     /* d(basereg+4) .. d(basereg+7) */
+    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+                              %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif numbytes == 16                 /* d(basereg+2), d(basereg+3) */
+    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
+.elseif numbytes == 8                  /* d(basereg+1) */
+    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
+.elseif numbytes == 4                  /* upper half of d(basereg): one 32-bit, two 16-bit or four 8-bit lanes */
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
+        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
+    .elseif elem_size == 16
+        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+    .endif
+.elseif numbytes == 2                  /* bytes 2-3 of d(basereg): one 16-bit or two 8-bit lanes */
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
+        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+    .endif
+.elseif numbytes == 1                  /* byte 1 of d(basereg) */
+    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0   /* loads numpix pixels of bpp bits each; expands to nothing unless bpp > 0 */
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)   /* vld4.8 into d(basereg+4) .. d(basereg+7) */
+    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)   /* 24bpp cases use vld3.8; abits is not forwarded to them */
+    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)   /* lanes 4-7 of d(basereg) .. d(basereg+2) */
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)   /* lanes 2-3 */
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)   /* lane 1 */
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else                                  /* everything else: plain vld1 of numpix * bpp / 8 bytes */
+    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0   /* stores numpix pixels of bpp bits each; same case split as pixld, with vst in place of vld */
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)   /* vst4.8 from d(basereg+4) .. d(basereg+7) */
+    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)   /* 24bpp cases use vst3.8; abits is not forwarded to them */
+    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)   /* lanes 4-7 of d(basereg) .. d(basereg+2) */
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)   /* lanes 2-3 */
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)   /* lane 1 */
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else                                  /* everything else: plain vst1 of numpix * bpp / 8 bytes */
+    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand   /* pixld with an alignment hint of min(bpp * numpix, 128) bits */
+.if (bpp * numpix) > 128
+    pixld numpix, bpp, basereg, mem_operand, 128
+.else   /* the whole transfer is at most 128 bits wide: use its own size as the hint */
+    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand   /* pixst with an alignment hint of min(bpp * numpix, 128) bits */
+.if (bpp * numpix) > 128
+    pixst numpix, bpp, basereg, mem_operand, 128
+.else   /* the whole transfer is at most 128 bits wide: use its own size as the hint */
+    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.endif
+.endm
+
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+.macro pixld1_s elem_size, reg1, mem_operand   /* fills d<reg1> with four 16-bit or two 32-bit elements fetched at successive VX positions */
+.if elem_size == 16
+    mov     TMP1, VX, asr #16               /* TMP1 = VX >> 16 (element index) */
+    add     VX, VX, UNIT_X                  /* step VX by UNIT_X */
+    add     TMP1, mem_operand, TMP1, asl #1 /* TMP1 = mem_operand + index * 2 bytes */
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[0]}, [TMP1, :16]       /* loads alternate between TMP1 and TMP2 so the next address is computed meanwhile */
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[1]}, [TMP2, :16]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[2]}, [TMP1, :16]
+    vld1.16 {d&reg1&[3]}, [TMP2, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #2 /* TMP1 = mem_operand + index * 4 bytes */
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    vld1.32 {d&reg1&[1]}, [TMP2, :32]
+.else
+    .error "unsupported"
+.endif
+.endm
+
+.macro pixld2_s elem_size, reg1, reg2, mem_operand   /* fills d<reg1> then d<reg2> with elements fetched at successive VX positions */
+.if elem_size == 32                        /* fetch order is positions 0, 2, 1, 3; VX ends up advanced by 4 * UNIT_X */
+    mov     TMP1, VX, asr #16               /* position 0 */
+    add     VX, VX, UNIT_X, asl #1          /* VX += 2 * UNIT_X */
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16               /* position 2 */
+    sub     VX, VX, UNIT_X                  /* step back to position 1 */
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    mov     TMP1, VX, asr #16               /* position 1 */
+    add     VX, VX, UNIT_X, asl #1          /* VX now at position 3 */
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg2&[0]}, [TMP2, :32]
+    mov     TMP2, VX, asr #16               /* position 3 */
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[1]}, [TMP1, :32]
+    vld1.32 {d&reg2&[1]}, [TMP2, :32]
+.else                                      /* other element sizes: two plain pixld1_s expansions */
+    pixld1_s elem_size, reg1, mem_operand
+    pixld1_s elem_size, reg2, mem_operand
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand   /* fetches one element at the current VX position into lane idx of d<reg1> and steps VX */
+.if elem_size == 16
+    mov     TMP1, VX, asr #16               /* TMP1 = VX >> 16 (element index) */
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1 /* 2-byte elements */
+    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #2 /* 4-byte elements */
+    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+.endif   /* NOTE(review): any other elem_size expands to nothing, without an .error - confirm this is intended */
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand   /* scaled-fetch dispatcher by byte count; register and lane choices are fixed offsets from basereg */
+.if numbytes == 32                     /* d(basereg+4) .. d(basereg+7), followed by pixdeinterleave */
+    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
+    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
+    pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16                 /* d(basereg+2), d(basereg+3) */
+    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
+.elseif numbytes == 8                  /* d(basereg+1) */
+    pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4                  /* upper half of d(basereg) */
+    .if elem_size == 32
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .elseif elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 4, mem_operand
+        pixld0_s elem_size, %(basereg+0), 5, mem_operand
+        pixld0_s elem_size, %(basereg+0), 6, mem_operand
+        pixld0_s elem_size, %(basereg+0), 7, mem_operand
+    .endif
+.elseif numbytes == 2                  /* bytes 2-3 of d(basereg) */
+    .if elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .endif
+.elseif numbytes == 1                  /* byte 1 of d(basereg) */
+    pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0 /* expands to nothing when bpp is 0 */
+    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif /* numpix * bpp / 8 above is the byte count passed as 'numbytes' */
+.endm
+
+.macro vuzp8 reg1, reg2
+    vuzp.8 d&reg1, d&reg2 /* byte-wise unzip of d<reg1> and d<reg2> */
+.endm
+
+.macro vzip8 reg1, reg2
+    vzip.8 d&reg1, d&reg2 /* byte-wise zip of d<reg1> and d<reg2> */
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) /* else: no code */
+    vuzp8 %(basereg+0), %(basereg+1)
+    vuzp8 %(basereg+2), %(basereg+3)
+    vuzp8 %(basereg+1), %(basereg+3)
+    vuzp8 %(basereg+0), %(basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) /* else: no code */
+    vzip8 %(basereg+0), %(basereg+2)
+    vzip8 %(basereg+1), %(basereg+3)
+    vzip8 %(basereg+2), %(basereg+3)
+    vzip8 %(basereg+0), %(basereg+1)
+.endif
+.endm
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * cache preload logic is mostly independent from the rest of pixels
+ * processing code. It starts at the top left pixel and moves forward
+ * across pixels and can jump across scanlines. Prefetch distance is
+ * handled in an 'incremental' way: it starts from 0 and advances to the
+ * optimal distance over time. After reaching optimal prefetch distance,
+ * it is kept constant. There are some checks which prevent prefetching
+ * unneeded pixel lines below the image (but it still can prefetch a bit
+ * more data on the right side of the image - not a big issue and may
+ * be actually helpful when rendering text glyphs). Additional trick is
+ * the use of LDR instruction for prefetch instead of PLD when moving to
+ * the next line, the point is that we have a high chance of getting TLB
+ * miss in this case, and PLD would be useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working with
+ * fully cached data). But in reality, due to having a separate pipeline and
+ * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
+ * execute simultaneously with NEON and be completely shadowed by it. Thus
+ * we get no performance overhead at all (*). This looks like a very nice
+ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
+ * but still can implement some rather advanced prefetch logic in software
+ * for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixels processing like simple copy. Anyway, having prefetch is a must
+ * when working with the graphics data.
+ */
+.macro PF a, x:vararg
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+    a x /* emit instruction 'a' with operands 'x' only for ADVANCED prefetch */
+.endif
+.endm
+
+.macro cache_preload std_increment, boost_increment
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+.if regs_shortage
+    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+.endif
+.if std_increment != 0
+    PF add PF_X, PF_X, #std_increment
+.endif
+    PF tst PF_CTL, #0xF /* low 4 bits: remaining prefetch distance boosts */
+    PF addne PF_X, PF_X, #boost_increment
+    PF subne PF_CTL, PF_CTL, #1
+    PF cmp PF_X, ORIG_W /* flags feed the 'ge' conditional instructions below */
+.if src_bpp_shift >= 0
+    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+.endif
+.if dst_r_bpp != 0
+    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+.endif
+.if mask_bpp_shift >= 0
+    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+.endif
+    PF subge PF_X, PF_X, ORIG_W /* PF_X reached the line width: wrap it */
+    PF subges PF_CTL, PF_CTL, #0x10 /* one line less (counter in bits 4+) */
+.if src_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! /* next line */
+.endif
+.if dst_r_bpp != 0
+    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! /* next line */
+.endif
+.if mask_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! /* next line */
+.endif
+.endif
+.endm
+
+.macro cache_preload_simple
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) /* else: no code */
+.if src_bpp > 0
+    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] /* fixed byte offset */
+.endif
+.if dst_r_bpp > 0
+    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+.macro fetch_mask_pixblock /* load pixblock_size mask pixels from MASK */
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+.endm
+
+/*
+ * Macro which is used to process leading pixels until destination
+ * pointer is properly aligned (at 16 bytes boundary). When destination
+ * buffer uses 16bpp format, this is unnecessary, or even pointless.
+ */
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+                                        process_pixblock_tail, \
+                                        process_pixblock_tail_head
+.if dst_w_bpp != 24 /* for 24bpp only label 2 below is emitted */
+    tst         DST_R, #0xF
+    beq         2f /* already aligned: skip everything */
+
+.irp lowbit, 1, 2, 4, 8, 16
+local skip1
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_R, #lowbit
+    beq         1f /* this address bit is clear: no chunk of this size */
+.endif
+    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+    add         DST_R, DST_R, #lowbit /* destination not read: just step */
+.endif
+    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+    sub         W, W, #(lowbit * 8 / dst_w_bpp) /* pixels consumed here */
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    process_pixblock_tail
+
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp lowbit, 1, 2, 4, 8, 16
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_W, #lowbit /* same walk as above, now for the stores */
+    beq         1f
+.endif
+    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+1:
+.endif
+.endr
+.endif
+2:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing performs operation on
+ * pixblock_size pixels, anything smaller than this has to be loaded
+ * and stored in a special way. Loading and storing of pixel data is
+ * performed in such a way that we fill some 'slots' in the NEON
+ * registers (some slots naturally are unused), then perform compositing
+ * operation as usual. In the end, the data is taken from these 'slots'
+ * and saved to memory.
+ *
+ * cache_preload_flag - allows to suppress prefetch if
+ *                      set to 0
+ * dst_aligned_flag   - selects whether destination buffer
+ *                      is aligned
+ */
+.macro process_trailing_pixels cache_preload_flag, \
+                               dst_aligned_flag, \
+                               process_pixblock_head, \
+                               process_pixblock_tail, \
+                               process_pixblock_tail_head
+    tst         W, #(pixblock_size - 1) /* only the low bits of W are used */
+    beq         2f /* no trailing pixels at all */
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f /* bit clear: no chunk of this size to load */
+    pixld_src   chunk_size, src_bpp, src_basereg, SRC
+    pixld       chunk_size, mask_bpp, mask_basereg, MASK
+.if dst_aligned_flag != 0
+    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if cache_preload_flag != 0
+    PF add      PF_X, PF_X, #chunk_size
+.endif
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+.if cache_preload_flag != 0
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+.endif
+    process_pixblock_tail
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size /* W is unchanged, so the same chunks match */
+    beq         1f
+.if dst_aligned_flag != 0
+    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+1:
+.endif
+.endr
+2:
+.endm
+
+/*
+ * Macro, which performs all the needed operations to switch to the next
+ * scanline and start the next loop iteration unless all the scanlines
+ * are already processed.
+ */
+.macro advance_to_next_scanline start_of_loop_label
+.if regs_shortage
+    ldrd        W, [sp] /* load W and H (width and height) from stack */
+.else
+    mov         W, ORIG_W /* restore the full line width */
+.endif
+    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
+.if src_bpp != 0
+    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
+.endif
+.if mask_bpp != 0
+    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+.endif
+.if (dst_w_bpp != 24)
+    sub         DST_W, DST_W, W, lsl #dst_bpp_shift /* rewind by line width */
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+    sub         SRC, SRC, W, lsl #src_bpp_shift /* rewind by line width */
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+    sub         MASK, MASK, W, lsl #mask_bpp_shift /* rewind by line width */
+.endif
+    subs        H, H, #1 /* sets the flags tested by 'bge' below */
+    mov         DST_R, DST_W
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.endif
+    bge         start_of_loop_label /* mov/str above leave the flags intact */
+.endm
+
+/*
+ * Registers are allocated in the following way by default:
+ * d0, d1, d2, d3     - reserved for loading source pixel data
+ * d4, d5, d6, d7     - reserved for loading destination pixel data
+ * d24, d25, d26, d27 - reserved for loading mask pixel data
+ * d28, d29, d30, d31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+                                   src_bpp_, \
+                                   mask_bpp_, \
+                                   dst_w_bpp_, \
+                                   flags, \
+                                   pixblock_size_, \
+                                   prefetch_distance, \
+                                   init, \
+                                   cleanup, \
+                                   process_pixblock_head, \
+                                   process_pixblock_tail, \
+                                   process_pixblock_tail_head, \
+                                   dst_w_basereg_ = 28, \
+                                   dst_r_basereg_ = 4, \
+                                   src_basereg_   = 0, \
+                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    push        {r4-r12, lr}        /* save all registers (10 words, 40 bytes) */
+
+/*
+ * Select prefetch type for this function. If prefetch distance is
+ * set to 0, prefetch is disabled (NONE); if one of the color formats
+ * is 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
+ */
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+.if prefetch_distance == 0
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+/*
+ * Assign symbolic names to registers
+ */
+    W           .req        r0      /* width (is updated during processing) */
+    H           .req        r1      /* height (is updated during processing) */
+    DST_W       .req        r2      /* destination buffer pointer for writes */
+    DST_STRIDE  .req        r3      /* destination image stride */
+    SRC         .req        r4      /* source buffer pointer */
+    SRC_STRIDE  .req        r5      /* source image stride */
+    DST_R       .req        r6      /* destination buffer pointer for reads */
+
+    MASK        .req        r7      /* mask pointer */
+    MASK_STRIDE .req        r8      /* mask stride */
+
+    PF_CTL      .req        r9      /* combined lines counter and prefetch */
+                                    /* distance increment counter */
+    PF_X        .req        r10     /* pixel index in a scanline for current */
+                                    /* prefetch position */
+    PF_SRC      .req        r11     /* pointer to source scanline start */
+                                    /* for prefetch purposes */
+    PF_DST      .req        r12     /* pointer to destination scanline start */
+                                    /* for prefetch purposes */
+    PF_MASK     .req        r14     /* pointer to mask scanline start */
+                                    /* for prefetch purposes */
+/*
+ * Check whether we have enough registers for all the local variables.
+ * If we don't have enough registers, original width and height are
+ * kept on top of stack (and 'regs_shortage' variable is set to indicate
+ * this for the rest of code). Even if there are enough registers, the
+ * allocation scheme may be a bit different depending on whether source
+ * or mask is not used.
+ */
+.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
+    ORIG_W      .req        r10     /* saved original width */
+    DUMMY       .req        r12     /* temporary register */
+    .set        regs_shortage, 0
+.elseif mask_bpp == 0
+    ORIG_W      .req        r7      /* saved original width */
+    DUMMY       .req        r8      /* temporary register */
+    .set        regs_shortage, 0
+.elseif src_bpp == 0
+    ORIG_W      .req        r4      /* saved original width */
+    DUMMY       .req        r5      /* temporary register */
+    .set        regs_shortage, 0
+.else
+    ORIG_W      .req        r1      /* saved original width */
+    DUMMY       .req        r1      /* temporary register */
+    .set        regs_shortage, 1
+.endif
+
+    .set mask_bpp_shift, -1
+.if src_bpp == 32
+    .set src_bpp_shift, 2
+.elseif src_bpp == 24
+    .set src_bpp_shift, 0
+.elseif src_bpp == 16
+    .set src_bpp_shift, 1
+.elseif src_bpp == 8
+    .set src_bpp_shift, 0
+.elseif src_bpp == 0
+    .set src_bpp_shift, -1
+.else
+    .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+    .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+    .set mask_bpp_shift, -1
+.else
+    .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+    .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+    .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+    .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+    .set dst_bpp_shift, 0
+.else
+    .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if prefetch_distance < 0 || prefetch_distance > 15
+    .error "invalid prefetch distance (prefetch_distance)"
+.endif
+
+.if src_bpp > 0
+    ldr         SRC, [sp, #40] /* stack args sit above the 40 pushed bytes */
+.endif
+.if mask_bpp > 0
+    ldr         MASK, [sp, #48]
+.endif
+    PF mov      PF_X, #0
+.if src_bpp > 0
+    ldr         SRC_STRIDE, [sp, #44]
+.endif
+.if mask_bpp > 0
+    ldr         MASK_STRIDE, [sp, #52]
+.endif
+    mov         DST_R, DST_W
+
+.if src_bpp == 24
+    sub         SRC_STRIDE, SRC_STRIDE, W /* two subs: stride -= 3 * W */
+    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+    sub         MASK_STRIDE, MASK_STRIDE, W
+    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+    sub         DST_STRIDE, DST_STRIDE, W
+    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
+
+/*
+ * Setup advanced prefetcher initial state
+ */
+    PF mov      PF_SRC, SRC
+    PF mov      PF_DST, DST_R
+    PF mov      PF_MASK, MASK
+    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+    PF mov      PF_CTL, H, lsl #4
+    PF add      PF_CTL, #(prefetch_distance - 0x10)
+
+    init
+.if regs_shortage
+    push        {r0, r1} /* keep W and H on top of the stack */
+.endif
+    subs        H, H, #1 /* sets the flags tested by 'blt 9f' below */
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.else
+    mov         ORIG_W, W
+.endif
+    blt         9f /* height was not positive: nothing to draw */
+    cmp         W, #(pixblock_size * 2)
+    blt         8f /* narrow image: use the simple loop at label 8 */
+/*
+ * This is the start of the pipelined loop, which is optimized for
+ * long scanlines
+ */
+0:
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    PF add      PF_X, PF_X, #pixblock_size
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    subs        W, W, #(pixblock_size * 2)
+    blt         2f
+1:
+    process_pixblock_tail_head
+    cache_preload_simple
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 1, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 0b
+
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+/*
+ * This is the start of the loop, designed to process images with small width
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch are used.
+ */
+8:
+    /* Process exactly pixblock_size pixels if needed */
+    tst         W, #pixblock_size
+    beq         1f
+    pixld       pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    process_pixblock_tail
+    pixst       pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+1:
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 8b
+9:
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      ORIG_W
+    .unreq      W
+    .unreq      H
+    .unreq      SRC_STRIDE
+    .unreq      DST_STRIDE
+    .unreq      MASK_STRIDE
+    .unreq      PF_CTL
+    .unreq      PF_X
+    .unreq      PF_SRC
+    .unreq      PF_DST
+    .unreq      PF_MASK
+    .unreq      DUMMY
+    .endfunc
+.endm
+
+/*
+ * A simplified variant of function generation template for a single
+ * scanline processing (for implementing pixman combine functions)
+ */
+.macro generate_composite_function_scanline        use_nearest_scaling, \
+                                                   fname, \
+                                                   src_bpp_, \
+                                                   mask_bpp_, \
+                                                   dst_w_bpp_, \
+                                                   flags, \
+                                                   pixblock_size_, \
+                                                   init, \
+                                                   cleanup, \
+                                                   process_pixblock_head, \
+                                                   process_pixblock_tail, \
+                                                   process_pixblock_tail_head, \
+                                                   dst_w_basereg_ = 28, \
+                                                   dst_r_basereg_ = 4, \
+                                                   src_basereg_   = 0, \
+                                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+.if use_nearest_scaling != 0
+    /*
+     * Assign symbolic names to registers for nearest scaling
+     */
+    W           .req        r0
+    DST_W       .req        r1
+    SRC         .req        r2
+    VX          .req        r3
+    UNIT_X      .req        ip
+    MASK        .req        lr
+    TMP1        .req        r4
+    TMP2        .req        r5
+    DST_R       .req        r6
+
+    .macro pixld_src x:vararg
+        pixld_s x
+    .endm
+
+    ldr         UNIT_X, [sp] /* first stack argument, read before the push */
+    push        {r4-r6, lr} /* 4 words, 16 bytes */
+    .if mask_bpp != 0
+    ldr         MASK, [sp, #(16 + 4)] /* second stack argument, past the push */
+    .endif
+.else
+    /*
+     * Assign symbolic names to registers
+     */
+    W           .req        r0      /* width (is updated during processing) */
+    DST_W       .req        r1      /* destination buffer pointer for writes */
+    SRC         .req        r2      /* source buffer pointer */
+    DST_R       .req        ip      /* destination buffer pointer for reads */
+    MASK        .req        r3      /* mask pointer */
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+
+    init
+    mov         DST_R, DST_W
+
+    cmp         W, #pixblock_size
+    blt         8f /* less than one pixblock: unaligned tail path only */
+
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    subs        W, W, #pixblock_size
+    blt         7f /* W may be negative at 7; only its low bits are tested */
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    subs        W, W, #pixblock_size
+    blt         2f
+1:
+    process_pixblock_tail_head
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+7:
+    /* Process the remaining trailing pixels in the scanline (dst aligned) */
+    process_trailing_pixels 0, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+.if use_nearest_scaling != 0
+    pop         {r4-r6, pc}  /* exit */
+.else
+    bx          lr  /* exit */
+.endif
+8:
+    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+
+.if use_nearest_scaling != 0
+    pop         {r4-r6, pc}  /* exit */
+
+    .unreq      DST_R
+    .unreq      SRC
+    .unreq      W
+    .unreq      VX
+    .unreq      UNIT_X
+    .unreq      TMP1
+    .unreq      TMP2
+    .unreq      DST_W
+    .unreq      MASK
+
+.else
+    bx          lr  /* exit */
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      W
+.endif
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
+    .endfunc
+.endm
+
+.macro generate_composite_function_single_scanline x:vararg
+    generate_composite_function_scanline 0, x /* use_nearest_scaling = 0 */
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+    generate_composite_function_scanline 1, x /* use_nearest_scaling = 1 */
+.endm
+
+/* Default prologue/epilogue: both macros are empty and emit no code */
+
+.macro default_init
+.endm
+
+.macro default_cleanup
+.endm
+
+/*
+ * Prologue/epilogue variant which additionally saves/restores d8-d15
+ * registers (they need to be saved/restored by callee according to ABI).
+ * This is required if the code needs to use all the NEON registers.
+ */
+
+.macro default_init_need_all_regs
+    vpush       {d8-d15} /* moves sp: eight 64-bit registers, 64 bytes */
+.endm
+
+.macro default_cleanup_need_all_regs
+    vpop        {d8-d15} /* must pair with the vpush in the init macro */
+.endm
+
+/******************************************************************************/
+
+/*
+ * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
+ * into a planar a8r8g8b8 format (with a, r, g, b color components
+ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
+ *
+ * Warning: the conversion is destructive and the original
+ *          value (in) is lost.
+ */
+.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
+    vshrn.u16   out_r, in,    #8 /* out_r = rrrrrggg */
+    vshrn.u16   out_g, in,    #3 /* out_g = ggggggbb */
+    vsli.u16    in,    in,    #5 /* low 10 bits of in = bbbbbbbbbb (b twice) */
+    vmov.u8     out_a, #255      /* alpha is fully opaque */
+    vsri.u8     out_r, out_r, #5 /* out_r = rrrrr + top 3 bits of r */
+    vsri.u8     out_g, out_g, #6 /* out_g = gggggg + top 2 bits of g */
+    vshrn.u16   out_b, in,    #2 /* out_b = bbbbb + top 3 bits of b */
+.endm
+
+.macro convert_0565_to_x888 in, out_r, out_g, out_b
+    vshrn.u16   out_r, in,    #8 /* out_r = rrrrrggg */
+    vshrn.u16   out_g, in,    #3 /* out_g = ggggggbb */
+    vsli.u16    in,    in,    #5 /* 'in' is overwritten: low 10 bits = b twice */
+    vsri.u8     out_r, out_r, #5 /* out_r = rrrrr + top 3 bits of r */
+    vsri.u8     out_g, out_g, #6 /* out_g = gggggg + top 2 bits of g */
+    vshrn.u16   out_b, in,    #2 /* out_b = bbbbb + top 3 bits of b */
+.endm
+
+/*
+ * Conversion from planar a8r8g8b8 format (with r, g, b color components
+ * in 64-bit registers in_r, in_g, in_b respectively; alpha is not an
+ * input and is dropped) into 8 r5g6b5 pixels packed in 128-bit register
+ * (out). Requires two temporary 128-bit registers (tmp1, tmp2)
+ */
+.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
+    vshll.u8    tmp1, in_g, #8  /* tmp1 = g in the top byte of each u16 */
+    vshll.u8    out, in_r, #8   /* out  = r in the top byte of each u16 */
+    vshll.u8    tmp2, in_b, #8  /* tmp2 = b in the top byte of each u16 */
+    vsri.u16    out, tmp1, #5   /* keep top 5 bits (r), insert g below them */
+    vsri.u16    out, tmp2, #11  /* keep top 11 bits, insert top 5 bits of b */
+.endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
+ * value from 'in' is lost (it is overwritten by the first vsri)
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+    vshl.u16    out0, in,   #5  /* G top 6 bits */
+    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
+    vsri.u16    in,   in,   #5  /* R is ready in top bits */
+    vsri.u16    out0, out0, #6  /* G is ready in top bits */
+    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
+    vshr.u16    out1, in,   #8  /* R is in place */
+    vsri.u16    out0, tmp,  #8  /* G & B is in place */
+    vzip.u16    out0, out1      /* everything is in place */
+.endm
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
new file mode 100644
index 0000000..ca139de
--- /dev/null
+++ b/pixman/pixman-arm-neon.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of ARM Ltd not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  ARM Ltd makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Ian Rickards (ian.rickards@arm.com)
+ * Author:  Jonathan Morton (jonathan.morton@movial.com)
+ * Author:  Markku Vire (markku.vire@movial.com)
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_x888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_0565,
+                                   uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0888,
+                                   uint8_t, 3, uint8_t, 3)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_0565,
+                                   uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_8888,
+                                   uint16_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_8888_rev,
+                                   uint8_t, 3, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
+                                   uint8_t, 3, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_rpixbuf_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_0565,
+                                   uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565,
+                                   uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_8888,
+                                   uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565,
+                                 uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888,
+                                 uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888,
+                                 uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8,
+                                 uint8_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565,
+                                      uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca,
+                                      uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_0565_ca,
+				      uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_0565,
+                                     uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_0565_n_0565,
+                                     uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, add_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
+                                        uint8_t, 1, uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565,
+                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8_8888,
+                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888,
+                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888,
+                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8888_8888,
+                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_0565,
+                                        uint32_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
+                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
+                                        uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
+                                        uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC,
+                                        uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC,
+                                        uint16_t, uint32_t)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565,
+                                           OVER, uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
+                                           OVER, uint16_t, uint16_t)
+
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
+                                         uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
+                                         uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
+                                         uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
+                                         uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER,
+                                         uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD,
+                                         uint32_t, uint32_t)
+
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC,
+                                            uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC,
+                                            uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC,
+                                            uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_0565, SRC,
+                                            uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OVER,
+                                            uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD,
+                                            uint32_t, uint32_t)
+
+void
+pixman_composite_src_n_8_asm_neon (int32_t   w,
+                                   int32_t   h,
+                                   uint8_t  *dst,
+                                   int32_t   dst_stride,
+                                   uint8_t   src);
+
+void
+pixman_composite_src_n_0565_asm_neon (int32_t   w,
+                                      int32_t   h,
+                                      uint16_t *dst,
+                                      int32_t   dst_stride,
+                                      uint16_t  src);
+
+void
+pixman_composite_src_n_8888_asm_neon (int32_t   w,
+                                      int32_t   h,
+                                      uint32_t *dst,
+                                      int32_t   dst_stride,
+                                      uint32_t  src);
+
+/* Forward a solid fill of the width x height rectangle at (x, y) to the NEON
+ * src_n_8 / src_n_0565 / src_n_8888 routine selected by bpp, passing the low
+ * bpp bits of _xor. Returns FALSE without filling unless bpp is 8, 16 or 32. */
+static pixman_bool_t
+pixman_fill_neon (uint32_t *bits,
+                  int       stride,
+                  int       bpp,
+                  int       x,
+                  int       y,
+                  int       width,
+                  int       height,
+                  uint32_t  _xor)
+{
+    /* stride counts 32-bit units; signed so negative strides survive "/ 2", "/ 4" */
+    int32_t byte_stride = stride * (int32_t) sizeof(uint32_t);
+
+    switch (bpp)
+    {
+    case 8:
+	pixman_composite_src_n_8_asm_neon (
+		width, height,
+		(uint8_t *)(((char *) bits) + y * byte_stride + x),
+		byte_stride,
+		_xor & 0xff);
+	return TRUE;
+    case 16:
+	pixman_composite_src_n_0565_asm_neon (
+		width, height,
+		(uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
+		byte_stride / 2,
+		_xor & 0xffff);
+	return TRUE;
+    case 32:
+	pixman_composite_src_n_8888_asm_neon (
+		width, height,
+		(uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
+		byte_stride / 4,
+		_xor);
+	return TRUE;
+    default:
+	return FALSE;
+    }
+}
+
+/* Copy a width x height rectangle between two buffers of equal depth using
+ * the NEON src_0565_0565 / src_8888_8888 routines. Strides arrive in 32-bit
+ * units. Returns FALSE when the depths differ or are neither 16 nor 32 bpp. */
+static pixman_bool_t
+pixman_blt_neon (uint32_t *src_bits,
+                 uint32_t *dst_bits,
+                 int       src_stride,
+                 int       dst_stride,
+                 int       src_bpp,
+                 int       dst_bpp,
+                 int       src_x,
+                 int       src_y,
+                 int       dest_x,
+                 int       dest_y,
+                 int       width,
+                 int       height)
+{
+    char *src = (char *) src_bits;
+    char *dst = (char *) dst_bits;
+
+    if (src_bpp == dst_bpp && src_bpp == 16)
+    {
+	pixman_composite_src_0565_0565_asm_neon (
+	    width, height,
+	    (uint16_t *)(dst + dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2,
+	    (uint16_t *)(src + src_y * src_stride * 4 + src_x * 2), src_stride * 2);
+	return TRUE;
+    }
+    if (src_bpp == dst_bpp && src_bpp == 32)
+    {
+	pixman_composite_src_8888_8888_asm_neon (
+	    width, height,
+	    (uint32_t *)(dst + dest_y * dst_stride * 4 + dest_x * 4), dst_stride,
+	    (uint32_t *)(src + src_y * src_stride * 4 + src_x * 4), src_stride);
+	return TRUE;
+    }
+
+    return FALSE;
+}
+
+static const pixman_fast_path_t arm_neon_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     b5g6r5,   neon_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     a8r8g8b8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     x8r8g8b8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     a8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     x8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     a8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     a8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  r8g8b8,   null,     r8g8b8,   neon_composite_src_0888_0888),
+    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     x8r8g8b8, neon_composite_src_0888_8888_rev),
+    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     r5g6b5,   neon_composite_src_0888_0565_rev),
+    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8r8g8b8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8b8g8r8, neon_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8r8g8b8, neon_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8b8g8r8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8r8g8b8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8r8g8b8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8b8g8r8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8b8g8r8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8,       neon_composite_src_n_8_8),
+
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       neon_composite_over_n_8_8),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   neon_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   neon_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8r8g8b8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8r8g8b8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8b8g8r8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8b8g8r8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     r5g6b5,   neon_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     a8r8g8b8, neon_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     x8r8g8b8, neon_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5,   neon_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5,   neon_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    a8r8g8b8, neon_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, neon_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    r5g6b5,   neon_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid,    b5g6r5,   neon_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   solid,    r5g6b5,   neon_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   solid,    b5g6r5,   neon_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       a8r8g8b8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       x8r8g8b8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       a8b8g8r8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       x8b8g8r8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       r5g6b5,   neon_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   neon_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   neon_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   neon_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   neon_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   neon_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     a8r8g8b8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, neon_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, neon_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   neon_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   neon_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, neon_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, neon_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, neon_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, neon_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8,       neon_composite_in_n_8),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8, neon_composite_out_reverse_8_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8, neon_composite_out_reverse_8_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
+    /* Note: NONE repeat is not supported yet */
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8_x888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    { PIXMAN_OP_NONE },
+};
+
+/* blt hook for the NEON implementation: let pixman_blt_neon do the copy when
+ * it accepts the request (same depth, 16 or 32 bpp); otherwise pass the
+ * unchanged arguments on to the delegate implementation's blt. */
+static pixman_bool_t
+arm_neon_blt (pixman_implementation_t *imp,
+              uint32_t *               src_bits,
+              uint32_t *               dst_bits,
+              int                      src_stride,
+              int                      dst_stride,
+              int                      src_bpp,
+              int                      dst_bpp,
+              int                      src_x,
+              int                      src_y,
+              int                      dest_x,
+              int                      dest_y,
+              int                      width,
+              int                      height)
+{
+    pixman_bool_t handled = pixman_blt_neon (
+	src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+	src_x, src_y, dest_x, dest_y, width, height);
+
+    if (handled)
+	return TRUE;
+    return _pixman_implementation_blt (
+	imp->delegate, src_bits, dst_bits, src_stride, dst_stride,
+	src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y, width, height);
+}
+
+/* fill hook: NEON fill when pixman_fill_neon takes the bpp, else the delegate */
+static pixman_bool_t
+arm_neon_fill (pixman_implementation_t *imp,
+               uint32_t *               bits,
+               int                      stride,
+               int                      bpp,
+               int                      x,
+               int                      y,
+               int                      width,
+               int                      height,
+               uint32_t xor)
+{
+    if (!pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
+	return _pixman_implementation_fill (
+	    imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+    return TRUE;
+}
+
+#define BIND_COMBINE_U(name)                                             \
+void                                                                     \
+pixman_composite_scanline_##name##_mask_asm_neon (int32_t         w,     \
+                                                  const uint32_t *dst,   \
+                                                  const uint32_t *src,   \
+                                                  const uint32_t *mask); \
+                                                                         \
+void                                                                     \
+pixman_composite_scanline_##name##_asm_neon (int32_t         w,          \
+                                             const uint32_t *dst,        \
+                                             const uint32_t *src);       \
+                                                                         \
+static void                                                              \
+neon_combine_##name##_u (pixman_implementation_t *imp,                   \
+                         pixman_op_t              op,                    \
+                         uint32_t *               dest,                  \
+                         const uint32_t *         src,                   \
+                         const uint32_t *         mask,                  \
+                         int                      width)                 \
+{                                                                        \
+    if (mask)                                                            \
+	pixman_composite_scanline_##name##_mask_asm_neon (width, dest,   \
+	                                                  src, mask);    \
+    else                                                                 \
+	pixman_composite_scanline_##name##_asm_neon (width, dest, src);  \
+}
+
+BIND_COMBINE_U (over)
+BIND_COMBINE_U (add)
+BIND_COMBINE_U (out_reverse)
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+	_pixman_implementation_create (fallback, arm_neon_fast_paths);
+    /* NOTE(review): imp is used unchecked. Install the NEON scanline combiners: */
+    imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
+    imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
+    /* blt/fill hooks: both wrappers hand off to imp->delegate when NEON declines */
+    imp->blt = arm_neon_blt;
+    imp->fill = arm_neon_fill;
+
+    return imp;
+}
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
new file mode 100644
index 0000000..8fe1b50
--- /dev/null
+++ b/pixman/pixman-arm-simd-asm.S
@@ -0,0 +1,439 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+/* Section setup: ARM-state code assembled with ARMv6 instructions, 4-byte aligned. */
+	.text
+	.arch armv6
+	.object_arch armv4  /* GNU as: overrides the architecture recorded in the object file */
+	.arm
+	.altmacro  /* the macros in this file name their arguments without a backslash (fname, bpp_shift, ...) */
+	.p2align 2
+
+/* Open function `fname`: emit .func, export the symbol (hidden and %function-typed on ELF) and place its label; each use in this file is closed by .endfunc. */
+.macro pixman_asm_function fname
+	.func fname
+	.global fname
+#ifdef __ELF__
+	.hidden fname
+	.type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * The code below was generated by gcc 4.3.4 from the commented out
+ * functions in 'pixman-arm-simd.c' file with the following optimization
+ * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
+ *
+ * TODO: replace gcc generated code with hand tuned versions because
+ * the code quality is not very good, introduce symbolic register
+ * aliases for better readability and maintainability.
+ */
+/* add_8_8: per-byte saturating add of src into dst. Arguments as in the disabled C version in pixman-arm-simd.c: r0 = width, r1 = height, r2 = dst_line, r3 = dst_stride, and on entry [sp] = src_line, [sp, #4] = src_stride. */
+pixman_asm_function pixman_composite_add_8_8_asm_armv6
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}
+	mov	r10, r1  /* r10 = remaining rows */
+	sub	sp, sp, #4  /* one local word: [sp] = dst_stride */
+	subs	r10, r10, #1
+	mov	r11, r0  /* r11 = width */
+	mov	r8, r2  /* r8 = dst_line */
+	str	r3, [sp]
+	ldr	r7, [sp, #36]  /* r7 = src_line: first stack argument, now 4 + 32 bytes above sp */
+	bcc	0f  /* height == 0: return */
+6:	cmp	r11, #0  /* start of one row */
+	beq	1f  /* width == 0: only advance the row pointers */
+	orr	r3, r8, r7
+	tst	r3, #3
+	beq	2f  /* dst and src both 4-byte aligned: skip the leading byte loop */
+	mov	r1, r8
+	mov	r0, r7
+	mov	r12, r11
+	b	3f
+5:	tst	r3, #3
+	beq	4f  /* both pointers became aligned: switch to the word loop */
+3:	ldrb	r2, [r0], #1  /* leading byte loop: r0 = src, r1 = dst, r12 = bytes left */
+	subs	r12, r12, #1
+	ldrb	r3, [r1]
+	uqadd8	r3, r2, r3
+	strb	r3, [r1], #1
+	orr	r3, r1, r0
+	bne	5b
+1:	ldr	r3, [sp]  /* row done: dst_line += dst_stride, src_line += src_stride */
+	add	r8, r8, r3
+	ldr	r3, [sp, #40]
+	add	r7, r7, r3
+10:	subs	r10, r10, #1
+	bcs	6b
+0:	add	sp, sp, #4  /* epilogue */
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
+	bx	lr
+2:	mov	r12, r11  /* aligned from the start: r12 = width */
+	mov	r1, r8
+	mov	r0, r7
+4:	cmp	r12, #3
+	subgt	r6, r12, #4
+	movgt	r9, r12
+	lsrgt	r5, r6, #2
+	addgt	r3, r5, #1  /* r3 = number of whole 4-byte words in the r12 bytes left */
+	movgt	r12, #0
+	lslgt	r4, r3, #2  /* r4 = byte offset at which the word loop stops */
+	ble	7f  /* fewer than 4 bytes left: trailing byte loop only */
+8:	ldr	r3, [r0, r12]  /* word loop: four bytes per uqadd8 */
+	ldr	r2, [r1, r12]
+	uqadd8	r3, r3, r2
+	str	r3, [r1, r12]
+	add	r12, r12, #4
+	cmp	r12, r4
+	bne	8b
+	sub	r3, r9, #4
+	bic	r3, r3, #3
+	add	r3, r3, #4  /* r3 = bytes consumed by the word loop */
+	subs	r12, r6, r5, lsl #2  /* r12 = bytes still left (0..3) */
+	add	r1, r1, r3
+	add	r0, r0, r3
+	beq	1b
+7:	mov	r4, #0  /* trailing byte loop over the last r12 bytes, indexed by r4 */
+9:	ldrb	r3, [r1, r4]
+	ldrb	r2, [r0, r4]
+	uqadd8	r3, r2, r3
+	strb	r3, [r1, r4]
+	add	r4, r4, #1
+	cmp	r4, r12
+	bne	9b
+	ldr	r3, [sp]  /* same row advance as at label 1 */
+	add	r8, r8, r3
+	ldr	r3, [sp, #40]
+	add	r7, r7, r3
+	b	10b
+.endfunc
+/* over_8888_8888: dst = src + dst scaled by (255 - src alpha), using a saturating add. r0 = width, r1 = height, r2 = dst_line, r3 = dst_stride, on entry [sp] = src_line, [sp, #4] = src_stride; strides count 32-bit pixels. */
+pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}
+	sub	sp, sp, #20  /* locals: [sp] dst stride in bytes, [sp, #4] next dst row, [sp, #8] src stride in bytes, [sp, #12] height, [sp, #16] width */
+	cmp	r1, #0
+	mov	r12, r2  /* r12 = dst_line */
+	str	r1, [sp, #12]
+	str	r0, [sp, #16]
+	ldr	r2, [sp, #52]  /* r2 = src_line: first stack argument, now 20 + 32 bytes above sp */
+	beq	0f  /* height == 0: return */
+	lsl	r3, r3, #2  /* dst_stride in pixels -> bytes */
+	str	r3, [sp]
+	ldr	r3, [sp, #56]  /* src_stride */
+	mov	r10, #0  /* r10 = rows completed */
+	lsl	r3, r3, #2
+	str	r3, [sp, #8]
+	mov	r11, r3
+	b	1f
+6:	ldr	r11, [sp, #8]
+1:	ldr	r9, [sp]  /* per-row setup: r0 = dst, r1 = src, r12 = width */
+	mov	r0, r12
+	add	r12, r12, r9
+	mov	r1, r2
+	str	r12, [sp, #4]
+	add	r2, r2, r11
+	ldr	r12, [sp, #16]
+	ldr	r3, =0x00800080  /* 0x80 added to each 16-bit product (component_half in the C version) */
+	ldr	r9, =0xff00ff00
+	mov	r11, #255
+	cmp	r12, #0
+	beq	4f
+5:	ldr	r5, [r1], #4  /* r5 = source pixel */
+	ldr	r4, [r0]  /* r4 = destination pixel */
+	sub	r8, r11, r5, lsr #24  /* r8 = 255 - source alpha */
+	uxtb16	r6, r4  /* r6 = dst bytes 0 and 2 as 16-bit lanes */
+	uxtb16	r7, r4, ror #8  /* r7 = dst bytes 1 and 3 as 16-bit lanes */
+	mla	r6, r6, r8, r3
+	mla	r7, r7, r8, r3
+	uxtab16	r6, r6, r6, ror #8
+	uxtab16	r7, r7, r7, ror #8
+	and	r7, r7, r9
+	uxtab16	r6, r7, r6, ror #8  /* merge the high byte of every lane back into one pixel */
+	uqadd8	r5, r6, r5  /* saturating per-byte add of the source pixel */
+	str	r5, [r0], #4
+	subs	r12, r12, #1
+	bne	5b
+4:	ldr	r3, [sp, #12]
+	add	r10, r10, #1
+	cmp	r10, r3
+	ldr	r12, [sp, #4]  /* r12 = next dst row */
+	bne	6b
+0:	add	sp, sp, #20
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
+	bx	lr
+.endfunc
+/* over_8888_n_8888: scale src by the alpha of a constant mask, then composite it over dst. r0 = width, r1 = height, r2 = dst_line, r3 = dst_stride, on entry [sp] = src_line, [sp, #4] = src_stride, [sp, #8] = mask. */
+pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}
+	sub	sp, sp, #28  /* locals: [sp] next dst row, [sp, #4] next src row, [sp, #8] src stride bytes, [sp, #12] height, [sp, #16] width, [sp, #20] dst stride bytes, [sp, #24] mask alpha */
+	cmp	r1, #0
+	str	r1, [sp, #12]
+	ldrb	r1, [sp, #71]  /* byte 3 of the mask argument at [sp, #68]; equals mask >> 24 -- NOTE(review): assumes little-endian */
+	mov	r12, r2  /* r12 = dst_line */
+	str	r0, [sp, #16]
+	ldr	r2, [sp, #60]  /* r2 = src_line: first stack argument, now 28 + 32 bytes above sp */
+	str	r1, [sp, #24]
+	beq	0f  /* height == 0 (flags still from the cmp above): return */
+	lsl	r3, r3, #2  /* dst_stride in pixels -> bytes */
+	str	r3, [sp, #20]
+	ldr	r3, [sp, #64]  /* src_stride */
+	mov	r10, #0  /* r10 = rows completed */
+	lsl	r3, r3, #2
+	str	r3, [sp, #8]
+	mov	r11, r3
+	b	1f
+5:	ldr	r11, [sp, #8]
+1:	ldr	r4, [sp, #20]  /* per-row setup: r0 = dst, r1 = src, r12 = width */
+	mov	r0, r12
+	mov	r1, r2
+	add	r12, r12, r4
+	add	r2, r2, r11
+	str	r12, [sp]
+	str	r2, [sp, #4]
+	ldr	r12, [sp, #16]
+	ldr	r2, =0x00800080  /* 0x80 added to each 16-bit product */
+	ldr	r3, [sp, #24]  /* r3 = mask alpha */
+	mov	r11, #255
+	cmp	r12, #0
+	beq	3f
+4:	ldr	r5, [r1], #4  /* r5 = source pixel */
+	ldr	r4, [r0]  /* r4 = destination pixel */
+	uxtb16	r6, r5
+	uxtb16	r7, r5, ror #8
+	mla	r6, r6, r3, r2  /* source channels times mask alpha */
+	mla	r7, r7, r3, r2
+	uxtab16	r6, r6, r6, ror #8
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8
+	uxtb16	r7, r7, ror #8
+	orr	r5, r6, r7, lsl #8  /* r5 = source pixel scaled by the mask alpha */
+	uxtb16	r6, r4
+	uxtb16	r7, r4, ror #8
+	sub	r8, r11, r5, lsr #24  /* r8 = 255 - alpha of the scaled source */
+	mla	r6, r6, r8, r2  /* destination channels times r8 */
+	mla	r7, r7, r8, r2
+	uxtab16	r6, r6, r6, ror #8
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8
+	uxtb16	r7, r7, ror #8
+	orr	r6, r6, r7, lsl #8
+	uqadd8	r5, r6, r5  /* saturating per-byte add */
+	str	r5, [r0], #4
+	subs	r12, r12, #1
+	bne	4b
+3:	ldr	r1, [sp, #12]
+	add	r10, r10, #1
+	cmp	r10, r1
+	ldr	r12, [sp]  /* next dst row */
+	ldr	r2, [sp, #4]  /* next src row */
+	bne	5b
+0:	add	sp, sp, #28
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
+	bx	lr
+.endfunc
+/* over_n_8_8888: scale a constant src by an 8-bit mask, then composite it over dst. r0 = width, r1 = height, r2 = dst_line, r3 = dst_stride, on entry [sp] = src, [sp, #4] = unused, [sp, #8] = mask_line, [sp, #12] = mask_stride. */
+pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}
+	sub	sp, sp, #28  /* locals: [sp] next mask row, [sp, #4] src_hi, [sp, #8] next dst row, [sp, #12] height, [sp, #16] width, [sp, #20] src_lo, [sp, #24] dst stride bytes */
+	cmp	r1, #0
+	ldr	r9, [sp, #60]  /* r9 = src: first stack argument, now 28 + 32 bytes above sp */
+	str	r1, [sp, #12]
+	bic	r1, r9, #-16777216  /* clear bits 24-31 (#-16777216 is 0xff000000) */
+	str	r1, [sp, #20]
+	mov	r12, r2  /* r12 = dst_line */
+	lsr	r1, r9, #8
+	ldr	r2, [sp, #20]
+	bic	r1, r1, #-16777216
+	bic	r2, r2, #65280  /* clear bits 8-15: r2 = src & 0x00ff00ff (src_lo) */
+	bic	r1, r1, #65280  /* r1 = (src >> 8) & 0x00ff00ff (src_hi) */
+	str	r2, [sp, #20]
+	str	r0, [sp, #16]
+	str	r1, [sp, #4]
+	ldr	r2, [sp, #68]  /* r2 = mask_line */
+	beq	0f  /* height == 0 (flags still from the cmp above): return */
+	lsl	r3, r3, #2  /* dst_stride in pixels -> bytes */
+	str	r3, [sp, #24]
+	mov	r0, #0  /* r0 = rows completed */
+	b	1f
+5:	ldr	r3, [sp, #24]
+1:	ldr	r4, [sp, #72]  /* mask_stride, used unscaled because mask pixels are single bytes */
+	mov	r10, r12  /* per-row setup: r10 = dst, r1 = mask, r12 = width */
+	mov	r1, r2
+	add	r12, r12, r3
+	add	r2, r2, r4
+	str	r12, [sp, #8]
+	str	r2, [sp]
+	ldr	r12, [sp, #16]
+	ldr	r11, =0x00800080  /* 0x80 added to each 16-bit product */
+	ldr	r2, [sp, #4]  /* r2 = src_hi */
+	ldr	r3, [sp, #20]  /* r3 = src_lo */
+	cmp	r12, #0
+	beq	3f
+4:	ldrb	r5, [r1], #1  /* r5 = mask byte */
+	ldr	r4, [r10]  /* r4 = destination pixel */
+	mla	r6, r3, r5, r11  /* source channels times mask */
+	mla	r7, r2, r5, r11
+	uxtab16	r6, r6, r6, ror #8
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8
+	uxtb16	r7, r7, ror #8
+	orr	r5, r6, r7, lsl #8  /* r5 = source pixel scaled by the mask */
+	uxtb16	r6, r4
+	uxtb16	r7, r4, ror #8
+	mvn	r8, r5
+	lsr	r8, r8, #24  /* r8 = 255 - alpha of the scaled source */
+	mla	r6, r6, r8, r11  /* destination channels times r8 */
+	mla	r7, r7, r8, r11
+	uxtab16	r6, r6, r6, ror #8
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8
+	uxtb16	r7, r7, ror #8
+	orr	r6, r6, r7, lsl #8
+	uqadd8	r5, r6, r5  /* saturating per-byte add */
+	str	r5, [r10], #4
+	subs	r12, r12, #1
+	bne	4b
+3:	ldr	r4, [sp, #12]
+	add	r0, r0, #1
+	cmp	r0, r4
+	ldr	r12, [sp, #8]  /* next dst row */
+	ldr	r2, [sp]  /* next mask row */
+	bne	5b
+0:	add	sp, sp, #28
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
+	bx	lr
+.endfunc
+
+/*
+ * Note: This code is only using armv5te instructions (not even armv6),
+ *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
+ *       be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), it needs to be changed to use 16-byte
+ * aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ *  fname                     - name of the function to generate
+ *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes
+ *  t                         - type suffix for LDR/STR instructions
+ *  prefetch_distance         - prefetch in the source image by that many
+ *                              pixels ahead
+ *  prefetch_braking_distance - stop prefetching when that many pixels are
+ *                              remaining before the end of scanline
+ */
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t,      \
+                                      prefetch_distance,        \
+                                      prefetch_braking_distance
+
+pixman_asm_function fname
+	W	.req	r0  /* number of pixels still to produce */
+	DST	.req	r1
+	SRC	.req	r2
+	VX	.req	r3  /* source x position; bits 16 and up select the source pixel */
+	UNIT_X	.req	ip  /* added to VX after every pixel */
+	TMP1	.req	r4
+	TMP2	.req	r5
+	VXMASK	.req	r6
+	PF_OFFS	.req	r7
+
+	ldr	UNIT_X, [sp]  /* first stack argument; loaded before the push below moves sp */
+	push	{r4, r5, r6, r7}
+	mvn	VXMASK, #((1 << bpp_shift) - 1)  /* mask clearing the low bpp_shift bits of a byte offset */
+
+	/* helper: copy two pixels; each load uses an offset computed one step earlier (TMP1, then TMP2) */
+	.macro	scale_2_pixels
+		ldr&t	TMP1, [SRC, TMP1]
+		and	TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
+		add	VX, VX, UNIT_X
+		str&t	TMP1, [DST], #(1 << bpp_shift)
+
+		ldr&t	TMP2, [SRC, TMP2]
+		and	TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
+		add	VX, VX, UNIT_X
+		str&t	TMP2, [DST], #(1 << bpp_shift)
+	.endm
+
+	/* now do the scaling */
+	and	TMP1, VXMASK, VX, lsr #(16 - bpp_shift)  /* byte offset of the first source pixel: (VX >> 16) << bpp_shift */
+	add	VX, VX, UNIT_X
+	subs	W, W, #(8 + prefetch_braking_distance)
+	blt	2f  /* too few pixels for the prefetching loop */
+	/* calculate prefetch offset */
+	mov	PF_OFFS, #prefetch_distance
+	mla	PF_OFFS, UNIT_X, PF_OFFS, VX  /* PF_OFFS = VX + UNIT_X * prefetch_distance */
+1:	/* main loop, process 8 pixels per iteration with prefetch */
+	subs	W, W, #8
+	add	PF_OFFS, UNIT_X, lsl #3  /* move the prefetch position 8 pixels on */
+	scale_2_pixels
+	scale_2_pixels
+	scale_2_pixels
+	scale_2_pixels
+	pld	[SRC, PF_OFFS, lsr #(16 - bpp_shift)]
+	bge	1b
+2:
+	subs	W, W, #(4 - 8 - prefetch_braking_distance)  /* undo the earlier bias: W = pixels left - 4 */
+	blt	2f
+1:	/* process the remaining pixels */
+	scale_2_pixels
+	scale_2_pixels
+	subs	W, W, #4
+	bge	1b
+2:
+	tst	W, #2  /* W = pixels left - 4, so bits 0-1 of W match those of the 0..3 pixels left */
+	beq	2f
+	scale_2_pixels
+2:
+	tst	W, #1
+	ldrne&t	TMP1, [SRC, TMP1]  /* final odd pixel, if any */
+	strne&t	TMP1, [DST]
+	/* cleanup helper macro */
+	.purgem	scale_2_pixels
+	.unreq	DST
+	.unreq	SRC
+	.unreq	W
+	.unreq	VX
+	.unreq	UNIT_X
+	.unreq	TMP1
+	.unreq	TMP2
+	.unreq	VXMASK
+	.unreq	PF_OFFS
+	/* return */
+	pop	{r4, r5, r6, r7}
+	bx	lr
+.endfunc
+.endm
+/* 2-byte pixels (bpp_shift 1): ldrh/strh via t = h, prefetch 80 pixels ahead, stop prefetching 32 pixels before the end. */
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+/* 4-byte pixels (bpp_shift 2): plain ldr/str via an empty t, prefetch 48 pixels ahead, stop 32 pixels before the end. */
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
new file mode 100644
index 0000000..3d19bfa
--- /dev/null
+++ b/pixman/pixman-arm-simd.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+#include "pixman-inlines.h"
+
+#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
+/* Disabled C source of pixman_composite_add_8_8_asm_armv6: per-byte saturating add of src into dst, row by row. */
+void
+pixman_composite_add_8_8_asm_armv6 (int32_t  width,
+				    int32_t  height,
+				    uint8_t *dst_line,
+				    int32_t  dst_stride,
+				    uint8_t *src_line,
+				    int32_t  src_stride)
+{
+    uint8_t *dst, *src;
+    int32_t w;
+    uint8_t s, d;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	/* ensure both src and dst are properly aligned before doing 32 bit reads
+	 * we'll stay in this loop if src and dst have differing alignments
+	 */
+	while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
+	{
+	    s = *src;
+	    d = *dst;
+	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));  /* NOTE(review): only %0 and %1 are declared; %2 presumably resolves to the input half of the "+r" operand -- confirm */
+	    *dst = d;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+
+	while (w >= 4)  /* both pointers are 4-byte aligned here: add four bytes per instruction */
+	{
+	    asm ("uqadd8 %0, %1, %2"
+		 : "=r" (*(uint32_t*)dst)
+		 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+	}
+
+	while (w)  /* trailing 0..3 bytes */
+	{
+	    s = *src;
+	    d = *dst;
+	    asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+	    *dst = d;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+    }
+
+}
+/* Disabled C source of pixman_composite_over_8888_8888_asm_armv6: composite 32-bit src OVER 32-bit dst, one inline-asm loop per row. */
+void
+pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
+                                           int32_t   height,
+                                           uint32_t *dst_line,
+                                           int32_t   dst_stride,
+                                           uint32_t *src_line,
+                                           int32_t   src_stride)
+{
+    uint32_t    *dst;
+    uint32_t    *src;
+    int32_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t upper_component_mask = 0xff00ff00;
+    uint32_t alpha_mask = 0xff;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load src */
+	    "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unnecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+	    /* = 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+	    "ldr r4, [%[dest]] \n\t"
+
+#else
+	    "ldr r4, [%[dest]] \n\t"
+
+	    /* = 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+#endif
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* multiply by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    /* recombine the 0xff00ff00 bytes of r6 and r7 */
+	    "and r7, r7, %[upper_component_mask]\n\t"
+	    "uxtab16 r6, r7, r6, ror #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* decrement the pixel counter and loop while pixels remain */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+	    : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
+	      [alpha_mask] "r" (alpha_mask)
+	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
+	    );
+    }
+}
+/* Disabled C source of pixman_composite_over_8888_n_8888_asm_armv6: scale src by the top byte of `mask`, then composite it OVER dst. */
+void
+pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
+                                             int32_t   height,
+                                             uint32_t *dst_line,
+                                             int32_t   dst_stride,
+                                             uint32_t *src_line,
+                                             int32_t   src_stride,
+                                             uint32_t  mask)
+{
+    uint32_t *dst;
+    uint32_t *src;
+    int32_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t alpha_mask = 0xff;
+
+    mask = (mask) >> 24;  /* keep only the top byte of the mask */
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load src */
+	    "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unnecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+#endif
+	    "ldr r4, [%[dest]] \n\t"
+
+	    "uxtb16 r6, r5\n\t"
+	    "uxtb16 r7, r5, ror #8\n\t"
+
+	    /* multiply src by the mask alpha (mask_alpha) then by 257 and divide by 65536 */
+	    "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
+	    "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r5, r6, r7, lsl #8\n\t"
+
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* 255 - alpha */
+	    "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+	    /* multiply dest by 255 - alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r6, r6, r7, lsl #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* decrement the pixel counter and loop while pixels remain */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+	    : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
+	      [alpha_mask] "r" (alpha_mask)
+	    : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"  /* r9 is listed although the template never uses it */
+	    );
+    }
+}
+/* Disabled C source of pixman_composite_over_n_8_8888_asm_armv6: scale the constant `src` by each 8-bit mask value, then composite it OVER dst. */
+void
+pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
+                                          int32_t   height,
+                                          uint32_t *dst_line,
+                                          int32_t   dst_stride,
+                                          uint32_t  src,
+                                          int32_t   unused,
+                                          uint8_t  *mask_line,
+                                          int32_t   mask_stride)
+{
+    uint32_t  srca;
+    uint32_t *dst;
+    uint8_t  *mask;
+    int32_t w;
+
+    srca = src >> 24;  /* not read again anywhere in this function */
+
+    uint32_t component_mask = 0xff00ff;
+    uint32_t component_half = 0x800080;
+
+    uint32_t src_hi = (src >> 8) & component_mask;  /* bytes 1 and 3 of src, one per 16-bit lane */
+    uint32_t src_lo = src & component_mask;  /* bytes 0 and 2 of src, one per 16-bit lane */
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+/* #define inner_branch */
+	asm volatile (
+	    "cmp %[w], #0\n\t"
+	    "beq 2f\n\t"
+	    "1:\n\t"
+	    /* load mask */
+	    "ldrb r5, [%[mask]], #1\n\t"
+#ifdef inner_branch
+	    /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+	     * The 0x0 case also allows us to avoid doing an unnecessary data
+	     * write which is more valuable so we only check for that
+	     */
+	    "cmp r5, #0\n\t"
+	    "beq 3f\n\t"
+
+#endif
+	    "ldr r4, [%[dest]] \n\t"
+
+	    /* multiply src by the mask value (r5) then by 257 and divide by 65536 */
+	    "mla r6, %[src_lo], r5, %[component_half]\n\t"
+	    "mla r7, %[src_hi], r5, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r5, r6, r7, lsl #8\n\t"
+
+	    "uxtb16 r6, r4\n\t"
+	    "uxtb16 r7, r4, ror #8\n\t"
+
+	    /* we could simplify this to use 'sub' if we were
+	     * willing to give up a register for alpha_mask
+	     */
+	    "mvn r8, r5\n\t"
+	    "mov r8, r8, lsr #24\n\t"
+
+	    /* multiply dest by 255 - alpha (r8) then by 257 and divide by 65536 */
+	    "mla r6, r6, r8, %[component_half]\n\t"
+	    "mla r7, r7, r8, %[component_half]\n\t"
+
+	    "uxtab16 r6, r6, r6, ror #8\n\t"
+	    "uxtab16 r7, r7, r7, ror #8\n\t"
+
+	    "uxtb16 r6, r6, ror #8\n\t"
+	    "uxtb16 r7, r7, ror #8\n\t"
+
+	    /* recombine */
+	    "orr r6, r6, r7, lsl #8\n\t"
+
+	    "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+	    "3:\n\t"
+
+#endif
+	    "str r5, [%[dest]], #4\n\t"
+	    /* decrement the pixel counter and loop while pixels remain */
+	    "subs	%[w], %[w], #1\n\t"
+	    "bne	1b\n\t"
+	    "2:\n\t"
+	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)  /* NOTE(review): %[src] is declared read-write but never referenced in the template */
+	    : [component_half] "r" (component_half),
+	      [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
+	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
+    }
+}
+
+#endif
+/* Instantiate C entry points for the armv6 assembly routines; the PIXMAN_ARM_BIND_* macros are defined outside this file (presumably in pixman-arm-common.h). */
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+                                        uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
+                                        uint32_t, uint32_t)
+/* Fast-path table handed to _pixman_implementation_create below; entries are presumably (operator, source, mask, destination, function). */
+static const pixman_fast_path_t arm_simd_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
+
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+
+    { PIXMAN_OP_NONE },  /* presumably the end-of-table marker */
+};
+/* Create and return an implementation built from `fallback` and arm_simd_fast_paths; no other hooks are installed here. */
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+
+    return imp;
+}
diff --git a/pixman/pixman-bits-image.c b/pixman/pixman-bits-image.c
new file mode 100644
index 0000000..99c0dfe
--- /dev/null
+++ b/pixman/pixman-bits-image.c
@@ -0,0 +1,1511 @@
+/*
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+/*
+ * Generic 64-bit scanline getter: fetch at 32bpp, then expand in place to
+ * 64bpp.  Individual image types can plug in a better scanline getter if
+ * they want to. For example we could produce smoother gradients by
+ * evaluating them at higher color depth, but that's a project for the future.
+ */
+static void
+_pixman_image_get_scanline_generic_64 (pixman_image_t * image,
+                                       int              x,
+                                       int              y,
+                                       int              width,
+                                       uint32_t *       buffer,
+                                       const uint32_t * mask)
+{
+    uint32_t *mask8 = NULL;
+
+    /* Contract the mask image, if one exists, so that the 32-bit fetch
+     * function can use it.
+     */
+    if (mask)
+    {
+	mask8 = pixman_malloc_ab (width, sizeof(uint32_t));
+	if (!mask8)
+	    return;  /* NOTE(review): on allocation failure buffer is left unwritten and nothing is reported to the caller */
+
+	pixman_contract (mask8, (uint64_t *)mask, width);  /* mask is read as `width` 64-bit pixels despite its uint32_t type */
+    }
+
+    /* Fetch the source image into the first half of buffer. */
+    image->bits.get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8);
+
+    /* Expand from 32bpp to 64bpp in place. */
+    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width);  /* buffer is written as `width` 64-bit pixels, i.e. 2 * width uint32_t */
+
+    free (mask8);  /* mask8 is still NULL when no mask was given */
+}
+
+/* Fetch functions */
+/* Fetch one pixel through image->fetch_pixel_32; with check_bounds set, coordinates outside the image yield 0 instead. */
+static force_inline uint32_t
+fetch_pixel_no_alpha (bits_image_t *image,
+		      int x, int y, pixman_bool_t check_bounds)
+{
+    if (check_bounds &&
+	(x < 0 || x >= image->width || y < 0 || y >= image->height))
+    {
+	return 0;
+    }
+
+    return image->fetch_pixel_32 (image, x, y);
+}
+/* Per-pixel getter signature taken by the bits_image_fetch_pixel_* helpers below; fetch_pixel_no_alpha above matches it. */
+typedef uint32_t (* get_pixel_t) (bits_image_t *image,
+				  int x, int y, pixman_bool_t check_bounds);
+/* Nearest fetch: convert fixed-point (x, y), less pixman_fixed_e, to integer coordinates and read that pixel through get_pixel. */
+static force_inline uint32_t
+bits_image_fetch_pixel_nearest (bits_image_t   *image,
+				pixman_fixed_t  x,
+				pixman_fixed_t  y,
+				get_pixel_t	get_pixel)
+{
+    int x0 = pixman_fixed_to_int (x - pixman_fixed_e);  /* pixman_fixed_e is presumably the smallest fixed-point step -- TODO confirm */
+    int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+    {
+	repeat (image->common.repeat, &x0, image->width);  /* presumably maps the coordinate into the image -- defined elsewhere */
+	repeat (image->common.repeat, &y0, image->height);
+
+	return get_pixel (image, x0, y0, FALSE);  /* no bounds check requested after repeat () */
+    }
+    else
+    {
+	return get_pixel (image, x0, y0, TRUE);  /* no repeat: let the getter bounds-check the coordinates */
+    }
+}
+/* Bilinear fetch: read the 2x2 pixels around fixed-point (x, y) through get_pixel and blend them with bilinear_interpolation. */
+static force_inline uint32_t
+bits_image_fetch_pixel_bilinear (bits_image_t   *image,
+				 pixman_fixed_t  x,
+				 pixman_fixed_t  y,
+				 get_pixel_t	 get_pixel)
+{
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+    int x1, y1, x2, y2;
+    uint32_t tl, tr, bl, br;
+    int32_t distx, disty;
+
+    x1 = x - pixman_fixed_1 / 2;  /* step back half a pixel; x1/y1 are still fixed-point here */
+    y1 = y - pixman_fixed_1 / 2;
+
+    distx = (x1 >> 8) & 0xff;  /* bits 8-15 of x1, passed on as the horizontal weight */
+    disty = (y1 >> 8) & 0xff;  /* bits 8-15 of y1, passed on as the vertical weight */
+
+    x1 = pixman_fixed_to_int (x1);  /* from here on x1/y1 are integer pixel coordinates */
+    y1 = pixman_fixed_to_int (y1);
+    x2 = x1 + 1;
+    y2 = y1 + 1;
+
+    if (repeat_mode != PIXMAN_REPEAT_NONE)
+    {
+	repeat (repeat_mode, &x1, width);
+	repeat (repeat_mode, &y1, height);
+	repeat (repeat_mode, &x2, width);
+	repeat (repeat_mode, &y2, height);
+
+	tl = get_pixel (image, x1, y1, FALSE);  /* no bounds check requested after repeat () */
+	bl = get_pixel (image, x1, y2, FALSE);
+	tr = get_pixel (image, x2, y1, FALSE);
+	br = get_pixel (image, x2, y2, FALSE);
+    }
+    else
+    {
+	tl = get_pixel (image, x1, y1, TRUE);  /* no repeat: the getter bounds-checks each corner */
+	tr = get_pixel (image, x2, y1, TRUE);
+	bl = get_pixel (image, x1, y2, TRUE);
+	br = get_pixel (image, x2, y2, TRUE);
+    }
+
+    return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+}
+
+static void
+bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
+					  int              offset,
+					  int              line,
+					  int              width,
+					  uint32_t *       buffer,
+					  const uint32_t * mask)
+{
+    bits_image_t *bits = &ima->bits;
+    pixman_fixed_t x_top, x_bottom, x;
+    pixman_fixed_t ux_top, ux_bottom, ux;
+    pixman_vector_t v;
+    uint32_t top_mask, bottom_mask;
+    uint32_t *top_row;
+    uint32_t *bottom_row;
+    uint32_t *end;
+    uint32_t zero[2] = { 0, 0 };
+    uint32_t one = 1;
+    int y, y1, y2;
+    int disty;
+    int mask_inc;
+    int w;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (bits->common.transform, &v))
+	return;
+
+    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
+    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+
+    y = v.vector[1] - pixman_fixed_1/2;
+    disty = (y >> 8) & 0xff;
+
+    /* Load the pointers to the first and second lines from the source
+     * image that bilinear code must read.
+     *
+     * The main trick in this code is the check for whether either line
+     * is outside of the image.
+     *
+     * When a line (either one) is outside, its pointer is redirected
+     * to a dummy area of zeros. From then on the index into that row
+     * must not advance, so the row's position and increment
+     * (x_top/ux_top or x_bottom/ux_bottom) are both set to zero.
+     */
+    y1 = pixman_fixed_to_int (y);
+    y2 = y1 + 1;
+
+    if (y1 < 0 || y1 >= bits->height)
+    {
+	top_row = zero;
+	x_top = 0;
+	ux_top = 0;
+    }
+    else
+    {
+	top_row = bits->bits + y1 * bits->rowstride;
+	x_top = x;
+	ux_top = ux;
+    }
+
+    if (y2 < 0 || y2 >= bits->height)
+    {
+	bottom_row = zero;
+	x_bottom = 0;
+	ux_bottom = 0;
+    }
+    else
+    {
+	bottom_row = bits->bits + y2 * bits->rowstride;
+	x_bottom = x;
+	ux_bottom = ux;
+    }
+
+    /* Instead of checking whether the operation uses the mask in
+     * each loop iteration, verify this only once and prepare the
+     * variables to make the code smaller inside the loop.
+     */
+    if (!mask)
+    {
+        mask_inc = 0;
+        mask = &one;
+    }
+    else
+    {
+        /* If have a mask, prepare the variables to check it */
+        mask_inc = 1;
+    }
+
+    /* If both are zero, then the whole thing is zero */
+    if (top_row == zero && bottom_row == zero)
+    {
+	memset (buffer, 0, width * sizeof (uint32_t));
+	return;
+    }
+    else if (bits->format == PIXMAN_x8r8g8b8)
+    {
+	if (top_row == zero)
+	{
+	    top_mask = 0;
+	    bottom_mask = 0xff000000;
+	}
+	else if (bottom_row == zero)
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0;
+	}
+	else
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0xff000000;
+	}
+    }
+    else
+    {
+	top_mask = 0;
+	bottom_mask = 0;
+    }
+
+    end = buffer + width;
+
+    /* Zero fill to the left of the image */
+    while (buffer < end && x < pixman_fixed_minus_1)
+    {
+	*buffer++ = 0;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Left edge
+     */
+    while (buffer < end && x < 0)
+    {
+	uint32_t tr, br;
+	int32_t distx;
+
+	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
+	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	distx = (x >> 8) & 0xff;
+
+	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
+
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Main part */
+    w = pixman_int_to_fixed (bits->width - 1);
+
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, tr, bl, br;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	    distx = (x >> 8) & 0xff;
+
+	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Right Edge */
+    w = pixman_int_to_fixed (bits->width);
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, bl;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+
+	    distx = (x >> 8) & 0xff;
+
+	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Zero fill to the right of the image */
+    while (buffer < end)
+	*buffer++ = 0;
+}
+
+/* Evaluate the convolution filter of @image at fixed-point position (x, y).
+ * filter_params[0] and [1] hold the kernel width and height, followed by the
+ * coefficients row by row.  The weighted channel sums are clipped to 0..0xff
+ * and alpha is shifted as an unsigned value, because 0xff << 24 does not fit
+ * in a signed 32-bit int.
+ */
+static force_inline uint32_t
+bits_image_fetch_pixel_convolution (bits_image_t   *image,
+				    pixman_fixed_t  x,
+				    pixman_fixed_t  y,
+				    get_pixel_t     get_pixel)
+{
+    pixman_fixed_t *params = image->common.filter_params;
+    int x_off = (params[0] - pixman_fixed_1) >> 1;
+    int y_off = (params[1] - pixman_fixed_1) >> 1;
+    int32_t cwidth = pixman_fixed_to_int (params[0]);
+    int32_t cheight = pixman_fixed_to_int (params[1]);
+    int32_t srtot = 0, sgtot = 0, sbtot = 0, satot = 0;
+    int32_t i, j, x1, x2, y1, y2;
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width, height = image->height;
+
+    params += 2;	/* step past the two size entries */
+
+    x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+    y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+    x2 = x1 + cwidth;
+    y2 = y1 + cheight;
+
+    for (i = y1; i < y2; ++i)
+    {
+	for (j = x1; j < x2; ++j)
+	{
+	    int rx = j, ry = i;
+	    pixman_fixed_t f = *params;
+
+	    if (f)
+	    {
+		uint32_t pixel;
+
+		if (repeat_mode != PIXMAN_REPEAT_NONE)
+		{
+		    repeat (repeat_mode, &rx, width);
+		    repeat (repeat_mode, &ry, height);
+		    pixel = get_pixel (image, rx, ry, FALSE);
+		}
+		else
+		{
+		    pixel = get_pixel (image, rx, ry, TRUE);
+		}
+
+		srtot += RED_8 (pixel) * f;
+		sgtot += GREEN_8 (pixel) * f;
+		sbtot += BLUE_8 (pixel) * f;
+		satot += ALPHA_8 (pixel) * f;
+	    }
+
+	    params++;
+	}
+    }
+
+    satot >>= 16;
+    srtot >>= 16;
+    sgtot >>= 16;
+    sbtot >>= 16;
+
+    satot = CLIP (satot, 0, 0xff);
+    srtot = CLIP (srtot, 0, 0xff);
+    sgtot = CLIP (sgtot, 0, 0xff);
+    sbtot = CLIP (sbtot, 0, 0xff);
+
+    return (((uint32_t)satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
+}
+
+/* Fetch the pixel at (x, y) using image->common.filter; 0 if unsupported. */
+static force_inline uint32_t
+bits_image_fetch_pixel_filtered (bits_image_t *image,
+				 pixman_fixed_t x,
+				 pixman_fixed_t y,
+				 get_pixel_t    get_pixel)
+{
+    uint32_t result = 0;
+
+    switch (image->common.filter)
+    {
+    case PIXMAN_FILTER_NEAREST:
+    case PIXMAN_FILTER_FAST:
+	result = bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
+	break;
+    case PIXMAN_FILTER_BILINEAR:
+    case PIXMAN_FILTER_GOOD:
+    case PIXMAN_FILTER_BEST:
+	result = bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
+	break;
+    case PIXMAN_FILTER_CONVOLUTION:
+	result = bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
+	break;
+    default:
+	break;
+    }
+
+    return result;
+}
+
+/* Scanline fetcher for affine (or absent) transforms without an alpha map.
+ * Writes @width pixels for destination row @line, starting at column
+ * @offset, into @buffer; entries whose @mask value is zero are not written.
+ */
+static void
+bits_image_fetch_affine_no_alpha (pixman_image_t * image,
+				  int              offset,
+				  int              line,
+				  int              width,
+				  uint32_t *       buffer,
+				  const uint32_t * mask)
+{
+    pixman_fixed_t ux = pixman_fixed_1;	/* steps used without a transform */
+    pixman_fixed_t uy = 0;
+    pixman_fixed_t x, y;
+    pixman_vector_t v;
+    int i;
+
+    /* Sample at the centre of the first destination pixel. */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    /* Map the start point and read the per-pixel steps from the matrix.
+     */
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return;
+
+	ux = image->common.transform->matrix[0][0];
+	uy = image->common.transform->matrix[1][0];
+    }
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    /* Masked-out pixels still advance the source position. */
+    for (i = 0; i < width; ++i, x += ux, y += uy)
+    {
+	if (mask && !mask[i])
+	    continue;
+
+	buffer[i] = bits_image_fetch_pixel_filtered (
+	    &image->bits, x, y, fetch_pixel_no_alpha);
+    }
+}
+
+/* General pixel getter: returns pixel (x, y) of @image.  When the image has
+ * an alpha map, the pixel's alpha byte is replaced by the alpha read from the
+ * map (0 if the position lies outside the map).  With @check_bounds set,
+ * coordinates outside the image return 0.
+ */
+static force_inline uint32_t
+fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
+{
+    uint32_t pixel;
+    uint32_t alpha;
+
+    if (check_bounds)
+    {
+	if (x < 0 || y < 0 || x >= image->width || y >= image->height)
+	    return 0;
+    }
+
+    pixel = image->fetch_pixel_32 (image, x, y);
+
+    if (!image->common.alpha_map)
+	return pixel;
+
+    /* Translate into alpha-map coordinates. */
+    x -= image->common.alpha_origin_x;
+    y -= image->common.alpha_origin_y;
+
+    /* Outside the alpha map the alpha is 0. */
+    alpha = 0;
+    if (x >= 0 && x < image->common.alpha_map->width &&
+	y >= 0 && y < image->common.alpha_map->height)
+    {
+	alpha = image->common.alpha_map->fetch_pixel_32 (
+	    image->common.alpha_map, x, y);
+
+	alpha = ALPHA_8 (alpha);
+    }
+
+    /* Keep the colour channels, substitute the alpha byte. */
+    return (pixel & 0x00ffffff) | (alpha << 24);
+}
+
+/* Most general scanline fetcher; it copes with projective transforms and,
+ * through fetch_pixel_general, with alpha maps.  Each of the @width pixels of
+ * row @line (from column @offset) is sampled at its homogeneous position
+ * divided by w; entries whose @mask value is zero are not written.
+ */
+static void
+bits_image_fetch_general (pixman_image_t * image,
+			  int              offset,
+			  int              line,
+			  int              width,
+			  uint32_t *       buffer,
+			  const uint32_t * mask)
+{
+    /* Per-pixel increments; these defaults apply without a transform. */
+    pixman_fixed_t ux = pixman_fixed_1;
+    pixman_fixed_t uy = 0;
+    pixman_fixed_t uw = 0;
+    pixman_fixed_t x, y, w;
+    pixman_vector_t v;
+    int i;
+
+    /* Sample at the centre of the first destination pixel. */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    /* With a transform, map the start point and read the increments from
+     * the first column of the matrix.
+     */
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return;
+
+	ux = image->common.transform->matrix[0][0];
+	uy = image->common.transform->matrix[1][0];
+	uw = image->common.transform->matrix[2][0];
+    }
+
+    x = v.vector[0];
+    y = v.vector[1];
+    w = v.vector[2];
+
+    /* Masked-out pixels still advance the source position. */
+    for (i = 0; i < width; ++i, x += ux, y += uy, w += uw)
+    {
+	pixman_fixed_t x0 = 0, y0 = 0;
+
+	if (mask && !mask[i])
+	    continue;
+
+	/* Perspective divide; when w is zero the division is skipped and
+	 * the sample position stays at (0, 0).
+	 */
+	if (w != 0)
+	{
+	    x0 = ((pixman_fixed_48_16_t)x << 16) / w;
+	    y0 = ((pixman_fixed_48_16_t)y << 16) / w;
+	}
+
+	buffer[i] = bits_image_fetch_pixel_filtered (
+	    &image->bits, x0, y0, fetch_pixel_general);
+    }
+}
+
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };	/* dummy zero row */
+/* Returns pixel number x of the scanline at row as a 32-bit value. */
+typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
+
+static force_inline void
+bits_image_fetch_bilinear_affine (pixman_image_t * image,
+				  int              offset,
+				  int              line,
+				  int              width,
+				  uint32_t *       buffer,
+				  const uint32_t * mask,
+				  /* passed as constants by the MAKE_BILINEAR_FETCHER wrappers: */
+				  convert_pixel_t	convert_pixel,
+				  pixman_format_code_t	format,
+				  pixman_repeat_t	repeat_mode)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    bits_image_t *bits = &image->bits;
+    int i;
+
+    /* Bilinear affine scanline fetch; reference point is the pixel center */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+    /* per-pixel source steps come from the first column of the matrix */
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	int x1, y1, x2, y2;
+	uint32_t tl, tr, bl, br;
+	int32_t distx, disty;
+	int width = image->bits.width;	/* source width; shadows the parameter */
+	int height = image->bits.height;
+	const uint8_t *row1;
+	const uint8_t *row2;
+
+	if (mask && !mask[i])
+	    goto next;	/* buffer[i] is left unwritten */
+	/* move to the top-left of the four samples around (x, y) */
+	x1 = x - pixman_fixed_1 / 2;
+	y1 = y - pixman_fixed_1 / 2;
+	/* interpolation weights: bits 8..15 of the fixed-point coordinates */
+	distx = (x1 >> 8) & 0xff;
+	disty = (y1 >> 8) & 0xff;
+
+	y1 = pixman_fixed_to_int (y1);
+	y2 = y1 + 1;
+	x1 = pixman_fixed_to_int (x1);
+	x2 = x1 + 1;
+
+	if (repeat_mode != PIXMAN_REPEAT_NONE)
+	{
+	    uint32_t mask;
+	    /* shadows the parameter: alpha bits OR-ed into every sample */
+	    mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+	    repeat (repeat_mode, &x1, width);
+	    repeat (repeat_mode, &y1, height);
+	    repeat (repeat_mode, &x2, width);
+	    repeat (repeat_mode, &y2, height);
+
+	    row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+	    row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+
+	    tl = convert_pixel (row1, x1) | mask;
+	    tr = convert_pixel (row1, x2) | mask;
+	    bl = convert_pixel (row2, x1) | mask;
+	    br = convert_pixel (row2, x2) | mask;
+	}
+	else
+	{
+	    uint32_t mask1, mask2;
+	    int bpp;
+
+	    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
+	     * which means if you use it in expressions, those
+	     * expressions become unsigned themselves. Since
+	     * the variables below can be negative in some cases,
+	     * that will lead to crashes on 64 bit architectures.
+	     *
+	     * So this line makes sure bpp is signed
+	     */
+	    bpp = PIXMAN_FORMAT_BPP (format);
+	    /* no sample lies inside the image: the result is 0 */
+	    if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
+	    {
+		buffer[i] = 0;
+		goto next;
+	    }
+	    /* y2 == 0 means y1 == -1: the upper row is above the image */
+	    if (y2 == 0)
+	    {
+		row1 = zero;
+		mask1 = 0;
+	    }
+	    else
+	    {
+		row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+		row1 += bpp / 8 * x1;
+
+		mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+	    }
+	    /* y1 == height - 1: the lower row is below the image */
+	    if (y1 == height - 1)
+	    {
+		row2 = zero;
+		mask2 = 0;
+	    }
+	    else
+	    {
+		row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+		row2 += bpp / 8 * x1;
+
+		mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+	    }
+	    /* x2 == 0 means x1 == -1: the left column is outside the image */
+	    if (x2 == 0)
+	    {
+		tl = 0;
+		bl = 0;
+	    }
+	    else
+	    {
+		tl = convert_pixel (row1, 0) | mask1;
+		bl = convert_pixel (row2, 0) | mask2;
+	    }
+	    /* x1 == width - 1: the right column is outside the image */
+	    if (x1 == width - 1)
+	    {
+		tr = 0;
+		br = 0;
+	    }
+	    else
+	    {
+		tr = convert_pixel (row1, 1) | mask1;
+		br = convert_pixel (row2, 1) | mask2;
+	    }
+	}
+
+	buffer[i] = bilinear_interpolation (
+	    tl, tr, bl, br, distx, disty);
+
+    next:
+	x += ux;
+	y += uy;
+    }
+}
+
+/* Nearest-neighbour fetch of @width pixels along an affine-transformed
+ * scanline.  @convert_pixel, @format and @repeat_mode are passed as constants
+ * by the wrappers MAKE_NEAREST_FETCHER generates.  Entries whose @mask value
+ * is zero are not written; nothing is written if the transform fails.
+ */
+static force_inline void
+bits_image_fetch_nearest_affine (pixman_image_t * image,
+				 int              offset,
+				 int              line,
+				 int              width,
+				 uint32_t *       buffer,
+				 const uint32_t * mask,
+
+				 convert_pixel_t	convert_pixel,
+				 pixman_format_code_t	format,
+				 pixman_repeat_t	repeat_mode)
+{
+    bits_image_t *bits = &image->bits;
+    /* Alpha forced into every pixel when PIXMAN_FORMAT_A (format) is zero. */
+    const uint32_t alpha_fill = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+    pixman_fixed_t x, y, ux, uy;
+    pixman_vector_t v;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    /* Masked-out pixels still advance the source position. */
+    for (i = 0; i < width; ++i, x += ux, y += uy)
+    {
+	const int src_width = bits->width;
+	const int src_height = bits->height;
+	const uint8_t *row;
+	int x0, y0;
+
+	if (mask && !mask[i])
+	    continue;
+
+	x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+	y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+	if (repeat_mode != PIXMAN_REPEAT_NONE)
+	{
+	    /* Let repeat () remap the coordinates for this repeat mode. */
+	    repeat (repeat_mode, &x0, src_width);
+	    repeat (repeat_mode, &y0, src_height);
+	}
+	else if (y0 < 0 || y0 >= src_height || x0 < 0 || x0 >= src_width)
+	{
+	    /* Without repeat, positions outside the image read as 0. */
+	    buffer[i] = 0;
+	    continue;
+	}
+
+	/* rowstride is multiplied by 4 to obtain a byte offset */
+	row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
+	buffer[i] = convert_pixel (row, x0) | alpha_fill;
+    }
+}
+
+static force_inline uint32_t
+convert_a8r8g8b8 (const uint8_t *row, int x)
+{
+    return ((const uint32_t *)row)[x];	/* 32-bit pixel, returned unchanged */
+}
+
+static force_inline uint32_t
+convert_x8r8g8b8 (const uint8_t *row, int x)
+{
+    return ((const uint32_t *)row)[x];	/* top byte is returned as stored */
+}
+
+static force_inline uint32_t
+convert_a8 (const uint8_t *row, int x)
+{
+    return (uint32_t)row[x] << 24;	/* unsigned shift: no signed overflow */
+}
+
+static force_inline uint32_t
+convert_r5g6b5 (const uint8_t *row, int x)
+{
+    return CONVERT_0565_TO_0888 (((const uint16_t *)row)[x]);	/* 16-bit pixels */
+}
+
+#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
+    static void								\
+    bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image,	\
+					       int              offset,	\
+					       int              line,	\
+					       int              width,	\
+					       uint32_t *       buffer,	\
+					       const uint32_t * mask)	\
+    {									\
+	bits_image_fetch_bilinear_affine (image, offset, line,		\
+					  width, buffer, mask,		\
+					  convert_ ## format,		\
+					  PIXMAN_ ## format,		\
+					  repeat_mode);			\
+    }
+/* As above, but wrapping bits_image_fetch_nearest_affine */
+#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)			\
+    static void								\
+    bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image,	\
+					      int              offset,	\
+					      int              line,	\
+					      int              width,	\
+					      uint32_t *       buffer,	\
+					      const uint32_t * mask)	\
+    {									\
+	bits_image_fetch_nearest_affine (image, offset, line,		\
+					 width, buffer, mask,		\
+					 convert_ ## format,		\
+					 PIXMAN_ ## format,		\
+					 repeat_mode);			\
+    }
+/* Defines both the nearest and the bilinear wrapper for one name */
+#define MAKE_FETCHERS(name, format, repeat_mode)			\
+    MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
+    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
+/* One pair of fetchers per supported format and repeat mode */
+MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8,	 a8,       PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8,	 a8,       PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL)
+
+/* Read the 32-bit pixel at (x, y) of @bits once and store it in the first
+ * @width entries of @buffer.
+ */
+static void
+replicate_pixel_32 (bits_image_t *   bits,
+		    int              x,
+		    int              y,
+		    int              width,
+		    uint32_t *       buffer)
+{
+    const uint32_t color = bits->fetch_pixel_32 (bits, x, y);
+    int i;
+
+    for (i = 0; i < width; ++i)
+	buffer[i] = color;
+}
+
+/* 64-bit counterpart of replicate_pixel_32: @b is treated as an array of
+ * uint64_t and its first @width entries receive the pixel at (x, y).
+ */
+static void
+replicate_pixel_64 (bits_image_t *   bits,
+		    int              x,
+		    int              y,
+		    int              width,
+		    uint32_t *       b)
+{
+    uint64_t *wide = (uint64_t *)b;
+    const uint64_t color = bits->fetch_pixel_64 (bits, x, y);
+    int i;
+
+    for (i = 0; i < width; ++i)
+	wide[i] = color;
+}
+
+/* 32-bit scanline getter for solid images: every output pixel is the
+ * image's pixel (0, 0); the x, y and mask arguments are not used.
+ */
+static void
+bits_image_fetch_solid_32 (pixman_image_t *image, int x, int y, int width,
+                           uint32_t *buffer, const uint32_t *mask)
+{
+    bits_image_t *solid = &image->bits;
+    replicate_pixel_32 (solid, 0, 0, width, buffer);
+}
+
+/* 64-bit scanline getter for solid images: every output pixel is the
+ * image's pixel (0, 0); the x, y and unused arguments are not used.
+ */
+static void
+bits_image_fetch_solid_64 (pixman_image_t *image, int x, int y, int width,
+                           uint32_t *b, const uint32_t *unused)
+{
+    bits_image_t *solid = &image->bits;
+    replicate_pixel_64 (solid, 0, 0, width, b);
+}
+
+/* Fetch @width pixels of row @y, from column @x, of an untransformed image
+ * without repeat; positions outside the image are written as zero.  @wide
+ * selects 64-bit pixels, which take two uint32_t slots of @buffer each. */
+static void
+bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
+                                            pixman_bool_t wide,
+                                            int           x,
+                                            int           y,
+                                            int           width,
+                                            uint32_t *    buffer)
+{
+    const int slots = wide ? 2 : 1;	/* uint32_t slots per pixel */
+    uint32_t w;
+
+    if (y < 0 || y >= image->height)
+    {
+	memset (buffer, 0, width * (slots * 4));
+	return;
+    }
+
+    if (x < 0)	/* zero-fill the run left of the image */
+    {
+	w = MIN (width, -x);
+	memset (buffer, 0, w * (slots * 4));
+	width -= w;
+	buffer += w * slots;
+	x += w;
+    }
+
+    if (x < image->width)	/* run that overlaps the image */
+    {
+	w = MIN (width, image->width - x);
+	if (wide)
+	    image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	else
+	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	width -= w;
+	buffer += w * slots;
+	x += w;
+    }
+
+    memset (buffer, 0, width * (slots * 4));	/* rest lies right of the image */
+}
+
+/* Fetch @width pixels of row @y, from column @x, of an untransformed image
+ * whose coordinates wrap around in both directions.  @wide selects 64-bit
+ * pixels, which take two uint32_t slots of @buffer each.
+ */
+static void
+bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
+                                              pixman_bool_t wide,
+                                              int           x,
+                                              int           y,
+                                              int           width,
+                                              uint32_t *    buffer)
+{
+    const int slots = wide ? 2 : 1;	/* uint32_t slots per pixel */
+    uint32_t w;
+
+    while (y < 0)	/* bring y into [0, height) */
+	y += image->height;
+    while (y >= image->height)
+	y -= image->height;
+
+    if (image->width == 1)	/* the whole row is one repeated pixel */
+    {
+	if (wide)
+	    replicate_pixel_64 (image, 0, y, width, buffer);
+	else
+	    replicate_pixel_32 (image, 0, y, width, buffer);
+	return;
+    }
+
+    for (; width; width -= w, x += w, buffer += w * slots)
+    {
+	while (x < 0)	/* bring x into [0, image->width) */
+	    x += image->width;
+	while (x >= image->width)
+	    x -= image->width;
+
+	/* Fetch up to the right edge; the next pass wraps around. */
+	w = MIN (width, image->width - x);
+
+	if (wide)
+	    image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	else
+	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+    }
+}
+
+/* 32-bit scanline getter for untransformed images: dispatches on the repeat
+ * mode; @mask is not used. */
+static void
+bits_image_fetch_untransformed_32 (pixman_image_t * image,
+                                   int              x,
+                                   int              y,
+                                   int              width,
+                                   uint32_t *       buffer,
+                                   const uint32_t * mask)
+{
+    bits_image_t *bits = &image->bits;
+
+    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+	bits_image_fetch_untransformed_repeat_normal (
+	    bits, FALSE, x, y, width, buffer);
+    else
+	bits_image_fetch_untransformed_repeat_none (
+	    bits, FALSE, x, y, width, buffer);
+}
+
+/* 64-bit scanline getter for untransformed images: dispatches on the repeat
+ * mode; @unused is not used. */
+static void
+bits_image_fetch_untransformed_64 (pixman_image_t * image,
+                                   int              x,
+                                   int              y,
+                                   int              width,
+                                   uint32_t *       buffer,
+                                   const uint32_t * unused)
+{
+    bits_image_t *bits = &image->bits;
+
+    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+	bits_image_fetch_untransformed_repeat_normal (
+	    bits, TRUE, x, y, width, buffer);
+    else
+	bits_image_fetch_untransformed_repeat_none (
+	    bits, TRUE, x, y, width, buffer);
+}
+
+typedef struct
+{
+    pixman_format_code_t	format;		/* format of the entry, or PIXMAN_any */
+    uint32_t			flags;		/* flag bits the image must all have */
+    fetch_scanline_t		fetch_32;	/* installed as get_scanline_32 */
+    fetch_scanline_t		fetch_64;	/* installed as get_scanline_64 */
+} fetcher_info_t;
+
+static const fetcher_info_t fetcher_info[] =
+{	/* scanned in order by bits_image_property_changed; first match wins */
+    { PIXMAN_solid,
+      FAST_PATH_NO_ALPHA_MAP,
+      bits_image_fetch_solid_32,
+      bits_image_fetch_solid_64
+    },
+    /* Identity transform, no pad/reflect repeat, no convolution filter */
+    { PIXMAN_any,
+      (FAST_PATH_NO_ALPHA_MAP			|
+       FAST_PATH_ID_TRANSFORM			|
+       FAST_PATH_NO_CONVOLUTION_FILTER		|
+       FAST_PATH_NO_PAD_REPEAT			|
+       FAST_PATH_NO_REFLECT_REPEAT),
+      bits_image_fetch_untransformed_32,
+      bits_image_fetch_untransformed_64
+    },
+    /* Bilinear, no repeat, affine with positive x unit and zero y unit */
+#define FAST_BILINEAR_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_X_UNIT_POSITIVE		|				\
+     FAST_PATH_Y_UNIT_ZERO		|				\
+     FAST_PATH_NONE_REPEAT		|				\
+     FAST_PATH_BILINEAR_FILTER)
+
+    { PIXMAN_a8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      bits_image_fetch_bilinear_no_repeat_8888,
+      _pixman_image_get_scanline_generic_64
+    },
+
+    { PIXMAN_x8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      bits_image_fetch_bilinear_no_repeat_8888,
+      _pixman_image_get_scanline_generic_64
+    },
+
+#define GENERAL_BILINEAR_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_BILINEAR_FILTER)
+
+#define GENERAL_NEAREST_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_NEAREST_FILTER)
+
+#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
+    { PIXMAN_ ## format,						\
+      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
+      bits_image_fetch_bilinear_affine_ ## name,			\
+      _pixman_image_get_scanline_generic_64				\
+    },
+
+#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\
+    { PIXMAN_ ## format,						\
+      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
+      bits_image_fetch_nearest_affine_ ## name,			\
+      _pixman_image_get_scanline_generic_64				\
+    },
+
+#define AFFINE_FAST_PATHS(name, format, repeat)				\
+    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
+    NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+    /* Entries for the fetchers generated by MAKE_FETCHERS */
+    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_a8, a8, PAD)
+    AFFINE_FAST_PATHS (none_a8, a8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
+    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
+    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
+    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
+    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
+
+    /* Affine, no alpha */
+    { PIXMAN_any,
+      (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
+      bits_image_fetch_affine_no_alpha,
+      _pixman_image_get_scanline_generic_64
+    },
+
+    /* General */
+    { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
+    /* End marker: bits_image_property_changed stops at PIXMAN_null */
+    { PIXMAN_null },
+};
+
+/* Pick the scanline getters again after an image property changed: install
+ * the first fetcher_info entry that matches the image's format and flags.
+ */
+static void
+bits_image_property_changed (pixman_image_t *image)
+{
+    pixman_format_code_t format = image->common.extended_format_code;
+    uint32_t flags = image->common.flags;
+    const fetcher_info_t *info;
+
+    _pixman_bits_image_setup_accessors (&image->bits);
+
+    for (info = fetcher_info; info->format != PIXMAN_null; info++)
+    {
+	if ((info->format != format && info->format != PIXMAN_any) ||
+	    (info->flags & flags) != info->flags)
+	    continue;
+
+	image->bits.get_scanline_32 = info->fetch_32;
+	image->bits.get_scanline_64 = info->fetch_64;
+	break;
+    }
+}
+
+static uint32_t *
+src_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *src = iter->image;	/* fetch row iter->y, then advance y */
+    src->bits.get_scanline_32 (
+	src, iter->x, iter->y++, iter->width, iter->buffer, mask);
+    return iter->buffer;
+}
+
+static uint32_t *
+src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *src = iter->image;	/* fetch row iter->y, then advance y */
+    src->bits.get_scanline_64 (
+	src, iter->x, iter->y++, iter->width, iter->buffer, mask);
+    return iter->buffer;
+}
+
+/* ITER_NARROW selects the 32-bit source getter, otherwise the 64-bit one. */
+void
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    iter->get_scanline = (iter->flags & ITER_NARROW)
+	? src_get_scanline_narrow
+	: src_get_scanline_wide;
+}
+
+/* Destination getter, 32-bit pixels: fetches row iter->y and, when there is
+ * an alpha map, then fetches the matching alpha-map row into the same buffer.
+ */
+static uint32_t *
+dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *dest = iter->image;
+
+    dest->bits.fetch_scanline_32 (
+	dest, iter->x, iter->y, iter->width, iter->buffer, mask);
+
+    if (dest->common.alpha_map)
+    {
+	dest->common.alpha_map->fetch_scanline_32 (
+	    (pixman_image_t *)dest->common.alpha_map,
+	    iter->x - dest->common.alpha_origin_x,
+	    iter->y - dest->common.alpha_origin_y,
+	    iter->width, iter->buffer, mask);
+    }
+
+    return iter->buffer;
+}
+
+/* Destination getter, 64-bit pixels: fetches row iter->y and, when there is
+ * an alpha map, then fetches the matching alpha-map row into the same buffer.
+ */
+static uint32_t *
+dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    bits_image_t *dest = &iter->image->bits;
+
+    dest->fetch_scanline_64 (
+	(pixman_image_t *)dest,
+	iter->x, iter->y, iter->width, iter->buffer, mask);
+    if (dest->common.alpha_map)
+    {
+	dest->common.alpha_map->fetch_scanline_64 (
+	    (pixman_image_t *)dest->common.alpha_map,
+	    iter->x - dest->common.alpha_origin_x,
+	    iter->y - dest->common.alpha_origin_y,
+	    iter->width, iter->buffer, mask);
+    }
+
+    return iter->buffer;
+}
+
+/* Store the iterator's 32-bit buffer as row iter->y of the destination and,
+ * when there is an alpha map, of the alpha map too; then advance iter->y.
+ */
+static void
+dest_write_back_narrow (pixman_iter_t *iter)
+{
+    bits_image_t *dest = &iter->image->bits;
+    const uint32_t *pixels = iter->buffer;
+
+    dest->store_scanline_32 (dest, iter->x, iter->y, iter->width, pixels);
+
+    if (dest->common.alpha_map)
+    {
+	dest->common.alpha_map->store_scanline_32 (
+	    dest->common.alpha_map,
+	    iter->x - dest->common.alpha_origin_x,
+	    iter->y - dest->common.alpha_origin_y,
+	    iter->width, pixels);
+    }
+
+    iter->y++;
+}
+
+/* Store the iterator's 64-bit buffer as row iter->y of the destination and,
+ * when there is an alpha map, of the alpha map too; then advance iter->y.
+ */
+static void
+dest_write_back_wide (pixman_iter_t *iter)
+{
+    bits_image_t *dest = &iter->image->bits;
+    const uint32_t *pixels = iter->buffer;
+
+    dest->store_scanline_64 (dest, iter->x, iter->y, iter->width, pixels);
+
+    if (dest->common.alpha_map)
+    {
+	dest->common.alpha_map->store_scanline_64 (
+	    dest->common.alpha_map,
+	    iter->x - dest->common.alpha_origin_x,
+	    iter->y - dest->common.alpha_origin_y,
+	    iter->width, pixels);
+    }
+
+    iter->y++;
+}
+
+/* Install the destination iterator callbacks.  Without ITER_NARROW the
+ * 64-bit getter and write-back are used; with it the 32-bit ones, except
+ * that the getter may be replaced by the noop getter (see below).
+ */
+void
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (!(iter->flags & ITER_NARROW))
+    {
+	iter->get_scanline = dest_get_scanline_wide;
+	iter->write_back = dest_write_back_wide;
+	return;
+    }
+
+    /* With both RGB and alpha ignored the noop getter is installed. */
+    if ((iter->flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
+	(ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
+	iter->get_scanline = _pixman_iter_get_scanline_noop;
+    else
+	iter->get_scanline = dest_get_scanline_narrow;
+
+    iter->write_back = dest_write_back_narrow;
+}
+
+/* Allocate zero-filled storage for a width x height image of @format, with
+ * rows padded to whole uint32_t units.  The byte stride is stored through
+ * @rowstride_bytes if non-NULL.  Returns NULL on overflow or failed calloc.
+ */
+static uint32_t *
+create_bits (pixman_format_code_t format,
+             int                  width,
+             int                  height,
+             int *		  rowstride_bytes)
+{
+    size_t buf_size;
+    int stride, bpp;
+
+    /* A long-winded way, avoiding any possibility of integer overflow, of
+     * saying: stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
+     */
+    bpp = PIXMAN_FORMAT_BPP (format);
+    if (_pixman_multiply_overflows_int (width, bpp))
+	return NULL;
+
+    stride = width * bpp;
+    if (_pixman_addition_overflows_int (stride, 0x1f))
+	return NULL;
+
+    stride += 0x1f;
+    stride >>= 5;
+    stride *= sizeof (uint32_t);
+
+    if (_pixman_multiply_overflows_size (height, stride))
+	return NULL;
+
+    /* Multiply as size_t; an int * int product could overflow int here. */
+    buf_size = (size_t)height * stride;
+    if (rowstride_bytes)
+	*rowstride_bytes = stride;
+
+    return calloc (buf_size, 1);
+}
+
+/* Initialise @image as a BITS image of the given format and size.  If @bits
+ * is NULL and both dimensions are non-zero, storage is allocated with
+ * create_bits () and recorded in free_me.  Returns FALSE if that fails. */
+pixman_bool_t
+_pixman_bits_image_init (pixman_image_t *     image,
+                         pixman_format_code_t format,
+                         int                  width,
+                         int                  height,
+                         uint32_t *           bits,
+                         int                  rowstride)
+{
+    uint32_t *free_me = NULL;
+
+    if (bits == NULL && width != 0 && height != 0)
+    {
+	int rowstride_bytes;
+
+	bits = create_bits (format, width, height, &rowstride_bytes);
+	if (bits == NULL)
+	    return FALSE;
+	free_me = bits;
+	rowstride = rowstride_bytes / (int) sizeof (uint32_t);
+    }
+
+    _pixman_image_init (image);
+
+    image->type = BITS;
+    image->bits.format = format;
+    image->bits.width = width;
+    image->bits.height = height;
+    image->bits.rowstride = rowstride;
+    image->bits.bits = bits;
+    image->bits.free_me = free_me;
+    image->bits.indexed = NULL;
+    image->bits.read_func = NULL;
+    image->bits.write_func = NULL;
+    image->common.property_changed = bits_image_property_changed;
+
+    _pixman_image_reset_clip_region (image);
+    return TRUE;
+}
+
+/* Create a BITS image of the given format and size.  @bits may be NULL;
+ * otherwise @rowstride_bytes must be a multiple of sizeof (uint32_t).
+ * Returns NULL if an argument check, the allocation or the initialisation
+ * fails.
+ */
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_bits (pixman_format_code_t format,
+                          int                  width,
+                          int                  height,
+                          uint32_t *           bits,
+                          int                  rowstride_bytes)
+{
+    pixman_image_t *image;
+
+    /* must be a whole number of uint32_t's */
+    return_val_if_fail (
+	bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
+    return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
+
+    image = _pixman_image_allocate ();
+    if (image == NULL)
+	return NULL;
+
+    if (_pixman_bits_image_init (image, format, width, height, bits,
+				 rowstride_bytes / (int) sizeof (uint32_t)))
+	return image;
+
+    free (image);
+    return NULL;
+}
diff --git a/pixman/pixman-combine.c.template b/pixman/pixman-combine.c.template
new file mode 100644
index 0000000..c17bcea
--- /dev/null
+++ b/pixman/pixman-combine.c.template
@@ -0,0 +1,2461 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+
+#include "pixman-private.h"
+
+#include "pixman-combine.h"
+
+/*** per channel helper functions ***/
+
+/* Component-alpha helper working in place on one pixel: *src is run through
+ * UNcx4_MUL_UNcx4 with the mask, *mask through UNcx4_MUL_UNc with src alpha. */
+static void
+combine_mask_ca (comp4_t *src, comp4_t *mask)
+{
+    comp4_t m = *mask;
+    comp4_t s;
+    comp2_t s_alpha;
+
+    if (m == 0)
+    {
+	*src = 0;
+	return;
+    }
+
+    s = *src;
+    if (m == ~0)
+    {
+	s >>= A_SHIFT;
+	s |= s << G_SHIFT;
+	s |= s << R_SHIFT;
+	*mask = s;
+	return;
+    }
+
+    s_alpha = s >> A_SHIFT;
+    UNcx4_MUL_UNcx4 (s, m);
+    *src = s;
+    UNcx4_MUL_UNc (m, s_alpha);
+    *mask = m;
+}
+
+/* Run *src through UNcx4_MUL_UNcx4 with *mask; *mask itself is only read. */
+static void
+combine_mask_value_ca (comp4_t *src, const comp4_t *mask)
+{
+    comp4_t m = *mask;
+
+    if (m == 0)
+    {
+	*src = 0;
+    }
+    else if (m != ~0)
+    {
+	/* an all-ones mask leaves src as it is, so only scale otherwise */
+	comp4_t s = *src;
+
+	UNcx4_MUL_UNcx4 (s, m);
+	*src = s;
+    }
+}
+
+/* Fold the source alpha into *mask; *src is only read. */
+static void
+combine_mask_alpha_ca (const comp4_t *src, comp4_t *mask)
+{
+    comp4_t m = *mask;
+    comp4_t s_alpha;
+
+    if (m == 0)
+	return;
+    s_alpha = *src >> A_SHIFT;
+    if (s_alpha == MASK)
+	return;			/* source alpha is MASK: mask stays as it is */
+
+    if (m != ~0)
+    {
+	UNcx4_MUL_UNc (m, s_alpha);
+	*mask = m;
+	return;
+    }
+    /* all-ones mask: store the alpha replicated by the G_SHIFT/R_SHIFT shifts */
+    s_alpha |= s_alpha << G_SHIFT;
+    s_alpha |= s_alpha << R_SHIFT;
+    *mask = s_alpha;
+}
+
+/*
+ * There are two ways of handling alpha -- either as a single unified value or
+ * a separate value for each component, hence each macro must have two
+ * versions.  The unified alpha version has a 'U' at the end of the name,
+ * the component version has a 'C'.  Similarly, functions which deal with
+ * this difference will have two versions using the same convention.
+ */
+
+/*
+ * All of the composing functions
+ */
+
+/* Fetch src[i], scaled by the alpha of mask[i] when a mask is supplied. */
+static force_inline comp4_t
+combine_mask (const comp4_t *src, const comp4_t *mask, int i)
+{
+    comp4_t pixel;
+    comp4_t cover;
+
+    /* without a mask the source pixel passes through unchanged */
+    if (mask == NULL)
+	return src[i];
+
+    cover = mask[i] >> A_SHIFT;
+    if (cover == 0)
+	return 0;
+
+    pixel = src[i];
+    UNcx4_MUL_UNc (pixel, cover);
+
+    return pixel;
+}
+
+/* CLEAR operator: zero `width` destination pixels; src and mask are unused. */
+static void
+combine_clear (pixman_implementation_t *imp, pixman_op_t op,
+               comp4_t *dest, const comp4_t *src, const comp4_t *mask,
+               int width)
+{
+    size_t n_bytes = width * sizeof (*dest);
+
+    memset (dest, 0, n_bytes);
+}
+
+/* DST operator: the destination is kept as is, so nothing needs doing. */
+static void
+combine_dst (pixman_implementation_t *imp,
+	     pixman_op_t op,
+	     comp4_t *dest,
+	     const comp4_t *src,
+	     const comp4_t *mask, int width)
+{
+    /* intentionally empty */
+}
+
+/* SRC operator, unified alpha: dest receives the (masked) source pixels. */
+static void
+combine_src_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int x;
+
+    /* no mask: the source row can be copied over wholesale */
+    if (mask == NULL)
+    {
+	memcpy (dest, src, width * sizeof (comp4_t));
+	return;
+    }
+
+    /* masked: combine_mask () produces each output pixel directly */
+    for (x = 0; x < width; ++x)
+	dest[x] = combine_mask (src, mask, x);
+}
+
+/* OVER, unified alpha.  If the Src is opaque, call combine_src_u */
+static void
+combine_over_u (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t s = combine_mask (src, mask, x);
+	comp4_t inv_sa = ALPHA_c (~s);
+	comp4_t out = dest[x];
+	/* out = out * inv_sa + s, going by the macro's name */
+	UNcx4_MUL_UNc_ADD_UNcx4 (out, inv_sa, s);
+	dest[x] = out;
+    }
+}
+
+/* OVER_REVERSE, unified alpha.  If the Dst is opaque, this is a noop */
+static void
+combine_over_reverse_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t out = combine_mask (src, mask, x);
+	comp4_t under = dest[x];
+	comp4_t inv_da = ALPHA_c (~under);
+	UNcx4_MUL_UNc_ADD_UNcx4 (out, inv_da, under);
+	dest[x] = out;
+    }
+}
+
+/* IN, unified alpha.  If the Dst is opaque, call combine_src_u */
+static void
+combine_in_u (pixman_implementation_t *imp,
+              pixman_op_t              op,
+              comp4_t *                dest,
+              const comp4_t *          src,
+              const comp4_t *          mask,
+              int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t da = ALPHA_c (dest[x]);
+	comp4_t out = combine_mask (src, mask, x);
+	UNcx4_MUL_UNc (out, da);
+	dest[x] = out;
+    }
+}
+
+/* IN_REVERSE, unified alpha.  If the Src is opaque, this is a noop */
+static void
+combine_in_reverse_u (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      comp4_t *                dest,
+                      const comp4_t *          src,
+                      const comp4_t *          mask,
+                      int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t s = combine_mask (src, mask, x);
+	comp4_t sa = ALPHA_c (s);
+	comp4_t out = dest[x];
+	UNcx4_MUL_UNc (out, sa);
+	dest[x] = out;
+    }
+}
+
+/* OUT, unified alpha.  If the Dst is opaque, call combine_clear */
+static void
+combine_out_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t inv_da = ALPHA_c (~dest[x]);
+	comp4_t out = combine_mask (src, mask, x);
+	UNcx4_MUL_UNc (out, inv_da);
+	dest[x] = out;
+    }
+}
+
+/* OUT_REVERSE, unified alpha.  If the Src is opaque, call combine_clear */
+static void
+combine_out_reverse_u (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       comp4_t *                dest,
+                       const comp4_t *          src,
+                       const comp4_t *          mask,
+                       int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t s = combine_mask (src, mask, x);
+	comp4_t inv_sa = ALPHA_c (~s);
+	comp4_t out = dest[x];
+	UNcx4_MUL_UNc (out, inv_sa);
+	dest[x] = out;
+    }
+}
+
+/* ATOP, unified alpha.  If the Src is opaque, call combine_in_u;
+ * if the Dst is opaque, call combine_over_u;
+ * if both the Src and Dst are opaque, call combine_src_u */
+static void
+combine_atop_u (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t out = combine_mask (src, mask, x);
+	comp4_t under = dest[x];
+	comp4_t inv_sa = ALPHA_c (~out);
+	comp4_t da = ALPHA_c (under);
+
+	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (out, da, under, inv_sa);
+	dest[x] = out;
+    }
+}
+
+/* ATOP_REVERSE, unified alpha.  If the Src is opaque, call
+ * combine_over_reverse_u; if the Dst is opaque, call combine_in_reverse_u;
+ * if both the Src and Dst are opaque, call combine_dst_u */
+static void
+combine_atop_reverse_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t out = combine_mask (src, mask, x);
+	comp4_t under = dest[x];
+	comp4_t inv_da = ALPHA_c (~under);
+	comp4_t sa = ALPHA_c (out);
+
+	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (out, inv_da, under, sa);
+	dest[x] = out;
+    }
+}
+
+/* XOR, unified alpha.  If the Src is opaque, call combine_over_u;
+ * if the Dst is opaque, call combine_over_reverse_u;
+ * if both the Src and Dst are opaque, call combine_clear */
+static void
+combine_xor_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t out = combine_mask (src, mask, x);
+	comp4_t under = dest[x];
+	comp4_t inv_da = ALPHA_c (~under);
+	comp4_t inv_sa = ALPHA_c (~out);
+
+	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (out, inv_da, under, inv_sa);
+	dest[x] = out;
+    }
+}
+
+/* ADD, unified alpha: the masked source goes through UNcx4_ADD_UNcx4 into dest */
+static void
+combine_add_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int x;
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t addend = combine_mask (src, mask, x);
+	comp4_t sum = dest[x];
+	UNcx4_ADD_UNcx4 (sum, addend);
+	dest[x] = sum;
+    }
+}
+
+/* SATURATE, unified alpha: works like combine_add_u, except that the source
+ * is first scaled down whenever its alpha exceeds the inverse destination
+ * alpha.  (The earlier notes here pointed every opaque case at combine_add_u.) */
+static void
+combine_saturate_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    comp4_t *                dest,
+                    const comp4_t *          src,
+                    const comp4_t *          mask,
+                    int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t s = combine_mask (src, mask, x);
+	comp4_t d = dest[x];
+	comp2_t sa = s >> A_SHIFT;
+	comp2_t room = ~d >> A_SHIFT;	/* inverse destination alpha */
+
+	if (sa > room)
+	{
+	    /* scaling factor for the source is DIV_UNc (room, sa) */
+	    sa = DIV_UNc (room, sa);
+	    UNcx4_MUL_UNc (s, sa);
+	}
+
+	UNcx4_ADD_UNcx4 (d, s);
+	dest[x] = d;
+    }
+}
+
+/*
+ * PDF blend modes:
+ * The following blend modes have been taken from the PDF ISO 32000
+ * specification, which at this point in time is available from
+ * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
+ * The relevant chapters are 11.3.5 and 11.3.6.
+ * The formula for computing the final pixel color given in 11.3.6 is:
+ * ar * Cr = (1 - as) * ab * Cb + (1 - ab) * as * Cs + ab * as * B(Cb, Cs)
+ * with B() being the blend function.
+ * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs
+ *
+ * These blend modes should match the SVG filter draft specification, as
+ * it has been designed to mirror ISO 32000. Note that at the current point
+ * no released draft exists that shows this, as the formulas have not been
+ * updated yet after the release of ISO 32000.
+ *
+ * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and
+ * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an
+ * argument. Note that this implementation operates on premultiplied colors,
+ * while the PDF specification does not. Therefore the code uses the formula
+ * Cra = (1 - as) . Dca + (1 - ad) . Sca + B(Dca, ad, Sca, as)
+ */
+
+/*
+ * Multiply
+ * B(Dca, ad, Sca, as) = Dca.Sca
+ */
+
+static void
+combine_multiply_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    comp4_t *                dest,
+                    const comp4_t *          src,
+                    const comp4_t *          mask,
+                    int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t s = combine_mask (src, mask, x);
+	comp4_t d = dest[x];
+	comp4_t inv_sa = ALPHA_c (~s);
+	comp4_t inv_da = ALPHA_c (~d);
+	comp4_t outside = s;
+	/* outside = s * inv_da + d * inv_sa: the terms next to B() in Cra */
+	UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (outside, inv_da, d, inv_sa);
+	/* blend term: d run through UNcx4_MUL_UNcx4 with s, then both summed */
+	UNcx4_MUL_UNcx4 (d, s);
+	UNcx4_ADD_UNcx4 (d, outside);
+	dest[x] = d;
+    }
+}
+
+static void
+combine_multiply_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     comp4_t *                dest,
+                     const comp4_t *          src,
+                     const comp4_t *          mask,
+                     int                      width)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t m = mask[x];
+	comp4_t s = src[x];
+	comp4_t d = dest[x];
+	comp4_t inv_da = ALPHA_c (~d);
+	comp4_t out = d;
+
+	/* s is scaled by the mask; m is passed as const and stays as read */
+	combine_mask_value_ca (&s, &m);
+
+	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (out, ~m, s, inv_da);
+	UNcx4_MUL_UNcx4 (d, s);
+	UNcx4_ADD_UNcx4 (out, d);
+	dest[x] = out;
+    }
+}
+
+#define PDF_SEPARABLE_BLEND_MODE(name)	/* emits combine_<name>_u and _ca */ \
+    static void								\
+    combine_ ## name ## _u (pixman_implementation_t *imp,		\
+			    pixman_op_t              op,		\
+                            comp4_t *                dest,		\
+			    const comp4_t *          src,		\
+			    const comp4_t *          mask,		\
+			    int                      width)		\
+    {									\
+	int i;								\
+	for (i = 0; i < width; ++i) {					\
+	    comp4_t s = combine_mask (src, mask, i);			\
+	    comp4_t d = *(dest + i);					\
+	    comp1_t sa = ALPHA_c (s);					\
+	    comp1_t isa = ~sa;						\
+	    comp1_t da = ALPHA_c (d);					\
+	    comp1_t ida = ~da;						\
+	    comp4_t result;						\
+	    /* result = d * isa + s * ida: the two terms next to B() */	\
+	    result = d;							\
+	    UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida);	\
+	    /* then add sa * da as alpha and blend_<name> () per channel */ \
+	    *(dest + i) = result +					\
+		(DIV_ONE_UNc (sa * da) << A_SHIFT) +			\
+		(blend_ ## name (RED_c (d), da, RED_c (s), sa) << R_SHIFT) + \
+		(blend_ ## name (GREEN_c (d), da, GREEN_c (s), sa) << G_SHIFT) + \
+		(blend_ ## name (BLUE_c (d), da, BLUE_c (s), sa));	\
+	}								\
+    }									\
+    /* component-alpha variant: each channel of m stands in for sa */	\
+    static void								\
+    combine_ ## name ## _ca (pixman_implementation_t *imp,		\
+			     pixman_op_t              op,		\
+                             comp4_t *                dest,		\
+			     const comp4_t *          src,		\
+			     const comp4_t *          mask,		\
+			     int                     width)		\
+    {									\
+	int i;								\
+	for (i = 0; i < width; ++i) {					\
+	    comp4_t m = *(mask + i);					\
+	    comp4_t s = *(src + i);					\
+	    comp4_t d = *(dest + i);					\
+	    comp1_t da = ALPHA_c (d);					\
+	    comp1_t ida = ~da;						\
+	    comp4_t result;						\
+	    /* s is scaled by the mask first; m is read unconditionally */ \
+	    combine_mask_value_ca (&s, &m);				\
+            								\
+	    result = d;							\
+	    UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (result, ~m, s, ida);     \
+	    /* blend term: the matching channel of m is passed as "sa" */ \
+	    result +=							\
+	        (DIV_ONE_UNc (ALPHA_c (m) * da) << A_SHIFT) +		\
+	        (blend_ ## name (RED_c (d), da, RED_c (s), RED_c (m)) << R_SHIFT) + \
+	        (blend_ ## name (GREEN_c (d), da, GREEN_c (s), GREEN_c (m)) << G_SHIFT) + \
+	        (blend_ ## name (BLUE_c (d), da, BLUE_c (s), BLUE_c (m))); \
+	    								\
+	    *(dest + i) = result;					\
+	}								\
+    }
+
+/* Screen:  B(Dca, ad, Sca, as) = Dca.sa + Sca.da - Dca.Sca */
+static inline comp4_t
+blend_screen (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    /* sum the two cross products first, then take the overlap out once */
+    comp4_t cross = dca * sa + sca * da;
+    comp4_t overlap = dca * sca;
+    return DIV_ONE_UNc (cross - overlap);
+}
+
+PDF_SEPARABLE_BLEND_MODE (screen)
+
+/*
+ * Overlay
+ * B(Dca, Da, Sca, Sa) =
+ *   if 2.Dca < Da
+ *     2.Sca.Dca
+ *   otherwise
+ *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
+ */
+static inline comp4_t
+blend_overlay (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    /* destination below half of its alpha: plain doubled product */
+    if (2 * dca < da)
+	return DIV_ONE_UNc (2 * sca * dca);
+
+    /* otherwise: full coverage minus twice the product of what each
+     * colour lacks relative to its own alpha */
+    return DIV_ONE_UNc (sa * da - 2 * (da - dca) * (sa - sca));
+}
+
+PDF_SEPARABLE_BLEND_MODE (overlay)
+
+/*
+ * Darken
+ * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa)
+ */
+static inline comp4_t
+blend_darken (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t src_term = sca * da;
+    comp4_t dst_term = dca * sa;
+
+    /* keep whichever cross product is smaller */
+    return DIV_ONE_UNc (dst_term < src_term ? dst_term : src_term);
+}
+
+PDF_SEPARABLE_BLEND_MODE (darken)
+
+/*
+ * Lighten
+ * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa)
+ */
+static inline comp4_t
+blend_lighten (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t src_term = sca * da;
+    comp4_t dst_term = dca * sa;
+
+    /* keep whichever cross product is larger */
+    return DIV_ONE_UNc (dst_term < src_term ? src_term : dst_term);
+}
+
+PDF_SEPARABLE_BLEND_MODE (lighten)
+
+/*
+ * Color dodge
+ * B(Dca, Da, Sca, Sa) =
+ *   if Sca < Sa
+ *     Sa. min (Da, Dca.Sa / (Sa - Sca))
+ *   otherwise if Dca == 0
+ *     0
+ *   otherwise
+ *     Sa.Da
+ */
+static inline comp4_t
+blend_color_dodge (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t rca;
+
+    if (sca < sa)
+    {
+	rca = dca * sa / (sa - sca);
+	return DIV_ONE_UNc (sa * MIN (rca, da));
+    }
+
+    return dca == 0 ? 0 : DIV_ONE_UNc (sa * da);
+}
+
+PDF_SEPARABLE_BLEND_MODE (color_dodge)
+
+/*
+ * Color burn
+ * B(Dca, Da, Sca, Sa) =
+ *   if Sca == 0
+ *     0 when Dca < Da, Sa.Da otherwise
+ *   otherwise, with R = (Da - Dca).Sa / Sca
+ *     Sa.(max (R, Da) - R)
+ *   i.e. Sa.(Da - R) when R < Da, and 0 once R reaches Da
+ *   (R uses integer division)
+ */
+static inline comp4_t
+blend_color_burn (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t rca;
+
+    if (sca != 0)
+    {
+	rca = (da - dca) * sa / sca;
+	return DIV_ONE_UNc (sa * (MAX (rca, da) - rca));
+    }
+
+    return dca < da ? 0 : DIV_ONE_UNc (sa * da);
+}
+
+PDF_SEPARABLE_BLEND_MODE (color_burn)
+
+/*
+ * Hard light
+ * B(Dca, Da, Sca, Sa) =
+ *   if 2.Sca < Sa
+ *     2.Sca.Dca
+ *   otherwise
+ *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
+ */
+static inline comp4_t
+blend_hard_light (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t rca = 2 * sca < sa ?
+	2 * sca * dca :
+	sa * da - 2 * (da - dca) * (sa - sca);
+    return DIV_ONE_UNc (rca);
+}
+
+PDF_SEPARABLE_BLEND_MODE (hard_light)
+
+/*
+ * Soft light (computed in doubles on values scaled by 1/MASK)
+ * B(Dca, Da, Sca, Sa) =
+ *   if (2.Sca < Sa)
+ *     Dca.(Sa - (1 - Dca/Da).(Sa - 2.Sca))     [Dca.Sa when Da == 0]
+ *   otherwise if Da == 0
+ *     0
+ *   otherwise if Dca.4 <= Da
+ *     Dca.(Sa + (2.Sca - Sa).((16.Dca/Da - 12).Dca/Da + 3))
+ *   otherwise
+ *     Dca.Sa + (SQRT (Dca.Da) - Dca).(2.Sca - Sa)
+ */
+static inline comp4_t
+blend_soft_light (comp4_t dca_org,
+		  comp4_t da_org,
+		  comp4_t sca_org,
+		  comp4_t sa_org)
+{
+    double dca = dca_org * (1.0 / MASK);
+    double da = da_org * (1.0 / MASK);
+    double sca = sca_org * (1.0 / MASK);
+    double sa = sa_org * (1.0 / MASK);
+    double rca;
+
+    if (da == 0)
+    {
+	/* zero destination alpha: handled apart so nothing divides by da */
+	rca = (2 * sca < sa) ? dca * sa : 0;
+    }
+    else if (2 * sca < sa)
+    {
+	rca = dca * sa - dca * (da - dca) * (sa - 2 * sca) / da;
+    }
+    else if (4 * dca <= da)
+    {
+	rca = dca * sa +
+	    (2 * sca - sa) * dca * ((16 * dca / da - 12) * dca / da + 3);
+    }
+    else
+    {
+	rca = dca * sa + (sqrt (dca * da) - dca) * (2 * sca - sa);
+    }
+    return rca * MASK + 0.5;
+}
+
+PDF_SEPARABLE_BLEND_MODE (soft_light)
+
+/*
+ * Difference
+ * B(Dca, Da, Sca, Sa) = abs (Dca.Sa - Sca.Da)
+ */
+static inline comp4_t
+blend_difference (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t dst_term = dca * sa;
+    comp4_t src_term = sca * da;
+    comp4_t gap;
+
+    /* abs () by always subtracting the smaller product from the larger */
+    gap = dst_term > src_term ? dst_term - src_term : src_term - dst_term;
+    return DIV_ONE_UNc (gap);
+}
+
+PDF_SEPARABLE_BLEND_MODE (difference)
+
+/*
+ * Exclusion
+ * B(Dca, Da, Sca, Sa) = (Sca.Da + Dca.Sa - 2.Sca.Dca)
+ *
+ * This can be made faster by writing it directly and not using
+ * PDF_SEPARABLE_BLEND_MODE, but that's a performance optimization
+ */
+static inline comp4_t
+blend_exclusion (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t cross = dca * sa + sca * da;
+    return DIV_ONE_UNc (cross - 2 * dca * sca);
+}
+
+PDF_SEPARABLE_BLEND_MODE (exclusion)
+
+#undef PDF_SEPARABLE_BLEND_MODE
+
+/*
+ * PDF non-separable blend modes are implemented using the following functions
+ * to operate in Hsl space, with Cmax, Cmid, Cmin referring to the max, mid
+ * and min value of the red, green and blue components.
+ *
+ * LUM (C) = 0.3 * Cred + 0.59 * Cgreen + 0.11 * Cblue
+ *
+ * clip_color (C):
+ *   l = LUM (C)
+ *   min = Cmin
+ *   max = Cmax
+ *   if min < 0.0
+ *     C = l + ( ( ( C - l ) * l ) / ( l - min ) )
+ *   if max > 1.0
+ *     C = l + ( ( ( C - l ) * ( 1 - l ) ) / ( max - l ) )
+ *   return C
+ *
+ * set_lum (C, l):
+ *   d = l - LUM (C)
+ *   C += d
+ *   return clip_color (C)
+ *
+ * SAT (C) = CH_MAX (C) - CH_MIN (C)
+ *
+ * set_sat (C, s):
+ *  if Cmax > Cmin
+ *    Cmid = ( ( ( Cmid - Cmin ) * s ) / ( Cmax - Cmin ) )
+ *    Cmax = s
+ *  else
+ *    Cmid = Cmax = 0.0
+ *  Cmin = 0.0
+ *  return C
+ */
+
+/* For premultiplied colors, we need to know what happens when C is
+ * multiplied by a real number. LUM and SAT are linear:
+ *
+ *    LUM (r * C) = r * LUM (C)		SAT (r * C) = r * SAT (C)
+ *
+ * If we extend clip_color with an extra argument a and change
+ *
+ *        if x >= 1.0
+ *
+ * into
+ *
+ *        if x >= a
+ *
+ * then clip_color is also linear:
+ *
+ *    r * clip_color (C, a) = clip_color (r * C, r * a);
+ *
+ * for positive r.
+ *
+ * Similarly, we can extend set_lum with an extra argument that is just passed
+ * on to clip_color:
+ *
+ *   r * set_lum ( C, l, a)
+ *
+ *   = r * clip_color ( C + l - LUM (C), a)
+ *
+ *   = clip_color ( r * C + r * l - r * LUM (C), r * a)
+ *
+ *   = set_lum ( r * C, r * l, r * a)
+ *
+ * Finally, set_sat:
+ *
+ *    r * set_sat (C, s) = set_sat (x * C, r * s)
+ *
+ * The above holds for all non-zero x, because the x'es in the fraction for
+ * C_mid cancel out. Specifically, it holds for x = r:
+ *
+ *    r * set_sat (C, s) = set_sat (r * C, r * s)
+ *
+ */
+
+/* So, for the non-separable PDF blend modes, we have (using s, d for
+ * non-premultiplied colors, and S, D for premultiplied):
+ *
+ *   Color:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1)
+ *   = set_lum (S * a_d, a_s * LUM (D), a_s * a_d)
+ *
+ *
+ *   Luminosity:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (D/a_d, LUM(S/a_s), 1)
+ *   = set_lum (a_s * D, a_d * LUM(S), a_s * a_d)
+ *
+ *
+ *   Saturation:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1)
+ *   = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)),
+ *                                        a_s * LUM (D), a_s * a_d)
+ *   = set_lum (set_sat (a_s * D, a_d * SAT (S)), a_s * LUM (D), a_s * a_d)
+ *
+ *   Hue:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1)
+ *   = set_lum (set_sat (a_d * S, a_s * SAT (D)), a_s * LUM (D), a_s * a_d)
+ *
+ */
+/* Channel helpers over a 3-element array c (c[0], c[1], c[2] = red, green, blue) */
+#define CH_MIN(c) ((c)[0] < (c)[1] ? ((c)[0] < (c)[2] ? (c)[0] : (c)[2]) : ((c)[1] < (c)[2] ? (c)[1] : (c)[2]))
+#define CH_MAX(c) ((c)[0] > (c)[1] ? ((c)[0] > (c)[2] ? (c)[0] : (c)[2]) : ((c)[1] > (c)[2] ? (c)[1] : (c)[2]))
+#define LUM(c) (((c)[0] * 30 + (c)[1] * 59 + (c)[2] * 11) / 100)
+#define SAT(c) (CH_MAX (c) - CH_MIN (c))
+
+#define PDF_NON_SEPARABLE_BLEND_MODE(name)	/* emits combine_<name>_u */	\
+    static void								\
+    combine_ ## name ## _u (pixman_implementation_t *imp,		\
+			    pixman_op_t op,				\
+                            comp4_t *dest,				\
+			    const comp4_t *src,				\
+			    const comp4_t *mask,			\
+			    int width)					\
+    {									\
+	int i;								\
+	for (i = 0; i < width; ++i)					\
+	{								\
+	    comp4_t s = combine_mask (src, mask, i);			\
+	    comp4_t d = *(dest + i);					\
+	    comp1_t sa = ALPHA_c (s);					\
+	    comp1_t isa = ~sa;						\
+	    comp1_t da = ALPHA_c (d);					\
+	    comp1_t ida = ~da;						\
+	    comp4_t result;						\
+	    comp4_t sc[3], dc[3], c[3];					\
+	    /* result = d * isa + s * ida: the two terms next to B() */	\
+	    result = d;							\
+	    UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida);	\
+	    dc[0] = RED_c (d);						\
+	    sc[0] = RED_c (s);						\
+	    dc[1] = GREEN_c (d);					\
+	    sc[1] = GREEN_c (s);					\
+	    dc[2] = BLUE_c (d);						\
+	    sc[2] = BLUE_c (s);						\
+	    blend_ ## name (c, dc, da, sc, sa);				\
+	    /* c[] is still unscaled, hence DIV_ONE_UNc on each channel */ \
+	    *(dest + i) = result +					\
+		(DIV_ONE_UNc (sa * da) << A_SHIFT) +			\
+		(DIV_ONE_UNc (c[0]) << R_SHIFT) +			\
+		(DIV_ONE_UNc (c[1]) << G_SHIFT) +			\
+		(DIV_ONE_UNc (c[2]));					\
+	}								\
+    }
+
+static void
+set_lum (comp4_t dest[3], comp4_t src[3], comp4_t sa, comp4_t lum)
+{
+    double a, l, min, max;
+    double tmp[3];
+    /* dest = src moved to luminosity lum, then clipped to [0, sa]; in doubles */
+    a = sa * (1.0 / MASK);
+
+    l = lum * (1.0 / MASK);
+    tmp[0] = src[0] * (1.0 / MASK);
+    tmp[1] = src[1] * (1.0 / MASK);
+    tmp[2] = src[2] * (1.0 / MASK);
+    /* the LUM weights sum to 1, so adding (l - LUM) to all makes LUM == l */
+    l = l - LUM (tmp);
+    tmp[0] += l;
+    tmp[1] += l;
+    tmp[2] += l;
+
+    /* clip_color: rescale around the luminosity when a channel leaves [0, a] */
+    l = LUM (tmp);
+    min = CH_MIN (tmp);
+    max = CH_MAX (tmp);
+
+    if (min < 0)
+    {
+	if (l - min == 0.0)	/* would divide by zero below */
+	{
+	    tmp[0] = 0;
+	    tmp[1] = 0;
+	    tmp[2] = 0;
+	}
+	else
+	{
+	    tmp[0] = l + (tmp[0] - l) * l / (l - min);
+	    tmp[1] = l + (tmp[1] - l) * l / (l - min);
+	    tmp[2] = l + (tmp[2] - l) * l / (l - min);
+	}
+    }
+    if (max > a)	/* NOTE(review): max and l predate the clip above */
+    {
+	if (max - l == 0.0)	/* would divide by zero below */
+	{
+	    tmp[0] = a;
+	    tmp[1] = a;
+	    tmp[2] = a;
+	}
+	else
+	{
+	    tmp[0] = l + (tmp[0] - l) * (a - l) / (max - l);
+	    tmp[1] = l + (tmp[1] - l) * (a - l) / (max - l);
+	    tmp[2] = l + (tmp[2] - l) * (a - l) / (max - l);
+	}
+    }
+    /* scale back by MASK; the + 0.5 rounds to nearest on conversion */
+    dest[0] = tmp[0] * MASK + 0.5;
+    dest[1] = tmp[1] * MASK + 0.5;
+    dest[2] = tmp[2] * MASK + 0.5;
+}
+
+static void
+set_sat (comp4_t dest[3], comp4_t src[3], comp4_t sat)
+{
+    int id[3];
+    comp4_t min, max;
+    /* rank the channels of src: id[0] = largest, id[1] = middle, id[2] = smallest */
+    if (src[0] > src[1])
+    {
+	if (src[0] > src[2])
+	{
+	    id[0] = 0;
+	    if (src[1] > src[2])
+	    {
+		id[1] = 1;
+		id[2] = 2;
+	    }
+	    else
+	    {
+		id[1] = 2;
+		id[2] = 1;
+	    }
+	}
+	else
+	{
+	    id[0] = 2;
+	    id[1] = 0;
+	    id[2] = 1;
+	}
+    }
+    else
+    {
+	if (src[0] > src[2])
+	{
+	    id[0] = 1;
+	    id[1] = 0;
+	    id[2] = 2;
+	}
+	else
+	{
+	    id[2] = 0;
+	    if (src[1] > src[2])
+	    {
+		id[0] = 1;
+		id[1] = 2;
+	    }
+	    else
+	    {
+		id[0] = 2;
+		id[1] = 1;
+	    }
+	}
+    }
+    /* NOTE(review): ranking is from src, values from dest; callers pass c, c */
+    max = dest[id[0]];
+    min = dest[id[2]];
+    if (max > min)
+    {
+	/* stretch so that min becomes 0 and max becomes sat */
+	dest[id[1]] = (dest[id[1]] - min) * sat / (max - min);
+	dest[id[0]] = sat;
+	dest[id[2]] = 0;
+    }
+    else
+    {
+	dest[0] = dest[1] = dest[2] = 0;	/* no spread to scale: all zero */
+    }
+}
+
+/* Hue:  B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb)) */
+static inline void
+blend_hsl_hue (comp4_t c[3],
+               comp4_t dc[3],
+               comp4_t da,
+               comp4_t sc[3],
+               comp4_t sa)
+{
+    int ch;
+
+    /* start from the source colour scaled by the destination alpha */
+    for (ch = 0; ch < 3; ++ch)
+	c[ch] = sc[ch] * da;
+    /* destination saturation, then destination luminosity, each times sa */
+    set_sat (c, c, SAT (dc) * sa);
+    set_lum (c, c, sa * da, LUM (dc) * sa);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_hue)
+
+/* Saturation:  B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb)) */
+static inline void
+blend_hsl_saturation (comp4_t c[3],
+                      comp4_t dc[3],
+                      comp4_t da,
+                      comp4_t sc[3],
+                      comp4_t sa)
+{
+    int ch;
+
+    /* start from the destination colour scaled by the source alpha */
+    for (ch = 0; ch < 3; ++ch)
+	c[ch] = dc[ch] * sa;
+    /* source saturation (times da), then destination luminosity (times sa) */
+    set_sat (c, c, SAT (sc) * da);
+    set_lum (c, c, sa * da, LUM (dc) * sa);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_saturation)
+
+/* Color:  B(Cb, Cs) = set_lum (Cs, LUM (Cb)) */
+static inline void
+blend_hsl_color (comp4_t c[3],
+                 comp4_t dc[3],
+                 comp4_t da,
+                 comp4_t sc[3],
+                 comp4_t sa)
+{
+    int ch;
+
+    /* start from the source colour scaled by the destination alpha */
+    for (ch = 0; ch < 3; ++ch)
+	c[ch] = sc[ch] * da;
+    /* then impose the destination luminosity, scaled by sa */
+    set_lum (c, c, sa * da, LUM (dc) * sa);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_color)
+
+/* Luminosity:  B(Cb, Cs) = set_lum (Cb, LUM (Cs)) */
+static inline void
+blend_hsl_luminosity (comp4_t c[3],
+                      comp4_t dc[3],
+                      comp4_t da,
+                      comp4_t sc[3],
+                      comp4_t sa)
+{
+    int ch;
+
+    /* start from the destination colour scaled by the source alpha */
+    for (ch = 0; ch < 3; ++ch)
+	c[ch] = dc[ch] * sa;
+    /* then impose the source luminosity, scaled by da */
+    set_lum (c, c, sa * da, LUM (sc) * da);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity)
+
+#undef SAT
+#undef LUM
+#undef CH_MAX
+#undef CH_MIN
+#undef PDF_NON_SEPARABLE_BLEND_MODE
+
+/* All of the disjoint/conjoint composing functions
+ *
+ * The four entries in the first column indicate what source contributions
+ * come from each of the four areas of the picture -- areas covered by neither
+ * A nor B, areas covered only by A, areas covered only by B and finally
+ * areas covered by both A and B.
+ * 
+ * Disjoint			Conjoint
+ * Fa		Fb		Fa		Fb
+ * (0,0,0,0)	0		0		0		0
+ * (0,A,0,A)	1		0		1		0
+ * (0,0,B,B)	0		1		0		1
+ * (0,A,B,A)	1		min((1-a)/b,1)	1		max(1-a/b,0)
+ * (0,A,B,B)	min((1-b)/a,1)	1		max(1-b/a,0)	1
+ * (0,0,0,A)	max(1-(1-b)/a,0) 0		min(1,b/a)	0
+ * (0,0,0,B)	0		max(1-(1-a)/b,0) 0		min(a/b,1)
+ * (0,A,0,0)	min(1,(1-b)/a)	0		max(1-b/a,0)	0
+ * (0,0,B,0)	0		min(1,(1-a)/b)	0		max(1-a/b,0)
+ * (0,0,B,A)	max(1-(1-b)/a,0) min(1,(1-a)/b)	 min(1,b/a)	max(1-a/b,0)
+ * (0,A,0,B)	min(1,(1-b)/a)	max(1-(1-a)/b,0) max(1-b/a,0)	min(1,a/b)
+ * (0,A,B,0)	min(1,(1-b)/a)	min(1,(1-a)/b)	max(1-b/a,0)	max(1-a/b,0)
+ *
+ * See  http://marc.info/?l=xfree-render&m=99792000027857&w=2  for more
+ * information about these operators.
+ */
+
+#define COMBINE_A_OUT 1	/* flag bits: selects the "out" part for Fa */
+#define COMBINE_A_IN  2	/* selects the "in" part for Fa */
+#define COMBINE_B_OUT 4	/* selects the "out" part for Fb */
+#define COMBINE_B_IN  8	/* selects the "in" part for Fb */
+/* operator selectors, each an OR of the flag bits above */
+#define COMBINE_CLEAR   0
+#define COMBINE_A       (COMBINE_A_OUT | COMBINE_A_IN)
+#define COMBINE_B       (COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_ATOP  (COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_ATOP  (COMBINE_A_OUT | COMBINE_B_IN)
+#define COMBINE_XOR     (COMBINE_A_OUT | COMBINE_B_OUT)
+
+/* portion covered by a but not b: min (1, (1-b) / a) */
+static comp1_t
+combine_disjoint_out_part (comp1_t a, comp1_t b)
+{
+    comp1_t not_b = ~b;		/* 1 - b */
+
+    /* (1-b) < a: the quotient is a genuine fraction below one */
+    if (not_b < a)
+	return DIV_UNc (not_b, a);
+    return MASK;		/* quotient would be >= 1, so clamp to 1 */
+}
+
+/* portion covered by both a and b:
+ * max (1-(1-b)/a, 0)  =  1 - min (1, (1-b)/a) */
+static comp1_t
+combine_disjoint_in_part (comp1_t a, comp1_t b)
+{
+    comp1_t not_b = ~b;		/* 1 - b */
+
+    /* (1-b)/a >= 1 leaves nothing once it is subtracted from 1 */
+    if (not_b >= a)
+	return 0;
+
+    return ~DIV_UNc(not_b, a);	/* 1 - (1-b)/a */
+}
+
+/* portion covered by a but not b:
+ * max (1-b/a, 0)  =  1 - min (b/a, 1) */
+static comp1_t
+combine_conjoint_out_part (comp1_t a, comp1_t b)
+{
+    comp1_t part = 0x00;	/* b >= a -> b/a >= 1 -> nothing left */
+
+    /* only a ratio below one leaves a remainder */
+    if (b < a)
+	part = ~DIV_UNc(b, a);	/* 1 - b/a */
+
+    return part;
+}
+
+/* portion covered by both a and b: min (1, b/a) */
+static comp1_t
+combine_conjoint_in_part (comp1_t a, comp1_t b)
+{
+    /* b >= a -> b/a >= 1, which clamps to 1; otherwise the plain ratio */
+    if (b < a)
+	return DIV_UNc (b, a);
+
+    return MASK;
+}
+
+#define GET_COMP(v, i)   ((comp2_t) (comp1_t) ((v) >> (i)))	/* channel at bit i */
+/* ADD: sum one channel of x and y; an overflow bit above G_SHIFT sets all bits */
+#define ADD(x, y, i, t)							\
+    ((t) = GET_COMP (x, i) + GET_COMP (y, i),				\
+     (comp4_t) ((comp1_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i))
+/* GENERIC: same clamp, applied to channel (y * ay + x * ax); t, u, v are temps */
+#define GENERIC(x, y, i, ax, ay, t, u, v)				\
+    ((t) = (MUL_UNc (GET_COMP (y, i), ay, (u)) +			\
+            MUL_UNc (GET_COMP (x, i), ax, (v))),			\
+     (comp4_t) ((comp1_t) ((t) |					\
+                           (0 - ((t) >> G_SHIFT)))) << (i))
+
+/* Shared worker for the disjoint operators: `combine` (COMBINE_* bits)
+ * selects how the source factor Fa and the destination factor Fb are built. */
+static void
+combine_disjoint_general_u (comp4_t *      dest,
+                            const comp4_t *src,
+                            const comp4_t *mask,
+                            int            width,
+                            comp1_t        combine)
+{
+    int x;
+
+    for (x = 0; x < width; ++x)
+    {
+	comp4_t s = combine_mask (src, mask, x);
+	comp4_t d = dest[x];
+	comp4_t blue, green, red, alpha;
+	comp2_t Fa, Fb, t, u, v;
+	comp1_t sa = s >> A_SHIFT;
+	comp1_t da = d >> A_SHIFT;
+
+	/* Fa: factor applied to the source */
+	switch (combine & COMBINE_A)
+	{
+	case COMBINE_A_OUT:
+	    Fa = combine_disjoint_out_part (sa, da);
+	    break;
+	case COMBINE_A_IN:
+	    Fa = combine_disjoint_in_part (sa, da);
+	    break;
+	case COMBINE_A:
+	    Fa = MASK;
+	    break;
+	default:		/* neither A bit set */
+	    Fa = 0;
+	    break;
+	}
+
+	/* Fb: factor applied to the destination */
+	switch (combine & COMBINE_B)
+	{
+	case COMBINE_B_OUT:
+	    Fb = combine_disjoint_out_part (da, sa);
+	    break;
+	case COMBINE_B_IN:
+	    Fb = combine_disjoint_in_part (da, sa);
+	    break;
+	case COMBINE_B:
+	    Fb = MASK;
+	    break;
+	default:		/* neither B bit set */
+	    Fb = 0;
+	    break;
+	}
+
+	/* GENERIC builds d * Fb + s * Fa for the channel at each shift */
+	blue = GENERIC (s, d, 0, Fa, Fb, t, u, v);
+	green = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
+	red = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
+	alpha = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
+	s = blue | green | red | alpha;
+	dest[x] = s;
+    }
+}
+
+static void
+combine_disjoint_over_u (pixman_implementation_t *imp,    /* unused */
+                         pixman_op_t              op,     /* unused */
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    int pos;
+    /* Disjoint OVER (unified alpha); pixels whose masked source is 0 keep dest as is. */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t pixel = combine_mask (src, mask, pos);
+	comp2_t factor = pixel >> A_SHIFT;    /* starts out as the source alpha */
+
+	if (pixel != 0x00)
+	{
+	    comp4_t under = dest[pos];
+	    factor = combine_disjoint_out_part (under >> A_SHIFT, factor);
+	    UNcx4_MUL_UNc_ADD_UNcx4 (under, factor, pixel);    /* presumably under * factor + pixel */
+
+	    dest[pos] = under;
+	}
+    }
+}
+
+static void
+combine_disjoint_in_u (pixman_implementation_t *imp,    /* unused */
+                       pixman_op_t              op,     /* unused */
+                       comp4_t *                dest,
+                       const comp4_t *          src,
+                       const comp4_t *          mask,
+                       int                      width)
+{   /* Disjoint IN (unified alpha): general disjoint combiner with COMBINE_A_IN. */
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_disjoint_in_reverse_u (pixman_implementation_t *imp,    /* unused */
+                               pixman_op_t              op,     /* unused */
+                               comp4_t *                dest,
+                               const comp4_t *          src,
+                               const comp4_t *          mask,
+                               int                      width)
+{   /* Disjoint IN_REVERSE (unified alpha): general disjoint combiner with COMBINE_B_IN. */
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_disjoint_out_u (pixman_implementation_t *imp,    /* unused */
+                        pixman_op_t              op,     /* unused */
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{   /* Disjoint OUT (unified alpha): general disjoint combiner with COMBINE_A_OUT. */
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_disjoint_out_reverse_u (pixman_implementation_t *imp,    /* unused */
+                                pixman_op_t              op,     /* unused */
+                                comp4_t *                dest,
+                                const comp4_t *          src,
+                                const comp4_t *          mask,
+                                int                      width)
+{   /* Disjoint OUT_REVERSE (unified alpha): general disjoint combiner with COMBINE_B_OUT. */
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_disjoint_atop_u (pixman_implementation_t *imp,    /* unused */
+                         pixman_op_t              op,     /* unused */
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{   /* Disjoint ATOP (unified alpha): general disjoint combiner with COMBINE_A_ATOP. */
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,    /* unused */
+                                 pixman_op_t              op,     /* unused */
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{   /* Disjoint ATOP_REVERSE (unified alpha): general disjoint combiner with COMBINE_B_ATOP. */
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_disjoint_xor_u (pixman_implementation_t *imp,    /* unused */
+                        pixman_op_t              op,     /* unused */
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{   /* Disjoint XOR (unified alpha): general disjoint combiner with COMBINE_XOR. */
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
+combine_conjoint_general_u (comp4_t *      dest,     /* pixels combined in place */
+                            const comp4_t *src,      /* source pixels */
+                            const comp4_t *mask,     /* handed to combine_mask() together with src */
+                            int            width,    /* number of pixels to process */
+                            comp1_t        combine)  /* COMBINE_* selector for Fa and Fb */
+{
+    int i;
+    /* Per pixel: dest = s * Fa + d * Fb on every component (see GENERIC). */
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s = combine_mask (src, mask, i);
+	comp4_t d = *(dest + i);
+	comp4_t m, n, o, p;
+	comp2_t Fa, Fb, t, u, v;
+	comp1_t sa = s >> A_SHIFT;    /* alpha of the masked source */
+	comp1_t da = d >> A_SHIFT;    /* alpha of the destination */
+
+	switch (combine & COMBINE_A)    /* Fa: factor applied to the source */
+	{
+	default:    /* any value not listed below */
+	    Fa = 0;
+	    break;
+
+	case COMBINE_A_OUT:
+	    Fa = combine_conjoint_out_part (sa, da);
+	    break;
+
+	case COMBINE_A_IN:
+	    Fa = combine_conjoint_in_part (sa, da);
+	    break;
+
+	case COMBINE_A:
+	    Fa = MASK;
+	    break;
+	}
+
+	switch (combine & COMBINE_B)    /* Fb: factor applied to the destination */
+	{
+	default:    /* any value not listed below */
+	    Fb = 0;
+	    break;
+
+	case COMBINE_B_OUT:
+	    Fb = combine_conjoint_out_part (da, sa);    /* roles of source and dest alpha swapped */
+	    break;
+
+	case COMBINE_B_IN:
+	    Fb = combine_conjoint_in_part (da, sa);
+	    break;
+
+	case COMBINE_B:
+	    Fb = MASK;
+	    break;
+	}
+
+	m = GENERIC (s, d, 0, Fa, Fb, t, u, v);          /* one component per call; t, u, v are scratch */
+	n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
+	o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
+	p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
+
+	s = m | n | o | p;                               /* reassemble the four components */
+
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_conjoint_over_u (pixman_implementation_t *imp,    /* unused */
+                         pixman_op_t              op,     /* unused */
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{   /* Conjoint OVER (unified alpha): general conjoint combiner with COMBINE_A_OVER. */
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+combine_conjoint_over_reverse_u (pixman_implementation_t *imp,    /* unused */
+                                 pixman_op_t              op,     /* unused */
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{   /* Conjoint OVER_REVERSE (unified alpha): general conjoint combiner with COMBINE_B_OVER. */
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+combine_conjoint_in_u (pixman_implementation_t *imp,    /* unused */
+                       pixman_op_t              op,     /* unused */
+                       comp4_t *                dest,
+                       const comp4_t *          src,
+                       const comp4_t *          mask,
+                       int                      width)
+{   /* Conjoint IN (unified alpha): general conjoint combiner with COMBINE_A_IN. */
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_conjoint_in_reverse_u (pixman_implementation_t *imp,    /* unused */
+                               pixman_op_t              op,     /* unused */
+                               comp4_t *                dest,
+                               const comp4_t *          src,
+                               const comp4_t *          mask,
+                               int                      width)
+{   /* Conjoint IN_REVERSE (unified alpha): general conjoint combiner with COMBINE_B_IN. */
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_conjoint_out_u (pixman_implementation_t *imp,    /* unused */
+                        pixman_op_t              op,     /* unused */
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{   /* Conjoint OUT (unified alpha): general conjoint combiner with COMBINE_A_OUT. */
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_conjoint_out_reverse_u (pixman_implementation_t *imp,    /* unused */
+                                pixman_op_t              op,     /* unused */
+                                comp4_t *                dest,
+                                const comp4_t *          src,
+                                const comp4_t *          mask,
+                                int                      width)
+{   /* Conjoint OUT_REVERSE (unified alpha): general conjoint combiner with COMBINE_B_OUT. */
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_conjoint_atop_u (pixman_implementation_t *imp,    /* unused */
+                         pixman_op_t              op,     /* unused */
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{   /* Conjoint ATOP (unified alpha): general conjoint combiner with COMBINE_A_ATOP. */
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,    /* unused */
+                                 pixman_op_t              op,     /* unused */
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{   /* Conjoint ATOP_REVERSE (unified alpha): general conjoint combiner with COMBINE_B_ATOP. */
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_conjoint_xor_u (pixman_implementation_t *imp,    /* unused */
+                        pixman_op_t              op,     /* unused */
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{   /* Conjoint XOR (unified alpha): general conjoint combiner with COMBINE_XOR. */
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+/************************************************************************/
+/*********************** Per Channel functions **************************/
+/************************************************************************/
+
+static void
+combine_clear_ca (pixman_implementation_t *imp,    /* unused */
+                  pixman_op_t              op,     /* unused */
+                  comp4_t *                dest,   /* pixels to zero */
+                  const comp4_t *          src,    /* unused */
+                  const comp4_t *          mask,   /* unused */
+                  int                      width)  /* number of pixels */
+{   /* CLEAR, per channel: every one of the width dest pixels becomes 0. */
+    memset (dest, 0, width * sizeof (*dest));
+}
+
+static void
+combine_src_ca (pixman_implementation_t *imp,    /* unused */
+                pixman_op_t              op,     /* unused */
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int pos;
+    /* SRC, per channel: dest receives the source as left by combine_mask_value_ca(). */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t pixel = src[pos];
+	comp4_t cover = mask[pos];
+
+	/* Both are passed by address; only the pixel is stored afterwards. */
+	combine_mask_value_ca (&pixel, &cover);
+	dest[pos] = pixel;
+    }
+}
+
+static void
+combine_over_ca (pixman_implementation_t *imp,    /* unused */
+                 pixman_op_t              op,     /* unused */
+                 comp4_t *                dest,
+                 const comp4_t *          src,
+                 const comp4_t *          mask,
+                 int                      width)
+{
+    int pos;
+    /* OVER, per channel: masked source plus dest weighted by the complemented mask. */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t pixel = src[pos];
+	comp4_t cover = mask[pos];
+	comp4_t keep;
+
+	combine_mask_ca (&pixel, &cover);
+
+	keep = ~cover;    /* per-channel weight for dest; 0 means dest is fully replaced */
+	if (keep != 0)
+	{
+	    comp4_t under = dest[pos];
+	    UNcx4_MUL_UNcx4_ADD_UNcx4 (under, keep, pixel);    /* presumably under * keep + pixel */
+	    pixel = under;
+	}
+
+	dest[pos] = pixel;
+    }
+}
+
+static void
+combine_over_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                         pixman_op_t              op,     /* unused */
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    int pos;
+    /* OVER_REVERSE, per channel: source * mask is weighted by the complemented dest alpha. */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t under = dest[pos];
+	comp4_t room = ~under >> A_SHIFT;    /* complement of the dest alpha */
+
+	if (room != 0)    /* otherwise dest is left untouched */
+	{
+	    comp4_t pixel = src[pos];
+	    comp4_t cover = mask[pos];
+
+	    UNcx4_MUL_UNcx4 (pixel, cover);
+	    UNcx4_MUL_UNc_ADD_UNcx4 (pixel, room, under);    /* presumably pixel * room + under */
+
+	    dest[pos] = pixel;
+	}
+    }
+}
+
+static void
+combine_in_ca (pixman_implementation_t *imp,    /* unused */
+               pixman_op_t              op,     /* unused */
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int pos;
+    /* IN, per channel: masked source scaled by the dest alpha; 0 where that alpha is 0. */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t under = dest[pos];
+	comp2_t dst_alpha = under >> A_SHIFT;
+	comp4_t pixel = 0;
+
+	if (dst_alpha != 0)
+	{
+	    comp4_t cover = mask[pos];
+
+	    pixel = src[pos];
+	    combine_mask_value_ca (&pixel, &cover);
+
+	    if (dst_alpha != MASK)    /* the multiply is skipped when alpha equals MASK */
+		UNcx4_MUL_UNc (pixel, dst_alpha);
+	}
+
+	dest[pos] = pixel;
+    }
+}
+
+static void
+combine_in_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                       pixman_op_t              op,     /* unused */
+                       comp4_t *                dest,
+                       const comp4_t *          src,
+                       const comp4_t *          mask,
+                       int                      width)
+{
+    int pos;
+    /* IN_REVERSE, per channel: dest scaled by the mask left by combine_mask_alpha_ca(). */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t pixel = src[pos];
+	comp4_t cover = mask[pos];
+	comp4_t factor;
+
+	combine_mask_alpha_ca (&pixel, &cover);
+
+	factor = cover;
+	if (factor != ~0)    /* an all-ones factor leaves dest untouched */
+	{
+	    comp4_t result = 0;    /* a zero factor clears dest without multiplying */
+
+	    if (factor != 0)
+	    {
+		result = dest[pos];
+		UNcx4_MUL_UNcx4 (result, factor);
+	    }
+
+	    dest[pos] = result;
+	}
+    }
+}
+
+static void
+combine_out_ca (pixman_implementation_t *imp,    /* unused */
+                pixman_op_t              op,     /* unused */
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int pos;
+    /* OUT, per channel: masked source scaled by the complemented dest alpha. */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t under = dest[pos];
+	comp2_t room = ~under >> A_SHIFT;    /* complement of the dest alpha */
+	comp4_t pixel = 0;
+
+	if (room != 0)
+	{
+	    comp4_t cover = mask[pos];
+
+	    pixel = src[pos];
+	    combine_mask_value_ca (&pixel, &cover);
+
+	    if (room != MASK)    /* the multiply is skipped when the factor equals MASK */
+		UNcx4_MUL_UNc (pixel, room);
+	}
+
+	dest[pos] = pixel;
+    }
+}
+
+static void
+combine_out_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                        pixman_op_t              op,     /* unused */
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    int pos;
+    /* OUT_REVERSE, per channel: dest scaled by the complement of the combined mask. */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t pixel = src[pos];
+	comp4_t cover = mask[pos];
+	comp4_t factor;
+
+	combine_mask_alpha_ca (&pixel, &cover);
+
+	factor = ~cover;
+	if (factor != ~0)    /* an all-ones factor leaves dest untouched */
+	{
+	    comp4_t result = 0;    /* a zero factor clears dest without multiplying */
+
+	    if (factor != 0)
+	    {
+		result = dest[pos];
+		UNcx4_MUL_UNcx4 (result, factor);
+	    }
+
+	    dest[pos] = result;
+	}
+    }
+}
+
+static void
+combine_atop_ca (pixman_implementation_t *imp,    /* unused */
+                 pixman_op_t              op,     /* unused */
+                 comp4_t *                dest,
+                 const comp4_t *          src,
+                 const comp4_t *          mask,
+                 int                      width)
+{
+    int pos;
+    /* ATOP, per channel: dest weighted by ~mask, source weighted by the dest alpha. */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t under = dest[pos];
+	comp4_t pixel = src[pos];
+	comp4_t cover = mask[pos];
+	comp4_t dst_factor;
+	comp2_t src_factor = under >> A_SHIFT;    /* dest alpha */
+
+	combine_mask_ca (&pixel, &cover);
+
+	dst_factor = ~cover;
+
+	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (under, dst_factor, pixel, src_factor);
+
+	dest[pos] = under;
+    }
+}
+
+static void
+combine_atop_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                         pixman_op_t              op,     /* unused */
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    int pos;
+    /* ATOP_REVERSE, per channel: dest weighted by mask, source by the complemented dest alpha. */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t under = dest[pos];
+	comp4_t pixel = src[pos];
+	comp4_t cover = mask[pos];
+	comp4_t dst_factor;
+	comp2_t src_factor = ~under >> A_SHIFT;    /* complement of the dest alpha */
+
+	combine_mask_ca (&pixel, &cover);
+
+	dst_factor = cover;
+
+	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (under, dst_factor, pixel, src_factor);
+
+	dest[pos] = under;
+    }
+}
+
+static void
+combine_xor_ca (pixman_implementation_t *imp,    /* unused */
+                pixman_op_t              op,     /* unused */
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int pos;
+    /* XOR, per channel: dest weighted by ~mask, source by the complemented dest alpha. */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t under = dest[pos];
+	comp4_t pixel = src[pos];
+	comp4_t cover = mask[pos];
+	comp4_t dst_factor;
+	comp2_t src_factor = ~under >> A_SHIFT;    /* complement of the dest alpha */
+
+	combine_mask_ca (&pixel, &cover);
+
+	dst_factor = ~cover;
+
+	UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (under, dst_factor, pixel, src_factor);
+
+	dest[pos] = under;
+    }
+}
+
+static void
+combine_add_ca (pixman_implementation_t *imp,    /* unused */
+                pixman_op_t              op,     /* unused */
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int pos;
+    /* ADD, per channel: the masked source is added onto dest via UNcx4_ADD_UNcx4. */
+    for (pos = 0; pos < width; ++pos)
+    {
+	comp4_t pixel = src[pos];
+	comp4_t cover = mask[pos];
+	comp4_t under = dest[pos];
+
+	combine_mask_value_ca (&pixel, &cover);
+
+	UNcx4_ADD_UNcx4 (under, pixel);
+
+	dest[pos] = under;
+    }
+}
+
+static void
+combine_saturate_ca (pixman_implementation_t *imp,    /* unused */
+                     pixman_op_t              op,     /* unused */
+                     comp4_t *                dest,
+                     const comp4_t *          src,
+                     const comp4_t *          mask,
+                     int                      width)
+{
+    int i;
+    /* SATURATE, per channel: add source to dest, scaling the source down where it would not fit. */
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s, d;
+	comp2_t sa, sr, sg, sb, da;
+	comp2_t t, u, v;
+	comp4_t m, n, o, p;
+
+	d = *(dest + i);
+	s = *(src + i);
+	m = *(mask + i);
+
+	combine_mask_ca (&s, &m);
+
+	sa = (m >> A_SHIFT);           /* the four components of m as left by combine_mask_ca() */
+	sr = (m >> R_SHIFT) & MASK;
+	sg = (m >> G_SHIFT) & MASK;
+	sb =  m             & MASK;
+	da = ~d >> A_SHIFT;            /* complement of the dest alpha */
+
+	if (sb <= da)                  /* also true when sb is 0, so the division below never sees 0 */
+	    m = ADD (s, d, 0, t);
+	else
+	    m = GENERIC (s, d, 0, (da << G_SHIFT) / sb, MASK, t, u, v);    /* source factor (da << G_SHIFT) / sb */
+
+	if (sg <= da)
+	    n = ADD (s, d, G_SHIFT, t);
+	else
+	    n = GENERIC (s, d, G_SHIFT, (da << G_SHIFT) / sg, MASK, t, u, v);
+
+	if (sr <= da)
+	    o = ADD (s, d, R_SHIFT, t);
+	else
+	    o = GENERIC (s, d, R_SHIFT, (da << G_SHIFT) / sr, MASK, t, u, v);
+
+	if (sa <= da)
+	    p = ADD (s, d, A_SHIFT, t);
+	else
+	    p = GENERIC (s, d, A_SHIFT, (da << G_SHIFT) / sa, MASK, t, u, v);
+
+	*(dest + i) = m | n | o | p;   /* reassemble the four components */
+    }
+}
+
+static void
+combine_disjoint_general_ca (comp4_t *      dest,     /* pixels combined in place */
+                             const comp4_t *src,      /* source pixels */
+                             const comp4_t *mask,     /* mask pixels, one per source pixel */
+                             int            width,    /* number of pixels to process */
+                             comp1_t        combine)  /* COMBINE_* selector for Fa and Fb */
+{
+    int i;
+    /* Per pixel: dest = s * Fa + d * Fb, with a separate factor for every component. */
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s, d;
+	comp4_t m, n, o, p;
+	comp4_t Fa, Fb;         /* four packed factors each */
+	comp2_t t, u, v;
+	comp4_t sa;
+	comp1_t da;
+
+	s = *(src + i);
+	m = *(mask + i);
+	d = *(dest + i);
+	da = d >> A_SHIFT;
+
+	combine_mask_ca (&s, &m);
+
+	sa = m;    /* m as left by combine_mask_ca(); read one component at a time below */
+
+	switch (combine & COMBINE_A)    /* Fa: factors applied to the source */
+	{
+	default:    /* any value not listed below */
+	    Fa = 0;
+	    break;
+
+	case COMBINE_A_OUT:    /* m, n, o, p are reused as scratch for the packed factors */
+	    m = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> 0), da);
+	    n = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A_IN:
+	    m = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> 0), da);
+	    n = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A:
+	    Fa = ~0;    /* all bits set in every component */
+	    break;
+	}
+
+	switch (combine & COMBINE_B)    /* Fb: factors applied to the destination */
+	{
+	default:    /* any value not listed below */
+	    Fb = 0;
+	    break;
+
+	case COMBINE_B_OUT:    /* same helpers as above with the two arguments swapped */
+	    m = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> 0));
+	    n = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B_IN:
+	    m = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> 0));
+	    n = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B:
+	    Fb = ~0;    /* all bits set in every component */
+	    break;
+	}
+	m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);    /* one component per call */
+	n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
+	o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
+	p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
+
+	s = m | n | o | p;    /* reassemble the four components */
+
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_disjoint_over_ca (pixman_implementation_t *imp,    /* unused */
+                          pixman_op_t              op,     /* unused */
+                          comp4_t *                dest,
+                          const comp4_t *          src,
+                          const comp4_t *          mask,
+                          int                      width)
+{   /* Disjoint OVER (per channel): general per-channel disjoint combiner with COMBINE_A_OVER. */
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+combine_disjoint_in_ca (pixman_implementation_t *imp,    /* unused */
+                        pixman_op_t              op,     /* unused */
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{   /* Disjoint IN (per channel): general per-channel disjoint combiner with COMBINE_A_IN. */
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                                pixman_op_t              op,     /* unused */
+                                comp4_t *                dest,
+                                const comp4_t *          src,
+                                const comp4_t *          mask,
+                                int                      width)
+{   /* Disjoint IN_REVERSE (per channel): general per-channel disjoint combiner with COMBINE_B_IN. */
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_disjoint_out_ca (pixman_implementation_t *imp,    /* unused */
+                         pixman_op_t              op,     /* unused */
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{   /* Disjoint OUT (per channel): general per-channel disjoint combiner with COMBINE_A_OUT. */
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                                 pixman_op_t              op,     /* unused */
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{   /* Disjoint OUT_REVERSE (per channel): general per-channel disjoint combiner with COMBINE_B_OUT. */
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_disjoint_atop_ca (pixman_implementation_t *imp,    /* unused */
+                          pixman_op_t              op,     /* unused */
+                          comp4_t *                dest,
+                          const comp4_t *          src,
+                          const comp4_t *          mask,
+                          int                      width)
+{   /* Disjoint ATOP (per channel): general per-channel disjoint combiner with COMBINE_A_ATOP. */
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                                  pixman_op_t              op,     /* unused */
+                                  comp4_t *                dest,
+                                  const comp4_t *          src,
+                                  const comp4_t *          mask,
+                                  int                      width)
+{   /* Disjoint ATOP_REVERSE (per channel): general per-channel disjoint combiner with COMBINE_B_ATOP. */
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_disjoint_xor_ca (pixman_implementation_t *imp,    /* unused */
+                         pixman_op_t              op,     /* unused */
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{   /* Disjoint XOR (per channel): general per-channel disjoint combiner with COMBINE_XOR. */
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
+combine_conjoint_general_ca (comp4_t *      dest,     /* pixels combined in place */
+                             const comp4_t *src,      /* source pixels */
+                             const comp4_t *mask,     /* mask pixels, one per source pixel */
+                             int            width,    /* number of pixels to process */
+                             comp1_t        combine)  /* COMBINE_* selector for Fa and Fb */
+{
+    int i;
+    /* Per pixel: dest = s * Fa + d * Fb, with a separate factor for every component. */
+    for (i = 0; i < width; ++i)
+    {
+	comp4_t s, d;
+	comp4_t m, n, o, p;
+	comp4_t Fa, Fb;         /* four packed factors each */
+	comp2_t t, u, v;
+	comp4_t sa;
+	comp1_t da;
+
+	s = *(src + i);
+	m = *(mask + i);
+	d = *(dest + i);
+	da = d >> A_SHIFT;
+
+	combine_mask_ca (&s, &m);
+
+	sa = m;    /* m as left by combine_mask_ca(); read one component at a time below */
+
+	switch (combine & COMBINE_A)    /* Fa: factors applied to the source */
+	{
+	default:    /* any value not listed below */
+	    Fa = 0;
+	    break;
+
+	case COMBINE_A_OUT:    /* m, n, o, p are reused as scratch for the packed factors */
+	    m = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> 0), da);
+	    n = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A_IN:
+	    m = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> 0), da);
+	    n = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A:
+	    Fa = ~0;    /* all bits set in every component */
+	    break;
+	}
+
+	switch (combine & COMBINE_B)    /* Fb: factors applied to the destination */
+	{
+	default:    /* any value not listed below */
+	    Fb = 0;
+	    break;
+
+	case COMBINE_B_OUT:    /* same helpers as above with the two arguments swapped */
+	    m = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> 0));
+	    n = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B_IN:
+	    m = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> 0));
+	    n = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B:
+	    Fb = ~0;    /* all bits set in every component */
+	    break;
+	}
+	m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);    /* one component per call */
+	n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
+	o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
+	p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
+
+	s = m | n | o | p;    /* reassemble the four components */
+
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_conjoint_over_ca (pixman_implementation_t *imp,    /* unused */
+                          pixman_op_t              op,     /* unused */
+                          comp4_t *                dest,
+                          const comp4_t *          src,
+                          const comp4_t *          mask,
+                          int                      width)
+{   /* Conjoint OVER (per channel): general per-channel conjoint combiner with COMBINE_A_OVER. */
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+combine_conjoint_over_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                                  pixman_op_t              op,     /* unused */
+                                  comp4_t *                dest,
+                                  const comp4_t *          src,
+                                  const comp4_t *          mask,
+                                  int                      width)
+{   /* Conjoint OVER_REVERSE (per channel): general per-channel conjoint combiner with COMBINE_B_OVER. */
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+combine_conjoint_in_ca (pixman_implementation_t *imp,    /* unused */
+                        pixman_op_t              op,     /* unused */
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{   /* Conjoint IN (per channel): general per-channel conjoint combiner with COMBINE_A_IN. */
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_conjoint_in_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                                pixman_op_t              op,     /* unused */
+                                comp4_t *                dest,
+                                const comp4_t *          src,
+                                const comp4_t *          mask,
+                                int                      width)
+{   /* Conjoint IN_REVERSE (per channel): general per-channel conjoint combiner with COMBINE_B_IN. */
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_conjoint_out_ca (pixman_implementation_t *imp,    /* unused */
+                         pixman_op_t              op,     /* unused */
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{   /* Conjoint OUT (per channel): general per-channel conjoint combiner with COMBINE_A_OUT. */
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_conjoint_out_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                                 pixman_op_t              op,     /* unused */
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{   /* Conjoint OUT_REVERSE (per channel): general per-channel conjoint combiner with COMBINE_B_OUT. */
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_conjoint_atop_ca (pixman_implementation_t *imp,    /* unused */
+                          pixman_op_t              op,     /* unused */
+                          comp4_t *                dest,
+                          const comp4_t *          src,
+                          const comp4_t *          mask,
+                          int                      width)
+{   /* Conjoint ATOP (per channel): general per-channel conjoint combiner with COMBINE_A_ATOP. */
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp,    /* unused */
+                                  pixman_op_t              op,     /* unused */
+                                  comp4_t *                dest,
+                                  const comp4_t *          src,
+                                  const comp4_t *          mask,
+                                  int                      width)
+{   /* Conjoint ATOP_REVERSE (per channel): general per-channel conjoint combiner with COMBINE_B_ATOP. */
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_conjoint_xor_ca (pixman_implementation_t *imp,    /* unused */
+                         pixman_op_t              op,     /* unused */
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{   /* Conjoint XOR (per channel): general per-channel conjoint combiner with COMBINE_XOR. */
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+void
+_pixman_setup_combiner_functions_width (pixman_implementation_t *imp)
+{   /* Fill imp->combine_width[] and imp->combine_width_ca[], indexed by operator. */
+    /* Unified alpha */
+    imp->combine_width[PIXMAN_OP_CLEAR] = combine_clear;
+    imp->combine_width[PIXMAN_OP_SRC] = combine_src_u;
+    imp->combine_width[PIXMAN_OP_DST] = combine_dst;
+    imp->combine_width[PIXMAN_OP_OVER] = combine_over_u;
+    imp->combine_width[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u;
+    imp->combine_width[PIXMAN_OP_IN] = combine_in_u;
+    imp->combine_width[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u;
+    imp->combine_width[PIXMAN_OP_OUT] = combine_out_u;
+    imp->combine_width[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u;
+    imp->combine_width[PIXMAN_OP_ATOP] = combine_atop_u;
+    imp->combine_width[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u;
+    imp->combine_width[PIXMAN_OP_XOR] = combine_xor_u;
+    imp->combine_width[PIXMAN_OP_ADD] = combine_add_u;
+    imp->combine_width[PIXMAN_OP_SATURATE] = combine_saturate_u;
+
+    /* Disjoint, unified (OVER_REVERSE reuses the SATURATE combiner) */
+    imp->combine_width[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear;
+    imp->combine_width[PIXMAN_OP_DISJOINT_SRC] = combine_src_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_DST] = combine_dst;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u;
+
+    /* Conjoint, unified */
+    imp->combine_width[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear;
+    imp->combine_width[PIXMAN_OP_CONJOINT_SRC] = combine_src_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_DST] = combine_dst;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u;
+
+    imp->combine_width[PIXMAN_OP_MULTIPLY] = combine_multiply_u;
+    imp->combine_width[PIXMAN_OP_SCREEN] = combine_screen_u;
+    imp->combine_width[PIXMAN_OP_OVERLAY] = combine_overlay_u;
+    imp->combine_width[PIXMAN_OP_DARKEN] = combine_darken_u;
+    imp->combine_width[PIXMAN_OP_LIGHTEN] = combine_lighten_u;
+    imp->combine_width[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u;
+    imp->combine_width[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u;
+    imp->combine_width[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u;
+    imp->combine_width[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u;
+    imp->combine_width[PIXMAN_OP_DIFFERENCE] = combine_difference_u;
+    imp->combine_width[PIXMAN_OP_EXCLUSION] = combine_exclusion_u;
+    imp->combine_width[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u;
+    imp->combine_width[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u;
+    imp->combine_width[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u;
+    imp->combine_width[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u;
+
+    /* Component alpha combiners */
+    imp->combine_width_ca[PIXMAN_OP_CLEAR] = combine_clear_ca;
+    imp->combine_width_ca[PIXMAN_OP_SRC] = combine_src_ca;
+    /* PIXMAN_OP_DST: no component-alpha combiner is registered here */
+    imp->combine_width_ca[PIXMAN_OP_OVER] = combine_over_ca;
+    imp->combine_width_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_IN] = combine_in_ca;
+    imp->combine_width_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_OUT] = combine_out_ca;
+    imp->combine_width_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_ATOP] = combine_atop_ca;
+    imp->combine_width_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_XOR] = combine_xor_ca;
+    imp->combine_width_ca[PIXMAN_OP_ADD] = combine_add_ca;
+    imp->combine_width_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca;
+
+    /* Disjoint CA: OVER_REVERSE reuses SATURATE, DST reuses combine_dst */
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_DST] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca;
+
+    /* Conjoint CA: DST reuses combine_dst */
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_DST] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca;
+
+    imp->combine_width_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca;
+    imp->combine_width_ca[PIXMAN_OP_SCREEN] = combine_screen_ca;
+    imp->combine_width_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca;
+    imp->combine_width_ca[PIXMAN_OP_DARKEN] = combine_darken_ca;
+    imp->combine_width_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca;
+    imp->combine_width_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca;
+    imp->combine_width_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca;
+    imp->combine_width_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca;
+    imp->combine_width_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca;
+    imp->combine_width_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca;
+    imp->combine_width_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca;
+
+    /* It is not clear that these make sense, so make them noops for now */
+    imp->combine_width_ca[PIXMAN_OP_HSL_HUE] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_COLOR] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst;
+}
+
diff --git a/pixman/pixman-combine.h.template b/pixman/pixman-combine.h.template
new file mode 100644
index 0000000..67ed309
--- /dev/null
+++ b/pixman/pixman-combine.h.template
@@ -0,0 +1,226 @@
+
+#define COMPONENT_SIZE
+#define MASK
+#define ONE_HALF
+
+#define A_SHIFT
+#define R_SHIFT
+#define G_SHIFT
+#define A_MASK
+#define R_MASK
+#define G_MASK
+
+#define RB_MASK
+#define AG_MASK
+#define RB_ONE_HALF
+#define RB_MASK_PLUS_ONE
+
+#define ALPHA_c(x) ((x) >> A_SHIFT)
+#define RED_c(x) (((x) >> R_SHIFT) & MASK)
+#define GREEN_c(x) (((x) >> G_SHIFT) & MASK)
+#define BLUE_c(x) ((x) & MASK)
+
+/*
+ * Helper macros.
+ */
+
+#define MUL_UNc(a, b, t)						\
+    ((t) = (a) * (b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT ))
+
+#define DIV_UNc(a, b)							\
+    (((comp2_t) (a) * MASK) / (b))
+
+#define ADD_UNc(x, y, t)				     \
+    ((t) = (x) + (y),					     \
+     (comp4_t) (comp1_t) ((t) | (0 - ((t) >> G_SHIFT))))
+
+#define DIV_ONE_UNc(x)							\
+    (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT)
+
+/*
+ * The methods below use some tricks to be able to do two color
+ * components at the same time.
+ */
+
+/*
+ * x_rb = (x_rb * a) / 255
+ */
+#define UNc_rb_MUL_UNc(x, a, t)						\
+    do									\
+    {									\
+	t  = ((x) & RB_MASK) * (a);					\
+	t += RB_ONE_HALF;						\
+	x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
+	x &= RB_MASK;							\
+    } while (0)
+
+/*
+ * x_rb = min (x_rb + y_rb, 255)
+ */
+#define UNc_rb_ADD_UNc_rb(x, y, t)					\
+    do									\
+    {									\
+	t = ((x) + (y));						\
+	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);		\
+	x = (t & RB_MASK);						\
+    } while (0)
+
+/*
+ * x_rb = (x_rb * a_rb) / 255, component-wise; x is overwritten, t is scratch
+ */
+#define UNc_rb_MUL_UNc_rb(x, a, t)					\
+    do									\
+    {									\
+	t  = ((x) & MASK) * ((a) & MASK);				\
+	t |= ((x) & R_MASK) * (((a) >> R_SHIFT) & MASK);		\
+	t += RB_ONE_HALF;						\
+	t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
+	(x) = t & RB_MASK;						\
+    } while (0)
+
+/*
+ * x_c = (x_c * a) / 255
+ */
+#define UNcx4_MUL_UNc(x, a)						\
+    do									\
+    {									\
+	comp4_t r1__, r2__, t__;					\
+									\
+	r1__ = (x);							\
+	UNc_rb_MUL_UNc (r1__, (a), t__);				\
+									\
+	r2__ = (x) >> G_SHIFT;						\
+	UNc_rb_MUL_UNc (r2__, (a), t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+ * x_c = (x_c * a) / 255 + y_c
+ */
+#define UNcx4_MUL_UNc_ADD_UNcx4(x, a, y)				\
+    do									\
+    {									\
+	comp4_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x);							\
+	r2__ = (y) & RB_MASK;						\
+	UNc_rb_MUL_UNc (r1__, (a), t__);				\
+	UNc_rb_ADD_UNc_rb (r1__, r2__, t__);				\
+									\
+	r2__ = (x) >> G_SHIFT;						\
+	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
+	UNc_rb_MUL_UNc (r2__, (a), t__);				\
+	UNc_rb_ADD_UNc_rb (r2__, r3__, t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+ * x_c = (x_c * a + y_c * b) / 255
+ */
+#define UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc(x, a, y, b)			\
+    do									\
+    {									\
+	comp4_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x);							\
+	r2__ = (y);							\
+	UNc_rb_MUL_UNc (r1__, (a), t__);				\
+	UNc_rb_MUL_UNc (r2__, (b), t__);				\
+	UNc_rb_ADD_UNc_rb (r1__, r2__, t__);				\
+									\
+	r2__ = ((x) >> G_SHIFT);					\
+	r3__ = ((y) >> G_SHIFT);					\
+	UNc_rb_MUL_UNc (r2__, (a), t__);				\
+	UNc_rb_MUL_UNc (r3__, (b), t__);				\
+	UNc_rb_ADD_UNc_rb (r2__, r3__, t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+ * x_c = (x_c * a_c) / 255
+ */
+#define UNcx4_MUL_UNcx4(x, a)						\
+    do									\
+    {									\
+	comp4_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x);							\
+	r2__ = (a);							\
+	UNc_rb_MUL_UNc_rb (r1__, r2__, t__);				\
+									\
+	r2__ = (x) >> G_SHIFT;						\
+	r3__ = (a) >> G_SHIFT;						\
+	UNc_rb_MUL_UNc_rb (r2__, r3__, t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+ * x_c = (x_c * a_c) / 255 + y_c
+ */
+#define UNcx4_MUL_UNcx4_ADD_UNcx4(x, a, y)				\
+    do									\
+    {									\
+	comp4_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x);							\
+	r2__ = (a);							\
+	UNc_rb_MUL_UNc_rb (r1__, r2__, t__);				\
+	r2__ = (y) & RB_MASK;						\
+	UNc_rb_ADD_UNc_rb (r1__, r2__, t__);				\
+									\
+	r2__ = ((x) >> G_SHIFT);					\
+	r3__ = ((a) >> G_SHIFT);					\
+	UNc_rb_MUL_UNc_rb (r2__, r3__, t__);				\
+	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
+	UNc_rb_ADD_UNc_rb (r2__, r3__, t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+ * x_c = (x_c * a_c + y_c * b) / 255; the result is stored back into x
+ */
+#define UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc(x, a, y, b)			\
+    do									\
+    {									\
+	comp4_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x);							\
+	r2__ = (a);							\
+	UNc_rb_MUL_UNc_rb (r1__, r2__, t__);				\
+	r2__ = (y);							\
+	UNc_rb_MUL_UNc (r2__, (b), t__);				\
+	UNc_rb_ADD_UNc_rb (r1__, r2__, t__);				\
+									\
+	r2__ = (x) >> G_SHIFT;						\
+	r3__ = (a) >> G_SHIFT;						\
+	UNc_rb_MUL_UNc_rb (r2__, r3__, t__);				\
+	r3__ = (y) >> G_SHIFT;						\
+	UNc_rb_MUL_UNc (r3__, (b), t__);				\
+	UNc_rb_ADD_UNc_rb (r2__, r3__, t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);				\
+    } while (0)
+
+/*
+ * x_c = min (x_c + y_c, 255); the result is stored back into x
+ */
+#define UNcx4_ADD_UNcx4(x, y)						\
+    do									\
+    {									\
+	comp4_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x) & RB_MASK;						\
+	r2__ = (y) & RB_MASK;						\
+	UNc_rb_ADD_UNc_rb (r1__, r2__, t__);				\
+									\
+	r2__ = ((x) >> G_SHIFT) & RB_MASK;				\
+	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
+	UNc_rb_ADD_UNc_rb (r2__, r3__, t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);				\
+    } while (0)
diff --git a/pixman/pixman-compiler.h b/pixman/pixman-compiler.h
new file mode 100644
index 0000000..fe2a613
--- /dev/null
+++ b/pixman/pixman-compiler.h
@@ -0,0 +1,209 @@
+/* Pixman uses some non-standard compiler features. This file ensures
+ * they exist
+ *
+ * The features are:
+ *
+ *    FUNC	     must be defined to expand to the current function
+ *    PIXMAN_EXPORT  should be defined to whatever is required to
+ *                   export functions from a shared library
+ *    limits	     limits for various types must be defined
+ *    inline         must be defined
+ *    force_inline   must be defined
+ */
+#if defined (__GNUC__)
+#  define FUNC     ((const char*) (__PRETTY_FUNCTION__))
+#elif defined (__sun) || (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#  define FUNC     ((const char*) (__func__))
+#else
+#  define FUNC     ((const char*) ("???"))
+#endif
+
+#if defined (__GNUC__)
+#  define MAYBE_UNUSED  __attribute__((unused))
+#else
+#  define MAYBE_UNUSED
+#endif
+
+#ifndef INT16_MIN
+# define INT16_MIN              (-32767-1)
+#endif
+
+#ifndef INT16_MAX
+# define INT16_MAX              (32767)
+#endif
+
+#ifndef INT32_MIN
+# define INT32_MIN              (-2147483647-1)
+#endif
+
+#ifndef INT32_MAX
+# define INT32_MAX              (2147483647)
+#endif
+
+#ifndef UINT32_MIN
+# define UINT32_MIN             (0)
+#endif
+
+#ifndef UINT32_MAX
+# define UINT32_MAX             (4294967295U)
+#endif
+
+#ifndef M_PI
+# define M_PI			3.14159265358979323846
+#endif
+
+#ifdef _MSC_VER
+/* 'inline' is available only in C++ in MSVC */
+#   define inline __inline
+#   define force_inline __forceinline
+#   define noinline __declspec(noinline)
+#elif defined __GNUC__ || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
+#   define inline __inline__
+#   define force_inline __inline__ __attribute__ ((__always_inline__))
+#   define noinline __attribute__((noinline))
+#else
+#   ifndef force_inline
+#      define force_inline inline
+#   endif
+#   ifndef noinline
+#      define noinline
+#   endif
+#endif
+
+/* GCC visibility */
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
+#   define PIXMAN_EXPORT __attribute__ ((visibility("default")))
+/* Sun Studio 8 visibility */
+#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
+#   define PIXMAN_EXPORT __global
+#else
+#   define PIXMAN_EXPORT
+#endif
+
+/* TLS */
+#if defined(PIXMAN_NO_TLS)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    (&name)
+
+#elif defined(TOOLCHAIN_SUPPORTS__THREAD)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static __thread type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    (&name)
+
+#elif defined(__MINGW32__)
+
+#   define _NO_W32_PSEUDO_MODIFIERS
+#   include <windows.h>
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static volatile int tls_ ## name ## _initialized = 0;		\
+    static void *tls_ ## name ## _mutex = NULL;				\
+    static unsigned tls_ ## name ## _index;				\
+									\
+    static type *							\
+    tls_ ## name ## _alloc (void)					\
+    {									\
+        type *value = calloc (1, sizeof (type));			\
+        if (value)							\
+            TlsSetValue (tls_ ## name ## _index, value);		\
+        return value;							\
+    }									\
+									\
+    static force_inline type *						\
+    tls_ ## name ## _get (void)						\
+    {									\
+	type *value;							\
+	if (!tls_ ## name ## _initialized)				\
+	{								\
+	    if (!tls_ ## name ## _mutex)				\
+	    {								\
+		void *mutex = CreateMutexA (NULL, 0, NULL);		\
+		if (InterlockedCompareExchangePointer (			\
+			&tls_ ## name ## _mutex, mutex, NULL) != NULL)	\
+		{							\
+		    CloseHandle (mutex);				\
+		}							\
+	    }								\
+	    WaitForSingleObject (tls_ ## name ## _mutex, 0xFFFFFFFF);	\
+	    if (!tls_ ## name ## _initialized)				\
+	    {								\
+		tls_ ## name ## _index = TlsAlloc ();			\
+		tls_ ## name ## _initialized = 1;			\
+	    }								\
+	    ReleaseMutex (tls_ ## name ## _mutex);			\
+	}								\
+	if (tls_ ## name ## _index == 0xFFFFFFFF)			\
+	    return NULL;						\
+	value = TlsGetValue (tls_ ## name ## _index);			\
+	if (!value)							\
+	    value = tls_ ## name ## _alloc ();				\
+	return value;							\
+    }
+
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    tls_ ## name ## _get ()
+
+#elif defined(_MSC_VER)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static __declspec(thread) type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    (&name)
+
+#elif defined(HAVE_PTHREAD_SETSPECIFIC)
+
+#include <pthread.h>
+
+#  define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static pthread_once_t tls_ ## name ## _once_control = PTHREAD_ONCE_INIT; \
+    static pthread_key_t tls_ ## name ## _key;				\
+									\
+    static void								\
+    tls_ ## name ## _destroy_value (void *value)			\
+    {									\
+	free (value);							\
+    }									\
+									\
+    static void								\
+    tls_ ## name ## _make_key (void)					\
+    {									\
+	pthread_key_create (&tls_ ## name ## _key,			\
+			    tls_ ## name ## _destroy_value);		\
+    }									\
+									\
+    static type *							\
+    tls_ ## name ## _alloc (void)					\
+    {									\
+	type *value = calloc (1, sizeof (type));			\
+	if (value)							\
+	    pthread_setspecific (tls_ ## name ## _key, value);		\
+	return value;							\
+    }									\
+									\
+    static force_inline type *						\
+    tls_ ## name ## _get (void)						\
+    {									\
+	type *value = NULL;						\
+	if (pthread_once (&tls_ ## name ## _once_control,		\
+			  tls_ ## name ## _make_key) == 0)		\
+	{								\
+	    value = pthread_getspecific (tls_ ## name ## _key);		\
+	    if (!value)							\
+		value = tls_ ## name ## _alloc ();			\
+	}								\
+	return value;							\
+    }
+
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    tls_ ## name ## _get ()
+
+#else
+
+#    error "Unknown thread local support for this system. Pixman will not work with multiple threads. Define PIXMAN_NO_TLS to acknowledge and accept this limitation and compile pixman without thread-safety support."
+
+#endif
diff --git a/pixman/pixman-conical-gradient.c b/pixman/pixman-conical-gradient.c
new file mode 100644
index 0000000..791d4f3
--- /dev/null
+++ b/pixman/pixman-conical-gradient.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright Â© 2000 SuSE, Inc.
+ * Copyright Â© 2007 Red Hat, Inc.
+ * Copyright Â© 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+/* Turn the direction of (x, y), rotated by angle, into a gradient position. */
+static force_inline double
+coordinates_to_parameter (double x, double y, double angle)
+{
+    double theta = atan2 (y, x) + angle;
+
+    /* Bring theta into [0, 2 * M_PI), one full turn per iteration. */
+    while (theta < 0)
+	theta += 2 * M_PI;
+    while (theta >= 2 * M_PI)
+	theta -= 2 * M_PI;
+
+    /* Normalize by a full turn; subtracting from 1 makes the
+     * rotation counter-clockwise.
+     */
+    return 1 - theta * (1 / (2 * M_PI));
+}
+
+static uint32_t *
+conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{   /* Render scanline iter->y of the conical gradient into iter->buffer. */
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    uint32_t *buffer = iter->buffer;
+
+    gradient_t *gradient = (gradient_t *)image;
+    conical_gradient_t *conical = (conical_gradient_t *)image;
+    uint32_t       *end = buffer + width;
+    pixman_gradient_walker_t walker;
+    pixman_bool_t affine = TRUE;
+    double cx = 1.;	/* (cx, cy, cz): added to (rx, ry, rz) after each pixel */
+    double cy = 0.;
+    double cz = 0.;
+    double rx = x + 0.5;	/* (rx, ry, rz): center of the first pixel */
+    double ry = y + 0.5;
+    double rz = 1.;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    if (image->common.transform)
+    {
+	pixman_vector_t v;
+
+	/* reference point is the center of the pixel */
+	v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+	v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+	v.vector[2] = pixman_fixed_1;
+
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return iter->buffer; /* buffer untouched, iter->y not advanced */
+
+	cx = image->common.transform->matrix[0][0] / 65536.;
+	cy = image->common.transform->matrix[1][0] / 65536.;
+	cz = image->common.transform->matrix[2][0] / 65536.;
+
+	rx = v.vector[0] / 65536.;
+	ry = v.vector[1] / 65536.;
+	rz = v.vector[2] / 65536.;
+
+	affine =
+	    image->common.transform->matrix[2][0] == 0 &&
+	    v.vector[2] == pixman_fixed_1; /* i.e. cz == 0 and rz == 1 */
+    }
+
+    if (affine)
+    {
+	rx -= conical->center.x / 65536.; /* make (rx, ry) center-relative */
+	ry -= conical->center.y / 65536.;
+
+	while (buffer < end)
+	{
+	    if (!mask || *mask++) /* skip pixels whose mask word is 0 */
+	    {
+		double t = coordinates_to_parameter (rx, ry, conical->angle);
+
+		*buffer = _pixman_gradient_walker_pixel (
+		    &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+	    }
+
+	    ++buffer;
+
+	    rx += cx;
+	    ry += cy;
+	}
+    }
+    else
+    {
+	while (buffer < end)
+	{
+	    double x, y; /* shadow the int x, y declared at function scope */
+
+	    if (!mask || *mask++) /* skip pixels whose mask word is 0 */
+	    {
+		double t;
+
+		if (rz != 0)
+		{
+		    x = rx / rz;
+		    y = ry / rz;
+		}
+		else
+		{
+		    x = y = 0.; /* avoid dividing by zero */
+		}
+
+		x -= conical->center.x / 65536.;
+		y -= conical->center.y / 65536.;
+
+		t = coordinates_to_parameter (x, y, conical->angle);
+
+		*buffer = _pixman_gradient_walker_pixel (
+		    &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+	    }
+
+	    ++buffer;
+
+	    rx += cx;
+	    ry += cy;
+	    rz += cz;
+	}
+    }
+
+    iter->y++; /* the next call renders the following scanline */
+    return iter->buffer;
+}
+
+static uint32_t *
+conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{   /* mask is ignored: the narrow fetch below is always unmasked. */
+    uint32_t *buffer = conical_get_scanline_narrow (iter, NULL);
+
+    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+    return buffer; /* same storage, expanded in place (dst == src above) */
+}
+
+void
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    /* Choose the fetcher from the iterator flags; image is not consulted. */
+    iter->get_scanline = (iter->flags & ITER_NARROW)
+	? conical_get_scanline_narrow
+	: conical_get_scanline_wide;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_conical_gradient (pixman_point_fixed_t *        center,
+                                      pixman_fixed_t                angle,
+                                      const pixman_gradient_stop_t *stops,
+                                      int                           n_stops)
+{   /* Returns a new CONICAL image, or NULL if allocation or stop setup fails. */
+    pixman_image_t *image = _pixman_image_allocate ();
+    conical_gradient_t *conical;
+
+    if (!image)
+	return NULL;
+
+    conical = &image->conical;
+
+    if (!_pixman_init_gradient (&conical->common, stops, n_stops))
+    {
+	free (image); /* nothing else to release on this path, presumably */
+	return NULL;
+    }
+
+    angle = MOD (angle, pixman_int_to_fixed (360)); /* degrees, fixed point */
+
+    image->type = CONICAL;
+
+    conical->center = *center; /* center is dereferenced without a NULL check */
+    conical->angle = (pixman_fixed_to_double (angle) / 180.0) * M_PI; /* radians */
+
+    return image;
+}
+
diff --git a/pixman/pixman-cpu.c b/pixman/pixman-cpu.c
new file mode 100644
index 0000000..dff27d1
--- /dev/null
+++ b/pixman/pixman-cpu.c
@@ -0,0 +1,631 @@
+/*
+ * Copyright Â© 2000 SuSE, Inc.
+ * Copyright Â© 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+
+#if defined(USE_ARM_SIMD) && defined(_MSC_VER)
+/* Needed for EXCEPTION_ILLEGAL_INSTRUCTION */
+#include <windows.h>
+#endif
+
+#include "pixman-private.h"
+
+#ifdef USE_VMX
+
+/* The CPU detection code needs to be in a file not compiled with
+ * "-maltivec -mabi=altivec", as gcc would try to save vector register
+ * across function calls causing SIGILL on cpus without Altivec/vmx.
+ */
+static pixman_bool_t initialized = FALSE;
+static volatile pixman_bool_t have_vmx = TRUE;
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{   /* Apple build: query sysctl "hw.optional.altivec" once, then cache. */
+    if (!initialized)
+    {
+	size_t length = sizeof(have_vmx);
+	int error =
+	    sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0);
+
+	if (error)
+	    have_vmx = FALSE; /* query failed: report no VMX */
+
+	initialized = TRUE;
+    }
+    return have_vmx;
+}
+
+#elif defined (__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{   /* OpenBSD build: query sysctl machdep CPU_ALTIVEC once, then cache. */
+    if (!initialized)
+    {
+	int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+	size_t length = sizeof(have_vmx);
+	int error =
+	    sysctl (mib, 2, &have_vmx, &length, NULL, 0);
+
+	if (error != 0)
+	    have_vmx = FALSE; /* query failed: report no VMX */
+
+	initialized = TRUE;
+    }
+    return have_vmx;
+}
+
+#elif defined (__linux__)
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <linux/auxvec.h>
+#include <asm/cputable.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{   /* Linux build: look up AT_HWCAP in /proc/<pid>/auxv once, then cache. */
+    if (!initialized)
+    {
+	char fname[64];
+	unsigned long buf[64]; /* auxv words, scanned two at a time */
+	ssize_t count = 0;
+	pid_t pid;
+	int fd, i;
+
+	pid = getpid ();
+	snprintf (fname, sizeof(fname) - 1, "/proc/%d/auxv", pid);
+
+	fd = open (fname, O_RDONLY);
+	if (fd >= 0)
+	{
+	    for (i = 0; i <= (count / sizeof(unsigned long)); i += 2)
+	    {
+		/* Refill buf once every word read so far has been examined. */
+		if (i == (count / sizeof(unsigned long)))
+		{
+		    count = read (fd, buf, sizeof(buf));
+		    if (count <= 0)
+			break;
+		    i = 0;
+		}
+
+		if (buf[i] == AT_HWCAP) /* buf[i]: type, buf[i + 1]: value */
+		{
+		    have_vmx = !!(buf[i + 1] & PPC_FEATURE_HAS_ALTIVEC);
+		    initialized = TRUE;
+		    break;
+		}
+		else if (buf[i] == AT_NULL)
+		{
+		    break;
+		}
+	    }
+	    close (fd);
+	}
+    }
+    if (!initialized)
+    {
+	/* Something went wrong. Assume 'no' rather than playing
+	   fragile tricks with catching SIGILL. */
+	have_vmx = FALSE;
+	initialized = TRUE;
+    }
+
+    return have_vmx;
+}
+
+#else /* !__APPLE__ && !__OpenBSD__ && !__linux__ */
+#include <signal.h>
+#include <setjmp.h>
+
+static sigjmp_buf jump_env;	/* where the SIGILL probe resumes */
+
+/* SIGILL handler: the probe instruction trapped, so unwind to the probe. */
+static void
+vmx_test (int sig, siginfo_t *si, void *unused)
+{
+    siglongjmp (jump_env, 1);
+}
+
+/* Detect VMX once by executing a vector instruction with SIGILL caught. */
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+    struct sigaction sa, osa;
+    int jmp_result;
+
+    if (!initialized)
+    {
+	sa.sa_flags = SA_SIGINFO;
+	sigemptyset (&sa.sa_mask);
+	sa.sa_sigaction = vmx_test;
+	sigaction (SIGILL, &sa, &osa);
+	/* savemask = 1: the handler runs with SIGILL blocked, and plain
+	 * setjmp/longjmp are not required to restore the signal mask. */
+	jmp_result = sigsetjmp (jump_env, 1);
+	if (jmp_result == 0)
+	    asm volatile ( "vor 0, 0, 0" );
+	sigaction (SIGILL, &osa, NULL);
+	have_vmx = (jmp_result == 0);
+	initialized = TRUE;
+    }
+    return have_vmx;
+}
+
+#endif /* __APPLE__ */
+#endif /* USE_VMX */
+
+#if defined(USE_ARM_SIMD) || defined(USE_ARM_NEON) || defined(USE_ARM_IWMMXT)
+
+#if defined(_MSC_VER)
+
+#if defined(USE_ARM_SIMD)
+extern int pixman_msvc_try_arm_simd_op ();
+
+pixman_bool_t
+pixman_have_arm_simd (void)
+{   /* MSVC build: run the probe op once under SEH and cache the outcome. */
+    static pixman_bool_t initialized = FALSE;
+    static pixman_bool_t have_arm_simd = FALSE;
+
+    if (!initialized)
+    {
+	__try {
+	    pixman_msvc_try_arm_simd_op ();
+	    have_arm_simd = TRUE; /* the probe returned without faulting */
+	} __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) {
+	    have_arm_simd = FALSE; /* illegal instruction: not supported */
+	}
+	initialized = TRUE;
+    }
+
+    return have_arm_simd;
+}
+
+#endif /* USE_ARM_SIMD */
+
+#if defined(USE_ARM_NEON)
+extern int pixman_msvc_try_arm_neon_op ();
+
+pixman_bool_t
+pixman_have_arm_neon (void)
+{   /* MSVC build: run the probe op once under SEH and cache the outcome. */
+    static pixman_bool_t initialized = FALSE;
+    static pixman_bool_t have_arm_neon = FALSE;
+
+    if (!initialized)
+    {
+	__try
+	{
+	    pixman_msvc_try_arm_neon_op ();
+	    have_arm_neon = TRUE; /* the probe returned without faulting */
+	}
+	__except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION)
+	{
+	    have_arm_neon = FALSE; /* illegal instruction: not supported */
+	}
+	initialized = TRUE;
+    }
+
+    return have_arm_neon;
+}
+
+#endif /* USE_ARM_NEON */
+
+#elif defined (__linux__) /* linux ELF */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <elf.h>
+
+static pixman_bool_t arm_has_v7 = FALSE;
+static pixman_bool_t arm_has_v6 = FALSE;
+static pixman_bool_t arm_has_vfp = FALSE;
+static pixman_bool_t arm_has_neon = FALSE;
+static pixman_bool_t arm_has_iwmmxt = FALSE;
+static pixman_bool_t arm_tests_initialized = FALSE;
+
+static void
+pixman_arm_read_auxv (void)
+{   /* Read /proc/self/auxv and cache the ARM feature flags it reports. */
+    int fd;
+    Elf32_auxv_t aux;
+
+    fd = open ("/proc/self/auxv", O_RDONLY);
+    if (fd >= 0)
+    {
+	/* Consume whole entries only; a short read ends the scan. */
+	while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t))
+	{
+	    if (aux.a_type == AT_HWCAP)
+	    {
+		uint32_t hwcap = aux.a_un.a_val;
+		/* hardcode these values to avoid depending on specific
+		 * versions of the hwcap header, e.g. HWCAP_NEON
+		 */
+		arm_has_vfp = (hwcap & 64) != 0;
+		arm_has_iwmmxt = (hwcap & 512) != 0;
+		/* this flag is only present on kernel 2.6.29 */
+		arm_has_neon = (hwcap & 4096) != 0;
+	    }
+	    else if (aux.a_type == AT_PLATFORM)
+	    {
+		const char *plat = (const char*) aux.a_un.a_val;
+		if (strncmp (plat, "v7l", 3) == 0)
+		{
+		    arm_has_v7 = TRUE;
+		    arm_has_v6 = TRUE;
+		}
+		else if (strncmp (plat, "v6l", 3) == 0)
+		{
+		    arm_has_v6 = TRUE;
+		}
+	    }
+	}
+	close (fd);
+    }
+    arm_tests_initialized = TRUE; /* set even if auxv could not be opened */
+}
+
+#if defined(USE_ARM_SIMD)
+pixman_bool_t
+pixman_have_arm_simd (void)
+{
+    if (arm_tests_initialized)
+	return arm_has_v6;	/* aux vector already parsed */
+    pixman_arm_read_auxv ();	/* first query: parse it now */
+    return arm_has_v6;
+}
+
+#endif /* USE_ARM_SIMD */
+
+#if defined(USE_ARM_NEON)
+pixman_bool_t
+pixman_have_arm_neon (void)
+{
+    if (arm_tests_initialized)
+	return arm_has_neon;	/* aux vector already parsed */
+    pixman_arm_read_auxv ();	/* first query: parse it now */
+    return arm_has_neon;
+}
+
+#endif /* USE_ARM_NEON */
+
+#if defined(USE_ARM_IWMMXT)
+pixman_bool_t
+pixman_have_arm_iwmmxt (void)
+{
+    if (arm_tests_initialized)
+	return arm_has_iwmmxt;	/* aux vector already parsed */
+    pixman_arm_read_auxv ();	/* first query: parse it now */
+    return arm_has_iwmmxt;
+}
+
+#endif /* USE_ARM_IWMMXT */
+
+#else /* linux ELF */
+
+#define pixman_have_arm_simd() FALSE
+#define pixman_have_arm_neon() FALSE
+#define pixman_have_arm_iwmmxt() FALSE
+
+#endif
+
+#endif /* USE_ARM_SIMD || USE_ARM_NEON || USE_ARM_IWMMXT */
+
+#if defined(USE_X86_MMX) || defined(USE_SSE2)
+/* The CPU detection code needs to be in a file not compiled with
+ * "-mmmx -msse", as gcc would otherwise generate CMOV instructions,
+ * which raise SIGILL on old CPUs that don't implement
+ * them.
+ */
+#if !defined(__amd64__) && !defined(__x86_64__) && !defined(_M_AMD64)
+
+#ifdef HAVE_GETISAX
+#include <sys/auxv.h>
+#endif
+
+typedef enum
+{
+    NO_FEATURES = 0,
+    MMX = 0x1,
+    MMX_EXTENSIONS = 0x2,
+    SSE = 0x6,			/* 0x4 | 0x2: also sets the MMX_EXTENSIONS bit */
+    SSE2 = 0x8,
+    CMOV = 0x10
+} cpu_features_t;		/* bit flags OR-ed together by detect_cpu_features () */
+
+
+static unsigned int	/* bitmask of cpu_features_t flags found on this CPU (0 if none) */
+detect_cpu_features (void)
+{
+    unsigned int features = 0;
+    unsigned int result = 0;
+
+#ifdef HAVE_GETISAX
+    if (getisax (&result, 1))	/* use getisax () instead of executing CPUID */
+    {
+	if (result & AV_386_CMOV)
+	    features |= CMOV;
+	if (result & AV_386_MMX)
+	    features |= MMX;
+	if (result & AV_386_AMD_MMX)
+	    features |= MMX_EXTENSIONS;
+	if (result & AV_386_SSE)
+	    features |= SSE;
+	if (result & AV_386_SSE2)
+	    features |= SSE2;
+    }
+#else
+    char vendor[13];
+#ifdef _MSC_VER
+    int vendor0 = 0, vendor1 = 0, vendor2 = 0;	/* all zeroed: memmove below reads them even without CPUID */
+#endif
+    vendor[0] = 0;
+    vendor[12] = 0;
+
+#ifdef __GNUC__
+    /* see p. 118 of amd64 instruction set manual Vol3 */
+    /* We need to be careful about the handling of %ebx and
+     * %esp here. We can't declare either one as clobbered
+     * since they are special registers (%ebx is the "PIC
+     * register" holding an offset to global data, %esp the
+     * stack pointer), so we need to make sure they have their
+     * original values when we access the output operands.
+     */
+    __asm__ (
+        "pushf\n"
+        "pop %%eax\n"
+        "mov %%eax, %%ecx\n"
+        "xor $0x00200000, %%eax\n"	/* toggle EFLAGS bit 21 (the ID flag) */
+        "push %%eax\n"
+        "popf\n"
+        "pushf\n"
+        "pop %%eax\n"
+        "mov $0x0, %%edx\n"		/* result stays 0 if the branch below is taken */
+        "xor %%ecx, %%eax\n"
+        "jz 1f\n"			/* bit did not change: no CPUID instruction */
+
+        "mov $0x00000000, %%eax\n"	/* leaf 0: vendor string in ebx, edx, ecx */
+        "push %%ebx\n"
+        "cpuid\n"
+        "mov %%ebx, %%eax\n"
+        "pop %%ebx\n"
+        "mov %%eax, %1\n"
+        "mov %%edx, %2\n"
+        "mov %%ecx, %3\n"
+        "mov $0x00000001, %%eax\n"	/* leaf 1: feature bits in edx */
+        "push %%ebx\n"
+        "cpuid\n"
+        "pop %%ebx\n"
+        "1:\n"
+        "mov %%edx, %0\n"
+	: "=r" (result),
+        "=m" (vendor[0]),
+        "=m" (vendor[4]),
+        "=m" (vendor[8])
+	:
+	: "%eax", "%ecx", "%edx"
+        );
+
+#elif defined (_MSC_VER)
+
+    _asm {
+	pushfd
+	pop eax
+	mov ecx, eax
+	xor eax, 00200000h
+	push eax
+	popfd
+	pushfd
+	pop eax
+	mov edx, 0
+	xor eax, ecx
+	jz nocpuid
+
+	mov eax, 0
+	push ebx
+	cpuid
+	mov eax, ebx
+	pop ebx
+	mov vendor0, eax
+	mov vendor1, edx
+	mov vendor2, ecx
+	mov eax, 1
+	push ebx
+	cpuid
+	pop ebx
+    nocpuid:
+	mov result, edx
+    }
+    memmove (vendor + 0, &vendor0, 4);
+    memmove (vendor + 4, &vendor1, 4);
+    memmove (vendor + 8, &vendor2, 4);
+
+#else
+#   error unsupported compiler
+#endif
+
+    features = 0;
+    if (result)
+    {
+	/* result now contains the standard feature bits */
+	if (result & (1 << 15))
+	    features |= CMOV;
+	if (result & (1 << 23))
+	    features |= MMX;
+	if (result & (1 << 25))
+	    features |= SSE;
+	if (result & (1 << 26))
+	    features |= SSE2;
+	if ((features & MMX) && !(features & SSE) &&
+	    (strcmp (vendor, "AuthenticAMD") == 0 ||
+	     strcmp (vendor, "Geode by NSC") == 0))
+	{
+	    /* check for AMD MMX extensions */
+#ifdef __GNUC__
+	    __asm__ (
+	        "	push %%ebx\n"
+	        "	mov $0x80000000, %%eax\n"
+	        "	cpuid\n"
+	        "	xor %%edx, %%edx\n"
+	        "	cmp $0x1, %%eax\n"
+	        "	jge 2f\n"		/* signed test: 0x8000xxxx is negative, so falls through */
+	        "	mov $0x80000001, %%eax\n"
+	        "	cpuid\n"
+	        "2:\n"
+	        "	pop %%ebx\n"
+	        "	mov %%edx, %0\n"
+		: "=r" (result)
+		:
+		: "%eax", "%ecx", "%edx"
+	        );
+#elif defined _MSC_VER
+	    _asm {
+		push ebx
+		mov eax, 80000000h
+		cpuid
+		xor edx, edx
+		cmp eax, 1
+		jge notamd
+		mov eax, 80000001h
+		cpuid
+	    notamd:
+		pop ebx
+		mov result, edx
+	    }
+#endif
+	    if (result & (1 << 22))	/* bit 22 of the extended feature word */
+		features |= MMX_EXTENSIONS;
+	}
+    }
+#endif /* HAVE_GETISAX */
+
+    return features;
+}
+
+static pixman_bool_t
+pixman_have_mmx (void)
+{
+    /* Probe once; later calls return the cached answer. */
+    static pixman_bool_t probed = FALSE;
+    static pixman_bool_t mmx_present;
+    const unsigned int wanted = MMX | MMX_EXTENSIONS;
+
+    if (!probed)
+    {
+	mmx_present = (detect_cpu_features () & wanted) == wanted;
+	probed = TRUE;
+    }
+    return mmx_present;
+}
+
+#ifdef USE_SSE2
+static pixman_bool_t
+pixman_have_sse2 (void)
+{
+    /* Probe once; later calls return the cached answer. */
+    static pixman_bool_t probed = FALSE;
+    static pixman_bool_t sse2_present;
+    const unsigned int wanted = MMX | MMX_EXTENSIONS | SSE | SSE2;
+
+    if (!probed)
+    {
+	sse2_present = (detect_cpu_features () & wanted) == wanted;
+	probed = TRUE;
+    }
+    return sse2_present;
+}
+
+#endif
+
+#else /* __amd64__ */
+#ifdef USE_X86_MMX
+#define pixman_have_mmx() TRUE
+#endif
+#ifdef USE_SSE2
+#define pixman_have_sse2() TRUE
+#endif
+#endif /* __amd64__ */
+#endif
+
+pixman_implementation_t *
+_pixman_choose_implementation (void)
+{
+    pixman_implementation_t *imp;
+
+    imp = _pixman_implementation_create_general();
+    imp = _pixman_implementation_create_fast_path (imp);
+    /* Each layer below is added only if compiled in and its pixman_have_* check passes. */
+#ifdef USE_X86_MMX
+    if (pixman_have_mmx ())
+	imp = _pixman_implementation_create_mmx (imp);
+#endif
+
+#ifdef USE_SSE2
+    if (pixman_have_sse2 ())
+	imp = _pixman_implementation_create_sse2 (imp);
+#endif
+
+#ifdef USE_ARM_SIMD
+    if (pixman_have_arm_simd ())
+	imp = _pixman_implementation_create_arm_simd (imp);
+#endif
+
+#ifdef USE_ARM_IWMMXT
+    if (pixman_have_arm_iwmmxt ())
+	imp = _pixman_implementation_create_mmx (imp);	/* iwMMXt uses the MMX constructor */
+#endif
+
+#ifdef USE_ARM_NEON
+    if (pixman_have_arm_neon ())
+	imp = _pixman_implementation_create_arm_neon (imp);
+#endif
+
+#ifdef USE_VMX
+    if (pixman_have_vmx ())
+	imp = _pixman_implementation_create_vmx (imp);
+#endif
+
+    imp = _pixman_implementation_create_noop (imp);
+    /* The result is the layer created last; each create call was handed the previous one. */
+    return imp;
+}
+
diff --git a/pixman/pixman-edge-accessors.c b/pixman/pixman-edge-accessors.c
new file mode 100644
index 0000000..ea3a31e
--- /dev/null
+++ b/pixman/pixman-edge-accessors.c
@@ -0,0 +1,4 @@
+
+#define PIXMAN_FB_ACCESSORS
+
+#include "pixman-edge.c"
diff --git a/pixman/pixman-edge-imp.h b/pixman/pixman-edge-imp.h
new file mode 100644
index 0000000..a4698ed
--- /dev/null
+++ b/pixman/pixman-edge-imp.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef rasterize_span
+#endif
+
+static void	/* template body: the includer defines RASTERIZE_EDGES and N_BITS */
+RASTERIZE_EDGES (pixman_image_t  *image,
+		pixman_edge_t	*l,
+		pixman_edge_t	*r,
+		pixman_fixed_t		t,
+		pixman_fixed_t		b)
+{
+    pixman_fixed_t  y = t;
+    uint32_t  *line;
+    uint32_t *buf = (image)->bits.bits;
+    int stride = (image)->bits.rowstride;	/* added to a uint32_t pointer, so counted in 32-bit words */
+    int width = (image)->bits.width;
+
+    line = buf + pixman_fixed_to_int (y) * stride;
+
+    for (;;)	/* one pass per sample row; leaves only when y == b */
+    {
+	pixman_fixed_t	lx;
+	pixman_fixed_t      rx;
+	int	lxi;
+	int rxi;
+
+	lx = l->x;
+	rx = r->x;
+#if N_BITS == 1
+	/* For the non-antialiased case, round the coordinates up, in effect
+	 * sampling just slightly to the left of the pixel. This is so that
+	 * when the sample point lies exactly on the line, we round towards
+	 * north-west.
+	 *
+	 * (The AA case does a similar  adjustment in RENDER_SAMPLES_X)
+	 */
+	lx += X_FRAC_FIRST(1) - pixman_fixed_e;
+	rx += X_FRAC_FIRST(1) - pixman_fixed_e;
+#endif
+	/* clip X */
+	if (lx < 0)
+	    lx = 0;
+	if (pixman_fixed_to_int (rx) >= width)
+#if N_BITS == 1
+	    rx = pixman_int_to_fixed (width);
+#else
+	    /* Use the last pixel of the scanline, covered 100%.
+	     * We can't use the first pixel following the scanline,
+	     * because accessing it could result in a buffer overrun.
+	     */
+	    rx = pixman_int_to_fixed (width) - 1;
+#endif
+
+	/* Skip empty (or backwards) sections */
+	if (rx > lx)
+	{
+
+	    /* Find pixel bounds for span */
+	    lxi = pixman_fixed_to_int (lx);
+	    rxi = pixman_fixed_to_int (rx);
+
+#if N_BITS == 1
+	    {
+
+#define LEFT_MASK(x)							\
+		(((x) & 0x1f) ?						\
+		 SCREEN_SHIFT_RIGHT (0xffffffff, (x) & 0x1f) : 0)
+#define RIGHT_MASK(x)							\
+		(((32 - (x)) & 0x1f) ?					\
+		 SCREEN_SHIFT_LEFT (0xffffffff, (32 - (x)) & 0x1f) : 0)
+		
+#define MASK_BITS(x,w,l,n,r) {						\
+		    n = (w);						\
+		    r = RIGHT_MASK ((x) + n);				\
+		    l = LEFT_MASK (x);					\
+		    if (l) {						\
+			n -= 32 - ((x) & 0x1f);				\
+			if (n < 0) {					\
+			    n = 0;					\
+			    l &= r;					\
+			    r = 0;					\
+			}						\
+		    }							\
+		    n >>= 5;						\
+		}
+		
+		uint32_t  *a = line;
+		uint32_t  startmask;
+		uint32_t  endmask;
+		int	    nmiddle;
+		int	    width = rxi - lxi;	/* span length in pixels; shadows the image width above */
+		int	    x = lxi;
+		
+		a += x >> 5;	/* 32 one-bit pixels per word */
+		x &= 0x1f;
+		
+		MASK_BITS (x, width, startmask, nmiddle, endmask);
+
+		if (startmask) {	/* partial first word: OR in the bits */
+		    WRITE(image, a, READ(image, a) | startmask);
+		    a++;
+		}
+		while (nmiddle--)	/* whole words are set outright */
+		    WRITE(image, a++, 0xffffffff);
+		if (endmask)	/* partial last word */
+		    WRITE(image, a, READ(image, a) | endmask);
+	    }
+#else
+	    {
+		DEFINE_ALPHA(line,lxi);
+		int	    lxs;
+		int     rxs;
+
+		/* Sample coverage for edge pixels */
+		lxs = RENDER_SAMPLES_X (lx, N_BITS);
+		rxs = RENDER_SAMPLES_X (rx, N_BITS);
+
+		/* Add coverage across row */
+		if (lxi == rxi)
+		{
+		    ADD_ALPHA (rxs - lxs);
+		}
+		else
+		{
+		    int	xi;
+
+		    ADD_ALPHA (N_X_FRAC(N_BITS) - lxs);	/* left edge pixel */
+		    STEP_ALPHA;
+		    for (xi = lxi + 1; xi < rxi; xi++)	/* interior pixels get a full row of samples */
+		    {
+			ADD_ALPHA (N_X_FRAC(N_BITS));
+			STEP_ALPHA;
+		    }
+		    ADD_ALPHA (rxs);	/* right edge pixel */
+		}
+	    }
+#endif
+	}
+
+	if (y == b)
+	    break;
+
+#if N_BITS > 1
+	if (pixman_fixed_frac (y) != Y_FRAC_LAST(N_BITS))
+	{
+	    RENDER_EDGE_STEP_SMALL (l);
+	    RENDER_EDGE_STEP_SMALL (r);
+	    y += STEP_Y_SMALL(N_BITS);
+	}
+	else
+#endif
+	{
+	    RENDER_EDGE_STEP_BIG (l);
+	    RENDER_EDGE_STEP_BIG (r);
+	    y += STEP_Y_BIG(N_BITS);
+	    line += stride;	/* only a big step moves to the next scanline */
+	}
+    }
+}
+
+#undef rasterize_span
diff --git a/pixman/pixman-edge.c b/pixman/pixman-edge.c
new file mode 100644
index 0000000..8d498ab
--- /dev/null
+++ b/pixman/pixman-edge.c
@@ -0,0 +1,384 @@
+/*
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+
+#include "pixman-private.h"
+#include "pixman-accessor.h"
+
+/*
+ * Step across a small sample grid gap (x and error term e; carry when e > 0)
+ */
+#define RENDER_EDGE_STEP_SMALL(edge)					\
+    {									\
+	(edge)->x += (edge)->stepx_small;				\
+	(edge)->e += (edge)->dx_small;					\
+	if ((edge)->e > 0)						\
+	{								\
+	    (edge)->e -= (edge)->dy;					\
+	    (edge)->x += (edge)->signdx;				\
+	}								\
+    }
+
+/*
+ * Step across a large sample grid gap (x and error term e; carry when e > 0)
+ */
+#define RENDER_EDGE_STEP_BIG(edge)					\
+    {									\
+	(edge)->x += (edge)->stepx_big;					\
+	(edge)->e += (edge)->dx_big;					\
+	if ((edge)->e > 0)						\
+	{								\
+	    (edge)->e -= (edge)->dy;					\
+	    (edge)->x += (edge)->signdx;				\
+	}								\
+    }
+
+#ifdef PIXMAN_FB_ACCESSORS
+#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_accessors
+#else
+#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_no_accessors
+#endif
+
+/*
+ * 4 bit alpha: two pixels per byte, generated as rasterize_edges_4
+ */
+
+#define N_BITS  4
+#define RASTERIZE_EDGES rasterize_edges_4
+
+#ifndef WORDS_BIGENDIAN
+#define SHIFT_4(o)      ((o) << 2)
+#else
+#define SHIFT_4(o)      ((1 - (o)) << 2)
+#endif
+/* GET_4/PUT_4: read or replace the nibble of byte x selected by o (0 or 1) */
+#define GET_4(x, o)      (((x) >> SHIFT_4 (o)) & 0xf)
+#define PUT_4(x, o, v)							\
+    (((x) & ~(0xf << SHIFT_4 (o))) | (((v) & 0xf) << SHIFT_4 (o)))
+/* DEFINE_ALPHA: declares the byte pointer __ap and nibble index __ao for pixel x */
+#define DEFINE_ALPHA(line, x)						\
+    uint8_t   *__ap = (uint8_t *) line + ((x) >> 1);			\
+    int __ao = (x) & 1
+/* STEP_ALPHA: next pixel; the byte pointer advances only after nibble 1 */
+#define STEP_ALPHA      ((__ap += __ao), (__ao ^= 1))
+/* ADD_ALPHA: add a to the current nibble; a sum of 16..31 is stored as 0xf */
+#define ADD_ALPHA(a)							\
+    {									\
+        uint8_t __o = READ (image, __ap);				\
+        uint8_t __a = (a) + GET_4 (__o, __ao);				\
+        WRITE (image, __ap, PUT_4 (__o, __ao, __a | (0 - ((__a) >> 4)))); \
+    }
+
+#include "pixman-edge-imp.h"
+
+#undef ADD_ALPHA
+#undef STEP_ALPHA
+#undef DEFINE_ALPHA
+#undef RASTERIZE_EDGES
+#undef N_BITS
+
+
+/*
+ * 1 bit alpha: pixman-edge-imp.h compiled with N_BITS == 1, as rasterize_edges_1
+ */
+
+#define N_BITS 1
+#define RASTERIZE_EDGES rasterize_edges_1
+
+#include "pixman-edge-imp.h"
+
+#undef RASTERIZE_EDGES
+#undef N_BITS
+
+/*
+ * 8 bit alpha
+ */
+
+static force_inline uint8_t
+clip255 (int x)
+{
+    /* Saturate at the top only; anything <= 255 is returned as given
+     * (converted to uint8_t by the return).
+     */
+    return (x > 255) ? 255 : x;
+}
+
+#define ADD_SATURATE_8(buf, val, length)				\
+    do									\
+    {									\
+        int i__ = (length);						\
+        uint8_t *buf__ = (buf);						\
+        int val__ = (val);						\
+	/* add val to each of length bytes at buf, clamping at 255 */	\
+        while (i__--)							\
+        {								\
+            WRITE (image, (buf__), clip255 (READ (image, (buf__)) + (val__))); \
+            (buf__)++;							\
+	}								\
+    } while (0)
+
+/*
+ * We want to detect the case where we add the same value to a long
+ * span of pixels.  The triangles on the end are filled in while we
+ * count how many sub-pixel scanlines contribute to the middle section.
+ *
+ *                 +--------------------------+
+ *  fill_height =|   \                      /
+ *                     +------------------+
+ *                      |================|
+ *                   fill_start       fill_end
+ */
+static void	/* 8-bit alpha rasterizer; defers long interior spans (see diagram above) */
+rasterize_edges_8 (pixman_image_t *image,
+                   pixman_edge_t * l,
+                   pixman_edge_t * r,
+                   pixman_fixed_t  t,
+                   pixman_fixed_t  b)
+{
+    pixman_fixed_t y = t;
+    uint32_t  *line;
+    int fill_start = -1, fill_end = -1;	/* pending deferred span [start, end); -1 = none */
+    int fill_size = 0;	/* sample rows accumulated for the pending span */
+    uint32_t *buf = (image)->bits.bits;
+    int stride = (image)->bits.rowstride;
+    int width = (image)->bits.width;
+
+    line = buf + pixman_fixed_to_int (y) * stride;
+
+    for (;;)	/* one pass per sample row; leaves only when y == b */
+    {
+        uint8_t *ap = (uint8_t *) line;	/* one byte per pixel */
+        pixman_fixed_t lx, rx;
+        int lxi, rxi;
+
+        /* clip X */
+        lx = l->x;
+        if (lx < 0)
+	    lx = 0;
+
+        rx = r->x;
+
+        if (pixman_fixed_to_int (rx) >= width)
+	{
+	    /* Use the last pixel of the scanline, covered 100%.
+	     * We can't use the first pixel following the scanline,
+	     * because accessing it could result in a buffer overrun.
+	     */
+	    rx = pixman_int_to_fixed (width) - 1;
+	}
+
+        /* Skip empty (or backwards) sections */
+        if (rx > lx)
+        {
+            int lxs, rxs;
+
+            /* Find pixel bounds for span. */
+            lxi = pixman_fixed_to_int (lx);
+            rxi = pixman_fixed_to_int (rx);
+
+            /* Sample coverage for edge pixels */
+            lxs = RENDER_SAMPLES_X (lx, 8);
+            rxs = RENDER_SAMPLES_X (rx, 8);
+
+            /* Add coverage across row */
+            if (lxi == rxi)
+            {
+                WRITE (image, ap + lxi,
+		       clip255 (READ (image, ap + lxi) + rxs - lxs));
+	    }
+            else
+            {
+                WRITE (image, ap + lxi,
+		       clip255 (READ (image, ap + lxi) + N_X_FRAC (8) - lxs));
+
+                /* Move forward so that lxi/rxi is the pixel span */
+                lxi++;
+
+                /* Don't bother trying to optimize the fill unless
+		 * the span is longer than 4 pixels. */
+                if (rxi - lxi > 4)
+                {
+                    if (fill_start < 0)	/* nothing pending: start a new deferred span */
+                    {
+                        fill_start = lxi;
+                        fill_end = rxi;
+                        fill_size++;
+		    }
+                    else
+                    {
+                        if (lxi >= fill_end || rxi < fill_start)
+                        {
+                            /* We're beyond what we saved, just fill it */
+                            ADD_SATURATE_8 (ap + fill_start,
+                                            fill_size * N_X_FRAC (8),
+                                            fill_end - fill_start);
+                            fill_start = lxi;
+                            fill_end = rxi;
+                            fill_size = 1;
+			}
+                        else
+                        {
+                            /* Update fill_start */
+                            if (lxi > fill_start)	/* span shrank on the left: flush the part cut off */
+                            {
+                                ADD_SATURATE_8 (ap + fill_start,
+                                                fill_size * N_X_FRAC (8),
+                                                lxi - fill_start);
+                                fill_start = lxi;
+			    }
+                            else if (lxi < fill_start)	/* extra pixels on the left get this row only */
+                            {
+                                ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8),
+                                                fill_start - lxi);
+			    }
+
+                            /* Update fill_end */
+                            if (rxi < fill_end)	/* span shrank on the right: flush the part cut off */
+                            {
+                                ADD_SATURATE_8 (ap + rxi,
+                                                fill_size * N_X_FRAC (8),
+                                                fill_end - rxi);
+                                fill_end = rxi;
+			    }
+                            else if (fill_end < rxi)	/* extra pixels on the right get this row only */
+                            {
+                                ADD_SATURATE_8 (ap + fill_end,
+                                                N_X_FRAC (8),
+                                                rxi - fill_end);
+			    }
+                            fill_size++;
+			}
+		    }
+		}
+                else
+                {
+                    ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8), rxi - lxi);
+		}
+
+                WRITE (image, ap + rxi, clip255 (READ (image, ap + rxi) + rxs));
+	    }
+	}
+
+        if (y == b)
+        {
+            /* We're done, make sure we clean up any remaining fill. */
+            if (fill_start != fill_end)
+            {
+                if (fill_size == N_Y_FRAC (8))	/* every sample row contributed: store 0xff outright */
+                {
+                    MEMSET_WRAPPED (image, ap + fill_start,
+				    0xff, fill_end - fill_start);
+		}
+                else
+                {
+                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8),
+                                    fill_end - fill_start);
+		}
+	    }
+            break;
+	}
+
+        if (pixman_fixed_frac (y) != Y_FRAC_LAST (8))
+        {
+            RENDER_EDGE_STEP_SMALL (l);
+            RENDER_EDGE_STEP_SMALL (r);
+            y += STEP_Y_SMALL (8);
+	}
+        else
+        {
+            RENDER_EDGE_STEP_BIG (l);
+            RENDER_EDGE_STEP_BIG (r);
+            y += STEP_Y_BIG (8);
+            if (fill_start != fill_end)	/* leaving this scanline: flush the pending span */
+            {
+                if (fill_size == N_Y_FRAC (8))
+                {
+                    MEMSET_WRAPPED (image, ap + fill_start,
+				    0xff, fill_end - fill_start);
+		}
+                else
+                {
+                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8),
+                                    fill_end - fill_start);
+		}
+		
+                fill_start = fill_end = -1;
+                fill_size = 0;
+	    }
+	    
+            line += stride;
+	}
+    }
+}
+
+#ifndef PIXMAN_FB_ACCESSORS
+static	/* the no-accessors variant is file-local; the accessors variant has external linkage */
+#endif
+void
+PIXMAN_RASTERIZE_EDGES (pixman_image_t *image,
+                        pixman_edge_t * l,
+                        pixman_edge_t * r,
+                        pixman_fixed_t  t,
+                        pixman_fixed_t  b)
+{
+    switch (PIXMAN_FORMAT_BPP (image->bits.format))	/* dispatch on bits per pixel */
+    {
+    case 1:
+	rasterize_edges_1 (image, l, r, t, b);
+	break;
+
+    case 4:
+	rasterize_edges_4 (image, l, r, t, b);
+	break;
+
+    case 8:
+	rasterize_edges_8 (image, l, r, t, b);
+	break;
+
+    default:	/* any other depth: nothing is drawn */
+        break;
+    }
+}
+
+#ifndef PIXMAN_FB_ACCESSORS
+
+PIXMAN_EXPORT void
+pixman_rasterize_edges (pixman_image_t *image,
+                        pixman_edge_t * l,
+                        pixman_edge_t * r,
+                        pixman_fixed_t  t,
+                        pixman_fixed_t  b)
+{
+    return_if_fail (image->type == BITS);
+    /* Only images with a read or write hook take the accessor build. */
+    if (!image->bits.read_func && !image->bits.write_func)
+	pixman_rasterize_edges_no_accessors (image, l, r, t, b);
+    else
+	pixman_rasterize_edges_accessors (image, l, r, t, b);
+}
+
+#endif
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
new file mode 100644
index 0000000..038dcf7
--- /dev/null
+++ b/pixman/pixman-fast-path.c
@@ -0,0 +1,2166 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+static force_inline uint32_t
+fetch_24 (uint8_t *a)	/* reads the 3 bytes at a as one 24-bit value in host byte order */
+{
+    if (((unsigned long)a) & 1)	/* odd address: one byte, then a 16-bit load at a + 1 */
+    {
+#ifdef WORDS_BIGENDIAN
+	return (*a << 16) | (*(uint16_t *)(a + 1));
+#else
+	return *a | (*(uint16_t *)(a + 1) << 8);
+#endif
+    }
+    else	/* even address: 16-bit load at a, then the byte at a + 2 */
+    {
+#ifdef WORDS_BIGENDIAN
+	return (*(uint16_t *)a << 8) | *(a + 2);
+#else
+	return *(uint16_t *)a | (*(a + 2) << 16);
+#endif
+    }
+}
+
+static force_inline void	/* writes the low 24 bits of v to the 3 bytes at a, host byte order */
+store_24 (uint8_t *a,
+          uint32_t v)
+{
+    if (((unsigned long)a) & 1)	/* odd address: one byte, then a 16-bit store at a + 1 */
+    {
+#ifdef WORDS_BIGENDIAN
+	*a = (uint8_t) (v >> 16);
+	*(uint16_t *)(a + 1) = (uint16_t) (v);
+#else
+	*a = (uint8_t) (v);
+	*(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
+#endif
+    }
+    else	/* even address: 16-bit store at a, then the byte at a + 2 */
+    {
+#ifdef WORDS_BIGENDIAN
+	*(uint16_t *)a = (uint16_t)(v >> 8);
+	*(a + 2) = (uint8_t)v;
+#else
+	*(uint16_t *)a = (uint16_t)v;
+	*(a + 2) = (uint8_t)(v >> 16);
+#endif
+    }
+}
+
+static force_inline uint32_t
+over (uint32_t src,
+      uint32_t dest)
+{
+    uint32_t inv_alpha = ~src >> 24;	/* 255 minus the top byte of src */
+
+    UN8x4_MUL_UN8_ADD_UN8x4 (dest, inv_alpha, src);	/* updates dest in place */
+
+    return dest;
+}
+
+static force_inline uint32_t
+in (uint32_t x,
+    uint8_t  y)
+{
+    uint16_t a = y;	/* widened copy handed to the macro */
+
+    UN8x4_MUL_UN8 (x, a);	/* updates x in place; presumably scales each 8-bit channel by a */
+
+    return x;
+}
+
+/*
+ * Naming convention:
+ *
+ *  op_src_mask_dest
+ */
+static void	/* per the naming convention: op OVER, x888 source, 8-bit mask, 8888 destination */
+fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);	/* presumably declares src_image, dest_x, width, height, ... used below */
+    uint32_t    *src, *src_line;
+    uint32_t    *dst, *dst_line;
+    uint8_t     *mask, *mask_line;
+    int src_stride, mask_stride, dst_stride;
+    uint8_t m;
+    uint32_t s, d;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	src = src_line;
+	src_line += src_stride;
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	w = width;
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m)	/* a zero mask byte leaves the destination pixel untouched */
+	    {
+		s = *src | 0xff000000;	/* force the top (alpha) byte to 0xff */
+
+		if (m == 0xff)
+		{
+		    *dst = s;	/* full mask: plain store */
+		}
+		else
+		{
+		    d = in (s, m);
+		    *dst = over (d, *dst);
+		}
+	    }
+	    src++;
+	    dst++;
+	}
+    }
+}
+
+static void	/* per the naming convention: op IN, solid (n) source, 8-bit mask, 8-bit destination */
+fast_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);	/* presumably declares src_image, dest_x, width, height, ... used below */
+    uint32_t src, srca;
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint16_t t;	/* scratch temporary passed to MUL_UN8 */
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;	/* top byte of the solid colour */
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    if (srca == 0xff)	/* opaque source: the mask byte is used as-is */
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    while (w--)
+	    {
+		m = *mask++;
+
+		if (m == 0)
+		    *dst = 0;
+		else if (m != 0xff)	/* m == 0xff leaves *dst unchanged */
+		    *dst = MUL_UN8 (m, *dst, t);
+
+		dst++;
+	    }
+	}
+    }
+    else
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    while (w--)
+	    {
+		m = *mask++;
+		m = MUL_UN8 (m, srca, t);	/* combine the mask byte with the source alpha first */
+
+		if (m == 0)
+		    *dst = 0;
+		else if (m != 0xff)
+		    *dst = MUL_UN8 (m, *dst, t);
+
+		dst++;
+	    }
+	}
+    }
+}
+
+static void	/* op IN with an 8-bit source and an 8-bit destination; the mask image is not read */
+fast_composite_in_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);	/* presumably declares src_image, dest_x, width, height, ... used below */
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s;
+    uint16_t t;	/* scratch temporary passed to MUL_UN8 */
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+
+	    if (s == 0)
+		*dst = 0;
+	    else if (s != 0xff)	/* s == 0xff leaves *dst unchanged */
+		*dst = MUL_UN8 (s, *dst, t);
+
+	    dst++;
+	}
+    }
+}
+
+static void	/* per the naming convention: op OVER, solid (n) source, 8-bit mask, 8888 destination */
+fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);	/* presumably declares src_image, dest_x, width, height, ... used below */
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst, d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)	/* an all-zero source changes nothing */
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m == 0xff)
+	    {
+		if (srca == 0xff)
+		    *dst = src;	/* opaque source under a full mask: plain store */
+		else
+		    *dst = over (src, *dst);
+	    }
+	    else if (m)	/* m == 0 leaves the destination pixel untouched */
+	    {
+		d = in (src, m);
+		*dst = over (d, *dst);
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void	/* op ADD, solid (n) source, 32-bit mask (the _ca variant), 8888 destination */
+fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+				   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);	/* presumably declares src_image, dest_x, width, height, ... used below */
+    uint32_t src, s;
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)	/* an all-zero source changes nothing */
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    ma = *mask++;
+
+	    if (ma)	/* a zero mask word leaves the destination pixel untouched */
+	    {
+		d = *dst;
+		s = src;
+
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);	/* result is left in s */
+
+		*dst = s;
+	    }
+
+	    dst++;
+	}
+    }
+}
+/* OVER with a solid source and a 32-bit mask (one mask byte per channel) onto a 32-bit destination. */
+static void
+fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca, s;
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    /* srca is the top byte of the solid colour; an all-zero colour is a no-op. */
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    ma = *mask++;
+	    if (ma == 0xffffffff)  /* every mask byte is 0xff: src is used as it is */
+	    {
+		if (srca == 0xff)
+		    *dst = src;
+		else
+		    *dst = over (src, *dst);
+	    }
+	    else if (ma)
+	    {
+		d = *dst;
+		s = src;
+		/* Effects below are inferred from the macro names (first argument updated in place) - TODO confirm. */
+		UN8x4_MUL_UN8x4 (s, ma);  /* s <- s * ma, channel by channel */
+		UN8x4_MUL_UN8 (ma, srca);  /* ma <- ma * srca */
+		ma = ~ma;  /* complement every mask byte */
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);  /* d <- d * ma + s */
+
+		*dst = d;
+	    }
+
+	    dst++;
+	}
+    }
+}
+/* OVER with a solid source and an 8-bit mask onto a destination of 3 bytes per pixel. */
+static void
+fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint8_t     *dst_line, *dst;
+    uint32_t d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    /* srca is the top byte of the solid colour; an all-zero colour is a no-op. */
+    srca = src >> 24;
+    if (src == 0)
+	return;
+    /* The destination is addressed as bytes, with x scaled by 3 (last macro argument). */
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m == 0xff)  /* mask byte is 0xff: src is used as it is */
+	    {
+		if (srca == 0xff)
+		{
+		    d = src;  /* no need to read the destination */
+		}
+		else
+		{
+		    d = fetch_24 (dst);
+		    d = over (src, d);
+		}
+		store_24 (dst, d);
+	    }
+	    else if (m)  /* partial mask: in() the colour, then blend */
+	    {
+		d = over (in (src, m), fetch_24 (dst));
+		store_24 (dst, d);
+	    }
+	    dst += 3;  /* one pixel = three bytes */
+	}
+    }
+}
+/* OVER with a solid source and an 8-bit mask onto a 16-bit (0565) destination. */
+static void
+fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    /* srca is the top byte of the solid colour; an all-zero colour is a no-op. */
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m == 0xff)  /* mask byte is 0xff: src is used as it is */
+	    {
+		if (srca == 0xff)
+		{
+		    d = src;  /* no need to read the destination */
+		}
+		else
+		{
+		    d = *dst;  /* widen the 16-bit pixel before blending */
+		    d = over (src, CONVERT_0565_TO_0888 (d));
+		}
+		*dst = CONVERT_8888_TO_0565 (d);
+	    }
+	    else if (m)  /* partial mask: in() the colour, blend, pack back to 16 bits */
+	    {
+		d = *dst;
+		d = over (in (src, m), CONVERT_0565_TO_0888 (d));
+		*dst = CONVERT_8888_TO_0565 (d);
+	    }
+	    dst++;
+	}
+    }
+}
+/* OVER with a solid source and a 32-bit mask (one mask byte per channel) onto a 16-bit (0565) destination. */
+static void
+fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t  src, srca, s;
+    uint16_t  src16;
+    uint16_t *dst_line, *dst;
+    uint32_t  d;
+    uint32_t *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    /* srca is the top byte of the solid colour; an all-zero colour is a no-op. */
+    srca = src >> 24;
+    if (src == 0)
+	return;
+    /* Pack the colour to 16 bits once; used when neither mask nor alpha changes it. */
+    src16 = CONVERT_8888_TO_0565 (src);
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    ma = *mask++;
+	    if (ma == 0xffffffff)  /* every mask byte is 0xff: src is used as it is */
+	    {
+		if (srca == 0xff)
+		{
+		    *dst = src16;
+		}
+		else
+		{
+		    d = *dst;
+		    d = over (src, CONVERT_0565_TO_0888 (d));
+		    *dst = CONVERT_8888_TO_0565 (d);
+		}
+	    }
+	    else if (ma)
+	    {
+		d = *dst;
+		d = CONVERT_0565_TO_0888 (d);
+		/* Macro effects below are inferred from their names (first argument updated in place) - TODO confirm. */
+		s = src;
+
+		UN8x4_MUL_UN8x4 (s, ma);  /* s <- s * ma, channel by channel */
+		UN8x4_MUL_UN8 (ma, srca);  /* ma <- ma * srca */
+		ma = ~ma;  /* complement every mask byte */
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);  /* d <- d * ma + s */
+
+		*dst = CONVERT_8888_TO_0565 (d);
+	    }
+	    dst++;
+	}
+    }
+}
+/* OVER from a 32-bit source image onto a 32-bit destination, no mask. */
+static void
+fast_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    uint8_t a;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;  /* top byte of the source pixel */
+	    if (a == 0xff)
+		*dst = s;  /* top byte 0xff: plain copy, destination not read */
+	    else if (s)  /* an all-zero source pixel leaves the destination alone */
+		*dst = over (s, *dst);
+	    dst++;
+	}
+    }
+}
+/* SRC: copy 32-bit pixels, forcing the top byte of every copied pixel to 0xff. */
+static void
+fast_composite_src_x888_8888 (pixman_implementation_t *imp,
+			      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_row, *src_row;  /* first pixel of the current scanline */
+    uint32_t *out, *in_px;  /* cursors walking that scanline */
+    int dst_stride, src_stride;
+    int32_t left;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+
+    for (; height--; dst_row += dst_stride, src_row += src_stride)
+    {
+	out = dst_row;
+	in_px = src_row;
+
+	/* Count down over the pixels of this row. */
+	for (left = width; left--; )
+	{
+	    *out++ = 0xff000000 | *in_px++;
+	}
+    }
+}
+/* Compiled out by the #if 0 below: OVER from a 32-bit source onto 3-byte destination pixels. */
+#if 0
+static void
+fast_composite_over_8888_0888 (pixman_implementation_t *imp,
+			       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint8_t a;
+    int dst_stride, src_stride;
+    int32_t w;
+    /* The destination is addressed as bytes, with x scaled by 3 (last macro argument). */
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;  /* top byte of the source pixel */
+	    if (a)  /* a == 0: destination pixel is skipped */
+	    {
+		if (a == 0xff)
+		    d = s;
+		else
+		    d = over (s, fetch_24 (dst));
+
+		store_24 (dst, d);
+	    }
+	    dst += 3;  /* one pixel = three bytes */
+	}
+    }
+}
+#endif
+/* OVER from a 32-bit source image onto a 16-bit (0565) destination, no mask. */
+static void
+fast_composite_over_8888_0565 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint8_t a;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;  /* top byte of the source pixel */
+	    if (s)  /* an all-zero source pixel leaves the destination alone */
+	    {
+		if (a == 0xff)
+		{
+		    d = s;  /* destination not read */
+		}
+		else
+		{
+		    d = *dst;  /* widen the 16-bit pixel before blending */
+		    d = over (s, CONVERT_0565_TO_0888 (d));
+		}
+		*dst = CONVERT_8888_TO_0565 (d);
+	    }
+	    dst++;
+	}
+    }
+}
+/* SRC: convert 32-bit source pixels into 16-bit (0565) destination pixels. */
+static void
+fast_composite_src_x888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t *dst_row, *out;
+    uint32_t *src_row, *in_px;
+    uint32_t pixel;
+    int dst_stride, src_stride;
+    int32_t left;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_row, 1);
+
+    for (; height--; dst_row += dst_stride, src_row += src_stride)
+    {
+	/* Restart both cursors at the head of the current scanline. */
+	out = dst_row;
+	in_px = src_row;
+
+	for (left = width; left--; out++)
+	{
+	    /* Read into a local first: the conversion macro may expand
+	     * its argument more than once. */
+	    pixel = *in_px++;
+	    *out = CONVERT_8888_TO_0565 (pixel);
+	}
+    }
+}
+/* ADD of an 8-bit source image onto an 8-bit destination, saturating at 0xff. */
+static void
+fast_composite_add_8_8 (pixman_implementation_t *imp,
+			pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s, d;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    if (s)  /* adding 0 changes nothing */
+	    {
+		if (s != 0xff)  /* s == 0xff always saturates: store it directly */
+		{
+		    d = *dst;
+		    t = d + s;  /* 9-bit sum held in 16 bits */
+		    s = t | (0 - (t >> 8));  /* carry set -> OR with all ones -> 0xff once narrowed to 8 bits */
+		}
+		*dst = s;
+	    }
+	    dst++;
+	}
+    }
+}
+/* ADD of a 32-bit source image onto a 32-bit destination, no mask. */
+static void
+fast_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t s, d;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    if (s)  /* adding an all-zero pixel changes nothing */
+	    {
+		if (s != 0xffffffff)  /* all-ones source is stored directly */
+		{
+		    d = *dst;
+		    if (d)  /* d == 0: the result is s itself */
+			UN8x4_ADD_UN8x4 (s, d);  /* result comes back in s; presumably a per-channel saturating add */
+		}
+		*dst = s;
+	    }
+	    dst++;
+	}
+    }
+}
+/* ADD with a solid source and an 8-bit mask onto an 8-bit destination. */
+static void
+fast_composite_add_n_8_8 (pixman_implementation_t *imp,
+			  pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    sa = (src >> 24);  /* only the top byte of the solid colour is used below */
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    uint16_t tmp;  /* scratch variable handed to both macros */
+	    uint16_t a;
+	    uint32_t m, d;
+	    uint32_t r;
+
+	    a = *mask++;
+	    d = *dst;
+	    /* Going by the macro names: m = sa * a, then r = m + d; details live in the macros - TODO confirm. */
+	    m = MUL_UN8 (sa, a, tmp);
+	    r = ADD_UN8 (m, d, tmp);
+
+	    *dst++ = r;
+	}
+    }
+}
+/* Single-bit helpers for bitmap rows that are accessed as 32-bit words. */
+#ifdef WORDS_BIGENDIAN
+#define CREATE_BITMASK(n) (0x80000000 >> (n))
+#define UPDATE_BITMASK(n) ((n) >> 1)
+#else
+#define CREATE_BITMASK(n) (1U << (n))  /* unsigned: 1 << 31 would overflow a signed int */
+#define UPDATE_BITMASK(n) ((n) << 1)
+#endif
+/* (p) points at 32-bit words, (n) is a bit index from (p); SET_BIT has no trailing ';' so it is safe before else. */
+#define TEST_BIT(p, n)					\
+    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
+#define SET_BIT(p, n)							\
+    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0)
+/* ADD between two bitmaps held in 32-bit words: every bit set in the source is set in the destination. */
+static void
+fast_composite_add_1000_1000 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     *dst_line, *dst;
+    uint32_t     *src_line, *src;
+    int           dst_stride, src_stride;
+    int32_t       w;
+    /* x is passed as 0: the rows start at bit 0 and src_x/dest_x are applied as bit offsets below. */
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
+                           src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, 0, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)  /* w runs from width - 1 down to 0 inside the body */
+	{
+	    /*
+	     * TODO: improve performance by processing uint32_t data instead
+	     *       of individual bits
+	     */
+	    if (TEST_BIT (src, src_x + w))
+		SET_BIT (dst, dest_x + w);
+	}
+    }
+}
+/* OVER with a solid source and a one-bit-per-pixel mask onto a 32-bit destination. */
+static void
+fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     src, srca;
+    uint32_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+    /* Each row loop reads one mask word before looking at w, so empty widths bail out here. */
+    if (width <= 0)
+	return;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;  /* step to the 32-bit word that holds bit mask_x */
+
+    if (srca == 0xff)  /* top byte 0xff: selected pixels are overwritten without reading them */
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;  /* current mask word */
+	    bitmask = CREATE_BITMASK (mask_x & 31);  /* bit of that word for the first pixel */
+
+	    while (w--)
+	    {
+		if (bitmask == 0)  /* bit shifted out of the word: load the next one */
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		    *dst = src;
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+    else  /* otherwise selected pixels are blended with over() */
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;
+	    bitmask = CREATE_BITMASK (mask_x & 31);
+
+	    while (w--)
+	    {
+		if (bitmask == 0)  /* bit shifted out of the word: load the next one */
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		    *dst = over (src, *dst);
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+}
+/* OVER with a solid source and a one-bit-per-pixel mask onto a 16-bit (0565) destination. */
+static void
+fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     src, srca;
+    uint16_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+    uint32_t     d;
+    uint16_t     src565;
+    /* Each row loop reads one mask word before looking at w, so empty widths bail out here. */
+    if (width <= 0)
+	return;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;  /* step to the 32-bit word that holds bit mask_x */
+
+    if (srca == 0xff)  /* top byte 0xff: selected pixels are overwritten without reading them */
+    {
+	src565 = CONVERT_8888_TO_0565 (src);  /* pack the colour once, outside the loops */
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;  /* current mask word */
+	    bitmask = CREATE_BITMASK (mask_x & 31);  /* bit of that word for the first pixel */
+
+	    while (w--)
+	    {
+		if (bitmask == 0)  /* bit shifted out of the word: load the next one */
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		    *dst = src565;
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+    else  /* otherwise selected pixels are widened, blended with over() and packed again */
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;
+	    bitmask = CREATE_BITMASK (mask_x & 31);
+
+	    while (w--)
+	    {
+		if (bitmask == 0)  /* bit shifted out of the word: load the next one */
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		{
+		    d = over (src, CONVERT_0565_TO_0888 (*dst));
+		    *dst = CONVERT_8888_TO_0565 (d);
+		}
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+}
+
+/*
+ * Simple bitblt
+ */
+/* Fill the destination rectangle with the solid source colour, reduced to the destination format. */
+static void
+fast_composite_solid_fill (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    pixman_format_code_t dest_format = dest_image->bits.format;
+    uint32_t fill_value;
+
+    fill_value = _pixman_image_get_solid (imp, src_image, dest_format);
+    switch (dest_format)
+    {
+    case PIXMAN_a1:
+	fill_value >>= 31;  /* keep only the top bit */
+	break;
+    case PIXMAN_a8:
+	fill_value >>= 24;  /* keep only the top byte */
+	break;
+    case PIXMAN_r5g6b5:
+    case PIXMAN_b5g6r5:
+	fill_value = CONVERT_8888_TO_0565 (fill_value);
+	break;
+    default:
+	break;  /* every other format uses the value as fetched */
+    }
+
+    pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+                 PIXMAN_FORMAT_BPP (dest_format),
+                 dest_x, dest_y, width, height, fill_value);
+}
+/* Row-by-row memcpy() blit; the pixel size is read from the destination format only. */
+static void
+fast_composite_src_memcpy (pixman_implementation_t *imp,
+			   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    const int bytes_per_pixel = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;
+    const uint32_t row_bytes = width * bytes_per_pixel;
+    /* rowstride * 4 gives a byte pitch; presumably rowstride counts uint32_t units. */
+    const int src_pitch = src_image->bits.rowstride * 4;
+    const int dst_pitch = dest_image->bits.rowstride * 4;
+    const uint8_t *from = (const uint8_t *)src_image->bits.bits;
+    uint8_t *to = (uint8_t *)dest_image->bits.bits;
+
+    /* Advance both cursors to the first pixel of the rectangle. */
+    from += src_y * src_pitch;
+    from += src_x * bytes_per_pixel;
+    to += dest_y * dst_pitch;
+    to += dest_x * bytes_per_pixel;
+
+    for (; height--; from += src_pitch, to += dst_pitch)
+    {
+	/* Each row is copied on its own, using its image's pitch. */
+	memcpy (to, from, row_bytes);
+    }
+}
+/* FAST_NEAREST is defined elsewhere; arguments look like (name, src fmt, dst fmt, src type, dst type, op, repeat) - TODO confirm. */
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (x888_8888_cover, x888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (x888_8888_pad, x888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (x888_8888_normal, x888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)  /* only NORMAL for 565 -> 565 SRC here */
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
+/* Sources narrower than this many pixels are first replicated into a wider temporary row. */
+#define REPEAT_MIN_WIDTH    32
+/* Composite a repeating source by cutting every scanline into runs that stay inside one tile. */
+static void
+fast_composite_tiled_repeat (pixman_implementation_t *imp,
+			     pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    pixman_composite_func_t func;
+    pixman_format_code_t mask_format;
+    uint32_t src_flags, mask_flags;
+    /* Look the operation up again with the NORMAL_REPEAT flag cleared and SAMPLES_COVER_CLIP_NEAREST set. */
+    src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) |
+		    FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+
+    if (mask_image)
+    {
+	mask_format = mask_image->common.extended_format_code;
+	mask_flags = info->mask_flags;
+    }
+    else
+    {
+	mask_format = PIXMAN_null;
+	mask_flags = FAST_PATH_IS_OPAQUE;
+    }
+    /* On success imp and func are overwritten with the implementation/function that was found. */
+    if (_pixman_lookup_composite_function (
+	    imp->toplevel, info->op,
+	    src_image->common.extended_format_code, src_flags,
+	    mask_format, mask_flags,
+	    dest_image->common.extended_format_code, info->dest_flags,
+	    &imp, &func))
+    {
+	int32_t sx, sy;
+	int32_t width_remain;
+	int32_t num_pixels;
+	int32_t src_width;
+	int32_t i, j;
+	pixman_image_t extended_src_image;
+	uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
+	pixman_bool_t need_src_extension;
+	uint32_t *src_line;
+	int32_t src_stride;
+	int32_t src_bpp;
+	pixman_composite_info_t info2 = *info;  /* working copy handed to func for every run */
+
+	src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
+	/* Narrow 8/16/32 bpp sources go through a one-row temporary image built from extended_src. */
+	if (src_image->bits.width < REPEAT_MIN_WIDTH &&
+	    (src_bpp == 32 || src_bpp == 16 || src_bpp == 8))
+	{
+	    sx = src_x;
+	    sx = MOD (sx, src_image->bits.width);
+	    sx += width;
+	    src_width = 0;
+	    /* Grow the temporary row by whole tiles until it reaches REPEAT_MIN_WIDTH or passes sx. */
+	    while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
+		src_width += src_image->bits.width;
+	    /* Row size in bytes, rounded up to whole uint32_t words. */
+	    src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
+
+	    /* Initialize/validate stack-allocated temporary image */
+	    _pixman_bits_image_init (&extended_src_image, src_image->bits.format,
+				     src_width, 1, &extended_src[0], src_stride);
+	    _pixman_image_validate (&extended_src_image);
+
+	    info2.src_image = &extended_src_image;
+	    need_src_extension = TRUE;
+	}
+	else
+	{
+	    src_width = src_image->bits.width;
+	    need_src_extension = FALSE;
+	}
+
+	sx = src_x;
+	sy = src_y;
+	/* One destination scanline per iteration. */
+	while (--height >= 0)
+	{
+	    sx = MOD (sx, src_width);
+	    sy = MOD (sy, src_image->bits.height);
+
+	    if (need_src_extension)
+	    {
+		if (src_bpp == 32)
+		{
+		    PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
+		    /* Fill extended_src with back-to-back copies of source row sy. */
+		    for (i = 0; i < src_width; )
+		    {
+			for (j = 0; j < src_image->bits.width; j++, i++)
+			    extended_src[i] = src_line[j];
+		    }
+		}
+		else if (src_bpp == 16)
+		{
+		    uint16_t *src_line_16;
+
+		    PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
+					   src_line_16, 1);
+		    src_line = (uint32_t*)src_line_16;
+		    /* Same replication, with both buffers viewed as 16-bit pixels. */
+		    for (i = 0; i < src_width; )
+		    {
+			for (j = 0; j < src_image->bits.width; j++, i++)
+			    ((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
+		    }
+		}
+		else if (src_bpp == 8)
+		{
+		    uint8_t *src_line_8;
+
+		    PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
+					   src_line_8, 1);
+		    src_line = (uint32_t*)src_line_8;
+		    /* Same replication, with both buffers viewed as bytes. */
+		    for (i = 0; i < src_width; )
+		    {
+			for (j = 0; j < src_image->bits.width; j++, i++)
+			    ((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
+		    }
+		}
+
+		info2.src_y = 0;  /* the temporary image has a single row */
+	    }
+	    else
+	    {
+		info2.src_y = sy;
+	    }
+
+	    width_remain = width;
+	    /* Emit runs that end at the right edge of the (possibly extended) source row. */
+	    while (width_remain > 0)
+	    {
+		num_pixels = src_width - sx;
+
+		if (num_pixels > width_remain)
+		    num_pixels = width_remain;
+
+		info2.src_x = sx;
+		info2.width = num_pixels;
+		info2.height = 1;
+
+		func (imp, &info2);
+
+		width_remain -= num_pixels;
+		info2.mask_x += num_pixels;
+		info2.dest_x += num_pixels;
+		sx = 0;  /* later runs on this scanline start at the left edge of the row */
+	    }
+	    /* Rewind the x positions and move every y position down one line. */
+	    sx = src_x;
+	    sy++;
+	    info2.mask_x = info->mask_x;
+	    info2.mask_y++;
+	    info2.dest_x = info->dest_x;
+	    info2.dest_y++;
+	}
+
+	if (need_src_extension)
+	    _pixman_image_fini (&extended_src_image);
+    }
+    else
+    {
+	_pixman_log_error (FUNC, "Didn't find a suitable function ");
+    }
+}
+/* Copy w 16-bit pixels to dst, sampling src at the integer part of vx and stepping vx by unit_x per pixel. */
+/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
+static force_inline void
+scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
+				     const uint16_t * src,
+				     int32_t          w,
+				     pixman_fixed_t   vx,
+				     pixman_fixed_t   unit_x,
+				     pixman_fixed_t   max_vx,  /* not referenced in this body */
+				     pixman_bool_t    fully_transparent_src)  /* not referenced in this body */
+{
+    uint16_t tmp1, tmp2, tmp3, tmp4;
+    while ((w -= 4) >= 0)  /* four pixels per iteration; exits with w in -4..-1 */
+    {
+	tmp1 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp2 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp3 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp4 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	*dst++ = tmp1;
+	*dst++ = tmp2;
+	*dst++ = tmp3;
+	*dst++ = tmp4;
+    }
+    if (w & 2)  /* w is (pixels left - 4), so its two low bits still equal the 0..3 pixels left */
+    {
+	tmp1 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	tmp2 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	*dst++ = tmp1;
+	*dst++ = tmp2;
+    }
+    if (w & 1)  /* final odd pixel */
+	*dst++ = src[pixman_fixed_to_int (vx)];
+}
+/* COVER, NONE and PAD main loops for 565 -> 565 SRC, each given the unrolled scanline function above. */
+FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, COVER)
+FAST_NEAREST_MAINLOOP (565_565_none_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, NONE)
+FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, PAD)
+/* Fetch pixel x of a 32-bit scanline; yields 0 when repeat() rejects the coordinate. */
+static force_inline uint32_t
+fetch_nearest (pixman_repeat_t src_repeat,
+	       pixman_format_code_t format,
+	       uint32_t *src, int x, int src_width)
+{
+    uint32_t pixel;
+
+    /* repeat() receives &x, so it may adjust x; a zero result means nothing to fetch. */
+    if (!repeat (src_repeat, &x, src_width))
+	return 0;
+
+    pixel = src[x];
+    /* For PIXMAN_x8r8g8b8 the top byte is forced to 0xff. */
+    if (format == PIXMAN_x8r8g8b8)
+	pixel |= 0xff000000;
+    return pixel;
+}
+/* OVER for one pixel: blend s onto *dst; s == 0 leaves *dst unchanged. */
+static force_inline void
+combine_over (uint32_t s, uint32_t *dst)
+{
+    uint8_t inverse_alpha;
+    if (s == 0)
+	return;
+
+    inverse_alpha = 0xff - (s >> 24);  /* 0 when the top byte of s is 0xff */
+    if (inverse_alpha == 0)
+	*dst = s;
+    else
+	UN8x4_MUL_UN8_ADD_UN8x4 (*dst, inverse_alpha, s);
+}
+/* SRC for one pixel: overwrite *dst with s. */
+static force_inline void
+combine_src (uint32_t s, uint32_t *dst)
+{
+    *dst = s;
+}
+/* Nearest-neighbour scaled SRC/OVER between 32-bit images; steps come from the transform's diagonal. */
+static void
+fast_composite_scaled_nearest (pixman_implementation_t *imp,
+			       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t       *dst_line;
+    uint32_t       *src_line;
+    int             dst_stride, src_stride;
+    int		    src_width, src_height;
+    pixman_repeat_t src_repeat;
+    pixman_fixed_t unit_x, unit_y;
+    pixman_format_code_t src_format;
+    pixman_vector_t v;
+    pixman_fixed_t vy;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
+     * transformed from destination space to source space
+     */
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+    /* Nothing is drawn if the start point cannot be transformed. */
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))
+	return;
+    /* Per-pixel steps: only matrix[0][0] and matrix[1][1] are read. */
+    unit_x = src_image->common.transform->matrix[0][0];
+    unit_y = src_image->common.transform->matrix[1][1];
+
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
+    v.vector[0] -= pixman_fixed_e;
+    v.vector[1] -= pixman_fixed_e;
+
+    src_height = src_image->bits.height;
+    src_width = src_image->bits.width;
+    src_repeat = src_image->common.repeat;
+    src_format = src_image->bits.format;
+
+    vy = v.vector[1];
+    while (height--)
+    {
+        pixman_fixed_t vx = v.vector[0];
+	int y = pixman_fixed_to_int (vy);
+	uint32_t *dst = dst_line;
+
+	dst_line += dst_stride;
+
+        /* adjust the y location by a unit vector in the y direction
+         * this is equivalent to transforming y+1 of the destination point to source space */
+        vy += unit_y;
+	/* Row rejected by repeat(): SRC clears the destination row, any other op leaves it alone. */
+	if (!repeat (src_repeat, &y, src_height))
+	{
+	    if (op == PIXMAN_OP_SRC)
+		memset (dst, 0, sizeof (*dst) * width);
+	}
+	else
+	{
+	    int w = width;
+
+	    uint32_t *src = src_line + y * src_stride;
+	    /* Two pixels per iteration; the loop after this one handles an odd last pixel. */
+	    while (w >= 2)
+	    {
+		uint32_t s1, s2;
+		int x1, x2;
+
+		x1 = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		x2 = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		w -= 2;
+
+		s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
+		s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
+
+		if (op == PIXMAN_OP_OVER)
+		{
+		    combine_over (s1, dst++);
+		    combine_over (s2, dst++);
+		}
+		else  /* any op other than OVER is handled as a plain store */
+		{
+		    combine_src (s1, dst++);
+		    combine_src (s2, dst++);
+		}
+	    }
+
+	    while (w--)  /* w is 0 or 1 here */
+	    {
+		uint32_t s;
+		int x;
+
+		x = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		s = fetch_nearest (src_repeat, src_format, src, x, src_width);
+
+		if (op == PIXMAN_OP_OVER)
+		    combine_over (s, dst++);
+		else
+		    combine_src (s, dst++);
+	    }
+	}
+    }
+}
+
+#define CACHE_LINE_SIZE 64
+
+#define FAST_SIMPLE_ROTATE(suffix, pix_type)                                  \
+                                                                              \
+static void                                                                   \
+blt_rotated_90_trivial_##suffix (pix_type       *dst,                         \
+				 int             dst_stride,                  \
+				 const pix_type *src,                         \
+				 int             src_stride,                  \
+				 int             w,                           \
+				 int             h)                           \
+{                                                                             \
+    int x, y;                                                                 \
+    for (y = 0; y < h; y++)                                                   \
+    {                                                                         \
+	const pix_type *s = src + (h - y - 1);                                \
+	pix_type *d = dst + dst_stride * y;                                   \
+	for (x = 0; x < w; x++)                                               \
+	{                                                                     \
+	    *d++ = *s;                                                        \
+	    s += src_stride;                                                  \
+	}                                                                     \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_270_trivial_##suffix (pix_type       *dst,                        \
+				  int             dst_stride,                 \
+				  const pix_type *src,                        \
+				  int             src_stride,                 \
+				  int             w,                          \
+				  int             h)                          \
+{                                                                             \
+    int x, y;                                                                 \
+    for (y = 0; y < h; y++)                                                   \
+    {                                                                         \
+	const pix_type *s = src + src_stride * (w - 1) + y;                   \
+	pix_type *d = dst + dst_stride * y;                                   \
+	for (x = 0; x < w; x++)                                               \
+	{                                                                     \
+	    *d++ = *s;                                                        \
+	    s -= src_stride;                                                  \
+	}                                                                     \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_90_##suffix (pix_type       *dst,                                 \
+			 int             dst_stride,                          \
+			 const pix_type *src,                                 \
+			 int             src_stride,                          \
+			 int             W,                                   \
+			 int             H)                                   \
+{                                                                             \
+    int x;                                                                    \
+    int leading_pixels = 0, trailing_pixels = 0;                              \
+    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
+                                                                              \
+    /*                                                                        \
+     * split processing into handling destination as TILE_SIZExH cache line   \
+     * aligned vertical stripes (optimistically assuming that destination     \
+     * stride is a multiple of cache line, if not - it will be just a bit     \
+     * slower)                                                                \
+     */                                                                       \
+                                                                              \
+    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
+    {                                                                         \
+	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (leading_pixels > W)                                               \
+	    leading_pixels = W;                                               \
+                                                                              \
+	/* unaligned leading part NxH (where N < TILE_SIZE) */                \
+	blt_rotated_90_trivial_##suffix (                                     \
+	    dst,                                                              \
+	    dst_stride,                                                       \
+	    src,                                                              \
+	    src_stride,                                                       \
+	    leading_pixels,                                                   \
+	    H);                                                               \
+	                                                                      \
+	dst += leading_pixels;                                                \
+	src += leading_pixels * src_stride;                                   \
+	W -= leading_pixels;                                                  \
+    }                                                                         \
+                                                                              \
+    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
+    {                                                                         \
+	trailing_pixels = (((uintptr_t)(dst + W) &                            \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (trailing_pixels > W)                                              \
+	    trailing_pixels = W;                                              \
+	W -= trailing_pixels;                                                 \
+    }                                                                         \
+                                                                              \
+    for (x = 0; x < W; x += TILE_SIZE)                                        \
+    {                                                                         \
+	/* aligned middle part TILE_SIZExH */                                 \
+	blt_rotated_90_trivial_##suffix (                                     \
+	    dst + x,                                                          \
+	    dst_stride,                                                       \
+	    src + src_stride * x,                                             \
+	    src_stride,                                                       \
+	    TILE_SIZE,                                                        \
+	    H);                                                               \
+    }                                                                         \
+                                                                              \
+    if (trailing_pixels)                                                      \
+    {                                                                         \
+	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \
+	blt_rotated_90_trivial_##suffix (                                     \
+	    dst + W,                                                          \
+	    dst_stride,                                                       \
+	    src + W * src_stride,                                             \
+	    src_stride,                                                       \
+	    trailing_pixels,                                                  \
+	    H);                                                               \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_270_##suffix (pix_type       *dst,                                \
+			  int             dst_stride,                         \
+			  const pix_type *src,                                \
+			  int             src_stride,                         \
+			  int             W,                                  \
+			  int             H)                                  \
+{                                                                             \
+    int x;                                                                    \
+    int leading_pixels = 0, trailing_pixels = 0;                              \
+    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
+                                                                              \
+    /*                                                                        \
+     * split processing into handling destination as TILE_SIZExH cache line   \
+     * aligned vertical stripes (optimistically assuming that destination     \
+     * stride is a multiple of cache line, if not - it will be just a bit     \
+     * slower)                                                                \
+     */                                                                       \
+                                                                              \
+    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
+    {                                                                         \
+	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (leading_pixels > W)                                               \
+	    leading_pixels = W;                                               \
+                                                                              \
+	/* unaligned leading part NxH (where N < TILE_SIZE) */                \
+	blt_rotated_270_trivial_##suffix (                                    \
+	    dst,                                                              \
+	    dst_stride,                                                       \
+	    src + src_stride * (W - leading_pixels),                          \
+	    src_stride,                                                       \
+	    leading_pixels,                                                   \
+	    H);                                                               \
+	                                                                      \
+	dst += leading_pixels;                                                \
+	W -= leading_pixels;                                                  \
+    }                                                                         \
+                                                                              \
+    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
+    {                                                                         \
+	trailing_pixels = (((uintptr_t)(dst + W) &                            \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (trailing_pixels > W)                                              \
+	    trailing_pixels = W;                                              \
+	W -= trailing_pixels;                                                 \
+	src += trailing_pixels * src_stride;                                  \
+    }                                                                         \
+                                                                              \
+    for (x = 0; x < W; x += TILE_SIZE)                                        \
+    {                                                                         \
+	/* aligned middle part TILE_SIZExH */                                 \
+	blt_rotated_270_trivial_##suffix (                                    \
+	    dst + x,                                                          \
+	    dst_stride,                                                       \
+	    src + src_stride * (W - x - TILE_SIZE),                           \
+	    src_stride,                                                       \
+	    TILE_SIZE,                                                        \
+	    H);                                                               \
+    }                                                                         \
+                                                                              \
+    if (trailing_pixels)                                                      \
+    {                                                                         \
+	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \
+	blt_rotated_270_trivial_##suffix (                                    \
+	    dst + W,                                                          \
+	    dst_stride,                                                       \
+	    src - trailing_pixels * src_stride,                               \
+	    src_stride,                                                       \
+	    trailing_pixels,                                                  \
+	    H);                                                               \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+fast_composite_rotate_90_##suffix (pixman_implementation_t *imp,              \
+				   pixman_composite_info_t *info)	      \
+{									      \
+    PIXMAN_COMPOSITE_ARGS (info);					      \
+    pix_type       *dst_line;						      \
+    pix_type       *src_line;                                                 \
+    int             dst_stride, src_stride;                                   \
+    int             src_x_t, src_y_t;                                         \
+                                                                              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
+			   dst_stride, dst_line, 1);                          \
+    src_x_t = -src_y + pixman_fixed_to_int (                                  \
+				src_image->common.transform->matrix[0][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
+    src_y_t = src_x + pixman_fixed_to_int (                                   \
+				src_image->common.transform->matrix[1][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e);         \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
+			   src_stride, src_line, 1);                          \
+    blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride,      \
+			     width, height);                                  \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+fast_composite_rotate_270_##suffix (pixman_implementation_t *imp,             \
+				    pixman_composite_info_t *info)            \
+{                                                                             \
+    PIXMAN_COMPOSITE_ARGS (info);					      \
+    pix_type       *dst_line;						      \
+    pix_type       *src_line;                                                 \
+    int             dst_stride, src_stride;                                   \
+    int             src_x_t, src_y_t;                                         \
+                                                                              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
+			   dst_stride, dst_line, 1);                          \
+    src_x_t = src_y + pixman_fixed_to_int (                                   \
+				src_image->common.transform->matrix[0][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e);         \
+    src_y_t = -src_x + pixman_fixed_to_int (                                  \
+				src_image->common.transform->matrix[1][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
+			   src_stride, src_line, 1);                          \
+    blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride,     \
+			      width, height);                                 \
+}
+
+FAST_SIMPLE_ROTATE (8, uint8_t)		/* emits the *_8 rotate functions */
+FAST_SIMPLE_ROTATE (565, uint16_t)	/* emits the *_565 rotate functions */
+FAST_SIMPLE_ROTATE (8888, uint32_t)	/* emits the *_8888 rotate functions */
+
+static const pixman_fast_path_t c_fast_paths[] =	/* handed to _pixman_implementation_create () below */
+{
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
+    /* Entries built by the SIMPLE_NEAREST_FAST_PATH* macros (defined outside this table). */
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+    /* Mask-less entries that all dispatch to fast_composite_scaled_nearest. */
+#define NEAREST_FAST_PATH(op,s,d)		\
+    {   PIXMAN_OP_ ## op,			\
+	PIXMAN_ ## s, SCALED_NEAREST_FLAGS,	\
+	PIXMAN_null, 0,				\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,	\
+	fast_composite_scaled_nearest,		\
+    }
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
+    /* Source flags demanded by the rotate entries; angle is 90 or 270 below. */
+#define SIMPLE_ROTATE_FLAGS(angle)					  \
+    (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM	|			  \
+     FAST_PATH_NEAREST_FILTER			|			  \
+     FAST_PATH_SAMPLES_COVER_CLIP_NEAREST	|			  \
+     FAST_PATH_STANDARD_FLAGS)
+    /* One use yields two entries: the 90 and the 270 degree variant. */
+#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix)				  \
+    {   PIXMAN_OP_ ## op,						  \
+	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90),				  \
+	PIXMAN_null, 0,							  \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \
+	fast_composite_rotate_90_##suffix,				  \
+    },									  \
+    {   PIXMAN_OP_ ## op,						  \
+	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270),			  \
+	PIXMAN_null, 0,							  \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \
+	fast_composite_rotate_270_##suffix,				  \
+    }
+
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
+
+    /* Simple repeat fast path entry. */
+    {	PIXMAN_OP_any,
+	PIXMAN_any,
+	(FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE |
+	 FAST_PATH_NORMAL_REPEAT),
+	PIXMAN_any, 0,
+	PIXMAN_any, FAST_PATH_STD_DEST_FLAGS,
+	fast_composite_tiled_repeat
+    },
+    /* Last entry; presumably the end-of-table marker for the lookup code. */
+    {   PIXMAN_OP_NONE	},
+};
+
+#ifdef WORDS_BIGENDIAN	/* mask covering n 1-bit pixels that start at pixel offs of a word */
+#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (32 - (offs) - (n)))
+#else			/* 1U, not 1: n may be 31, which would overflow a signed int */
+#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs))
+#endif
+
+/* Set (v != 0) or clear (v == 0) `width` consecutive 1-bit pixels of one row.
+ * `dst` is the 32-bit word holding the first pixel, `offs` its index there. */
+static force_inline void
+pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
+{
+    uint32_t mask;
+
+    if (offs)
+    {
+	/* The span starts part-way into a word: touch only its own bits. */
+	int head = 32 - offs;
+	int span_ends_here = (head >= width);
+	if (span_ends_here)
+	    head = width;
+	mask = A1_FILL_MASK (head, offs);
+	if (v)
+	    *dst |= mask;
+	else
+	    *dst &= ~mask;
+	if (span_ends_here)
+	    return;
+	dst++;
+	width -= head;
+    }
+
+    /* Words lying wholly inside the span are overwritten outright. */
+    for (; width >= 32; width -= 32)
+	*dst++ = v ? 0xFFFFFFFF : 0;
+
+    if (width > 0)
+    {
+	/* Whatever is left starts at index 0 of the final word. */
+	mask = A1_FILL_MASK (width, 0);
+	if (v)
+	    *dst |= mask;
+	else
+	    *dst &= ~mask;
+    }
+}
+
+/*
+ * Fill a width x height rectangle of a 1 bpp image.
+ *
+ * `stride` is the row pitch in 32-bit words and x, y locate the first
+ * pixel.  Only bit 0 of `xor` matters: 1 sets the pixels, 0 clears them.
+ * Nothing is written when height is 0.
+ */
+static void
+pixman_fill1 (uint32_t *bits, int stride, int x, int y,
+              int width, int height, uint32_t xor)
+{
+    const int first_bit = x & 31;
+    uint32_t *row = bits + y * stride + (x >> 5);
+
+    /*
+     * Two loops with a literal last argument rather than one loop passing
+     * (xor & 1); presumably so the force_inline'd helper specialises on v.
+     */
+    if (!(xor & 1))
+    {
+	for (; height--; row += stride)
+	    pixman_fill1_line (row, first_bit, width, 0);
+    }
+    else
+    {
+	for (; height--; row += stride)
+	    pixman_fill1_line (row, first_bit, width, 1);
+    }
+}
+
+/*
+ * Fill a width x height rectangle of an 8 bpp image with a constant byte.
+ *
+ * bits/stride: start of the pixel data and its row pitch in 32-bit words
+ * x, y:        pixel coordinates at which the rectangle starts
+ * xor:         fill value; only the low 8 bits are stored
+ *
+ * Nothing is written when width or height is 0.
+ */
+static void
+pixman_fill8 (uint32_t *bits, int stride, int x, int y,
+              int width, int height, uint32_t xor)
+{
+    const int row_bytes = stride * (int) sizeof (uint32_t);
+    const uint8_t value = xor & 0xff;
+    uint8_t *row = (uint8_t *) bits + y * row_bytes + x;
+    int col;
+
+    for (; height--; row += row_bytes)
+    {
+	for (col = 0; col < width; col++)
+	    row[col] = value;
+    }
+}
+
+/*
+ * Fill a width x height rectangle of a 16 bpp image with a constant value.
+ *
+ * bits/stride: start of the pixel data and its row pitch in 32-bit words
+ * x, y:        pixel coordinates at which the rectangle starts
+ * xor:         fill value; only the low 16 bits are stored
+ *
+ * Nothing is written when width or height is 0.
+ */
+static void
+pixman_fill16 (uint32_t *bits, int stride, int x, int y,
+               int width, int height, uint32_t xor)
+{
+    const int row_shorts =
+	(stride * (int) sizeof (uint32_t)) / (int) sizeof (uint16_t);
+    const uint16_t value = xor & 0xffff;
+    uint16_t *row = (uint16_t *) bits + y * row_shorts + x;
+    int col;
+
+    for (; height--; row += row_shorts)
+    {
+	for (col = 0; col < width; col++)
+	    row[col] = value;
+    }
+}
+
+/*
+ * Fill a width x height rectangle of a 32 bpp image with the value `xor`.
+ *
+ * bits/stride: start of the pixel data and its row pitch in 32-bit words
+ * x, y:        pixel coordinates at which the rectangle starts
+ *
+ * Nothing is written when width or height is 0.
+ */
+static void
+pixman_fill32 (uint32_t *bits, int stride, int x, int y,
+               int width, int height, uint32_t xor)
+{
+    uint32_t *row = bits + y * stride + x;
+    int col;
+
+    for (; height--; row += stride)
+    {
+	for (col = 0; col < width; col++)
+	    row[col] = xor;
+    }
+}
+
+/* fill hook: 1, 8, 16 and 32 bpp are filled by the pixman_fill* helpers and
+ * yield TRUE; any other depth is forwarded to imp->delegate unchanged. */
+static pixman_bool_t
+fast_path_fill (pixman_implementation_t *imp,
+                uint32_t *               bits,
+                int                      stride,
+                int                      bpp,
+                int                      x,
+                int                      y,
+                int                      width,
+                int                      height,
+                uint32_t                 xor)
+{
+    if (bpp == 1)
+    {
+	pixman_fill1 (bits, stride, x, y, width, height, xor);
+    }
+    else if (bpp == 8)
+    {
+	pixman_fill8 (bits, stride, x, y, width, height, xor);
+    }
+    else if (bpp == 16)
+    {
+	pixman_fill16 (bits, stride, x, y, width, height, xor);
+    }
+    else if (bpp == 32)
+    {
+	pixman_fill32 (bits, stride, x, y, width, height, xor);
+    }
+    else
+    {
+	return _pixman_implementation_fill (
+	    imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+    }
+
+    return TRUE;
+}
+
+/* Create the implementation backed by c_fast_paths and install fast_path_fill. */
+pixman_implementation_t *
+_pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *fast_imp;
+    fast_imp = _pixman_implementation_create (fallback, c_fast_paths);
+    fast_imp->fill = fast_path_fill;
+    return fast_imp;
+}
diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
new file mode 100644
index 0000000..2ccdfcd
--- /dev/null
+++ b/pixman/pixman-general.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright © 2009 Red Hat, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+/* Dispatch on image->type to the matching source-iterator initializer; `imp` is unused. */
+static void
+general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+
+    if (image->type == SOLID)
+	_pixman_solid_fill_iter_init (image, iter);
+    else if (image->type == LINEAR)
+	_pixman_linear_gradient_iter_init (image, iter);
+    else if (image->type == RADIAL)
+	_pixman_radial_gradient_iter_init (image, iter);
+    else if (image->type == CONICAL)
+	_pixman_conical_gradient_iter_init (image, iter);
+    else if (image->type == BITS)
+	_pixman_bits_image_src_iter_init (image, iter);
+    else
+	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n"); /* iter is left untouched */
+}
+/* Set up a destination iterator: only BITS images are accepted, others are logged. */
+static void
+general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    if (iter->image->type == BITS)
+    {
+	_pixman_bits_image_dest_iter_init (iter->image, iter);
+    }
+    else
+    {
+	_pixman_log_error (FUNC, "Trying to write to a non-writable image"); /* iter is not initialised */
+    }
+}
+/* Per-operator iterator flags, looked up as op_flags[op].src / op_flags[op].dst. */
+typedef struct op_info_t op_info_t;
+struct op_info_t
+{
+    uint8_t src, dst; /* flag sets OR-ed into the source and destination iterator flags */
+};
+/* Union of both IGNORE flags plus ITER_LOCALIZED_ALPHA. */
+#define ITER_IGNORE_BOTH						\
+    (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA)
+/* Indexed by operator; any entries past the rows listed are zero-initialised by C rules. */
+static const op_info_t op_flags[PIXMAN_N_OPERATORS] =
+{
+    /* Src                   Dst                   */
+    { ITER_IGNORE_BOTH,      ITER_IGNORE_BOTH      }, /* CLEAR */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_BOTH      }, /* SRC */
+    { ITER_IGNORE_BOTH,      ITER_LOCALIZED_ALPHA  }, /* DST */
+    { 0,                     ITER_LOCALIZED_ALPHA  }, /* OVER */
+    { ITER_LOCALIZED_ALPHA,  0                     }, /* OVER_REVERSE */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* IN */
+    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* IN_REVERSE */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* OUT */
+    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* OUT_REVERSE */
+    { 0,                     0                     }, /* ATOP */
+    { 0,                     0                     }, /* ATOP_REVERSE */
+    { 0,                     0                     }, /* XOR */
+    { ITER_LOCALIZED_ALPHA,  ITER_LOCALIZED_ALPHA  }, /* ADD */
+    { 0,                     0                     }, /* SATURATE */
+};
+
+#define SCANLINE_BUFFER_LENGTH 8192
+/* Generic compositor: per row, fetch src/mask/dest scanlines, combine them, write back. */
+static void
+general_composite_rect  (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8];
+    uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
+    uint8_t *src_buffer, *mask_buffer, *dest_buffer;
+    pixman_iter_t src_iter, mask_iter, dest_iter;
+    pixman_combine_32_func_t compose;
+    pixman_bool_t component_alpha;
+    iter_flags_t narrow, src_flags;
+    int Bpp;
+    int i;
+    /* 4 bytes per pixel when every image involved is narrow, 8 otherwise. */
+    if ((src_image->common.flags & FAST_PATH_NARROW_FORMAT)		    &&
+	(!mask_image || mask_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
+	(dest_image->common.flags & FAST_PATH_NARROW_FORMAT))
+    {
+	narrow = ITER_NARROW;
+	Bpp = 4;
+    }
+    else
+    {
+	narrow = 0;
+	Bpp = 8;
+    }
+    /* Three rows (src, mask, dest) too large for the stack buffer: use the heap. */
+    if (width * Bpp > SCANLINE_BUFFER_LENGTH)
+    {
+	scanline_buffer = pixman_malloc_abc (width, 3, Bpp);
+
+	if (!scanline_buffer)
+	    return; /* nothing has been allocated yet, so a bare return is fine */
+    }
+
+    src_buffer = scanline_buffer;
+    mask_buffer = src_buffer + width * Bpp;
+    dest_buffer = mask_buffer + width * Bpp;
+
+    /* src iter */
+    src_flags = narrow | op_flags[op].src;
+
+    _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src_image,
+					  src_x, src_y, width, height,
+					  src_buffer, src_flags);
+
+    /* mask iter */
+    if ((src_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
+	(ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
+    {
+	/* If it doesn't matter what the source is, then it doesn't matter
+	 * what the mask is
+	 */
+	mask_image = NULL;
+    }
+
+    component_alpha =
+        mask_image			      &&
+        mask_image->common.type == BITS       &&
+        mask_image->common.component_alpha    &&
+        PIXMAN_FORMAT_RGB (mask_image->bits.format);
+
+    _pixman_implementation_src_iter_init (
+	imp->toplevel, &mask_iter, mask_image, mask_x, mask_y, width, height,
+	mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB));
+
+    /* dest iter */
+    _pixman_implementation_dest_iter_init (
+	imp->toplevel, &dest_iter, dest_image, dest_x, dest_y, width, height,
+	dest_buffer, narrow | op_flags[op].dst);
+
+    if (narrow)
+    {
+	if (component_alpha)
+	    compose = _pixman_implementation_combine_32_ca;
+	else
+	    compose = _pixman_implementation_combine_32;
+    }
+    else
+    {
+	if (component_alpha)
+	    compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64_ca;
+	else
+	    compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64;
+    }
+
+    if (!compose)
+	goto out; /* a bare return here would leak a heap-allocated scanline_buffer */
+
+    for (i = 0; i < height; ++i)
+    {
+	uint32_t *s, *m, *d;
+
+	m = mask_iter.get_scanline (&mask_iter, NULL);
+	s = src_iter.get_scanline (&src_iter, m);
+	d = dest_iter.get_scanline (&dest_iter, NULL);
+
+	compose (imp->toplevel, op, d, s, m, width);
+
+	dest_iter.write_back (&dest_iter);
+    }
+out:
+    if (scanline_buffer != (uint8_t *) stack_scanline_buffer)
+	free (scanline_buffer);
+}
+/* One catch-all row routing to general_composite_rect; PIXMAN_OP_NONE row presumably ends the table. */
+static const pixman_fast_path_t general_fast_path[] =
+{
+    { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any,	0, PIXMAN_any, 0, general_composite_rect },
+    { PIXMAN_OP_NONE }
+};
+/* Blit hook of the general implementation: does no work and always returns FALSE. */
+static pixman_bool_t
+general_blt (pixman_implementation_t *imp,
+             uint32_t *               src_bits,
+             uint32_t *               dst_bits,
+             int                      src_stride,
+             int                      dst_stride,
+             int                      src_bpp,
+             int                      dst_bpp,
+             int                      src_x,
+             int                      src_y,
+             int                      dest_x,
+             int                      dest_y,
+             int                      width,
+             int                      height)
+{
+    /* We can't blit unless we have sse2 or mmx */
+
+    return FALSE;
+}
+/* Fill hook of the general implementation: does no work and always returns FALSE. */
+static pixman_bool_t
+general_fill (pixman_implementation_t *imp,
+              uint32_t *               bits,
+              int                      stride,
+              int                      bpp,
+              int                      x,
+              int                      y,
+              int                      width,
+              int                      height,
+              uint32_t xor)
+{
+    return FALSE;
+}
+/* Create the general implementation (NULL passed as fallback) and install its hooks. */
+pixman_implementation_t *
+_pixman_implementation_create_general (void)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path);
+
+    _pixman_setup_combiner_functions_32 (imp); /* NOTE(review): imp is used unchecked -- confirm it cannot be NULL */
+    _pixman_setup_combiner_functions_64 (imp);
+
+    imp->blt = general_blt;
+    imp->fill = general_fill;
+    imp->src_iter_init = general_src_iter_init;
+    imp->dest_iter_init = general_dest_iter_init;
+
+    return imp;
+}
+
diff --git a/pixman/pixman-gradient-walker.c b/pixman/pixman-gradient-walker.c
new file mode 100644
index 0000000..dd666b4
--- /dev/null
+++ b/pixman/pixman-gradient-walker.c
@@ -0,0 +1,254 @@
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+/* Bind a walker to the gradient's stops and a spread mode; the cached segment is cleared. */
+void
+_pixman_gradient_walker_init (pixman_gradient_walker_t *walker,
+                              gradient_t *              gradient,
+                              unsigned int              spread)
+{
+    walker->num_stops = gradient->n_stops;
+    walker->stops     = gradient->stops; /* pointer is shared with the gradient, not copied */
+    walker->left_x    = 0;
+    walker->right_x   = 0x10000;
+    walker->stepper   = 0;
+    walker->left_ag   = 0;
+    walker->left_rb   = 0;
+    walker->right_ag  = 0;
+    walker->right_rb  = 0;
+    walker->spread    = spread;
+
+    walker->need_reset = TRUE; /* forces _pixman_gradient_walker_reset on the first pixel lookup */
+}
+/* Find the pair of stops bracketing pos under walker->spread and cache that segment. */
+void
+_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker,
+                               pixman_fixed_32_32_t      pos)
+{
+    int32_t x, left_x, right_x;
+    pixman_color_t          *left_c, *right_c;
+    int n, count = walker->num_stops;
+    pixman_gradient_stop_t *      stops = walker->stops;
+
+    static const pixman_color_t transparent_black = { 0, 0, 0, 0 };
+
+    switch (walker->spread)
+    {
+    case PIXMAN_REPEAT_NORMAL:
+	x = (int32_t)pos & 0xFFFF;
+	for (n = 0; n < count; n++)
+	    if (x < stops[n].x)
+		break;
+	if (n == 0)
+	{
+	    left_x =  stops[count - 1].x - 0x10000;
+	    left_c = &stops[count - 1].color;
+	}
+	else
+	{
+	    left_x =  stops[n - 1].x;
+	    left_c = &stops[n - 1].color;
+	}
+
+	if (n == count)
+	{
+	    right_x =  stops[0].x + 0x10000;
+	    right_c = &stops[0].color;
+	}
+	else
+	{
+	    right_x =  stops[n].x;
+	    right_c = &stops[n].color;
+	}
+	left_x  += (pos - x);
+	right_x += (pos - x);
+	break;
+
+    case PIXMAN_REPEAT_PAD:
+	for (n = 0; n < count; n++)
+	    if (pos < stops[n].x)
+		break;
+
+	if (n == 0)
+	{
+	    left_x =  INT32_MIN;
+	    left_c = &stops[0].color;
+	}
+	else
+	{
+	    left_x =  stops[n - 1].x;
+	    left_c = &stops[n - 1].color;
+	}
+
+	if (n == count)
+	{
+	    right_x =  INT32_MAX;
+	    right_c = &stops[n - 1].color;
+	}
+	else
+	{
+	    right_x =  stops[n].x;
+	    right_c = &stops[n].color;
+	}
+	break;
+
+    case PIXMAN_REPEAT_REFLECT:
+	x = (int32_t)pos & 0xFFFF;
+	if ((int32_t)pos & 0x10000)
+	    x = 0x10000 - x;
+	for (n = 0; n < count; n++)
+	    if (x < stops[n].x)
+		break;
+
+	if (n == 0)
+	{
+	    left_x =  -stops[0].x;
+	    left_c = &stops[0].color;
+	}
+	else
+	{
+	    left_x =  stops[n - 1].x;
+	    left_c = &stops[n - 1].color;
+	}
+
+	if (n == count)
+	{
+	    right_x = 0x20000 - stops[n - 1].x;
+	    right_c = &stops[n - 1].color;
+	}
+	else
+	{
+	    right_x =  stops[n].x;
+	    right_c = &stops[n].color;
+	}
+
+	if ((int32_t)pos & 0x10000)
+	{
+	    pixman_color_t  *tmp_c;
+	    int32_t tmp_x;
+
+	    tmp_x   = 0x10000 - right_x;
+	    right_x = 0x10000 - left_x;
+	    left_x  = tmp_x;
+
+	    tmp_c   = right_c;
+	    right_c = left_c;
+	    left_c  = tmp_c;
+
+	    x = 0x10000 - x;
+	}
+	left_x  += (pos - x);
+	right_x += (pos - x);
+	break;
+
+    default:  /* REPEAT_NONE */
+	for (n = 0; n < count; n++)
+	    if (pos < stops[n].x)
+		break;
+
+	if (n == 0)
+	{
+	    left_x  =  INT32_MIN;
+	    right_x =  stops[0].x;
+	    left_c  = right_c = (pixman_color_t*) &transparent_black;
+	}
+	else if (n == count)
+	{
+	    left_x  = stops[n - 1].x;
+	    right_x = INT32_MAX;
+	    left_c  = right_c = (pixman_color_t*) &transparent_black;
+	}
+	else
+	{
+	    left_x  =  stops[n - 1].x;
+	    right_x =  stops[n].x;
+	    left_c  = &stops[n - 1].color;
+	    right_c = &stops[n].color;
+	}
+    }
+    /* Keep the high byte of each (16-bit) channel, packed as 0x00AA00GG and 0x00RR00BB. */
+    walker->left_x   = left_x;
+    walker->right_x  = right_x;
+    walker->left_ag  = ((left_c->alpha >> 8) << 16)   | (left_c->green >> 8);
+    walker->left_rb  = ((left_c->red & 0xff00) << 8)  | (left_c->blue >> 8);
+    walker->right_ag = ((right_c->alpha >> 8) << 16)  | (right_c->green >> 8);
+    walker->right_rb = ((right_c->red & 0xff00) << 8) | (right_c->blue >> 8);
+
+    if (walker->left_x == walker->right_x                ||
+        ( walker->left_ag == walker->right_ag &&
+          walker->left_rb == walker->right_rb )   )
+    {
+	walker->stepper = 0; /* flat/empty segment; the INT32_MIN/INT32_MAX sentinel segments above always land here */
+    }
+    else
+    {
+	int32_t width = right_x - left_x;
+	walker->stepper = ((1 << 24) + width / 2) / width; /* (1 << 24) / width with a half-width rounding bias */
+    }
+
+    walker->need_reset = FALSE;
+}
+/* True when a reset was requested or x lies outside the cached [left_x, right_x). */
+#define  PIXMAN_GRADIENT_WALKER_NEED_RESET(w, x)                         \
+    ( (w)->need_reset || (x) < (w)->left_x || (x) >= (w)->right_x)
+
+/* Returns the premultiplied 0xAARRGGBB colour at position x; the walker is reset
+ * here first whenever PIXMAN_GRADIENT_WALKER_NEED_RESET (walker, x) holds. */
+uint32_t
+_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
+                               pixman_fixed_32_32_t      x)
+{
+    int dist, idist;
+    uint32_t t1, t2, a, color;
+
+    if (PIXMAN_GRADIENT_WALKER_NEED_RESET (walker, x))
+	_pixman_gradient_walker_reset (walker, x);
+
+    dist  = ((int)(x - walker->left_x) * walker->stepper) >> 16; /* offset in the segment scaled to 0..256 (0 when stepper is 0) */
+    idist = 256 - dist;
+
+    /* combined INTERPOLATE and premultiply */
+    t1 = walker->left_rb * idist + walker->right_rb * dist;
+    t1 = (t1 >> 8) & 0xff00ff;
+
+    t2  = walker->left_ag * idist + walker->right_ag * dist;
+    t2 &= 0xff00ff00;
+
+    color = t2 & 0xff000000; /* interpolated alpha stays in the top byte */
+    a     = t2 >> 24;
+
+    t1  = t1 * a + 0x800080;
+    t1  = (t1 + ((t1 >> 8) & 0xff00ff)) >> 8;
+
+    t2  = (t2 >> 8) * a + 0x800080;
+    t2  = (t2 + ((t2 >> 8) & 0xff00ff));
+
+    return (color | (t1 & 0xff00ff) | (t2 & 0xff00)); /* alpha | red+blue | green */
+}
+
diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
new file mode 100644
index 0000000..afe587f
--- /dev/null
+++ b/pixman/pixman-image.c
@@ -0,0 +1,837 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pixman-private.h"
+/* Copy n_stops stops into a new array stored in gradient->stops; FALSE if allocation fails. */
+pixman_bool_t
+_pixman_init_gradient (gradient_t *                  gradient,
+                       const pixman_gradient_stop_t *stops,
+                       int                           n_stops)
+{
+    return_val_if_fail (n_stops > 0, FALSE);
+
+    gradient->stops = pixman_malloc_ab (n_stops, sizeof (pixman_gradient_stop_t));
+    if (!gradient->stops)
+	return FALSE;
+
+    memcpy (gradient->stops, stops, n_stops * sizeof (pixman_gradient_stop_t)); /* freed later by _pixman_image_fini */
+
+    gradient->n_stops = n_stops;
+
+    return TRUE;
+}
+/* Give the fields of image->common listed below their initial values. */
+void
+_pixman_image_init (pixman_image_t *image)
+{
+    image_common_t *common = &image->common;
+
+    pixman_region32_init (&common->clip_region);
+
+    common->alpha_count = 0;
+    common->have_clip_region = FALSE;
+    common->clip_sources = FALSE;
+    common->transform = NULL;
+    common->repeat = PIXMAN_REPEAT_NONE;
+    common->filter = PIXMAN_FILTER_NEAREST;
+    common->filter_params = NULL;
+    common->n_filter_params = 0;
+    common->alpha_map = NULL;
+    common->component_alpha = FALSE;
+    common->ref_count = 1; /* the creator holds the first reference */
+    common->property_changed = NULL;
+    common->client_clip = FALSE;
+    common->destroy_func = NULL;
+    common->destroy_data = NULL;
+    common->dirty = TRUE; /* flags are computed lazily by _pixman_image_validate() */
+}
+/* Drop one reference; on the last one release what the image owns and return TRUE. */
+pixman_bool_t
+_pixman_image_fini (pixman_image_t *image)
+{
+    image_common_t *common = (image_common_t *)image;
+
+    common->ref_count--;
+
+    if (common->ref_count == 0)
+    {
+	if (image->common.destroy_func)
+	    image->common.destroy_func (image, image->common.destroy_data); /* user callback runs before anything is freed */
+
+	pixman_region32_fini (&common->clip_region);
+
+	if (common->transform)
+	    free (common->transform);
+
+	if (common->filter_params)
+	    free (common->filter_params);
+
+	if (common->alpha_map)
+	    pixman_image_unref ((pixman_image_t *)common->alpha_map);
+
+	if (image->type == LINEAR ||
+	    image->type == RADIAL ||
+	    image->type == CONICAL)
+	{
+	    if (image->gradient.stops)
+		free (image->gradient.stops);
+	}
+
+	if (image->type == BITS && image->bits.free_me)
+	    free (image->bits.free_me);
+
+	return TRUE; /* the image struct itself is not freed here; pixman_image_unref does that */
+    }
+
+    return FALSE;
+}
+/* malloc a pixman_image_t and run _pixman_image_init on it; NULL if malloc fails. */
+pixman_image_t *
+_pixman_image_allocate (void)
+{
+    pixman_image_t *image = malloc (sizeof (pixman_image_t));
+
+    if (image)
+	_pixman_image_init (image); /* only the common part is initialised here */
+
+    return image;
+}
+/* Mark the image dirty so the next _pixman_image_validate() recomputes its info. */
+static void
+image_property_changed (pixman_image_t *image)
+{
+    image->common.dirty = TRUE;
+}
+
+/* Ref Counting: take one more reference and hand the same pointer back */
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_ref (pixman_image_t *image)
+{
+    image->common.ref_count++; /* plain increment: no locking or atomics at this level */
+
+    return image;
+}
+
+/* Drops one reference; returns TRUE when it was the last one and the image was freed */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_unref (pixman_image_t *image)
+{
+    if (_pixman_image_fini (image))
+    {
+	free (image); /* _pixman_image_fini released the contents; this frees the struct */
+	return TRUE;
+    }
+
+    return FALSE;
+}
+/* Register the callback and data that _pixman_image_fini invokes on the last unref. */
+PIXMAN_EXPORT void
+pixman_image_set_destroy_function (pixman_image_t *            image,
+                                   pixman_image_destroy_func_t func,
+                                   void *                      data)
+{
+    image->common.destroy_func = func; /* replaces any previously registered callback */
+    image->common.destroy_data = data;
+}
+/* Return the data pointer stored by pixman_image_set_destroy_function (NULL initially). */
+PIXMAN_EXPORT void *
+pixman_image_get_destroy_data (pixman_image_t *image)
+{
+  return image->common.destroy_data;
+}
+/* Drop the clip by clearing have_clip_region; clip_region itself is left untouched. */
+void
+_pixman_image_reset_clip_region (pixman_image_t *image)
+{
+    image->common.have_clip_region = FALSE;
+}
+
+/* Executive Summary: This function is a no-op that only exists
+ * for historical reasons.
+ *
+ * There used to be a bug in the X server where it would rely on
+ * out-of-bounds accesses when it was asked to composite with a
+ * window as the source. It would create a pixman image pointing
+ * to some bogus position in memory, but then set a clip region
+ * to the position where the actual bits were.
+ *
+ * Due to a bug in old versions of pixman, where it would not clip
+ * against the image bounds when a clip region was set, this would
+ * actually work. So when the pixman bug was fixed, a workaround was
+ * added to allow certain out-of-bounds accesses. This function used
+ * to disable those workarounds.
+ *
+ * Since 0.21.2, pixman doesn't do these workarounds anymore, so now
+ * this function is a no-op.
+ */
+PIXMAN_EXPORT void
+pixman_disable_out_of_bounds_workaround (void)
+{ /* intentionally empty */
+}
+/* Derive common.flags (FAST_PATH_* bits) and common.extended_format_code for the image. */
+static void
+compute_image_info (pixman_image_t *image)
+{
+    pixman_format_code_t code;
+    uint32_t flags = 0;
+
+    /* Transform */
+    if (!image->common.transform)
+    {
+	flags |= (FAST_PATH_ID_TRANSFORM	|
+		  FAST_PATH_X_UNIT_POSITIVE	|
+		  FAST_PATH_Y_UNIT_ZERO		|
+		  FAST_PATH_AFFINE_TRANSFORM);
+    }
+    else
+    {
+	flags |= FAST_PATH_HAS_TRANSFORM;
+
+	if (image->common.transform->matrix[2][0] == 0			&&
+	    image->common.transform->matrix[2][1] == 0			&&
+	    image->common.transform->matrix[2][2] == pixman_fixed_1)
+	{
+	    flags |= FAST_PATH_AFFINE_TRANSFORM;
+
+	    if (image->common.transform->matrix[0][1] == 0 &&
+		image->common.transform->matrix[1][0] == 0)
+	    {
+		if (image->common.transform->matrix[0][0] == -pixman_fixed_1 &&
+		    image->common.transform->matrix[1][1] == -pixman_fixed_1)
+		{
+		    flags |= FAST_PATH_ROTATE_180_TRANSFORM;
+		}
+		flags |= FAST_PATH_SCALE_TRANSFORM;
+	    }
+	    else if (image->common.transform->matrix[0][0] == 0 &&
+	             image->common.transform->matrix[1][1] == 0)
+	    {
+		pixman_fixed_t m01 = image->common.transform->matrix[0][1];
+		if (m01 == -image->common.transform->matrix[1][0])
+		{
+			if (m01 == -pixman_fixed_1)
+			    flags |= FAST_PATH_ROTATE_90_TRANSFORM;
+			else if (m01 == pixman_fixed_1)
+			    flags |= FAST_PATH_ROTATE_270_TRANSFORM;
+		}
+	    }
+	}
+
+	if (image->common.transform->matrix[0][0] > 0)
+	    flags |= FAST_PATH_X_UNIT_POSITIVE;
+
+	if (image->common.transform->matrix[1][0] == 0)
+	    flags |= FAST_PATH_Y_UNIT_ZERO;
+    }
+
+    /* Filter */
+    switch (image->common.filter)
+    {
+    case PIXMAN_FILTER_NEAREST:
+    case PIXMAN_FILTER_FAST:
+	flags |= (FAST_PATH_NEAREST_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
+	break;
+
+    case PIXMAN_FILTER_BILINEAR:
+    case PIXMAN_FILTER_GOOD:
+    case PIXMAN_FILTER_BEST:
+	flags |= (FAST_PATH_BILINEAR_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
+
+	/* Here we have a chance to optimize BILINEAR filter to NEAREST if
+	 * they are equivalent for the currently used transformation matrix.
+	 */
+	if (flags & FAST_PATH_ID_TRANSFORM)
+	{
+	    flags |= FAST_PATH_NEAREST_FILTER;
+	}
+	else if (
+	    /* affine and integer translation components in matrix ... */
+	    ((flags & FAST_PATH_AFFINE_TRANSFORM) &&
+	     !pixman_fixed_frac (image->common.transform->matrix[0][2] |
+				 image->common.transform->matrix[1][2])) &&
+	    (
+		/* ... combined with a simple rotation */
+		(flags & (FAST_PATH_ROTATE_90_TRANSFORM |
+			  FAST_PATH_ROTATE_180_TRANSFORM |
+			  FAST_PATH_ROTATE_270_TRANSFORM)) ||
+		/* ... or combined with a simple non-rotated translation */
+		(image->common.transform->matrix[0][0] == pixman_fixed_1 &&
+		 image->common.transform->matrix[1][1] == pixman_fixed_1 &&
+		 image->common.transform->matrix[0][1] == 0 &&
+		 image->common.transform->matrix[1][0] == 0)
+		)
+	    )
+	{
+	    /* FIXME: there are some affine-test failures, showing that
+	     * handling of BILINEAR and NEAREST filter is not quite
+	     * equivalent when getting close to 32K for the translation
+	     * components of the matrix. That's likely some bug, but for
+	     * now just skip BILINEAR->NEAREST optimization in this case.
+	     */
+	    pixman_fixed_t magic_limit = pixman_int_to_fixed (30000);
+	    if (image->common.transform->matrix[0][2] <= magic_limit  &&
+	        image->common.transform->matrix[1][2] <= magic_limit  &&
+	        image->common.transform->matrix[0][2] >= -magic_limit &&
+	        image->common.transform->matrix[1][2] >= -magic_limit)
+	    {
+		flags |= FAST_PATH_NEAREST_FILTER;
+	    }
+	}
+	break;
+
+    case PIXMAN_FILTER_CONVOLUTION:
+	break; /* the only filter that leaves FAST_PATH_NO_CONVOLUTION_FILTER unset */
+
+    default:
+	flags |= FAST_PATH_NO_CONVOLUTION_FILTER;
+	break;
+    }
+
+    /* Repeat mode */
+    switch (image->common.repeat)
+    {
+    case PIXMAN_REPEAT_NONE:
+	flags |=
+	    FAST_PATH_NO_REFLECT_REPEAT		|
+	    FAST_PATH_NO_PAD_REPEAT		|
+	    FAST_PATH_NO_NORMAL_REPEAT;
+	break;
+
+    case PIXMAN_REPEAT_REFLECT:
+	flags |=
+	    FAST_PATH_NO_PAD_REPEAT		|
+	    FAST_PATH_NO_NONE_REPEAT		|
+	    FAST_PATH_NO_NORMAL_REPEAT;
+	break;
+
+    case PIXMAN_REPEAT_PAD:
+	flags |=
+	    FAST_PATH_NO_REFLECT_REPEAT		|
+	    FAST_PATH_NO_NONE_REPEAT		|
+	    FAST_PATH_NO_NORMAL_REPEAT;
+	break;
+
+    default: /* PIXMAN_REPEAT_NORMAL, and any value not listed above */
+	flags |=
+	    FAST_PATH_NO_REFLECT_REPEAT		|
+	    FAST_PATH_NO_PAD_REPEAT		|
+	    FAST_PATH_NO_NONE_REPEAT;
+	break;
+    }
+
+    /* Component alpha */
+    if (image->common.component_alpha)
+	flags |= FAST_PATH_COMPONENT_ALPHA;
+    else
+	flags |= FAST_PATH_UNIFIED_ALPHA;
+    /* Assume both; the BITS case and the alpha-map check below clear them when untrue. */
+    flags |= (FAST_PATH_NO_ACCESSORS | FAST_PATH_NARROW_FORMAT);
+
+    /* Type specific checks */
+    switch (image->type)
+    {
+    case SOLID:
+	code = PIXMAN_solid;
+
+	if (image->solid.color.alpha == 0xffff)
+	    flags |= FAST_PATH_IS_OPAQUE;
+	break;
+
+    case BITS:
+	if (image->bits.width == 1	&&
+	    image->bits.height == 1	&&
+	    image->common.repeat != PIXMAN_REPEAT_NONE)
+	{
+	    code = PIXMAN_solid; /* a 1x1 image with any repeat mode is classified as solid */
+	}
+	else
+	{
+	    code = image->bits.format;
+	    flags |= FAST_PATH_BITS_IMAGE;
+	}
+
+	if (!PIXMAN_FORMAT_A (image->bits.format)				&&
+	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_GRAY		&&
+	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_COLOR)
+	{
+	    flags |= FAST_PATH_SAMPLES_OPAQUE;
+
+	    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+		flags |= FAST_PATH_IS_OPAQUE;
+	}
+
+	if (image->bits.read_func || image->bits.write_func)
+	    flags &= ~FAST_PATH_NO_ACCESSORS;
+
+	if (PIXMAN_FORMAT_IS_WIDE (image->bits.format))
+	    flags &= ~FAST_PATH_NARROW_FORMAT;
+	break;
+
+    case RADIAL:
+	code = PIXMAN_unknown;
+
+	/*
+	 * As explained in pixman-radial-gradient.c, every point of
+	 * the plane has a valid associated radius (and thus will be
+	 * colored) if and only if a is negative (i.e. one of the two
+	 * circles contains the other one).
+	 */
+
+        if (image->radial.a >= 0)
+	    break;
+
+	/* Fall through */
+
+    case CONICAL:
+    case LINEAR:
+	code = PIXMAN_unknown;
+
+	if (image->common.repeat != PIXMAN_REPEAT_NONE)
+	{
+	    int i;
+
+	    flags |= FAST_PATH_IS_OPAQUE; /* provisional: withdrawn below on the first non-opaque stop */
+	    for (i = 0; i < image->gradient.n_stops; ++i)
+	    {
+		if (image->gradient.stops[i].color.alpha != 0xffff)
+		{
+		    flags &= ~FAST_PATH_IS_OPAQUE;
+		    break;
+		}
+	    }
+	}
+	break;
+
+    default:
+	code = PIXMAN_unknown;
+	break;
+    }
+
+    /* Alpha map */
+    if (!image->common.alpha_map)
+    {
+	flags |= FAST_PATH_NO_ALPHA_MAP;
+    }
+    else
+    {
+	if (PIXMAN_FORMAT_IS_WIDE (image->common.alpha_map->format))
+	    flags &= ~FAST_PATH_NARROW_FORMAT;
+    }
+
+    /* Both alpha maps and convolution filters can introduce
+     * non-opaqueness in otherwise opaque images. Also
+     * an image with component alpha turned on is only opaque
+     * if all channels are opaque, so we simply turn it off
+     * unconditionally for those images.
+     */
+    if (image->common.alpha_map					||
+	image->common.filter == PIXMAN_FILTER_CONVOLUTION	||
+	image->common.component_alpha)
+    {
+	flags &= ~(FAST_PATH_IS_OPAQUE | FAST_PATH_SAMPLES_OPAQUE);
+    }
+
+    image->common.flags = flags;
+    image->common.extended_format_code = code;
+}
+/* Recompute cached info if the image is dirty, then validate its alpha map, if any. */
+void
+_pixman_image_validate (pixman_image_t *image)
+{
+    if (image->common.dirty)
+    {
+	compute_image_info (image);
+
+	/* It is important that property_changed is
+	 * called *after* compute_image_info() because
+	 * property_changed() can make use of the flags
+	 * to set up accessors etc.
+	 */
+	if (image->common.property_changed)
+	    image->common.property_changed (image);
+
+	image->common.dirty = FALSE;
+    }
+
+    if (image->common.alpha_map) /* checked even when this image itself was clean */
+	_pixman_image_validate ((pixman_image_t *)image->common.alpha_map);
+}
+/* Copy `region` into the image clip (NULL removes the clip); FALSE if the copy fails. */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_clip_region32 (pixman_image_t *   image,
+                                pixman_region32_t *region)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (region)
+    {
+	if ((result = pixman_region32_copy (&common->clip_region, region)))
+	    image->common.have_clip_region = TRUE;
+    }
+    else
+    {
+	_pixman_image_reset_clip_region (image);
+
+	result = TRUE;
+    }
+
+    image_property_changed (image); /* runs on the failure path as well */
+
+    return result;
+}
+/* 16-bit-region variant: copy `region` into the 32-bit clip (NULL removes the clip). */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_clip_region (pixman_image_t *   image,
+                              pixman_region16_t *region)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (region)
+    {
+	if ((result = pixman_region32_copy_from_region16 (&common->clip_region, region)))
+	    image->common.have_clip_region = TRUE;
+    }
+    else
+    {
+	_pixman_image_reset_clip_region (image);
+
+	result = TRUE;
+    }
+
+    image_property_changed (image); /* runs on the failure path as well */
+
+    return result;
+}
+/* Record the caller's client_clip flag; unlike most setters, the image is not marked dirty. */
+PIXMAN_EXPORT void
+pixman_image_set_has_client_clip (pixman_image_t *image,
+                                  pixman_bool_t   client_clip)
+{
+    image->common.client_clip = client_clip;
+}
+/* Set (or, for NULL/identity, clear) the image transform; FALSE only if allocation fails. */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_transform (pixman_image_t *          image,
+                            const pixman_transform_t *transform)
+{
+    static const pixman_transform_t id =
+    {
+	{ { pixman_fixed_1, 0, 0 },
+	  { 0, pixman_fixed_1, 0 },
+	  { 0, 0, pixman_fixed_1 } }
+    };
+
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (common->transform == transform)
+	return TRUE;
+
+    if (!transform || memcmp (&id, transform, sizeof (pixman_transform_t)) == 0)
+    {
+	free (common->transform); /* identity is stored as "no transform" */
+	common->transform = NULL;
+	result = TRUE;
+
+	goto out;
+    }
+    /* Same matrix as the stored one: nothing changes, so the image is not marked dirty. */
+    if (common->transform &&
+	memcmp (common->transform, transform, sizeof (pixman_transform_t)) == 0)
+    {
+	return TRUE;
+    }
+    /* Reuse the existing allocation when there is one. */
+    if (common->transform == NULL)
+	common->transform = malloc (sizeof (pixman_transform_t));
+
+    if (common->transform == NULL)
+    {
+	result = FALSE;
+
+	goto out;
+    }
+
+    memcpy (common->transform, transform, sizeof(pixman_transform_t));
+
+    result = TRUE;
+
+out:
+    image_property_changed (image);
+
+    return result;
+}
+/* Change the repeat mode; when it is unchanged, return without marking the image dirty. */
+PIXMAN_EXPORT void
+pixman_image_set_repeat (pixman_image_t *image,
+                         pixman_repeat_t repeat)
+{
+    if (image->common.repeat == repeat)
+	return;
+
+    image->common.repeat = repeat;
+
+    image_property_changed (image);
+}
+/* Set the filter and a private copy of its parameters; FALSE if that copy cannot be allocated. */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_filter (pixman_image_t *      image,
+                         pixman_filter_t       filter,
+                         const pixman_fixed_t *params,
+                         int                   n_params)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_fixed_t *new_params;
+    /* Pointer comparison only: both NULL, or the caller passed the stored array back. */
+    if (params == common->filter_params && filter == common->filter)
+	return TRUE;
+
+    new_params = NULL;
+    if (params)
+    {
+	new_params = pixman_malloc_ab (n_params, sizeof (pixman_fixed_t));
+	if (!new_params)
+	    return FALSE; /* image is left unchanged on failure */
+
+	memcpy (new_params,
+	        params, n_params * sizeof (pixman_fixed_t));
+    }
+
+    common->filter = filter;
+
+    if (common->filter_params)
+	free (common->filter_params);
+
+    common->filter_params = new_params;
+    common->n_filter_params = n_params; /* stored as given, even when params is NULL */
+
+    image_property_changed (image);
+    return TRUE;
+}
+
+/* Update the clip_sources flag; a no-op when it already has that value. */
+PIXMAN_EXPORT void
+pixman_image_set_source_clipping (pixman_image_t *image,
+                                  pixman_bool_t   clip_sources)
+{
+    if (image->common.clip_sources != clip_sources)
+    {
+	image->common.clip_sources = clip_sources;
+	image_property_changed (image);
+    }
+}
+
+/* Unlike all the other property setters, this function does not
+ * copy the content of indexed. Doing this copying is simply
+ * way, way too expensive.
+ */
+PIXMAN_EXPORT void
+pixman_image_set_indexed (pixman_image_t *        image,
+                          const pixman_indexed_t *indexed)
+{
+    bits_image_t *bits = (bits_image_t *)image;
+
+    /* Only the pointer is stored, so skip the notification when it is unchanged. */
+    if (bits->indexed != indexed)
+    {
+	bits->indexed = indexed;
+	image_property_changed (image);
+    }
+}
+
+/* Attach 'alpha_map' (a BITS image, or NULL to detach) to the image and
+ * record the origin (x, y).  Requests that would chain alpha maps are
+ * silently ignored.
+ */
+PIXMAN_EXPORT void
+pixman_image_set_alpha_map (pixman_image_t *image,
+                            pixman_image_t *alpha_map,
+                            int16_t         x,
+                            int16_t         y)
+{
+    image_common_t *common = (image_common_t *)image;
+
+    return_if_fail (!alpha_map || alpha_map->type == BITS);
+
+    if (alpha_map)
+    {
+	/* An image that already serves as an alpha map
+	 * cannot be given an alpha map of its own.
+	 */
+	if (common->alpha_count > 0)
+	    return;
+
+	/* An image that has an alpha map of its own
+	 * cannot serve as an alpha map itself.
+	 */
+	if (alpha_map->common.alpha_map)
+	    return;
+    }
+
+    if (common->alpha_map != (bits_image_t *)alpha_map)
+    {
+	/* Drop the old map's use count and reference before taking the new one. */
+	if (common->alpha_map)
+	{
+	    common->alpha_map->common.alpha_count--;
+	    pixman_image_unref ((pixman_image_t *)common->alpha_map);
+	}
+
+	if (alpha_map)
+	{
+	    common->alpha_map = (bits_image_t *)pixman_image_ref (alpha_map);
+	    common->alpha_map->common.alpha_count++;
+	}
+	else
+	    common->alpha_map = NULL;
+    }
+
+    /* The origin is stored, and the change notified, even for the same map. */
+    common->alpha_origin_x = x;
+    common->alpha_origin_y = y;
+    image_property_changed (image);
+}
+
+/* Update the component_alpha flag; a no-op when it already has that value. */
+PIXMAN_EXPORT void
+pixman_image_set_component_alpha   (pixman_image_t *image,
+                                    pixman_bool_t   component_alpha)
+{
+    if (image->common.component_alpha != component_alpha)
+    {
+	image->common.component_alpha = component_alpha;
+	image_property_changed (image);
+    }
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_get_component_alpha   (pixman_image_t       *image)
+{
+    return image->common.component_alpha; /* flag written by pixman_image_set_component_alpha() */
+}
+
+/* Install the read/write accessor callbacks; non-BITS images are left untouched. */
+PIXMAN_EXPORT void
+pixman_image_set_accessors (pixman_image_t *           image,
+                            pixman_read_memory_func_t  read_func,
+                            pixman_write_memory_func_t write_func)
+{
+    return_if_fail (image != NULL);
+
+    if (image->type != BITS)
+	return;
+
+    image->bits.read_func = read_func;
+    image->bits.write_func = write_func;
+    image_property_changed (image);
+}
+
+/* The bits pointer of a BITS image; NULL for every other image type. */
+PIXMAN_EXPORT uint32_t *
+pixman_image_get_data (pixman_image_t *image)
+{
+    if (image->type != BITS)
+	return NULL;
+    return image->bits.bits;
+}
+
+/* Width of a BITS image; 0 for every other image type. */
+PIXMAN_EXPORT int
+pixman_image_get_width (pixman_image_t *image)
+{
+    const int is_bits = (image->type == BITS);
+
+    return is_bits ? image->bits.width : 0;
+}
+
+/* Height of a BITS image; 0 for every other image type. */
+PIXMAN_EXPORT int
+pixman_image_get_height (pixman_image_t *image)
+{
+    const int is_bits = (image->type == BITS);
+
+    return is_bits ? image->bits.height : 0;
+}
+
+/* rowstride of a BITS image scaled by sizeof (uint32_t); 0 for other types. */
+PIXMAN_EXPORT int
+pixman_image_get_stride (pixman_image_t *image)
+{
+    if (image->type != BITS)
+	return 0;
+    return image->bits.rowstride * (int) sizeof (uint32_t);
+}
+
+/* PIXMAN_FORMAT_DEPTH of a BITS image's format; 0 for other image types. */
+PIXMAN_EXPORT int
+pixman_image_get_depth (pixman_image_t *image)
+{
+    if (image->type != BITS)
+	return 0;
+    return PIXMAN_FORMAT_DEPTH (image->bits.format);
+}
+
+/* Format code of a BITS image; 0 for every other image type. */
+PIXMAN_EXPORT pixman_format_code_t
+pixman_image_get_format (pixman_image_t *image)
+{
+    if (image->type != BITS)
+	return 0;
+    return image->bits.format;
+}
+
+/* Fetch the single pixel at (0, 0) of 'image' through a narrow source
+ * iterator.  The red and blue channels are swapped whenever 'format'
+ * is not of type PIXMAN_TYPE_ARGB. */
+uint32_t
+_pixman_image_get_solid (pixman_implementation_t *imp,
+			 pixman_image_t *         image,
+                         pixman_format_code_t     format)
+{
+    pixman_iter_t iter;
+    uint32_t pixel;
+
+    /* 'pixel' doubles as the iterator's one-pixel scanline buffer. */
+    _pixman_implementation_src_iter_init (
+	imp, &iter, image, 0, 0, 1, 1, (uint8_t *)&pixel, ITER_NARROW);
+
+    pixel = *iter.get_scanline (&iter, NULL);
+
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_ARGB)
+	return pixel;
+
+    /* Keep alpha and green in place; exchange bits 16-23 with bits 0-7. */
+    return (pixel & 0xff00ff00) |
+	   ((pixel & 0x00ff0000) >> 16) |
+	   ((pixel & 0x000000ff) << 16);
+}
diff --git a/pixman/pixman-implementation.c b/pixman/pixman-implementation.c
new file mode 100644
index 0000000..2b7b19d
--- /dev/null
+++ b/pixman/pixman-implementation.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright © 2009 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+/* Default combine_32 hook installed by _pixman_implementation_create():
+ * forwards the call, arguments unchanged, to imp->delegate. */
+static void
+delegate_combine_32 (pixman_implementation_t *imp, pixman_op_t op,
+                     uint32_t *dest, const uint32_t *src,
+                     const uint32_t *mask, int width)
+{
+    pixman_implementation_t *next = imp->delegate;
+
+    _pixman_implementation_combine_32 (next, op, dest, src, mask, width);
+}
+
+/* Default combine_64 hook installed by _pixman_implementation_create():
+ * forwards the call, arguments unchanged, to imp->delegate. */
+static void
+delegate_combine_64 (pixman_implementation_t *imp, pixman_op_t op,
+                     uint64_t *dest, const uint64_t *src,
+                     const uint64_t *mask, int width)
+{
+    pixman_implementation_t *next = imp->delegate;
+
+    _pixman_implementation_combine_64 (next, op, dest, src, mask, width);
+}
+
+/* Default combine_32_ca hook installed by _pixman_implementation_create():
+ * forwards the call, arguments unchanged, to imp->delegate. */
+static void
+delegate_combine_32_ca (pixman_implementation_t *imp, pixman_op_t op,
+                        uint32_t *dest, const uint32_t *src,
+                        const uint32_t *mask, int width)
+{
+    pixman_implementation_t *next = imp->delegate;
+
+    _pixman_implementation_combine_32_ca (next, op, dest, src, mask, width);
+}
+
+/* Default combine_64_ca hook installed by _pixman_implementation_create():
+ * forwards the call, arguments unchanged, to imp->delegate. */
+static void
+delegate_combine_64_ca (pixman_implementation_t *imp, pixman_op_t op,
+                        uint64_t *dest, const uint64_t *src,
+                        const uint64_t *mask, int width)
+{
+    pixman_implementation_t *next = imp->delegate;
+
+    _pixman_implementation_combine_64_ca (next, op, dest, src, mask, width);
+}
+
+/* Default blt hook: forward every argument unchanged to imp->delegate
+ * and pass its result back to the caller.
+ */
+static pixman_bool_t
+delegate_blt (pixman_implementation_t * imp,
+              uint32_t *                src_bits,
+              uint32_t *                dst_bits,
+              int src_stride, int dst_stride,
+              int src_bpp,    int dst_bpp,
+              int src_x,      int src_y,
+              int dest_x,     int dest_y,
+              int width,      int height)
+{
+    pixman_implementation_t *next = imp->delegate;
+
+    return _pixman_implementation_blt (
+	next, src_bits, dst_bits, src_stride, dst_stride,
+	src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y,
+	width, height);
+}
+
+/* Default fill hook: forward every argument unchanged to imp->delegate. */
+static pixman_bool_t
+delegate_fill (pixman_implementation_t *imp,
+               uint32_t *               bits,
+               int stride, int bpp,
+               int x,      int y,
+               int width,  int height,
+               uint32_t                 xor)
+{
+    pixman_implementation_t *next = imp->delegate;
+
+    return _pixman_implementation_fill (
+	next, bits, stride, bpp, x, y, width, height, xor);
+}
+
+static void
+delegate_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_implementation_t *next = imp->delegate;
+    (*next->src_iter_init) (next, iter); /* the delegate's own hook, called directly */
+}
+
+static void
+delegate_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_implementation_t *next = imp->delegate;
+    (*next->dest_iter_init) (next, iter); /* the delegate's own hook, called directly */
+}
+
+/* Allocate an implementation whose hooks all forward to 'delegate', make it
+ * the toplevel of the whole delegate chain, and attach 'fast_paths'.
+ * Returns NULL if the allocation fails. */
+pixman_implementation_t *
+_pixman_implementation_create (pixman_implementation_t *delegate,
+			       const pixman_fast_path_t *fast_paths)
+{
+    pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t));
+    pixman_implementation_t *link;
+    int op;
+
+    if (imp == NULL)
+	return NULL;
+
+    assert (fast_paths);
+    imp->delegate = delegate;
+    imp->fast_paths = fast_paths;
+
+    /* The new object and every implementation below it now report to 'imp'. */
+    for (link = imp; link != NULL; link = link->delegate)
+	link->toplevel = imp;
+
+    /* Every hook starts out as a plain forwarder to the delegate. */
+    imp->blt = delegate_blt;
+    imp->fill = delegate_fill;
+    imp->src_iter_init = delegate_src_iter_init;
+    imp->dest_iter_init = delegate_dest_iter_init;
+    for (op = 0; op < PIXMAN_N_OPERATORS; ++op)
+    {
+	imp->combine_32[op] = delegate_combine_32;
+	imp->combine_64[op] = delegate_combine_64;
+	imp->combine_32_ca[op] = delegate_combine_32_ca;
+	imp->combine_64_ca[op] = delegate_combine_64_ca;
+    }
+
+    return imp;
+}
+
+/* Dispatch to the combine_32 hook that 'imp' has registered for 'op'. */
+void
+_pixman_implementation_combine_32 (pixman_implementation_t *imp,
+                                   pixman_op_t              op,
+                                   uint32_t *dest, const uint32_t *src,
+                                   const uint32_t *mask, int width)
+{
+    /* 'op' indexes the per-operator hook table. */
+    imp->combine_32[op] (imp, op, dest, src, mask, width);
+}
+
+/* Dispatch to the combine_64 hook that 'imp' has registered for 'op'. */
+void
+_pixman_implementation_combine_64 (pixman_implementation_t *imp,
+                                   pixman_op_t              op,
+                                   uint64_t *dest, const uint64_t *src,
+                                   const uint64_t *mask, int width)
+{
+    /* 'op' indexes the per-operator hook table. */
+    imp->combine_64[op] (imp, op, dest, src, mask, width);
+}
+
+/* Dispatch to the combine_32_ca hook that 'imp' has registered for 'op'. */
+void
+_pixman_implementation_combine_32_ca (pixman_implementation_t *imp,
+                                      pixman_op_t              op,
+                                      uint32_t *dest, const uint32_t *src,
+                                      const uint32_t *mask, int width)
+{
+    /* 'op' indexes the per-operator hook table. */
+    imp->combine_32_ca[op] (imp, op, dest, src, mask, width);
+}
+
+/* Dispatch to the combine_64_ca hook that 'imp' has registered for 'op'. */
+void
+_pixman_implementation_combine_64_ca (pixman_implementation_t *imp,
+                                      pixman_op_t              op,
+                                      uint64_t *dest, const uint64_t *src,
+                                      const uint64_t *mask, int width)
+{
+    /* 'op' indexes the per-operator hook table. */
+    imp->combine_64_ca[op] (imp, op, dest, src, mask, width);
+}
+
+/* Run this implementation's blt hook and return its result. */
+pixman_bool_t
+_pixman_implementation_blt (pixman_implementation_t * imp,
+                            uint32_t *                src_bits,
+                            uint32_t *                dst_bits,
+                            int                       src_stride,
+                            int                       dst_stride,
+                            int src_bpp, int dst_bpp,
+                            int src_x,   int src_y,
+                            int dest_x,  int dest_y,
+                            int width,   int height)
+{
+    pixman_bool_t status;
+
+    status = imp->blt (imp, src_bits, dst_bits, src_stride, dst_stride,
+		       src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y,
+		       width, height);
+    return status;
+}
+
+/* Run this implementation's fill hook and return its result. */
+pixman_bool_t
+_pixman_implementation_fill (pixman_implementation_t *imp,
+                             uint32_t *               bits,
+                             int stride, int bpp,
+                             int x,      int y,
+                             int width,  int height,
+                             uint32_t                 xor)
+{
+    pixman_bool_t status = imp->fill (imp, bits, stride, bpp, x, y, width, height, xor);
+
+    return status;
+}
+
+/* Fill in the fields of 'iter' from the arguments, then invoke the
+ * implementation's src_iter_init hook on it.
+ */
+void
+_pixman_implementation_src_iter_init (pixman_implementation_t	*imp,
+				      pixman_iter_t             *iter,
+				      pixman_image_t		*image,
+				      int x, int y, int width, int height,
+				      uint8_t			*buffer,
+				      iter_flags_t		 flags)
+{
+    iter->image  = image;
+    iter->flags  = flags;
+    iter->buffer = (uint32_t *)buffer;
+    iter->x      = x;
+    iter->y      = y;
+    iter->width  = width;
+    iter->height = height;
+
+    imp->src_iter_init (imp, iter);
+}
+
+/* Fill in the fields of 'iter' from the arguments, then invoke the
+ * implementation's dest_iter_init hook on it.
+ */
+void
+_pixman_implementation_dest_iter_init (pixman_implementation_t	*imp,
+				       pixman_iter_t            *iter,
+				       pixman_image_t		*image,
+				       int x, int y, int width, int height,
+				       uint8_t			*buffer,
+				       iter_flags_t		 flags)
+{
+    iter->image  = image;
+    iter->flags  = flags;
+    iter->buffer = (uint32_t *)buffer;
+    iter->x      = x;
+    iter->y      = y;
+    iter->width  = width;
+    iter->height = height;
+
+    imp->dest_iter_init (imp, iter);
+}
diff --git a/pixman/pixman-inlines.h b/pixman/pixman-inlines.h
new file mode 100644
index 0000000..3532867
--- /dev/null
+++ b/pixman/pixman-inlines.h
@@ -0,0 +1,1280 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifndef PIXMAN_FAST_PATH_H__
+#define PIXMAN_FAST_PATH_H__
+
+#include "pixman-private.h"
+
+#define PIXMAN_REPEAT_COVER -1
+
+/* Flags describing input parameters to fast path macro template.
+ * Turning on some flag values may indicate that
+ * "some property X is available so template can use this" or
+ * "some property X should be handled by template".
+ *
+ * FLAG_HAVE_SOLID_MASK
+ *  Input mask is solid so template should handle this.
+ *
+ * FLAG_HAVE_NON_SOLID_MASK
+ *  Input mask is bits mask so template should handle this.
+ *
+ * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually
+ * exclusive. (It's not allowed to turn both flags on)
+ */
+#define FLAG_NONE				(0)
+#define FLAG_HAVE_SOLID_MASK			(1 <<   1)
+#define FLAG_HAVE_NON_SOLID_MASK		(1 <<   2)
+
+/* To avoid calling the scanline function repeatedly for very short
+ * runs, source scanlines narrower than the constant below are extended.
+ */
+#define REPEAT_NORMAL_MIN_WIDTH			64
+
+/* Apply repeat mode 'repeat' to coordinate *c on an axis of 'size' samples.
+ * Returns FALSE only for PIXMAN_REPEAT_NONE with *c outside [0, size). */
+static force_inline pixman_bool_t
+repeat (pixman_repeat_t repeat, int *c, int size)
+{
+    switch (repeat)
+    {
+    case PIXMAN_REPEAT_NONE:
+	return (*c >= 0 && *c < size) ? TRUE : FALSE;
+    case PIXMAN_REPEAT_NORMAL:
+	while (*c >= size)
+	    *c -= size;
+	while (*c < 0)
+	    *c += size;
+	return TRUE;
+
+    case PIXMAN_REPEAT_PAD:
+	*c = CLIP (*c, 0, size - 1);
+	return TRUE;
+
+    default: /* REFLECT */
+	*c = MOD (*c, size * 2);
+	if (*c >= size)
+	    *c = size * 2 - *c - 1;
+	return TRUE;
+    }
+}
+
+#if SIZEOF_LONG > 4
+/* Weighted average of four pixels (8 bits per channel); the four weights sum to 256 * 256. */
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    uint64_t distxy, distxiy, distixy, distixiy;
+    uint64_t tl64, tr64, bl64, br64;
+    uint64_t f, r;
+    /* Weights applied to br, tr, bl and tl respectively. */
+    distxy = distx * disty;
+    distxiy = distx * (256 - disty);
+    distixy = (256 - distx) * disty;
+    distixiy = (256 - distx) * (256 - disty);
+
+    /* Alpha and Blue */
+    tl64 = tl & 0xff0000ff;
+    tr64 = tr & 0xff0000ff;
+    bl64 = bl & 0xff0000ff;
+    br64 = br & 0xff0000ff;
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r = f & 0x0000ff0000ff0000ull; /* kept bytes sit 16 bits above the input channels */
+
+    /* Red and Green (red is first moved up 16 bits, green stays in place) */
+    tl64 = tl;
+    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
+
+    tr64 = tr;
+    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+    bl64 = bl;
+    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+    br64 = br;
+    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
+    /* Shift out the low 16 bits so the four channels land back in 32 bits. */
+    return (uint32_t)(r >> 16);
+}
+
+#else
+/* 32-bit-only variant: same weighted average of four pixels, one channel at a time. */
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    int distxy, distxiy, distixy, distixiy;
+    uint32_t f, r;
+    /* Weights applied to br, tr, bl and tl; only the first needs a multiply. */
+    distxy = distx * disty;
+    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
+    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
+    distixiy =
+	256 * 256 - (disty << 8) -
+	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
+
+    /* Blue */
+    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+
+    /* Green */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+    /* Bring the upper two input channels down, and the blue/green result with them. */
+    tl >>= 16;
+    tr >>= 16;
+    bl >>= 16;
+    br >>= 16;
+    r >>= 16;
+
+    /* Red */
+    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+    r |= f & 0x00ff0000;
+
+    /* Alpha */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    return r;
+}
+
+#endif
+
+/*
+ * For each scanline fetched from source image with PAD repeat:
+ * - calculate how many pixels need to be padded on the left side
+ * - calculate how many pixels need to be padded on the right side
+ * - update width to only count pixels which are fetched from the image
+ * All this information is returned via 'width', 'left_pad', 'right_pad'
+ * arguments. The code is assuming that 'unit_x' is positive.
+ *
+ * Note: 64-bit math is used in order to avoid potential overflows, which
+ *       is probably excessive in many cases. This particular function
+ *       may need its own correctness test and performance tuning.
+ */
+static force_inline void
+pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+				pixman_fixed_t  vx,
+				pixman_fixed_t  unit_x,
+				int32_t *       width,
+				int32_t *       left_pad,
+				int32_t *       right_pad)
+{
+    const int64_t max_vx = (int64_t) source_image_width << 16;
+    int32_t inside = *width;	/* pixels still to be fetched from the image */
+    int32_t before = 0;		/* pixels padded on the left side */
+    int32_t after = 0;		/* pixels padded on the right side */
+    int64_t steps;
+
+    if (vx < 0)
+    {
+	/* Number of unit_x steps (rounded up) needed before vx is no
+	 * longer negative; capped at the whole scanline. */
+	steps = ((int64_t) unit_x - 1 - vx) / unit_x;
+	before = (steps > inside) ? inside : (int32_t) steps;
+	inside -= before;
+    }
+
+    /* Number of unit_x steps from the original vx until max_vx is
+     * reached, minus the pixels already assigned to the left pad:
+     * what is left is the count of pixels that hit the image. */
+    steps = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - before;
+    if (steps < 0)
+    {
+	after = inside;
+	inside = 0;
+    }
+    else if (steps < inside)
+    {
+	after = inside - (int32_t) steps;
+	inside = (int32_t) steps;
+    }
+    /* Otherwise every remaining pixel is inside the image: no right pad. */
+
+    /* Publish the three results. */
+    *left_pad = before;
+    *right_pad = after;
+    *width = inside;
+}
+
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions, one that handles repeat normal,
+ * and one without repeat handling that only works if the src region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instruction latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
+
+#define GET_8888_ALPHA(s) ((s) >> 24)
+ /* This is not actually used since we don't have an OVER with
+    565 source, but it is needed to build. */
+#define GET_0565_ALPHA(s) 0xff
+#define GET_x888_ALPHA(s) 0xff
+/* Defines scanline_func_name(): nearest scaling of w pixels from src into dst with operator OP (SRC or OVER). */
+#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
+			      src_type_t, dst_type_t, OP, repeat_mode)				\
+static force_inline void									\
+scanline_func_name (dst_type_t       *dst,							\
+		    const src_type_t *src,							\
+		    int32_t           w,							\
+		    pixman_fixed_t    vx,							\
+		    pixman_fixed_t    unit_x,							\
+		    pixman_fixed_t    max_vx,							\
+		    pixman_bool_t     fully_transparent_src)					\
+{												\
+	uint32_t   d;										\
+	src_type_t s1, s2;									\
+	uint8_t    a1, a2;									\
+	int        x1, x2;									\
+	/* Under OVER a fully transparent source leaves dst unchanged. */			\
+	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)			\
+	    return;										\
+	/* Only the SRC and OVER operators are implemented by this template. */		\
+	if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
+	    abort();										\
+	/* Main loop: two destination pixels per iteration. */					\
+	while ((w -= 2) >= 0)									\
+	{											\
+	    x1 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s1 = src[x1];									\
+												\
+	    x2 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s2 = src[x2];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+												\
+		if (a2 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+		}										\
+		else if (s2)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2);				\
+		    a2 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+	    }											\
+	}											\
+	/* w is now -1 if the original width was odd: one pixel is left. */			\
+	if (w & 1)										\
+	{											\
+	    x1 = vx >> 16;									\
+	    s1 = src[x1];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+	    }											\
+	}											\
+}
+/* Defines fast_composite_scaled_nearest<scale_func_name>(): walks the destination rows and calls scanline_func on each. */
+#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
+static void											\
+fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\
+						   pixman_composite_info_t *info)               \
+{												\
+    PIXMAN_COMPOSITE_ARGS (info);					                        \
+    dst_type_t *dst_line;						                        \
+    mask_type_t *mask_line;									\
+    src_type_t *src_first_line;									\
+    int       y;										\
+    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
+    pixman_fixed_t max_vy;									\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, right_pad;								\
+												\
+    src_type_t *src;										\
+    dst_type_t *dst;										\
+    mask_type_t solid_mask;									\
+    const mask_type_t *mask = &solid_mask;							\
+    int src_stride, mask_stride, dst_stride;							\
+												\
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
+    if (have_mask)										\
+    {												\
+	if (mask_is_solid)									\
+	    solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
+	else											\
+	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\
+				   mask_stride, mask_line, 1);					\
+    }												\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+    /* Run the reference point through the source image's transform. */			\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+    /* Per-pixel and per-row source steps are the transform's diagonal entries. */		\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
+    v.vector[0] -= pixman_fixed_e;								\
+    v.vector[1] -= pixman_fixed_e;								\
+												\
+    vx = v.vector[0];										\
+    vy = v.vector[1];										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
+    {												\
+	/* Clamp repeating positions inside the actual samples */				\
+	max_vx = src_image->bits.width << 16;							\
+	max_vy = src_image->bits.height << 16;							\
+												\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
+	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+    }												\
+    /* PAD and NONE: split each row into left pad, image pixels and right pad. */		\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
+	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
+    {												\
+	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
+					&width, &left_pad, &right_pad);				\
+	vx += left_pad * unit_x;								\
+    }												\
+    /* One destination row per iteration. */							\
+    while (--height >= 0)									\
+    {												\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+	if (have_mask && !mask_is_solid)							\
+	{											\
+	    mask = mask_line;									\
+	    mask_line += mask_stride;								\
+	}											\
+	/* Source row for this destination row; then step vy to the next one. */		\
+	y = vy >> 16;										\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE);			\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
+			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
+			       dst + left_pad + width, src + src_image->bits.width - 1,		\
+			       right_pad, 0, 0, 0, FALSE);					\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
+	{											\
+	    static const src_type_t zero[1] = { 0 };						\
+	    if (y < 0 || y >= src_image->bits.height)						\
+	    {											\
+		scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE);	\
+		continue;									\
+	    }											\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE);			\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
+			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
+			       dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE);		\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    src = src_first_line + src_stride * y;						\
+	    scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE);			\
+	}											\
+    }												\
+}
+
+/* Forwards to FAST_NEAREST_MAINLOOP_INT with '_' pasted in front of scale_func_name. A workaround for old Sun Studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
+	FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)
+
+#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func,				\
+				     src_type_t, dst_type_t, repeat_mode)			\
+    /* Adapter: takes the mask-first argument list and forwards everything but the mask */	\
+    static force_inline void scanline_func##scale_func_name##_wrapper (			\
+	const uint8_t    *unused_mask,								\
+	dst_type_t       *dst_row,								\
+	const src_type_t *src_row,								\
+	int32_t           n_pixels,								\
+	pixman_fixed_t    x_start,								\
+	pixman_fixed_t    x_step,								\
+	pixman_fixed_t    x_limit,								\
+	pixman_bool_t     src_is_transparent)							\
+    {												\
+	scanline_func (dst_row, src_row, n_pixels, x_start, x_step, x_limit, src_is_transparent); \
+    }												\
+    FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper,	\
+			       src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE)
+
+#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,		\
+			      repeat_mode)							\
+	FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t,		\
+			      dst_type_t, repeat_mode) /* same as _NOMASK, with '_' pasted in front of scale_func_name */
+
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,				\
+		     src_type_t, dst_type_t, OP, repeat_mode)				\
+    FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
+			  SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,		\
+			  OP, repeat_mode)						\
+    FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP,			\
+			  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
+			  src_type_t, dst_type_t, repeat_mode) /* main loop is handed the same scaled_nearest_scanline_<name>_<OP> identifier that was passed to FAST_NEAREST_SCANLINE */
+
+
+#define SCALED_NEAREST_FLAGS	/* used by every SIMPLE_NEAREST_*FAST_PATH_* entry below */	\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_NEAREST_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op, s, d, func)			\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_NORMAL_REPEAT |		\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,				/* no mask */		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal_ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_PAD(op, s, d, func)			\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_PAD_REPEAT |			\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,				/* no mask */		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad_ ## op,		\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_NONE(op, s, d, func)			\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_NONE_REPEAT |			\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,				/* no mask */		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none_ ## op,		\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_COVER(op, s, d, func)			\
+    {									\
+	PIXMAN_OP_ ## op, PIXMAN_ ## s,					\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST),	\
+	PIXMAN_null, 0,				/* no mask */		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover_ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op, s, d, func)		\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_NORMAL_REPEAT |		\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), /* a8 mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal_ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op, s, d, func)		\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_PAD_REPEAT |			\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), /* a8 mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad_ ## op,		\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op, s, d, func)		\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_NONE_REPEAT |			\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), /* a8 mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none_ ## op,		\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op, s, d, func)		\
+    {									\
+	PIXMAN_OP_ ## op, PIXMAN_ ## s,					\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST),	\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), /* a8 mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover_ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op, s, d, func)	\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_NORMAL_REPEAT |		\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), /* solid mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal_ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op, s, d, func)		\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_PAD_REPEAT |			\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), /* solid mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad_ ## op,		\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op, s, d, func)	\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_NONE_REPEAT |			\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), /* solid mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none_ ## op,		\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op, s, d, func)	\
+    {									\
+	PIXMAN_OP_ ## op, PIXMAN_ ## s,					\
+	(SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST),	\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), /* solid mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover_ ## op,	\
+    }
+
+/* Expands to four table entries; the 'cover' variant is listed first so that it is preferred, because it is faster */
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)			\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) /* three entries, 'cover' first; the _NORMAL entry is not listed here */
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func) /* three entries, 'cover' first; the _NORMAL entry is not listed here */
+
+/*****************************************************************************/
+
+/*
+ * Split a bilinear scanline into 5 zones, according to where the 2 pixels
+ * being interpolated come from: the padding area around the image, the
+ * image itself, or one pixel from each.
+ */
+static force_inline void
+bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+					 pixman_fixed_t  vx,
+					 pixman_fixed_t  unit_x,
+					 int32_t *       left_pad,
+					 int32_t *       left_tz,
+					 int32_t *       width,
+					 int32_t *       right_tz,
+					 int32_t *       right_pad)
+{
+	/* scanline bounds computed for position vx, and for vx + pixman_fixed_1 */
+	int first_w = *width, first_lpad, first_rpad;
+	int second_w = *width, second_lpad, second_rpad;
+
+	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
+					&first_w, &first_lpad, &first_rpad);
+	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1, unit_x,
+					&second_w, &second_lpad, &second_rpad);
+	*left_pad = second_lpad;
+	*left_tz = first_lpad - second_lpad;
+	*right_tz = second_rpad - first_rpad;
+	*right_pad = first_rpad;
+	*width = *width - (*left_pad + *left_tz + *right_tz + *right_pad);
+}
+
+/*
+ * Main loop template for single pass bilinear scaling. It needs to be
+ * provided with 'scanline_func' which should do the compositing operation.
+ * The needed function has the following prototype:
+ *
+ *	scanline_func (dst_type_t *       dst,
+ *		       const mask_type_t * mask,
+ *		       const src_type_t * src_top,
+ *		       const src_type_t * src_bottom,
+ *		       int32_t            width,
+ *		       int                weight_top,
+ *		       int                weight_bottom,
+ *		       pixman_fixed_t     vx,
+ *		       pixman_fixed_t     unit_x,
+ *		       pixman_fixed_t     max_vx,
+ *		       pixman_bool_t      zero_src)
+ *
+ * Where:
+ *  dst                 - destination scanline buffer for storing results
+ *  mask                - mask buffer (or single value for solid mask)
+ *  src_top, src_bottom - two source scanlines
+ *  width               - number of pixels to process
+ *  weight_top          - weight of the top row for interpolation
+ *  weight_bottom       - weight of the bottom row for interpolation
+ *  vx                  - initial position for fetching the first pair of
+ *                        pixels from the source buffer
+ *  unit_x              - position increment needed to move to the next pair
+ *                        of pixels
+ *  max_vx              - image size as a fixed point value, can be used for
+ *                        implementing NORMAL repeat (when it is supported)
+ *  zero_src            - boolean hint variable, which is set to TRUE when
+ *                        all source pixels are fetched from zero padding
+ *                        zone for NONE repeat
+ *
+ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
+ *       but sometimes it may be less than that for NONE repeat when handling
+ *       fuzzy antialiased top or bottom image edges. Also both top and
+ *       bottom weight variables are guaranteed to have value in 0-255
+ *       range and can fit into unsigned byte or be used with 8-bit SIMD
+ *       multiplication instructions.
+ */
+#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, flags)				\
+static void											\
+fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
+						   pixman_composite_info_t *info)		\
+{												\
+    PIXMAN_COMPOSITE_ARGS (info);								\
+    dst_type_t *dst_line;									\
+    mask_type_t *mask_line;									\
+    src_type_t *src_first_line;									\
+    int       y1, y2;										\
+    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, left_tz, right_tz, right_pad;						\
+												\
+    dst_type_t *dst;										\
+    mask_type_t solid_mask;									\
+    const mask_type_t *mask = &solid_mask; /* reassigned per row only for a non-solid mask */	\
+    int src_stride, mask_stride, dst_stride;							\
+												\
+    int src_width;										\
+    pixman_fixed_t src_width_fixed;								\
+    int max_x;											\
+    pixman_bool_t need_src_extension;								\
+												\
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
+    if (flags & FLAG_HAVE_SOLID_MASK)								\
+    {												\
+	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
+	mask_stride = 0;									\
+    }												\
+    else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
+    {												\
+	PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,				\
+			       mask_stride, mask_line, 1);					\
+    }												\
+												\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    v.vector[0] -= pixman_fixed_1 / 2; /* shift the transformed position back by half a pixel */ \
+    v.vector[1] -= pixman_fixed_1 / 2;								\
+												\
+    vy = v.vector[1];										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
+	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
+    {												\
+	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\
+					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    /* PAD repeat does not need special handling for 'transition zones' and */		\
+	    /* they can be combined with 'padding zones' safely */				\
+	    left_pad += left_tz;								\
+	    right_pad += right_tz;								\
+	    left_tz = right_tz = 0;								\
+	}											\
+	v.vector[0] += left_pad * unit_x;							\
+    }												\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
+    {												\
+	vx = v.vector[0];									\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width));		\
+	max_x = pixman_fixed_to_int (vx + (width - 1) * unit_x) + 1;				\
+												\
+	if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)					\
+	{											\
+	    src_width = 0;									\
+												\
+	    while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x) /* grow by whole image widths */ \
+		src_width += src_image->bits.width;						\
+												\
+	    need_src_extension = TRUE;								\
+	}											\
+	else											\
+	{											\
+	    src_width = src_image->bits.width;							\
+	    need_src_extension = FALSE;								\
+	}											\
+												\
+	src_width_fixed = pixman_int_to_fixed (src_width);					\
+    }												\
+												\
+    while (--height >= 0)									\
+    {												\
+	int weight1, weight2;									\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+	vx = v.vector[0];									\
+	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
+	{											\
+	    mask = mask_line;									\
+	    mask_line += mask_stride;								\
+	}											\
+												\
+	y1 = pixman_fixed_to_int (vy);								\
+	weight2 = (vy >> 8) & 0xff; /* bits 8..15 of vy; presumably the top 8 fraction bits of a 16.16 value */ \
+	if (weight2)										\
+	{											\
+	    /* normal case, both row weights are in 0-255 range and fit unsigned byte */	\
+	    y2 = y1 + 1;									\
+	    weight1 = 256 - weight2;								\
+	}											\
+	else											\
+	{											\
+	    /* set both top and bottom row to the same scanline, and weights to 128+128 */	\
+	    y2 = y1;										\
+	    weight1 = weight2 = 128;								\
+	}											\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    src_type_t *src1, *src2;								\
+	    src_type_t buf1[2];									\
+	    src_type_t buf2[2];									\
+	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\
+	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
+	    src1 = src_first_line + src_stride * y1;						\
+	    src2 = src_first_line + src_stride * y2;						\
+												\
+	    if (left_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = src1[0];							\
+		buf2[0] = buf2[1] = src2[0];							\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
+		dst += left_pad;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_pad;								\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst, mask,							\
+			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
+		dst += width;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += width;								\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
+		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
+	{											\
+	    src_type_t *src1, *src2;								\
+	    src_type_t buf1[2];									\
+	    src_type_t buf2[2];									\
+	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
+	    if (y1 < 0)										\
+	    {											\
+		weight1 = 0;									\
+		y1 = 0;										\
+	    }											\
+	    if (y1 >= src_image->bits.height)							\
+	    {											\
+		weight1 = 0;									\
+		y1 = src_image->bits.height - 1;						\
+	    }											\
+	    if (y2 < 0)										\
+	    {											\
+		weight2 = 0;									\
+		y2 = 0;										\
+	    }											\
+	    if (y2 >= src_image->bits.height)							\
+	    {											\
+		weight2 = 0;									\
+		y2 = src_image->bits.height - 1;						\
+	    }											\
+	    src1 = src_first_line + src_stride * y1;						\
+	    src2 = src_first_line + src_stride * y2;						\
+												\
+	    if (left_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = 0;								\
+		buf2[0] = buf2[1] = 0;								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
+		dst += left_pad;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_pad;								\
+	    }											\
+	    if (left_tz > 0)									\
+	    {											\
+		buf1[0] = 0;									\
+		buf1[1] = src1[0];								\
+		buf2[0] = 0;									\
+		buf2[1] = src2[0];								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_tz, weight1, weight2,				\
+			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
+		dst += left_tz;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_tz;								\
+		vx += left_tz * unit_x;								\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst, mask,							\
+			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
+		dst += width;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += width;								\
+		vx += width * unit_x;								\
+	    }											\
+	    if (right_tz > 0)									\
+	    {											\
+		buf1[0] = src1[src_image->bits.width - 1];					\
+		buf1[1] = 0;									\
+		buf2[0] = src2[src_image->bits.width - 1];					\
+		buf2[1] = 0;									\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_tz, weight1, weight2,				\
+			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
+		dst += right_tz;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += right_tz;								\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = 0;								\
+		buf2[0] = buf2[1] = 0;								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	{											\
+	    int32_t	    num_pixels;								\
+	    int32_t	    width_remain;							\
+	    src_type_t *    src_line_top;							\
+	    src_type_t *    src_line_bottom;							\
+	    src_type_t	    buf1[2];								\
+	    src_type_t	    buf2[2];								\
+	    src_type_t	    extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];			\
+	    src_type_t	    extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];			\
+	    int		    i, j;								\
+												\
+	    repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);				\
+	    repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);				\
+	    src_line_top = src_first_line + src_stride * y1;					\
+	    src_line_bottom = src_first_line + src_stride * y2;					\
+												\
+	    if (need_src_extension)								\
+	    {											\
+		for (i=0; i<src_width;) /* fill src_width pixels with repeated copies of each row */ \
+		{										\
+		    for (j=0; j<src_image->bits.width; j++, i++)				\
+		    {										\
+			extended_src_line0[i] = src_line_top[j];				\
+			extended_src_line1[i] = src_line_bottom[j];				\
+		    }										\
+		}										\
+												\
+		src_line_top = &extended_src_line0[0];						\
+		src_line_bottom = &extended_src_line1[0];					\
+	    }											\
+												\
+	    /* Top & Bottom wrap around buffer */						\
+	    buf1[0] = src_line_top[src_width - 1];						\
+	    buf1[1] = src_line_top[0];								\
+	    buf2[0] = src_line_bottom[src_width - 1];						\
+	    buf2[1] = src_line_bottom[0];							\
+												\
+	    width_remain = width;								\
+												\
+	    while (width_remain > 0)								\
+	    {											\
+		/* We use src_width_fixed because it can make vx in original source range */	\
+		repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);				\
+												\
+		/* Wrap around part */								\
+		if (pixman_fixed_to_int (vx) == src_width - 1)					\
+		{										\
+		    /* for positive unit_x							\
+		     * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed		\
+		     *										\
+		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
+		     * So we are safe from overflow.						\
+		     */										\
+		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
+												\
+		    if (num_pixels > width_remain)						\
+			num_pixels = width_remain;						\
+												\
+		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
+				   weight1, weight2, pixman_fixed_frac(vx),			\
+				   unit_x, src_width_fixed, FALSE);				\
+												\
+		    width_remain -= num_pixels;							\
+		    vx += num_pixels * unit_x;							\
+		    dst += num_pixels;								\
+												\
+		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
+			mask += num_pixels;							\
+												\
+		    repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);			\
+		}										\
+												\
+		/* Normal scanline composite */							\
+		if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)		\
+		{										\
+		    /* for positive unit_x							\
+		     * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)	\
+		     *										\
+		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
+		     * So we are safe from overflow here.					\
+		     */										\
+		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
+				  / unit_x) + 1;						\
+												\
+		    if (num_pixels > width_remain)						\
+			num_pixels = width_remain;						\
+												\
+		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
+				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
+												\
+		    width_remain -= num_pixels;							\
+		    vx += num_pixels * unit_x;							\
+		    dst += num_pixels;								\
+												\
+		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
+		        mask += num_pixels;							\
+		}										\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
+			   src_first_line + src_stride * y2, width,				\
+			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
+	}											\
+    }												\
+}
+
+/* Forwards to FAST_BILINEAR_MAINLOOP_INT with '_' pasted in front of scale_func_name. A workaround for old Sun Studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, flags)				\
+	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
+				  dst_type_t, repeat_mode, flags)
+
+#define SCALED_BILINEAR_FLAGS	/* used by every SIMPLE_BILINEAR_*FAST_PATH_* entry below */	\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_BILINEAR_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_BILINEAR_FAST_PATH_PAD(op, s, d, func)			\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_PAD_REPEAT |			\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,				/* no mask */		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad_ ## op,		\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NONE(op, s, d, func)			\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_NONE_REPEAT |		\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,				/* no mask */		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none_ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_COVER(op, s, d, func)			\
+    {									\
+	PIXMAN_OP_ ## op, PIXMAN_ ## s,					\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR), \
+	PIXMAN_null, 0,				/* no mask */		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover_ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op, s, d, func)		\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_NORMAL_REPEAT |		\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,				/* no mask */		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal_ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op, s, d, func)		\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_PAD_REPEAT |			\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), /* a8 mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad_ ## op,		\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op, s, d, func)		\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_NONE_REPEAT |		\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), /* a8 mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none_ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op, s, d, func)		\
+    {									\
+	PIXMAN_OP_ ## op, PIXMAN_ ## s,					\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR), \
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), /* a8 mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover_ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op, s, d, func)	\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_NORMAL_REPEAT |		\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), /* a8 mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal_ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op, s, d, func)	\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_PAD_REPEAT |			\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), /* solid mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad_ ## op,		\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op, s, d, func)	\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_NONE_REPEAT |		\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), /* solid mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none_ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op, s, d, func)	\
+    {									\
+	PIXMAN_OP_ ## op, PIXMAN_ ## s,					\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR), \
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), /* solid mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover_ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op, s, d, func)	\
+    {									\
+	PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS | FAST_PATH_NORMAL_REPEAT |		\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), /* solid mask */ \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal_ ## op,	\
+    }
+
+/* Expands to four table entries; the 'cover' variant is listed first so that it is preferred, because it is faster */
+#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)				\
+    SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)			\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func) /* four a8-mask entries, 'cover' first */
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func) /* four solid-mask entries, 'cover' first */
+
+#endif
diff --git a/pixman/pixman-linear-gradient.c b/pixman/pixman-linear-gradient.c
new file mode 100644
index 0000000..6e1ea24
--- /dev/null
+++ b/pixman/pixman-linear-gradient.c
@@ -0,0 +1,286 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+static pixman_bool_t
+linear_gradient_is_horizontal (pixman_image_t *image,
+			       int             x,
+			       int             y,
+			       int             width,
+			       int             height)
+{
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    pixman_vector_t v;
+    pixman_fixed_32_32_t l;
+    pixman_fixed_48_16_t dx, dy;
+    double inc;
+    /* TRUE when 'height' rows all map to (nearly) the same gradient input */
+    if (image->common.transform)
+    {
+	/* projective (or degenerate) transform: treated as not horizontal */
+	if (image->common.transform->matrix[2][0] != 0 ||
+	    image->common.transform->matrix[2][1] != 0 ||
+	    image->common.transform->matrix[2][2] == 0)
+	{
+	    return FALSE;
+	}
+
+	v.vector[0] = image->common.transform->matrix[0][1];
+	v.vector[1] = image->common.transform->matrix[1][1];
+	v.vector[2] = image->common.transform->matrix[2][2];
+    }
+    else
+    {
+	v.vector[0] = 0;
+	v.vector[1] = pixman_fixed_1;
+	v.vector[2] = pixman_fixed_1;
+    }
+
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+
+    l = dx * dx + dy * dy;
+
+    if (l == 0)
+	return FALSE;
+
+    /*
+     * compute how much the input of the gradient walker changes
+     * when moving vertically through the whole image
+     */
+    inc = height * (double) pixman_fixed_1 * pixman_fixed_1 *
+	(dx * v.vector[0] + dy * v.vector[1]) /
+	(v.vector[2] * (double) l);
+
+    /* check that casting to integer would result in 0 */
+    if (-1 < inc && inc < 1)
+	return TRUE;
+
+    return FALSE;
+}
+
+static uint32_t *
+linear_get_scanline_narrow (pixman_iter_t  *iter,
+			    const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *      buffer = iter->buffer;
+    /* Renders one row of the gradient into iter->buffer. */
+    pixman_vector_t v, unit;
+    pixman_fixed_32_32_t l;
+    pixman_fixed_48_16_t dx, dy;
+    gradient_t *gradient = (gradient_t *)image;
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    pixman_gradient_walker_t walker;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return iter->buffer; /* NOTE(review): iter->y is not advanced here */
+
+	unit.vector[0] = image->common.transform->matrix[0][0];
+	unit.vector[1] = image->common.transform->matrix[1][0];
+	unit.vector[2] = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+	unit.vector[0] = pixman_fixed_1;
+	unit.vector[1] = 0;
+	unit.vector[2] = 0;
+    }
+
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+
+    l = dx * dx + dy * dy;
+
+    if (l == 0 || unit.vector[2] == 0)
+    {
+	/* affine transformation only (also taken when l == 0) */
+        pixman_fixed_32_32_t t, next_inc;
+	double inc;
+
+	if (l == 0 || v.vector[2] == 0)
+	{
+	    t = 0;
+	    inc = 0;
+	}
+	else
+	{
+	    double invden, v2;
+
+	    invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+		(l * (double) v.vector[2]);
+	    v2 = v.vector[2] * (1. / pixman_fixed_1);
+	    t = ((dx * v.vector[0] + dy * v.vector[1]) - 
+		 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+	    inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden;
+	}
+	next_inc = 0;
+
+	if (((pixman_fixed_32_32_t )(inc * width)) == 0)
+	{
+	    register uint32_t color;
+
+	    color = _pixman_gradient_walker_pixel (&walker, t);
+	    while (buffer < end)
+		*buffer++ = color;
+	}
+	else
+	{
+	    int i;
+
+	    i = 0;
+	    while (buffer < end)
+	    {
+		if (!mask || *mask++)	/* masked-out pixels stay unwritten */
+		{
+		    *buffer = _pixman_gradient_walker_pixel (&walker,
+							     t + next_inc);
+		}
+		i++;
+		next_inc = inc * i;
+		buffer++;
+	    }
+	}
+    }
+    else
+    {
+	/* projective transformation: t is recomputed for every pixel */
+        double t;
+
+	t = 0;
+
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+	        if (v.vector[2] != 0)
+		{
+		    double invden, v2;
+
+		    invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+			(l * (double) v.vector[2]);
+		    v2 = v.vector[2] * (1. / pixman_fixed_1);
+		    t = ((dx * v.vector[0] + dy * v.vector[1]) - 
+			 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+		}
+
+		*buffer = _pixman_gradient_walker_pixel (&walker, t);
+	    }
+
+	    ++buffer;
+
+	    v.vector[0] += unit.vector[0];
+	    v.vector[1] += unit.vector[1];
+	    v.vector[2] += unit.vector[2];
+	}
+    }
+
+    iter->y++;
+
+    return iter->buffer;
+}
+
+static uint32_t *
+linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    /* Produce narrow pixels (mask is not forwarded), then widen in place. */
+    uint32_t *pixels = linear_get_scanline_narrow (iter, NULL);
+    uint64_t *wide = (uint64_t *)pixels;
+    pixman_expand (wide, pixels, PIXMAN_a8r8g8b8, iter->width);
+    return pixels;
+}
+
+void
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t  *iter)
+{
+    uint32_t *(*fetch) (pixman_iter_t *, const uint32_t *);
+
+    /* pick the scanline getter that matches the iterator's pixel width */
+    fetch = (iter->flags & ITER_NARROW) ?
+	linear_get_scanline_narrow : linear_get_scanline_wide;
+
+    if (!linear_gradient_is_horizontal (
+	    iter->image, iter->x, iter->y, iter->width, iter->height))
+    {
+	iter->get_scanline = fetch;
+	return;
+    }
+
+    /* rows do not differ: render one scanline right away, then install
+     * the noop getter in place of the real one */
+    fetch (iter, NULL);
+    iter->get_scanline = _pixman_iter_get_scanline_noop;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_linear_gradient (pixman_point_fixed_t *        p1,
+                                     pixman_point_fixed_t *        p2,
+                                     const pixman_gradient_stop_t *stops,
+                                     int                           n_stops)
+{
+    /*
+     * Allocate a LINEAR image running from *p1 to *p2 with the given
+     * stops.  Returns NULL if allocation or gradient setup fails.
+     */
+    pixman_image_t *image = _pixman_image_allocate ();
+
+    if (image == NULL)
+	return NULL;
+
+    if (!_pixman_init_gradient (&image->linear.common, stops, n_stops))
+    {
+	/* setup failed: release the freshly allocated image */
+	free (image);
+	return NULL;
+    }
+
+    image->linear.p1 = *p1;
+    image->linear.p2 = *p2;
+
+    image->type = LINEAR;
+
+    return image;
+}
+
diff --git a/pixman/pixman-matrix.c b/pixman/pixman-matrix.c
new file mode 100644
index 0000000..8d0d973
--- /dev/null
+++ b/pixman/pixman-matrix.c
@@ -0,0 +1,766 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+/*
+ * Matrix interfaces
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <string.h>
+#include "pixman-private.h"
+
+#define F(x)    pixman_int_to_fixed (x)	/* shorthand: integer -> fixed point */
+
+PIXMAN_EXPORT void
+pixman_transform_init_identity (struct pixman_transform *matrix)
+{
+    int diag;
+    /* zero every entry, then put fixed-point 1 on the main diagonal */
+    memset (matrix, '\0', sizeof (*matrix));
+    for (diag = 0; diag < 3; diag++)
+	matrix->matrix[diag][diag] = F (1);
+}
+
+typedef pixman_fixed_32_32_t pixman_fixed_34_30_t; /* 32.32 product >> 2 */
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point_3d (const struct pixman_transform *transform,
+                           struct pixman_vector *         vector)
+{
+    struct pixman_vector result;
+    pixman_fixed_32_32_t partial;
+    pixman_fixed_48_16_t v;
+    int i, j;
+    /* vector = transform * vector, keeping w (no perspective divide) */
+    for (j = 0; j < 3; j++)
+    {
+	v = 0;
+	for (i = 0; i < 3; i++)
+	{
+	    partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] *
+	               (pixman_fixed_48_16_t) vector->vector[i]);
+	    v += partial >> 16;
+	}
+	/* give up before narrowing a sum outside the checked range */
+	if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
+	    return FALSE;
+	
+	result.vector[j] = (pixman_fixed_t) v;
+    }
+    /* the result is stored even when its w is zero and FALSE is returned */
+    *vector = result;
+
+    if (!result.vector[2])
+	return FALSE;
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point (const struct pixman_transform *transform,
+                        struct pixman_vector *         vector)
+{
+    pixman_fixed_32_32_t partial;
+    pixman_fixed_34_30_t v[3];
+    pixman_fixed_48_16_t quo;
+    int i, j;
+    /* Transform *vector and divide x, y by w; on success w is set to 1. */
+    for (j = 0; j < 3; j++)
+    {
+	v[j] = 0;
+	
+	for (i = 0; i < 3; i++)
+	{
+	    partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] *
+	               (pixman_fixed_32_32_t) vector->vector[i]);
+	    v[j] += partial >> 2;
+	}
+    }
+    /* a divisor of zero (after dropping 16 bits) cannot be projected */
+    if (!(v[2] >> 16))
+	return FALSE;
+
+    for (j = 0; j < 2; j++)
+    {
+	quo = v[j] / (v[2] >> 16);
+	if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16)
+	    return FALSE;
+	vector->vector[j] = (pixman_fixed_t) quo;
+    }
+    
+    vector->vector[2] = pixman_fixed_1;
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_multiply (struct pixman_transform *      dst,
+                           const struct pixman_transform *l,
+                           const struct pixman_transform *r)
+{
+    /* dst = l * r, built in a temporary so dst may alias either operand */
+    struct pixman_transform product;
+    int row, col, k;
+
+    for (row = 0; row < 3; row++)
+    {
+	for (col = 0; col < 3; col++)
+	{
+	    pixman_fixed_48_16_t sum = 0;
+
+	    for (k = 0; k < 3; k++)
+	    {
+		pixman_fixed_32_32_t term =
+		    (pixman_fixed_32_32_t) l->matrix[row][k] *
+		    (pixman_fixed_32_32_t) r->matrix[k][col];
+
+		/* drop 16 of the fractional bits of the raw product */
+		sum += term >> 16;
+	    }
+
+	    /* fail if the entry is outside the checked range */
+	    if (sum > pixman_max_fixed_48_16 || sum < pixman_min_fixed_48_16)
+		return FALSE;
+
+	    product.matrix[row][col] = (pixman_fixed_t) sum;
+	}
+    }
+
+    *dst = product;
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_scale (struct pixman_transform *t,
+                             pixman_fixed_t           sx,
+                             pixman_fixed_t           sy)
+{
+    /* diag (sx, sy, 1): clear everything, then fill the diagonal */
+    memset (t, '\0', sizeof (*t));
+    t->matrix[2][2] = F (1);
+    t->matrix[1][1] = sy;
+    t->matrix[0][0] = sx;
+}
+
+static pixman_fixed_t
+fixed_inverse (pixman_fixed_t x)
+{   /* 1/x in fixed point: (F(1) * F(1)) / x in the wider type; x must be != 0 */
+    return (pixman_fixed_t) ((((pixman_fixed_48_16_t) F (1)) * F (1)) / x);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_scale (struct pixman_transform *forward,
+                        struct pixman_transform *reverse,
+                        pixman_fixed_t           sx,
+                        pixman_fixed_t           sy)
+{
+    struct pixman_transform scale;
+    /* a zero factor has no inverse */
+    if (sx == 0 || sy == 0)
+	return FALSE;
+    /* forward <- scale * forward (skipped when forward is NULL) */
+    if (forward != NULL)
+    {
+	pixman_transform_init_scale (&scale, sx, sy);
+	if (!pixman_transform_multiply (forward, &scale, forward))
+	    return FALSE;
+    }
+    /* reverse <- reverse * scale', scale' built from the inverted factors */
+    if (reverse != NULL)
+    {
+	pixman_transform_init_scale (&scale, fixed_inverse (sx),
+	                             fixed_inverse (sy));
+	if (!pixman_transform_multiply (reverse, reverse, &scale))
+	    return FALSE;
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_rotate (struct pixman_transform *t,
+                              pixman_fixed_t           c,
+                              pixman_fixed_t           s)
+{
+    /* rows: (c, -s, 0), (s, c, 0), (0, 0, 1) */
+    memset (t, '\0', sizeof (*t));
+    t->matrix[2][2] = F (1);
+    t->matrix[1][1] = c;
+    t->matrix[1][0] = s;
+    t->matrix[0][1] = -s;
+    t->matrix[0][0] = c;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_rotate (struct pixman_transform *forward,
+                         struct pixman_transform *reverse,
+                         pixman_fixed_t           c,
+                         pixman_fixed_t           s)
+{
+    struct pixman_transform rot;
+    /* forward <- rot * forward (skipped when forward is NULL) */
+    if (forward != NULL)
+    {
+	pixman_transform_init_rotate (&rot, c, s);
+	if (!pixman_transform_multiply (forward, &rot, forward))
+	    return FALSE;
+    }
+    /* reverse <- reverse * rot', where rot' is built with -s */
+    if (reverse != NULL)
+    {
+	pixman_transform_init_rotate (&rot, c, -s);
+	if (!pixman_transform_multiply (reverse, reverse, &rot))
+	    return FALSE;
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_translate (struct pixman_transform *t,
+                                 pixman_fixed_t           tx,
+                                 pixman_fixed_t           ty)
+{
+    /* rows: (1, 0, tx), (0, 1, ty), (0, 0, 1) */
+    memset (t, '\0', sizeof (*t));
+    t->matrix[0][2] = tx;
+    t->matrix[1][2] = ty;
+    t->matrix[0][0] = F (1);
+    t->matrix[1][1] = F (1);
+    t->matrix[2][2] = F (1);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_translate (struct pixman_transform *forward,
+                            struct pixman_transform *reverse,
+                            pixman_fixed_t           tx,
+                            pixman_fixed_t           ty)
+{
+    struct pixman_transform shift;
+    /* forward <- shift * forward (skipped when forward is NULL) */
+    if (forward != NULL)
+    {
+	pixman_transform_init_translate (&shift, tx, ty);
+	if (!pixman_transform_multiply (forward, &shift, forward))
+	    return FALSE;
+    }
+
+    /* reverse <- reverse * shift', where shift' moves by (-tx, -ty) */
+    if (reverse != NULL)
+    {
+	pixman_transform_init_translate (&shift, -tx, -ty);
+	if (!pixman_transform_multiply (reverse, reverse, &shift))
+	    return FALSE;
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_bounds (const struct pixman_transform *matrix,
+                         struct pixman_box16 *          b)
+
+{
+    struct pixman_vector v[4];
+    int i;
+    int x1, y1, x2, y2;
+    /* the four corners of *b as points with w = 1 */
+    v[0].vector[0] = F (b->x1);
+    v[0].vector[1] = F (b->y1);
+    v[0].vector[2] = F (1);
+
+    v[1].vector[0] = F (b->x2);
+    v[1].vector[1] = F (b->y1);
+    v[1].vector[2] = F (1);
+
+    v[2].vector[0] = F (b->x2);
+    v[2].vector[1] = F (b->y2);
+    v[2].vector[2] = F (1);
+
+    v[3].vector[0] = F (b->x1);
+    v[3].vector[1] = F (b->y2);
+    v[3].vector[2] = F (1);
+    /* transform each corner; *b becomes the integer box around all of them */
+    for (i = 0; i < 4; i++)
+    {
+	if (!pixman_transform_point (matrix, &v[i]))
+	    return FALSE; /* NOTE: for i > 0, *b has already been overwritten */
+
+	x1 = pixman_fixed_to_int (v[i].vector[0]);
+	y1 = pixman_fixed_to_int (v[i].vector[1]);
+	x2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[0]));
+	y2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[1]));
+	/* the first corner seeds *b, later corners can only enlarge it */
+	if (i == 0)
+	{
+	    b->x1 = x1;
+	    b->y1 = y1;
+	    b->x2 = x2;
+	    b->y2 = y2;
+	}
+	else
+	{
+	    if (x1 < b->x1) b->x1 = x1;
+	    if (y1 < b->y1) b->y1 = y1;
+	    if (x2 > b->x2) b->x2 = x2;
+	    if (y2 > b->y2) b->y2 = y2;
+	}
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_invert (struct pixman_transform *      dst,
+                         const struct pixman_transform *src)
+{
+    /* invert in floating point, then convert the result back to fixed */
+    struct pixman_f_transform forward, inverse;
+
+    pixman_f_transform_from_pixman_transform (&forward, src);
+    if (pixman_f_transform_invert (&inverse, &forward) &&
+	pixman_transform_from_pixman_f_transform (dst, &inverse))
+    {
+	return TRUE;
+    }
+
+    return FALSE;
+}
+
+static pixman_bool_t
+within_epsilon (pixman_fixed_t a,
+                pixman_fixed_t b,
+                pixman_fixed_t epsilon)
+{
+    /* |a - b| <= epsilon; widened so a - b and the negation cannot overflow */
+    pixman_fixed_48_16_t t = (pixman_fixed_48_16_t) a - b;
+    if (t < 0)
+	t = -t;
+
+    return t <= epsilon;
+}
+
+#define EPSILON (pixman_fixed_t) (2)	/* tolerance, in raw fixed-point units */
+/* approximate tests: values within EPSILON of each other count as equal */
+#define IS_SAME(a, b) (within_epsilon (a, b, EPSILON))
+#define IS_ZERO(a)    (within_epsilon (a, 0, EPSILON))
+#define IS_ONE(a)     (within_epsilon (a, F (1), EPSILON))
+#define IS_UNIT(a)			    \
+    (within_epsilon (a, F (1), EPSILON) ||  \
+     within_epsilon (a, F (-1), EPSILON) || \
+     IS_ZERO (a))
+#define IS_INT(a)    (IS_ZERO (pixman_fixed_frac (a)))
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_identity (const struct pixman_transform *t)
+{
+    /* off-diagonal entries ~0; diagonal entries equal and not ~0 */
+    if (!IS_ZERO (t->matrix[0][1]) || !IS_ZERO (t->matrix[0][2]) ||
+	!IS_ZERO (t->matrix[1][0]) || !IS_ZERO (t->matrix[1][2]) ||
+	!IS_ZERO (t->matrix[2][0]) || !IS_ZERO (t->matrix[2][1]))
+	return FALSE;
+
+    return (IS_SAME (t->matrix[0][0], t->matrix[1][1]) &&
+	    IS_SAME (t->matrix[0][0], t->matrix[2][2]) &&
+	    !IS_ZERO (t->matrix[0][0]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_scale (const struct pixman_transform *t)
+{
+    /* a pure scale keeps every diagonal entry away from zero ... */
+    if (IS_ZERO (t->matrix[0][0]) ||
+	IS_ZERO (t->matrix[1][1]) ||
+	IS_ZERO (t->matrix[2][2]))
+    {
+	return FALSE;
+    }
+    /* ... and every off-diagonal entry within EPSILON of zero */
+    return (IS_ZERO (t->matrix[0][1]) && IS_ZERO (t->matrix[0][2]) &&
+	    IS_ZERO (t->matrix[1][0]) && IS_ZERO (t->matrix[1][2]) &&
+	    IS_ZERO (t->matrix[2][0]) && IS_ZERO (t->matrix[2][1]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_int_translate (const struct pixman_transform *t)
+{
+    /* the upper-left 2x2 part and the bottom row must match identity */
+    if (!IS_ONE (t->matrix[0][0]) || !IS_ZERO (t->matrix[0][1]) ||
+	!IS_ZERO (t->matrix[1][0]) || !IS_ONE (t->matrix[1][1]) ||
+	!IS_ZERO (t->matrix[2][0]) || !IS_ZERO (t->matrix[2][1]) ||
+	!IS_ONE (t->matrix[2][2]))
+    {
+	return FALSE;
+    }
+
+    /* the translation column must pass the IS_INT test */
+    return IS_INT (t->matrix[0][2]) && IS_INT (t->matrix[1][2]);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_inverse (const struct pixman_transform *a,
+                             const struct pixman_transform *b)
+{
+    /* a and b are inverses when a * b passes the identity test */
+    struct pixman_transform product;
+
+    if (pixman_transform_multiply (&product, a, b))
+	return pixman_transform_is_identity (&product);
+    return FALSE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_from_pixman_transform (struct pixman_f_transform *    ft,
+                                          const struct pixman_transform *t)
+{
+    int row, col;
+    /* convert each fixed-point entry to a double, element by element */
+    for (row = 0; row < 3; row++)
+    {
+	for (col = 0; col < 3; col++)
+	    ft->m[row][col] = pixman_fixed_to_double (t->matrix[row][col]);
+    }
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_from_pixman_f_transform (struct pixman_transform *        t,
+                                          const struct pixman_f_transform *ft)
+{
+    int i, j;
+    /* Round each entry to the nearest fixed-point value.  The range test is
+     * negated so that NaN, which fails every comparison, is rejected too. */
+    for (j = 0; j < 3; j++)
+    {
+	for (i = 0; i < 3; i++)
+	{
+	    double d = ft->m[j][i];
+	    if (!(d >= -32767.0 && d <= 32767.0))
+		return FALSE;
+	    d = d * 65536.0 + 0.5;
+	    t->matrix[j][i] = (pixman_fixed_t) floor (d);
+	}
+    }
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_invert (struct pixman_f_transform *      dst,
+                           const struct pixman_f_transform *src)
+{
+    /* a[k], b[k]: the two indices that remain once index k is removed */
+    static const int a[3] = { 2, 2, 1 };
+    static const int b[3] = { 1, 0, 0 };
+    struct pixman_f_transform d;
+    double det;
+    int i, j;
+    /* determinant by cofactor expansion along the first column */
+    det = 0;
+    for (i = 0; i < 3; i++)
+    {
+	double p;
+	int ai = a[i];
+	int bi = b[i];
+	p = src->m[i][0] * (src->m[ai][2] * src->m[bi][1] -
+	                    src->m[ai][1] * src->m[bi][2]);
+	if (i == 1)
+	    p = -p;
+	det += p;
+    }
+    /* singular matrix: *dst is left untouched */
+    if (det == 0)
+	return FALSE;
+
+    det = 1 / det;
+    for (j = 0; j < 3; j++)
+    {
+	for (i = 0; i < 3; i++)
+	{
+	    double p;
+	    int ai = a[i];
+	    int aj = a[j];
+	    int bi = b[i];
+	    int bj = b[j];
+	    p = (src->m[ai][aj] * src->m[bi][bj] -
+	         src->m[ai][bj] * src->m[bi][aj]);
+	    if (((i + j) & 1) != 0)
+		p = -p;
+	    /* write to the temporary: src may be the same object as dst */
+	    d.m[j][i] = det * p;
+	}
+    }
+    *dst = d;
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_point (const struct pixman_f_transform *t,
+                          struct pixman_f_vector *         v)
+{
+    struct pixman_f_vector product;
+    int row, col;
+
+    /* product = t * v, accumulated row by row */
+    for (row = 0; row < 3; row++)
+    {
+	double sum = 0;
+	for (col = 0; col < 3; col++)
+	    sum += t->m[row][col] * v->v[col];
+	product.v[row] = sum;
+    }
+
+    /* a zero w cannot be divided out; *v is left as it was */
+    if (!product.v[2])
+	return FALSE;
+
+    /* divide x and y by w and store w as exactly 1 */
+    v->v[0] = product.v[0] / product.v[2];
+    v->v[1] = product.v[1] / product.v[2];
+    v->v[2] = 1;
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_point_3d (const struct pixman_f_transform *t,
+                             struct pixman_f_vector *         v)
+{
+    struct pixman_f_vector product;
+    int row, col;
+
+    /* *v = t * *v, all three components kept (no division by w) */
+    for (row = 0; row < 3; row++)
+    {
+	double sum = 0;
+	for (col = 0; col < 3; col++)
+	    sum += t->m[row][col] * v->v[col];
+	product.v[row] = sum;
+    }
+
+    *v = product;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_multiply (struct pixman_f_transform *      dst,
+                             const struct pixman_f_transform *l,
+                             const struct pixman_f_transform *r)
+{
+    struct pixman_f_transform product;
+    int row, col, k;
+
+    /* dst = l * r, built in a temporary so dst may alias either operand */
+    for (row = 0; row < 3; row++)
+    {
+	for (col = 0; col < 3; col++)
+	{
+	    double sum = 0;
+	    for (k = 0; k < 3; k++)
+		sum += l->m[row][k] * r->m[k][col];
+	    product.m[row][col] = sum;
+	}
+    }
+
+    *dst = product;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_scale (struct pixman_f_transform *t,
+                               double                     sx,
+                               double                     sy)
+{
+    /* diagonal: the two scale factors and a unit w */
+    t->m[0][0] = sx;
+    t->m[1][1] = sy;
+    t->m[2][2] = 1;
+
+    /* everything off the diagonal is zero */
+    t->m[0][1] = t->m[0][2] = 0;
+    t->m[1][0] = t->m[1][2] = 0;
+    t->m[2][0] = t->m[2][1] = 0;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_scale (struct pixman_f_transform *forward,
+                          struct pixman_f_transform *reverse,
+                          double                     sx,
+                          double                     sy)
+{
+    struct pixman_f_transform scale;
+    /* a zero factor has no inverse */
+    if (sx == 0 || sy == 0)
+	return FALSE;
+    /* forward <- scale * forward (skipped when forward is NULL) */
+    if (forward != NULL)
+    {
+	pixman_f_transform_init_scale (&scale, sx, sy);
+	pixman_f_transform_multiply (forward, &scale, forward);
+    }
+    /* reverse <- reverse * scale', scale' built from the reciprocals */
+    if (reverse != NULL)
+    {
+	pixman_f_transform_init_scale (&scale, 1 / sx, 1 / sy);
+	pixman_f_transform_multiply (reverse, reverse, &scale);
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_rotate (struct pixman_f_transform *t,
+                                double                     c,
+                                double                     s)
+{
+    /* rows: (c, -s, 0), (s, c, 0), (0, 0, 1) */
+    t->m[0][0] = c;
+    t->m[0][1] = -s;
+    t->m[1][0] = s;
+    t->m[1][1] = c;
+    t->m[2][2] = 1;
+
+    t->m[0][2] = t->m[1][2] = 0;
+    t->m[2][0] = t->m[2][1] = 0;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_rotate (struct pixman_f_transform *forward,
+                           struct pixman_f_transform *reverse,
+                           double                     c,
+                           double                     s)
+{
+    struct pixman_f_transform rot;
+    /* forward <- rot * forward (skipped when forward is NULL) */
+    if (forward != NULL)
+    {
+	pixman_f_transform_init_rotate (&rot, c, s);
+	pixman_f_transform_multiply (forward, &rot, forward);
+    }
+    /* reverse <- reverse * rot', where rot' is built with -s */
+    if (reverse != NULL)
+    {
+	pixman_f_transform_init_rotate (&rot, c, -s);
+	pixman_f_transform_multiply (reverse, reverse, &rot);
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_translate (struct pixman_f_transform *t,
+                                   double                     tx,
+                                   double                     ty)
+{
+    /* rows: (1, 0, tx), (0, 1, ty), (0, 0, 1) */
+    t->m[0][0] = t->m[1][1] = t->m[2][2] = 1;
+    t->m[0][2] = tx;
+    t->m[1][2] = ty;
+
+    t->m[0][1] = 0;
+    t->m[1][0] = 0;
+    t->m[2][0] = 0;
+    t->m[2][1] = 0;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_translate (struct pixman_f_transform *forward,
+                              struct pixman_f_transform *reverse,
+                              double                     tx,
+                              double                     ty)
+{
+    struct pixman_f_transform shift;
+    /* forward <- shift * forward (skipped when forward is NULL) */
+    if (forward != NULL)
+    {
+	pixman_f_transform_init_translate (&shift, tx, ty);
+	pixman_f_transform_multiply (forward, &shift, forward);
+    }
+    /* reverse <- reverse * shift', where shift' moves by (-tx, -ty) */
+    if (reverse != NULL)
+    {
+	pixman_f_transform_init_translate (&shift, -tx, -ty);
+	pixman_f_transform_multiply (reverse, reverse, &shift);
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_bounds (const struct pixman_f_transform *t,
+                           struct pixman_box16 *            b)
+{
+    struct pixman_f_vector v[4];
+    int i;
+    int x1, y1, x2, y2;
+    /* the four corners of *b as points with w = 1 */
+    v[0].v[0] = b->x1;
+    v[0].v[1] = b->y1;
+    v[0].v[2] = 1;
+    v[1].v[0] = b->x2;
+    v[1].v[1] = b->y1;
+    v[1].v[2] = 1;
+    v[2].v[0] = b->x2;
+    v[2].v[1] = b->y2;
+    v[2].v[2] = 1;
+    v[3].v[0] = b->x1;
+    v[3].v[1] = b->y2;
+    v[3].v[2] = 1;
+    /* transform each corner; *b becomes the integer box around all of them */
+    for (i = 0; i < 4; i++)
+    {
+	if (!pixman_f_transform_point (t, &v[i]))
+	    return FALSE; /* NOTE: for i > 0, *b has already been overwritten */
+	/* low edges are rounded down, high edges are rounded up */
+	x1 = floor (v[i].v[0]);
+	y1 = floor (v[i].v[1]);
+	x2 = ceil (v[i].v[0]);
+	y2 = ceil (v[i].v[1]);
+	/* the first corner seeds *b, later corners can only enlarge it */
+	if (i == 0)
+	{
+	    b->x1 = x1;
+	    b->y1 = y1;
+	    b->x2 = x2;
+	    b->y2 = y2;
+	}
+	else
+	{
+	    if (x1 < b->x1) b->x1 = x1;
+	    if (y1 < b->y1) b->y1 = y1;
+	    if (x2 > b->x2) b->x2 = x2;
+	    if (y2 > b->y2) b->y2 = y2;
+	}
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_identity (struct pixman_f_transform *t)
+{
+    int row, col;
+    /* 1 where row == col, 0 everywhere else */
+    for (row = 0; row < 3; row++)
+    {
+	for (col = 0; col < 3; col++)
+	    t->m[row][col] = (row == col) ? 1 : 0;
+    }
+}
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
new file mode 100644
index 0000000..f848ab4
--- /dev/null
+++ b/pixman/pixman-mmx.c
@@ -0,0 +1,3237 @@
+/*
+ * Copyright © 2004, 2005 Red Hat, Inc.
+ * Copyright © 2004 Nicholas Miell
+ * Copyright © 2005 Trolltech AS
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Søren Sandmann (sandmann@redhat.com)
+ * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
+ * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
+ *
+ * Based on work by Owen Taylor
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT
+
+#include <mmintrin.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+
+#define no_vERBOSE	/* not VERBOSE: this line alone does not enable the tracing below */
+
+#ifdef VERBOSE
+#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)	/* report function and line */
+#else
+#define CHECKPOINT()	/* expands to nothing */
+#endif
+
+#ifdef USE_ARM_IWMMXT
+/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+    /* No-op stub, so that the _mm_empty () calls in this file compile on iwMMXt. */
+}
+#endif
+
+/* Notes about writing MMX code
+ *
+ * Give memory operands as the second operand. If you give one as the
+ * first, gcc will first load it into a register, then use that
+ * register.
+ *
+ *   i.e. use
+ *
+ *         _mm_mullo_pi16 (x, mmx_constant);
+ *
+ *   not
+ *
+ *         _mm_mullo_pi16 (mmx_constant, x);
+ *
+ * Also try to minimize dependencies, i.e. when you need a value, try
+ * to calculate it from a value that was calculated as early as
+ * possible.
+ */
+
+/* --------------- MMX primitives ------------------------------------- */
+
+#ifdef __GNUC__
+typedef uint64_t mmxdatafield;	/* __GNUC__ compilers: constants are kept as plain 64-bit integers */
+#else
+typedef __m64 mmxdatafield;	/* other compilers: constants are stored directly as __m64 */
+/* If __m64 is defined as a struct or union, define M64_MEMBER to be the
+   name of the member used to access the data */
+# ifdef _MSC_VER
+#  define M64_MEMBER m64_u64
+# elif defined(__SUNPRO_C)
+#  define M64_MEMBER l_
+# endif
+#endif
+/* Layout of the 64-bit constant table used by the MMX helpers; the values are given in the initializer of c. */
+typedef struct
+{
+    mmxdatafield mmx_4x00ff;
+    mmxdatafield mmx_4x0080;
+    mmxdatafield mmx_565_rgb;
+    mmxdatafield mmx_565_unpack_multiplier;
+    mmxdatafield mmx_565_r;
+    mmxdatafield mmx_565_g;
+    mmxdatafield mmx_565_b;
+    mmxdatafield mmx_mask_0;
+    mmxdatafield mmx_mask_1;
+    mmxdatafield mmx_mask_2;
+    mmxdatafield mmx_mask_3;
+    mmxdatafield mmx_full_alpha;
+    mmxdatafield mmx_ffff0000ffff0000;
+    mmxdatafield mmx_0000ffff00000000;
+    mmxdatafield mmx_000000000000ffff;
+} mmx_data_t;
+/* MMXDATA_INIT (field, val): one table initializer entry; the MSVC form drops the field designator (positional). */
+#if defined(_MSC_VER)
+# define MMXDATA_INIT(field, val) { val ## UI64 }
+#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
+# define MMXDATA_INIT(field, val) field =   { val ## ULL }
+#else                           /* __m64 is an integral type */
+# define MMXDATA_INIT(field, val) field =   val ## ULL
+#endif
+/* The constant table itself; the rest of the file reads it through the MC () macro. */
+static const mmx_data_t c =
+{
+    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),	/* 0xff in every 16-bit lane (negate) */
+    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),	/* bias added in pix_multiply */
+    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),	/* field masks used by expand565 */
+    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),	/* per-field multipliers used by expand565 */
+    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),	/* top 5 bits of the red lane (pack_565) */
+    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),	/* top 6 bits of the green lane (pack_565) */
+    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),	/* top 5 bits of the blue lane (pack_565) */
+    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),	/* clears 16-bit word 0 */
+    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),	/* clears 16-bit word 1 */
+    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),	/* clears 16-bit word 2 */
+    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),	/* clears 16-bit word 3 */
+    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),	/* 0xff in word 3, the alpha lane */
+    MMXDATA_INIT (.mmx_ffff0000ffff0000,         0xffff0000ffff0000),	/* words 1 and 3 (invert_colors) */
+    MMXDATA_INIT (.mmx_0000ffff00000000,         0x0000ffff00000000),	/* word 2 (invert_colors) */
+    MMXDATA_INIT (.mmx_000000000000ffff,         0x000000000000ffff),	/* word 0 (invert_colors) */
+};
+/* MC (x): the table constant c.mmx_<x> as an __m64 (to_m64 on ICC, a cast on other GNU C, direct otherwise). */
+#ifdef __GNUC__
+#    ifdef __ICC
+#        define MC(x) to_m64 (c.mmx_ ## x)
+#    else
+#        define MC(x) ((__m64)c.mmx_ ## x)
+#    endif
+#else
+#    define MC(x) c.mmx_ ## x
+#endif
+/* Convert a 64-bit integer to an __m64 using whichever conversion the compiler supports. */
+static force_inline __m64
+to_m64 (uint64_t x)
+{
+#ifdef __ICC
+    return _mm_cvtsi64_m64 (x);
+#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
+    __m64 res;
+
+    res.M64_MEMBER = x;
+    return res;
+#else                           /* __m64 is an integral type */
+    return (__m64)x;
+#endif
+}
+/* Inverse of to_m64: convert an __m64 back to a 64-bit integer. */
+static force_inline uint64_t
+to_uint64 (__m64 x)
+{
+#ifdef __ICC
+    return _mm_cvtm64_si64 (x);
+#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
+    uint64_t res = x.M64_MEMBER;
+    return res;
+#else                           /* __m64 is an integral type */
+    return (uint64_t)x;
+#endif
+}
+/* Shift the whole 64-bit value by s bits: left when s > 0, logical right by -s when s < 0, unchanged when s == 0. */
+static force_inline __m64
+shift (__m64 v,
+       int   s)
+{
+    if (s > 0)
+	return _mm_slli_si64 (v, s);
+    else if (s < 0)
+	return _mm_srli_si64 (v, -s);
+    else
+	return v;
+}
+/* XOR every 16-bit lane with 0x00ff, i.e. 255 - x for lanes that hold an 8-bit value. */
+static force_inline __m64
+negate (__m64 mask)
+{
+    return _mm_xor_si64 (mask, MC (4x00ff));
+}
+/* Per-lane a * b / 255 for 8-bit values held in 16-bit lanes, computed without a division. */
+static force_inline __m64
+pix_multiply (__m64 a, __m64 b)
+{
+    __m64 res;
+
+    res = _mm_mullo_pi16 (a, b);	/* t = a * b (low 16 bits of each product) */
+    res = _mm_adds_pu16 (res, MC (4x0080));	/* t += 0x80 */
+    res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));	/* t += t >> 8 */
+    res = _mm_srli_pi16 (res, 8);	/* result = t >> 8 */
+
+    return res;
+}
+/* Add a and b byte by byte with unsigned saturation. */
+static force_inline __m64
+pix_add (__m64 a, __m64 b)
+{
+    return _mm_adds_pu8 (a, b);
+}
+/* Replicate the top 16-bit word of pixel (bits 48-63, the alpha lane) into all four words. */
+static force_inline __m64
+expand_alpha (__m64 pixel)
+{
+    __m64 t1, t2;
+
+    t1 = shift (pixel, -48);	/* alpha now in word 0, other words zero */
+    t2 = shift (t1, 16);
+    t1 = _mm_or_si64 (t1, t2);	/* words 0 and 1 */
+    t2 = shift (t1, 32);
+    t1 = _mm_or_si64 (t1, t2);	/* all four words */
+
+    return t1;
+}
+/* Replicate the lowest 16-bit word of pixel (bits 0-15) into all four words. */
+static force_inline __m64
+expand_alpha_rev (__m64 pixel)
+{
+    __m64 t1, t2;
+
+    /* keep only the low 16 bits (the alpha in this layout) and zero the rest */
+    t1 = shift (pixel,  48);
+    t1 = shift (t1, -48);
+
+    t2 = shift (t1, 16);
+    t1 = _mm_or_si64 (t1, t2);	/* words 0 and 1 */
+    t2 = shift (t1, 32);
+    t1 = _mm_or_si64 (t1, t2);	/* all four words */
+
+    return t1;
+}
+/* Swap 16-bit words 0 and 2 of pixel; words 1 and 3 stay where they are. */
+static force_inline __m64
+invert_colors (__m64 pixel)
+{
+    __m64 x, y, z;
+
+    x = y = z = pixel;
+
+    x = _mm_and_si64 (x, MC (ffff0000ffff0000));	/* words 1 and 3, kept in place */
+    y = _mm_and_si64 (y, MC (000000000000ffff));	/* word 0 */
+    z = _mm_and_si64 (z, MC (0000ffff00000000));	/* word 2 */
+
+    y = shift (y, 32);	/* word 0 -> word 2 */
+    z = shift (z, -32);	/* word 2 -> word 0 */
+
+    x = _mm_or_si64 (x, y);
+    x = _mm_or_si64 (x, z);
+
+    return x;
+}
+/* OVER on unpacked pixels: src + dest * (255 - srca) / 255, added with byte saturation. */
+static force_inline __m64
+over (__m64 src,
+      __m64 srca,
+      __m64 dest)
+{
+    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
+}
+/* OVER for a source whose words 0/2 are swapped and not yet multiplied by alpha: fix both, then composite. */
+static force_inline __m64
+over_rev_non_pre (__m64 src, __m64 dest)
+{
+    __m64 srca = expand_alpha (src);
+    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));	/* alpha in the colour lanes, 0xff in the alpha lane */
+
+    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
+}
+/* IN on unpacked pixels: src * mask per lane (see pix_multiply). */
+static force_inline __m64
+in (__m64 src, __m64 mask)
+{
+    return pix_multiply (src, mask);
+}
+/* (src IN mask) OVER dest with src's alpha lane forced to 0xff, so mask itself serves as the alpha. */
+static force_inline __m64
+in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)
+{
+    src = _mm_or_si64 (src, MC (full_alpha));
+
+    return over (in (src, mask), mask, dest);
+}
+/* in_over: (src IN mask) OVER dest, with srca * mask as the alpha; MSVC builds get the macro form below. */
+#ifndef _MSC_VER
+static force_inline __m64
+in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
+{
+    return over (in (src, mask), pix_multiply (srca, mask), dest);
+}
+
+#else
+
+#define in_over(src, srca, mask, dest)					\
+    over (in (src, mask), pix_multiply (srca, mask), dest)
+
+#endif
+
+/* Elemental unaligned loads */
+/* Load 64 bits from an address that may not be 8-byte aligned. */
+static __inline__ uint64_t ldq_u(uint64_t *p)
+{
+#ifdef USE_X86_MMX
+    /* x86's alignment restrictions are very relaxed. */
+    return *p;
+#elif defined USE_ARM_IWMMXT
+    int align = (uintptr_t)p & 7;	/* byte offset of p within its aligned quadword */
+    __m64 *aligned_p;
+    if (align == 0)
+	return *p;
+    aligned_p = (__m64 *)((uintptr_t)p & ~7);	/* round p down to an 8-byte boundary */
+    return _mm_align_si64 (aligned_p[0], aligned_p[1], align);	/* presumably takes 8 bytes at offset align from the pair - confirm */
+#else
+    struct __una_u64 { uint64_t x __attribute__((packed)); };	/* packed member: no alignment assumed */
+    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
+    return ptr->x;
+#endif
+}
+/* Load 32 bits from an address that may not be 4-byte aligned. */
+static __inline__ uint32_t ldl_u(uint32_t *p)
+{
+#ifdef USE_X86_MMX
+    /* x86's alignment restrictions are very relaxed. */
+    return *p;
+#else
+    struct __una_u32 { uint32_t x __attribute__((packed)); };	/* packed member: no alignment assumed */
+    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
+    return ptr->x;
+#endif
+}
+/* Unpack a 32-bit pixel into four 16-bit lanes, zero-extending each byte. */
+static force_inline __m64
+load8888 (uint32_t v)
+{
+    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ());
+}
+/* Pack two unpacked pixels into bytes with unsigned saturation: lo fills the low 32 bits, hi the high 32 bits. */
+static force_inline __m64
+pack8888 (__m64 lo, __m64 hi)
+{
+    return _mm_packs_pu16 (lo, hi);
+}
+/* Pack one unpacked pixel and return it as a 32-bit value. */
+static force_inline uint32_t
+store8888 (__m64 v)
+{
+    return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
+}
+
+/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
+ *
+ *    00RR00GG00BB
+ *
+ * --- Expanding 565 in the low word ---
+ *
+ * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
+ * m = m & (01f0003f001f);
+ * m = m * (008404100840);
+ * m = m >> 8;
+ *
+ * Note the trick here - the top word is shifted by another nibble to
+ * avoid it bumping into the middle word
+ */
+static force_inline __m64
+expand565 (__m64 pixel, int pos)
+{
+    __m64 p = pixel;
+    __m64 t1, t2;
+
+    /* move pixel to low 16 bit and zero the rest */
+    p = shift (shift (p, (3 - pos) * 16), -48);
+
+    t1 = shift (p, 36 - 11);	/* red field (bits 11-15) moved up to bits 36-40 */
+    t2 = shift (p, 16 - 5);	/* green field (bits 5-10) moved up to bits 16-21 */
+
+    p = _mm_or_si64 (t1, p);
+    p = _mm_or_si64 (t2, p);
+    p = _mm_and_si64 (p, MC (565_rgb));	/* keep one field per 16-bit word; blue stays in bits 0-4 */
+
+    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
+    return _mm_srli_pi16 (pixel, 8);	/* multiply then >> 8 widens each field to 8 bits */
+}
+/* Unpack one of the two 32-bit pixels in `in`: the low one when pos == 0, otherwise the high one. */
+static force_inline __m64
+expand8888 (__m64 in, int pos)
+{
+    if (pos == 0)
+	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
+    else
+	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
+}
+/* Like expand8888, but with the alpha lane forced to 0xff. */
+static force_inline __m64
+expandx888 (__m64 in, int pos)
+{
+    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
+}
+/* Convert the unpacked pixel to 5-6-5 and insert it into 16-bit word pos (0-3) of target; other words are kept. */
+static force_inline __m64
+pack_565 (__m64 pixel, __m64 target, int pos)
+{
+    __m64 p = pixel;
+    __m64 t = target;
+    __m64 r, g, b;
+
+    r = _mm_and_si64 (p, MC (565_r));	/* top 5 bits of red, bits 35-39 */
+    g = _mm_and_si64 (p, MC (565_g));	/* top 6 bits of green, bits 18-23 */
+    b = _mm_and_si64 (p, MC (565_b));	/* top 5 bits of blue, bits 3-7 */
+
+    r = shift (r, -(32 - 8) + pos * 16);	/* to bits 11-15 of word pos */
+    g = shift (g, -(16 - 3) + pos * 16);	/* to bits 5-10 of word pos */
+    b = shift (b, -(0  + 3) + pos * 16);	/* to bits 0-4 of word pos */
+
+    if (pos == 0)	/* clear the destination word; any other pos leaves t unmasked */
+	t = _mm_and_si64 (t, MC (mask_0));
+    else if (pos == 1)
+	t = _mm_and_si64 (t, MC (mask_1));
+    else if (pos == 2)
+	t = _mm_and_si64 (t, MC (mask_2));
+    else if (pos == 3)
+	t = _mm_and_si64 (t, MC (mask_3));
+
+    p = _mm_or_si64 (r, t);
+    p = _mm_or_si64 (g, p);
+
+    return _mm_or_si64 (b, p);
+}
+
+#ifndef _MSC_VER
+/* pix_add_mul: x * a + y * b per channel (pix_multiply products combined with the saturating pix_add). */
+static force_inline __m64
+pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
+{
+    x = pix_multiply (x, a);
+    y = pix_multiply (y, b);
+
+    return pix_add (x, y);
+}
+
+#else
+/* Macro form for MSVC: same result as the function, but it assigns to its x and y arguments. */
+#define pix_add_mul(x, a, y, b)	 \
+    ( x = pix_multiply (x, a),	 \
+      y = pix_multiply (y, b),	 \
+      pix_add (x, y) )
+
+#endif
+
+/* --------------- MMX code paths for fbcompose.c --------------------- */
+/* Return *src, multiplied per channel by the alpha of *mask when mask is non-NULL. */
+static force_inline uint32_t
+combine (const uint32_t *src, const uint32_t *mask)
+{
+    uint32_t ssrc = *src;
+
+    if (mask)
+    {
+	__m64 m = load8888 (*mask);
+	__m64 s = load8888 (ssrc);
+
+	m = expand_alpha (m);	/* only the mask's alpha is used */
+	s = pix_multiply (s, m);
+
+	ssrc = store8888 (s);
+    }
+
+    return ssrc;
+}
+/* OVER, unified alpha: dest = s + dest * (1 - s.alpha) with s = combine (src, mask), for width pixels. */
+static void
+mmx_combine_over_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	uint32_t ssrc = combine (src, mask);
+	uint32_t a = ssrc >> 24;
+
+	if (a == 0xff)
+	{
+	    *dest = ssrc;	/* opaque source: plain copy */
+	}
+	else if (ssrc)	/* ssrc == 0 leaves dest untouched */
+	{
+	    __m64 s, sa;
+	    s = load8888 (ssrc);
+	    sa = expand_alpha (s);
+	    *dest = store8888 (over (s, sa, load8888 (*dest)));
+	}
+
+	++dest;
+	++src;
+	if (mask)
+	    ++mask;
+    }
+    _mm_empty ();
+}
+/* OVER_REVERSE, unified alpha: dest = dest + s * (1 - dest.alpha) with s = combine (src, mask). */
+static void
+mmx_combine_over_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 d, da;
+	uint32_t s = combine (src, mask);
+
+	d = load8888 (*dest);
+	da = expand_alpha (d);
+	*dest = store8888 (over (d, da, load8888 (s)));	/* dest plays the "source" role here */
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+/* IN, unified alpha: dest = s * dest.alpha with s = combine (src, mask). */
+static void
+mmx_combine_in_u (pixman_implementation_t *imp,
+                  pixman_op_t              op,
+                  uint32_t *               dest,
+                  const uint32_t *         src,
+                  const uint32_t *         mask,
+                  int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 x, a;
+
+	x = load8888 (combine (src, mask));
+	a = load8888 (*dest);
+	a = expand_alpha (a);	/* destination alpha in every lane */
+	x = pix_multiply (x, a);
+
+	*dest = store8888 (x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+/* IN_REVERSE, unified alpha: dest = dest * s.alpha with s = combine (src, mask). */
+static void
+mmx_combine_in_reverse_u (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *               dest,
+                          const uint32_t *         src,
+                          const uint32_t *         mask,
+                          int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 x, a;
+
+	x = load8888 (*dest);
+	a = load8888 (combine (src, mask));
+	a = expand_alpha (a);	/* source alpha in every lane */
+	x = pix_multiply (x, a);
+	*dest = store8888 (x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+/* OUT, unified alpha: dest = s * (1 - dest.alpha) with s = combine (src, mask). */
+static void
+mmx_combine_out_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 x, a;
+
+	x = load8888 (combine (src, mask));
+	a = load8888 (*dest);
+	a = expand_alpha (a);
+	a = negate (a);	/* 255 - destination alpha */
+	x = pix_multiply (x, a);
+	*dest = store8888 (x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+/* OUT_REVERSE, unified alpha: dest = dest * (1 - s.alpha) with s = combine (src, mask). */
+static void
+mmx_combine_out_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 x, a;
+
+	x = load8888 (*dest);
+	a = load8888 (combine (src, mask));
+	a = expand_alpha (a);
+	a = negate (a);	/* 255 - source alpha */
+	x = pix_multiply (x, a);
+
+	*dest = store8888 (x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+/* ATOP, unified alpha: dest = s * dest.alpha + dest * (1 - s.alpha) with s = combine (src, mask). */
+static void
+mmx_combine_atop_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 s, da, d, sia;
+
+	s = load8888 (combine (src, mask));
+	d = load8888 (*dest);
+	sia = expand_alpha (s);
+	sia = negate (sia);	/* 255 - source alpha */
+	da = expand_alpha (d);
+	s = pix_add_mul (s, da, d, sia);
+	*dest = store8888 (s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+/* ATOP_REVERSE, unified alpha: dest = s * (1 - dest.alpha) + dest * s.alpha with s = combine (src, mask). */
+static void
+mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end;
+
+    end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 s, dia, d, sa;
+
+	s = load8888 (combine (src, mask));
+	d = load8888 (*dest);
+	sa = expand_alpha (s);
+	dia = expand_alpha (d);
+	dia = negate (dia);	/* 255 - destination alpha */
+	s = pix_add_mul (s, dia, d, sa);
+	*dest = store8888 (s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+/* XOR, unified alpha: dest = s * (1 - dest.alpha) + dest * (1 - s.alpha) with s = combine (src, mask). */
+static void
+mmx_combine_xor_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 s, dia, d, sia;
+
+	s = load8888 (combine (src, mask));
+	d = load8888 (*dest);
+	sia = expand_alpha (s);
+	dia = expand_alpha (d);
+	sia = negate (sia);	/* 255 - source alpha */
+	dia = negate (dia);	/* 255 - destination alpha */
+	s = pix_add_mul (s, dia, d, sia);
+	*dest = store8888 (s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+/* ADD, unified alpha: dest = saturate (s + dest) with s = combine (src, mask). */
+static void
+mmx_combine_add_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 s, d;
+
+	s = load8888 (combine (src, mask));
+	d = load8888 (*dest);
+	s = pix_add (s, d);
+	*dest = store8888 (s);	/* packing saturates each channel to 0xff */
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+/* SATURATE, unified alpha: add s to dest, first scaling s down when its alpha exceeds the room left in dest. */
+static void
+mmx_combine_saturate_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
+                        int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	uint32_t s = combine (src, mask);
+	uint32_t d = *dest;
+	__m64 ms = load8888 (s);
+	__m64 md = load8888 (d);
+	uint32_t sa = s >> 24;	/* source alpha */
+	uint32_t da = ~d >> 24;	/* 255 - destination alpha */
+
+	if (sa > da)
+	{
+	    __m64 msa = load8888 (DIV_UN8 (da, sa) << 24);	/* scale factor placed in the alpha byte */
+	    msa = expand_alpha (msa);
+	    ms = pix_multiply (ms, msa);
+	}
+
+	md = pix_add (md, ms);
+	*dest = store8888 (md);
+
+	++src;
+	++dest;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+/* SRC, component alpha: dest = src * mask per channel; mask is dereferenced unconditionally. */
+static void
+mmx_combine_src_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+
+	s = pix_multiply (s, a);
+	*dest = store8888 (s);
+
+	++src;
+	++mask;
+	++dest;
+    }
+    _mm_empty ();
+}
+/* OVER, component alpha: dest = src * mask + dest * (1 - src.alpha * mask) per channel. */
+static void
+mmx_combine_over_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+	__m64 d = load8888 (*dest);
+	__m64 sa = expand_alpha (s);
+
+	*dest = store8888 (in_over (s, sa, a, d));
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+/* OVER_REVERSE, component alpha: dest = dest + (src * mask) * (1 - dest.alpha). */
+static void
+mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+	__m64 d = load8888 (*dest);
+	__m64 da = expand_alpha (d);
+
+	*dest = store8888 (over (d, da, in (s, a)));
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+/* IN, component alpha: dest = (src * mask) * dest.alpha. */
+static void
+mmx_combine_in_ca (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+	__m64 d = load8888 (*dest);
+	__m64 da = expand_alpha (d);
+
+	s = pix_multiply (s, a);
+	s = pix_multiply (s, da);
+	*dest = store8888 (s);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+/* IN_REVERSE, component alpha: dest = dest * (mask * src.alpha). */
+static void
+mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+	__m64 d = load8888 (*dest);
+	__m64 sa = expand_alpha (s);
+
+	a = pix_multiply (a, sa);
+	d = pix_multiply (d, a);
+	*dest = store8888 (d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+/* OUT, component alpha: dest = (src * mask) * (1 - dest.alpha). */
+static void
+mmx_combine_out_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+	__m64 d = load8888 (*dest);
+	__m64 da = expand_alpha (d);
+
+	da = negate (da);	/* 255 - destination alpha */
+	s = pix_multiply (s, a);
+	s = pix_multiply (s, da);
+	*dest = store8888 (s);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+/* OUT_REVERSE, component alpha: dest = dest * (1 - mask * src.alpha). */
+static void
+mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+	__m64 d = load8888 (*dest);
+	__m64 sa = expand_alpha (s);
+
+	a = pix_multiply (a, sa);
+	a = negate (a);	/* 255 - mask * source alpha */
+	d = pix_multiply (d, a);
+	*dest = store8888 (d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+/* ATOP, component alpha: dest = dest * (1 - mask * src.alpha) + (src * mask) * dest.alpha. */
+static void
+mmx_combine_atop_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+	__m64 d = load8888 (*dest);
+	__m64 da = expand_alpha (d);
+	__m64 sa = expand_alpha (s);
+
+	s = pix_multiply (s, a);	/* src * mask */
+	a = pix_multiply (a, sa);	/* mask * source alpha */
+	a = negate (a);
+	d = pix_add_mul (d, a, s, da);
+	*dest = store8888 (d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+/* ATOP_REVERSE, component alpha: dest = dest * (mask * src.alpha) + (src * mask) * (1 - dest.alpha). */
+static void
+mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+	__m64 d = load8888 (*dest);
+	__m64 da = expand_alpha (d);
+	__m64 sa = expand_alpha (s);
+
+	s = pix_multiply (s, a);	/* src * mask */
+	a = pix_multiply (a, sa);	/* mask * source alpha */
+	da = negate (da);	/* 255 - destination alpha */
+	d = pix_add_mul (d, a, s, da);
+	*dest = store8888 (d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+/* XOR, component alpha: dest = dest * (1 - mask * src.alpha) + (src * mask) * (1 - dest.alpha). */
+static void
+mmx_combine_xor_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+	__m64 d = load8888 (*dest);
+	__m64 da = expand_alpha (d);
+	__m64 sa = expand_alpha (s);
+
+	s = pix_multiply (s, a);	/* src * mask */
+	a = pix_multiply (a, sa);	/* mask * source alpha */
+	da = negate (da);	/* 255 - destination alpha */
+	a = negate (a);	/* 255 - mask * source alpha */
+	d = pix_add_mul (d, a, s, da);
+	*dest = store8888 (d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+/* ADD, component alpha: dest = saturate (src * mask + dest). */
+static void
+mmx_combine_add_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (*mask);
+	__m64 s = load8888 (*src);
+	__m64 d = load8888 (*dest);
+
+	s = pix_multiply (s, a);
+	d = pix_add (s, d);
+	*dest = store8888 (d);	/* packing saturates each channel to 0xff */
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+/* ------------- MMX code paths called from fbpict.c -------------------- */
+/* Composite a solid source OVER a 32-bit destination: dst = src + dst * (1 - src.alpha). */
+static void
+mmx_composite_over_n_8888 (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);	/* presumably declares src_image, dest_image, width, height, ... from info */
+    uint32_t src;
+    uint32_t    *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;	/* an all-zero source leaves the destination unchanged */
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (unsigned long)dst & 7)	/* single pixels until dst is 8-byte aligned */
+	{
+	    *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 2)	/* two pixels per 64-bit load/store */
+	{
+	    __m64 vdest;
+	    __m64 dest0, dest1;
+
+	    vdest = *(__m64 *)dst;
+
+	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
+	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
+
+	    *(__m64 *)dst = pack8888 (dest0, dest1);
+
+	    dst += 2;
+	    w -= 2;
+	}
+
+	CHECKPOINT ();
+
+	if (w)	/* at most one pixel is left */
+	{
+	    *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
+	}
+    }
+
+    _mm_empty ();
+}
+/* Composite a solid source OVER a 16-bit 5-6-5 destination, expanding and repacking each pixel. */
+static void
+mmx_composite_over_n_0565 (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);	/* presumably declares src_image, dest_image, width, height, ... from info */
+    uint32_t src;
+    uint16_t    *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;	/* an all-zero source leaves the destination unchanged */
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (unsigned long)dst & 7)	/* single pixels until dst is 8-byte aligned */
+	{
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
+	    *dst = to_uint64 (vdest);	/* only the low 16 bits fit in *dst */
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 4)	/* four pixels per 64-bit load/store */
+	{
+	    __m64 vdest;
+
+	    vdest = *(__m64 *)dst;
+
+	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);	/* each step replaces one word in place */
+	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
+	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
+	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);
+
+	    *(__m64 *)dst = vdest;
+
+	    dst += 4;
+	    w -= 4;
+	}
+
+	CHECKPOINT ();
+
+	while (w)	/* remaining pixels, one at a time */
+	{
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
+	    *dst = to_uint64 (vdest);	/* only the low 16 bits fit in *dst */
+
+	    w--;
+	    dst++;
+	}
+    }
+
+    _mm_empty ();
+}
+/* Composite a solid source OVER a 32-bit destination through a per-pixel 32-bit component-alpha mask. */
+static void
+mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);	/* presumably declares src_image, mask_image, dest_image, width, height, ... */
+    uint32_t src;
+    uint32_t    *dst_line;
+    uint32_t    *mask_line;
+    int dst_stride, mask_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;	/* an all-zero source leaves the destination unchanged */
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	int twidth = width;
+	uint32_t *p = (uint32_t *)mask_line;	/* p walks the mask row */
+	uint32_t *q = (uint32_t *)dst_line;	/* q walks the destination row */
+
+	while (twidth && (unsigned long)q & 7)	/* single pixels until q is 8-byte aligned */
+	{
+	    uint32_t m = *(uint32_t *)p;
+
+	    if (m)	/* a zero mask pixel leaves the destination untouched */
+	    {
+		__m64 vdest = load8888 (*q);
+		vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
+		*q = store8888 (vdest);
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	while (twidth >= 2)	/* two pixels per 64-bit destination load/store */
+	{
+	    uint32_t m0, m1;
+	    m0 = *p;
+	    m1 = *(p + 1);
+
+	    if (m0 | m1)	/* skip the pair only when both mask pixels are zero */
+	    {
+		__m64 dest0, dest1;
+		__m64 vdest = *(__m64 *)q;
+
+		dest0 = in_over (vsrc, vsrca, load8888 (m0),
+		                 expand8888 (vdest, 0));
+		dest1 = in_over (vsrc, vsrca, load8888 (m1),
+		                 expand8888 (vdest, 1));
+
+		*(__m64 *)q = pack8888 (dest0, dest1);
+	    }
+
+	    p += 2;
+	    q += 2;
+	    twidth -= 2;
+	}
+
+	while (twidth)	/* remaining pixels, one at a time */
+	{
+	    uint32_t m = *(uint32_t *)p;
+
+	    if (m)
+	    {
+		__m64 vdest = load8888 (*q);
+		vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
+		*q = store8888 (vdest);
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+    }
+
+    _mm_empty ();
+}
+
+/*
+ * OVER with a 32-bit source, a solid mask and a 32-bit destination.  Only
+ * the top byte of the solid mask is used; it is replicated into all four
+ * bytes before being loaded into an MMX register.
+ */
+static void
+mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_row, *dp;
+    uint32_t *src_row, *sp;
+    uint32_t alpha;
+    __m64 valpha;
+    int dst_stride, src_stride;
+    int32_t left;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+
+    /* Keep bits 24-31 of the mask and copy them into the three lower bytes. */
+    alpha = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
+    alpha &= 0xff000000;
+    alpha |= (alpha >> 8) | (alpha >> 16) | (alpha >> 24);
+    valpha = load8888 (alpha);
+
+    for (; height--; dst_row += dst_stride, src_row += src_stride)
+    {
+	dp = dst_row;
+	sp = src_row;
+
+	/* Head: single pixels until dp is 8-byte aligned. */
+	for (left = width; left && ((unsigned long)dp & 7); left--, dp++, sp++)
+	{
+	    __m64 vs = load8888 (*sp);
+	    __m64 vd = load8888 (*dp);
+
+	    *dp = store8888 (in_over (vs, expand_alpha (vs), valpha, vd));
+	}
+
+	/* Body: two pixels per step; the source is fetched through ldq_u (). */
+	for (; left >= 2; left -= 2, dp += 2, sp += 2)
+	{
+	    __m64 spair = (__m64)ldq_u((uint64_t *)sp);
+	    __m64 dpair = *(__m64 *)dp;
+	    __m64 s0 = expand8888 (spair, 0);
+	    __m64 s1 = expand8888 (spair, 1);
+	    __m64 r0, r1;
+
+	    r0 = in_over (s0, expand_alpha (s0), valpha, expand8888 (dpair, 0));
+	    r1 = in_over (s1, expand_alpha (s1), valpha, expand8888 (dpair, 1));
+
+	    *(__m64 *)dp = pack8888 (r0, r1);
+	}
+
+	/* Tail: one pixel can remain after the paired loop. */
+	if (left)
+	{
+	    __m64 vs = load8888 (*sp);
+	    __m64 vd = load8888 (*dp);
+
+	    *dp = store8888 (in_over (vs, expand_alpha (vs), valpha, vd));
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    uint32_t mask;
+    __m64 vmask;
+    int dst_stride, src_stride;
+    int32_t w;
+    __m64 srca;
+    /* OVER with an x888 source, a solid mask and a 32-bit destination. */
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
+    /* Keep only the top byte of the mask and replicate it into all four bytes. */
+    mask &= 0xff000000;
+    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
+    vmask = load8888 (mask);
+    srca = MC (4x00ff); /* constant passed as the source alpha to every in_over () */
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+	/* Head: single pixels until dst is 8-byte aligned. */
+	while (w && (unsigned long)dst & 7)
+	{
+	    __m64 s = load8888 (*src | 0xff000000); /* top (x) byte forced to 0xff */
+	    __m64 d = load8888 (*dst);
+
+	    *dst = store8888 (in_over (s, srca, vmask, d));
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+	/* Body: 16 pixels per iteration, held as eight 64-bit destination words. */
+	while (w >= 16)
+	{
+	    __m64 vd0 = *(__m64 *)(dst + 0);
+	    __m64 vd1 = *(__m64 *)(dst + 2);
+	    __m64 vd2 = *(__m64 *)(dst + 4);
+	    __m64 vd3 = *(__m64 *)(dst + 6);
+	    __m64 vd4 = *(__m64 *)(dst + 8);
+	    __m64 vd5 = *(__m64 *)(dst + 10);
+	    __m64 vd6 = *(__m64 *)(dst + 12);
+	    __m64 vd7 = *(__m64 *)(dst + 14);
+	    /* The matching eight source words are fetched through ldq_u (). */
+	    __m64 vs0 = (__m64)ldq_u((uint64_t *)(src + 0));
+	    __m64 vs1 = (__m64)ldq_u((uint64_t *)(src + 2));
+	    __m64 vs2 = (__m64)ldq_u((uint64_t *)(src + 4));
+	    __m64 vs3 = (__m64)ldq_u((uint64_t *)(src + 6));
+	    __m64 vs4 = (__m64)ldq_u((uint64_t *)(src + 8));
+	    __m64 vs5 = (__m64)ldq_u((uint64_t *)(src + 10));
+	    __m64 vs6 = (__m64)ldq_u((uint64_t *)(src + 12));
+	    __m64 vs7 = (__m64)ldq_u((uint64_t *)(src + 14));
+	    /* Blend both pixels of every word and repack the result into vdN. */
+	    vd0 = pack8888 (
+	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
+	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
+
+	    vd1 = pack8888 (
+	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
+	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
+
+	    vd2 = pack8888 (
+	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
+	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
+
+	    vd3 = pack8888 (
+	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
+	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
+
+	    vd4 = pack8888 (
+	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
+	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
+
+	    vd5 = pack8888 (
+	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
+	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
+
+	    vd6 = pack8888 (
+	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
+	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
+
+	    vd7 = pack8888 (
+	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
+	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
+	    /* Write the eight blended words back to the destination. */
+	    *(__m64 *)(dst + 0) = vd0;
+	    *(__m64 *)(dst + 2) = vd1;
+	    *(__m64 *)(dst + 4) = vd2;
+	    *(__m64 *)(dst + 6) = vd3;
+	    *(__m64 *)(dst + 8) = vd4;
+	    *(__m64 *)(dst + 10) = vd5;
+	    *(__m64 *)(dst + 12) = vd6;
+	    *(__m64 *)(dst + 14) = vd7;
+
+	    w -= 16;
+	    dst += 16;
+	    src += 16;
+	}
+	/* Tail: fewer than 16 pixels remain; they are handled one at a time. */
+	while (w)
+	{
+	    __m64 s = load8888 (*src | 0xff000000); /* top (x) byte forced to 0xff */
+	    __m64 d = load8888 (*dst);
+
+	    *dst = store8888 (in_over (s, srca, vmask, d));
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+/*
+ * OVER with a 32-bit source onto a 32-bit destination, one pixel at a time.
+ * Source pixels whose top byte is 0xff are copied; all-zero ones are skipped.
+ */
+static void
+mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_row, *dp;
+    uint32_t *src_row, *sp;
+    int dst_stride, src_stride;
+    int32_t left;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+
+    for (; height--; dst_row += dst_stride, src_row += src_stride)
+    {
+	dp = dst_row;
+	sp = src_row;
+
+	for (left = width; left--; sp++, dp++)
+	{
+	    uint32_t pixel = *sp;
+	    uint8_t alpha = pixel >> 24;
+
+	    if (alpha == 0xff)
+	    {
+		/* Top byte 0xff: the source word is stored unchanged. */
+		*dp = pixel;
+	    }
+	    else if (pixel)
+	    {
+		__m64 vs = load8888 (pixel);
+		__m64 va = expand_alpha (vs);
+
+		*dp = store8888 (over (vs, va, load8888 (*dp)));
+	    }
+	    /* pixel == 0: the destination pixel is left unchanged. */
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    /* OVER with a 32-bit source onto a 16-bit (565) destination. */
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+    /* FIXME */
+    assert (src_image->drawable == mask_image->drawable);
+#endif
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	CHECKPOINT ();
+	/* Head: single pixels until dst is 8-byte aligned. */
+	while (w && (unsigned long)dst & 7)
+	{
+	    __m64 vsrc = load8888 (*src);
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (
+		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
+
+	    *dst = to_uint64 (vdest); /* dst is uint16_t: only the low 16 bits are stored */
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+
+	CHECKPOINT ();
+	/* Body: four 16-bit destination pixels per 64-bit word. */
+	while (w >= 4)
+	{
+	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
+	    __m64 vdest;
+
+	    vsrc0 = load8888 (*(src + 0));
+	    vsrc1 = load8888 (*(src + 1));
+	    vsrc2 = load8888 (*(src + 2));
+	    vsrc3 = load8888 (*(src + 3));
+
+	    vdest = *(__m64 *)dst;
+	    /* Step i expands pixel i of vdest, blends it, and packs it back at index i. */
+	    vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
+	    vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
+	    vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
+	    vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);
+
+	    *(__m64 *)dst = vdest;
+
+	    w -= 4;
+	    dst += 4;
+	    src += 4;
+	}
+
+	CHECKPOINT ();
+	/* Tail: the remaining (fewer than four) pixels, one at a time. */
+	while (w)
+	{
+	    __m64 vsrc = load8888 (*src);
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
+
+	    *dst = to_uint64 (vdest); /* dst is uint16_t: only the low 16 bits are stored */
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc, vsrca;
+    uint64_t srcsrc;
+    /* OVER with a solid source and an 8-bit mask onto a 32-bit destination. */
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24; /* top byte of the solid source */
+    if (src == 0)
+	return; /* all-zero source: return before anything is written */
+    /* Two copies of the source in one 64-bit word, used by the fast path below. */
+    srcsrc = (uint64_t)src << 32 | src;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	CHECKPOINT ();
+	/* Head: single pixels until dst is 8-byte aligned; zero mask bytes are skipped. */
+	while (w && (unsigned long)dst & 7)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		__m64 vdest = in_over (vsrc, vsrca,
+				       expand_alpha_rev (to_m64 (m)),
+				       load8888 (*dst));
+
+		*dst = store8888 (vdest);
+	    }
+
+	    w--;
+	    mask++;
+	    dst++;
+	}
+
+	CHECKPOINT ();
+	/* Body: two destination pixels per 64-bit word. */
+	while (w >= 2)
+	{
+	    uint64_t m0, m1;
+
+	    m0 = *mask;
+	    m1 = *(mask + 1);
+	    /* Source top byte 0xff and both mask bytes 0xff: store the source pair. */
+	    if (srca == 0xff && (m0 & m1) == 0xff)
+	    {
+		*(uint64_t *)dst = srcsrc;
+	    }
+	    else if (m0 | m1)
+	    {
+		__m64 vdest;
+		__m64 dest0, dest1;
+
+		vdest = *(__m64 *)dst;
+
+		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
+				 expand8888 (vdest, 0));
+		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
+				 expand8888 (vdest, 1));
+
+		*(__m64 *)dst = pack8888 (dest0, dest1);
+	    }
+
+	    mask += 2;
+	    dst += 2;
+	    w -= 2;
+	}
+
+	CHECKPOINT ();
+	/* Tail: one pixel can remain after the paired loop. */
+	if (w)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		__m64 vdest = load8888 (*dst);
+
+		vdest = in_over (
+		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
+		*dst = store8888 (vdest);
+	    }
+	}
+    }
+
+    _mm_empty ();
+}
+
+pixman_bool_t
+pixman_fill_mmx (uint32_t *bits,
+                 int       stride,
+                 int       bpp,
+                 int       x,
+                 int       y,
+                 int       width,
+                 int       height,
+                 uint32_t xor)
+{
+    uint64_t fill;
+    __m64 vfill;
+    uint32_t byte_width;
+    uint8_t     *byte_line;
+    /* Fills a width x height rectangle at (x, y) in bits with the value xor. */
+#if defined __GNUC__ && defined USE_X86_MMX
+    __m64 v1, v2, v3, v4, v5, v6, v7;
+#endif
+    /* Depths other than 8, 16 and 32 return FALSE before anything is written. */
+    if (bpp != 16 && bpp != 32 && bpp != 8)
+	return FALSE;
+
+    if (bpp == 8)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 1; /* stride is scaled by sizeof (uint32_t) */
+	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+	byte_width = width;
+	stride *= 1;
+        xor = (xor & 0xff) * 0x01010101; /* low byte replicated into all four bytes */
+    }
+    else if (bpp == 16)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 2; /* stride in 16-bit units for the pointer math */
+	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+	byte_width = 2 * width;
+	stride *= 2; /* back to bytes for stepping byte_line */
+        xor = (xor & 0xffff) * 0x00010001; /* low 16 bits replicated into both halves */
+    }
+    else
+    {
+	stride = stride * (int) sizeof (uint32_t) / 4;
+	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+	byte_width = 4 * width;
+	stride *= 4; /* back to bytes for stepping byte_line */
+    }
+    /* Two copies of the 32-bit pattern form the 64-bit fill word. */
+    fill = ((uint64_t)xor << 32) | xor;
+    vfill = to_m64 (fill);
+    /* With GCC on x86, vfill is copied into v1..v7 for the asm store loop below. */
+#if defined __GNUC__ && defined USE_X86_MMX
+    __asm__ (
+        "movq		%7,	%0\n"
+        "movq		%7,	%1\n"
+        "movq		%7,	%2\n"
+        "movq		%7,	%3\n"
+        "movq		%7,	%4\n"
+        "movq		%7,	%5\n"
+        "movq		%7,	%6\n"
+	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
+	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
+	: "y" (vfill));
+#endif
+
+    while (height--)
+    {
+	int w;
+	uint8_t *d = byte_line;
+
+	byte_line += stride;
+	w = byte_width;
+	/* Align d to 2, then 4, then 8 bytes using 1-, 2- and 4-byte stores. */
+	while (w >= 1 && ((unsigned long)d & 1))
+	{
+	    *(uint8_t *)d = (xor & 0xff);
+	    w--;
+	    d++;
+	}
+
+	while (w >= 2 && ((unsigned long)d & 3))
+	{
+	    *(uint16_t *)d = xor;
+	    w -= 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((unsigned long)d & 7))
+	{
+	    *(uint32_t *)d = xor;
+
+	    w -= 4;
+	    d += 4;
+	}
+	/* Main loop: 64 bytes per iteration written as eight 64-bit stores. */
+	while (w >= 64)
+	{
+#if defined __GNUC__ && defined USE_X86_MMX
+	    __asm__ (
+	        "movq	%1,	  (%0)\n"
+	        "movq	%2,	 8(%0)\n"
+	        "movq	%3,	16(%0)\n"
+	        "movq	%4,	24(%0)\n"
+	        "movq	%5,	32(%0)\n"
+	        "movq	%6,	40(%0)\n"
+	        "movq	%7,	48(%0)\n"
+	        "movq	%8,	56(%0)\n"
+		:
+		: "r" (d),
+		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
+		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
+		: "memory");
+#else
+	    *(__m64*) (d +  0) = vfill;
+	    *(__m64*) (d +  8) = vfill;
+	    *(__m64*) (d + 16) = vfill;
+	    *(__m64*) (d + 24) = vfill;
+	    *(__m64*) (d + 32) = vfill;
+	    *(__m64*) (d + 40) = vfill;
+	    *(__m64*) (d + 48) = vfill;
+	    *(__m64*) (d + 56) = vfill;
+#endif
+	    w -= 64;
+	    d += 64;
+	}
+	/* Remainder: 4-byte, then 2-byte, then 1-byte stores until w is 0. */
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = xor;
+
+	    w -= 4;
+	    d += 4;
+	}
+	while (w >= 2)
+	{
+	    *(uint16_t *)d = xor;
+	    w -= 2;
+	    d += 2;
+	}
+	while (w >= 1)
+	{
+	    *(uint8_t *)d = (xor & 0xff);
+	    w--;
+	    d++;
+	}
+
+    }
+
+    _mm_empty ();
+    return TRUE;
+}
+
+/*
+ * SRC with a solid source and an 8-bit mask onto a 32-bit destination:
+ * each pixel becomes in (source, mask), or zero where the mask byte is zero.
+ */
+static void
+mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
+                            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc;
+    uint64_t srcsrc;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+    {
+	/* An all-zero source: the rectangle is filled with 0 instead. */
+	pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride,
+			 PIXMAN_FORMAT_BPP (dest_image->bits.format),
+	                 dest_x, dest_y, width, height, 0);
+	return;
+    }
+
+    /* Two source pixels in one 64-bit word, used by the fast path below. */
+    srcsrc = (uint64_t)src << 32 | src;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (src);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	CHECKPOINT ();
+	/* Head: single pixels until dst is 8-byte aligned. */
+	for (; w && ((unsigned long)dst & 7); w--, mask++, dst++)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
+
+		*dst = store8888 (vdest);
+	    }
+	    else
+	    {
+		*dst = 0;
+	    }
+	}
+
+	CHECKPOINT ();
+	/* Body: two pixels per 64-bit store. */
+	for (; w >= 2; w -= 2, mask += 2, dst += 2)
+	{
+	    uint64_t m0, m1;
+	    m0 = *mask;
+	    m1 = *(mask + 1);
+
+	    if (srca == 0xff && (m0 & m1) == 0xff)
+	    {
+		/* Source top byte 0xff, both mask bytes 0xff: store the source. */
+		*(uint64_t *)dst = srcsrc;
+	    }
+	    else if (m0 | m1)
+	    {
+		__m64 dest0, dest1;
+
+		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
+		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
+
+		*(__m64 *)dst = pack8888 (dest0, dest1);
+	    }
+	    else
+	    {
+		/* Both mask bytes zero: the pair is cleared. */
+		*(uint64_t *)dst = 0;
+	    }
+	}
+
+	CHECKPOINT ();
+	/* Tail: one pixel can remain after the paired loop. */
+	if (w)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		/* The result does not depend on *dst, so it is not loaded here. */
+		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
+
+		*dst = store8888 (vdest);
+	    }
+	    else
+	    {
+		*dst = 0;
+	    }
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint16_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc, vsrca, tmp;
+    uint64_t srcsrcsrcsrc, src16;
+    /* OVER with a solid source and an 8-bit mask onto a 16-bit (565) destination. */
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24; /* top byte of the solid source */
+    if (src == 0)
+	return; /* all-zero source: return before anything is written */
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+    /* The source as packed by pack_565 (), then repeated at bit offsets 0/16/32/48. */
+    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
+    src16 = to_uint64 (tmp);
+
+    srcsrcsrcsrc =
+	(uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
+	(uint64_t)src16 << 16 | (uint64_t)src16;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	CHECKPOINT ();
+	/* Head: single pixels until dst is 8-byte aligned; zero mask bytes are skipped. */
+	while (w && (unsigned long)dst & 7)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		uint64_t d = *dst;
+		__m64 vd = to_m64 (d);
+		__m64 vdest = in_over (
+		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
+
+		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
+		*dst = to_uint64 (vd); /* dst is uint16_t: only the low 16 bits are stored */
+	    }
+
+	    w--;
+	    mask++;
+	    dst++;
+	}
+
+	CHECKPOINT ();
+	/* Body: four 16-bit destination pixels per 64-bit word. */
+	while (w >= 4)
+	{
+	    uint64_t m0, m1, m2, m3;
+	    m0 = *mask;
+	    m1 = *(mask + 1);
+	    m2 = *(mask + 2);
+	    m3 = *(mask + 3);
+	    /* Source top byte 0xff and all four mask bytes 0xff: store the packed source. */
+	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
+	    {
+		*(uint64_t *)dst = srcsrcsrcsrc;
+	    }
+	    else if (m0 | m1 | m2 | m3)
+	    {
+		__m64 vdest;
+		__m64 vm0, vm1, vm2, vm3;
+
+		vdest = *(__m64 *)dst;
+		/* Step i blends pixel i of vdest and packs it back at index i. */
+		vm0 = to_m64 (m0);
+		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
+					   expand565 (vdest, 0)), vdest, 0);
+		vm1 = to_m64 (m1);
+		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
+					   expand565 (vdest, 1)), vdest, 1);
+		vm2 = to_m64 (m2);
+		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
+					   expand565 (vdest, 2)), vdest, 2);
+		vm3 = to_m64 (m3);
+		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
+					   expand565 (vdest, 3)), vdest, 3);
+
+		*(__m64 *)dst = vdest;
+	    }
+
+	    w -= 4;
+	    mask += 4;
+	    dst += 4;
+	}
+
+	CHECKPOINT ();
+	/* Tail: the remaining (fewer than four) pixels, one at a time. */
+	while (w)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		uint64_t d = *dst;
+		__m64 vd = to_m64 (d);
+		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
+				       expand565 (vd, 0));
+		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
+		*dst = to_uint64 (vd); /* dst is uint16_t: only the low 16 bits are stored */
+	    }
+
+	    w--;
+	    mask++;
+	    dst++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    /* OVER for 32-bit "pixbuf" sources (via over_rev_non_pre ()) onto 16-bit 565. */
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+    /* FIXME */
+    assert (src_image->drawable == mask_image->drawable);
+#endif
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	CHECKPOINT ();
+	/* Head: single pixels until dst is 8-byte aligned. */
+	while (w && (unsigned long)dst & 7)
+	{
+	    __m64 vsrc = load8888 (*src);
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
+
+	    *dst = to_uint64 (vdest); /* dst is uint16_t: only the low 16 bits are stored */
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+
+	CHECKPOINT ();
+	/* Body: four 16-bit destination pixels per 64-bit word. */
+	while (w >= 4)
+	{
+	    uint32_t s0, s1, s2, s3;
+	    unsigned char a0, a1, a2, a3;
+
+	    s0 = *src;
+	    s1 = *(src + 1);
+	    s2 = *(src + 2);
+	    s3 = *(src + 3);
+
+	    a0 = (s0 >> 24);
+	    a1 = (s1 >> 24);
+	    a2 = (s2 >> 24);
+	    a3 = (s3 >> 24);
+	    /* All four top bytes 0xff: build the word from the sources, dst is not read. */
+	    if ((a0 & a1 & a2 & a3) == 0xFF)
+	    {
+		__m64 vdest;
+		vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0);
+		vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1);
+		vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2);
+		vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3);
+
+		*(__m64 *)dst = vdest;
+	    }
+	    else if (s0 | s1 | s2 | s3)
+	    {
+		__m64 vdest = *(__m64 *)dst;
+
+		vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0);
+		vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1);
+		vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2);
+		vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3);
+
+		*(__m64 *)dst = vdest;
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    src += 4;
+	}
+
+	CHECKPOINT ();
+	/* Tail: the remaining (fewer than four) pixels, one at a time. */
+	while (w)
+	{
+	    __m64 vsrc = load8888 (*src);
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
+
+	    *dst = to_uint64 (vdest); /* dst is uint16_t: only the low 16 bits are stored */
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+/*
+ * OVER for 32-bit "pixbuf" sources onto a 32-bit destination.
+ *
+ * Pixels are blended with over_rev_non_pre ().  In the paired loop, two
+ * source pixels whose top bytes are both 0xff are instead converted with
+ * invert_colors () and stored without reading the destination; a pair of
+ * all-zero source words is skipped.
+ */
+static void
+mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_row, *dp;
+    uint32_t *src_row, *sp;
+    int dst_stride, src_stride;
+    int32_t left;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+
+#if 0
+    /* FIXME */
+    assert (src_image->drawable == mask_image->drawable);
+#endif
+
+    for (; height--; dst_row += dst_stride, src_row += src_stride)
+    {
+	dp = dst_row;
+	sp = src_row;
+
+	/* Head: single pixels until dp is 8-byte aligned. */
+	for (left = width; left && ((unsigned long)dp & 7); left--, dp++, sp++)
+	{
+	    __m64 vs = load8888 (*sp);
+	    __m64 vd = load8888 (*dp);
+
+	    *dp = store8888 (over_rev_non_pre (vs, vd));
+	}
+
+	/* Body: two pixels per iteration through one 64-bit destination word. */
+	for (; left >= 2; left -= 2, dp += 2, sp += 2)
+	{
+	    uint64_t p0 = sp[0];
+	    uint64_t p1 = sp[1];
+	    unsigned char a0 = (p0 >> 24);
+	    unsigned char a1 = (p1 >> 24);
+	    __m64 r0, r1;
+
+	    if ((a0 & a1) == 0xFF)
+	    {
+		/* Both top bytes 0xff: the destination pair is overwritten. */
+		r0 = invert_colors (load8888 (p0));
+		r1 = invert_colors (load8888 (p1));
+
+		*(__m64 *)dp = pack8888 (r0, r1);
+	    }
+	    else if (p0 | p1)
+	    {
+		/* Otherwise blend against the current destination pair. */
+		__m64 pair = *(__m64 *)dp;
+
+		r0 = over_rev_non_pre (load8888 (p0), expand8888 (pair, 0));
+		r1 = over_rev_non_pre (load8888 (p1), expand8888 (pair, 1));
+
+		*(__m64 *)dp = pack8888 (r0, r1);
+	    }
+	    /* Both source words zero: the destination pair is untouched. */
+	}
+
+	/* Tail: one pixel can remain after the paired loop. */
+	if (left)
+	{
+	    __m64 vs = load8888 (*sp);
+	    __m64 vd = load8888 (*dp);
+
+	    *dp = store8888 (over_rev_non_pre (vs, vd));
+	}
+    }
+
+    /* Reset the MMX state before returning to the caller. */
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line;
+    uint32_t    *mask_line;
+    int dst_stride, mask_stride;
+    __m64 vsrc, vsrca;
+    /* OVER with a solid source and a 32-bit mask onto a 16-bit (565) destination. */
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return; /* all-zero source: return before anything is written */
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	int twidth = width;
+	uint32_t *p = (uint32_t *)mask_line;
+	uint16_t *q = (uint16_t *)dst_line;
+	/* Head: single pixels until q is 8-byte aligned; zero mask words are skipped. */
+	while (twidth && ((unsigned long)q & 7))
+	{
+	    uint32_t m = *(uint32_t *)p;
+
+	    if (m)
+	    {
+		uint64_t d = *q;
+		__m64 vdest = expand565 (to_m64 (d), 0);
+		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
+		*q = to_uint64 (vdest); /* q is uint16_t: only the low 16 bits are stored */
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+	/* Body: four 16-bit destination pixels per 64-bit word. */
+	while (twidth >= 4)
+	{
+	    uint32_t m0, m1, m2, m3;
+
+	    m0 = *p;
+	    m1 = *(p + 1);
+	    m2 = *(p + 2);
+	    m3 = *(p + 3);
+
+	    if ((m0 | m1 | m2 | m3))
+	    {
+		__m64 vdest = *(__m64 *)q;
+		/* Step i blends pixel i of vdest with mask word i and packs it back. */
+		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0);
+		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1);
+		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2);
+		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3);
+
+		*(__m64 *)q = vdest;
+	    }
+	    twidth -= 4;
+	    p += 4;
+	    q += 4;
+	}
+	/* Tail: the remaining (fewer than four) pixels, one at a time. */
+	while (twidth)
+	{
+	    uint32_t m;
+
+	    m = *(uint32_t *)p;
+	    if (m)
+	    {
+		uint64_t d = *q;
+		__m64 vdest = expand565 (to_m64 (d), 0);
+		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
+		*q = to_uint64 (vdest); /* q is uint16_t: only the low 16 bits are stored */
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	mask_line += mask_stride;
+	dst_line += dst_stride;
+    }
+
+    _mm_empty ();
+}
+
+/*
+ * IN with a solid source and an 8-bit mask on an 8-bit destination.
+ *
+ * The scalar loops use only the top byte (sa) of the solid source: each
+ * destination byte becomes MUL_UN8 (MUL_UN8 (sa, mask), dest).  The middle
+ * loop handles four bytes per step with in (in (alpha, mask), dest).
+ */
+static void
+mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
+                        pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_row, *dp;
+    uint8_t *mask_row, *mp;
+    int dst_stride, mask_stride;
+    int32_t left;
+    uint32_t solid;
+    uint8_t sa;
+    __m64 vsolid, vsolid_a;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_row, 1);
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    sa = solid >> 24;
+
+    vsolid = load8888 (solid);
+    vsolid_a = expand_alpha (vsolid);
+
+    for (; height--; dst_row += dst_stride, mask_row += mask_stride)
+    {
+	dp = dst_row;
+	mp = mask_row;
+
+	/* Head: scalar bytes until dp is 8-byte aligned. */
+	for (left = width; left && ((unsigned long)dp & 7); left--, mp++, dp++)
+	{
+	    uint16_t t;
+	    uint8_t cover = *mp;
+	    uint32_t scaled, out;
+
+	    out = *dp;
+	    scaled = MUL_UN8 (sa, cover, t);
+	    out = MUL_UN8 (scaled, out, t);
+
+	    *dp = out;
+	}
+
+	/* Body: four mask bytes and four destination bytes per step. */
+	while (left >= 4)
+	{
+	    __m64 vm = load8888 (ldl_u((uint32_t *)mp));
+	    __m64 vd = load8888 (*(uint32_t *)dp);
+
+	    *(uint32_t *)dp = store8888 (in (in (vsolid_a, vm), vd));
+
+	    dp += 4;
+	    mp += 4;
+	    left -= 4;
+	}
+
+	/* Tail: same scalar arithmetic as the head. */
+	for (; left--; mp++, dp++)
+	{
+	    uint16_t t;
+	    uint8_t cover = *mp;
+	    uint32_t scaled, out;
+
+	    out = *dp;
+	    scaled = MUL_UN8 (sa, cover, t);
+	    out = MUL_UN8 (scaled, out, t);
+
+	    *dp = out;
+	}
+    }
+
+    /* Reset the MMX state before returning to the caller. */
+    _mm_empty ();
+}
+
+/*
+ * IN with an 8-bit source on an 8-bit destination.
+ *
+ * The scalar loops replace each destination byte by
+ * MUL_UN8 (source, dest); once dst is 4-byte aligned, four bytes per step
+ * are combined with in ().
+ */
+static void
+mmx_composite_in_8_8 (pixman_implementation_t *imp,
+                      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_row, *dp;
+    uint8_t *src_row, *sp;
+    int src_stride, dst_stride;
+    int32_t left;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_row, 1);
+
+    while (height--)
+    {
+	dp = dst_row;
+	dst_row += dst_stride;
+	sp = src_row;
+	src_row += src_stride;
+	left = width;
+
+	/* Head: scalar bytes until dp is 4-byte aligned. */
+	for (; left && ((unsigned long)dp & 3); left--, sp++, dp++)
+	{
+	    uint8_t sv = *sp;
+	    uint8_t dv = *dp;
+	    uint16_t t;
+
+	    *dp = MUL_UN8 (sv, dv, t);
+	}
+
+	/* Body: four source bytes and four destination bytes per step. */
+	while (left >= 4)
+	{
+	    uint32_t *s4 = (uint32_t *)sp;
+	    uint32_t *d4 = (uint32_t *)dp;
+
+	    *d4 = store8888 (in (load8888 (ldl_u((uint32_t *)s4)), load8888 (*d4)));
+
+	    left -= 4;
+	    dp += 4;
+	    sp += 4;
+	}
+
+	/* Tail: same scalar arithmetic as the head. */
+	for (; left--; sp++, dp++)
+	{
+	    uint8_t sv = *sp;
+	    uint8_t dv = *dp;
+	    uint16_t t;
+
+	    *dp = MUL_UN8 (sv, dv, t);
+	}
+    }
+
+    /* Reset the MMX state before returning to the caller. */
+    _mm_empty ();
+}
+
+/*
+ * ADD with a solid source and an 8-bit mask on an 8-bit destination.
+ *
+ * The scalar loops compute ADD_UN8 (MUL_UN8 (sa, mask), dest); once dst is
+ * 4-byte aligned, four bytes per step are added with _mm_adds_pu8 ().
+ */
+static void
+mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
+			 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_row, *dp;
+    uint8_t *mask_row, *mp;
+    int dst_stride, mask_stride;
+    int32_t left;
+    uint32_t solid;
+    uint8_t sa;
+    __m64 vsolid, vsolid_a;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_row, 1);
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    sa = solid >> 24;
+
+    /* An all-zero source returns before anything is written. */
+    if (solid == 0)
+	return;
+
+    vsolid = load8888 (solid);
+    vsolid_a = expand_alpha (vsolid);
+
+    while (height--)
+    {
+	dp = dst_row;
+	dst_row += dst_stride;
+	mp = mask_row;
+	mask_row += mask_stride;
+	left = width;
+
+	/* Head: scalar bytes until dp is 4-byte aligned. */
+	for (; left && ((unsigned long)dp & 3); left--, mp++, dp++)
+	{
+	    uint16_t t;
+	    uint16_t cover = *mp;
+	    uint32_t scaled, old, sum;
+
+	    old = *dp;
+	    scaled = MUL_UN8 (sa, cover, t);
+	    sum = ADD_UN8 (scaled, old, t);
+
+	    *dp = sum;
+	}
+
+	/* Body: four mask bytes and four destination bytes per step. */
+	while (left >= 4)
+	{
+	    __m64 vm = load8888 (ldl_u((uint32_t *)mp));
+	    __m64 vd = load8888 (*(uint32_t *)dp);
+
+	    *(uint32_t *)dp = store8888 (_mm_adds_pu8 (in (vsolid_a, vm), vd));
+
+	    dp += 4;
+	    mp += 4;
+	    left -= 4;
+	}
+
+	/* Tail: same scalar arithmetic as the head. */
+	for (; left--; mp++, dp++)
+	{
+	    uint16_t t;
+	    uint16_t cover = *mp;
+	    uint32_t scaled, old, sum;
+
+	    old = *dp;
+	    scaled = MUL_UN8 (sa, cover, t);
+	    sum = ADD_UN8 (scaled, old, t);
+
+	    *dp = sum;
+	}
+    }
+
+    /* Reset the MMX state before returning to the caller. */
+    _mm_empty ();
+}
+
+/*
+ * ADD with an 8-bit source on an 8-bit destination.
+ *
+ * The scalar loops saturate the byte sum at 0xff by hand; once dst is
+ * 8-byte aligned, eight bytes per step are added with _mm_adds_pu8 ().
+ */
+static void
+mmx_composite_add_8_8 (pixman_implementation_t *imp,
+		       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_row, *dp;
+    uint8_t *src_row, *sp;
+    int dst_stride, src_stride;
+    int32_t left;
+    uint16_t sum;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_row, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 1);
+
+    while (height--)
+    {
+	dp = dst_row;
+	dst_row += dst_stride;
+	sp = src_row;
+	src_row += src_stride;
+	left = width;
+
+	/* Head: scalar bytes until dp is 8-byte aligned. */
+	for (; left && ((unsigned long)dp & 7); left--, dp++, sp++)
+	{
+	    uint8_t sv = *sp;
+	    uint8_t dv = *dp;
+
+	    sum = dv + sv;
+	    /* (sum >> 8) is 1 on overflow, which makes the OR-ed term all ones. */
+	    *dp = sum | (0 - (sum >> 8));
+	}
+
+	/* Body: eight bytes per step; the source is fetched through ldq_u (). */
+	for (; left >= 8; left -= 8, dp += 8, sp += 8)
+	{
+	    *(__m64*)dp = _mm_adds_pu8 ((__m64)ldq_u((uint64_t *)sp), *(__m64*)dp);
+	}
+
+	/* Tail: the remaining (fewer than eight) bytes. */
+	for (; left; left--, dp++, sp++)
+	{
+	    uint8_t sv = *sp;
+	    uint8_t dv = *dp;
+
+	    sum = dv + sv;
+	    /* Same hand-written saturation as in the head loop. */
+	    *dp = sum | (0 - (sum >> 8));
+	}
+    }
+
+    /* Reset the MMX state before returning to the caller. */
+    _mm_empty ();
+}
+
+/*
+ * ADD with a 32-bit source on a 32-bit destination using _mm_adds_pu8 ():
+ * single pixels at the row edges, pairs of pixels in between.
+ */
+static void
+mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    __m64 sum2;
+    uint32_t *dst_row, *dp;
+    uint32_t *src_row, *sp;
+    int dst_stride, src_stride;
+    int32_t left;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+
+    while (height--)
+    {
+	dp = dst_row;
+	dst_row += dst_stride;
+	sp = src_row;
+	src_row += src_stride;
+	left = width;
+
+	/* Head: single pixels until dp is 8-byte aligned. */
+	for (; left && ((unsigned long)dp & 7); left--, dp++, sp++)
+	{
+	    *dp = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*sp),
+	                                          _mm_cvtsi32_si64 (*dp)));
+	}
+
+	/* Body: two pixels per step; the source is fetched through ldq_u (). */
+	for (; left >= 2; left -= 2, dp += 2, sp += 2)
+	{
+	    sum2 = _mm_adds_pu8 ((__m64)ldq_u((uint64_t *)sp), *(__m64*)dp);
+	    *(uint64_t*)dp = to_uint64 (sum2);
+	}
+
+	/* Tail: one pixel can remain after the paired loop. */
+	if (left)
+	{
+	    *dp = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*sp),
+	                                          _mm_cvtsi32_si64 (*dp)));
+	}
+    }
+
+    _mm_empty ();
+}
+
+static pixman_bool_t
+pixman_blt_mmx (uint32_t *src_bits,
+                uint32_t *dst_bits,
+                int       src_stride,
+                int       dst_stride,
+                int       src_bpp,
+                int       dst_bpp,
+                int       src_x,
+                int       src_y,
+                int       dest_x,
+                int       dest_y,
+                int       width,
+                int       height)
+{
+    uint8_t *   src_bytes;
+    uint8_t *   dst_bytes;
+    int byte_width;
+    /* Handles only equal source/dest depths of 16 or 32 bpp; else returns FALSE. */
+    if (src_bpp != dst_bpp)
+	return FALSE;
+
+    if (src_bpp == 16)
+    {
+	src_stride = src_stride * (int) sizeof (uint32_t) / 2; /* uint16_t units */
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+	byte_width = 2 * width;
+	src_stride *= 2; /* uint16_t units -> bytes */
+	dst_stride *= 2;
+    }
+    else if (src_bpp == 32)
+    {
+	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+	byte_width = 4 * width;
+	src_stride *= 4; /* uint32_t units -> bytes */
+	dst_stride *= 4;
+    }
+    else
+    {
+	return FALSE;
+    }
+    /* Copy one row per iteration; both strides are byte counts from here on. */
+    while (height--)
+    {
+	int w;
+	uint8_t *s = src_bytes;
+	uint8_t *d = dst_bytes;
+	src_bytes += src_stride;
+	dst_bytes += dst_stride;
+	w = byte_width;
+	/* Bring d to 2-, then 4-, then 8-byte alignment with widening copies. */
+	while (w >= 1 && ((unsigned long)d & 1))
+	{
+	    *(uint8_t *)d = *(uint8_t *)s;
+	    w -= 1;
+	    s += 1;
+	    d += 1;
+	}
+
+	while (w >= 2 && ((unsigned long)d & 3))
+	{
+	    *(uint16_t *)d = *(uint16_t *)s;
+	    w -= 2;
+	    s += 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((unsigned long)d & 7))
+	{
+	    *(uint32_t *)d = ldl_u((uint32_t *)s);
+
+	    w -= 4;
+	    s += 4;
+	    d += 4;
+	}
+	/* Main loop: 64 bytes per iteration, moved as eight 8-byte quantities. */
+	while (w >= 64)
+	{
+#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
+	    __asm__ (
+	        "movq	  (%1),	  %%mm0\n"
+	        "movq	 8(%1),	  %%mm1\n"
+	        "movq	16(%1),	  %%mm2\n"
+	        "movq	24(%1),	  %%mm3\n"
+	        "movq	32(%1),	  %%mm4\n"
+	        "movq	40(%1),	  %%mm5\n"
+	        "movq	48(%1),	  %%mm6\n"
+	        "movq	56(%1),	  %%mm7\n"
+
+	        "movq	%%mm0,	  (%0)\n"
+	        "movq	%%mm1,	 8(%0)\n"
+	        "movq	%%mm2,	16(%0)\n"
+	        "movq	%%mm3,	24(%0)\n"
+	        "movq	%%mm4,	32(%0)\n"
+	        "movq	%%mm5,	40(%0)\n"
+	        "movq	%%mm6,	48(%0)\n"
+	        "movq	%%mm7,	56(%0)\n"
+		:
+		: "r" (d), "r" (s)
+		: "memory",
+		  "%mm0", "%mm1", "%mm2", "%mm3",
+		  "%mm4", "%mm5", "%mm6", "%mm7");
+#else
+	    __m64 v0 = ldq_u((uint64_t *)(s + 0));
+	    __m64 v1 = ldq_u((uint64_t *)(s + 8));
+	    __m64 v2 = ldq_u((uint64_t *)(s + 16));
+	    __m64 v3 = ldq_u((uint64_t *)(s + 24));
+	    __m64 v4 = ldq_u((uint64_t *)(s + 32));
+	    __m64 v5 = ldq_u((uint64_t *)(s + 40));
+	    __m64 v6 = ldq_u((uint64_t *)(s + 48));
+	    __m64 v7 = ldq_u((uint64_t *)(s + 56));
+	    *(__m64 *)(d + 0)  = v0;
+	    *(__m64 *)(d + 8)  = v1;
+	    *(__m64 *)(d + 16) = v2;
+	    *(__m64 *)(d + 24) = v3;
+	    *(__m64 *)(d + 32) = v4;
+	    *(__m64 *)(d + 40) = v5;
+	    *(__m64 *)(d + 48) = v6;
+	    *(__m64 *)(d + 56) = v7;
+#endif
+
+	    w -= 64;
+	    s += 64;
+	    d += 64;
+	}
+	while (w >= 4) /* leftover 4-byte words */
+	{
+	    *(uint32_t *)d = ldl_u((uint32_t *)s);
+
+	    w -= 4;
+	    s += 4;
+	    d += 4;
+	}
+	if (w >= 2) /* final 2 bytes, if any; byte_width is always even */
+	{
+	    *(uint16_t *)d = *(uint16_t *)s;
+	    w -= 2;
+	    s += 2;
+	    d += 2;
+	}
+    }
+
+    _mm_empty ();
+
+    return TRUE;
+}
+
+static void
+mmx_composite_copy_area (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    bits_image_t *from = &src_image->bits;
+    bits_image_t *to = &dest_image->bits;
+
+    /* Plain rectangle copy; the blit's boolean result is not examined. */
+    pixman_blt_mmx (from->bits, to->bits, from->rowstride, to->rowstride,
+                    PIXMAN_FORMAT_BPP (from->format),
+                    PIXMAN_FORMAT_BPP (to->format),
+                    src_x, src_y, dest_x, dest_y, width, height);
+}
+
+#if 0
+static void
+mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t  *src, *src_line;
+    uint32_t  *dst, *dst_line;
+    uint8_t  *mask, *mask_line;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+    /* Compiled out (#if 0). 32-bit source, 8-bit mask, 32-bit destination. */
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	src = src_line;
+	src_line += src_stride;
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	w = width;
+
+	while (w--)
+	{
+	    uint64_t m = *mask;
+	    /* A zero mask byte leaves the destination pixel untouched. */
+	    if (m)
+	    {
+		__m64 s = load8888 (*src | 0xff000000);
+		/* Full mask: the source, alpha forced to 0xff, replaces the dest. */
+		if (m == 0xff)
+		{
+		    *dst = store8888 (s);
+		}
+		else
+		{
+		    __m64 sa = expand_alpha (s);
+		    __m64 vm = expand_alpha_rev (to_m64 (m));
+		    __m64 vdest = in_over (s, sa, vm, load8888 (*dst));
+
+		    *dst = store8888 (vdest);
+		}
+	    }
+
+	    mask++;
+	    dst++;
+	    src++;
+	}
+    }
+
+    _mm_empty ();
+}
+#endif
+
+static const pixman_fast_path_t mmx_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
+#if 0
+    /* FIXME: These entries are compiled out because the MMX routine is
+     * apparently not actually faster than the generic code.
+     */
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
+#endif
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+    /* OVER, no mask, source formats that carry an alpha channel */
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
+    /* ADD */
+    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
+    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
+    /* SRC */
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
+    /* IN */
+    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
+    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
+
+    { PIXMAN_OP_NONE },
+};
+
+static pixman_bool_t
+mmx_blt (pixman_implementation_t *imp,
+         uint32_t *               src_bits,
+         uint32_t *               dst_bits,
+         int                      src_stride,
+         int                      dst_stride,
+         int                      src_bpp,
+         int                      dst_bpp,
+         int                      src_x,
+         int                      src_y,
+         int                      dest_x,
+         int                      dest_y,
+         int                      width,
+         int                      height)
+{
+    /* First choice: the MMX copy, which declines anything but 16/32 bpp. */
+    pixman_bool_t handled = pixman_blt_mmx (
+	src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+	src_x, src_y, dest_x, dest_y, width, height);
+
+    if (handled)
+	return TRUE;
+
+    /* Declined: forward the identical request to the delegate. */
+    return _pixman_implementation_blt (
+	imp->delegate, src_bits, dst_bits, src_stride, dst_stride,
+	src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y, width, height);
+}
+
+static pixman_bool_t
+mmx_fill (pixman_implementation_t *imp,
+          uint32_t *               bits,
+          int                      stride,
+          int                      bpp,
+          int                      x,
+          int                      y,
+          int                      width,
+          int                      height,
+          uint32_t xor)
+{
+    /* Use the MMX fill whenever it accepts the request ... */
+    if (pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
+	return TRUE;
+
+    /* ... and otherwise pass the same request on to the delegate. */
+    return _pixman_implementation_fill (
+	imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+}
+
+pixman_implementation_t *
+_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
+    /* NOTE(review): imp is used unchecked; confirm the creator cannot return NULL. */
+    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
+    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
+    /* The "_ca" combiners go into the separate combine_32_ca table. */
+    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
+    /* Copy and fill hooks; each forwards to imp->delegate when MMX declines. */
+    imp->blt = mmx_blt;
+    imp->fill = mmx_fill;
+
+    return imp;
+}
+
+#endif /* USE_X86_MMX || USE_ARM_IWMMXT */
diff --git a/pixman/pixman-noop.c b/pixman/pixman-noop.c
new file mode 100644
index 0000000..906a491
--- /dev/null
+++ b/pixman/pixman-noop.c
@@ -0,0 +1,137 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2011 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+static void
+noop_composite (pixman_implementation_t *imp,
+		pixman_composite_info_t *info)
+{
+    /* Intentionally empty: nothing is read and nothing is written. */
+}
+
+static void
+dest_write_back_direct (pixman_iter_t *iter)
+{
+    iter->buffer = iter->buffer + iter->image->bits.rowstride; /* next row */
+}
+
+static uint32_t *
+noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask)
+{
+    /* Hand out the current row in place, then move on to the following one. */
+    uint32_t *row = iter->buffer;
+
+    iter->buffer = row + iter->image->bits.rowstride;
+    return row;
+}
+
+static uint32_t *
+get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
+{
+    return NULL; /* installed by noop_src_iter_init when the iterator has no image */
+}
+
+static void
+noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+    /* Flags an image must carry before its rows are handed out in place. */
+#define FLAGS						\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+    /* Cases are tried in order; anything not matched goes to the delegate. */
+    if (!image)
+    {
+	iter->get_scanline = get_scanline_null;
+    }
+    else if ((iter->flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
+	     (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
+    {
+	iter->get_scanline = _pixman_iter_get_scanline_noop;
+    }
+    else if ((iter->flags & ITER_NARROW)				&&
+	     (image->common.flags & FLAGS) == FLAGS			&&
+	     iter->x >= 0 && iter->y >= 0				&&
+	     iter->x + iter->width <= image->bits.width			&&
+	     iter->y + iter->height <= image->bits.height		&&
+	     image->common.extended_format_code == PIXMAN_a8r8g8b8)
+    {
+	iter->buffer =
+	    image->bits.bits + iter->y * image->bits.rowstride + iter->x;
+	/* noop_get_scanline then returns rows straight from the image bits. */
+	iter->get_scanline = noop_get_scanline;
+    }
+    else
+    {
+	(* imp->delegate->src_iter_init) (imp->delegate, iter);
+    }
+}
+
+static void
+noop_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+    uint32_t image_flags = image->common.flags;
+    uint32_t iter_flags = iter->flags;
+    /* In place only for: std dest flags, narrow, a8r8g8b8 or localized x8r8g8b8. */
+    if ((image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS	&&
+	(iter_flags & ITER_NARROW) == ITER_NARROW				&&
+	((image->common.extended_format_code == PIXMAN_a8r8g8b8)	||
+	 (image->common.extended_format_code == PIXMAN_x8r8g8b8 &&
+	  (iter_flags & (ITER_LOCALIZED_ALPHA)))))
+    {
+	iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x;
+	/* buffer points into the image itself; write_back only steps to the next row. */
+	iter->get_scanline = _pixman_iter_get_scanline_noop;
+	iter->write_back = dest_write_back_direct;
+    }
+    else
+    {
+	(* imp->delegate->dest_iter_init) (imp->delegate, iter);
+    }
+}
+
+static const pixman_fast_path_t noop_fast_paths[] =
+{   /* PIXMAN_OP_DST with PIXMAN_any throughout is routed to noop_composite. */
+    { PIXMAN_OP_DST, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, noop_composite },
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_noop (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *noop;
+
+    /* Create the implementation, then install the two iterator hooks. */
+    noop = _pixman_implementation_create (fallback, noop_fast_paths);
+    noop->dest_iter_init = noop_dest_iter_init;
+    noop->src_iter_init = noop_src_iter_init;
+    return noop;
+}
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
new file mode 100644
index 0000000..cbd48f3
--- /dev/null
+++ b/pixman/pixman-private.h
@@ -0,0 +1,1001 @@
+#ifndef PACKAGE
+#  error config.h must be included before pixman-private.h
+#endif
+
+#ifndef PIXMAN_PRIVATE_H
+#define PIXMAN_PRIVATE_H
+
+#define PIXMAN_DISABLE_DEPRECATED
+#define PIXMAN_USE_INTERNAL_API
+
+#include "pixman.h"
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "pixman-compiler.h"
+
+/*
+ * Images
+ */
+typedef struct image_common image_common_t;
+typedef struct solid_fill solid_fill_t;
+typedef struct gradient gradient_t;
+typedef struct linear_gradient linear_gradient_t;
+typedef struct horizontal_gradient horizontal_gradient_t;
+typedef struct vertical_gradient vertical_gradient_t;
+typedef struct conical_gradient conical_gradient_t;
+typedef struct radial_gradient radial_gradient_t;
+typedef struct bits_image bits_image_t;
+typedef struct circle circle_t;
+
+typedef void (*fetch_scanline_t) (pixman_image_t *image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  uint32_t       *buffer,
+				  const uint32_t *mask);
+
+typedef uint32_t (*fetch_pixel_32_t) (bits_image_t *image,
+				      int           x,
+				      int           y);
+
+typedef uint64_t (*fetch_pixel_64_t) (bits_image_t *image,
+				      int           x,
+				      int           y);
+
+typedef void (*store_scanline_t) (bits_image_t *  image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  const uint32_t *values);
+
+typedef enum
+{
+    BITS,
+    LINEAR,
+    CONICAL,
+    RADIAL,
+    SOLID
+} image_type_t;
+
+typedef void (*property_changed_func_t) (pixman_image_t *image);
+
+struct image_common
+{
+    image_type_t                type;
+    int32_t                     ref_count;
+    pixman_region32_t           clip_region;
+    int32_t			alpha_count;	    /* How many times this image is being used as an alpha map */
+    pixman_bool_t               have_clip_region;   /* FALSE if there is no clip */
+    pixman_bool_t               client_clip;        /* Whether the source clip was
+						       set by a client */
+    pixman_bool_t               clip_sources;       /* Whether the clip applies when
+						     * the image is used as a source
+						     */
+    pixman_bool_t		dirty;
+    pixman_transform_t *        transform;
+    pixman_repeat_t             repeat;
+    pixman_filter_t             filter;
+    pixman_fixed_t *            filter_params;
+    int                         n_filter_params;
+    bits_image_t *              alpha_map;
+    int                         alpha_origin_x;
+    int                         alpha_origin_y;
+    pixman_bool_t               component_alpha;
+    property_changed_func_t     property_changed;
+
+    pixman_image_destroy_func_t destroy_func;
+    void *                      destroy_data;
+
+    uint32_t			flags;
+    pixman_format_code_t	extended_format_code;
+};
+
+struct solid_fill
+{
+    image_common_t common;
+    pixman_color_t color;
+    
+    uint32_t	   color_32;
+    uint64_t	   color_64;
+};
+
+struct gradient
+{
+    image_common_t	    common;
+    int                     n_stops;
+    pixman_gradient_stop_t *stops;
+};
+
+struct linear_gradient
+{
+    gradient_t           common;
+    pixman_point_fixed_t p1;
+    pixman_point_fixed_t p2;
+};
+
+struct circle
+{
+    pixman_fixed_t x;
+    pixman_fixed_t y;
+    pixman_fixed_t radius;
+};
+
+struct radial_gradient
+{
+    gradient_t common;
+
+    circle_t   c1;
+    circle_t   c2;
+
+    circle_t   delta;
+    double     a;
+    double     inva;
+    double     mindr;
+};
+
+struct conical_gradient
+{
+    gradient_t           common;
+    pixman_point_fixed_t center;
+    double		 angle;
+};
+
+struct bits_image
+{
+    image_common_t             common;
+    pixman_format_code_t       format;
+    const pixman_indexed_t *   indexed;
+    int                        width;
+    int                        height;
+    uint32_t *                 bits;
+    uint32_t *                 free_me;
+    int                        rowstride;  /* in number of uint32_t's */
+
+    fetch_scanline_t           get_scanline_32;
+    fetch_scanline_t           get_scanline_64;
+
+    fetch_scanline_t           fetch_scanline_32;
+    fetch_pixel_32_t	       fetch_pixel_32;
+    store_scanline_t           store_scanline_32;
+
+    fetch_scanline_t           fetch_scanline_64;
+    fetch_pixel_64_t	       fetch_pixel_64;
+    store_scanline_t           store_scanline_64;
+
+    /* Used for indirect access to the bits */
+    pixman_read_memory_func_t  read_func;
+    pixman_write_memory_func_t write_func;
+};
+
+union pixman_image
+{
+    image_type_t       type;
+    image_common_t     common;
+    bits_image_t       bits;
+    gradient_t         gradient;
+    linear_gradient_t  linear;
+    conical_gradient_t conical;
+    radial_gradient_t  radial;
+    solid_fill_t       solid;
+};
+
+typedef struct pixman_iter_t pixman_iter_t;
+typedef uint32_t *(* pixman_iter_get_scanline_t) (pixman_iter_t *iter, const uint32_t *mask);
+typedef void      (* pixman_iter_write_back_t)   (pixman_iter_t *iter);
+
+typedef enum
+{
+    ITER_NARROW =		(1 << 0),
+
+    /* "Localized alpha" is when the alpha channel is used only to compute
+     * the alpha value of the destination. This means that the computation
+     * of the RGB values of the result is independent of the alpha value.
+     *
+     * For example, the OVER operator has localized alpha for the
+     * destination, because the RGB values of the result can be computed
+     * without knowing the destination alpha. Similarly, ADD has localized
+     * alpha for both source and destination because the RGB values of the
+     * result can be computed without knowing the alpha value of source or
+     * destination.
+     *
+     * When the destination is xRGB, this is useful knowledge, because then
+     * we can treat it as if it were ARGB, which means in some cases we can
+     * avoid copying it to a temporary buffer.
+     */
+    ITER_LOCALIZED_ALPHA =	(1 << 1),
+    ITER_IGNORE_ALPHA =		(1 << 2),
+    ITER_IGNORE_RGB =		(1 << 3)
+} iter_flags_t;
+
+struct pixman_iter_t
+{
+    /* These are initialized by _pixman_implementation_{src,dest}_init */
+    pixman_image_t *		image;
+    uint32_t *			buffer;
+    int				x, y;
+    int				width;
+    int				height;
+    iter_flags_t		flags;
+
+    /* These function pointers are initialized by the implementation */
+    pixman_iter_get_scanline_t	get_scanline;
+    pixman_iter_write_back_t	write_back;
+
+    /* These fields are scratch data that implementations can use */
+    uint8_t *			bits;
+    int				stride;
+};
+
+void
+_pixman_bits_image_setup_accessors (bits_image_t *image);
+
+void
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t  *iter);
+
+void
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t  *iter);
+
+void
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_image_init (pixman_image_t *image);
+
+pixman_bool_t
+_pixman_bits_image_init (pixman_image_t *     image,
+                         pixman_format_code_t format,
+                         int                  width,
+                         int                  height,
+                         uint32_t *           bits,
+                         int                  rowstride);
+pixman_bool_t
+_pixman_image_fini (pixman_image_t *image);
+
+pixman_image_t *
+_pixman_image_allocate (void);
+
+pixman_bool_t
+_pixman_init_gradient (gradient_t *                  gradient,
+                       const pixman_gradient_stop_t *stops,
+                       int                           n_stops);
+void
+_pixman_image_reset_clip_region (pixman_image_t *image);
+
+void
+_pixman_image_validate (pixman_image_t *image);
+
+#define PIXMAN_IMAGE_GET_LINE(image, x, y, type, out_stride, line, mul)	\
+    do									\
+    {									\
+	uint32_t *__bits__;						\
+	int       __stride__;						\
+	/* rowstride counts uint32_t's; rescale it to units of type */	\
+	__bits__ = (image)->bits.bits;					\
+	__stride__ = (image)->bits.rowstride;				\
+	(out_stride) =							\
+	    __stride__ * (int) sizeof (uint32_t) / (int) sizeof (type);	\
+	(line) =							\
+	    ((type *) __bits__) + (out_stride) * (y) + (mul) * (x);	\
+    } while (0)
+
+/*
+ * Gradient walker
+ */
+typedef struct
+{
+    uint32_t                left_ag;
+    uint32_t                left_rb;
+    uint32_t                right_ag;
+    uint32_t                right_rb;
+    int32_t                 left_x;
+    int32_t                 right_x;
+    int32_t                 stepper;
+
+    pixman_gradient_stop_t *stops;
+    int                     num_stops;
+    unsigned int            spread;
+
+    int                     need_reset;
+} pixman_gradient_walker_t;
+
+void
+_pixman_gradient_walker_init (pixman_gradient_walker_t *walker,
+                              gradient_t *              gradient,
+                              unsigned int              spread);
+
+void
+_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker,
+                               pixman_fixed_32_32_t      pos);
+
+uint32_t
+_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
+                               pixman_fixed_32_32_t      x);
+
+/*
+ * Edges
+ */
+
+#define MAX_ALPHA(n)    ((1 << (n)) - 1)
+#define N_Y_FRAC(n)     ((n) == 1 ? 1 : (1 << ((n) / 2)) - 1)
+#define N_X_FRAC(n)     ((n) == 1 ? 1 : (1 << ((n) / 2)) + 1)
+
+#define STEP_Y_SMALL(n) (pixman_fixed_1 / N_Y_FRAC (n))
+#define STEP_Y_BIG(n)   (pixman_fixed_1 - (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n))
+
+#define Y_FRAC_FIRST(n) (STEP_Y_BIG (n) / 2)
+#define Y_FRAC_LAST(n)  (Y_FRAC_FIRST (n) + (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n))
+
+#define STEP_X_SMALL(n) (pixman_fixed_1 / N_X_FRAC (n))
+#define STEP_X_BIG(n)   (pixman_fixed_1 - (N_X_FRAC (n) - 1) * STEP_X_SMALL (n))
+
+#define X_FRAC_FIRST(n) (STEP_X_BIG (n) / 2)
+#define X_FRAC_LAST(n)  (X_FRAC_FIRST (n) + (N_X_FRAC (n) - 1) * STEP_X_SMALL (n))
+
+#define RENDER_SAMPLES_X(x, n)						\
+    ((n) == 1? 0 : (pixman_fixed_frac (x) +				\
+		    X_FRAC_FIRST (n)) / STEP_X_SMALL (n))
+
+void
+pixman_rasterize_edges_accessors (pixman_image_t *image,
+                                  pixman_edge_t * l,
+                                  pixman_edge_t * r,
+                                  pixman_fixed_t  t,
+                                  pixman_fixed_t  b);
+
+/*
+ * Implementations
+ */
+typedef struct pixman_implementation_t pixman_implementation_t;
+
+typedef struct
+{
+    pixman_op_t              op;
+    pixman_image_t *         src_image;
+    pixman_image_t *         mask_image;
+    pixman_image_t *         dest_image;
+    int32_t                  src_x;
+    int32_t                  src_y;
+    int32_t                  mask_x;
+    int32_t                  mask_y;
+    int32_t                  dest_x;
+    int32_t                  dest_y;
+    int32_t                  width;
+    int32_t                  height;
+
+    uint32_t                 src_flags;
+    uint32_t                 mask_flags;
+    uint32_t                 dest_flags;
+} pixman_composite_info_t;
+
+#define PIXMAN_COMPOSITE_ARGS(info) /* unpack info into locals */	\
+    MAYBE_UNUSED pixman_op_t        op = (info)->op;			\
+    MAYBE_UNUSED pixman_image_t *   src_image = (info)->src_image;	\
+    MAYBE_UNUSED pixman_image_t *   mask_image = (info)->mask_image;	\
+    MAYBE_UNUSED pixman_image_t *   dest_image = (info)->dest_image;	\
+    MAYBE_UNUSED int32_t            src_x = (info)->src_x;		\
+    MAYBE_UNUSED int32_t            src_y = (info)->src_y;		\
+    MAYBE_UNUSED int32_t            mask_x = (info)->mask_x;		\
+    MAYBE_UNUSED int32_t            mask_y = (info)->mask_y;		\
+    MAYBE_UNUSED int32_t            dest_x = (info)->dest_x;		\
+    MAYBE_UNUSED int32_t            dest_y = (info)->dest_y;		\
+    MAYBE_UNUSED int32_t            width = (info)->width;		\
+    MAYBE_UNUSED int32_t            height = (info)->height
+
+typedef void (*pixman_combine_32_func_t) (pixman_implementation_t *imp,
+					  pixman_op_t              op,
+					  uint32_t *               dest,
+					  const uint32_t *         src,
+					  const uint32_t *         mask,
+					  int                      width);
+
+typedef void (*pixman_combine_64_func_t) (pixman_implementation_t *imp,
+					  pixman_op_t              op,
+					  uint64_t *               dest,
+					  const uint64_t *         src,
+					  const uint64_t *         mask,
+					  int                      width);
+
+typedef void (*pixman_composite_func_t) (pixman_implementation_t *imp,
+					 pixman_composite_info_t *info);
+typedef pixman_bool_t (*pixman_blt_func_t) (pixman_implementation_t *imp,
+					    uint32_t *               src_bits,
+					    uint32_t *               dst_bits,
+					    int                      src_stride,
+					    int                      dst_stride,
+					    int                      src_bpp,
+					    int                      dst_bpp,
+					    int                      src_x,
+					    int                      src_y,
+					    int                      dest_x,
+					    int                      dest_y,
+					    int                      width,
+					    int                      height);
+typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp,
+					     uint32_t *               bits,
+					     int                      stride,
+					     int                      bpp,
+					     int                      x,
+					     int                      y,
+					     int                      width,
+					     int                      height,
+					     uint32_t                 xor);
+typedef void (*pixman_iter_init_func_t) (pixman_implementation_t *imp,
+                                         pixman_iter_t           *iter);
+
+void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp);
+void _pixman_setup_combiner_functions_64 (pixman_implementation_t *imp);
+
+typedef struct
+{
+    pixman_op_t             op;
+    pixman_format_code_t    src_format;
+    uint32_t		    src_flags;
+    pixman_format_code_t    mask_format;
+    uint32_t		    mask_flags;
+    pixman_format_code_t    dest_format;
+    uint32_t		    dest_flags;
+    pixman_composite_func_t func;
+} pixman_fast_path_t;
+
+struct pixman_implementation_t
+{
+    pixman_implementation_t *	toplevel;
+    pixman_implementation_t *	delegate;
+    const pixman_fast_path_t *	fast_paths;
+
+    pixman_blt_func_t		blt;
+    pixman_fill_func_t		fill;
+    pixman_iter_init_func_t     src_iter_init;
+    pixman_iter_init_func_t     dest_iter_init;
+
+    pixman_combine_32_func_t	combine_32[PIXMAN_N_OPERATORS];
+    pixman_combine_32_func_t	combine_32_ca[PIXMAN_N_OPERATORS];
+    pixman_combine_64_func_t	combine_64[PIXMAN_N_OPERATORS];
+    pixman_combine_64_func_t	combine_64_ca[PIXMAN_N_OPERATORS];
+};
+
+uint32_t
+_pixman_image_get_solid (pixman_implementation_t *imp,
+			 pixman_image_t *         image,
+                         pixman_format_code_t     format);
+
+pixman_implementation_t *
+_pixman_implementation_create (pixman_implementation_t *delegate,
+			       const pixman_fast_path_t *fast_paths);
+
+void
+_pixman_implementation_combine_32 (pixman_implementation_t *imp,
+                                   pixman_op_t              op,
+                                   uint32_t *               dest,
+                                   const uint32_t *         src,
+                                   const uint32_t *         mask,
+                                   int                      width);
+void
+_pixman_implementation_combine_64 (pixman_implementation_t *imp,
+                                   pixman_op_t              op,
+                                   uint64_t *               dest,
+                                   const uint64_t *         src,
+                                   const uint64_t *         mask,
+                                   int                      width);
+void
+_pixman_implementation_combine_32_ca (pixman_implementation_t *imp,
+                                      pixman_op_t              op,
+                                      uint32_t *               dest,
+                                      const uint32_t *         src,
+                                      const uint32_t *         mask,
+                                      int                      width);
+void
+_pixman_implementation_combine_64_ca (pixman_implementation_t *imp,
+                                      pixman_op_t              op,
+                                      uint64_t *               dest,
+                                      const uint64_t *         src,
+                                      const uint64_t *         mask,
+                                      int                      width);
+
+pixman_bool_t
+_pixman_implementation_blt (pixman_implementation_t *imp,
+                            uint32_t *               src_bits,
+                            uint32_t *               dst_bits,
+                            int                      src_stride,
+                            int                      dst_stride,
+                            int                      src_bpp,
+                            int                      dst_bpp,
+                            int                      src_x,
+                            int                      src_y,
+                            int                      dest_x,
+                            int                      dest_y,
+                            int                      width,
+                            int                      height);
+
+pixman_bool_t
+_pixman_implementation_fill (pixman_implementation_t *imp,
+                             uint32_t *               bits,
+                             int                      stride,
+                             int                      bpp,
+                             int                      x,
+                             int                      y,
+                             int                      width,
+                             int                      height,
+                             uint32_t                 xor);
+
+void
+_pixman_implementation_src_iter_init (pixman_implementation_t       *imp,
+				      pixman_iter_t                 *iter,
+				      pixman_image_t                *image,
+				      int                            x,
+				      int                            y,
+				      int                            width,
+				      int                            height,
+				      uint8_t                       *buffer,
+				      iter_flags_t                   flags);
+
+void
+_pixman_implementation_dest_iter_init (pixman_implementation_t       *imp,
+				       pixman_iter_t                 *iter,
+				       pixman_image_t                *image,
+				       int                            x,
+				       int                            y,
+				       int                            width,
+				       int                            height,
+				       uint8_t                       *buffer,
+				       iter_flags_t                   flags);
+
+/* Specific implementations */
+pixman_implementation_t *
+_pixman_implementation_create_general (void);
+
+pixman_implementation_t *
+_pixman_implementation_create_fast_path (pixman_implementation_t *fallback);
+
+pixman_implementation_t *
+_pixman_implementation_create_noop (pixman_implementation_t *fallback);
+
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT
+pixman_implementation_t *
+_pixman_implementation_create_mmx (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_SSE2
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_ARM_SIMD
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_ARM_NEON
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_VMX
+pixman_implementation_t *
+_pixman_implementation_create_vmx (pixman_implementation_t *fallback);
+#endif
+
+pixman_implementation_t *
+_pixman_choose_implementation (void);
+
+
+
+/*
+ * Utilities
+ */
+uint32_t *
+_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask);
+
+/* These "formats" all have depth 0, so they
+ * will never clash with any real ones
+ */
+#define PIXMAN_null             PIXMAN_FORMAT (0, 0, 0, 0, 0, 0)
+#define PIXMAN_solid            PIXMAN_FORMAT (0, 1, 0, 0, 0, 0)
+#define PIXMAN_pixbuf		PIXMAN_FORMAT (0, 2, 0, 0, 0, 0)
+#define PIXMAN_rpixbuf		PIXMAN_FORMAT (0, 3, 0, 0, 0, 0)
+#define PIXMAN_unknown		PIXMAN_FORMAT (0, 4, 0, 0, 0, 0)
+#define PIXMAN_any		PIXMAN_FORMAT (0, 5, 0, 0, 0, 0)
+
+#define PIXMAN_OP_any		(PIXMAN_N_OPERATORS + 1)
+
+#define FAST_PATH_ID_TRANSFORM			(1 <<  0)
+#define FAST_PATH_NO_ALPHA_MAP			(1 <<  1)
+#define FAST_PATH_NO_CONVOLUTION_FILTER		(1 <<  2)
+#define FAST_PATH_NO_PAD_REPEAT			(1 <<  3)
+#define FAST_PATH_NO_REFLECT_REPEAT		(1 <<  4)
+#define FAST_PATH_NO_ACCESSORS			(1 <<  5)
+#define FAST_PATH_NARROW_FORMAT			(1 <<  6)
+#define FAST_PATH_COMPONENT_ALPHA		(1 <<  8)
+#define FAST_PATH_SAMPLES_OPAQUE		(1 <<  7)
+#define FAST_PATH_UNIFIED_ALPHA			(1 <<  9)
+#define FAST_PATH_SCALE_TRANSFORM		(1 << 10)
+#define FAST_PATH_NEAREST_FILTER		(1 << 11)
+#define FAST_PATH_HAS_TRANSFORM			(1 << 12)
+#define FAST_PATH_IS_OPAQUE			(1 << 13)
+#define FAST_PATH_NO_NORMAL_REPEAT		(1 << 14)
+#define FAST_PATH_NO_NONE_REPEAT		(1 << 15)
+#define FAST_PATH_X_UNIT_POSITIVE		(1 << 16)
+#define FAST_PATH_AFFINE_TRANSFORM		(1 << 17)
+#define FAST_PATH_Y_UNIT_ZERO			(1 << 18)
+#define FAST_PATH_BILINEAR_FILTER		(1 << 19)
+#define FAST_PATH_ROTATE_90_TRANSFORM		(1 << 20)
+#define FAST_PATH_ROTATE_180_TRANSFORM		(1 << 21)
+#define FAST_PATH_ROTATE_270_TRANSFORM		(1 << 22)
+#define FAST_PATH_SAMPLES_COVER_CLIP_NEAREST	(1 << 23)
+#define FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR	(1 << 24)
+#define FAST_PATH_BITS_IMAGE			(1 << 25)
+
+#define FAST_PATH_PAD_REPEAT						\
+    (FAST_PATH_NO_NONE_REPEAT		|				\
+     FAST_PATH_NO_NORMAL_REPEAT		|				\
+     FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_NORMAL_REPEAT						\
+    (FAST_PATH_NO_NONE_REPEAT		|				\
+     FAST_PATH_NO_PAD_REPEAT		|				\
+     FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_NONE_REPEAT						\
+    (FAST_PATH_NO_NORMAL_REPEAT		|				\
+     FAST_PATH_NO_PAD_REPEAT		|				\
+     FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_REFLECT_REPEAT					\
+    (FAST_PATH_NO_NONE_REPEAT		|				\
+     FAST_PATH_NO_NORMAL_REPEAT		|				\
+     FAST_PATH_NO_PAD_REPEAT)
+
+#define FAST_PATH_STANDARD_FLAGS					\
+    (FAST_PATH_NO_CONVOLUTION_FILTER	|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NARROW_FORMAT)
+
+#define FAST_PATH_STD_DEST_FLAGS					\
+    (FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NARROW_FORMAT)
+
+#define SOURCE_FLAGS(format)						\
+    (FAST_PATH_STANDARD_FLAGS |						\
+     ((PIXMAN_ ## format == PIXMAN_solid) ?				\
+      0 : (FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | FAST_PATH_NEAREST_FILTER | FAST_PATH_ID_TRANSFORM)))
+
+#define MASK_FLAGS(format, extra)					\
+    ((PIXMAN_ ## format == PIXMAN_null) ? 0 : (SOURCE_FLAGS (format) | extra))
+
+#define FAST_PATH(op, src, src_flags, mask, mask_flags, dest, dest_flags, func) \
+    PIXMAN_OP_ ## op,							\
+    PIXMAN_ ## src,							\
+    src_flags,							        \
+    PIXMAN_ ## mask,						        \
+    mask_flags,							        \
+    PIXMAN_ ## dest,	                                                \
+    dest_flags,							        \
+    func
+
+#define PIXMAN_STD_FAST_PATH(op, src, mask, dest, func)			\
+    { FAST_PATH (							\
+	    op,								\
+	    src,  SOURCE_FLAGS (src),					\
+	    mask, MASK_FLAGS (mask, FAST_PATH_UNIFIED_ALPHA),		\
+	    dest, FAST_PATH_STD_DEST_FLAGS,				\
+	    func) }
+
+#define PIXMAN_STD_FAST_PATH_CA(op, src, mask, dest, func)		\
+    { FAST_PATH (							\
+	    op,								\
+	    src,  SOURCE_FLAGS (src),					\
+	    mask, MASK_FLAGS (mask, FAST_PATH_COMPONENT_ALPHA),		\
+	    dest, FAST_PATH_STD_DEST_FLAGS,				\
+	    func) }
+
+/* Memory allocation helpers */
+void *
+pixman_malloc_ab (unsigned int n, unsigned int b);
+
+void *
+pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c);
+
+pixman_bool_t
+_pixman_multiply_overflows_size (size_t a, size_t b);
+
+pixman_bool_t
+_pixman_multiply_overflows_int (unsigned int a, unsigned int b);
+
+pixman_bool_t
+_pixman_addition_overflows_int (unsigned int a, unsigned int b);
+
+/* Compositing utilities */
+void
+pixman_expand (uint64_t *           dst,
+               const uint32_t *     src,
+               pixman_format_code_t format,
+               int                  width);
+
+void
+pixman_contract (uint32_t *      dst,
+                 const uint64_t *src,
+                 int             width);
+
+pixman_bool_t
+_pixman_lookup_composite_function (pixman_implementation_t     *toplevel,
+				   pixman_op_t			op,
+				   pixman_format_code_t		src_format,
+				   uint32_t			src_flags,
+				   pixman_format_code_t		mask_format,
+				   uint32_t			mask_flags,
+				   pixman_format_code_t		dest_format,
+				   uint32_t			dest_flags,
+				   pixman_implementation_t    **out_imp,
+				   pixman_composite_func_t     *out_func);
+
+/* Region Helpers */
+pixman_bool_t
+pixman_region32_copy_from_region16 (pixman_region32_t *dst,
+                                    pixman_region16_t *src);
+
+pixman_bool_t
+pixman_region16_copy_from_region32 (pixman_region16_t *dst,
+                                    pixman_region32_t *src);
+
+
+/* Misc macros */
+
+#ifndef FALSE
+#   define FALSE 0
+#endif
+
+#ifndef TRUE
+#   define TRUE 1
+#endif
+
+#ifndef MIN /* smaller of a and b; each argument may be evaluated twice */
+#  define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#ifndef MAX /* larger of a and b; each argument may be evaluated twice */
+#  define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+/* Integer division that rounds towards -infinity (a and b are evaluated more than once) */
+#define DIV(a, b)					   \
+    ((((a) < 0) == ((b) < 0)) ? (a) / (b) :                \
+     ((a) - (b) + 1 - (((b) < 0) << 1)) / (b))
+
+/* Modulus that produces the remainder wrt. DIV; the formula only agrees with DIV for b > 0 */
+#define MOD(a, b) ((a) < 0 ? ((b) - ((-(a) - 1) % (b))) - 1 : (a) % (b))
+
+#define CLIP(v, low, high) ((v) < (low) ? (low) : ((v) > (high) ? (high) : (v)))
+
+/* Conversion between 8888 and 0565 */
+
+#define CONVERT_8888_TO_0565(s)						\
+    ((((s) >> 3) & 0x001f) |						\
+     (((s) >> 5) & 0x07e0) |						\
+     (((s) >> 8) & 0xf800))
+
+#define CONVERT_0565_TO_0888(s)						\
+    (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) |			\
+     ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |			\
+     ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)))
+
+#define CONVERT_0565_TO_8888(s) (CONVERT_0565_TO_0888(s) | 0xff000000)
+
+/* Trivial versions that are useful in macros */
+#define CONVERT_8888_TO_8888(s) (s)
+#define CONVERT_x888_TO_8888(s) ((s) | 0xff000000)
+#define CONVERT_0565_TO_0565(s) (s)
+
+#define PIXMAN_FORMAT_IS_WIDE(f)					\
+    (PIXMAN_FORMAT_A (f) > 8 ||						\
+     PIXMAN_FORMAT_R (f) > 8 ||						\
+     PIXMAN_FORMAT_G (f) > 8 ||						\
+     PIXMAN_FORMAT_B (f) > 8)
+
+#ifdef WORDS_BIGENDIAN
+#   define SCREEN_SHIFT_LEFT(x,n)	((x) << (n))
+#   define SCREEN_SHIFT_RIGHT(x,n)	((x) >> (n))
+#else
+#   define SCREEN_SHIFT_LEFT(x,n)	((x) >> (n))
+#   define SCREEN_SHIFT_RIGHT(x,n)	((x) << (n))
+#endif
+
+static force_inline uint32_t
+unorm_to_unorm (uint32_t val, int from_bits, int to_bits)
+{
+    uint32_t result;
+    /* Rescale val from from_bits of precision to to_bits; 0 if from_bits is 0. */
+    if (from_bits == 0)
+	return 0;
+
+    /* Delete any extra bits (unsigned shift, count kept below 32) */
+    val &= (from_bits < 32) ? (((uint32_t) 1 << from_bits) - 1) : 0xffffffff;
+
+    if (from_bits >= to_bits)
+	return val >> (from_bits - to_bits);
+
+    /* Start out with the high bit of val in the high bit of result. */
+    result = val << (to_bits - from_bits);
+
+    /* Copy the bits in result, doubling the number of bits each time, until
+     * we fill all to_bits. Unrolled manually because from_bits and to_bits
+     * are usually known statically, so the compiler can turn all of this
+     * into a few shifts.
+     */
+#define REPLICATE()							\
+    do									\
+    {									\
+	if (from_bits < to_bits)					\
+	{								\
+	    result |= result >> from_bits;				\
+									\
+	    from_bits *= 2;						\
+	}								\
+    }									\
+    while (0)
+
+    REPLICATE();
+    REPLICATE();
+    REPLICATE();
+    REPLICATE();
+    REPLICATE();
+#undef REPLICATE	/* keep the helper from leaking out of this header */
+    return result;
+}
+
+/*
+ * Various debugging code
+ */
+
+#undef DEBUG
+
+#define COMPILE_TIME_ASSERT(x)						\
+    do { typedef int compile_time_assertion [(x)?1:-1]; } while (0)
+
+/* Turn on debugging depending on what type of release this is
+ */
+#if (((PIXMAN_VERSION_MICRO % 2) == 0) && ((PIXMAN_VERSION_MINOR % 2) == 1))
+
+/* Debugging gets turned on for development releases because these
+ * are the things that end up in bleeding edge distributions such
+ * as Rawhide etc.
+ *
+ * For performance reasons we don't turn it on for stable releases or
+ * random git checkouts. (Random git checkouts are often used for
+ * performance work).
+ */
+
+#    define DEBUG
+
+#endif
+
+#ifdef DEBUG
+
+void
+_pixman_log_error (const char *function, const char *message);
+
+#define return_if_fail(expr)                                            \
+    do                                                                  \
+    {                                                                   \
+	if (!(expr))							\
+	{								\
+	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+	    return;							\
+	}								\
+    }                                                                   \
+    while (0)
+
+#define return_val_if_fail(expr, retval)                                \
+    do                                                                  \
+    {                                                                   \
+	if (!(expr))                                                    \
+	{								\
+	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+	    return (retval);						\
+	}								\
+    }                                                                   \
+    while (0)
+
+#define critical_if_fail(expr)						\
+    do									\
+    {									\
+	if (!(expr))							\
+	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+    }									\
+    while (0)
+
+
+#else
+
+#define _pixman_log_error(f,m) do { } while (0) /* no-op when DEBUG is off */
+
+#define return_if_fail(expr)						\
+    do                                                                  \
+    {                                                                   \
+	if (!(expr))							\
+	    return;							\
+    }                                                                   \
+    while (0)
+
+#define return_val_if_fail(expr, retval)                                \
+    do                                                                  \
+    {                                                                   \
+	if (!(expr))							\
+	    return (retval);						\
+    }                                                                   \
+    while (0)
+
+#define critical_if_fail(expr)						\
+    do									\
+    {									\
+    }									\
+    while (0)
+#endif
+
+/*
+ * Timers
+ */
+
+#ifdef PIXMAN_TIMERS
+
+static inline uint64_t
+oil_profile_stamp_rdtsc (void)
+{
+    uint32_t lo, hi;	/* rdtsc leaves the low half in EAX and the high half in EDX */
+    /* "=A" names the EDX:EAX pair only on 32-bit x86, so bind both halves explicitly */
+    __asm__ __volatile__ ("rdtsc\n" : "=a" (lo), "=d" (hi));
+    return ((uint64_t) hi << 32) | lo;
+}
+
+#define OIL_STAMP oil_profile_stamp_rdtsc
+
+typedef struct pixman_timer_t pixman_timer_t;
+
+struct pixman_timer_t
+{
+    int             initialized;
+    const char *    name;
+    uint64_t        n_times;
+    uint64_t        total;
+    pixman_timer_t *next;
+};
+
+extern int timer_defined;
+
+void pixman_timer_register (pixman_timer_t *timer);
+
+#define TIMER_BEGIN(tname)                                              \
+    {                                                                   \
+	static pixman_timer_t timer ## tname;                           \
+	uint64_t              begin ## tname;                           \
+        								\
+	if (!timer ## tname.initialized)				\
+	{                                                               \
+	    timer ## tname.initialized = 1;				\
+	    timer ## tname.name = # tname;				\
+	    pixman_timer_register (&timer ## tname);			\
+	}                                                               \
+									\
+	timer ## tname.n_times++;					\
+	begin ## tname = OIL_STAMP ();
+
+#define TIMER_END(tname)                                                \
+    timer ## tname.total += OIL_STAMP () - begin ## tname;		\
+    }
+
+#endif /* PIXMAN_TIMERS */
+
+#endif /* PIXMAN_PRIVATE_H */
diff --git a/pixman/pixman-radial-gradient.c b/pixman/pixman-radial-gradient.c
new file mode 100644
index 0000000..b6dd6b2
--- /dev/null
+++ b/pixman/pixman-radial-gradient.c
@@ -0,0 +1,470 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+static inline pixman_fixed_32_32_t
+dot (pixman_fixed_48_16_t x1,
+     pixman_fixed_48_16_t y1,
+     pixman_fixed_48_16_t z1,
+     pixman_fixed_48_16_t x2,
+     pixman_fixed_48_16_t y2,
+     pixman_fixed_48_16_t z2)
+{
+    /*
+     * Integer dot product (x1,y1,z1).(x2,y2,z2). Exact computation, assuming
+     * that the input values can be represented as pixman_fixed_16_16_t
+     */
+    return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static inline double
+fdot (double x1,
+      double y1,
+      double z1,
+      double x2,
+      double y2,
+      double z2)
+{
+    /*
+     * Floating-point dot product (x1,y1,z1).(x2,y2,z2). The error can be
+     * unbounded in some special cases. Using clever dot product algorithms
+     * (for example compensated dot product) would improve this but make
+     * the code much less obvious
+     */
+    return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static uint32_t
+radial_compute_color (double                    a,
+		      double                    b,
+		      double                    c,
+		      double                    inva,
+		      double                    dr,
+		      double                    mindr,
+		      pixman_gradient_walker_t *walker,
+		      pixman_repeat_t           repeat)
+{
+    /* Solve a*t^2 - 2*b*t + c = 0 for t; return the walker's pixel at t, else 0.
+     * In this function error propagation can lead to bad results:
+     *  - discr can have an unbounded error (if b*b-a*c is very small),
+     *    potentially making it the opposite sign of what it should have been
+     *    (thus clearing a pixel that would have been colored or vice-versa)
+     *    or propagating the error to sqrtdiscr;
+     *    if discr has the wrong sign or b is very small, this can lead to bad
+     *    results
+     *
+     *  - the algorithm used to compute the solutions of the quadratic
+     *    equation is not numerically stable (but saves one division compared
+     *    to the numerically stable one);
+     *    this can be a problem if a*c is much smaller than b*b
+     *
+     *  - the above problems are worse if a is small (as inva becomes bigger)
+     */
+    double discr;
+
+    if (a == 0)
+    {
+	double t;
+	/* a == 0: the equation degenerates to -2*b*t + c = 0 */
+	if (b == 0)
+	    return 0;
+	/* i.e. t = c / (2 * b), scaled by pixman_fixed_1 */
+	t = pixman_fixed_1 / 2 * c / b;
+	if (repeat == PIXMAN_REPEAT_NONE)
+	{
+	    if (0 <= t && t <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t);
+	}
+	else
+	{
+	    if (t * dr > mindr)
+		return _pixman_gradient_walker_pixel (walker, t);
+	}
+
+	return 0;
+    }
+    /* discr = b*b - a*c */
+    discr = fdot (b, a, 0, b, -c, 0);
+    if (discr >= 0)
+    {
+	double sqrtdiscr, t0, t1;
+
+	sqrtdiscr = sqrt (discr);
+	t0 = (b + sqrtdiscr) * inva;
+	t1 = (b - sqrtdiscr) * inva;
+
+	/*
+	 * The root that must be used is the biggest one that belongs
+	 * to the valid range ([0,1] for PIXMAN_REPEAT_NONE, any
+	 * solution that results in a positive radius otherwise).
+	 *
+	 * If a > 0, t0 is the biggest solution, so if it is valid, it
+	 * is the correct result.
+	 *
+	 * If a < 0, only one of the solutions can be valid, so the
+	 * order in which they are tested is not important.
+	 */
+	if (repeat == PIXMAN_REPEAT_NONE)
+	{
+	    if (0 <= t0 && t0 <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t0);
+	    else if (0 <= t1 && t1 <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t1);
+	}
+	else
+	{
+	    if (t0 * dr > mindr)
+		return _pixman_gradient_walker_pixel (walker, t0);
+	    else if (t1 * dr > mindr)
+		return _pixman_gradient_walker_pixel (walker, t1);
+	}
+    }
+    /* no root in the valid range (or discr < 0) */
+    return 0;
+}
+
+static uint32_t *
+radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    /*
+     * Implementation of radial gradients following the PDF specification.
+     * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference
+     * Manual (PDF 32000-1:2008 at the time of this writing).
+     *
+     * In the radial gradient problem we are given two circles (c₁,r₁) and
+     * (c₂,r₂) that define the gradient itself.
+     *
+     * Mathematically the gradient can be defined as the family of circles
+     *
+     *     ((1-t)·c₁ + t·(c₂), (1-t)·r₁ + t·r₂)
+     *
+     * excluding those circles whose radius would be < 0. When a point
+     * belongs to more than one circle, the one with a bigger t is the only
+     * one that contributes to its color. When a point does not belong
+     * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0).
+     * Further limitations on the range of values for t are imposed when
+     * the gradient is not repeated, namely t must belong to [0,1].
+     *
+     * The graphical result is the same as drawing the valid (radius > 0)
+     * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient
+     * is not repeated) using SOURCE operator composition.
+     *
+     * It looks like a cone pointing towards the viewer if the ending circle
+     * is smaller than the starting one, a cone pointing inside the page if
+     * the starting circle is the smaller one and like a cylinder if they
+     * have the same radius.
+     *
+     * What we actually do is, given the point whose color we are interested
+     * in, compute the t values for that point, solving for t in:
+     *
+     *     length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂
+     *
+     * Let's rewrite it in a simpler way, by defining some auxiliary
+     * variables:
+     *
+     *     cd = c₂ - c₁
+     *     pd = p - c₁
+     *     dr = r₂ - r₁
+     *     length(t·cd - pd) = r₁ + t·dr
+     *
+     * which actually means
+     *
+     *     hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr
+     *
+     * or
+     *
+     *     ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr.
+     *
+     * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes:
+     *
+     *     (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)²
+     *
+     * where we can actually expand the squares and solve for t:
+     *
+     *     t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² =
+     *       = r₁² + 2·r₁·t·dr + t²·dr²
+     *
+     *     (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t +
+     *         (pdx² + pdy² - r₁²) = 0
+     *
+     *     A = cdx² + cdy² - dr²
+     *     B = pdx·cdx + pdy·cdy + r₁·dr
+     *     C = pdx² + pdy² - r₁²
+     *     At² - 2Bt + C = 0
+     *
+     * The solutions (unless the equation degenerates because of A = 0) are:
+     *
+     *     t = (B ± ⎷(B² - A·C)) / A
+     *
+     * The solution we are going to prefer is the bigger one, unless the
+     * radius associated to it is negative (or it falls outside the valid t
+     * range).
+     *
+     * Additional observations (useful for optimizations):
+     * A does not depend on p
+     *
+     * A < 0 <=> one of the two circles completely contains the other one
+     *   <=> for every p, the radiuses associated with the two t solutions
+     *       have opposite sign
+     */
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    uint32_t *buffer = iter->buffer;
+
+    gradient_t *gradient = (gradient_t *)image;
+    radial_gradient_t *radial = (radial_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    pixman_gradient_walker_t walker;
+    pixman_vector_t v, unit;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return iter->buffer;
+
+	unit.vector[0] = image->common.transform->matrix[0][0];
+	unit.vector[1] = image->common.transform->matrix[1][0];
+	unit.vector[2] = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+	unit.vector[0] = pixman_fixed_1;
+	unit.vector[1] = 0;
+	unit.vector[2] = 0;
+    }
+
+    if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)
+    {
+	/*
+	 * Given:
+	 *
+	 * t = (B ± ⎷(B² - A·C)) / A
+	 *
+	 * where
+	 *
+	 * A = cdx² + cdy² - dr²
+	 * B = pdx·cdx + pdy·cdy + r₁·dr
+	 * C = pdx² + pdy² - r₁²
+	 * det = B² - A·C
+	 *
+	 * Since we have an affine transformation, we know that (pdx, pdy)
+	 * increase linearly with each pixel,
+	 *
+	 * pdx = pdx₀ + n·ux,
+	 * pdy = pdy₀ + n·uy,
+	 *
+	 * we can then express B, C and det through multiple differentiation.
+	 */
+	pixman_fixed_32_32_t b, db, c, dc, ddc;
+
+	/* warning: this computation may overflow */
+	v.vector[0] -= radial->c1.x;
+	v.vector[1] -= radial->c1.y;
+
+	/*
+	 * B and C are computed and updated exactly.
+	 * If fdot was used instead of dot, in the worst case it would
+	 * lose 11 bits of precision in each of the multiplication and
+	 * summing up would zero out all the bit that were preserved,
+	 * thus making the result 0 instead of the correct one.
+	 * This would mean a worst case of unbound relative error or
+	 * about 2^10 absolute error
+	 */
+	b = dot (v.vector[0], v.vector[1], radial->c1.radius,
+		 radial->delta.x, radial->delta.y, radial->delta.radius);
+	db = dot (unit.vector[0], unit.vector[1], 0,
+		  radial->delta.x, radial->delta.y, 0);
+
+	c = dot (v.vector[0], v.vector[1],
+		 -((pixman_fixed_48_16_t) radial->c1.radius),
+		 v.vector[0], v.vector[1], radial->c1.radius);
+	dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0],
+		  2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1],
+		  0,
+		  unit.vector[0], unit.vector[1], 0);
+	ddc = 2 * dot (unit.vector[0], unit.vector[1], 0,
+		       unit.vector[0], unit.vector[1], 0);
+
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+		*buffer = radial_compute_color (radial->a, b, c,
+						radial->inva,
+						radial->delta.radius,
+						radial->mindr,
+						&walker,
+						image->common.repeat);
+	    }
+
+	    b += db;
+	    c += dc;
+	    dc += ddc;
+	    ++buffer;
+	}
+    }
+    else
+    {
+	/* projective */
+	/* Warning:
+	 * error propagation guarantees are much looser than in the affine case
+	 */
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+		if (v.vector[2] != 0)
+		{
+		    double pdx, pdy, invv2, b, c;
+
+		    invv2 = 1. * pixman_fixed_1 / v.vector[2];
+
+		    pdx = v.vector[0] * invv2 - radial->c1.x;
+		    /*    / pixman_fixed_1 */
+
+		    pdy = v.vector[1] * invv2 - radial->c1.y;
+		    /*    / pixman_fixed_1 */
+
+		    b = fdot (pdx, pdy, radial->c1.radius,
+			      radial->delta.x, radial->delta.y,
+			      radial->delta.radius);
+		    /*  / pixman_fixed_1 / pixman_fixed_1 */
+
+		    c = fdot (pdx, pdy, -radial->c1.radius,
+			      pdx, pdy, radial->c1.radius);
+		    /*  / pixman_fixed_1 / pixman_fixed_1 */
+
+		    *buffer = radial_compute_color (radial->a, b, c,
+						    radial->inva,
+						    radial->delta.radius,
+						    radial->mindr,
+						    &walker,
+						    image->common.repeat);
+		}
+		else
+		{
+		    *buffer = 0;
+		}
+	    }
+
+	    ++buffer;
+
+	    v.vector[0] += unit.vector[0];
+	    v.vector[1] += unit.vector[1];
+	    v.vector[2] += unit.vector[2];
+	}
+    }
+
+    iter->y++;
+    return iter->buffer;
+}
+
+static uint32_t *
+radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    /* The mask is not consulted: the narrow fetcher is always run unmasked. */
+    uint32_t *pixels = radial_get_scanline_narrow (iter, NULL);
+    /* Widen the a8r8g8b8 scanline in place, reusing the same storage. */
+    pixman_expand ((uint64_t *)pixels, pixels, PIXMAN_a8r8g8b8, iter->width);
+    return pixels;
+}
+
+void
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    /* Pick the fetcher for the iterator's pixel width; 'image' is unused. */
+    iter->get_scanline = (iter->flags & ITER_NARROW)
+	? radial_get_scanline_narrow
+	: radial_get_scanline_wide;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_radial_gradient (pixman_point_fixed_t *        inner,
+                                     pixman_point_fixed_t *        outer,
+                                     pixman_fixed_t                inner_radius,
+                                     pixman_fixed_t                outer_radius,
+                                     const pixman_gradient_stop_t *stops,
+                                     int                           n_stops)
+{
+    /* Create a radial gradient image described by the circle
+     * (inner, inner_radius) and the circle (outer, outer_radius).
+     * Returns NULL when the image or its stops cannot be set up. */
+    pixman_image_t *img = _pixman_image_allocate ();
+    radial_gradient_t *grad;
+
+    if (img == NULL)
+	return NULL;
+
+    grad = &img->radial;
+    if (!_pixman_init_gradient (&grad->common, stops, n_stops))
+    {
+	free (img);
+	return NULL;
+    }
+
+    img->type = RADIAL;
+
+    /* First circle (c1) and second circle (c2), stored as given. */
+    grad->c1.x = inner->x;
+    grad->c2.x = outer->x;
+    grad->c1.y = inner->y;
+    grad->c2.y = outer->y;
+    grad->c1.radius = inner_radius;
+    grad->c2.radius = outer_radius;
+
+    /* warning: these subtractions may overflow */
+    grad->delta.x = grad->c2.x - grad->c1.x;
+    grad->delta.y = grad->c2.y - grad->c1.y;
+    grad->delta.radius = grad->c2.radius - grad->c1.radius;
+
+    /* computed exactly by dot(), and only then converted to double, so
+     * every bit of the double representation is correct (53 bits) */
+    grad->a = dot (grad->delta.x, grad->delta.y, -grad->delta.radius,
+		   grad->delta.x, grad->delta.y, grad->delta.radius);
+    if (grad->a != 0)
+	grad->inva = 1. * pixman_fixed_1 / grad->a;
+    grad->mindr = -1. * pixman_fixed_1 * grad->c1.radius;
+
+    return img;
+}
diff --git a/pixman/pixman-region.c b/pixman/pixman-region.c
new file mode 100644
index 0000000..47beb52
--- /dev/null
+++ b/pixman/pixman-region.c
@@ -0,0 +1,2810 @@
+/*
+ * Copyright 1987, 1988, 1989, 1998  The Open Group
+ * 
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation.
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ * Except as contained in this notice, the name of The Open Group shall not be
+ * used in advertising or otherwise to promote the sale, use or other dealings
+ * in this Software without prior written authorization from The Open Group.
+ * 
+ * Copyright 1987, 1988, 1989 by
+ * Digital Equipment Corporation, Maynard, Massachusetts.
+ * 
+ *                    All Rights Reserved
+ * 
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation for any purpose and without fee is hereby granted,
+ * provided that the above copyright notice appear in all copies and that
+ * both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of Digital not be
+ * used in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission.
+ * 
+ * DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
+ * DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Copyright © 1998 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+#include "pixman-private.h"
+
+#define PIXREGION_NIL(reg) ((reg)->data && !(reg)->data->numRects) /* has data, zero rects */
+/* "not a region": data is the shared broken-region sentinel */
+#define PIXREGION_NAR(reg)      ((reg)->data == pixman_broken_data)
+#define PIXREGION_NUMRECTS(reg) ((reg)->data ? (reg)->data->numRects : 1) /* NULL data: 1 rect */
+#define PIXREGION_SIZE(reg) ((reg)->data ? (reg)->data->size : 0)
+#define PIXREGION_RECTS(reg) \
+    ((reg)->data ? (box_type_t *)((reg)->data + 1) \
+     : &(reg)->extents)
+#define PIXREGION_BOXPTR(reg) ((box_type_t *)((reg)->data + 1)) /* boxes follow the header; data must not be NULL */
+#define PIXREGION_BOX(reg, i) (&PIXREGION_BOXPTR (reg)[i])
+#define PIXREGION_TOP(reg) PIXREGION_BOX (reg, (reg)->data->numRects) /* one past the last box */
+#define PIXREGION_END(reg) PIXREGION_BOX (reg, (reg)->data->numRects - 1) /* the last box */
+/* GOOD_RECT: strictly positive width and height. BAD_RECT: inverted on some axis. */
+#define GOOD_RECT(rect) ((rect)->x1 < (rect)->x2 && (rect)->y1 < (rect)->y2)
+#define BAD_RECT(rect) ((rect)->x1 > (rect)->x2 || (rect)->y1 > (rect)->y2)
+
+#ifdef DEBUG
+/* Debug builds: log an error when a region fails its self-check. */
+#define GOOD(reg)							\
+    do									\
+    {									\
+	if (!PREFIX (_selfcheck (reg)))					\
+	    _pixman_log_error (FUNC, "Malformed region " # reg);	\
+    } while (0)
+
+#else
+/* Other builds: GOOD() expands to nothing. */
+#define GOOD(reg)
+
+#endif
+
+static const box_type_t PREFIX (_empty_box_) = { 0, 0, 0, 0 };
+static const region_data_type_t PREFIX (_empty_data_) = { 0, 0 };
+#if defined (__llvm__) && !defined (__clang__) /* NOTE(review): presumably an llvm-gcc workaround -- confirm */
+static const volatile region_data_type_t PREFIX (_broken_data_) = { 0, 0 };
+#else
+static const region_data_type_t PREFIX (_broken_data_) = { 0, 0 };
+#endif
+/* Non-const aliases; the two all-zero data sentinels are told apart by address only. */
+static box_type_t *pixman_region_empty_box =
+    (box_type_t *)&PREFIX (_empty_box_);
+static region_data_type_t *pixman_region_empty_data =
+    (region_data_type_t *)&PREFIX (_empty_data_);
+static region_data_type_t *pixman_broken_data =
+    (region_data_type_t *)&PREFIX (_broken_data_);
+
+static pixman_bool_t
+pixman_break (region_type_t *region);
+
+/*
+ * The functions in this file implement the Region abstraction used extensively
+ * throughout the X11 sample server. A Region is simply a set of disjoint
+ * (non-overlapping) rectangles, plus an "extent" rectangle which is the
+ * smallest single rectangle that contains all the non-overlapping rectangles.
+ *
+ * A Region is implemented as a "y-x-banded" array of rectangles.  This array
+ * imposes two degrees of order.  First, all rectangles are sorted by top side
+ * y coordinate first (y1), and then by left side x coordinate (x1).
+ *
+ * Furthermore, the rectangles are grouped into "bands".  Each rectangle in a
+ * band has the same top y coordinate (y1), and each has the same bottom y
+ * coordinate (y2).  Thus all rectangles in a band differ only in their left
+ * and right side (x1 and x2).  Bands are implicit in the array of rectangles:
+ * there is no separate list of band start pointers.
+ *
+ * The y-x band representation does not minimize rectangles.  In particular,
+ * if a rectangle vertically crosses a band (the rectangle has scanlines in
+ * the y1 to y2 area spanned by the band), then the rectangle may be broken
+ * down into two or more smaller rectangles stacked one atop the other.
+ *
+ *  -----------				    -----------
+ *  |         |				    |         |		    band 0
+ *  |         |  --------		    -----------  --------
+ *  |         |  |      |  in y-x banded    |         |  |      |   band 1
+ *  |         |  |      |  form is	    |         |  |      |
+ *  -----------  |      |		    -----------  --------
+ *               |      |				 |      |   band 2
+ *               --------				 --------
+ *
+ * An added constraint on the rectangles is that they must cover as much
+ * horizontal area as possible: no two rectangles within a band are allowed
+ * to touch.
+ *
+ * Whenever possible, bands will be merged together to cover a greater vertical
+ * distance (and thus reduce the number of rectangles). Two bands can be merged
+ * only if the bottom of one touches the top of the other and they have
+ * rectangles in the same places (of the same width, of course).
+ *
+ * Adam de Boor wrote most of the original region code.  Joel McCormack
+ * substantially modified or rewrote most of the core arithmetic routines, and
+ * added pixman_region_validate in order to support several speed improvements
+ * to pixman_region_validate_tree.  Bob Scheifler changed the representation
+ * to be more compact when empty or a single rectangle, and did a bunch of
+ * gratuitous reformatting. Carl Worth did further gratuitous reformatting
+ * while re-merging the server and client region code into libpixregion.
+ * Soren Sandmann did even more gratuitous reformatting.
+ */
+
+/*  true iff two Boxes overlap */
+#define EXTENTCHECK(r1, r2)	   \
+    (!( ((r1)->x2 <= (r2)->x1)  || \
+        ((r1)->x1 >= (r2)->x2)  || \
+        ((r1)->y2 <= (r2)->y1)  || \
+        ((r1)->y1 >= (r2)->y2) ) )
+
+/* true iff (x,y) is in Box */
+#define INBOX(r, x, y)	\
+    ( ((r)->x2 >  x) && \
+      ((r)->x1 <= x) && \
+      ((r)->y2 >  y) && \
+      ((r)->y1 <= y) )
+
+/* true iff Box r1 contains Box r2 */
+#define SUBSUMES(r1, r2)	\
+    ( ((r1)->x1 <= (r2)->x1) && \
+      ((r1)->x2 >= (r2)->x2) && \
+      ((r1)->y1 <= (r2)->y1) && \
+      ((r1)->y2 >= (r2)->y2) )
+
+static size_t
+PIXREGION_SZOF (size_t n)
+{
+    /* Bytes needed for a region data header followed by n boxes, or 0
+     * when that total would exceed UINT32_MAX. */
+    const size_t header_bytes = sizeof(region_data_type_t);
+    size_t box_bytes;
+
+    if (n > UINT32_MAX / sizeof(box_type_t))
+	return 0;
+    box_bytes = n * sizeof(box_type_t);
+    return (header_bytes > UINT32_MAX - box_bytes) ? 0 : box_bytes + header_bytes;
+}
+
+static void *
+alloc_data (size_t n)
+{
+    /* Allocate an uninitialised region data block with room for n boxes.
+     * PIXREGION_SZOF reports an oversized request as 0 bytes; that is
+     * turned into NULL here, so malloc is never asked for 0 bytes. */
+    const size_t n_bytes = PIXREGION_SZOF (n);
+
+    return n_bytes != 0 ? malloc (n_bytes) : NULL;
+}
+
+#define FREE_DATA(reg) do { if ((reg)->data && (reg)->data->size) free ((reg)->data); } while (0) /* zero-size data (static sentinels) is never freed */
+/* Grow region for n more rects if needed; on allocation failure jump to the 'bail' label. */
+#define RECTALLOC_BAIL(region, n, bail)					\
+    do									\
+    {									\
+	if (!(region)->data ||						\
+	    (((region)->data->numRects + (n)) > (region)->data->size))	\
+	{								\
+	    if (!pixman_rect_alloc (region, n))				\
+		goto bail;						\
+	}								\
+    } while (0)
+/* As RECTALLOC_BAIL, but on failure returns FALSE from the enclosing function. */
+#define RECTALLOC(region, n)						\
+    do									\
+    {									\
+	if (!(region)->data ||						\
+	    (((region)->data->numRects + (n)) > (region)->data->size))	\
+	{								\
+	    if (!pixman_rect_alloc (region, n)) {			\
+		return FALSE;						\
+	    }								\
+	}								\
+    } while (0)
+/* Store one box at next_rect and advance the pointer; performs no capacity check. */
+#define ADDRECT(next_rect, nx1, ny1, nx2, ny2)      \
+    do						    \
+    {						    \
+	next_rect->x1 = nx1;                        \
+	next_rect->y1 = ny1;                        \
+	next_rect->x2 = nx2;                        \
+	next_rect->y2 = ny2;                        \
+	next_rect++;                                \
+    }						    \
+    while (0)
+/* Append one box, growing the region first when it is full; returns FALSE from the caller on failure. */
+#define NEWRECT(region, next_rect, nx1, ny1, nx2, ny2)			\
+    do									\
+    {									\
+	if (!(region)->data ||						\
+	    ((region)->data->numRects == (region)->data->size))		\
+	{								\
+	    if (!pixman_rect_alloc (region, 1))				\
+		return FALSE;						\
+	    next_rect = PIXREGION_TOP (region);				\
+	}								\
+	ADDRECT (next_rect, nx1, ny1, nx2, ny2);			\
+	region->data->numRects++;					\
+	critical_if_fail (region->data->numRects <= region->data->size);		\
+    } while (0)
+/* Shrink storage to numRects when size > 50 and under half is used; a failed realloc keeps the old block. */
+#define DOWNSIZE(reg, numRects)						\
+    do									\
+    {									\
+	if (((numRects) < ((reg)->data->size >> 1)) &&			\
+	    ((reg)->data->size > 50))					\
+	{								\
+	    region_data_type_t * new_data;				\
+	    size_t data_size = PIXREGION_SZOF (numRects);		\
+									\
+	    if (!data_size)						\
+	    {								\
+		new_data = NULL;					\
+	    }								\
+	    else							\
+	    {								\
+		new_data = (region_data_type_t *)			\
+		    realloc ((reg)->data, data_size);			\
+	    }								\
+									\
+	    if (new_data)						\
+	    {								\
+		new_data->size = (numRects);				\
+		(reg)->data = new_data;					\
+	    }								\
+	}								\
+    } while (0)
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_equal) (region_type_t *reg1, region_type_t *reg2)
+{
+    /*
+     * Regions compare equal when their extents match, they hold the same
+     * number of rectangles, and the rectangles agree pairwise in order.
+     * Broken regions are not treated specially.
+     */
+    box_type_t *lhs;
+    box_type_t *rhs;
+    int k;
+
+    /* Cheap rejection first: differing bounding boxes. */
+    if (reg1->extents.x1 != reg2->extents.x1 ||
+	reg1->extents.x2 != reg2->extents.x2 ||
+	reg1->extents.y1 != reg2->extents.y1 ||
+	reg1->extents.y2 != reg2->extents.y2)
+    {
+	return FALSE;
+    }
+
+    /* Then differing rectangle counts. */
+    if (PIXREGION_NUMRECTS (reg1) != PIXREGION_NUMRECTS (reg2))
+	return FALSE;
+
+    lhs = PIXREGION_RECTS (reg1);
+    rhs = PIXREGION_RECTS (reg2);
+
+    /* Finally walk both rectangle arrays side by side. */
+    for (k = 0; k != PIXREGION_NUMRECTS (reg1); k++)
+    {
+	if (lhs[k].x1 != rhs[k].x1 ||
+	    lhs[k].x2 != rhs[k].x2 ||
+	    lhs[k].y1 != rhs[k].y1 ||
+	    lhs[k].y2 != rhs[k].y2)
+	{
+	    return FALSE;
+	}
+    }
+
+    return TRUE;
+}
+
+int
+PREFIX (_print) (region_type_t *rgn)
+{
+    /* Debug helper: dump the rectangle count, allocated size, extents and
+     * every rectangle to stderr. Returns the rectangle count. */
+    const int n_boxes = PIXREGION_NUMRECTS (rgn);
+    const int capacity = PIXREGION_SIZE (rgn);
+    const box_type_t *boxes = PIXREGION_RECTS (rgn);
+    const box_type_t *ext = &rgn->extents;
+    int k;
+
+    fprintf (stderr, "num: %d size: %d\n", n_boxes, capacity);
+    fprintf (stderr, "extents: %d %d %d %d\n",
+	     ext->x1, ext->y1, ext->x2, ext->y2);
+
+    for (k = 0; k < n_boxes; k++)
+    {
+	const box_type_t *box = &boxes[k];
+
+	fprintf (stderr, "%d %d %d %d \n",
+		 box->x1, box->y1, box->x2, box->y2);
+    }
+
+    /* A blank line terminates the dump. */
+    fprintf (stderr, "\n");
+
+    return n_boxes;
+}
+
+
+PIXMAN_EXPORT void
+PREFIX (_init) (region_type_t *region)
+{
+    region->extents = *pixman_region_empty_box; /* all-zero box */
+    region->data = pixman_region_empty_data; /* shared static sentinel, not heap storage */
+}
+
+PIXMAN_EXPORT void
+PREFIX (_init_rect) (region_type_t *	region,
+                     int		x,
+		     int		y,
+		     unsigned int	width,
+		     unsigned int	height)
+{
+    box_type_t *ext = &region->extents;
+
+    ext->x1 = x;
+    ext->y1 = y;
+    ext->x2 = x + width;
+    ext->y2 = y + height;
+    if (GOOD_RECT (ext))
+    {
+	region->data = NULL;	/* single rect: extents is the rectangle */
+	return;
+    }
+    if (BAD_RECT (ext)) /* inverted, not merely empty */
+	_pixman_log_error (FUNC, "Invalid rectangle passed");
+    PREFIX (_init) (region); /* empty or invalid: become the empty region */
+}
+
+PIXMAN_EXPORT void
+PREFIX (_init_with_extents) (region_type_t *region, box_type_t *extents)
+{
+    if (GOOD_RECT (extents))
+    {
+	region->extents = *extents;
+	region->data = NULL;	/* single rect: no data block needed */
+	return;
+    }
+
+    if (BAD_RECT (extents)) /* inverted, not merely empty */
+	_pixman_log_error (FUNC, "Invalid rectangle passed");
+    PREFIX (_init) (region); /* empty or invalid: become the empty region */
+}
+
+PIXMAN_EXPORT void
+PREFIX (_fini) (region_type_t *region)
+{
+    GOOD (region); /* self-check; expands to nothing unless DEBUG is defined */
+    FREE_DATA (region); /* frees data only when it has a non-zero size */
+}
+
+PIXMAN_EXPORT int
+PREFIX (_n_rects) (region_type_t *region)
+{
+    return PIXREGION_NUMRECTS (region); /* 1 when data is NULL, else data->numRects */
+}
+
+PIXMAN_EXPORT box_type_t *
+PREFIX (_rectangles) (region_type_t *region,
+                      int               *n_rects)
+{
+    if (n_rects) /* the count output is optional */
+	*n_rects = PIXREGION_NUMRECTS (region);
+
+    return PIXREGION_RECTS (region); /* &region->extents when data is NULL */
+}
+
+static pixman_bool_t
+pixman_break (region_type_t *region)
+{
+    FREE_DATA (region);
+    /* Mark the region as "not a region": empty extents plus the broken
+     * sentinel that PIXREGION_NAR tests for. */
+    region->extents = *pixman_region_empty_box;
+    region->data = pixman_broken_data;
+    return FALSE; /* always FALSE, so callers can 'return pixman_break (r)' */
+}
+
+static pixman_bool_t
+pixman_rect_alloc (region_type_t * region,
+                   int             n)
+{
+    /* Make room for n more boxes in region. On any allocation failure the
+     * region is broken via pixman_break() and its FALSE is returned. */
+    region_data_type_t *data;
+
+    if (!region->data)
+    {
+	/* Single-rect region: its box lives in extents, so reserve one
+	 * extra slot and copy that box in as rect 0. */
+	n++;
+	region->data = alloc_data (n);
+	if (!region->data)
+	    return pixman_break (region);
+
+	region->data->numRects = 1;
+	*PIXREGION_BOXPTR (region) = region->extents;
+    }
+    else if (!region->data->size)
+    {
+	/* Zero-size data is never freed or grown (see FREE_DATA), so a
+	 * fresh block is allocated instead. */
+	region->data = alloc_data (n);
+	if (!region->data)
+	    return pixman_break (region);
+
+	region->data->numRects = 0;
+    }
+    else
+    {
+	size_t data_size;
+
+	if (n == 1)
+	{
+	    /* One-at-a-time growth: add numRects slots instead, capped. */
+	    n = region->data->numRects;
+	    if (n > 500) /* XXX pick numbers out of a hat */
+		n = 250;
+	}
+
+	n += region->data->numRects;
+	data_size = PIXREGION_SZOF (n);
+
+	/* A zero data_size signals overflow. Reuse the computed value for
+	 * realloc rather than evaluating PIXREGION_SZOF a second time. */
+	data = data_size
+	    ? (region_data_type_t *) realloc (region->data, data_size)
+	    : NULL;
+	if (!data)
+	    return pixman_break (region);
+
+	region->data = data;
+    }
+
+    region->data->size = n;
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_copy) (region_type_t *dst, region_type_t *src)
+{
+    /* Make dst a copy of src. Returns TRUE on success; when allocation
+     * fails dst is broken and pixman_break's FALSE is returned. */
+    region_data_type_t *from;
+
+    GOOD (dst);
+    GOOD (src);
+    if (dst == src)
+	return TRUE;
+
+    dst->extents = src->extents;
+    from = src->data;
+
+    /* NULL or zero-size data owns no storage: share the pointer as is. */
+    if (!from || !from->size)
+    {
+	FREE_DATA (dst);
+	dst->data = from;
+	return TRUE;
+    }
+
+    /* Allocate only when dst cannot already hold all of src's boxes. */
+    if (!dst->data || (dst->data->size < from->numRects))
+    {
+	FREE_DATA (dst);
+	dst->data = alloc_data (from->numRects);
+	if (!dst->data)
+	    return pixman_break (dst);
+	dst->data->size = from->numRects;
+    }
+    dst->data->numRects = from->numRects;
+    memmove ((char *)PIXREGION_BOXPTR (dst), (char *)PIXREGION_BOXPTR (src),
+	     dst->data->numRects * sizeof(box_type_t));
+    return TRUE;
+}
+
+/*======================================================================
+ *	    Generic Region Operator
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_coalesce --
+ *	Attempt to merge the boxes in the current band with those in the
+ *	previous one.  We are guaranteed that the current band extends to
+ *      the end of the rects array.  Used only by pixman_op.
+ *
+ * Results:
+ *	The new index for the previous band.
+ *
+ * Side Effects:
+ *	If coalescing takes place:
+ *	    - rectangles in the previous band will have their y2 fields
+ *	      altered.
+ *	    - region->data->numRects will be decreased.
+ *
+ *-----------------------------------------------------------------------
+ */
+static inline int
+pixman_coalesce (region_type_t * region,      /* Region to coalesce		 */
+		 int             prev_start,  /* Index of start of previous band */
+		 int             cur_start)   /* Index of start of current band  */
+{
+    box_type_t *upper;          /* First box of the previous band    */
+    box_type_t *lower;          /* First box of the current band     */
+    int numRects;               /* Number of boxes in each band      */
+    int bottom;                 /* Bottom (y2) of the current band   */
+    int i;
+
+    /*
+     * The previous band occupies [prev_start, cur_start). The current
+     * band is expected to be exactly as long and to end the array.
+     */
+    numRects = cur_start - prev_start;
+    critical_if_fail (numRects == region->data->numRects - cur_start);
+
+    if (numRects == 0)
+	return cur_start;
+
+    /*
+     * Merging is only possible for vertically adjacent bands: the
+     * previous band must end on the scanline where the current begins.
+     */
+    upper = PIXREGION_BOX (region, prev_start);
+    lower = PIXREGION_BOX (region, cur_start);
+
+    if (upper->y2 != lower->y1)
+	return cur_start;
+
+    /*
+     * Each box must also line up horizontally with its counterpart.
+     * Boxes within a band are assumed never to touch each other, so
+     * matching x spans box-for-box means the bands cover the same
+     * columns.
+     */
+    for (i = 0; i < numRects; i++)
+    {
+	if (upper[i].x1 != lower[i].x1)
+	    return cur_start;
+
+	if (upper[i].x2 != lower[i].x2)
+	    return cur_start;
+    }
+
+    /*
+     * The bands match. Stretch every box of the previous band down to
+     * the bottom of the current band ...
+     */
+    bottom = lower->y2;
+
+    for (i = 0; i < numRects; i++)
+	upper[i].y2 = bottom;
+
+    /*
+     * ... and drop the now redundant current band from the region.
+     */
+    region->data->numRects -= numRects;
+
+    return prev_start;
+}
+
+/* Quick inline pre-check: only call pixman_coalesce when the previous and
+ * current bands hold the same number of boxes; otherwise just advance. */
+#define COALESCE(new_reg, prev_band, cur_band)                          \
+    do									\
+    {									\
+	if (cur_band - prev_band == new_reg->data->numRects - cur_band)	\
+	    prev_band = pixman_coalesce (new_reg, prev_band, cur_band);	\
+	else								\
+	    prev_band = cur_band;					\
+    } while (0)
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_append_non_o --
+ *	Handle a non-overlapping band for the union and subtract operations.
+ *      Just adds the (top/bottom-clipped) rectangles into the region.
+ *      Doesn't have to check for subsumption or anything.
+ *
+ * Results:
+ *	None.
+ *
+ * Side Effects:
+ *	region->data->numRects is incremented and the rectangles overwritten
+ *	with the rectangles we're passed.
+ *
+ *-----------------------------------------------------------------------
+ */
+static inline pixman_bool_t
+pixman_region_append_non_o (region_type_t * region,
+			    box_type_t *    r,
+			    box_type_t *    r_end,
+			    int             y1,
+			    int             y2)
+{
+    /* Boxes in the source band [r, r_end); expected to be non-empty. */
+    int new_rects = r_end - r;
+    box_type_t *out;
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (new_rects != 0);
+
+    /* Grow first (RECTALLOC returns FALSE on failure), then claim the slots. */
+    RECTALLOC (region, new_rects);
+    out = PIXREGION_TOP (region);
+    region->data->numRects += new_rects;
+
+    /* Copy each box's x span, substituting the clipped y1/y2. */
+    do
+    {
+	critical_if_fail (r->x1 < r->x2);
+	ADDRECT (out, r->x1, y1, r->x2, y2);
+	r++;
+    }
+    while (r != r_end);
+
+    return TRUE;
+}
+/* Set ry1 to r's top and r_band_end to the first box after r whose y1 differs (or to r_end). */
+#define FIND_BAND(r, r_band_end, r_end, ry1)			     \
+    do								     \
+    {								     \
+	ry1 = r->y1;						     \
+	r_band_end = r + 1;					     \
+	while ((r_band_end != r_end) && (r_band_end->y1 == ry1)) {   \
+	    r_band_end++;					     \
+	}							     \
+    } while (0)
+/* Bulk-append boxes [r, r_end) to new_reg; jumps to the caller's 'bail' label if growing fails. */
+#define APPEND_REGIONS(new_reg, r, r_end)				\
+    do									\
+    {									\
+	int new_rects;							\
+	if ((new_rects = r_end - r)) {					\
+	    RECTALLOC_BAIL (new_reg, new_rects, bail);			\
+	    memmove ((char *)PIXREGION_TOP (new_reg), (char *)r,	\
+		     new_rects * sizeof(box_type_t));			\
+	    new_reg->data->numRects += new_rects;			\
+	}								\
+    } while (0)
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_op --
+ *	Apply an operation to two regions. Called by pixman_region_union, pixman_region_inverse,
+ *	pixman_region_subtract, pixman_region_intersect....  Both regions MUST have at least one
+ *      rectangle, and cannot be the same object.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	The new region is overwritten.
+ *	overlap set to TRUE if overlap_func ever returns TRUE.
+ *
+ * Notes:
+ *	The idea behind this function is to view the two regions as sets.
+ *	Together they cover a rectangle of area that this function divides
+ *	into horizontal bands where points are covered only by one region
+ *	or by both. For the first case, the non_overlap_func is called with
+ *	each the band and the band's upper and lower extents. For the
+ *	second, the overlap_func is called to process the entire band. It
+ *	is responsible for clipping the rectangles in the band, though
+ *	this function provides the boundaries.
+ *	At the end of each band, the new region is coalesced, if possible,
+ *	to reduce the number of rectangles in the region.
+ *
+ *-----------------------------------------------------------------------
+ */
+
+typedef pixman_bool_t (*overlap_proc_ptr) (region_type_t *region,
+					   box_type_t *   r1,
+					   box_type_t *   r1_end,
+					   box_type_t *   r2,
+					   box_type_t *   r2_end,
+					   int            y1,
+					   int            y2,
+					   int *          overlap);
+
+static pixman_bool_t
+pixman_op (region_type_t *  new_reg,               /* Place to store result	    */
+	   region_type_t *  reg1,                  /* First region in operation     */
+	   region_type_t *  reg2,                  /* 2d region in operation        */
+	   overlap_proc_ptr overlap_func,          /* Function to call for over-
+						    * lapping bands		    */
+	   int              append_non1,           /* Append non-overlapping bands  
+						    * in region 1 ?
+						    */
+	   int              append_non2,           /* Append non-overlapping bands
+						    * in region 2 ?
+						    */
+	   int *            overlap)
+{
+    box_type_t *r1;                 /* Pointer into first region     */
+    box_type_t *r2;                 /* Pointer into 2d region	     */
+    box_type_t *r1_end;             /* End of 1st region	     */
+    box_type_t *r2_end;             /* End of 2d region		     */
+    int ybot;                       /* Bottom of intersection	     */
+    int ytop;                       /* Top of intersection	     */
+    region_data_type_t *old_data;   /* Old data for new_reg	     */
+    int prev_band;                  /* Index of start of
+				     * previous band in new_reg       */
+    int cur_band;                   /* Index of start of current
+				     * band in new_reg		     */
+    box_type_t * r1_band_end;       /* End of current band in r1     */
+    box_type_t * r2_band_end;       /* End of current band in r2     */
+    int top;                        /* Top of non-overlapping band   */
+    int bot;                        /* Bottom of non-overlapping band*/
+    int r1y1;                       /* Temps for r1->y1 and r2->y1   */
+    int r2y1;
+    int new_size;
+    int numRects;
+
+    /*
+     * Break any region computed from a broken region
+     */
+    if (PIXREGION_NAR (reg1) || PIXREGION_NAR (reg2))
+	return pixman_break (new_reg);
+
+    /*
+     * Initialization:
+     *	set r1, r2, r1_end and r2_end appropriately, save the rectangles
+     * of the destination region until the end in case it's one of
+     * the two source regions, then mark the "new" region empty, allocating
+     * another array of rectangles for it to use.
+     */
+
+    r1 = PIXREGION_RECTS (reg1);
+    new_size = PIXREGION_NUMRECTS (reg1);
+    r1_end = r1 + new_size;
+
+    numRects = PIXREGION_NUMRECTS (reg2);
+    r2 = PIXREGION_RECTS (reg2);
+    r2_end = r2 + numRects;
+    
+    critical_if_fail (r1 != r1_end);
+    critical_if_fail (r2 != r2_end);
+
+    old_data = (region_data_type_t *)NULL;
+
+    if (((new_reg == reg1) && (new_size > 1)) ||
+        ((new_reg == reg2) && (numRects > 1)))
+    {
+        old_data = new_reg->data;
+        new_reg->data = pixman_region_empty_data;
+    }
+
+    /* guess at new size */
+    if (numRects > new_size)
+	new_size = numRects;
+
+    new_size <<= 1;
+
+    if (!new_reg->data)
+	new_reg->data = pixman_region_empty_data;
+    else if (new_reg->data->size)
+	new_reg->data->numRects = 0;
+
+    if (new_size > new_reg->data->size)
+    {
+        if (!pixman_rect_alloc (new_reg, new_size))
+        {
+            if (old_data)
+		free (old_data);
+            return FALSE;
+	}
+    }
+
+    /*
+     * Initialize ybot.
+     * In the upcoming loop, ybot and ytop serve different functions depending
+     * on whether the band being handled is an overlapping or non-overlapping
+     * band.
+     *  In the case of a non-overlapping band (only one of the regions
+     * has points in the band), ybot is the bottom of the most recent
+     * intersection and thus clips the top of the rectangles in that band.
+     * ytop is the top of the next intersection between the two regions and
+     * serves to clip the bottom of the rectangles in the current band.
+     *	For an overlapping band (where the two regions intersect), ytop clips
+     * the top of the rectangles of both regions and ybot clips the bottoms.
+     */
+
+    ybot = MIN (r1->y1, r2->y1);
+
+    /*
+     * prev_band serves to mark the start of the previous band so rectangles
+     * can be coalesced into larger rectangles. qv. pixman_coalesce, above.
+     * In the beginning, there is no previous band, so prev_band == cur_band
+     * (cur_band is set later on, of course, but the first band will always
+     * start at index 0). prev_band and cur_band must be indices because of
+     * the possible expansion, and resultant moving, of the new region's
+     * array of rectangles.
+     */
+    prev_band = 0;
+
+    do
+    {
+        /*
+	 * This algorithm proceeds one source-band (as opposed to a
+	 * destination band, which is determined by where the two regions
+	 * intersect) at a time. r1_band_end and r2_band_end serve to mark the
+	 * rectangle after the last one in the current band for their
+	 * respective regions.
+	 */
+        critical_if_fail (r1 != r1_end);
+        critical_if_fail (r2 != r2_end);
+
+        FIND_BAND (r1, r1_band_end, r1_end, r1y1);
+        FIND_BAND (r2, r2_band_end, r2_end, r2y1);
+
+        /*
+	 * First handle the band that doesn't intersect, if any.
+	 *
+	 * Note that attention is restricted to one band in the
+	 * non-intersecting region at once, so if a region has n
+	 * bands between the current position and the next place it overlaps
+	 * the other, this entire loop will be passed through n times.
+	 */
+        if (r1y1 < r2y1)
+        {
+            if (append_non1)
+            {
+                top = MAX (r1y1, ybot);
+                bot = MIN (r1->y2, r2y1);
+                if (top != bot)
+                {
+                    cur_band = new_reg->data->numRects;
+                    if (!pixman_region_append_non_o (new_reg, r1, r1_band_end, top, bot))
+			goto bail;
+                    COALESCE (new_reg, prev_band, cur_band);
+		}
+	    }
+            ytop = r2y1;
+	}
+        else if (r2y1 < r1y1)
+        {
+            if (append_non2)
+            {
+                top = MAX (r2y1, ybot);
+                bot = MIN (r2->y2, r1y1);
+		
+                if (top != bot)
+                {
+                    cur_band = new_reg->data->numRects;
+
+                    if (!pixman_region_append_non_o (new_reg, r2, r2_band_end, top, bot))
+			goto bail;
+
+                    COALESCE (new_reg, prev_band, cur_band);
+		}
+	    }
+            ytop = r1y1;
+	}
+        else
+        {
+            ytop = r1y1;
+	}
+
+        /*
+	 * Now see if we've hit an intersecting band. The two bands only
+	 * intersect if ybot > ytop
+	 */
+        ybot = MIN (r1->y2, r2->y2);
+        if (ybot > ytop)
+        {
+            cur_band = new_reg->data->numRects;
+
+            if (!(*overlap_func)(new_reg,
+                                 r1, r1_band_end,
+                                 r2, r2_band_end,
+                                 ytop, ybot,
+                                 overlap))
+	    {
+		goto bail;
+	    }
+	    
+            COALESCE (new_reg, prev_band, cur_band);
+	}
+
+        /*
+	 * If we've finished with a band (y2 == ybot) we skip forward
+	 * in the region to the next band.
+	 */
+        if (r1->y2 == ybot)
+	    r1 = r1_band_end;
+
+        if (r2->y2 == ybot)
+	    r2 = r2_band_end;
+
+    }
+    while (r1 != r1_end && r2 != r2_end);
+
+    /*
+     * Deal with whichever region (if any) still has rectangles left.
+     *
+     * We only need to worry about banding and coalescing for the very first
+     * band left.  After that, we can just group all remaining boxes,
+     * regardless of how many bands, into one final append to the list.
+     */
+
+    if ((r1 != r1_end) && append_non1)
+    {
+        /* Do first non_overlap1Func call, which may be able to coalesce */
+        FIND_BAND (r1, r1_band_end, r1_end, r1y1);
+	
+        cur_band = new_reg->data->numRects;
+	
+        if (!pixman_region_append_non_o (new_reg,
+                                         r1, r1_band_end,
+                                         MAX (r1y1, ybot), r1->y2))
+	{
+	    goto bail;
+	}
+	
+        COALESCE (new_reg, prev_band, cur_band);
+
+        /* Just append the rest of the boxes  */
+        APPEND_REGIONS (new_reg, r1_band_end, r1_end);
+    }
+    else if ((r2 != r2_end) && append_non2)
+    {
+        /* Do first non_overlap2Func call, which may be able to coalesce */
+        FIND_BAND (r2, r2_band_end, r2_end, r2y1);
+
+	cur_band = new_reg->data->numRects;
+
+        if (!pixman_region_append_non_o (new_reg,
+                                         r2, r2_band_end,
+                                         MAX (r2y1, ybot), r2->y2))
+	{
+	    goto bail;
+	}
+
+        COALESCE (new_reg, prev_band, cur_band);
+
+        /* Append rest of boxes */
+        APPEND_REGIONS (new_reg, r2_band_end, r2_end);
+    }
+
+    if (old_data)
+	free (old_data);
+
+    if (!(numRects = new_reg->data->numRects))
+    {
+        FREE_DATA (new_reg);
+        new_reg->data = pixman_region_empty_data;
+    }
+    else if (numRects == 1)
+    {
+        new_reg->extents = *PIXREGION_BOXPTR (new_reg);
+        FREE_DATA (new_reg);
+        new_reg->data = (region_data_type_t *)NULL;
+    }
+    else
+    {
+        DOWNSIZE (new_reg, numRects);
+    }
+
+    return TRUE;
+
+bail:
+    if (old_data)
+	free (old_data);
+
+    return pixman_break (new_reg);
+}
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_set_extents --
+ *	Reset the extents of a region to what they should be. Called by
+ *	pixman_region_subtract and pixman_region_intersect as they can't
+ *      figure it out along the way or do so easily, as pixman_region_union can.
+ *
+ * Results:
+ *	None.
+ *
+ * Side Effects:
+ *	The region's 'extents' structure is overwritten.
+ *
+ *-----------------------------------------------------------------------
+ */
+static void
+pixman_set_extents (region_type_t *region)
+{
+    box_type_t *first, *last, *cur;
+
+    /* No data block: nothing to recompute from, leave the extents alone. */
+    if (!region->data)
+	return;
+
+    /* Data block whose size field is 0: collapse extents to an empty box. */
+    if (!region->data->size)
+    {
+        region->extents.x2 = region->extents.x1;
+        region->extents.y2 = region->extents.y1;
+        return;
+    }
+
+    first = PIXREGION_BOXPTR (region);
+    last = PIXREGION_END (region);
+
+    /*
+     * Banding orders the boxes by y, so the first box supplies the
+     * smallest y1 and the last box the largest y2.  The horizontal
+     * limits are only seeded here; the scan below widens them.
+     */
+    region->extents.y1 = first->y1;
+    region->extents.y2 = last->y2;
+    region->extents.x1 = first->x1;
+    region->extents.x2 = last->x2;
+
+    critical_if_fail (region->extents.y1 < region->extents.y2);
+
+    /* Visit every box, 'last' included, widening x1/x2 where needed. */
+    for (cur = first; cur <= last; cur++)
+    {
+        if (region->extents.x1 > cur->x1)
+	    region->extents.x1 = cur->x1;
+        if (region->extents.x2 < cur->x2)
+	    region->extents.x2 = cur->x2;
+    }
+
+    critical_if_fail (region->extents.x1 < region->extents.x2);
+}
+
+/*======================================================================
+ *	    Region Intersection
+ *====================================================================*/
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_intersect_o --
+ *	Handle an overlapping band for pixman_region_intersect.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	Rectangles may be added to the region.
+ *
+ *-----------------------------------------------------------------------
+ */
+/*ARGSUSED*/
+static pixman_bool_t
+pixman_region_intersect_o (region_type_t *region,   /* boxes are added here */
+                           box_type_t *   r1,       /* walks the first band */
+                           box_type_t *   r1_end,   /* stop marker for r1 */
+                           box_type_t *   r2,       /* walks the second band */
+                           box_type_t *   r2_end,   /* stop marker for r2 */
+                           int            y1,       /* passed as y1 of each new box */
+                           int            y2,       /* passed as y2 of each new box */
+                           int *          overlap)  /* not referenced here */
+{
+    int x1;                          /* left edge of the candidate overlap */
+    int x2;                          /* right edge of the candidate overlap */
+    box_type_t *        next_rect;   /* slot handed to NEWRECT for the next box */
+
+    next_rect = PIXREGION_TOP (region);
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
+
+    do
+    {
+        x1 = MAX (r1->x1, r2->x1);
+        x2 = MIN (r1->x2, r2->x2);
+
+        /*
+	 * If there's any overlap between the two rectangles, add that
+	 * overlap to the new region (x1 == x2 means they merely touch).
+	 */
+        if (x1 < x2)
+	    NEWRECT (region, next_rect, x1, y1, x2, y2);
+
+        /*
+	 * Advance the pointer(s) with the leftmost right side, since the next
+	 * rectangle on that list may still overlap the other region's
+	 * current rectangle.  Both advance when the right sides coincide.
+	 */
+        if (r1->x2 == x2)
+        {
+            r1++;
+	}
+        if (r2->x2 == x2)
+        {
+            r2++;
+	}
+    }
+    while ((r1 != r1_end) && (r2 != r2_end)); /* stop once either band runs out */
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_intersect) (region_type_t *     new_reg,
+                     region_type_t *        reg1,
+                     region_type_t *        reg2)
+{
+    GOOD (reg1);
+    GOOD (reg2);
+    GOOD (new_reg);
+    /* new_reg receives the intersection of reg1 and reg2; cheap cases first. */
+    /* check for trivial reject */
+    if (PIXREGION_NIL (reg1) || PIXREGION_NIL (reg2) ||
+        !EXTENTCHECK (&reg1->extents, &reg2->extents))
+    {
+        /* Covers about 20% of all cases */
+        FREE_DATA (new_reg);
+        new_reg->extents.x2 = new_reg->extents.x1; /* collapse to zero width */
+        new_reg->extents.y2 = new_reg->extents.y1; /* ... and zero height */
+        if (PIXREGION_NAR (reg1) || PIXREGION_NAR (reg2))
+        {
+            new_reg->data = pixman_broken_data; /* an operand tested PIXREGION_NAR */
+            return FALSE;
+	}
+        else
+	{
+	    new_reg->data = pixman_region_empty_data;
+	}
+    }
+    else if (!reg1->data && !reg2->data)
+    {
+        /* Covers about 80% of cases that aren't trivially rejected */
+        new_reg->extents.x1 = MAX (reg1->extents.x1, reg2->extents.x1);
+        new_reg->extents.y1 = MAX (reg1->extents.y1, reg2->extents.y1);
+        new_reg->extents.x2 = MIN (reg1->extents.x2, reg2->extents.x2);
+        new_reg->extents.y2 = MIN (reg1->extents.y2, reg2->extents.y2);
+
+        FREE_DATA (new_reg);
+
+	new_reg->data = (region_data_type_t *)NULL; /* result is its extents box */
+    }
+    else if (!reg2->data && SUBSUMES (&reg2->extents, &reg1->extents))
+    {   /* data-less reg2 subsumes reg1's extents: the answer is reg1 */
+        return PREFIX (_copy) (new_reg, reg1);
+    }
+    else if (!reg1->data && SUBSUMES (&reg1->extents, &reg2->extents))
+    {   /* data-less reg1 subsumes reg2's extents: the answer is reg2 */
+        return PREFIX (_copy) (new_reg, reg2);
+    }
+    else if (reg1 == reg2)
+    {   /* the same region was passed twice */
+        return PREFIX (_copy) (new_reg, reg1);
+    }
+    else
+    {
+        /* General purpose intersection */
+        int overlap; /* result ignored */
+
+        if (!pixman_op (new_reg, reg1, reg2, pixman_region_intersect_o, FALSE, FALSE,
+                        &overlap))
+	{
+	    return FALSE;
+	}
+	/* pixman_op was not asked to track extents: rebuild them from the boxes */
+        pixman_set_extents (new_reg);
+    }
+
+    GOOD (new_reg);
+    return(TRUE);
+}
+
+#define MERGERECT(r)							\
+    do	/* fold box *(r) into the span [x1, x2) being built, then (r)++ */ \
+    {	/* uses the caller's x1, x2, y1, y2, region, next_rect, overlap */ \
+        if ((r)->x1 <= x2)						\
+	{								\
+            /* Touches or overlaps the current span: extend it */	\
+            if ((r)->x1 < x2)						\
+		*overlap = TRUE;   /* strict overlap, not just abutting */ \
+									\
+            if (x2 < (r)->x2)						\
+		x2 = (r)->x2;						\
+	}								\
+	else								\
+	{								\
+            /* Gap before *(r): emit the current span, start a new one */ \
+            NEWRECT (region, next_rect, x1, y1, x2, y2);		\
+            x1 = (r)->x1;						\
+            x2 = (r)->x2;						\
+	}								\
+        (r)++;	/* r is evaluated several times: pass a plain lvalue */	\
+    } while (0)
+
+/*======================================================================
+ *	    Region Union
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_union_o --
+ *	Handle an overlapping band for the union operation. Picks the
+ *	left-most rectangle each time and merges it into the region.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	region is overwritten.
+ *	overlap is set to TRUE if any boxes overlap.
+ *
+ *-----------------------------------------------------------------------
+ */
+static pixman_bool_t
+pixman_region_union_o (region_type_t *region,   /* named inside MERGERECT */
+		       box_type_t *   r1,       /* walks the first band */
+		       box_type_t *   r1_end,   /* stop marker for r1 */
+		       box_type_t *   r2,       /* walks the second band */
+		       box_type_t *   r2_end,   /* stop marker for r2 */
+		       int            y1,       /* named inside MERGERECT */
+		       int            y2,       /* named inside MERGERECT */
+		       int *          overlap)  /* set to TRUE by MERGERECT on overlap */
+{
+    box_type_t *next_rect; /* named inside MERGERECT: do not rename */
+    int x1;            /* left and right side of current union */
+    int x2;            /* (both also named inside MERGERECT) */
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
+
+    next_rect = PIXREGION_TOP (region);
+
+    /* Start off current rectangle with whichever box begins further left */
+    if (r1->x1 < r2->x1)
+    {
+        x1 = r1->x1;
+        x2 = r1->x2;
+        r1++;
+    }
+    else
+    {
+        x1 = r2->x1;
+        x2 = r2->x2;
+        r2++;
+    }
+    while (r1 != r1_end && r2 != r2_end) /* merge in order of left edge */
+    {
+        if (r1->x1 < r2->x1)
+	    MERGERECT (r1);
+	else
+	    MERGERECT (r2);
+    }
+
+    /* Finish off whoever (if any) is left */
+    if (r1 != r1_end)
+    {
+        do
+        {
+            MERGERECT (r1);
+	}
+        while (r1 != r1_end);
+    }
+    else if (r2 != r2_end)
+    {
+        do
+        {
+            MERGERECT (r2);
+	}
+        while (r2 != r2_end);
+    }
+
+    /* Add current rectangle: the last span is still pending at this point */
+    NEWRECT (region, next_rect, x1, y1, x2, y2);
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX(_intersect_rect) (region_type_t *dest,
+			 region_type_t *source,
+			 int x, int y,
+			 unsigned int width,
+			 unsigned int height)
+{
+    region_type_t clip;		/* throwaway region holding just the rectangle */
+    /* Far corner first; the sums are int + unsigned int, as before. */
+    clip.extents.x2 = x + width;
+    clip.extents.y2 = y + height;
+    clip.extents.x1 = x;
+    clip.extents.y1 = y;
+    clip.data = NULL;		/* no data block: the extents are the one box */
+    /* dest = source clipped to the rectangle; the status is _intersect's. */
+    return PREFIX(_intersect) (dest, source, &clip);
+}
+
+/* Convenience function for performing union of region with a
+ * single rectangle
+ */
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_union_rect) (region_type_t *dest,
+                      region_type_t *source,
+                      int            x,
+		      int            y,
+                      unsigned int   width,
+		      unsigned int   height)
+{
+    region_type_t rect_region;
+    box_type_t *bounds = &rect_region.extents;
+
+    bounds->x1 = x;
+    bounds->y1 = y;
+    bounds->x2 = x + width;
+    bounds->y2 = y + height;
+
+    if (GOOD_RECT (bounds))
+    {
+	/* Usable rectangle: wrap it in a data-less region and union it in. */
+	rect_region.data = NULL;
+	return PREFIX (_union) (dest, source, &rect_region);
+    }
+    if (BAD_RECT (bounds))	/* only rectangles flagged bad are logged */
+        _pixman_log_error (FUNC, "Invalid rectangle passed");
+    return PREFIX (_copy) (dest, source);	/* dest becomes a copy of source */
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_union) (region_type_t *new_reg,
+                 region_type_t *reg1,
+                 region_type_t *reg2)
+{
+    int overlap; /* result ignored */
+
+    /* Stores the union of reg1 and reg2 in new_reg.  Returns FALSE when
+     * pixman_op fails; the overlap flag it fills in is not reported.
+     */
+    GOOD (reg1);
+    GOOD (reg2);
+    GOOD (new_reg);
+
+    /*  checks all the simple cases */
+
+    /*
+     * Region 1 and 2 are the same
+     */
+    if (reg1 == reg2)
+        return PREFIX (_copy) (new_reg, reg1);
+
+    /*
+     * Region 1 is empty
+     */
+    if (PIXREGION_NIL (reg1))
+    {
+        if (PIXREGION_NAR (reg1))
+	    return pixman_break (new_reg);
+
+        if (new_reg != reg2)
+	    return PREFIX (_copy) (new_reg, reg2);
+
+	return TRUE; /* new_reg already is reg2: nothing to copy */
+    }
+
+    /*
+     * Region 2 is empty
+     */
+    if (PIXREGION_NIL (reg2))
+    {
+        if (PIXREGION_NAR (reg2))
+	    return pixman_break (new_reg);
+
+	if (new_reg != reg1)
+	    return PREFIX (_copy) (new_reg, reg1);
+
+	return TRUE; /* new_reg already is reg1: nothing to copy */
+    }
+
+    /*
+     * Region 1 completely subsumes region 2
+     */
+    if (!reg1->data && SUBSUMES (&reg1->extents, &reg2->extents))
+    {
+        if (new_reg != reg1)
+	    return PREFIX (_copy) (new_reg, reg1);
+
+	return TRUE;
+    }
+
+    /*
+     * Region 2 completely subsumes region 1
+     */
+    if (!reg2->data && SUBSUMES (&reg2->extents, &reg1->extents))
+    {
+        if (new_reg != reg2)
+	    return PREFIX (_copy) (new_reg, reg2);
+
+	return TRUE;
+    }
+    /* General case: both flags TRUE keep the non-overlapping bands of each. */
+    if (!pixman_op (new_reg, reg1, reg2, pixman_region_union_o, TRUE, TRUE, &overlap))
+	return FALSE;
+    /* The union's extents are the bounding box of the two operands' extents. */
+    new_reg->extents.x1 = MIN (reg1->extents.x1, reg2->extents.x1);
+    new_reg->extents.y1 = MIN (reg1->extents.y1, reg2->extents.y1);
+    new_reg->extents.x2 = MAX (reg1->extents.x2, reg2->extents.x2);
+    new_reg->extents.y2 = MAX (reg1->extents.y2, reg2->extents.y2);
+    
+    GOOD (new_reg);
+
+    return TRUE;
+}
+
+/*======================================================================
+ *	    Batch Rectangle Union
+ *====================================================================*/
+
+#define EXCHANGE_RECTS(a, b)	\
+    {                           \
+        box_type_t t;		\
+        t = rects[a];           \
+        rects[a] = rects[b];    \
+        rects[b] = t;           \
+    }
+
+static void
+quick_sort_rects (
+    box_type_t rects[],
+    int        numRects)
+{
+    int y1;          /* y1 of the partition element */
+    int x1;          /* x1 of the partition element */
+    int i, j;        /* scan indices: i moves up, j moves down */
+    box_type_t *r;   /* box currently examined by a scan */
+    /* Sorts rects[0 .. numRects-1] in place, ascending by y1, then by x1. */
+    /* Always called with numRects > 1 */
+
+    do
+    {
+        if (numRects == 2)
+        {
+            if (rects[0].y1 > rects[1].y1 ||
+                (rects[0].y1 == rects[1].y1 && rects[0].x1 > rects[1].x1))
+	    {
+		EXCHANGE_RECTS (0, 1);
+	    }
+
+            return;
+	}
+
+        /* Choose partition element (the middle one), stick in location 0 */
+        EXCHANGE_RECTS (0, numRects >> 1);
+        y1 = rects[0].y1;
+        x1 = rects[0].x1;
+
+        /* Partition array */
+        i = 0;
+        j = numRects;
+
+        do
+        {
+            r = &(rects[i]);
+            do
+            {
+                r++;
+                i++;
+	    }
+            /* tail of the do-while above; the lone ';' two lines down ends it */
+            while (i != numRects && (r->y1 < y1 || (r->y1 == y1 && r->x1 < x1)))
+		;
+
+	    r = &(rects[j]);
+            do
+            {
+                r--;
+                j--;
+	    }
+            while (y1 < r->y1 || (y1 == r->y1 && x1 < r->x1));
+	    
+            if (i < j)
+		EXCHANGE_RECTS (i, j);
+	}
+        while (i < j);
+
+        /* Move partition element back to middle */
+        EXCHANGE_RECTS (0, j);
+
+        /* Recurse on the part above j; the part below j is handled by looping */
+        if (numRects - j - 1 > 1)
+	    quick_sort_rects (&rects[j + 1], numRects - j - 1);
+
+        numRects = j;
+    }
+    while (numRects > 1);
+}
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_validate --
+ *
+ *      Take a ``region'' which is a non-y-x-banded random collection of
+ *      rectangles, and compute a nice region which is the union of all the
+ *      rectangles.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *      The passed-in ``region'' may be modified.
+ *	overlap set to TRUE if any rectangles overlapped,
+ *      else FALSE;
+ *
+ * Strategy:
+ *      Step 1. Sort the rectangles into ascending order with primary key y1
+ *		and secondary key x1.
+ *
+ *      Step 2. Split the rectangles into the minimum number of proper y-x
+ *		banded regions.  This may require horizontally merging
+ *		rectangles, and vertically coalescing bands.  With any luck,
+ *		this step is an identity transformation (ala the Box widget),
+ *		or a coalescing into 1 box (ala Menus).
+ *
+ *	Step 3. Merge the separate regions down to a single region by calling
+ *		pixman_region_union.  Maximize the work each pixman_region_union call does by using
+ *		a binary merge.
+ *
+ *-----------------------------------------------------------------------
+ */
+
+static pixman_bool_t
+validate (region_type_t * badreg,
+          int *           overlap)
+{
+    /* Descriptor for regions under construction  in Step 2. */
+    typedef struct
+    {
+        region_type_t reg;
+        int prev_band;
+        int cur_band;
+    } region_info_t;
+
+    region_info_t stack_regions[64]; /* initial ri storage; heap only past 64 */
+
+    int numRects;                   /* Original numRects for badreg	    */
+    region_info_t *ri;              /* Array of current regions		    */
+    int num_ri;                     /* Number of entries used in ri	    */
+    int size_ri;                    /* Number of entries available in ri    */
+    int i;                          /* Index into rects			    */
+    int j;                          /* Index into ri			    */
+    region_info_t *rit;             /* &ri[j]				    */
+    region_type_t *reg;             /* ri[j].reg			    */
+    box_type_t *box;                /* Current box in rects		    */
+    box_type_t *ri_box;             /* Last box in ri[j].reg		    */
+    region_type_t *hreg;            /* ri[j_half].reg			    */
+    pixman_bool_t ret = TRUE;
+
+    *overlap = FALSE;
+    if (!badreg->data)              /* no data block: returned unchanged */
+    {
+        GOOD (badreg);
+        return TRUE;
+    }
+    
+    numRects = badreg->data->numRects;
+    if (!numRects)
+    {
+        if (PIXREGION_NAR (badreg))
+	    return FALSE;
+        GOOD (badreg);
+        return TRUE;
+    }
+    /* Non-degenerate extents: skip the rebuild and only tidy the storage. */
+    if (badreg->extents.x1 < badreg->extents.x2)
+    {
+        if ((numRects) == 1)
+        {
+            FREE_DATA (badreg);
+            badreg->data = (region_data_type_t *) NULL;
+	}
+        else
+        {
+            DOWNSIZE (badreg, numRects);
+	}
+
+        GOOD (badreg);
+
+	return TRUE;
+    }
+
+    /* Step 1: Sort the rects array into ascending (y1, x1) order */
+    quick_sort_rects (PIXREGION_BOXPTR (badreg), numRects);
+
+    /* Step 2: Scatter the sorted array into the minimum number of regions */
+
+    /* Set up the first region to be the first rectangle in badreg */
+    /* Note that step 2 code will never overflow the ri[0].reg rects array */
+    ri = stack_regions;
+    size_ri = sizeof (stack_regions) / sizeof (stack_regions[0]);
+    num_ri = 1;
+    ri[0].prev_band = 0;
+    ri[0].cur_band = 0;
+    ri[0].reg = *badreg;            /* ri[0] now uses badreg's data block */
+    box = PIXREGION_BOXPTR (&ri[0].reg);
+    ri[0].reg.extents = *box;
+    ri[0].reg.data->numRects = 1;
+    badreg->extents = *pixman_region_empty_box;
+    badreg->data = pixman_region_empty_data; /* empty until the final copy-back */
+
+    /* Now scatter rectangles into the minimum set of valid regions.  If the
+     * next rectangle to be added to a region would force an existing rectangle
+     * in the region to be split up in order to maintain y-x banding, just
+     * forget it.  Try the next region.  If it doesn't fit cleanly into any
+     * region, make a new one.
+     */
+
+    for (i = numRects; --i > 0;)    /* visits sorted boxes 1 .. numRects-1 */
+    {
+        box++;
+        /* Look for a region to append box to */
+        for (j = num_ri, rit = ri; --j >= 0; rit++)
+        {
+            reg = &rit->reg;
+            ri_box = PIXREGION_END (reg);
+
+            if (box->y1 == ri_box->y1 && box->y2 == ri_box->y2)
+            {
+                /* box is in same band as ri_box.  Merge or append it */
+                if (box->x1 <= ri_box->x2)
+                {
+                    /* Merge it with ri_box */
+                    if (box->x1 < ri_box->x2)
+			*overlap = TRUE;
+
+                    if (box->x2 > ri_box->x2)
+			ri_box->x2 = box->x2;
+		}
+                else
+                {
+                    RECTALLOC_BAIL (reg, 1, bail);
+                    *PIXREGION_TOP (reg) = *box;
+                    reg->data->numRects++;
+		}
+		
+                goto next_rect;   /* So sue me */
+	    }
+            else if (box->y1 >= ri_box->y2)
+            {
+                /* Put box into new band */
+                if (reg->extents.x2 < ri_box->x2)
+		    reg->extents.x2 = ri_box->x2;
+		
+                if (reg->extents.x1 > box->x1)
+		    reg->extents.x1 = box->x1;
+		
+                COALESCE (reg, rit->prev_band, rit->cur_band);
+                rit->cur_band = reg->data->numRects;
+                RECTALLOC_BAIL (reg, 1, bail);
+                *PIXREGION_TOP (reg) = *box;
+                reg->data->numRects++;
+
+                goto next_rect;
+	    }
+            /* Well, this region was inappropriate.  Try the next one. */
+	} /* for j */
+
+        /* Uh-oh.  No regions were appropriate.  Create a new one. */
+        if (size_ri == num_ri)
+        {
+            size_t data_size;
+
+            /* Oops, allocate space for new region information */
+            size_ri <<= 1;
+
+            data_size = size_ri * sizeof(region_info_t);
+            if (data_size / size_ri != sizeof(region_info_t)) /* overflow guard */
+		goto bail;
+
+            if (ri == stack_regions)
+            {
+                rit = malloc (data_size);   /* first growth: leave the stack array */
+                if (!rit)
+		    goto bail;
+                memcpy (rit, ri, num_ri * sizeof (region_info_t));
+	    }
+            else
+            {
+                rit = (region_info_t *) realloc (ri, data_size);
+                if (!rit)
+		    goto bail;          /* ri is still valid and freed at bail */
+	    }
+            ri = rit;
+            rit = &ri[num_ri];
+	}
+        num_ri++;                       /* rit addresses the new last entry */
+        rit->prev_band = 0;
+        rit->cur_band = 0;
+        rit->reg.extents = *box;
+        rit->reg.data = (region_data_type_t *)NULL;
+
+	/* MUST force allocation; i boxes (this one included) are still unplaced */
+        if (!pixman_rect_alloc (&rit->reg, (i + num_ri) / num_ri))
+	    goto bail;
+	
+    next_rect: ;
+    } /* for i */
+
+    /* Make a final pass over each region in order to COALESCE and set
+     * extents.x2 and extents.y2
+     */
+    for (j = num_ri, rit = ri; --j >= 0; rit++)
+    {
+        reg = &rit->reg;
+        ri_box = PIXREGION_END (reg);
+        reg->extents.y2 = ri_box->y2;
+
+        if (reg->extents.x2 < ri_box->x2)
+	    reg->extents.x2 = ri_box->x2;
+	
+        COALESCE (reg, rit->prev_band, rit->cur_band);
+
+	if (reg->data->numRects == 1) /* keep unions happy below */
+        {
+            FREE_DATA (reg);
+            reg->data = (region_data_type_t *)NULL;
+	}
+    }
+
+    /* Step 3: Union all regions into a single region */
+    while (num_ri > 1)
+    {
+        int half = num_ri / 2; /* ri[j] absorbs ri[j + half]; odd count skips ri[0] */
+        for (j = num_ri & 1; j < (half + (num_ri & 1)); j++)
+        {
+            reg = &ri[j].reg;
+            hreg = &ri[j + half].reg;
+
+            if (!pixman_op (reg, reg, hreg, pixman_region_union_o, TRUE, TRUE, overlap))
+		ret = FALSE; /* keep looping so every hreg still gets freed */
+
+            if (hreg->extents.x1 < reg->extents.x1)
+		reg->extents.x1 = hreg->extents.x1;
+
+            if (hreg->extents.y1 < reg->extents.y1)
+		reg->extents.y1 = hreg->extents.y1;
+
+            if (hreg->extents.x2 > reg->extents.x2)
+		reg->extents.x2 = hreg->extents.x2;
+
+            if (hreg->extents.y2 > reg->extents.y2)
+		reg->extents.y2 = hreg->extents.y2;
+
+            FREE_DATA (hreg);
+	}
+
+        num_ri -= half;        /* the upper half has been merged and freed */
+
+	if (!ret)
+	    goto bail;
+    }
+
+    *badreg = ri[0].reg;       /* hand the merged region back to the caller */
+
+    if (ri != stack_regions)
+	free (ri);
+
+    GOOD (badreg);
+    return ret;
+
+bail:
+    for (i = 0; i < num_ri; i++)   /* release every region still in use */
+	FREE_DATA (&ri[i].reg);
+
+    if (ri != stack_regions)
+	free (ri);
+
+    return pixman_break (badreg);
+}
+
+/*======================================================================
+ *                Region Subtraction
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_subtract_o --
+ *	Overlapping band subtraction. x1 is the left-most point not yet
+ *	checked.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	region may have rectangles added to it.
+ *
+ *-----------------------------------------------------------------------
+ */
+/*ARGSUSED*/
+static pixman_bool_t
+pixman_region_subtract_o (region_type_t * region,  /* boxes are added here */
+			  box_type_t *    r1,      /* walks the minuend band */
+			  box_type_t *    r1_end,  /* stop marker for r1 */
+			  box_type_t *    r2,      /* walks the subtrahend band */
+			  box_type_t *    r2_end,  /* stop marker for r2 */
+			  int             y1,      /* passed as y1 of each new box */
+			  int             y2,      /* passed as y2 of each new box */
+			  int *           overlap) /* not referenced here */
+{
+    box_type_t *        next_rect; /* slot handed to NEWRECT for the next box */
+    int x1;                        /* left-most minuend point not yet checked */
+
+    x1 = r1->x1; /* NOTE(review): r1 is read ahead of the r1 != r1_end check */
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
+
+    next_rect = PIXREGION_TOP (region);
+
+    do
+    {
+        if (r2->x2 <= x1)
+        {
+            /*
+	     * Subtrahend entirely to left of minuend: go to next subtrahend.
+	     */
+            r2++;
+	}
+        else if (r2->x1 <= x1)
+        {
+            /*
+	     * Subtrahend precedes minuend: nuke left edge of minuend.
+	     */
+            x1 = r2->x2;
+            if (x1 >= r1->x2)
+            {
+                /*
+		 * Minuend completely covered: advance to next minuend and
+		 * reset left fence to edge of new minuend.
+		 */
+                r1++;
+                if (r1 != r1_end)
+		    x1 = r1->x1;
+	    }
+            else
+            {
+                /*
+		 * Subtrahend now used up since it doesn't extend beyond
+		 * minuend
+		 */
+                r2++;
+	    }
+	}
+        else if (r2->x1 < r1->x2)
+        {
+            /*
+	     * Subtrahend starts inside the minuend: add the uncovered left
+	     * part of minuend to region, then move the fence past subtrahend.
+	     */
+            critical_if_fail (x1 < r2->x1);
+            NEWRECT (region, next_rect, x1, y1, r2->x1, y2);
+
+            x1 = r2->x2;
+            if (x1 >= r1->x2)
+            {
+                /*
+		 * Minuend used up: advance to the next minuend box.
+		 */
+                r1++;
+                if (r1 != r1_end)
+		    x1 = r1->x1;
+	    }
+            else
+            {
+                /*
+		 * Subtrahend used up
+		 */
+                r2++;
+	    }
+	}
+        else
+        {
+            /*
+	     * Minuend used up: add any remaining piece before advancing.
+	     */
+            if (r1->x2 > x1)
+		NEWRECT (region, next_rect, x1, y1, r1->x2, y2);
+
+            r1++;
+
+	    if (r1 != r1_end)
+		x1 = r1->x1;
+	}
+    }
+    while ((r1 != r1_end) && (r2 != r2_end));
+
+    /*
+     * Add remaining minuend rectangles to region.
+     */
+    while (r1 != r1_end)
+    {
+        critical_if_fail (x1 < r1->x2);
+
+        NEWRECT (region, next_rect, x1, y1, r1->x2, y2);
+
+        r1++;
+        if (r1 != r1_end)
+	    x1 = r1->x1;
+    }
+    return TRUE;
+}
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_subtract --
+ *	Subtract reg_s from reg_m and leave the result in reg_d.
+ *	S stands for subtrahend, M for minuend and D for difference.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	reg_d is overwritten.
+ *
+ *-----------------------------------------------------------------------
+ */
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_subtract) (region_type_t *reg_d,
+                    region_type_t *reg_m,
+                    region_type_t *reg_s)
+{
+    int unused_overlap;  /* pixman_op wants the out-parameter; never read */
+
+    GOOD (reg_m);
+    GOOD (reg_s);
+    GOOD (reg_d);
+
+    /* Trivial reject: an operand tests nil, or the extents check fails. */
+    if (PIXREGION_NIL (reg_m) || PIXREGION_NIL (reg_s) ||
+	!EXTENTCHECK (&reg_m->extents, &reg_s->extents))
+    {
+	/* reg_s flagged by PIXREGION_NAR breaks reg_d; otherwise D = M. */
+	if (PIXREGION_NAR (reg_s))
+	    return pixman_break (reg_d);
+	return PREFIX (_copy) (reg_d, reg_m);
+    }
+
+    if (reg_m == reg_s)
+    {
+	/* M - M: drop D's storage and leave it as the empty region. */
+	FREE_DATA (reg_d);
+	reg_d->extents.x2 = reg_d->extents.x1;
+	reg_d->extents.y2 = reg_d->extents.y1;
+	reg_d->data = pixman_region_empty_data;
+	return TRUE;
+    }
+
+    /*
+     * General case.  pixman_op keeps the bands of M that S does not reach,
+     * runs pixman_region_subtract_o on the bands they share, and drops the
+     * bands that belong to S alone.
+     */
+    if (!pixman_op (reg_d, reg_m, reg_s, pixman_region_subtract_o, TRUE, FALSE, &unused_overlap))
+	return FALSE;
+
+    /*
+     * The extents are recomputed only now: reg_d may be one of the sources,
+     * and pixman_op needs the source extents untouched while it runs.
+     */
+    pixman_set_extents (reg_d);
+    GOOD (reg_d);
+    return TRUE;
+}
+
+/*======================================================================
+ *	    Region Inversion
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_inverse --
+ *	Take a region and a box and return a region that is everything
+ *	in the box but not in the region. The careful reader will note
+ *	that this is the same as subtracting the region from the box...
+ *
+ * Results:
+ *	TRUE on success; FALSE if pixman_op fails.
+ *
+ * Side Effects:
+ *	new_reg is overwritten.
+ *
+ *-----------------------------------------------------------------------
+ */
+pixman_bool_t
+PIXMAN_EXPORT PREFIX (_inverse) (region_type_t *new_reg,  /* Destination region */
+                                 region_type_t *reg1,     /* Region to invert */
+                                 box_type_t *   inv_rect) /* Bounding box for inversion */
+{
+    region_type_t box_region; /* stack region standing in for inv_rect */
+    int ignored_overlap;      /* pixman_op out-parameter; value unused */
+
+    GOOD (reg1);
+    GOOD (new_reg);
+
+    /* Trivial reject: reg1 tests PIXREGION_NIL or fails EXTENTCHECK
+     * against the box, so nothing is cut out of inv_rect. */
+    if (PIXREGION_NIL (reg1) || !EXTENTCHECK (inv_rect, &reg1->extents))
+    {
+        if (PIXREGION_NAR (reg1))
+            return pixman_break (new_reg);
+
+        /* The result is the single rectangle inv_rect (no data block). */
+        new_reg->extents = *inv_rect;
+        FREE_DATA (new_reg);
+        new_reg->data = (region_data_type_t *)NULL;
+
+        return TRUE;
+    }
+
+    /* Subtract reg1 from the box: wrap inv_rect in a one-rectangle
+     * region (NULL data, extents only) and run the subtraction pass
+     * with the box as minuend and reg1 as subtrahend.
+     */
+    box_region.data = (region_data_type_t *)NULL;
+    box_region.extents = *inv_rect;
+    if (!pixman_op (new_reg, &box_region, reg1, pixman_region_subtract_o,
+                    TRUE, FALSE, &ignored_overlap))
+    {
+        return FALSE;
+    }
+
+    /* new_reg may be one of the sources and pixman_op needs their
+     * extents unaltered, so the extents are rebuilt only afterwards.
+     */
+    pixman_set_extents (new_reg);
+    GOOD (new_reg);
+    return TRUE;
+}
+
+/* Binary search: in O(log n) steps, locate the first box in [begin, end)
+ * whose y2 is greater than y.  Return @end if no such box exists.
+ */
+static box_type_t *
+find_box_for_y (box_type_t *begin, box_type_t *end, int y)
+{
+    /* Iterative bisection.  Each pass halves the range: when the middle
+     * box already extends past y, the answer is at or before it (and the
+     * middle box itself is the fallback result if nothing earlier
+     * qualifies), so it becomes the new @end; otherwise the answer cannot
+     * precede the middle box, which becomes the new @begin.
+     */
+    while (end - begin > 1)
+    {
+	box_type_t *probe = begin + (end - begin) / 2;
+
+	if (probe->y2 > y)
+	    end = probe;
+	else
+	    begin = probe;
+    }
+
+    /* Zero or one candidate is left; an empty range, or a lone box whose
+     * y2 does not exceed y, yields @end.
+     */
+    if (end == begin)
+	return end;
+
+    if (begin->y2 > y)
+	return begin;
+
+    return end;
+}
+
+/*
+ *   rect_in(region, rect)
+ *   This routine takes a pointer to a region and a pointer to a box
+ *   and determines if the box is outside/inside/partly inside the region.
+ *
+ *   The idea is to travel through the list of rectangles trying to cover the
+ *   passed box with them. Anytime a piece of the rectangle isn't covered
+ *   by a band of rectangles, part_out is set TRUE. Any time a rectangle in
+ *   the region covers part of the box, part_in is set TRUE. The process ends
+ *   when either the box has been completely covered (we reached a band that
+ *   doesn't overlap the box, part_in is TRUE and part_out is false), the
+ *   box has been partially covered (part_in == part_out == TRUE -- because of
+ *   the banding, the first time this is true we know the box is only
+ *   partially in the region) or is outside the region (we reached a band
+ *   that doesn't overlap the box at all and part_in is false)
+ */
+pixman_region_overlap_t
+PIXMAN_EXPORT PREFIX (_contains_rectangle) (region_type_t *  region,
+                                            box_type_t *     prect)
+{
+    box_type_t *     pbox;     /* box currently examined */
+    box_type_t *     pbox_end; /* one past the last box of the region */
+    int part_in, part_out;     /* some of prect covered / not covered */
+    int numRects;
+    int x, y;                  /* scan position inside prect */
+
+    GOOD (region);
+
+    numRects = PIXREGION_NUMRECTS (region);
+
+    /* useful optimization: no boxes, or extents miss prect => OUT */
+    if (!numRects || !EXTENTCHECK (&region->extents, prect))
+	return(PIXMAN_REGION_OUT);
+
+    if (numRects == 1)
+    {
+        /* We know that it must be PIXMAN_REGION_IN or PIXMAN_REGION_PART */
+        if (SUBSUMES (&region->extents, prect))
+	    return(PIXMAN_REGION_IN);
+        else
+	    return(PIXMAN_REGION_PART);
+    }
+
+    part_out = FALSE;
+    part_in = FALSE;
+
+    /* (x,y) starts at upper left of rect, moving to the right and down */
+    x = prect->x1;
+    y = prect->y1;
+
+    /* can stop when both part_out and part_in are TRUE, or we reach prect->y2 */
+    for (pbox = PIXREGION_BOXPTR (region), pbox_end = pbox + numRects;
+	 pbox != pbox_end;
+	 pbox++)
+    {
+	/* getting up to speed or skipping remainder of band:
+	 * binary-search forward to the first box with y2 > y */
+	if (pbox->y2 <= y)
+	{
+	    if ((pbox = find_box_for_y (pbox, pbox_end, y)) == pbox_end)
+		break;
+	}
+
+        if (pbox->y1 > y)
+        {
+            part_out = TRUE;     /* missed part of rectangle above */
+            if (part_in || (pbox->y1 >= prect->y2))
+		break;
+            y = pbox->y1;       /* x guaranteed to be == prect->x1 */
+	}
+
+        if (pbox->x2 <= x)
+	    continue;           /* not far enough over yet */
+
+        if (pbox->x1 > x)
+        {
+            part_out = TRUE;     /* missed part of rectangle to left */
+            if (part_in)
+		break;
+	}
+
+        if (pbox->x1 < prect->x2)
+        {
+            part_in = TRUE;      /* definitely overlap */
+            if (part_out)
+		break;
+	}
+
+        if (pbox->x2 >= prect->x2)
+        {
+            y = pbox->y2;       /* finished with this band */
+            if (y >= prect->y2)
+		break;
+            x = prect->x1;      /* reset x out to left again */
+	}
+        else
+        {
+            /*
+	     * Because boxes in a band are maximal width, if the first box
+	     * to overlap the rectangle doesn't completely cover it in that
+	     * band, the rectangle must be partially out, since some of it
+	     * will be uncovered in that band. part_in will have been set true
+	     * by now...
+	     */
+            part_out = TRUE;
+            break;
+	}
+    }
+
+    if (part_in)
+    {
+        if (y < prect->y2)     /* scan stopped above the bottom of prect */
+	    return PIXMAN_REGION_PART;
+        else
+	    return PIXMAN_REGION_IN;
+    }
+    else
+    {
+        return PIXMAN_REGION_OUT;
+    }
+}
+
+/* PREFIX(_translate) (region, x, y): translate the region in place.
+ * Each sum is formed in overflow_int_t, so for 32-bit regions (where that
+ * type is 64 bits wide) the addition cannot overflow before it is checked.
+ * Results outside PIXMAN_REGION_MIN/MAX are handled by the cases below. */
+PIXMAN_EXPORT void
+PREFIX (_translate) (region_type_t *region, int x, int y)
+{
+    overflow_int_t x1, x2, y1, y2;
+    int nbox;
+    box_type_t * pbox;
+
+    GOOD (region);
+    region->extents.x1 = x1 = (overflow_int_t) region->extents.x1 + x;
+    region->extents.y1 = y1 = (overflow_int_t) region->extents.y1 + y;
+    region->extents.x2 = x2 = (overflow_int_t) region->extents.x2 + x;
+    region->extents.y2 = y2 = (overflow_int_t) region->extents.y2 + y;
+    /* Case 1: translated extents are in range, so the boxes are shifted as is. */
+    if (((x1 - PIXMAN_REGION_MIN) | (y1 - PIXMAN_REGION_MIN) | (PIXMAN_REGION_MAX - x2) | (PIXMAN_REGION_MAX - y2)) >= 0)
+    {
+        if (region->data && (nbox = region->data->numRects))
+        {
+            for (pbox = PIXREGION_BOXPTR (region); nbox--; pbox++)
+            {
+                pbox->x1 += x;
+                pbox->y1 += y;
+                pbox->x2 += x;
+                pbox->y2 += y;
+	    }
+	}
+        return;
+    }
+    /* Case 2: the extents left the range completely; the region becomes empty. */
+    if (((x2 - PIXMAN_REGION_MIN) | (y2 - PIXMAN_REGION_MIN) | (PIXMAN_REGION_MAX - x1) | (PIXMAN_REGION_MAX - y1)) <= 0)
+    {
+        region->extents.x2 = region->extents.x1;
+        region->extents.y2 = region->extents.y1;
+        FREE_DATA (region);
+        region->data = pixman_region_empty_data;
+        return;
+    }
+    /* Case 3: partial overflow; clamp the extents, then clamp or drop boxes. */
+    if (x1 < PIXMAN_REGION_MIN)
+	region->extents.x1 = PIXMAN_REGION_MIN;
+    else if (x2 > PIXMAN_REGION_MAX)
+	region->extents.x2 = PIXMAN_REGION_MAX;
+
+    if (y1 < PIXMAN_REGION_MIN)
+	region->extents.y1 = PIXMAN_REGION_MIN;
+    else if (y2 > PIXMAN_REGION_MAX)
+	region->extents.y2 = PIXMAN_REGION_MAX;
+
+    if (region->data && (nbox = region->data->numRects))
+    {
+        box_type_t * pbox_out; /* write cursor; trails pbox once a box is dropped */
+
+        for (pbox_out = pbox = PIXREGION_BOXPTR (region); nbox--; pbox++)
+        {
+            pbox_out->x1 = x1 = (overflow_int_t) pbox->x1 + x;
+            pbox_out->y1 = y1 = (overflow_int_t) pbox->y1 + y;
+            pbox_out->x2 = x2 = (overflow_int_t) pbox->x2 + x;
+            pbox_out->y2 = y2 = (overflow_int_t) pbox->y2 + y;
+
+            if (((x2 - PIXMAN_REGION_MIN) | (y2 - PIXMAN_REGION_MIN) |
+                 (PIXMAN_REGION_MAX - x1) | (PIXMAN_REGION_MAX - y1)) <= 0)
+            {
+                region->data->numRects--; /* box left the range: drop it */
+                continue;
+	    }
+
+            if (x1 < PIXMAN_REGION_MIN)
+		pbox_out->x1 = PIXMAN_REGION_MIN;
+            else if (x2 > PIXMAN_REGION_MAX)
+		pbox_out->x2 = PIXMAN_REGION_MAX;
+
+            if (y1 < PIXMAN_REGION_MIN)
+		pbox_out->y1 = PIXMAN_REGION_MIN;
+            else if (y2 > PIXMAN_REGION_MAX)
+		pbox_out->y2 = PIXMAN_REGION_MAX;
+
+            pbox_out++;
+	}
+        /* Cursors differ only if a box was dropped: fix up the representation. */
+        if (pbox_out != pbox)
+        {
+            if (region->data->numRects == 1)
+            {
+                region->extents = *PIXREGION_BOXPTR (region);
+                FREE_DATA (region);
+                region->data = (region_data_type_t *)NULL;
+	    }
+            else
+	    {
+		pixman_set_extents (region);
+	    }
+	}
+    }
+
+    GOOD (region);
+}
+
+PIXMAN_EXPORT void
+PREFIX (_reset) (region_type_t *region, box_type_t *box)
+{
+    GOOD (region);
+    /* Make region the single rectangle *box, which must pass GOOD_RECT. */
+    critical_if_fail (GOOD_RECT (box));
+    /* The box is copied first; only then is the old data released. */
+    region->extents = *box;
+
+    FREE_DATA (region);
+
+    region->data = NULL; /* the extents alone now describe the region */
+}
+
+/* Point-in-region test; *box (when non-NULL) receives the containing box. */
+PIXMAN_EXPORT int
+PREFIX (_contains_point) (region_type_t * region,
+                          int x, int y,
+                          box_type_t * box)
+{
+    box_type_t *cur, *stop;
+    int n_rects;
+
+    GOOD (region);
+    n_rects = PIXREGION_NUMRECTS (region);
+
+    /* No rectangles at all, or the point misses the overall extents. */
+    if (n_rects == 0)
+	return FALSE;
+    if (!INBOX (&region->extents, x, y))
+	return FALSE;
+
+    /* With a single rectangle the extents are that rectangle. */
+    if (n_rects == 1)
+    {
+        if (box)
+	    *box = region->extents;
+
+        return TRUE;
+    }
+
+    /* Start at the first box whose y2 exceeds y and scan forward. */
+    stop = PIXREGION_BOXPTR (region) + n_rects;
+    cur = find_box_for_y (PIXREGION_BOXPTR (region), stop, y);
+
+    while (cur != stop && y >= cur->y1 && x >= cur->x1)
+    {
+        if (x < cur->x2)
+        {
+            if (box)
+                *box = *cur;
+            return TRUE;
+        }
+        cur++;               /* box ends at or before x: not there yet */
+    }
+
+    return FALSE;            /* ran out of boxes or stepped past the point */
+}
+
+PIXMAN_EXPORT int
+PREFIX (_not_empty) (region_type_t * region)
+{
+    GOOD (region);
+    /* Non-zero exactly when the PIXREGION_NIL test fails. */
+    return !PIXREGION_NIL (region);
+}
+
+PIXMAN_EXPORT box_type_t *
+PREFIX (_extents) (region_type_t * region)
+{
+    GOOD (region);
+    /* Hands out a pointer into the region itself, not a copy. */
+    return &region->extents;
+}
+
+/*
+ * Consistency check for a region: verifies that the extents are ordered,
+ * that the box count matches the data representation, and that the boxes
+ * are non-empty, sorted into bands and bounded exactly by the extents.
+ * Returns TRUE if the region passes, FALSE otherwise.
+ */
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_selfcheck) (region_type_t *reg)
+{
+    box_type_t *rects, *prev, *cur;
+    box_type_t bounds;
+    int k, n_rects;
+
+    /* The extents must never be inverted. */
+    if (reg->extents.x1 > reg->extents.x2)
+	return FALSE;
+    if (reg->extents.y1 > reg->extents.y2)
+	return FALSE;
+
+    n_rects = PIXREGION_NUMRECTS (reg);
+    /* No rectangles: degenerate extents, plus data that either has a
+     * non-zero size or is the shared empty-data object. */
+    if (n_rects == 0)
+    {
+	return ((reg->extents.x1 == reg->extents.x2) &&
+	        (reg->extents.y1 == reg->extents.y2) &&
+	        (reg->data->size || (reg->data == pixman_region_empty_data)));
+    }
+
+    /* A single rectangle must be held in the extents, without data. */
+    if (n_rects == 1)
+	return (!reg->data);
+
+    /* Several rectangles: walk adjacent pairs, checking each box after
+     * the first and accumulating the horizontal bounds on the way. */
+    rects = PIXREGION_RECTS (reg);
+    bounds = rects[0];
+    bounds.y2 = rects[n_rects - 1].y2;
+
+    for (k = 1; k < n_rects; k++)
+    {
+	prev = &rects[k - 1];
+	cur = &rects[k];
+
+	/* the box needs positive width and height */
+	if ((cur->x1 >= cur->x2) || (cur->y1 >= cur->y2))
+	    return FALSE;
+
+	if (cur->x1 < bounds.x1)
+	    bounds.x1 = cur->x1;
+	if (cur->x2 > bounds.x2)
+	    bounds.x2 = cur->x2;
+
+	/* y1 never decreases; equal y1 means same y2 and no x overlap */
+	if (cur->y1 < prev->y1)
+	    return FALSE;
+	if ((cur->y1 == prev->y1) &&
+	    ((cur->x1 < prev->x2) || (cur->y2 != prev->y2)))
+	    return FALSE;
+    }
+
+    return ((bounds.x1 == reg->extents.x1) &&
+            (bounds.x2 == reg->extents.x2) &&
+            (bounds.y1 == reg->extents.y1) &&
+            (bounds.y2 == reg->extents.y2));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_init_rects) (region_type_t *region,
+                      const box_type_t *boxes, int count)
+{
+    box_type_t *rects;
+    int kept;    /* number of well-formed boxes retained so far */
+    int i;
+
+    /* A single box only sets the extents, so hand it to the existing
+     * single-rectangle initializer.
+     */
+    if (count == 1)
+    {
+        const box_type_t *only = &boxes[0];
+
+        PREFIX (_init_rect) (region,
+                             only->x1,
+                             only->y1,
+                             only->x2 - only->x1,
+                             only->y2 - only->y1);
+        return TRUE;
+    }
+
+    PREFIX (_init) (region);
+
+    /* Zero boxes: keep what _init set up.  Calling pixman_rect_alloc
+     * here would leak memory, because the 0-rect case should be the
+     * static pixman_region_empty_data data.
+     */
+    if (count == 0)
+	return TRUE;
+
+    if (!pixman_rect_alloc (region, count))
+	return FALSE;
+
+    /* Bulk-copy the caller's boxes into the region's storage. */
+    rects = PIXREGION_RECTS (region);
+    memcpy (rects, boxes, sizeof(box_type_t) * count);
+
+    /* Compact in place, skipping empty and malformed rectangles. */
+    kept = 0;
+    for (i = 0; i < count; ++i)
+    {
+        if (rects[i].x1 >= rects[i].x2 || rects[i].y1 >= rects[i].y2)
+	    continue;
+
+        if (kept != i)
+	    rects[kept] = rects[i];
+
+        kept++;
+    }
+
+    region->data->numRects = kept;
+
+    /* Filtering may have left only 0 or 1 rectangles; both cases
+     * release the allocated data block again.
+     */
+    if (kept == 0)
+    {
+        FREE_DATA (region);
+        PREFIX (_init) (region);
+
+        return TRUE;
+    }
+
+    if (kept == 1)
+    {
+        region->extents = rects[0];
+
+        FREE_DATA (region);
+        region->data = NULL;
+
+        GOOD (region);
+
+        return TRUE;
+    }
+
+    /* Validate; the overlap flag it writes into i is not used. */
+    region->extents.x1 = region->extents.x2 = 0;
+
+    return validate (region, &i);
+}
+
+#define READ(_ptr) (*(_ptr)) /* plain dereference */
+/* Store box (rx1,ry1)-(rx2,ry2) at r unless it is empty or lies within box r-1. */
+static inline box_type_t *
+bitmap_addrect (region_type_t *reg,
+                box_type_t *r,
+                box_type_t **first_rect,
+                int rx1, int ry1,
+                int rx2, int ry2)
+{
+    if ((rx1 < rx2) && (ry1 < ry2) &&
+	(!(reg->data->numRects &&
+	   ((r-1)->y1 == ry1) && ((r-1)->y2 == ry2) &&
+	   ((r-1)->x1 <= rx1) && ((r-1)->x2 >= rx2))))
+    {
+	if (reg->data->numRects == reg->data->size) /* storage is full */
+	{
+	    if (!pixman_rect_alloc (reg, 1))
+		return NULL;                        /* growing failed */
+	    *first_rect = PIXREGION_BOXPTR(reg);    /* re-read base pointer */
+	    r = *first_rect + reg->data->numRects;
+	}
+	r->x1 = rx1;
+	r->y1 = ry1;
+	r->x2 = rx2;
+	r->y2 = ry2;
+	reg->data->numRects++;
+	if (r->x1 < reg->extents.x1)                /* only x extents grow here */
+	    reg->extents.x1 = r->x1;
+	if (r->x2 > reg->extents.x2)
+	    reg->extents.x2 = r->x2;
+	r++;
+    }
+    return r; /* next free slot; unchanged when nothing was stored */
+}
+
+/* Convert a bitmap clip mask (a BITS image of format a1) into a region.
+ * First, goes through each line and makes boxes by noting the transitions
+ * from 0 to 1 and 1 to 0.
+ * Then it coalesces the current line with the previous if they have boxes
+ * at the same X coordinates.
+ * The local stride is the image stride divided by 4 (uint32_t per line).
+ */
+PIXMAN_EXPORT void
+PREFIX (_init_from_image) (region_type_t *region,
+                           pixman_image_t *image)
+{
+    uint32_t mask0 = 0xffffffff & ~SCREEN_SHIFT_RIGHT(0xffffffff, 1);
+    box_type_t *first_rect, *rects, *prect_line_start;
+    box_type_t *old_rect, *new_rect;
+    uint32_t *pw, w, *pw_line, *pw_line_end;
+    int	irect_prev_start, irect_line_start; /* rect index where a line's boxes begin */
+    int	h, base, rx1 = 0, crects;
+    int	ib;
+    pixman_bool_t in_box, same;
+    int width, height, stride;
+
+    PREFIX(_init) (region);
+
+    critical_if_fail (region->data);
+
+    return_if_fail (image->type == BITS);
+    return_if_fail (image->bits.format == PIXMAN_a1);
+
+    pw_line = pixman_image_get_data (image);
+    width = pixman_image_get_width (image);
+    height = pixman_image_get_height (image);
+    stride = pixman_image_get_stride (image) / 4;
+
+    first_rect = PIXREGION_BOXPTR(region);
+    rects = first_rect;
+
+    region->extents.x1 = width - 1;
+    region->extents.x2 = 0;
+    irect_prev_start = -1; /* -1: no earlier line to coalesce with yet */
+    for (h = 0; h < height; h++)
+    {
+        pw = pw_line;
+        pw_line += stride;
+        irect_line_start = rects - first_rect;
+
+        /* If the Screen left most bit of the word is set, we're starting in
+         * a box */
+        if (READ(pw) & mask0)
+        {
+            in_box = TRUE;
+            rx1 = 0;
+        }
+        else
+        {
+            in_box = FALSE;
+        }
+
+        /* Process all words which are fully in the pixmap */
+        pw_line_end = pw + (width >> 5);
+        for (base = 0; pw < pw_line_end; base += 32)
+        {
+            w = READ(pw++);
+            if (in_box)
+            {
+                if (!~w)          /* all ones: the open box continues */
+                    continue;
+            }
+            else
+            {
+                if (!w)           /* all zeros: still outside any box */
+                    continue;
+            }
+            for (ib = 0; ib < 32; ib++)
+            {
+                /* If the Screen left most bit of the word is set, we're
+                 * starting a box */
+                if (w & mask0)
+                {
+                    if (!in_box)
+                    {
+                        rx1 = base + ib;
+                        /* start new box */
+                        in_box = TRUE;
+                    }
+                }
+                else
+                {
+                    if (in_box)
+                    {
+                        /* end box */
+                        rects = bitmap_addrect (region, rects, &first_rect,
+                                                rx1, h, base + ib, h + 1);
+                        if (rects == NULL)
+                            goto error;
+                        in_box = FALSE;
+                    }
+                }
+                /* Shift the word VISUALLY left one. */
+                w = SCREEN_SHIFT_LEFT(w, 1);
+            }
+        }
+
+        if (width & 31)
+        {
+            /* Process final partial word on line */
+             w = READ(pw++);
+            for (ib = 0; ib < (width & 31); ib++)
+            {
+                /* If the Screen left most bit of the word is set, we're
+                 * starting a box */
+                if (w & mask0)
+                {
+                    if (!in_box)
+                    {
+                        rx1 = base + ib;
+                        /* start new box */
+                        in_box = TRUE;
+                    }
+                }
+                else
+                {
+                    if (in_box)
+                    {
+                        /* end box */
+                        rects = bitmap_addrect(region, rects, &first_rect,
+					       rx1, h, base + ib, h + 1);
+			if (rects == NULL)
+			    goto error;
+                        in_box = FALSE;
+                    }
+                }
+                /* Shift the word VISUALLY left one. */
+                w = SCREEN_SHIFT_LEFT(w, 1);
+            }
+        }
+        /* If scanline ended with last bit set, end the box */
+        if (in_box)
+        {
+            rects = bitmap_addrect(region, rects, &first_rect,
+				   rx1, h, base + (width & 31), h + 1);
+	    if (rects == NULL)
+		goto error;
+        }
+        /* if all rectangles on this line have the same x-coords as
+         * those on the previous line, then add 1 to all the previous  y2s and
+         * throw away all the rectangles from this line
+         */
+        same = FALSE;
+        if (irect_prev_start != -1)
+        {
+            crects = irect_line_start - irect_prev_start;
+            if (crects != 0 &&
+                crects == ((rects - first_rect) - irect_line_start))
+            {
+                old_rect = first_rect + irect_prev_start;
+                new_rect = prect_line_start = first_rect + irect_line_start;
+                same = TRUE;
+                while (old_rect < prect_line_start)
+                {
+                    if ((old_rect->x1 != new_rect->x1) ||
+                        (old_rect->x2 != new_rect->x2))
+                    {
+                          same = FALSE;
+                          break;
+                    }
+                    old_rect++;
+                    new_rect++;
+                }
+                if (same)
+                {
+                    old_rect = first_rect + irect_prev_start;
+                    while (old_rect < prect_line_start)
+                    {
+                        old_rect->y2 += 1;
+                        old_rect++;
+                    }
+                    rects -= crects;
+                    region->data->numRects -= crects;
+                }
+            }
+        }
+        if(!same)
+            irect_prev_start = irect_line_start;
+    }
+    if (!region->data->numRects)
+    {
+        region->extents.x1 = region->extents.x2 = 0;
+    }
+    else
+    {
+        region->extents.y1 = PIXREGION_BOXPTR(region)->y1;
+        region->extents.y2 = PIXREGION_END(region)->y2;
+        if (region->data->numRects == 1)
+        {
+            free (region->data); /* one box: the extents alone describe it */
+            region->data = NULL;
+        }
+    }
+
+ error: /* also reached by goto when bitmap_addrect returned NULL */
+    return;
+}
diff --git a/pixman/pixman-region16.c b/pixman/pixman-region16.c
new file mode 100644
index 0000000..d88d338
--- /dev/null
+++ b/pixman/pixman-region16.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without
+ * fee, provided that the above copyright notice appear in all copies
+ * and that both that copyright notice and this permission notice
+ * appear in supporting documentation, and that the name of
+ * Red Hat, Inc. not be used in advertising or publicity pertaining to
+ * distribution of the software without specific, written prior
+ * permission. Red Hat, Inc. makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * RED HAT, INC. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL RED HAT, INC. BE LIABLE FOR ANY SPECIAL,
+ * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
+ * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+ * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Soren Sandmann <sandmann@redhat.com>
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#undef PIXMAN_DISABLE_DEPRECATED
+
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+typedef pixman_box16_t		box_type_t;
+typedef pixman_region16_data_t	region_data_type_t;
+typedef pixman_region16_t	region_type_t;
+typedef int32_t                 overflow_int_t; /* holds translated coordinates */
+
+typedef struct {
+    int x, y;
+} point_type_t;
+
+#define PREFIX(x) pixman_region##x
+/* Coordinate limits used by the range checks in the included code. */
+#define PIXMAN_REGION_MAX INT16_MAX
+#define PIXMAN_REGION_MIN INT16_MIN
+/* The shared implementation, compiled against the definitions above. */
+#include "pixman-region.c"
+
+/* Point the file-level empty-box, empty-data and broken-data pointers at
+ * objects supplied by the caller.
+ *
+ * This exists only to preserve the X ABI and should go away at first
+ * opportunity: the X ABI exports the three structs and has used them
+ * through macros, so the X server calls this function with the addresses
+ * of those structs, which makes the existing code continue to work.
+ */
+PIXMAN_EXPORT void
+pixman_region_set_static_pointers (pixman_box16_t *empty_box,
+				   pixman_region16_data_t *empty_data,
+				   pixman_region16_data_t *broken_data)
+{
+    pixman_region_empty_box = empty_box;
+    pixman_region_empty_data = empty_data;
+    pixman_broken_data = broken_data;
+}
diff --git a/pixman/pixman-region32.c b/pixman/pixman-region32.c
new file mode 100644
index 0000000..abd6b1a
--- /dev/null
+++ b/pixman/pixman-region32.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without
+ * fee, provided that the above copyright notice appear in all copies
+ * and that both that copyright notice and this permission notice
+ * appear in supporting documentation, and that the name of
+ * Red Hat, Inc. not be used in advertising or publicity pertaining to
+ * distribution of the software without specific, written prior
+ * permission. Red Hat, Inc. makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * RED HAT, INC. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL RED HAT, INC. BE LIABLE FOR ANY SPECIAL,
+ * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
+ * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+ * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Soren Sandmann <sandmann@redhat.com>
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+typedef pixman_box32_t		box_type_t;
+typedef pixman_region32_data_t	region_data_type_t;
+typedef pixman_region32_t	region_type_t;
+typedef int64_t                 overflow_int_t; /* holds translated coordinates */
+
+typedef struct {
+    int x, y;
+} point_type_t;
+
+#define PREFIX(x) pixman_region32##x
+/* Coordinate limits used by the range checks in the included code. */
+#define PIXMAN_REGION_MAX INT32_MAX
+#define PIXMAN_REGION_MIN INT32_MIN
+/* The shared implementation, compiled against the definitions above. */
+#include "pixman-region.c"
diff --git a/pixman/pixman-solid-fill.c b/pixman/pixman-solid-fill.c
new file mode 100644
index 0000000..852e135
--- /dev/null
+++ b/pixman/pixman-solid-fill.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2009 Soren Sandmann
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+/* Fill iter->buffer with the solid color of image, as 32-bit pixels when
+ * ITER_NARROW is set and 64-bit pixels otherwise, then set the iterator's
+ * get_scanline to _pixman_iter_get_scanline_noop.  Both branches read the
+ * color from the image argument (the narrow one used iter->image before). */
+void
+_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t  *iter)
+{
+    if (iter->flags & ITER_NARROW)
+    {
+	uint32_t *b = (uint32_t *)iter->buffer;
+	uint32_t *e = b + iter->width;
+	uint32_t color = image->solid.color_32;
+	while (b < e)
+	    *(b++) = color;
+    }
+    else
+    {
+	uint64_t *b = (uint64_t *)iter->buffer;
+	uint64_t *e = b + iter->width;
+	uint64_t color = image->solid.color_64;
+	while (b < e)
+	    *(b++) = color;
+    }
+    iter->get_scanline = _pixman_iter_get_scanline_noop;
+}
+
+/* Pack the high byte of each channel into one word, alpha in bits 24-31. */
+static uint32_t
+color_to_uint32 (const pixman_color_t *color)
+{
+    /* shift in uint32_t: a promoted int shifted left by 24 could overflow */
+    return ((uint32_t) (color->alpha >> 8) << 24) |
+           ((uint32_t) (color->red >> 8) << 16) |
+           ((uint32_t) color->green & 0xff00) | ((uint32_t) color->blue >> 8);
+}
+
+static uint64_t
+color_to_uint64 (const pixman_color_t *color)
+{
+    /* Shift the channels in 16 bits at a time: alpha, red, green, blue. */
+    uint64_t packed = color->alpha;
+    packed = (packed << 16) | color->red;
+    packed = (packed << 16) | color->green;
+    return (packed << 16) | color->blue;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_solid_fill (pixman_color_t *color)
+{
+    pixman_image_t *fill = _pixman_image_allocate ();
+    /* A failed allocation is passed on to the caller as NULL. */
+    if (fill != NULL)
+    {
+	/* Keep the caller's color plus its packed 32- and 64-bit forms. */
+	fill->type = SOLID;
+	fill->solid.color = *color;
+	fill->solid.color_32 = color_to_uint32 (color);
+	fill->solid.color_64 = color_to_uint64 (color);
+    }
+    return fill;
+}
+
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
new file mode 100644
index 0000000..c419511
--- /dev/null
+++ b/pixman/pixman-sse2.c
@@ -0,0 +1,6071 @@
+/*
+ * Copyright © 2008 Rodrigo Kumpera
+ * Copyright © 2008 André Tupinambá
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Rodrigo Kumpera (kumpera@gmail.com)
+ *          André Tupinambá (andrelrt@gmail.com)
+ *
+ * Based on work by Owen Taylor and Søren Sandmann
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h> /* for SSE2 intrinsics */
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+static __m128i mask_0080;	/* added (saturating) to the product in pix_multiply_* */
+static __m128i mask_00ff;	/* XOR operand used by negate_* */
+static __m128i mask_0101;	/* _mm_mulhi_epu16 multiplier in pix_multiply_* */
+static __m128i mask_ffff;
+static __m128i mask_ff000000;
+static __m128i mask_alpha;	/* ORed into the expanded alpha in over_rev_non_pre_* */
+/* Channel masks: mask_565_* are used by pack_565_2x128_128, the others by unpack_565_to_8888. */
+static __m128i mask_565_r;
+static __m128i mask_565_g1, mask_565_g2;
+static __m128i mask_565_b;
+static __m128i mask_red;
+static __m128i mask_green;
+static __m128i mask_blue;
+/* Select the bits that unpack_565_to_8888 shifts right and ORs back in. */
+static __m128i mask_565_fix_rb;
+static __m128i mask_565_fix_g;
+
+/*
+ * Widen the four bytes of one 32-bit pixel into the four low 16-bit
+ * lanes of an SSE2 register by interleaving them with zero bytes.
+ */
+static force_inline __m128i
+unpack_32_1x128 (uint32_t data)
+{
+    const __m128i zero = _mm_setzero_si128 ();
+    __m128i pixel = _mm_cvtsi32_si128 (data);
+
+    return _mm_unpacklo_epi8 (pixel, zero);
+}
+
+/*
+ * Split 16 packed bytes into two registers of eight 16-bit lanes each:
+ * *data_lo receives the low 8 bytes zero-extended, *data_hi the high 8.
+ */
+static force_inline void
+unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
+{
+    const __m128i zero = _mm_setzero_si128 ();
+
+    *data_lo = _mm_unpacklo_epi8 (data, zero);
+    *data_hi = _mm_unpackhi_epi8 (data, zero);
+}
+
+/*
+ * Convert r5g6b5 values held in 32-bit lanes into 8-bit-per-channel
+ * values in the same lanes.
+ *
+ * Each channel is shifted left into place and masked; afterwards the bits
+ * selected by mask_565_fix_rb / mask_565_fix_g are shifted right (by 5
+ * and 6) and ORed back in, presumably to fill the low bits left empty by
+ * the widening.
+ */
+static force_inline __m128i
+unpack_565_to_8888 (__m128i lo)
+{
+    __m128i red      = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
+    __m128i green    = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
+    __m128i blue     = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
+    __m128i red_blue = _mm_or_si128 (red, blue);
+    __m128i fix;
+
+    fix      = _mm_srli_epi32 (_mm_and_si128 (red_blue, mask_565_fix_rb), 5);
+    red_blue = _mm_or_si128 (red_blue, fix);
+
+    fix   = _mm_srli_epi32 (_mm_and_si128 (green, mask_565_fix_g), 6);
+    green = _mm_or_si128 (green, fix);
+
+    return _mm_or_si128 (red_blue, green);
+}
+
+/*
+ * Expand eight packed 16-bit r5g6b5 pixels into four registers of
+ * unpacked (16 bits per channel) pixels, two pixels per register.
+ * data0/data1 receive the first four pixels, data2/data3 the last four.
+ */
+static force_inline void
+unpack_565_128_4x128 (__m128i  data,
+                      __m128i* data0,
+                      __m128i* data1,
+                      __m128i* data2,
+                      __m128i* data3)
+{
+    const __m128i zero = _mm_setzero_si128 ();
+    __m128i first4 = unpack_565_to_8888 (_mm_unpacklo_epi16 (data, zero));
+    __m128i last4  = unpack_565_to_8888 (_mm_unpackhi_epi16 (data, zero));
+
+    unpack_128_2x128 (first4, data0, data1);
+    unpack_128_2x128 (last4, data2, data3);
+}
+
+/*
+ * Reduce one 32-bit pixel to 16-bit r5g6b5 by keeping the top 5, 6 and 5
+ * bits of bytes 2, 1 and 0 respectively.
+ */
+static force_inline uint16_t
+pack_565_32_16 (uint32_t pixel)
+{
+    uint32_t r = (pixel >> 8) & 0xf800;
+    uint32_t g = (pixel >> 5) & 0x07e0;
+    uint32_t b = (pixel >> 3) & 0x001f;
+
+    return (uint16_t) (r | g | b);
+}
+
+/*
+ * Pack two registers of eight 16-bit lanes back into sixteen bytes with
+ * unsigned saturation; lo supplies the low 8 bytes, hi the high 8.
+ */
+static force_inline __m128i
+pack_2x128_128 (__m128i lo, __m128i hi)
+{
+    __m128i packed = _mm_packus_epi16 (lo, hi);
+
+    return packed;
+}
+
+/*
+ * Pack four unpacked pixels (lo, hi) to 8888 and then gather the 565
+ * channel bits of each pixel inside its 32-bit lane using the
+ * mask_565_* constants.
+ */
+static force_inline __m128i
+pack_565_2x128_128 (__m128i lo, __m128i hi)
+{
+    __m128i argb = pack_2x128_128 (lo, hi);
+    __m128i result;
+
+    result = _mm_and_si128 (argb, mask_565_r);
+    result = _mm_or_si128 (
+	result, _mm_and_si128 (_mm_slli_epi32 (argb, 3), mask_565_g1));
+    result = _mm_or_si128 (
+	result, _mm_and_si128 (_mm_srli_epi32 (argb, 5), mask_565_g2));
+    result = _mm_or_si128 (
+	result, _mm_and_si128 (_mm_srli_epi32 (argb, 3), mask_565_b));
+
+    return result;
+}
+
+/*
+ * Convert eight unpacked pixels (four registers) to 565 form via
+ * pack_565_2x128_128 and merge the two results with _mm_packus_epi16.
+ */
+static force_inline __m128i
+pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
+{
+    __m128i first4 = pack_565_2x128_128 (*xmm0, *xmm1);
+    __m128i last4  = pack_565_2x128_128 (*xmm2, *xmm3);
+
+    return _mm_packus_epi16 (first4, last4);
+}
+
+/*
+ * Return nonzero when bytes 3, 7, 11 and 15 of x (the top byte of each of
+ * the four 32-bit pixels) are all 0xff.
+ */
+static force_inline int
+is_opaque (__m128i x)
+{
+    __m128i all_ones = _mm_cmpeq_epi8 (x, x);
+    int bytes_ff = _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, all_ones));
+
+    return (bytes_ff & 0x8888) == 0x8888;
+}
+
+/* Return nonzero when all sixteen bytes of x are zero. */
+static force_inline int
+is_zero (__m128i x)
+{
+    __m128i eq_zero = _mm_cmpeq_epi8 (x, _mm_setzero_si128 ());
+
+    return _mm_movemask_epi8 (eq_zero) == 0xffff;
+}
+
+/*
+ * Return nonzero when bytes 3, 7, 11 and 15 of x (the top byte of each of
+ * the four 32-bit pixels) are all zero.
+ */
+static force_inline int
+is_transparent (__m128i x)
+{
+    __m128i eq_zero = _mm_cmpeq_epi8 (x, _mm_setzero_si128 ());
+    int bytes_zero = _mm_movemask_epi8 (eq_zero);
+
+    return (bytes_zero & 0x8888) == 0x8888;
+}
+
+/*
+ * Unpack one 32-bit pixel to four 16-bit lanes and duplicate those 64
+ * bits into the upper half of the register.
+ */
+static force_inline __m128i
+expand_pixel_32_1x128 (uint32_t data)
+{
+    __m128i unpacked = unpack_32_1x128 (data);
+
+    return _mm_shuffle_epi32 (unpacked, _MM_SHUFFLE (1, 0, 1, 0));
+}
+
+/*
+ * Broadcast 16-bit lane 3 of each 64-bit half of data (the alpha lane of
+ * an unpacked pixel) to all four lanes of that half.
+ */
+static force_inline __m128i
+expand_alpha_1x128 (__m128i data)
+{
+    __m128i low_done = _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
+
+    return _mm_shufflehi_epi16 (low_done, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+/*
+ * Two-register form of expand_alpha_1x128: for each of the four unpacked
+ * pixels in data_lo/data_hi, broadcast lane 3 to all four lanes and
+ * store the results in *alpha_lo / *alpha_hi.
+ */
+static force_inline void
+expand_alpha_2x128 (__m128i  data_lo,
+                    __m128i  data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi)
+{
+    *alpha_lo = _mm_shufflehi_epi16 (
+	_mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)),
+	_MM_SHUFFLE (3, 3, 3, 3));
+    *alpha_hi = _mm_shufflehi_epi16 (
+	_mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)),
+	_MM_SHUFFLE (3, 3, 3, 3));
+}
+
+/*
+ * For each of the four unpacked pixels in data_lo/data_hi, broadcast
+ * 16-bit lane 0 to all four lanes and store the results in
+ * *alpha_lo / *alpha_hi.
+ */
+static force_inline void
+expand_alpha_rev_2x128 (__m128i  data_lo,
+                        __m128i  data_hi,
+                        __m128i* alpha_lo,
+                        __m128i* alpha_hi)
+{
+    *alpha_lo = _mm_shufflehi_epi16 (
+	_mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)),
+	_MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_hi = _mm_shufflehi_epi16 (
+	_mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)),
+	_MM_SHUFFLE (0, 0, 0, 0));
+}
+
+/*
+ * Lane-wise multiply of four unpacked pixels by four unpacked factors.
+ *
+ * Per 16-bit lane: the low 16 bits of data * alpha get mask_0080 added
+ * with unsigned saturation, and the high 16 bits of that sum times
+ * mask_0101 are stored.
+ *
+ * Callers pass ret_* aliasing data_* or alpha_*, so every input is read
+ * before either output is written.
+ */
+static force_inline void
+pix_multiply_2x128 (__m128i* data_lo,
+                    __m128i* data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi,
+                    __m128i* ret_lo,
+                    __m128i* ret_hi)
+{
+    __m128i biased_lo = _mm_adds_epu16 (
+	_mm_mullo_epi16 (*data_lo, *alpha_lo), mask_0080);
+    __m128i biased_hi = _mm_adds_epu16 (
+	_mm_mullo_epi16 (*data_hi, *alpha_hi), mask_0080);
+
+    *ret_lo = _mm_mulhi_epu16 (biased_lo, mask_0101);
+    *ret_hi = _mm_mulhi_epu16 (biased_hi, mask_0101);
+}
+
+/*
+ * Compute src * alpha_dst + dst * alpha_src for four unpacked pixels,
+ * using pix_multiply_2x128 for the products and a saturating byte add
+ * for the sum.  Both products are formed before ret_* is written, so
+ * ret_* may alias any input.
+ */
+static force_inline void
+pix_add_multiply_2x128 (__m128i* src_lo,
+                        __m128i* src_hi,
+                        __m128i* alpha_dst_lo,
+                        __m128i* alpha_dst_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi,
+                        __m128i* alpha_src_lo,
+                        __m128i* alpha_src_hi,
+                        __m128i* ret_lo,
+                        __m128i* ret_hi)
+{
+    __m128i src_term_lo, src_term_hi;	/* src * alpha_dst */
+    __m128i dst_term_lo, dst_term_hi;	/* dst * alpha_src */
+
+    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi,
+			&src_term_lo, &src_term_hi);
+    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi,
+			&dst_term_lo, &dst_term_hi);
+
+    *ret_lo = _mm_adds_epu8 (src_term_lo, dst_term_lo);
+    *ret_hi = _mm_adds_epu8 (src_term_hi, dst_term_hi);
+}
+
+/*
+ * XOR both registers with mask_00ff and store the results in
+ * *neg_lo / *neg_hi.
+ */
+static force_inline void
+negate_2x128 (__m128i  data_lo,
+              __m128i  data_hi,
+              __m128i* neg_lo,
+              __m128i* neg_hi)
+{
+    const __m128i flip = mask_00ff;
+
+    *neg_lo = _mm_xor_si128 (data_lo, flip);
+    *neg_hi = _mm_xor_si128 (data_hi, flip);
+}
+
+/*
+ * For each of the four unpacked pixels, swap 16-bit lanes 0 and 2 and
+ * leave lanes 1 and 3 where they are.
+ */
+static force_inline void
+invert_colors_2x128 (__m128i  data_lo,
+                     __m128i  data_hi,
+                     __m128i* inv_lo,
+                     __m128i* inv_hi)
+{
+    *inv_lo = _mm_shufflehi_epi16 (
+	_mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)),
+	_MM_SHUFFLE (3, 0, 1, 2));
+    *inv_hi = _mm_shufflehi_epi16 (
+	_mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)),
+	_MM_SHUFFLE (3, 0, 1, 2));
+}
+
+/*
+ * OVER for four unpacked pixels, in place on dst:
+ *     dst = src + dst * negate (alpha)
+ * with a saturating byte add.  alpha is the already expanded alpha that
+ * belongs to src.
+ */
+static force_inline void
+over_2x128 (__m128i* src_lo,
+            __m128i* src_hi,
+            __m128i* alpha_lo,
+            __m128i* alpha_hi,
+            __m128i* dst_lo,
+            __m128i* dst_hi)
+{
+    __m128i inv_alpha_lo, inv_alpha_hi;
+
+    negate_2x128 (*alpha_lo, *alpha_hi, &inv_alpha_lo, &inv_alpha_hi);
+
+    /* dst *= negated alpha */
+    pix_multiply_2x128 (dst_lo, dst_hi,
+			&inv_alpha_lo, &inv_alpha_hi,
+			dst_lo, dst_hi);
+
+    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
+    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
+}
+
+/*
+ * OVER for four unpacked source pixels whose lanes 0 and 2 are swapped
+ * and whose color is not yet multiplied by alpha: the source is
+ * lane-swapped, multiplied by (alpha | mask_alpha) and then composited
+ * over dst in place using the plain expanded alpha.
+ */
+static force_inline void
+over_rev_non_pre_2x128 (__m128i  src_lo,
+                        __m128i  src_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi)
+{
+    __m128i alpha_lo, alpha_hi;
+    __m128i scaled_lo, scaled_hi;
+
+    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
+
+    scaled_lo = _mm_or_si128 (alpha_lo, mask_alpha);
+    scaled_hi = _mm_or_si128 (alpha_hi, mask_alpha);
+
+    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
+
+    pix_multiply_2x128 (&src_lo, &src_hi,
+			&scaled_lo, &scaled_hi,
+			&scaled_lo, &scaled_hi);
+
+    over_2x128 (&scaled_lo, &scaled_hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
+}
+
+/*
+ * (src IN mask) OVER dst for four unpacked pixels, in place on dst.
+ * Both src and its expanded alpha are multiplied by mask before the
+ * OVER step.
+ */
+static force_inline void
+in_over_2x128 (__m128i* src_lo,
+               __m128i* src_hi,
+               __m128i* alpha_lo,
+               __m128i* alpha_hi,
+               __m128i* mask_lo,
+               __m128i* mask_hi,
+               __m128i* dst_lo,
+               __m128i* dst_hi)
+{
+    __m128i masked_src_lo, masked_src_hi;
+    __m128i masked_alpha_lo, masked_alpha_hi;
+
+    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi,
+			&masked_src_lo, &masked_src_hi);
+    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi,
+			&masked_alpha_lo, &masked_alpha_hi);
+
+    over_2x128 (&masked_src_lo, &masked_src_hi,
+		&masked_alpha_lo, &masked_alpha_hi,
+		dst_lo, dst_hi);
+}
+
+/* Load 4 pixels (16 bytes) from an address aligned on a 16-byte boundary.
+ *
+ * src is only read, so it is const-qualified like the parameter of
+ * load_128_unaligned and of _mm_load_si128 itself; callers that pass a
+ * non-const pointer are unaffected.
+ */
+static force_inline __m128i
+load_128_aligned (const __m128i* src)
+{
+    return _mm_load_si128 (src);
+}
+
+/* Load 4 pixels (16 bytes) from an address with no alignment requirement. */
+static force_inline __m128i
+load_128_unaligned (const __m128i* src)
+{
+    __m128i pixels = _mm_loadu_si128 (src);
+
+    return pixels;
+}
+
+/*
+ * Store 4 pixels to a 16-byte aligned address with a streaming
+ * (write-combining) store, _mm_stream_si128.
+ */
+static force_inline void
+save_128_write_combining (__m128i* dst, __m128i data)
+{
+    _mm_stream_si128 (dst, data);
+}
+
+/* Store 4 pixels (16 bytes) to an address aligned on a 16-byte boundary. */
+static force_inline void
+save_128_aligned (__m128i* dst, __m128i data)
+{
+    _mm_store_si128 (dst, data);
+}
+
+/* Store 4 pixels (16 bytes) to an address with no alignment requirement. */
+static force_inline void
+save_128_unaligned (__m128i* dst, __m128i data)
+{
+    _mm_storeu_si128 (dst, data);
+}
+
+/*
+ * Place one 32-bit value in the low 32 bits of a register; the remaining
+ * bits are zero.
+ */
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
+{
+    __m128i value = _mm_cvtsi32_si128 (data);
+
+    return value;
+}
+
+/*
+ * Broadcast 16-bit lane 0 to the four low lanes of data.  Only the low
+ * 64 bits are shuffled; the upper half is passed through unchanged.
+ */
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
+{
+    __m128i spread = _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+
+    return spread;
+}
+
+/*
+ * Replicate one 8-bit value into the four low 16-bit lanes of a
+ * register (each lane holds the value zero-extended).
+ */
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
+{
+    __m128i unpacked = unpack_32_1x128 ((uint32_t)data);
+
+    return _mm_shufflelo_epi16 (unpacked, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+/*
+ * Single-register form of pix_multiply_2x128.  Per 16-bit lane: the low
+ * 16 bits of data * alpha get mask_0080 added with unsigned saturation,
+ * and the high 16 bits of that sum times mask_0101 are returned.
+ */
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+		    __m128i alpha)
+{
+    __m128i product = _mm_mullo_epi16 (data, alpha);
+    __m128i biased = _mm_adds_epu16 (product, mask_0080);
+
+    return _mm_mulhi_epu16 (biased, mask_0101);
+}
+
+/*
+ * Return src * alpha_dst + dst * alpha_src (pix_multiply_1x128 products,
+ * saturating byte add) for unpacked pixels.
+ */
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+			__m128i* alpha_dst,
+			__m128i* dst,
+			__m128i* alpha_src)
+{
+    __m128i src_term = pix_multiply_1x128 (*src, *alpha_dst);
+    __m128i dst_term = pix_multiply_1x128 (*dst, *alpha_src);
+    __m128i sum = _mm_adds_epu8 (src_term, dst_term);
+
+    return sum;
+}
+
+/* Return data XORed with mask_00ff. */
+static force_inline __m128i
+negate_1x128 (__m128i data)
+{
+    const __m128i flip = mask_00ff;
+
+    return _mm_xor_si128 (data, flip);
+}
+
+/*
+ * Swap 16-bit lanes 0 and 2 of the low 64 bits of data; lanes 1 and 3
+ * and the upper half are left unchanged.
+ */
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
+{
+    __m128i swapped = _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+
+    return swapped;
+}
+
+/*
+ * OVER for unpacked pixels: return src + dst * negate (alpha), with a
+ * saturating byte add.  alpha is the expanded alpha belonging to src.
+ */
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
+{
+    __m128i inv_alpha = negate_1x128 (alpha);
+    __m128i attenuated_dst = pix_multiply_1x128 (dst, inv_alpha);
+
+    return _mm_adds_epu8 (src, attenuated_dst);
+}
+
+/*
+ * (src IN mask) OVER dst for unpacked pixels: src and alpha are both
+ * multiplied by mask before the OVER step.  The result is returned;
+ * none of the arguments is modified.
+ */
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
+{
+    __m128i masked_src = pix_multiply_1x128 (*src, *mask);
+    __m128i masked_alpha = pix_multiply_1x128 (*alpha, *mask);
+
+    return over_1x128 (masked_src, masked_alpha, *dst);
+}
+
+/*
+ * Single-register form of over_rev_non_pre_2x128: swap lanes 0 and 2 of
+ * src, multiply by (alpha | mask_alpha), then composite the result over
+ * dst using the plain expanded alpha.
+ */
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
+{
+    __m128i alpha = expand_alpha_1x128 (src);
+    __m128i swapped = invert_colors_1x128 (src);
+    __m128i scaled = pix_multiply_1x128 (swapped,
+					 _mm_or_si128 (alpha, mask_alpha));
+
+    return over_1x128 (scaled, alpha, dst);
+}
+
+/*
+ * Pack the four low 16-bit lanes of data into one 32-bit pixel, with
+ * unsigned saturation to 8 bits per lane.
+ */
+static force_inline uint32_t
+pack_1x128_32 (__m128i data)
+{
+    __m128i packed = _mm_packus_epi16 (data, _mm_setzero_si128 ());
+
+    return _mm_cvtsi128_si32 (packed);
+}
+
+/*
+ * Convert one 16-bit r5g6b5 pixel to an unpacked pixel (one channel per
+ * 16-bit lane) via unpack_565_to_8888.
+ */
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
+{
+    __m128i as_8888 = unpack_565_to_8888 (_mm_cvtsi32_si128 (pixel));
+
+    return _mm_unpacklo_epi8 (as_8888, _mm_setzero_si128 ());
+}
+
+/*
+ * OVER for a single packed pixel.
+ *
+ * Shortcuts: a source whose top byte is 0xff is returned as is, and a
+ * source that is entirely zero leaves dst unchanged.  Everything else
+ * goes through over_1x128.
+ */
+static force_inline uint32_t
+core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+    uint8_t alpha = src >> 24;
+    __m128i s;
+
+    if (alpha == 0xff)
+	return src;
+
+    if (src == 0)
+	return dst;
+
+    s = unpack_32_1x128 (src);
+
+    return pack_1x128_32 (
+	over_1x128 (s, expand_alpha_1x128 (s), unpack_32_1x128 (dst)));
+}
+
+/*
+ * Fetch one source pixel with the optional mask applied.
+ *
+ * With pm == NULL this is just *ps.  Otherwise the source pixel is
+ * multiplied, channel by channel, by the alpha (top byte) of *pm.
+ */
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+    __m128i mask_a, masked_src;
+
+    if (pm == NULL)
+	return *ps;
+
+    mask_a = expand_alpha_1x128 (unpack_32_1x128 (*pm));
+    masked_src = pix_multiply_1x128 (unpack_32_1x128 (*ps), mask_a);
+
+    return pack_1x128_32 (masked_src);
+}
+
+/*
+ * Fetch four source pixels with the optional mask applied; both pointers
+ * are read with unaligned loads.
+ *
+ * With pm == NULL the four source pixels are returned unchanged.  If the
+ * alpha bytes of all four mask pixels are zero the result is zero and
+ * the source is not read at all.  Otherwise each source pixel is
+ * multiplied by the alpha of the matching mask pixel.
+ */
+static force_inline __m128i
+combine4 (const __m128i *ps, const __m128i *pm)
+{
+    __m128i msk, msk_lo, msk_hi;
+    __m128i src_lo, src_hi;
+
+    if (!pm)
+	return load_128_unaligned (ps);
+
+    msk = load_128_unaligned (pm);
+
+    if (is_transparent (msk))
+	return _mm_setzero_si128 ();
+
+    unpack_128_2x128 (load_128_unaligned (ps), &src_lo, &src_hi);
+    unpack_128_2x128 (msk, &msk_lo, &msk_hi);
+
+    expand_alpha_2x128 (msk_lo, msk_hi, &msk_lo, &msk_hi);
+
+    pix_multiply_2x128 (&src_lo, &src_hi,
+			&msk_lo, &msk_hi,
+			&src_lo, &src_hi);
+
+    return pack_2x128_128 (src_lo, src_hi);
+}
+/*
+ * OVER combiner for the case where a mask is present: every source pixel
+ * is multiplied by the alpha of the matching mask pixel and the result is
+ * composited over the destination.
+ *
+ * pd: destination pixels, updated in place
+ * ps: source pixels
+ * pm: mask pixels; dereferenced and advanced unconditionally, so it must
+ *     not be NULL (sse2_combine_over_u only calls this when pm is set)
+ * w:  number of pixels to process
+ */
+static force_inline void
+core_combine_over_u_sse2_mask (uint32_t *	  pd,
+			       const uint32_t*    ps,
+			       const uint32_t*    pm,
+			       int                w)
+{
+    uint32_t s, d;
+
+    /* Head: one pixel at a time until pd sits on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	pm++;
+	w--;
+    }
+    /* Body: four pixels per iteration; pd is stored aligned, ps/pm are loaded unaligned */
+    while (w >= 4)
+    {
+	__m128i mask = load_128_unaligned ((__m128i *)pm);
+
+	if (!is_zero (mask)) /* an all-zero mask leaves these four dst pixels untouched */
+	{
+	    __m128i src;
+	    __m128i src_hi, src_lo;
+	    __m128i mask_hi, mask_lo;
+	    __m128i alpha_hi, alpha_lo;
+
+	    src = load_128_unaligned ((__m128i *)ps);
+	    /* top byte 0xff in both src and mask for all four pixels: store src as is */
+	    if (is_opaque (_mm_and_si128 (src, mask)))
+	    {
+		save_128_aligned ((__m128i *)pd, src);
+	    }
+	    else
+	    {
+		__m128i dst = load_128_aligned ((__m128i *)pd);
+		__m128i dst_hi, dst_lo;
+
+		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
+		unpack_128_2x128 (src, &src_lo, &src_hi);
+		/* src *= alpha (mask) */
+		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+		pix_multiply_2x128 (&src_lo, &src_hi,
+				    &mask_lo, &mask_hi,
+				    &src_lo, &src_hi);
+
+		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+		/* alpha is taken from the already masked source */
+		expand_alpha_2x128 (src_lo, src_hi,
+				    &alpha_lo, &alpha_hi);
+
+		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+		save_128_aligned (
+		    (__m128i *)pd,
+		    pack_2x128_128 (dst_lo, dst_hi));
+	    }
+	}
+
+	pm += 4;
+	ps += 4;
+	pd += 4;
+	w -= 4;
+    }
+    while (w) /* Tail: the remaining 0-3 pixels */
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	pm++;
+
+	w--;
+    }
+}
+/*
+ * OVER combiner for the case where there is no mask: composite w source
+ * pixels from ps over the destination pixels at pd, in place.
+ *
+ * Source pixels that are entirely zero leave the destination untouched;
+ * groups of four source pixels whose top bytes are all 0xff are copied
+ * straight to the destination.
+ */
+static force_inline void
+core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
+				  const uint32_t*    ps,
+				  int                w)
+{
+    uint32_t s, d;
+
+    /* Head: one pixel at a time until pd sits on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = *ps;
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	w--;
+    }
+    /* Body: four pixels per iteration; pd is accessed aligned, ps unaligned */
+    while (w >= 4)
+    {
+	__m128i src;
+	__m128i src_hi, src_lo, dst_hi, dst_lo;
+	__m128i alpha_hi, alpha_lo;
+
+	src = load_128_unaligned ((__m128i *)ps);
+
+	if (!is_zero (src)) /* four all-zero source pixels change nothing */
+	{
+	    if (is_opaque (src))
+	    {
+		save_128_aligned ((__m128i *)pd, src);
+	    }
+	    else
+	    {
+		__m128i dst = load_128_aligned ((__m128i *)pd);
+
+		unpack_128_2x128 (src, &src_lo, &src_hi);
+		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+		expand_alpha_2x128 (src_lo, src_hi,
+				    &alpha_lo, &alpha_hi);
+		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+		save_128_aligned (
+		    (__m128i *)pd,
+		    pack_2x128_128 (dst_lo, dst_hi));
+	    }
+	}
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+    }
+    while (w) /* Tail: the remaining 0-3 pixels */
+    {
+	d = *pd;
+	s = *ps;
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+
+	w--;
+    }
+}
+
+/*
+ * OVER combiner entry point: dispatch to the masked or the unmasked
+ * worker depending on whether pm is NULL.  imp and op are not used.
+ */
+static force_inline void
+sse2_combine_over_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    if (pm == NULL)
+    {
+	core_combine_over_u_sse2_no_mask (pd, ps, w);
+	return;
+    }
+
+    core_combine_over_u_sse2_mask (pd, ps, pm, w);
+}
+/*
+ * OVER_REVERSE combiner: the destination is composited over the
+ * (optionally masked) source and the result is written back to pd, i.e.
+ *     dst = dst + src * negate (alpha (dst))
+ *
+ * pm may be NULL, in which case the source is used unmasked.  imp and op
+ * are not used.
+ */
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    /* Head: one pixel at a time until pd sits on a 16-byte boundary */
+    while (w &&
+           ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (d, s); /* note the swapped roles: dst over src */
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+    /* Body: four pixels per iteration */
+    while (w >= 4)
+    {
+	/* I'm loading unaligned because I'm not sure
+	 * about the address alignment.
+	 */
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+	/* dst plays the "source" role here; the result lands in xmm_src_* */
+	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+		    &xmm_alpha_lo, &xmm_alpha_hi,
+		    &xmm_src_lo, &xmm_src_hi);
+
+	/* rebuild the 4 pixel data and save */
+	save_128_aligned ((__m128i*)pd,
+			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
+
+	w -= 4;
+	ps += 4;
+	pd += 4;
+
+	if (pm)
+	    pm += 4;
+    }
+    /* Tail: the remaining 0-3 pixels */
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+}
+
+/*
+ * Scale dst by the alpha (top byte) of src, for one packed pixel.
+ *
+ * Shortcuts: alpha 0 yields 0 and alpha 0xff yields dst unchanged.
+ */
+static force_inline uint32_t
+core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+    uint32_t alpha = src >> 24;
+
+    if (alpha == 0)
+	return 0;
+
+    if (alpha == 0xff)
+	return dst;
+
+    return pack_1x128_32 (
+	pix_multiply_1x128 (unpack_32_1x128 (dst),
+			    expand_alpha_1x128 (unpack_32_1x128 (src))));
+}
+/*
+ * IN combiner: dst = src * alpha (dst), where src is the source pixel
+ * with the optional mask applied (combine1 / combine4).
+ *
+ * pm may be NULL.  imp and op are not used.
+ */
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               pd,
+                   const uint32_t *         ps,
+                   const uint32_t *         pm,
+                   int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    /* Head: one pixel at a time until pd sits on a 16-byte boundary */
+    while (w && ((unsigned long) pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (d, s); /* s scaled by the alpha of d */
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+    /* Body: four pixels per iteration */
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); /* xmm_dst_* now holds alpha (dst) */
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned ((__m128i*)pd,
+			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+    /* Tail: the remaining 0-3 pixels */
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+/*
+ * IN_REVERSE combiner: dst = dst * alpha (src), where src is the source
+ * pixel with the optional mask applied (combine1 / combine4).
+ *
+ * pm may be NULL.  imp and op are not used.
+ */
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               pd,
+                           const uint32_t *         ps,
+                           const uint32_t *         pm,
+                           int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    /* Head: one pixel at a time until pd sits on a 16-byte boundary */
+    while (w && ((unsigned long) pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (s, d); /* d scaled by the alpha of s */
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+    /* Body: four pixels per iteration */
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); /* xmm_src_* now holds alpha (src) */
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+    /* Tail: the remaining 0-3 pixels */
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+/*
+ * OUT_REVERSE combiner: dst = dst * negate (alpha (src)), where src is
+ * the source pixel with the optional mask applied.
+ *
+ * pm may be NULL.  imp and op are not used.
+ */
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
+{
+    while (w && ((unsigned long) pd & 15)) /* Head: single pixels until pd is 16-byte aligned */
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
+
+	if (pm)
+	    pm++;
+	ps++;
+	w--;
+    }
+    /* Body: four pixels per iteration */
+    while (w >= 4)
+    {
+	__m128i xmm_src_lo, xmm_src_hi;
+	__m128i xmm_dst_lo, xmm_dst_hi;
+
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	/* xmm_src_* becomes the negated source alpha */
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
+
+	w -= 4;
+    }
+    /* Tail: the remaining 0-3 pixels */
+    while (w)
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
+	ps++;
+	if (pm)
+	    pm++;
+	w--;
+    }
+}
+/*
+ * OUT combiner: dst = src * negate (alpha (dst)), where src is the
+ * source pixel with the optional mask applied.
+ *
+ * pm may be NULL.  imp and op are not used.
+ */
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
+{
+    while (w && ((unsigned long) pd & 15)) /* Head: single pixels until pd is 16-byte aligned */
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+    /* Body: four pixels per iteration */
+    while (w >= 4)
+    {
+	__m128i xmm_src_lo, xmm_src_hi;
+	__m128i xmm_dst_lo, xmm_dst_hi;
+
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	/* xmm_dst_* becomes the negated destination alpha */
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+    /* Tail: the remaining 0-3 pixels */
+    while (w)
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+/*
+ * ATOP for one packed pixel:
+ *     src * alpha (dst) + dst * negate (alpha (src))
+ */
+static force_inline uint32_t
+core_combine_atop_u_pixel_sse2 (uint32_t src,
+                                uint32_t dst)
+{
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i inv_src_alpha = negate_1x128 (expand_alpha_1x128 (src_px));
+    __m128i dst_alpha = expand_alpha_1x128 (dst_px);
+    __m128i blended;
+
+    blended = pix_add_multiply_1x128 (&src_px, &dst_alpha,
+				      &dst_px, &inv_src_alpha);
+
+    return pack_1x128_32 (blended);
+}
+/*
+ * ATOP combiner:
+ *     dst = src * alpha (dst) + dst * negate (alpha (src))
+ * where src is the source pixel with the optional mask applied.
+ *
+ * pm may be NULL.  imp and op are not used.
+ */
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+    /* Head: one pixel at a time until pd sits on a 16-byte boundary */
+    while (w && ((unsigned long) pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+    /* Body: four pixels per iteration */
+    while (w >= 4)
+    {
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+	/* only the source alpha is negated for ATOP */
+	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+    /* Tail: the remaining 0-3 pixels */
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+/*
+ * ATOP_REVERSE for one packed pixel:
+ *     src * negate (alpha (dst)) + dst * alpha (src)
+ */
+static force_inline uint32_t
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
+                                        uint32_t dst)
+{
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i src_alpha = expand_alpha_1x128 (src_px);
+    __m128i inv_dst_alpha = negate_1x128 (expand_alpha_1x128 (dst_px));
+    __m128i blended;
+
+    blended = pix_add_multiply_1x128 (&src_px, &inv_dst_alpha,
+				      &dst_px, &src_alpha);
+
+    return pack_1x128_32 (blended);
+}
+/*
+ * ATOP_REVERSE combiner:
+ *     dst = src * negate (alpha (dst)) + dst * alpha (src)
+ * where src is the source pixel with the optional mask applied.
+ *
+ * pm may be NULL.  imp and op are not used.
+ */
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+    /* Head: one pixel at a time until pd sits on a 16-byte boundary */
+    while (w && ((unsigned long) pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+    /* Body: four pixels per iteration */
+    while (w >= 4)
+    {
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+	/* only the destination alpha is negated for ATOP_REVERSE */
+	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+    /* Tail: the remaining 0-3 pixels */
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+}
+
+/*
+ * XOR for one packed pixel:
+ *     src * negate (alpha (dst)) + dst * negate (alpha (src))
+ */
+static force_inline uint32_t
+core_combine_xor_u_pixel_sse2 (uint32_t src,
+                               uint32_t dst)
+{
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i inv_dst_alpha = negate_1x128 (expand_alpha_1x128 (dst_px));
+    __m128i inv_src_alpha = negate_1x128 (expand_alpha_1x128 (src_px));
+    __m128i blended;
+
+    blended = pix_add_multiply_1x128 (&src_px, &inv_dst_alpha,
+				      &dst_px, &inv_src_alpha);
+
+    return pack_1x128_32 (blended);
+}
+/*
+ * XOR combiner:
+ *     dst = src * negate (alpha (dst)) + dst * negate (alpha (src))
+ * where src is the source pixel with the optional mask applied.
+ *
+ * mask may be NULL.  imp and op are not used.
+ */
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int w = width;
+    uint32_t s, d;
+    uint32_t* pd = dst;
+    const uint32_t* ps = src;
+    const uint32_t* pm = mask;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+    /* Head: one pixel at a time until pd sits on a 16-byte boundary */
+    while (w && ((unsigned long) pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+    /* Body: four pixels per iteration */
+    while (w >= 4)
+    {
+	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
+	xmm_dst = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+	/* both alphas are negated for XOR */
+	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+    /* Tail: the remaining 0-3 pixels */
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+/* ADD combiner (unified alpha) over a run of `width` pixels.
+ *
+ * Each destination pixel becomes the byte-wise unsigned saturating sum
+ * (_mm_adds_epu8) of itself and the source value produced by combine1()
+ * or combine4().  Pixels are handled singly until the destination is
+ * 16-byte aligned, then four at a time, then singly for the remainder.
+ * The mask pointer is only advanced when it is non-NULL; imp and op are
+ * not referenced.
+ */
+static force_inline void
+sse2_combine_add_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    uint32_t *dst_ptr = dst;
+    const uint32_t *src_ptr = src;
+    const uint32_t *mask_ptr = mask;
+    int left = width;
+
+    /* Head: scalar pixels until dst_ptr is 16-byte aligned. */
+    for (; left && ((unsigned long)dst_ptr & 15); left--)
+    {
+	uint32_t s = combine1 (src_ptr, mask_ptr);
+	uint32_t d = *dst_ptr;
+	__m128i sum;
+
+	sum = _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d));
+	*dst_ptr++ = _mm_cvtsi128_si32 (sum);
+
+	src_ptr++;
+	if (mask_ptr)
+	    mask_ptr++;
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; left >= 4; left -= 4)
+    {
+	__m128i src4 = combine4 ((__m128i*)src_ptr, (__m128i*)mask_ptr);
+	__m128i dst4 = load_128_aligned  ((__m128i*)dst_ptr);
+
+	save_128_aligned ((__m128i*)dst_ptr, _mm_adds_epu8 (src4, dst4));
+
+	dst_ptr += 4;
+	src_ptr += 4;
+	if (mask_ptr)
+	    mask_ptr += 4;
+    }
+
+    /* Tail: remaining pixels, one at a time. */
+    for (; left; left--)
+    {
+	uint32_t s = combine1 (src_ptr, mask_ptr);
+	uint32_t d = *dst_ptr;
+	__m128i sum;
+
+	sum = _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d));
+	*dst_ptr++ = _mm_cvtsi128_si32 (sum);
+
+	src_ptr++;
+	if (mask_ptr)
+	    mask_ptr++;
+    }
+}
+
+/* SATURATE (unified alpha) for a single pixel.
+ *
+ * The top byte of src is compared with the top byte of ~dst.  When the
+ * former is larger, the unpacked source is first multiplied by the
+ * expanded value DIV_UN8 (~dst top byte, src top byte); in either case
+ * the (possibly scaled) source is then added to the unpacked destination
+ * with _mm_adds_epu16 and the sum is packed back to 32 bits.
+ */
+static force_inline uint32_t
+core_combine_saturate_u_pixel_sse2 (uint32_t src,
+                                    uint32_t dst)
+{
+    const uint32_t src_alpha = src >> 24;
+    const uint32_t dst_room = ~dst >> 24;
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+
+    if (src_alpha > dst_room)
+    {
+	/* src_alpha > dst_room >= 0 here, so the divisor is non-zero. */
+	uint32_t scale = DIV_UN8 (dst_room, src_alpha) << 24;
+
+	src_px = pix_multiply_1x128 (
+	    src_px, expand_alpha_1x128 (unpack_32_1x128 (scale)));
+    }
+
+    return pack_1x128_32 (_mm_adds_epu16 (dst_px, src_px));
+}
+
+/* SATURATE combiner (unified alpha) over a run of w pixels.
+ *
+ * Head and tail pixels go through core_combine_saturate_u_pixel_sse2().
+ * In the aligned body, four pixels are tested at once: if no source
+ * top byte exceeds the corresponding destination top byte XORed with
+ * mask_ff000000, the group is written as a plain byte-wise saturating
+ * add; otherwise the four pixels are processed individually with the
+ * single-pixel helper.  The mask pointer is only advanced when it is
+ * non-NULL; imp and op are not referenced.
+ */
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *               pd,
+                         const uint32_t *         ps,
+                         const uint32_t *         pm,
+                         int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    /* Body: groups of four pixels. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i dst4 = load_128_aligned  ((__m128i*)pd);
+	__m128i src4 = combine4 ((__m128i*)ps, (__m128i*)pm);
+	__m128i src_alpha = _mm_srli_epi32 (src4, 24);
+	__m128i dst_room = _mm_srli_epi32 (
+	    _mm_xor_si128 (dst4, mask_ff000000), 24);
+	uint32_t needs_scaling = _mm_movemask_epi8 (
+	    _mm_cmpgt_epi32 (src_alpha, dst_room));
+
+	if (!needs_scaling)
+	{
+	    /* No source alpha is greater than its destination
+	     * counterpart: a saturating add is all that is needed.
+	     */
+	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (dst4, src4));
+
+	    pd += 4;
+	    ps += 4;
+	    if (pm)
+		pm += 4;
+	}
+	else
+	{
+	    /* Some source alpha is greater than the respective
+	     * ~alpha of the destination: do the four pixels singly.
+	     */
+	    int i;
+
+	    for (i = 0; i < 4; i++)
+	    {
+		uint32_t s = combine1 (ps++, pm);
+		uint32_t d = *pd;
+
+		*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+		if (pm)
+		    pm++;
+	    }
+	}
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+/* SRC combiner, _ca variant, over a run of w pixels.
+ *
+ * Every destination pixel is overwritten with
+ * pix_multiply (source pixel, mask pixel); the previous destination
+ * contents are not read.  The mask pointer is dereferenced
+ * unconditionally.  Pixels are handled singly until pd is 16-byte
+ * aligned, then four at a time (unaligned source/mask loads, aligned
+ * store), then singly again.  imp and op are not referenced.
+ */
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	__m128i src_px = unpack_32_1x128 (*ps++);
+	__m128i mask_px = unpack_32_1x128 (*pm++);
+
+	*pd++ = pack_1x128_32 (pix_multiply_1x128 (src_px, mask_px));
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, out_lo, out_hi;
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &out_lo, &out_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (out_lo, out_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	__m128i src_px = unpack_32_1x128 (*ps++);
+	__m128i mask_px = unpack_32_1x128 (*pm++);
+
+	*pd++ = pack_1x128_32 (pix_multiply_1x128 (src_px, mask_px));
+    }
+}
+
+/* OVER, _ca variant, for a single pixel.
+ *
+ * Unpacks src, mask and dst, expands the source alpha, and hands all
+ * four values to in_over_1x128(); the result is packed to 32 bits.
+ */
+static force_inline uint32_t
+core_combine_over_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
+{
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i src_alpha = expand_alpha_1x128 (src_px);
+    __m128i mask_px = unpack_32_1x128 (mask);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i result;
+
+    result = in_over_1x128 (&src_px, &src_alpha, &mask_px, &dst_px);
+
+    return pack_1x128_32 (result);
+}
+
+/* OVER combiner, _ca variant, over a run of w pixels.
+ *
+ * Scalar head and tail pixels use core_combine_over_ca_pixel_sse2();
+ * the aligned body performs the same computation on four pixels at once
+ * through in_over_2x128().  The mask pointer is dereferenced
+ * unconditionally.  imp and op are not referenced.
+ */
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, dst_lo, dst_hi;
+	__m128i alpha_lo, alpha_hi;
+	__m128i dst4 = load_128_aligned ((__m128i*)pd);
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
+
+	/* The result is left in dst_lo/dst_hi. */
+	in_over_2x128 (&src_lo, &src_hi,
+		       &alpha_lo, &alpha_hi,
+		       &mask_lo, &mask_hi,
+		       &dst_lo, &dst_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+    }
+}
+
+/* OVER_REVERSE, _ca variant, for a single pixel.
+ *
+ * Calls over_1x128() with the unpacked destination and its expanded
+ * alpha in the first two argument slots, and pix_multiply (src, mask)
+ * in the third; the result is packed to 32 bits.
+ */
+static force_inline uint32_t
+core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
+{
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i dst_alpha = expand_alpha_1x128 (dst_px);
+    __m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (src),
+					     unpack_32_1x128 (mask));
+
+    return pack_1x128_32 (over_1x128 (dst_px, dst_alpha, masked_src));
+}
+
+/* OVER_REVERSE combiner, _ca variant, over a run of w pixels.
+ *
+ * Scalar head and tail pixels use
+ * core_combine_over_reverse_ca_pixel_sse2(); the aligned body does the
+ * same work on four pixels at once.  The mask pointer is dereferenced
+ * unconditionally.  imp and op are not referenced.
+ */
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, dst_lo, dst_hi;
+	__m128i alpha_lo, alpha_hi;
+	__m128i dst4 = load_128_aligned ((__m128i*)pd);
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	expand_alpha_2x128 (dst_lo, dst_hi, &alpha_lo, &alpha_hi);
+
+	/* mask_* now holds src * mask ... */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &mask_lo, &mask_hi);
+
+	/* ... and over_2x128() leaves its result there as well. */
+	over_2x128 (&dst_lo, &dst_hi,
+		    &alpha_lo, &alpha_hi,
+		    &mask_lo, &mask_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (mask_lo, mask_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+    }
+}
+
+/* IN combiner, _ca variant, over a run of w pixels.
+ *
+ * Each destination pixel becomes
+ *     pix_multiply (pix_multiply (src, mask), expand_alpha (dst)).
+ * Pixels are handled singly until pd is 16-byte aligned, four at a time
+ * in the body, then singly for the remainder.  The mask pointer is
+ * dereferenced unconditionally.  imp and op are not referenced.
+ */
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (s),
+						 unpack_32_1x128 (m));
+	__m128i dst_alpha = expand_alpha_1x128 (unpack_32_1x128 (d));
+
+	*pd++ = pack_1x128_32 (pix_multiply_1x128 (masked_src, dst_alpha));
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, dst_lo, dst_hi;
+	__m128i alpha_lo, alpha_hi;
+	__m128i dst4 = load_128_aligned ((__m128i*)pd);
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	/* Grab the destination alpha before dst_* is reused below. */
+	expand_alpha_2x128 (dst_lo, dst_hi, &alpha_lo, &alpha_hi);
+
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &dst_lo, &dst_hi);
+	pix_multiply_2x128 (&dst_lo, &dst_hi,
+			    &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (s),
+						 unpack_32_1x128 (m));
+	__m128i dst_alpha = expand_alpha_1x128 (unpack_32_1x128 (d));
+
+	*pd++ = pack_1x128_32 (pix_multiply_1x128 (masked_src, dst_alpha));
+    }
+}
+
+/* IN_REVERSE combiner, _ca variant, over a run of w pixels.
+ *
+ * Each destination pixel becomes
+ *     pix_multiply (dst, pix_multiply (mask, expand_alpha (src))).
+ * Pixels are handled singly until pd is 16-byte aligned, four at a time
+ * in the body, then singly for the remainder.  The mask pointer is
+ * dereferenced unconditionally.  imp and op are not referenced.
+ */
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+	__m128i factor = pix_multiply_1x128 (
+	    unpack_32_1x128 (m), expand_alpha_1x128 (unpack_32_1x128 (s)));
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (d), factor));
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, dst_lo, dst_hi;
+	__m128i alpha_lo, alpha_hi;
+	__m128i dst4 = load_128_aligned ((__m128i*)pd);
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
+
+	/* alpha_* becomes mask * source alpha. */
+	pix_multiply_2x128 (&mask_lo, &mask_hi,
+			    &alpha_lo, &alpha_hi,
+			    &alpha_lo, &alpha_hi);
+	pix_multiply_2x128 (&dst_lo, &dst_hi,
+			    &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+	__m128i factor = pix_multiply_1x128 (
+	    unpack_32_1x128 (m), expand_alpha_1x128 (unpack_32_1x128 (s)));
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (d), factor));
+    }
+}
+
+/* OUT combiner, _ca variant, over a run of w pixels.
+ *
+ * Each destination pixel becomes
+ *     pix_multiply (pix_multiply (src, mask),
+ *                   negate (expand_alpha (dst))).
+ * Pixels are handled singly until pd is 16-byte aligned, four at a time
+ * in the body, then singly for the remainder.  The mask pointer is
+ * dereferenced unconditionally.  imp and op are not referenced.
+ */
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (s),
+						 unpack_32_1x128 (m));
+	__m128i inv_dst_alpha = negate_1x128 (
+	    expand_alpha_1x128 (unpack_32_1x128 (d)));
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (masked_src, inv_dst_alpha));
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, dst_lo, dst_hi;
+	__m128i alpha_lo, alpha_hi;
+	__m128i dst4 = load_128_aligned ((__m128i*)pd);
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	/* Negated destination alpha, taken before dst_* is reused. */
+	expand_alpha_2x128 (dst_lo, dst_hi, &alpha_lo, &alpha_hi);
+	negate_2x128 (alpha_lo, alpha_hi, &alpha_lo, &alpha_hi);
+
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &dst_lo, &dst_hi);
+	pix_multiply_2x128 (&dst_lo, &dst_hi,
+			    &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (s),
+						 unpack_32_1x128 (m));
+	__m128i inv_dst_alpha = negate_1x128 (
+	    expand_alpha_1x128 (unpack_32_1x128 (d)));
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (masked_src, inv_dst_alpha));
+    }
+}
+
+/* OUT_REVERSE combiner, _ca variant, over a run of w pixels.
+ *
+ * Each destination pixel becomes
+ *     pix_multiply (dst,
+ *                   negate (pix_multiply (mask, expand_alpha (src)))).
+ * Pixels are handled singly until pd is 16-byte aligned, four at a time
+ * in the body, then singly for the remainder.  The mask pointer is
+ * dereferenced unconditionally.  imp and op are not referenced.
+ */
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+	__m128i factor = negate_1x128 (
+	    pix_multiply_1x128 (unpack_32_1x128 (m),
+				expand_alpha_1x128 (unpack_32_1x128 (s))));
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (d), factor));
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, dst_lo, dst_hi;
+	__m128i alpha_lo, alpha_hi;
+	__m128i dst4 = load_128_aligned ((__m128i*)pd);
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
+
+	/* mask_* becomes negate (mask * source alpha). */
+	pix_multiply_2x128 (&mask_lo, &mask_hi,
+			    &alpha_lo, &alpha_hi,
+			    &mask_lo, &mask_hi);
+	negate_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+
+	pix_multiply_2x128 (&dst_lo, &dst_hi,
+			    &mask_lo, &mask_hi,
+			    &dst_lo, &dst_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+	__m128i factor = negate_1x128 (
+	    pix_multiply_1x128 (unpack_32_1x128 (m),
+				expand_alpha_1x128 (unpack_32_1x128 (s))));
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (d), factor));
+    }
+}
+
+/* ATOP, _ca variant, for a single pixel.
+ *
+ * Computes pix_add_multiply_1x128 over the pairs
+ *     (dst, negate (mask * expand_alpha (src)))  and
+ *     (src * mask, expand_alpha (dst)),
+ * where "*" is pix_multiply_1x128, and packs the result to 32 bits.
+ */
+static force_inline uint32_t
+core_combine_atop_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
+{
+    __m128i mask_px = unpack_32_1x128 (mask);
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i src_alpha = expand_alpha_1x128 (src_px);
+    __m128i dst_alpha = expand_alpha_1x128 (dst_px);
+    __m128i masked_src, inv_masked_alpha;
+
+    masked_src = pix_multiply_1x128 (src_px, mask_px);
+    inv_masked_alpha = negate_1x128 (
+	pix_multiply_1x128 (mask_px, src_alpha));
+
+    return pack_1x128_32 (
+	pix_add_multiply_1x128 (&dst_px, &inv_masked_alpha,
+				&masked_src, &dst_alpha));
+}
+
+/* ATOP combiner, _ca variant, over a run of w pixels.
+ *
+ * Scalar head and tail pixels use core_combine_atop_ca_pixel_sse2();
+ * the aligned body performs the same steps on four pixels at once.
+ * The mask pointer is dereferenced unconditionally.  imp and op are
+ * not referenced.
+ */
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, dst_lo, dst_hi;
+	__m128i src_alpha_lo, src_alpha_hi;
+	__m128i dst_alpha_lo, dst_alpha_hi;
+	__m128i dst4 = load_128_aligned ((__m128i*)pd);
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &src_alpha_lo, &src_alpha_hi);
+	expand_alpha_2x128 (dst_lo, dst_hi, &dst_alpha_lo, &dst_alpha_hi);
+
+	/* src_* <- src * mask; mask_* <- negate (mask * source alpha). */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &src_lo, &src_hi);
+	pix_multiply_2x128 (&mask_lo, &mask_hi,
+			    &src_alpha_lo, &src_alpha_hi,
+			    &mask_lo, &mask_hi);
+	negate_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+
+	pix_add_multiply_2x128 (
+	    &dst_lo, &dst_hi, &mask_lo, &mask_hi,
+	    &src_lo, &src_hi, &dst_alpha_lo, &dst_alpha_hi,
+	    &dst_lo, &dst_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+    }
+}
+
+/* ATOP_REVERSE, _ca variant, for a single pixel.
+ *
+ * Computes pix_add_multiply_1x128 over the pairs
+ *     (dst, mask * expand_alpha (src))  and
+ *     (src * mask, negate (expand_alpha (dst))),
+ * where "*" is pix_multiply_1x128, and packs the result to 32 bits.
+ */
+static force_inline uint32_t
+core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
+{
+    __m128i mask_px = unpack_32_1x128 (mask);
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i inv_dst_alpha = negate_1x128 (expand_alpha_1x128 (dst_px));
+    __m128i src_alpha = expand_alpha_1x128 (src_px);
+    __m128i masked_src, masked_alpha;
+
+    masked_src = pix_multiply_1x128 (src_px, mask_px);
+    masked_alpha = pix_multiply_1x128 (mask_px, src_alpha);
+
+    return pack_1x128_32 (
+	pix_add_multiply_1x128 (&dst_px, &masked_alpha,
+				&masked_src, &inv_dst_alpha));
+}
+
+/* ATOP_REVERSE combiner, _ca variant, over a run of w pixels.
+ *
+ * Scalar head and tail pixels use
+ * core_combine_reverse_atop_ca_pixel_sse2(); the aligned body performs
+ * the same steps on four pixels at once.  The mask pointer is
+ * dereferenced unconditionally.  imp and op are not referenced.
+ */
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, dst_lo, dst_hi;
+	__m128i src_alpha_lo, src_alpha_hi;
+	__m128i dst_alpha_lo, dst_alpha_hi;
+	__m128i dst4 = load_128_aligned ((__m128i*)pd);
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &src_alpha_lo, &src_alpha_hi);
+	expand_alpha_2x128 (dst_lo, dst_hi, &dst_alpha_lo, &dst_alpha_hi);
+
+	/* src_* <- src * mask; mask_* <- mask * source alpha. */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &src_lo, &src_hi);
+	pix_multiply_2x128 (&mask_lo, &mask_hi,
+			    &src_alpha_lo, &src_alpha_hi,
+			    &mask_lo, &mask_hi);
+
+	negate_2x128 (dst_alpha_lo, dst_alpha_hi,
+		      &dst_alpha_lo, &dst_alpha_hi);
+
+	pix_add_multiply_2x128 (
+	    &dst_lo, &dst_hi, &mask_lo, &mask_hi,
+	    &src_lo, &src_hi, &dst_alpha_lo, &dst_alpha_hi,
+	    &dst_lo, &dst_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+    }
+}
+
+/* XOR, _ca variant, for a single pixel.
+ *
+ * Computes pix_add_multiply_1x128 over the pairs
+ *     (dst, negate (mask * expand_alpha (src)))  and
+ *     (src * mask, negate (expand_alpha (dst))),
+ * where "*" is pix_multiply_1x128, and packs the result to 32 bits.
+ */
+static force_inline uint32_t
+core_combine_xor_ca_pixel_sse2 (uint32_t src,
+                                uint32_t mask,
+                                uint32_t dst)
+{
+    __m128i mask_px = unpack_32_1x128 (mask);
+    __m128i src_px = unpack_32_1x128 (src);
+    __m128i dst_px = unpack_32_1x128 (dst);
+    __m128i inv_masked_alpha, masked_src, inv_dst_alpha;
+
+    inv_masked_alpha = negate_1x128 (
+	pix_multiply_1x128 (mask_px, expand_alpha_1x128 (src_px)));
+    masked_src = pix_multiply_1x128 (src_px, mask_px);
+    inv_dst_alpha = negate_1x128 (expand_alpha_1x128 (dst_px));
+
+    return pack_1x128_32 (
+	pix_add_multiply_1x128 (&dst_px, &inv_masked_alpha,
+				&masked_src, &inv_dst_alpha));
+}
+
+/* XOR combiner, _ca variant, over a run of w pixels.
+ *
+ * Scalar head and tail pixels use core_combine_xor_ca_pixel_sse2();
+ * the aligned body performs the same steps on four pixels at once.
+ * The mask pointer is dereferenced unconditionally.  imp and op are
+ * not referenced.
+ */
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, dst_lo, dst_hi;
+	__m128i src_alpha_lo, src_alpha_hi;
+	__m128i dst_alpha_lo, dst_alpha_hi;
+	__m128i dst4 = load_128_aligned ((__m128i*)pd);
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	expand_alpha_2x128 (src_lo, src_hi, &src_alpha_lo, &src_alpha_hi);
+	expand_alpha_2x128 (dst_lo, dst_hi, &dst_alpha_lo, &dst_alpha_hi);
+
+	/* src_* <- src * mask; mask_* <- mask * source alpha. */
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &src_lo, &src_hi);
+	pix_multiply_2x128 (&mask_lo, &mask_hi,
+			    &src_alpha_lo, &src_alpha_hi,
+			    &mask_lo, &mask_hi);
+
+	/* Both factors are negated for XOR. */
+	negate_2x128 (dst_alpha_lo, dst_alpha_hi,
+		      &dst_alpha_lo, &dst_alpha_hi);
+	negate_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+
+	pix_add_multiply_2x128 (
+	    &dst_lo, &dst_hi, &mask_lo, &mask_hi,
+	    &src_lo, &src_hi, &dst_alpha_lo, &dst_alpha_hi,
+	    &dst_lo, &dst_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (dst_lo, dst_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+
+	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+    }
+}
+
+/* ADD combiner, _ca variant, over a run of w pixels.
+ *
+ * Each destination pixel becomes
+ *     _mm_adds_epu8 (pix_multiply (src, mask), dst)
+ * evaluated on the unpacked representation and then packed again.
+ * Pixels are handled singly until pd is 16-byte aligned, four at a time
+ * in the body, then singly for the remainder.  The mask pointer is
+ * dereferenced unconditionally.  imp and op are not referenced.
+ */
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    /* Head: scalar pixels until pd is 16-byte aligned. */
+    for (; w && ((unsigned long)pd & 15); w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (s),
+						 unpack_32_1x128 (m));
+
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (masked_src, unpack_32_1x128 (d)));
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4)
+    {
+	__m128i src_lo, src_hi, mask_lo, mask_hi, dst_lo, dst_hi;
+	__m128i src4 = load_128_unaligned ((__m128i*)ps);
+	__m128i mask4 = load_128_unaligned ((__m128i*)pm);
+	__m128i dst4 = load_128_aligned ((__m128i*)pd);
+
+	unpack_128_2x128 (src4, &src_lo, &src_hi);
+	unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+	unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+
+	pix_multiply_2x128 (&src_lo, &src_hi,
+			    &mask_lo, &mask_hi,
+			    &src_lo, &src_hi);
+
+	src_lo = _mm_adds_epu8 (src_lo, dst_lo);
+	src_hi = _mm_adds_epu8 (src_hi, dst_hi);
+
+	save_128_aligned ((__m128i*)pd, pack_2x128_128 (src_lo, src_hi));
+
+	pd += 4;
+	ps += 4;
+	pm += 4;
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--)
+    {
+	uint32_t s = *ps++;
+	uint32_t m = *pm++;
+	uint32_t d = *pd;
+	__m128i masked_src = pix_multiply_1x128 (unpack_32_1x128 (s),
+						 unpack_32_1x128 (m));
+
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (masked_src, unpack_32_1x128 (d)));
+    }
+}
+
+/* Return a 128-bit vector whose eight 16-bit lanes all hold `mask`
+ * (a thin wrapper around _mm_set1_epi16).
+ */
+static force_inline __m128i
+create_mask_16_128 (uint16_t mask)
+{
+    __m128i replicated = _mm_set1_epi16 (mask);
+
+    return replicated;
+}
+
+/* create_mask_2x32_128 (mask0, mask1) yields
+ * _mm_set_epi32 (mask0, mask1, mask0, mask1), i.e. the two 32-bit words
+ * repeated twice across the 128-bit vector.
+ *
+ * Work around a code generation bug in Sun Studio 12: for that compiler
+ * (__SUNPRO_C >= 0x590) this is provided as a macro instead of a
+ * force_inline function.  Both forms issue the same intrinsic call; note
+ * that the macro form expands each argument twice.
+ */
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
+# define create_mask_2x32_128(mask0, mask1)				\
+    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+#else
+static force_inline __m128i
+create_mask_2x32_128 (uint32_t mask0,
+                      uint32_t mask1)
+{
+    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
+}
+#endif
+
+/* Composite a solid source OVER a destination addressed as uint32_t
+ * pixels.
+ *
+ * The solid value comes from _pixman_image_get_solid(); when it is 0 the
+ * function returns without touching the destination.  Otherwise every
+ * row of the width x height rectangle is processed: single pixels until
+ * the row pointer is 16-byte aligned, four pixels per iteration with
+ * aligned loads/stores, then the remaining pixels singly.  Each pixel is
+ * replaced by over (solid, solid alpha, pixel).
+ */
+static void
+sse2_composite_over_n_8888 (pixman_implementation_t *imp,
+                            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t *dst_line;
+    int dst_stride;
+    __m128i solid, solid_alpha;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    solid = expand_pixel_32_1x128 (src);
+    solid_alpha = expand_alpha_1x128 (solid);
+
+    while (height--)
+    {
+	uint32_t *dst = dst_line;
+	int32_t w = width;
+
+	dst_line += dst_stride;
+
+	/* Head: scalar pixels until dst is 16-byte aligned. */
+	for (; w && ((unsigned long)dst & 15); w--)
+	{
+	    uint32_t d = *dst;
+
+	    *dst++ = pack_1x128_32 (
+		over_1x128 (solid, solid_alpha, unpack_32_1x128 (d)));
+	}
+
+	/* Body: four pixels per iteration. */
+	for (; w >= 4; w -= 4)
+	{
+	    __m128i dst_lo, dst_hi;
+	    __m128i dst4 = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (dst4, &dst_lo, &dst_hi);
+
+	    over_2x128 (&solid, &solid,
+			&solid_alpha, &solid_alpha,
+			&dst_lo, &dst_hi);
+
+	    /* Rebuild the four-pixel vector and store it. */
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (dst_lo, dst_hi));
+
+	    dst += 4;
+	}
+
+	/* Tail: remaining pixels of the row. */
+	for (; w; w--)
+	{
+	    uint32_t d = *dst;
+
+	    *dst++ = pack_1x128_32 (
+		over_1x128 (solid, solid_alpha, unpack_32_1x128 (d)));
+	}
+    }
+}
+
+/* Composite a solid source OVER a destination addressed as uint16_t
+ * pixels (expanded/packed with the 565 helpers).
+ *
+ * The solid value comes from _pixman_image_get_solid(); when it is 0 the
+ * function returns without touching the destination.  Each row is
+ * processed as: single pixels until the row pointer is 16-byte aligned,
+ * eight pixels per iteration (one aligned 128-bit load/store), then the
+ * remaining pixels singly.
+ */
+static void
+sse2_composite_over_n_0565 (pixman_implementation_t *imp,
+                            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t *dst_line;
+    int dst_stride;
+    __m128i solid, solid_alpha;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    solid = expand_pixel_32_1x128 (src);
+    solid_alpha = expand_alpha_1x128 (solid);
+
+    while (height--)
+    {
+	uint16_t *dst = dst_line;
+	int32_t w = width;
+
+	dst_line += dst_stride;
+
+	/* Head: scalar pixels until dst is 16-byte aligned. */
+	for (; w && ((unsigned long)dst & 15); w--)
+	{
+	    uint16_t d = *dst;
+
+	    *dst++ = pack_565_32_16 (
+		pack_1x128_32 (over_1x128 (solid,
+					   solid_alpha,
+					   expand565_16_1x128 (d))));
+	}
+
+	/* Body: eight 16-bit pixels per iteration. */
+	for (; w >= 8; w -= 8)
+	{
+	    __m128i px0, px1, px2, px3;
+	    __m128i dst8 = load_128_aligned ((__m128i*)dst);
+
+	    unpack_565_128_4x128 (dst8, &px0, &px1, &px2, &px3);
+
+	    over_2x128 (&solid, &solid,
+			&solid_alpha, &solid_alpha,
+			&px0, &px1);
+	    over_2x128 (&solid, &solid,
+			&solid_alpha, &solid_alpha,
+			&px2, &px3);
+
+	    dst8 = pack_565_4x128_128 (&px0, &px1, &px2, &px3);
+
+	    save_128_aligned ((__m128i*)dst, dst8);
+
+	    dst += 8;
+	}
+
+	/* Tail: remaining pixels of the row. */
+	for (; w; w--)
+	{
+	    uint16_t d = *dst;
+
+	    *dst++ = pack_565_32_16 (
+		pack_1x128_32 (over_1x128 (solid, solid_alpha,
+					   expand565_16_1x128 (d))));
+	}
+    }
+
+}
+
+/* ADD a solid source, multiplied by a 32-bit-per-pixel mask, onto a
+ * destination addressed as uint32_t pixels.
+ *
+ * The solid value comes from _pixman_image_get_solid(); when it is 0 the
+ * function returns without touching the destination.  For each pixel
+ * whose mask word is non-zero the destination becomes
+ *     _mm_adds_epu8 (pix_multiply (mask, solid), dst);
+ * pixels with a zero mask word are left untouched.  Rows are processed
+ * as a scalar head (until the destination pointer is 16-byte aligned),
+ * a four-pixel body, and a scalar tail.
+ */
+static void
+sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+				   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t *dst_line, *mask_line;
+    int dst_stride, mask_stride;
+    __m128i solid;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    /* Solid pixel widened to 16 bits per channel, used by both the
+     * scalar and the vector paths.
+     */
+    solid = _mm_unpacklo_epi8 (
+	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+
+    while (height--)
+    {
+	const uint32_t *pm = (uint32_t *)mask_line;
+	uint32_t *pd = (uint32_t *)dst_line;
+	int w = width;
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+
+	/* Head: scalar pixels until pd is 16-byte aligned. */
+	for (; w && ((unsigned long)pd & 15); w--, pd++)
+	{
+	    uint32_t m = *pm++;
+
+	    if (m)
+	    {
+		__m128i mask_px = unpack_32_1x128 (m);
+		__m128i dst_px = unpack_32_1x128 (*pd);
+
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mask_px, solid),
+				   dst_px));
+	    }
+	}
+
+	/* Body: four pixels per iteration. */
+	for (; w >= 4; w -= 4, pd += 4, pm += 4)
+	{
+	    __m128i mask_lo, mask_hi, dst4, product;
+	    __m128i mask4 = load_128_unaligned ((__m128i*)pm);
+	    uint32_t zero_lanes = _mm_movemask_epi8 (
+		_mm_cmpeq_epi32 (mask4, _mm_setzero_si128 ()));
+
+	    /* 0xffff means all four mask words are zero: skip the group. */
+	    if (zero_lanes == 0xffff)
+		continue;
+
+	    dst4 = load_128_aligned ((__m128i*)pd);
+
+	    unpack_128_2x128 (mask4, &mask_lo, &mask_hi);
+
+	    pix_multiply_2x128 (&solid, &solid,
+				&mask_lo, &mask_hi,
+				&mask_lo, &mask_hi);
+	    product = pack_2x128_128 (mask_lo, mask_hi);
+
+	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (product, dst4));
+	}
+
+	/* Tail: remaining pixels of the row. */
+	for (; w; w--, pd++)
+	{
+	    uint32_t m = *pm++;
+
+	    if (m)
+	    {
+		__m128i mask_px = unpack_32_1x128 (m);
+		__m128i dst_px = unpack_32_1x128 (*pd);
+
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mask_px, solid),
+				   dst_px));
+	    }
+	}
+    }
+
+}
+/* Solid source, per-pixel 32-bit mask, 32-bit dest; blended via in_over_*. */
+static void
+sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    /* A zero solid source returns early; the destination is not touched. */
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+    /* Widen the source bytes to 16-bit lanes, then derive its alpha vector. */
+    xmm_src = _mm_unpacklo_epi8 (
+	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	int w = width;
+	const uint32_t *pm = (uint32_t *)mask_line;
+	uint32_t *pd = (uint32_t *)dst_line;
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+	/* Head: one pixel at a time until pd is 16-byte aligned. */
+	while (w && (unsigned long)pd & 15)
+	{
+	    m = *pm++;
+
+	    if (m) /* a zero mask pixel leaves *pd unchanged */
+	    {
+		d = *pd;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
+		                                  &mmx_alpha,
+		                                  &mmx_mask,
+		                                  &mmx_dest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+	/* Body: four pixels per pass; mask load unaligned, dest access aligned. */
+	while (w >= 4)
+	{
+	    xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+	    pack_cmp =
+		_mm_movemask_epi8 (
+		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    /* pack_cmp is 0xffff only when all four mask pixels are zero */
+	    if (pack_cmp != 0xffff)
+	    {
+		xmm_dst = load_128_aligned ((__m128i*)pd);
+
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    pd += 4;
+	    pm += 4;
+	    w -= 4;
+	}
+	/* Tail: the remaining (fewer than four) pixels, one at a time. */
+	while (w)
+	{
+	    m = *pm++;
+
+	    if (m) /* a zero mask pixel leaves *pd unchanged */
+	    {
+		d = *pd;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*pd = pack_1x128_32 (
+		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+    }
+
+}
+/* Blends 32-bit src over 32-bit dest through a solid mask, of which only
+ * the top byte is used; source pixels equal to zero leave dest as it is. */
+static void
+sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int32_t w;
+    int dst_stride, src_stride;
+
+    __m128i xmm_mask;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
+    /* Only the top byte of the solid mask (mask >> 24) enters the blend. */
+    xmm_mask = create_mask_16_128 (mask >> 24);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    uint32_t s = *src++;
+
+	    if (s)
+	    {
+		uint32_t d = *dst;
+
+		__m128i mmx_src   = unpack_32_1x128 (s);
+		__m128i mmx_alpha = expand_alpha_1x128 (mmx_src);
+		__m128i mmx_mask  = xmm_mask;
+		__m128i mmx_dest  = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+	    }
+	    dst++;
+	    w--;
+	}
+	/* Body: four pixels per pass; src load unaligned, dst access aligned. */
+	while (w >= 4)
+	{
+	    xmm_src = load_128_unaligned ((__m128i*)src);
+
+	    if (!is_zero (xmm_src))
+	    {
+		xmm_dst = load_128_aligned ((__m128i*)dst);
+
+		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				    &xmm_alpha_lo, &xmm_alpha_hi);
+
+		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			       &xmm_alpha_lo, &xmm_alpha_hi,
+			       &xmm_mask, &xmm_mask,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+	}
+	/* Tail: the remaining (fewer than four) pixels, one at a time. */
+	while (w)
+	{
+	    uint32_t s = *src++;
+
+	    if (s)
+	    {
+		uint32_t d = *dst;
+
+		__m128i mmx_src   = unpack_32_1x128 (s);
+		__m128i mmx_alpha = expand_alpha_1x128 (mmx_src);
+		__m128i mmx_mask  = xmm_mask;
+		__m128i mmx_dest  = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+	    }
+
+	    dst++;
+	    w--;
+	}
+    }
+}
+/* Copies 32-bit pixels from src to dest, forcing each top byte to 0xff. */
+static void
+sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
+			      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int dst_stride, src_stride;
+    uint32_t *dst_row, *src_row;
+    uint32_t *out, *in;
+    int32_t left;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+
+    for (; height; height--)
+    {
+	out = dst_row;
+	in = src_row;
+	dst_row += dst_stride;
+	src_row += src_stride;
+
+	/* Head: single pixels until the output pointer is 16-byte aligned. */
+	for (left = width; left && ((unsigned long)out & 15); left--)
+	{
+	    *out++ = *in++ | 0xff000000;
+	}
+
+	/*
+	 * Body: sixteen pixels (four vectors) per pass.  All four loads are
+	 * issued before the first store; the input may be unaligned, the
+	 * output stores are aligned.
+	 */
+	for (; left >= 16; left -= 16)
+	{
+	    __m128i *vin = (__m128i*)in;
+	    __m128i *vout = (__m128i*)out;
+	    __m128i v0 = load_128_unaligned (vin + 0);
+	    __m128i v1 = load_128_unaligned (vin + 1);
+	    __m128i v2 = load_128_unaligned (vin + 2);
+	    __m128i v3 = load_128_unaligned (vin + 3);
+
+	    save_128_aligned (vout + 0, _mm_or_si128 (v0, mask_ff000000));
+	    save_128_aligned (vout + 1, _mm_or_si128 (v1, mask_ff000000));
+	    save_128_aligned (vout + 2, _mm_or_si128 (v2, mask_ff000000));
+	    save_128_aligned (vout + 3, _mm_or_si128 (v3, mask_ff000000));
+	    in += 16;
+	    out += 16;
+	}
+
+	/* Tail: the remaining (fewer than sixteen) pixels, one at a time. */
+	for (; left; left--)
+	{
+	    *out++ = *in++ | 0xff000000;
+	}
+    }
+}
+/* Blends 32-bit src, with its top byte forced to 0xff, over 32-bit dest
+ * through a solid mask of which only the top byte (mask >> 24) is used. */
+static void
+sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    __m128i xmm_mask, xmm_alpha;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
+    /* The source alpha vector is the constant mask_00ff for every pixel. */
+    xmm_mask = create_mask_16_128 (mask >> 24);
+    xmm_alpha = mask_00ff;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    uint32_t s = (*src++) | 0xff000000;
+	    uint32_t d = *dst;
+
+	    __m128i mmx_src   = unpack_32_1x128 (s);
+	    __m128i mmx_alpha = xmm_alpha;
+	    __m128i mmx_mask  = xmm_mask;
+	    __m128i mmx_dest  = unpack_32_1x128 (d);
+
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+
+	    w--;
+	}
+	/* Body: four pixels per pass; src load unaligned, dst access aligned. */
+	while (w >= 4)
+	{
+	    xmm_src = _mm_or_si128 (
+		load_128_unaligned ((__m128i*)src), mask_ff000000);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			   &xmm_alpha, &xmm_alpha,
+			   &xmm_mask, &xmm_mask,
+			   &xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+
+	}
+	/* Tail: the remaining (fewer than four) pixels, one at a time. */
+	while (w)
+	{
+	    uint32_t s = (*src++) | 0xff000000;
+	    uint32_t d = *dst;
+
+	    __m128i mmx_src   = unpack_32_1x128 (s);
+	    __m128i mmx_alpha = xmm_alpha;
+	    __m128i mmx_mask  = xmm_mask;
+	    __m128i mmx_dest  = unpack_32_1x128 (d);
+
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+
+	    w--;
+	}
+    }
+}
+/* Runs sse2_combine_over_u on each scanline of a 32-bit src/dest pair. */
+static void
+sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int dst_stride, src_stride;
+    uint32_t *dst_row;
+    uint32_t *src_row;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+
+    /* One combiner call per row, with NULL as its fifth argument
+     * (presumably the mask).  Both row pointers then advance by their
+     * own stride, which is counted in uint32_t elements. */
+    for (; height; height--)
+    {
+	sse2_combine_over_u (imp, op, dst_row, src_row, NULL, width);
+
+	dst_row += dst_stride;
+	src_row += src_stride;
+    }
+}
+/* Blends one 32-bit pixel over one 16-bit (565) pixel via over_1x128. */
+static force_inline uint16_t
+composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
+{
+    /* Widen both operands, blend, then narrow back in two steps. */
+    __m128i s_wide  = unpack_32_1x128 (src);
+    __m128i s_alpha = expand_alpha_1x128 (s_wide);
+    __m128i d_wide  = expand565_16_1x128 (dst);
+    uint32_t blended = pack_1x128_32 (over_1x128 (s_wide, s_alpha, d_wide));
+
+    return pack_565_32_16 (blended);
+}
+/* Blends 32-bit source pixels over a 16-bit (565) dest, 8 pixels per pass. */
+static void
+sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	src = src_line;
+
+	dst_line += dst_stride;
+	src_line += src_stride;
+	w = width;
+
+	/* Align dst on a 16-byte boundary */
+	while (w &&
+	       ((unsigned long)dst & 15))
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = composite_over_8888_0565pixel (s, d);
+	    w--;
+	}
+
+	/* It's a 8 pixel loop */
+	while (w >= 8)
+	{
+	    /* I'm loading unaligned because I'm not sure
+	     * about the address alignment.
+	     */
+	    xmm_src = load_128_unaligned ((__m128i*) src);
+	    xmm_dst = load_128_aligned ((__m128i*) dst);
+
+	    /* Unpacking */
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				&xmm_alpha_lo, &xmm_alpha_hi);
+
+	    /* Load the next 4 source pixels from memory early,
+	     * before they are needed, to optimize the memory read.
+	     */
+	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
+
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst0, &xmm_dst1);
+
+	    /* Unpacking */
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				&xmm_alpha_lo, &xmm_alpha_hi);
+
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst2, &xmm_dst3);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    dst += 8;
+	    src += 8;
+	}
+	/* Tail: fewer than 8 pixels remain; finish them one at a time. */
+	while (w--)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = composite_over_8888_0565pixel (s, d);
+	}
+    }
+
+}
+/* Solid source through an 8-bit mask onto a 32-bit dest (in_over_* blend). */
+static void
+sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m, d;
+
+    __m128i xmm_src, xmm_alpha, xmm_def;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24; /* top byte of the solid source */
+    if (src == 0) /* zero source: return without touching the destination */
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    /* xmm_def is what the fast path below stores without reading dst. */
+    xmm_def = create_mask_2x32_128 (src, src);
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    uint8_t m = *mask++; /* shadows the 32-bit m declared above */
+
+	    if (m) /* a zero mask byte leaves *dst unchanged */
+	    {
+		d = *dst;
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+		                                   &mmx_alpha,
+		                                   &mmx_mask,
+		                                   &mmx_dest));
+	    }
+
+	    w--;
+	    dst++;
+	}
+	/* Body: four mask bytes are read as one 32-bit word per pass. */
+	while (w >= 4)
+	{
+	    m = *((uint32_t*)mask); /* NOTE(review): mask may be unaligned here */
+
+	    if (srca == 0xff && m == 0xffffffff)
+	    {   /* top source byte 0xff and all four mask bytes 0xff */
+		save_128_aligned ((__m128i*)dst, xmm_def);
+	    }
+	    else if (m) /* when m == 0 nothing is written for these pixels */
+	    {
+		xmm_dst = load_128_aligned ((__m128i*) dst);
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    mask += 4;
+	}
+	/* Tail: the remaining (fewer than four) pixels, one at a time. */
+	while (w)
+	{
+	    uint8_t m = *mask++; /* shadows the 32-bit m declared above */
+
+	    if (m) /* a zero mask byte leaves *dst unchanged */
+	    {
+		d = *dst;
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+		                                   &mmx_alpha,
+		                                   &mmx_mask,
+		                                   &mmx_dest));
+	    }
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+/* Fills the width x height rectangle at (x, y) of a bits/stride buffer with a
+ * constant.  stride is in uint32_t units; bpp must be 8, 16 or 32, otherwise
+ * FALSE is returned and nothing is written.  For 8 and 16 bpp the low bits of
+ * data are replicated across all 32 bits.  Returns TRUE after filling. */
+static pixman_bool_t
+pixman_fill_sse2 (uint32_t *bits,
+                  int       stride,
+                  int       bpp,
+                  int       x,
+                  int       y,
+                  int       width,
+                  int       height,
+                  uint32_t  data)
+{
+    uint32_t byte_width;
+    uint8_t         *byte_line;
+    __m128i xmm_def;
+
+    if (bpp == 8)
+    {
+	uint32_t b; /* 32-bit unsigned so the shifts below cannot overflow: */
+	uint32_t w; /* a uint16_t promotes to int, making w << 16 undefined */
+
+	stride = stride * (int) sizeof (uint32_t) / 1;
+	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+	byte_width = width;
+	stride *= 1;
+
+	b = data & 0xff;
+	w = (b << 8) | b;
+	data = (w << 16) | w;
+    }
+    else if (bpp == 16)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 2;
+	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+	byte_width = 2 * width;
+	stride *= 2;
+
+	data = (data & 0xffff) * 0x00010001;
+    }
+    else if (bpp == 32)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 4;
+	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+	byte_width = 4 * width;
+	stride *= 4;
+    }
+    else
+    {
+	return FALSE;
+    }
+
+    xmm_def = create_mask_2x32_128 (data, data);
+
+    while (height--)
+    {
+	int w;
+	uint8_t *d = byte_line;
+	byte_line += stride;
+	w = byte_width;
+	/* Byte, then 16-bit, then 32-bit stores until d is 16-byte aligned. */
+	while (w >= 1 && ((unsigned long)d & 1))
+	{
+	    *(uint8_t *)d = data;
+	    w -= 1;
+	    d += 1;
+	}
+
+	while (w >= 2 && ((unsigned long)d & 3))
+	{
+	    *(uint16_t *)d = data;
+	    w -= 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((unsigned long)d & 15))
+	{
+	    *(uint32_t *)d = data;
+	    w -= 4;
+	    d += 4;
+	}
+	/* Aligned 16-byte stores: 128 bytes per pass, then 64/32/16 once. */
+	while (w >= 128)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
+
+	    d += 128;
+	    w -= 128;
+	}
+
+	if (w >= 64)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
+
+	    d += 64;
+	    w -= 64;
+	}
+
+	if (w >= 32)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+
+	    d += 32;
+	    w -= 32;
+	}
+
+	if (w >= 16)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+
+	    d += 16;
+	    w -= 16;
+	}
+	/* Tail: narrowing 32-, 16- and 8-bit stores for what is left. */
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = data;
+	    w -= 4;
+	    d += 4;
+	}
+
+	if (w >= 2)
+	{
+	    *(uint16_t *)d = data;
+	    w -= 2;
+	    d += 2;
+	}
+
+	if (w >= 1)
+	{
+	    *(uint8_t *)d = data;
+	    w -= 1;
+	    d += 1;
+	}
+    }
+
+    return TRUE;
+}
+/* Writes pix_multiply (solid src, 8-bit mask) to a 32-bit dest; dest unread. */
+static void
+sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m;
+
+    __m128i xmm_src, xmm_def;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    /* A zero source is handled by filling the rectangle with 0 instead. */
+    srca = src >> 24;
+    if (src == 0)
+    {
+	pixman_fill_sse2 (dest_image->bits.bits, dest_image->bits.rowstride,
+	                  PIXMAN_FORMAT_BPP (dest_image->bits.format),
+	                  dest_x, dest_y, width, height, 0);
+	return;
+    }
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    /* xmm_def is what the fast path below stores directly. */
+    xmm_def = create_mask_2x32_128 (src, src);
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    uint8_t m = *mask++; /* shadows the 32-bit m declared above */
+
+	    if (m)
+	    {
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
+	    }
+	    else /* a zero mask byte writes 0 to the destination pixel */
+	    {
+		*dst = 0;
+	    }
+
+	    w--;
+	    dst++;
+	}
+	/* Body: four mask bytes are read as one 32-bit word per pass. */
+	while (w >= 4)
+	{
+	    m = *((uint32_t*)mask); /* NOTE(review): mask may be unaligned here */
+
+	    if (srca == 0xff && m == 0xffffffff)
+	    {   /* top source byte 0xff and all four mask bytes 0xff */
+		save_128_aligned ((__m128i*)dst, xmm_def);
+	    }
+	    else if (m)
+	    {
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		pix_multiply_2x128 (&xmm_src, &xmm_src,
+				    &xmm_mask_lo, &xmm_mask_hi,
+				    &xmm_mask_lo, &xmm_mask_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+	    }
+	    else /* all four mask bytes zero: store four zero pixels */
+	    {
+		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    mask += 4;
+	}
+	/* Tail: the remaining (fewer than four) pixels, one at a time. */
+	while (w)
+	{
+	    uint8_t m = *mask++; /* shadows the 32-bit m declared above */
+
+	    if (m)
+	    {
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (
+			xmm_src, expand_pixel_8_1x128 (m)));
+	    }
+	    else /* a zero mask byte writes 0 to the destination pixel */
+	    {
+		*dst = 0;
+	    }
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+/* Solid source through an 8-bit mask onto a 16-bit (565) dest, 8 px a pass. */
+static void
+sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    /* A zero solid source returns early; the destination is not touched. */
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    m = *mask++;
+
+	    if (m) /* a zero mask byte leaves *dst unchanged */
+	    {
+		d = *dst;
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	}
+	/* Body: 8 dest pixels per pass, as two groups of four mask bytes. */
+	while (w >= 8)
+	{
+	    xmm_dst = load_128_aligned ((__m128i*) dst);
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+	    m = *((uint32_t*)mask); /* mask bytes 0..3; NOTE(review): alignment */
+	    mask += 4;
+
+	    if (m) /* skipped when all four are zero: xmm_dst0/1 stay as loaded */
+	    {
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst0, &xmm_dst1);
+	    }
+
+	    m = *((uint32_t*)mask); /* mask bytes 4..7; NOTE(review): alignment */
+	    mask += 4;
+
+	    if (m) /* skipped when all four are zero: xmm_dst2/3 stay as loaded */
+	    {
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst2, &xmm_dst3);
+	    }
+	    /* The repacked pixels are stored even if both mask words were zero. */
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    dst += 8;
+	}
+	/* Tail: the remaining (fewer than eight) pixels, one at a time. */
+	while (w)
+	{
+	    m = *mask++;
+
+	    if (m) /* a zero mask byte leaves *dst unchanged */
+	    {
+		d = *dst;
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+/* Blends 32-bit source via over_rev_non_pre_* onto a 16-bit (565) dest. */
+static void
+sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t opaque, zero;
+
+    __m128i ms;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    /* NOTE(review): helper names suggest a non-premultiplied source; confirm. */
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    ms = unpack_32_1x128 (s);
+
+	    *dst++ = pack_565_32_16 (
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+	    w--;
+	}
+	/* Body: 8 dest pixels per pass, in two rounds of four source pixels. */
+	while (w >= 8)
+	{
+	    /* First round */
+	    xmm_src = load_128_unaligned ((__m128i*)src);
+	    xmm_dst = load_128_aligned  ((__m128i*)dst);
+
+	    opaque = is_opaque (xmm_src);
+	    zero = is_zero (xmm_src);
+
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+	    /* preload next round */
+	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
+
+	    if (opaque) /* flags refer to the first four source pixels */
+	    {
+		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+				     &xmm_dst0, &xmm_dst1);
+	    }
+	    else if (!zero) /* is_zero() set: xmm_dst0/1 stay as loaded */
+	    {
+		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+					&xmm_dst0, &xmm_dst1);
+	    }
+
+	    /* Second round */
+	    opaque = is_opaque (xmm_src);
+	    zero = is_zero (xmm_src);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+	    if (opaque) /* flags now refer to source pixels 4..7 */
+	    {
+		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+				     &xmm_dst2, &xmm_dst3);
+	    }
+	    else if (!zero) /* is_zero() set: xmm_dst2/3 stay as loaded */
+	    {
+		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+					&xmm_dst2, &xmm_dst3);
+	    }
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    src += 8;
+	    dst += 8;
+	}
+	/* Tail: the remaining (fewer than eight) pixels, one at a time. */
+	while (w)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    ms = unpack_32_1x128 (s);
+
+	    *dst++ = pack_565_32_16 (
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+	    w--;
+	}
+    }
+
+}
+/* Blends 32-bit source via over_rev_non_pre_* onto a 32-bit dest. */
+static void
+sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t opaque, zero;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    /* NOTE(review): helper names suggest a non-premultiplied source; confirm. */
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && (unsigned long)dst & 15)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+	    w--;
+	}
+	/* Body: four pixels per pass; src load unaligned, dst access aligned. */
+	while (w >= 4)
+	{
+	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
+
+	    opaque = is_opaque (xmm_src_hi);
+	    zero = is_zero (xmm_src_hi);
+
+	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+	    if (opaque) /* dst is not loaded on this path, only written */
+	    {
+		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+				     &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+	    else if (!zero) /* is_zero() set: these four dst pixels are skipped */
+	    {
+		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
+
+		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+					&xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    src += 4;
+	}
+	/* Tail: the remaining (fewer than four) pixels, one at a time. */
+	while (w)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+	    w--;
+	}
+    }
+
+}
+/* Solid source, per-pixel 32-bit mask, 16-bit (565) dest; in_over_* blend. */
+static void
+sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int w;
+    uint32_t pack_cmp;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    /* A zero solid source returns early; the destination is not touched. */
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	w = width;
+	mask = mask_line;
+	dst = dst_line;
+	mask_line += mask_stride;
+	dst_line += dst_stride;
+	/* Head: one pixel at a time until dst is 16-byte aligned. */
+	while (w && ((unsigned long)dst & 15))
+	{
+	    m = *(uint32_t *) mask;
+
+	    if (m) /* a zero mask pixel leaves *dst unchanged */
+	    {
+		d = *dst;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	    mask++;
+	}
+	/* Body: 8 dest pixels per pass, in two rounds of four mask pixels. */
+	while (w >= 8)
+	{
+	    /* First round */
+	    xmm_mask = load_128_unaligned ((__m128i*)mask);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    pack_cmp = _mm_movemask_epi8 (
+		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+	    /* preload next round */
+	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
+
+	    /* first round blend; skipped when all four mask pixels are zero */
+	    if (pack_cmp != 0xffff)
+	    {
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst0, &xmm_dst1);
+	    }
+
+	    /* Second round */
+	    pack_cmp = _mm_movemask_epi8 (
+		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+	    if (pack_cmp != 0xffff) /* 0xffff: mask pixels 4..7 are all zero */
+	    {
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst2, &xmm_dst3);
+	    }
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    dst += 8;
+	    mask += 8;
+	}
+	/* Tail: the remaining (fewer than eight) pixels, one at a time. */
+	while (w)
+	{
+	    m = *(uint32_t *) mask;
+
+	    if (m) /* a zero mask pixel leaves *dst unchanged */
+	    {
+		d = *dst;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	    mask++;
+	}
+    }
+
+}
+
+/*
+ * IN with a solid source, an 8-bit mask and an 8-bit destination.
+ *
+ * The solid colour is reduced to a single vector by
+ * expand_alpha_1x128 (expand_pixel_32_1x128 (solid)); every destination
+ * byte is then replaced by
+ *     pix_multiply (pix_multiply (alpha, mask), dest).
+ */
+static void
+sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_row, *dst_px;
+    uint8_t *mask_row, *mask_px;
+    int dst_stride, mask_stride;
+    uint32_t solid;
+    int32_t left;
+    __m128i vec_alpha;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_row, 1);
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    vec_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (solid));
+
+    while (height--)
+    {
+	dst_px = dst_row;
+	mask_px = mask_row;
+	dst_row += dst_stride;
+	mask_row += mask_stride;
+
+	/* Head: one pixel at a time until dst sits on a 16-byte boundary. */
+	for (left = width; left && ((unsigned long)dst_px & 15); left--)
+	{
+	    __m128i vm = unpack_32_1x128 ((uint32_t) *mask_px++);
+	    __m128i vd = unpack_32_1x128 ((uint32_t) *dst_px);
+
+	    vm = pix_multiply_1x128 (vec_alpha, vm);
+	    *dst_px++ = (uint8_t) pack_1x128_32 (pix_multiply_1x128 (vm, vd));
+	}
+
+	/* Body: 16 pixels per iteration, aligned destination accesses. */
+	for (; left >= 16; left -= 16, mask_px += 16, dst_px += 16)
+	{
+	    __m128i m_lo, m_hi, d_lo, d_hi;
+	    __m128i m_all = load_128_unaligned ((__m128i*)mask_px);
+	    __m128i d_all = load_128_aligned ((__m128i*)dst_px);
+
+	    unpack_128_2x128 (m_all, &m_lo, &m_hi);
+	    unpack_128_2x128 (d_all, &d_lo, &d_hi);
+
+	    /* mask = alpha * mask */
+	    pix_multiply_2x128 (&vec_alpha, &vec_alpha,
+				&m_lo, &m_hi,
+				&m_lo, &m_hi);
+
+	    /* dest = mask * dest */
+	    pix_multiply_2x128 (&m_lo, &m_hi,
+				&d_lo, &d_hi,
+				&d_lo, &d_hi);
+
+	    save_128_aligned ((__m128i*)dst_px, pack_2x128_128 (d_lo, d_hi));
+	}
+
+	/* Tail: whatever is left, again one pixel at a time. */
+	for (; left; left--)
+	{
+	    __m128i vm = unpack_32_1x128 ((uint32_t) *mask_px++);
+	    __m128i vd = unpack_32_1x128 ((uint32_t) *dst_px);
+
+	    vm = pix_multiply_1x128 (vec_alpha, vm);
+	    *dst_px++ = (uint8_t) pack_1x128_32 (pix_multiply_1x128 (vm, vd));
+	}
+    }
+}
+
+/*
+ * IN with a solid source, no mask and an 8-bit destination.
+ *
+ * Only the top byte of the solid colour decides what happens:
+ *   0xff - nothing is written, the function returns immediately;
+ *   0x00 - the rectangle is filled with zero through pixman_fill;
+ *   else - every destination byte goes through pix_multiply with the
+ *          vector built by expand_alpha_1x128.
+ */
+static void
+sse2_composite_in_n_8 (pixman_implementation_t *imp,
+		       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_row, *dst_px;
+    int dst_stride;
+    uint32_t solid;
+    int32_t left;
+    __m128i vec_alpha;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 1);
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    vec_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (solid));
+
+    switch (solid >> 24)
+    {
+    case 0xff:
+	return;
+
+    case 0x00:
+	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+		     8, dest_x, dest_y, width, height, 0);
+	return;
+
+    default:
+	break;
+    }
+
+    while (height--)
+    {
+	dst_px = dst_row;
+	dst_row += dst_stride;
+
+	/* Head: single pixels up to the next 16-byte boundary. */
+	for (left = width; left && ((unsigned long)dst_px & 15); left--)
+	{
+	    __m128i vd = unpack_32_1x128 ((uint32_t) *dst_px);
+
+	    *dst_px++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (vec_alpha, vd));
+	}
+
+	/* Body: 16 destination bytes per iteration. */
+	for (; left >= 16; left -= 16, dst_px += 16)
+	{
+	    __m128i d_lo, d_hi;
+
+	    unpack_128_2x128 (load_128_aligned ((__m128i*)dst_px),
+			      &d_lo, &d_hi);
+
+	    pix_multiply_2x128 (&vec_alpha, &vec_alpha,
+				&d_lo, &d_hi,
+				&d_lo, &d_hi);
+
+	    save_128_aligned ((__m128i*)dst_px, pack_2x128_128 (d_lo, d_hi));
+	}
+
+	/* Tail: remaining pixels. */
+	for (; left; left--)
+	{
+	    __m128i vd = unpack_32_1x128 ((uint32_t) *dst_px);
+
+	    *dst_px++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (vec_alpha, vd));
+	}
+    }
+}
+
+/*
+ * IN with an 8-bit source and an 8-bit destination, no mask.
+ *
+ * Each destination byte becomes pix_multiply (source, dest).  Source
+ * rows are read with unaligned loads; destination rows are walked so
+ * that the 16-pixel body can use aligned loads and stores.
+ */
+static void
+sse2_composite_in_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_row, *dst_px;
+    uint8_t *src_row, *src_px;
+    int src_stride, dst_stride;
+    int32_t left;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint8_t, src_stride, src_row, 1);
+
+    while (height--)
+    {
+	dst_px = dst_row;
+	src_px = src_row;
+	dst_row += dst_stride;
+	src_row += src_stride;
+
+	/* Head: single pixels until dst is 16-byte aligned. */
+	for (left = width; left && ((unsigned long)dst_px & 15); left--)
+	{
+	    __m128i vs = unpack_32_1x128 ((uint32_t) *src_px++);
+	    __m128i vd = unpack_32_1x128 ((uint32_t) *dst_px);
+
+	    *dst_px++ = (uint8_t) pack_1x128_32 (pix_multiply_1x128 (vs, vd));
+	}
+
+	/* Body: 16 pixels per iteration. */
+	for (; left >= 16; left -= 16, src_px += 16, dst_px += 16)
+	{
+	    __m128i s_lo, s_hi, d_lo, d_hi;
+	    __m128i s_all = load_128_unaligned ((__m128i*)src_px);
+	    __m128i d_all = load_128_aligned ((__m128i*)dst_px);
+
+	    unpack_128_2x128 (s_all, &s_lo, &s_hi);
+	    unpack_128_2x128 (d_all, &d_lo, &d_hi);
+
+	    pix_multiply_2x128 (&s_lo, &s_hi,
+				&d_lo, &d_hi,
+				&d_lo, &d_hi);
+
+	    save_128_aligned ((__m128i*)dst_px, pack_2x128_128 (d_lo, d_hi));
+	}
+
+	/* Tail: remaining pixels. */
+	for (; left; left--)
+	{
+	    __m128i vs = unpack_32_1x128 ((uint32_t) *src_px++);
+	    __m128i vd = unpack_32_1x128 ((uint32_t) *dst_px);
+
+	    *dst_px++ = (uint8_t) pack_1x128_32 (pix_multiply_1x128 (vs, vd));
+	}
+    }
+}
+
+/*
+ * ADD with a solid source, an 8-bit mask and an 8-bit destination.
+ *
+ * Each destination byte becomes
+ *     saturating_add (pix_multiply (alpha, mask), dest)
+ * where alpha is the vector obtained from the solid colour through
+ * expand_alpha_1x128 and the addition is _mm_adds_epu16 on the
+ * unpacked 16-bit lanes.
+ */
+static void
+sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
+			  pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_row, *dst_px;
+    uint8_t *mask_row, *mask_px;
+    int dst_stride, mask_stride;
+    int32_t left;
+    uint32_t solid;
+    __m128i vec_alpha;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_row, 1);
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    vec_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (solid));
+
+    while (height--)
+    {
+	dst_px = dst_row;
+	mask_px = mask_row;
+	dst_row += dst_stride;
+	mask_row += mask_stride;
+
+	/* Head: single pixels until dst is 16-byte aligned. */
+	for (left = width; left && ((unsigned long)dst_px & 15); left--)
+	{
+	    __m128i vm = unpack_32_1x128 ((uint32_t) *mask_px++);
+	    __m128i vd = unpack_32_1x128 ((uint32_t) *dst_px);
+
+	    vm = pix_multiply_1x128 (vec_alpha, vm);
+	    *dst_px++ = (uint8_t) pack_1x128_32 (_mm_adds_epu16 (vm, vd));
+	}
+
+	/* Body: 16 pixels per iteration. */
+	for (; left >= 16; left -= 16, mask_px += 16, dst_px += 16)
+	{
+	    __m128i m_lo, m_hi, d_lo, d_hi;
+	    __m128i m_all = load_128_unaligned ((__m128i*)mask_px);
+	    __m128i d_all = load_128_aligned ((__m128i*)dst_px);
+
+	    unpack_128_2x128 (m_all, &m_lo, &m_hi);
+	    unpack_128_2x128 (d_all, &d_lo, &d_hi);
+
+	    /* mask = alpha * mask */
+	    pix_multiply_2x128 (&vec_alpha, &vec_alpha,
+				&m_lo, &m_hi,
+				&m_lo, &m_hi);
+
+	    d_lo = _mm_adds_epu16 (m_lo, d_lo);
+	    d_hi = _mm_adds_epu16 (m_hi, d_hi);
+
+	    save_128_aligned ((__m128i*)dst_px, pack_2x128_128 (d_lo, d_hi));
+	}
+
+	/* Tail: remaining pixels. */
+	for (; left; left--)
+	{
+	    __m128i vm = unpack_32_1x128 ((uint32_t) *mask_px++);
+	    __m128i vd = unpack_32_1x128 ((uint32_t) *dst_px);
+
+	    vm = pix_multiply_1x128 (vec_alpha, vm);
+	    *dst_px++ = (uint8_t) pack_1x128_32 (_mm_adds_epu16 (vm, vd));
+	}
+    }
+}
+
+/*
+ * ADD with a solid source, no mask and an 8-bit destination.
+ *
+ * Only the top byte of the solid colour is used:
+ *   0x00 - nothing to add, return immediately;
+ *   0xff - the rectangle is filled with 0xff through pixman_fill;
+ *   else - the byte is replicated into every lane of an SSE2 register
+ *          and added to the destination with unsigned byte saturation.
+ */
+static void
+sse2_composite_add_n_8 (pixman_implementation_t *imp,
+			pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_row, *dst_px;
+    int dst_stride;
+    int32_t left;
+    uint32_t value;
+    __m128i vec_value;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_row, 1);
+
+    value = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    value >>= 24;
+
+    if (value == 0x00)
+	return;
+
+    if (value == 0xff)
+    {
+	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+		     8, dest_x, dest_y, width, height, 0xff);
+	return;
+    }
+
+    /* value is a single byte here; copy it into all four bytes. */
+    value *= 0x01010101;
+    vec_value = _mm_set1_epi32 (value);
+
+    while (height--)
+    {
+	dst_px = dst_row;
+	dst_row += dst_stride;
+
+	/* Head: single bytes until dst is 16-byte aligned. */
+	for (left = width; left && ((unsigned long)dst_px & 15); left--, dst_px++)
+	{
+	    __m128i sum = _mm_adds_epu8 (vec_value, _mm_cvtsi32_si128 (*dst_px));
+
+	    *dst_px = (uint8_t)_mm_cvtsi128_si32 (sum);
+	}
+
+	/* Body: 16 bytes per iteration. */
+	for (; left >= 16; left -= 16, dst_px += 16)
+	{
+	    __m128i sum = _mm_adds_epu8 (
+		vec_value, load_128_aligned ((__m128i*)dst_px));
+
+	    save_128_aligned ((__m128i*)dst_px, sum);
+	}
+
+	/* Tail: remaining bytes. */
+	for (; left; left--, dst_px++)
+	{
+	    __m128i sum = _mm_adds_epu8 (vec_value, _mm_cvtsi32_si128 (*dst_px));
+
+	    *dst_px = (uint8_t)_mm_cvtsi128_si32 (sum);
+	}
+    }
+}
+
+/*
+ * ADD with an 8-bit source and an 8-bit destination, no mask.
+ *
+ * Every row is split in three parts:
+ *   - a scalar head that runs until dst is 4-byte aligned,
+ *   - the bulk, handed to sse2_combine_add_u as (w >> 2) 32-bit words,
+ *   - a scalar tail of at most three bytes.
+ * The scalar parts add with saturation to 0xff.
+ */
+static void
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+			pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    int32_t bulk_bytes;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	src = src_line;
+
+	dst_line += dst_stride;
+	src_line += src_stride;
+	w = width;
+
+	/* Small head */
+	while (w && ((unsigned long)dst & 3))
+	{
+	    /* t is at most 510; if bit 8 is set, (0 - (t >> 8)) is all
+	     * ones and the stored byte saturates to 0xff. */
+	    t = (*dst) + (*src++);
+	    *dst++ = t | (0 - (t >> 8));
+	    w--;
+	}
+
+	sse2_combine_add_u (imp, op,
+			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+	/* Small tail: step over exactly the bytes the call above handled.
+	 * The mask must be ~3, not 0xfffc: the latter throws away bits 16
+	 * and up of w and leaves the pointers short on very wide rows. */
+	bulk_bytes = w & ~3;
+	dst += bulk_bytes;
+	src += bulk_bytes;
+
+	w &= 3;
+
+	while (w)
+	{
+	    t = (*dst) + (*src++);
+	    *dst++ = t | (0 - (t >> 8));
+	    w--;
+	}
+    }
+
+}
+
+/*
+ * ADD with 32-bit source and 32-bit destination, no mask.
+ *
+ * Each row is passed as a whole to sse2_combine_add_u, with a NULL
+ * mask pointer and the composite width as the pixel count.
+ */
+static void
+sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_row;
+    uint32_t *src_row;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_row, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+
+    while (height--)
+    {
+	sse2_combine_add_u (imp, op, dst_row, src_row, NULL, width);
+
+	dst_row += dst_stride;
+	src_row += src_stride;
+    }
+}
+
+/*
+ * Copy a width x height rectangle of pixels from one bits buffer to
+ * another of the same depth.
+ *
+ * src_stride and dst_stride arrive in uint32_t units and are converted
+ * to pixel units (for addressing) and then to bytes (for stepping).
+ *
+ * Returns FALSE without touching memory when the two depths differ or
+ * when the depth is neither 16 nor 32 bits per pixel; TRUE otherwise.
+ */
+static pixman_bool_t
+pixman_blt_sse2 (uint32_t *src_bits,
+                 uint32_t *dst_bits,
+                 int       src_stride,
+                 int       dst_stride,
+                 int       src_bpp,
+                 int       dst_bpp,
+                 int       src_x,
+                 int       src_y,
+                 int       dest_x,
+                 int       dest_y,
+                 int       width,
+                 int       height)
+{
+    uint8_t *src_row;
+    uint8_t *dst_row;
+    int row_bytes;
+
+    if (src_bpp != dst_bpp)
+	return FALSE;
+
+    switch (src_bpp)
+    {
+    case 16:
+	/* uint32_t units -> uint16_t units */
+	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+	src_row = (uint8_t *)((uint16_t *)src_bits + src_stride * src_y + src_x);
+	dst_row = (uint8_t *)((uint16_t *)dst_bits + dst_stride * dest_y + dest_x);
+	row_bytes = 2 * width;
+	/* uint16_t units -> bytes */
+	src_stride *= 2;
+	dst_stride *= 2;
+	break;
+
+    case 32:
+	/* uint32_t units -> uint32_t units (kept for symmetry) */
+	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+	src_row = (uint8_t *)((uint32_t *)src_bits + src_stride * src_y + src_x);
+	dst_row = (uint8_t *)((uint32_t *)dst_bits + dst_stride * dest_y + dest_x);
+	row_bytes = 4 * width;
+	/* uint32_t units -> bytes */
+	src_stride *= 4;
+	dst_stride *= 4;
+	break;
+
+    default:
+	return FALSE;
+    }
+
+    while (height--)
+    {
+	uint8_t *in = src_row;
+	uint8_t *out = dst_row;
+	int left = row_bytes;
+
+	src_row += src_stride;
+	dst_row += dst_stride;
+
+	/* 16-bit steps until the destination is 4-byte aligned. */
+	for (; left >= 2 && ((unsigned long)out & 3); left -= 2, in += 2, out += 2)
+	{
+	    *(uint16_t *)out = *(uint16_t *)in;
+	}
+
+	/* 32-bit steps until the destination is 16-byte aligned. */
+	for (; left >= 4 && ((unsigned long)out & 15); left -= 4, in += 4, out += 4)
+	{
+	    *(uint32_t *)out = *(uint32_t *)in;
+	}
+
+	/* 64 bytes per iteration: four loads first, then four stores. */
+	for (; left >= 64; left -= 64, in += 64, out += 64)
+	{
+	    __m128i q0, q1, q2, q3;
+
+	    q0 = load_128_unaligned ((__m128i*)(in));
+	    q1 = load_128_unaligned ((__m128i*)(in + 16));
+	    q2 = load_128_unaligned ((__m128i*)(in + 32));
+	    q3 = load_128_unaligned ((__m128i*)(in + 48));
+
+	    save_128_aligned ((__m128i*)(out), q0);
+	    save_128_aligned ((__m128i*)(out + 16), q1);
+	    save_128_aligned ((__m128i*)(out + 32), q2);
+	    save_128_aligned ((__m128i*)(out + 48), q3);
+	}
+
+	/* 16 bytes per iteration. */
+	for (; left >= 16; left -= 16, in += 16, out += 16)
+	{
+	    save_128_aligned ((__m128i*)out, load_128_unaligned ((__m128i*)in));
+	}
+
+	/* Remaining 32-bit words. */
+	for (; left >= 4; left -= 4, in += 4, out += 4)
+	{
+	    *(uint32_t *)out = *(uint32_t *)in;
+	}
+
+	/* At most one 16-bit pixel can be left over. */
+	if (left >= 2)
+	{
+	    *(uint16_t *)out = *(uint16_t *)in;
+	}
+    }
+
+    return TRUE;
+}
+
+/*
+ * Composite entry point that copies a rectangle from the source image
+ * to the destination image by forwarding to pixman_blt_sse2.  The
+ * boolean result of the blit is not inspected.
+ */
+static void
+sse2_composite_copy_area (pixman_implementation_t *imp,
+                          pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int src_depth = PIXMAN_FORMAT_BPP (src_image->bits.format);
+    int dst_depth = PIXMAN_FORMAT_BPP (dest_image->bits.format);
+
+    pixman_blt_sse2 (src_image->bits.bits, dest_image->bits.bits,
+                     src_image->bits.rowstride, dest_image->bits.rowstride,
+                     src_depth, dst_depth,
+                     src_x, src_y,
+                     dest_x, dest_y,
+                     width, height);
+}
+
+/*
+ * OVER with a 32-bit source whose alpha byte is ignored, an 8-bit mask
+ * and a 32-bit destination.
+ *
+ * Every source pixel is OR-ed with 0xff000000 before use, and
+ * mask_00ff is passed to in_over_* as the source alpha.  Where the mask
+ * is 0xff the source pixel is stored directly; otherwise the result
+ * comes from in_over_1x128 / in_over_2x128.
+ */
+static void
+sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+    __m128i ms;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        /* Head: single pixels until dst is 16-byte aligned.  A mask of
+         * zero is not special-cased here; it takes the in_over path
+         * like any other value below 0xff. */
+        while (w && (unsigned long)dst & 15)
+        {
+            s = 0xff000000 | *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+            ms = unpack_32_1x128 (s);
+
+            if (m != 0xff)
+            {
+		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		__m128i md = unpack_32_1x128 (d);
+
+                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
+            }
+
+            *dst++ = pack_1x128_32 (ms);
+            w--;
+        }
+
+        /* Body: four pixels per iteration.  The four mask bytes are
+         * fetched with one 32-bit read. */
+        while (w >= 4)
+        {
+            m = *(uint32_t*) mask;
+            xmm_src = _mm_or_si128 (
+		load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+            /* All four mask bytes are 0xff: plain copy of the source. */
+            if (m == 0xffffffff)
+            {
+                save_128_aligned ((__m128i*)dst, xmm_src);
+            }
+            else
+            {
+                xmm_dst = load_128_aligned ((__m128i*)dst);
+
+                /* Widen the mask so that it can be split into lo/hi
+                 * halves alongside the pixels below.
+                 * NOTE(review): relies on the lane layout produced by
+                 * unpack_32_1x128, defined outside this chunk. */
+                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                expand_alpha_rev_2x128 (
+		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                /* The result is left in xmm_dst_lo / xmm_dst_hi. */
+                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        /* Tail: remaining pixels.  Unlike the head, a zero mask leaves
+         * the destination untouched without computing anything. */
+        while (w)
+        {
+            m = (uint32_t) *mask++;
+
+            if (m)
+            {
+                s = 0xff000000 | *src;
+
+                if (m == 0xff)
+                {
+                    *dst = s;
+                }
+                else
+                {
+		    /* This ms shadows the function-level ms. */
+		    __m128i ma, md, ms;
+
+                    d = *dst;
+
+		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		    md = unpack_32_1x128 (d);
+		    ms = unpack_32_1x128 (s);
+
+                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
+                }
+
+            }
+
+            src++;
+            dst++;
+            w--;
+        }
+    }
+
+}
+
+/*
+ * OVER with a 32-bit source that carries its own alpha, an 8-bit mask
+ * and a 32-bit destination.
+ *
+ * Pixels whose mask is zero are skipped.  Pixels where both the mask
+ * and the source alpha are 0xff are copied.  Everything else is blended
+ * with in_over_1x128 / in_over_2x128.
+ */
+static void
+sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        /* Head: single pixels until dst is 16-byte aligned. */
+        while (w && (unsigned long)dst & 15)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+	    /* Source alpha is the top byte of the pixel. */
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+
+        /* Body: four pixels per iteration.  The four mask bytes are
+         * fetched with one 32-bit read; if all are zero the group is
+         * skipped without loading the source. */
+        while (w >= 4)
+        {
+            m = *(uint32_t *) mask;
+
+	    if (m)
+	    {
+		xmm_src = load_128_unaligned ((__m128i*)src);
+
+		/* Full mask and is_opaque source: plain copy. */
+		if (m == 0xffffffff && is_opaque (xmm_src))
+		{
+		    save_128_aligned ((__m128i *)dst, xmm_src);
+		}
+		else
+		{
+		    xmm_dst = load_128_aligned ((__m128i *)dst);
+
+		    /* Widen the mask so that it can be split into lo/hi
+		     * halves alongside the pixels below.
+		     * NOTE(review): relies on the lane layout produced
+		     * by unpack_32_1x128, defined outside this chunk. */
+		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+		    /* The result is left in xmm_dst_lo / xmm_dst_hi. */
+		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+		}
+	    }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        /* Tail: remaining pixels, same logic as the head. */
+        while (w)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+    }
+
+}
+
+/*
+ * OVER_REVERSE with a solid source and a 32-bit destination.
+ *
+ * The roles are swapped compared with plain OVER: each existing
+ * destination pixel is passed to over_* as the "source" (together with
+ * its own expanded alpha) and the solid colour as the "destination";
+ * the outcome is written back to the destination.  A solid colour of
+ * zero makes the function return without touching anything.
+ */
+static void
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+				    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t solid;
+    uint32_t *dst_row, *dst_px;
+    __m128i vec_solid;
+    int dst_stride;
+    int32_t left;
+
+    solid = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (solid == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_row, 1);
+
+    vec_solid = expand_pixel_32_1x128 (solid);
+
+    while (height--)
+    {
+	dst_px = dst_row;
+	dst_row += dst_stride;
+
+	/* Head: single pixels until dst is 16-byte aligned. */
+	for (left = width; left && ((unsigned long)dst_px & 15); left--, dst_px++)
+	{
+	    __m128i vd = unpack_32_1x128 (*dst_px);
+	    __m128i vda = expand_alpha_1x128 (vd);
+
+	    *dst_px = pack_1x128_32 (over_1x128 (vd, vda, vec_solid));
+	}
+
+	/* Body: four pixels per iteration. */
+	for (; left >= 4; left -= 4, dst_px += 4)
+	{
+	    __m128i d_lo, d_hi, da_lo, da_hi;
+	    /* over_2x128 writes its result through the last two
+	     * pointers, so it gets fresh copies of the solid colour. */
+	    __m128i out_lo = vec_solid;
+	    __m128i out_hi = vec_solid;
+
+	    unpack_128_2x128 (load_128_aligned ((__m128i*)dst_px),
+			      &d_lo, &d_hi);
+	    expand_alpha_2x128 (d_lo, d_hi, &da_lo, &da_hi);
+
+	    over_2x128 (&d_lo, &d_hi,
+			&da_lo, &da_hi,
+			&out_lo, &out_hi);
+
+	    save_128_aligned ((__m128i*)dst_px, pack_2x128_128 (out_lo, out_hi));
+	}
+
+	/* Tail: remaining pixels. */
+	for (; left; left--, dst_px++)
+	{
+	    __m128i vd = unpack_32_1x128 (*dst_px);
+	    __m128i vda = expand_alpha_1x128 (vd);
+
+	    *dst_px = pack_1x128_32 (over_1x128 (vd, vda, vec_solid));
+	}
+    }
+}
+
+/*
+ * OVER with a 32-bit source, a 32-bit mask and a 32-bit destination.
+ *
+ * Only the top byte (bits 24-31) of each mask pixel is used in the
+ * scalar loops; the vector loop uses expand_alpha_2x128 on the mask.
+ * Pixels with a zero mask are skipped, pixels where both mask and
+ * source alpha are 0xff are copied, everything else is blended with
+ * in_over_1x128 / in_over_2x128.
+ */
+static void
+sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
+				    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint32_t    *mask, *mask_line;
+    uint32_t    m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        /* Head: single pixels until dst is 16-byte aligned. */
+        while (w && (unsigned long)dst & 15)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            /* Keep only the top byte of the mask pixel. */
+            m = (*mask++) >> 24;
+            d = *dst;
+
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+
+        /* Body: four pixels per iteration. */
+        while (w >= 4)
+        {
+	    xmm_mask = load_128_unaligned ((__m128i*)mask);
+
+	    /* Groups for which is_transparent (mask) holds are skipped
+	     * without loading the source. */
+	    if (!is_transparent (xmm_mask))
+	    {
+		xmm_src = load_128_unaligned ((__m128i*)src);
+
+		/* Mask and source both pass is_opaque: plain copy. */
+		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
+		{
+		    save_128_aligned ((__m128i *)dst, xmm_src);
+		}
+		else
+		{
+		    xmm_dst = load_128_aligned ((__m128i *)dst);
+
+		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+		    /* The mask is replaced in place by its expanded alpha. */
+		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+		    /* The result is left in xmm_dst_lo / xmm_dst_hi. */
+		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+		}
+	    }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        /* Tail: remaining pixels, same logic as the head. */
+        while (w)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (*mask++) >> 24;
+            d = *dst;
+
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+    }
+
+}
+
+/* A variant of 'sse2_combine_over_u' with minor tweaks
+ *
+ * Composites one scanline with OVER while sampling the source at
+ * nearest-neighbour positions.
+ *
+ *   pd      destination pixels, w of them
+ *   ps      source scanline, indexed with (vx >> 16)
+ *   w       number of destination pixels to produce
+ *   vx      starting source x coordinate; the integer part is vx >> 16
+ *   unit_x  amount added to vx after every sample
+ *   max_vx  not used by this function
+ *   fully_transparent_src
+ *           when non-zero the function returns without writing
+ *
+ * pm is a local that is always NULL here, so every "if (pm)" branch
+ * below is never taken.
+ * NOTE(review): combine1 / combine4 are defined outside this chunk;
+ * presumably they return the source unchanged for a NULL mask -- confirm.
+ */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
+                                             const uint32_t* ps,
+                                             int32_t         w,
+                                             pixman_fixed_t  vx,
+                                             pixman_fixed_t  unit_x,
+                                             pixman_fixed_t  max_vx,
+                                             pixman_bool_t   fully_transparent_src)
+{
+    uint32_t s, d;
+    const uint32_t* pm = NULL;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (fully_transparent_src)
+	return;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps + (vx >> 16), pm);
+	vx += unit_x;
+
+	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	if (pm)
+	    pm++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i tmp;
+	uint32_t tmp1, tmp2, tmp3, tmp4;
+
+	/* Gather four source samples; they are not contiguous in
+	 * memory in general, so they are fetched one by one. */
+	tmp1 = ps[vx >> 16];
+	vx += unit_x;
+	tmp2 = ps[vx >> 16];
+	vx += unit_x;
+	tmp3 = ps[vx >> 16];
+	vx += unit_x;
+	tmp4 = ps[vx >> 16];
+	vx += unit_x;
+
+	/* tmp1 ends up in the lowest 32-bit lane, tmp4 in the highest. */
+	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+	if (is_opaque (xmm_src_hi))
+	{
+	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
+	}
+	else if (!is_zero (xmm_src_hi))
+	{
+	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	    expand_alpha_2x128 (
+		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst_lo, &xmm_dst_hi);
+
+	    /* rebuild the 4 pixel data and save */
+	    save_128_aligned ((__m128i*)pd,
+			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	}
+	/* When is_zero (source) holds, the destination is left alone. */
+
+	w -= 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    /* Remaining pixels, one at a time. */
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps + (vx >> 16), pm);
+	vx += unit_x;
+
+	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	if (pm)
+	    pm++;
+
+	w--;
+    }
+}
+
+/*
+ * Three instantiations of FAST_NEAREST_MAINLOOP built around
+ * scaled_nearest_scanline_sse2_8888_8888_OVER, with uint32_t given for
+ * both type arguments.  They differ only in the name and in the last
+ * argument (COVER, NONE, PAD).
+ * NOTE(review): FAST_NEAREST_MAINLOOP is defined outside this chunk;
+ * the last argument presumably selects how out-of-range source
+ * coordinates are handled -- confirm against the macro definition.
+ */
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, PAD)
+
+/*
+ * Composites one scanline with OVER through a solid mask, sampling the
+ * source at nearest-neighbour positions.
+ *
+ *   mask      points at the solid mask pixel; only its top byte is used
+ *   dst       destination pixels, w of them
+ *   src       source scanline, indexed with pixman_fixed_to_int (vx)
+ *   w         number of destination pixels to produce
+ *   vx        starting source x coordinate in fixed point
+ *   unit_x    amount added to vx after every sample
+ *   max_vx    not used by this function
+ *   zero_src  when non-zero the function returns without writing
+ *
+ * Nothing is written either when the top byte of *mask is zero.
+ * Source samples equal to zero leave their destination pixel untouched.
+ */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+					       uint32_t *       dst,
+					       const uint32_t * src,
+					       int32_t          w,
+					       pixman_fixed_t   vx,
+					       pixman_fixed_t   unit_x,
+					       pixman_fixed_t   max_vx,
+					       pixman_bool_t    zero_src)
+{
+    __m128i vec_mask;
+
+    if (zero_src || (*mask >> 24) == 0)
+	return;
+
+    vec_mask = create_mask_16_128 (*mask >> 24);
+
+    /* Head: single pixels until dst is 16-byte aligned. */
+    for (; w && ((unsigned long)dst & 15); w--, dst++)
+    {
+	uint32_t s = src[pixman_fixed_to_int (vx)];
+	__m128i vs, vsa, vm, vd;
+
+	vx += unit_x;
+	if (s == 0)
+	    continue;
+
+	vs = unpack_32_1x128 (s);
+	vsa = expand_alpha_1x128 (vs);
+	vm = vec_mask;
+	vd = unpack_32_1x128 (*dst);
+
+	*dst = pack_1x128_32 (in_over_1x128 (&vs, &vsa, &vm, &vd));
+    }
+
+    /* Body: four pixels per iteration. */
+    for (; w >= 4; w -= 4, dst += 4)
+    {
+	uint32_t p0, p1, p2, p3;
+	__m128i s_all, s_lo, s_hi;
+	__m128i d_all, d_lo, d_hi;
+	__m128i sa_lo, sa_hi;
+
+	p0 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	p1 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	p2 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+	p3 = src[pixman_fixed_to_int (vx)];
+	vx += unit_x;
+
+	/* p0 lands in the lowest 32-bit lane, p3 in the highest. */
+	s_all = _mm_set_epi32 (p3, p2, p1, p0);
+
+	if (is_zero (s_all))
+	    continue;
+
+	d_all = load_128_aligned ((__m128i*)dst);
+
+	unpack_128_2x128 (s_all, &s_lo, &s_hi);
+	unpack_128_2x128 (d_all, &d_lo, &d_hi);
+	expand_alpha_2x128 (s_lo, s_hi, &sa_lo, &sa_hi);
+
+	in_over_2x128 (&s_lo, &s_hi,
+		       &sa_lo, &sa_hi,
+		       &vec_mask, &vec_mask,
+		       &d_lo, &d_hi);
+
+	save_128_aligned ((__m128i*)dst, pack_2x128_128 (d_lo, d_hi));
+    }
+
+    /* Tail: remaining pixels. */
+    for (; w; w--, dst++)
+    {
+	uint32_t s = src[pixman_fixed_to_int (vx)];
+	__m128i vs, vsa, vm, vd;
+
+	vx += unit_x;
+	if (s == 0)
+	    continue;
+
+	vs = unpack_32_1x128 (s);
+	vsa = expand_alpha_1x128 (vs);
+	vm = vec_mask;
+	vd = unpack_32_1x128 (*dst);
+
+	*dst = pack_1x128_32 (in_over_1x128 (&vs, &vsa, &vm, &vd));
+    }
+}
+
+/*
+ * Three instantiations of FAST_NEAREST_MAINLOOP_COMMON built around
+ * scaled_nearest_scanline_sse2_8888_n_8888_OVER, with uint32_t given
+ * for all three type arguments.  They differ only in the name and in
+ * the mode argument (COVER, PAD, NONE).
+ * NOTE(review): the macro is defined outside this chunk.  The two
+ * trailing TRUE flags presumably state that a mask is present and that
+ * it is solid, which would match the scanline function dereferencing a
+ * single mask pixel -- confirm against the macro definition.
+ */
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+
+/*
+ * Declares the SSE2 state shared by the bilinear helper macros below.
+ * Expects wt, wb, unit_x and vx to be in scope at the point of use.
+ *
+ *   xmm_wt, xmm_wb  wt / wb replicated into all eight 16-bit lanes
+ *   xmm_xorc        0xff in the four low lanes, 0 in the four high lanes
+ *   xmm_addc        1 in the four low lanes, 0 in the four high lanes
+ *   xmm_ux          unit_x replicated into all eight 16-bit lanes
+ *   xmm_zero        all zero
+ *   xmm_x           vx replicated into all eight 16-bit lanes; the only
+ *                   variable here that is not const
+ *
+ * _mm_set_epi16 takes 16-bit values, so only the low 16 bits of vx and
+ * unit_x are kept.  The expansion ends without a semicolon; the user
+ * of the macro supplies it.
+ */
+#define BILINEAR_DECLARE_VARIABLES						\
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
+    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
+    const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
+					  unit_x, unit_x, unit_x, unit_x);	\
+    const __m128i xmm_zero = _mm_setzero_si128 ();				\
+    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+/* Bilinearly interpolate one pixel from the 2x2 block at vx into pix; steps vx and xmm_x. */
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
+do {										\
+    __m128i xmm_wh, xmm_lo, xmm_hi, a;						\
+    /* fetch 2x2 pixel block into sse2 register */				\
+    uint32_t tl = src_top [pixman_fixed_to_int (vx)];				\
+    uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];			\
+    uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];			\
+    uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];			\
+    a = _mm_set_epi32 (tr, tl, br, bl);						\
+    vx += unit_x;								\
+    /* vertical interpolation: top row (high half) * wt + bottom row * wb */	\
+    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),	\
+					xmm_wt),				\
+		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),	\
+					xmm_wb));				\
+    /* horizontal weights: low lanes 256 - (x >> 8), high lanes x >> 8 */	\
+    xmm_wh = _mm_add_epi16 (xmm_addc,						\
+			    _mm_xor_si128 (xmm_xorc,				\
+					   _mm_srli_epi16 (xmm_x, 8)));		\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+    /* horizontal interpolation: 16x16 -> 32 bit products, left + right */	\
+    xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
+    xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
+    a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
+		       _mm_unpackhi_epi16 (xmm_lo, xmm_hi));			\
+    /* shift and pack the result */						\
+    a = _mm_srli_epi32 (a, 16);							\
+    a = _mm_packs_epi32 (a, a);							\
+    a = _mm_packus_epi16 (a, a);						\
+    pix = _mm_cvtsi128_si32 (a);						\
+} while (0)
+/* Step vx and xmm_x past one pixel without fetching or interpolating it. */
+#define BILINEAR_SKIP_ONE_PIXEL()						\
+do {										\
+    vx += unit_x;								\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+} while(0)
+
+/* SRC scanline: store w interpolated pixels to dst; mask, max_vx, zero_src are unused. */
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx,
+					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+
+    while ((w -= 4) >= 0)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+	*dst++ = pix1;
+	*dst++ = pix2;
+	*dst++ = pix3;
+	*dst++ = pix4;
+    }
+    /* w is now in [-4, -1]; its low two bits still give the 0-3 leftover pixels. */
+    if (w & 2)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	*dst++ = pix1;
+	*dst++ = pix2;
+    }
+
+    if (w & 1)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	*dst = pix1;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
+/* OVER scanline: composite w interpolated pixels onto dst; mask, max_vx, zero_src unused. */
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
+					      const uint32_t * mask,
+					      const uint32_t * src_top,
+					      const uint32_t * src_bottom,
+					      int32_t          w,
+					      int              wt,
+					      int              wb,
+					      pixman_fixed_t   vx,
+					      pixman_fixed_t   unit_x,
+					      pixman_fixed_t   max_vx,
+					      pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+
+    /* Single pixels until dst is 16-byte aligned; a zero source pixel leaves dst as is. */
+    while (w && ((unsigned long)dst & 15))
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	if (pix1)
+	{
+	    pix2 = *dst;
+	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+	}
+
+	w--;
+	dst++;
+    }
+    /* Four pixels per iteration, using aligned 128-bit loads and stores on dst. */
+    while (w  >= 4)
+    {
+	__m128i xmm_src;
+	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
+	__m128i xmm_alpha_hi, xmm_alpha_lo;
+
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+	/* is_zero: dst is not touched; is_opaque: the source is stored directly. */
+	if (!is_zero (xmm_src))
+	{
+	    if (is_opaque (xmm_src))
+	    {
+		save_128_aligned ((__m128i *)dst, xmm_src);
+	    }
+	    else
+	    {
+		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);
+
+		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+	}
+
+	w -= 4;
+	dst += 4;
+    }
+    /* Remaining pixels (fewer than four), handled like the head loop. */
+    while (w)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	if (pix1)
+	{
+	    pix2 = *dst;
+	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+	}
+
+	w--;
+	dst++;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
+/* OVER scanline with an a8 mask: each bilinearly interpolated source pixel is
+ * combined with its mask byte and dst through in_over; pixels whose mask byte
+ * is zero are skipped.  The max_vx and zero_src parameters are not used here. */
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
+						const uint8_t  * mask,
+						const uint32_t * src_top,
+						const uint32_t * src_bottom,
+						int32_t          w,
+						int              wt,
+						int              wb,
+						pixman_fixed_t   vx,
+						pixman_fixed_t   unit_x,
+						pixman_fixed_t   max_vx,
+						pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t m;
+
+    /* Single pixels until dst is 16-byte aligned. */
+    while (w && ((unsigned long)dst & 15))
+    {
+	uint32_t sa;
+
+	m = (uint32_t) *mask++;
+
+	if (m)
+	{
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	    sa = pix1 >> 24;
+	    if (sa == 0xff && m == 0xff)
+	    {
+		*dst = pix1;
+	    }
+	    else
+	    {
+		__m128i ms, md, ma, msa;
+
+		pix2 = *dst;
+		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		ms = unpack_32_1x128 (pix1);
+		md = unpack_32_1x128 (pix2);
+		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+	    }
+	}
+	else /* zero mask byte: dst is left alone, only the source position advances */
+	{
+	    BILINEAR_SKIP_ONE_PIXEL ();
+	}
+
+	w--;
+	dst++;
+    }
+    /* Four pixels per iteration; the four mask bytes are read as one 32-bit word. */
+    while (w >= 4)
+    {
+	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+	m = *(uint32_t*)mask; /* NOTE(review): type-punned, possibly unaligned read -- confirm OK */
+
+	if (m)
+	{
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+	    /* All-0xff mask word and is_opaque source: the source is stored directly. */
+	    if (m == 0xffffffff && is_opaque (xmm_src))
+	    {
+		save_128_aligned ((__m128i *)dst, xmm_src);
+	    }
+	    else
+	    {
+		xmm_dst = load_128_aligned ((__m128i *)dst);
+
+		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+	}
+	else /* all four mask bytes are zero */
+	{
+	    BILINEAR_SKIP_ONE_PIXEL ();
+	    BILINEAR_SKIP_ONE_PIXEL ();
+	    BILINEAR_SKIP_ONE_PIXEL ();
+	    BILINEAR_SKIP_ONE_PIXEL ();
+	}
+
+	w -= 4;
+	dst += 4;
+	mask += 4;
+    }
+    /* Remaining pixels (fewer than four), handled like the head loop. */
+    while (w)
+    {
+	uint32_t sa;
+
+	m = (uint32_t) *mask++;
+
+	if (m)
+	{
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	    sa = pix1 >> 24;
+	    if (sa == 0xff && m == 0xff)
+	    {
+		*dst = pix1;
+	    }
+	    else
+	    {
+		__m128i ms, md, ma, msa;
+
+		pix2 = *dst;
+		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		ms = unpack_32_1x128 (pix1);
+		md = unpack_32_1x128 (pix2);
+		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+	    }
+	}
+	else
+	{
+	    BILINEAR_SKIP_ONE_PIXEL ();
+	}
+
+	w--;
+	dst++;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
+static const pixman_fast_path_t sse2_fast_paths[] = /* given to _pixman_implementation_create () */
+{
+    /* PIXMAN_OP_OVER */
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    
+    /* PIXMAN_OP_OVER_REVERSE */
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
+    /* PIXMAN_OP_ADD */
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+
+    /* PIXMAN_OP_SRC */
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
+
+    /* PIXMAN_OP_IN */
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
+
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
+
+    { PIXMAN_OP_NONE }, /* last entry; presumably the end-of-table sentinel -- confirm */
+};
+
+/* Blit via pixman_blt_sse2 (); if that returns FALSE, pass the identical
+ * request on to imp->delegate and return its result. */
+static pixman_bool_t
+sse2_blt (pixman_implementation_t *imp,
+          uint32_t *               src_bits,
+          uint32_t *               dst_bits,
+          int                      src_stride,
+          int                      dst_stride,
+          int                      src_bpp,
+          int                      dst_bpp,
+          int                      src_x,
+          int                      src_y,
+          int                      dest_x,
+          int                      dest_y,
+          int                      width,
+          int                      height)
+{
+    if (pixman_blt_sse2 (
+            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+            src_x, src_y, dest_x, dest_y, width, height))
+    {
+	return TRUE;
+    }
+
+    return _pixman_implementation_blt (
+	imp->delegate, src_bits, dst_bits, src_stride, dst_stride,
+	src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y, width, height);
+}
+
+/* Fill via pixman_fill_sse2 (); if that returns FALSE, pass the identical
+ * request on to imp->delegate and return its result. */
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_fill (pixman_implementation_t *imp,
+           uint32_t *               bits,
+           int                      stride,
+           int                      bpp,
+           int                      x,
+           int                      y,
+           int                      width,
+           int                      height,
+           uint32_t xor)
+{
+    if (pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
+	return TRUE;
+
+    return _pixman_implementation_fill (
+	imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+}
+
+/* Fetch one row of an x8r8g8b8 image into iter->buffer, forcing the alpha
+ * byte of every pixel to 0xff, and advance iter->bits by one stride for the
+ * next call.  The mask argument is unused. */
+static uint32_t *
+sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    const __m128i opaque = mask_ff000000;
+    uint32_t *in = (uint32_t *)iter->bits;
+    uint32_t *out = iter->buffer;
+    int remaining = iter->width;
+
+    iter->bits += iter->stride;
+
+    /* One pixel at a time until the output pointer is 16-byte aligned. */
+    for (; remaining && (((unsigned long)out) & 0x0f); remaining--)
+    {
+	*out++ = (*in++) | 0xff000000;
+    }
+
+    /* Four pixels per step: unaligned load, aligned store. */
+    for (; remaining >= 4; remaining -= 4, in += 4, out += 4)
+    {
+	__m128i px = load_128_unaligned ((__m128i *)in);
+
+	save_128_aligned ((__m128i *)out, _mm_or_si128 (px, opaque));
+    }
+
+    /* Leftover pixels (fewer than four). */
+    for (; remaining; remaining--)
+    {
+	*out++ = (*in++) | 0xff000000;
+    }
+
+    return iter->buffer;
+}
+
+/* Fetch one row of an r5g6b5 image into iter->buffer as 32-bit pixels and
+ * advance iter->bits by one stride.  The vector loop ORs 0xff into the alpha
+ * byte itself; the scalar loops use CONVERT_0565_TO_8888.  The mask argument
+ * is unused. */
+static uint32_t *
+sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    const __m128i opaque = mask_ff000000;
+    const __m128i zero = _mm_setzero_si128 ();
+    uint16_t *in = (uint16_t *)iter->bits;
+    uint32_t *out = iter->buffer;
+    int remaining = iter->width;
+
+    iter->bits += iter->stride;
+
+    /* One pixel at a time until the output pointer is 16-byte aligned. */
+    for (; remaining && (((unsigned long)out) & 0x0f); remaining--)
+    {
+	uint16_t s = *in++;
+
+	*out++ = CONVERT_0565_TO_8888 (s);
+    }
+
+    /* Eight pixels per step: zero-extend each half of the load to 32-bit
+     * lanes, convert, and OR in the opaque alpha before the aligned stores. */
+    for (; remaining >= 8; remaining -= 8, in += 8, out += 8)
+    {
+	__m128i packed = _mm_loadu_si128 ((__m128i *)in);
+	__m128i lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (packed, zero));
+	__m128i hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (packed, zero));
+
+	save_128_aligned ((__m128i *)(out + 0), _mm_or_si128 (lo, opaque));
+	save_128_aligned ((__m128i *)(out + 4), _mm_or_si128 (hi, opaque));
+    }
+
+    /* Leftover pixels (fewer than eight). */
+    for (; remaining; remaining--)
+    {
+	uint16_t s = *in++;
+
+	*out++ = CONVERT_0565_TO_8888 (s);
+    }
+
+    return iter->buffer;
+}
+
+/* Expand one a8 row to 32-bit pixels (alpha in bits 31..24) in iter->buffer; mask is unused. */
+static uint32_t *
+sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+
+    iter->bits += iter->stride;
+    /* Head: align dst.  Cast before shifting; int << 24 overflows for bytes >= 0x80. */
+    while (w && (((unsigned long)dst) & 15))
+    {
+	*dst++ = (uint32_t)*(src++) << 24;
+	w--;
+    }
+
+    /* 16 pixels per pass; interleaving below zeros moves each byte to bits 31..24. */
+    while (w >= 16)
+    {
+	xmm0 = _mm_loadu_si128((__m128i *)src);
+	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
+	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
+	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
+	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
+	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
+	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
+	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
+	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
+	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
+	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
+
+	dst += 16;
+	src += 16;
+	w -= 16;
+    }
+
+    while (w)
+    {
+	*dst++ = (uint32_t)*(src++) << 24;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+typedef struct
+{
+    pixman_format_code_t	format;		/* image format this entry matches */
+    pixman_iter_get_scanline_t	get_scanline;	/* fetcher installed on a match */
+} fetcher_info_t;
+
+static const fetcher_info_t fetchers[] =
+{
+    { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
+    { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
+    { PIXMAN_a8,		sse2_fetch_a8 },
+    { PIXMAN_null }		/* sentinel: ends the lookup loop in sse2_src_iter_init */
+};
+
+/* Source-iterator hook: install an SSE2 fetcher from fetchers[] when the image
+ * passes the checks below, otherwise hand the iterator to the delegate. */
+static void
+sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+    const fetcher_info_t *f;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    int height = iter->height;
+
+#define FLAGS								\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+
+    if ((iter->flags & ITER_NARROW)				&&
+	(image->common.flags & FLAGS) == FLAGS			&&
+	x >= 0 && y >= 0					&&
+	x + width <= image->bits.width				&&
+	y + height <= image->bits.height)
+    {
+	for (f = fetchers; f->format != PIXMAN_null; f++)
+	{
+	    uint8_t *b = (uint8_t *)image->bits.bits;
+	    int s = image->bits.rowstride * 4;
+
+	    if (image->common.extended_format_code != f->format)
+		continue;
+
+	    iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
+	    iter->stride = s;
+	    iter->get_scanline = f->get_scanline;
+	    return;
+	}
+    }
+
+    imp->delegate->src_iter_init (imp->delegate, iter);
+}
+
+/* Create the SSE2 implementation on top of fallback: fill in the SSE2 mask
+ * constants, then install the combiners and the blt/fill/src_iter_init hooks. */
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
+
+    /* SSE2 constants */
+    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
+    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
+    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
+    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
+    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
+    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
+    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
+    mask_0080 = create_mask_16_128 (0x0080);
+    mask_00ff = create_mask_16_128 (0x00ff);
+    mask_0101 = create_mask_16_128 (0x0101);
+    mask_ffff = create_mask_16_128 (0xffff);
+    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
+    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+
+    /* Set up function pointers */
+    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+
+    imp->blt = sse2_blt;
+    imp->fill = sse2_fill;
+    imp->src_iter_init = sse2_src_iter_init;
+
+    return imp;
+}
diff --git a/pixman/pixman-timer.c b/pixman/pixman-timer.c
new file mode 100644
index 0000000..f5ae18e
--- /dev/null
+++ b/pixman/pixman-timer.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman-private.h"
+
+#ifdef PIXMAN_TIMERS
+
+static pixman_timer_t *timers;
+
+/* atexit hook: print every registered timer; counters are cast to match %llu. */
+static void
+dump_timers (void)
+{
+    pixman_timer_t *timer;
+
+    for (timer = timers; timer != NULL; timer = timer->next)
+    {
+	printf ("%s:   total: %llu     n: %llu      avg: %f\n",
+	        timer->name, (unsigned long long)timer->total,
+	        (unsigned long long)timer->n_times,
+	        timer->total / (double)timer->n_times);
+    }
+}
+
+/* Push timer onto the global list; the first call also registers dump_timers with atexit. */
+void
+pixman_timer_register (pixman_timer_t *timer)
+{
+    static int atexit_installed;
+    int atexit (void (*function)(void));
+
+    if (atexit_installed == 0)
+    {
+	atexit (dump_timers);
+	atexit_installed = 1;
+    }
+
+    timer->next = timers;
+    timers = timer;
+}
+
+#endif
diff --git a/pixman/pixman-trap.c b/pixman/pixman-trap.c
new file mode 100644
index 0000000..c99f03e
--- /dev/null
+++ b/pixman/pixman-trap.c
@@ -0,0 +1,668 @@
+/*
+ * Copyright © 2002 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+
+/*
+ * Compute the smallest value greater than or equal to y which is on a
+ * grid row.  n selects the sample grid through the Y_FRAC_* and
+ * STEP_Y_SMALL macros; callers in this file pass the image's bpp.
+ */
+PIXMAN_EXPORT pixman_fixed_t
+pixman_sample_ceil_y (pixman_fixed_t y, int n)
+{
+    pixman_fixed_t f = pixman_fixed_frac (y);
+    pixman_fixed_t i = pixman_fixed_floor (y);
+    /* Snap the fractional part up to a sample row inside this pixel. */
+    f = DIV (f - Y_FRAC_FIRST (n) + (STEP_Y_SMALL (n) - pixman_fixed_e), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
+	Y_FRAC_FIRST (n);
+    /* Beyond the pixel's last row: continue on the next pixel's first row. */
+    if (f > Y_FRAC_LAST (n))
+    {
+	if (pixman_fixed_to_int (i) == 0x7fff)
+	{
+	    f = 0xffff; /* saturate */
+	}
+	else
+	{
+	    f = Y_FRAC_FIRST (n);
+	    i += pixman_fixed_1;
+	}
+    }
+    return (i | f);
+}
+
+/*
+ * Compute the largest value strictly less than y which is on a grid row.
+ * NOTE(review): the 0x8000 saturation test below can only match if
+ * pixman_fixed_to_int() does not sign-extend -- confirm, else it is dead.
+ */
+PIXMAN_EXPORT pixman_fixed_t
+pixman_sample_floor_y (pixman_fixed_t y,
+                       int            n)
+{
+    pixman_fixed_t f = pixman_fixed_frac (y);
+    pixman_fixed_t i = pixman_fixed_floor (y);
+    /* Subtracting pixman_fixed_e first keeps the result strictly below y. */
+    f = DIV (f - pixman_fixed_e - Y_FRAC_FIRST (n), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
+	Y_FRAC_FIRST (n);
+    if (f < Y_FRAC_FIRST (n))
+    {
+	if (pixman_fixed_to_int (i) == 0x8000)
+	{
+	    f = 0; /* saturate */
+	}
+	else
+	{
+	    f = Y_FRAC_LAST (n);
+	    i -= pixman_fixed_1;
+	}
+    }
+    return (i | f);
+}
+
+/*
+ * Step an edge by any amount (including negative values)
+ */
+PIXMAN_EXPORT void
+pixman_edge_step (pixman_edge_t *e,
+                  int            n)
+{
+    pixman_fixed_48_16_t ne;
+    /* Whole-step part of the move: n times the per-step x increment. */
+    e->x += n * e->stepx;
+    /* Candidate error term after n steps, computed in the 48.16 type. */
+    ne = e->e + n * (pixman_fixed_48_16_t) e->dx;
+    /* NOTE(review): e->e is stored only inside the branches below -- confirm. */
+    if (n >= 0)
+    {
+	if (ne > 0)
+	{
+	    int nx = (ne + e->dy - 1) / e->dy; /* ne / dy rounded up (dy > 0) */
+	    e->e = ne - nx * (pixman_fixed_48_16_t) e->dy;
+	    e->x += nx * e->signdx;
+	}
+    }
+    else
+    {
+	if (ne <= -e->dy)
+	{
+	    int nx = (-ne) / e->dy; /* NOTE(review): no guard against dy == 0 */
+	    e->e = ne + nx * (pixman_fixed_48_16_t) e->dy;
+	    e->x -= nx * e->signdx;
+	}
+    }
+}
+
+/*
+ * A private routine to initialize the multi-step elements of an edge
+ * structure: the x advance (*stepx_p) and error increment (*dx_p) for n steps.
+ */
+static void
+_pixman_edge_multi_init (pixman_edge_t * e,
+                         int             n,
+                         pixman_fixed_t *stepx_p,
+                         pixman_fixed_t *dx_p)
+{
+    pixman_fixed_t stepx;
+    pixman_fixed_48_16_t ne;
+
+    ne = n * (pixman_fixed_48_16_t) e->dx;
+    stepx = n * e->stepx;
+
+    if (ne > 0)
+    {
+	int nx = ne / e->dy;
+	/* Multiply in 48.16 as pixman_edge_step does; int * int can overflow. */
+	ne -= nx * (pixman_fixed_48_16_t) e->dy;
+	stepx += nx * e->signdx;
+    }
+    *dx_p = ne;
+    *stepx_p = stepx;
+}
+
+/*
+ * Initialize one edge structure given the line endpoints and a
+ * starting y value; n is forwarded to the STEP_Y_SMALL / STEP_Y_BIG macros
+ */
+PIXMAN_EXPORT void
+pixman_edge_init (pixman_edge_t *e,
+                  int            n,
+                  pixman_fixed_t y_start,
+                  pixman_fixed_t x_top,
+                  pixman_fixed_t y_top,
+                  pixman_fixed_t x_bot,
+                  pixman_fixed_t y_bot)
+{
+    pixman_fixed_t dx, dy;
+    /* Start at the top point; these defaults are all a dy == 0 edge gets. */
+    e->x = x_top;
+    e->e = 0;
+    dx = x_bot - x_top;
+    dy = y_bot - y_top;
+    e->dy = dy;
+    e->dx = 0;
+
+    if (dy)
+    {
+	if (dx >= 0)
+	{
+	    e->signdx = 1;
+	    e->stepx = dx / dy;	/* whole part of the slope */
+	    e->dx = dx % dy;	/* remainder, fed into the error term */
+	    e->e = -dy;
+	}
+	else
+	{
+	    e->signdx = -1;
+	    e->stepx = -(-dx / dy);
+	    e->dx = -dx % dy;
+	    e->e = 0;
+	}
+
+	_pixman_edge_multi_init (e, STEP_Y_SMALL (n),
+				 &e->stepx_small, &e->dx_small);
+	_pixman_edge_multi_init (e, STEP_Y_BIG (n),
+				 &e->stepx_big, &e->dx_big);
+    }
+    /* NOTE(review): dy == 0 leaves stepx/signdx unset, yet the step reads stepx. */
+    pixman_edge_step (e, y_start - y_top);
+}
+
+/*
+ * Set up edge walker e for a line translated by the integer pixel offset
+ * (x_off, y_off), positioned at y.  The endpoint with the smaller y is
+ * used as the top; when both y values are equal, p1 is the top.
+ */
+PIXMAN_EXPORT void
+pixman_line_fixed_edge_init (pixman_edge_t *            e,
+                             int                        n,
+                             pixman_fixed_t             y,
+                             const pixman_line_fixed_t *line,
+                             int                        x_off,
+                             int                        y_off)
+{
+    const pixman_fixed_t off_x = pixman_int_to_fixed (x_off);
+    const pixman_fixed_t off_y = pixman_int_to_fixed (y_off);
+    const pixman_point_fixed_t *upper = &line->p1;
+    const pixman_point_fixed_t *lower = &line->p2;
+
+    /* Swap so that upper really is the endpoint with the smaller y. */
+    if (upper->y > lower->y)
+    {
+	const pixman_point_fixed_t *swap = upper;
+
+	upper = lower;
+	lower = swap;
+    }
+
+    pixman_edge_init (e, n, y,
+                      upper->x + off_x,
+                      upper->y + off_y,
+                      lower->x + off_x,
+                      lower->y + off_y);
+}
+
+PIXMAN_EXPORT void
+pixman_add_traps (pixman_image_t * image,
+                  int16_t          x_off,
+                  int16_t          y_off,
+                  int              ntrap,
+                  pixman_trap_t *  traps)
+{
+    int bpp;
+    int height;
+
+    pixman_fixed_t x_off_fixed;
+    pixman_fixed_t y_off_fixed;
+    pixman_edge_t l, r;
+    pixman_fixed_t t, b;
+    /* Rasterize ntrap traps into image, each shifted by (x_off, y_off) pixels. */
+    _pixman_image_validate (image);
+    /* NOTE(review): no BITS type check here, unlike pixman_rasterize_trapezoid(). */
+    height = image->bits.height;
+    bpp = PIXMAN_FORMAT_BPP (image->bits.format);
+
+    x_off_fixed = pixman_int_to_fixed (x_off);
+    y_off_fixed = pixman_int_to_fixed (y_off);
+
+    while (ntrap--)
+    {
+	t = traps->top.y + y_off_fixed;	/* top: clip to y >= 0, then snap up */
+	if (t < 0)
+	    t = 0;
+	t = pixman_sample_ceil_y (t, bpp);
+	/* Bottom: clip to just under y == height, then snap down to a row. */
+	b = traps->bot.y + y_off_fixed;
+	if (pixman_fixed_to_int (b) >= height)
+	    b = pixman_int_to_fixed (height) - 1;
+	b = pixman_sample_floor_y (b, bpp);
+	/* Draw only when the snapped bottom is not above the snapped top. */
+	if (b >= t)
+	{
+	    /* initialize edge walkers */
+	    pixman_edge_init (&l, bpp, t,
+	                      traps->top.l + x_off_fixed,
+	                      traps->top.y + y_off_fixed,
+	                      traps->bot.l + x_off_fixed,
+	                      traps->bot.y + y_off_fixed);
+
+	    pixman_edge_init (&r, bpp, t,
+	                      traps->top.r + x_off_fixed,
+	                      traps->top.y + y_off_fixed,
+	                      traps->bot.r + x_off_fixed,
+	                      traps->bot.y + y_off_fixed);
+
+	    pixman_rasterize_edges (image, &l, &r, t, b);
+	}
+
+	traps++;
+    }
+}
+
+#if 0
+static void
+dump_image (pixman_image_t *image,
+            const char *    title)
+{
+    int i, j;
+    /* Debug aid (compiled out): print an a8 image to stdout as ASCII art. */
+    if (image->type != BITS)
+	printf ("%s is not a regular image\n", title);
+    /* Both checks only warn; the dump below is attempted regardless. */
+    if (image->bits.format != PIXMAN_a8)
+	printf ("%s is not an alpha mask\n", title);
+
+    printf ("\n\n\n%s: \n", title);
+
+    for (i = 0; i < image->bits.height; ++i)
+    {
+	uint8_t *line =
+	    (uint8_t *)&(image->bits.bits[i * image->bits.rowstride]);
+
+	for (j = 0; j < image->bits.width; ++j)
+	    printf ("%c", line[j] ? '#' : ' ');	/* '#' for any nonzero byte */
+
+	printf ("\n");
+    }
+}
+#endif
+
+PIXMAN_EXPORT void
+pixman_add_trapezoids (pixman_image_t *          image,
+                       int16_t                   x_off,
+                       int                       y_off,
+                       int                       ntraps,
+                       const pixman_trapezoid_t *traps)
+{
+    int idx;
+
+#if 0
+    dump_image (image, "before");
+#endif
+
+    /*
+     * Rasterize each trapezoid in turn.  Entries rejected by
+     * pixman_trapezoid_valid() are skipped rather than treated as errors.
+     */
+    for (idx = 0; idx < ntraps; idx++)
+    {
+	if (pixman_trapezoid_valid (&traps[idx]))
+	    pixman_rasterize_trapezoid (image, &traps[idx], x_off, y_off);
+    }
+
+#if 0
+    dump_image (image, "after");
+#endif
+}
+
+PIXMAN_EXPORT void
+pixman_rasterize_trapezoid (pixman_image_t *          image,
+                            const pixman_trapezoid_t *trap,
+                            int                       x_off,
+                            int                       y_off)
+{
+    int bpp;
+    int height;
+    /* Rasterize one trapezoid into image, shifted by (x_off, y_off) pixels. */
+    pixman_fixed_t y_off_fixed;
+    pixman_edge_t l, r;
+    pixman_fixed_t t, b;
+
+    return_if_fail (image->type == BITS);
+
+    _pixman_image_validate (image);
+    /* Trapezoids rejected by pixman_trapezoid_valid() are silently ignored. */
+    if (!pixman_trapezoid_valid (trap))
+	return;
+
+    height = image->bits.height;
+    bpp = PIXMAN_FORMAT_BPP (image->bits.format);
+
+    y_off_fixed = pixman_int_to_fixed (y_off);
+    /* Top: clip to y >= 0, then snap up to a sample row. */
+    t = trap->top + y_off_fixed;
+    if (t < 0)
+	t = 0;
+    t = pixman_sample_ceil_y (t, bpp);
+    /* Bottom: clip to just under y == height, then snap down to a row. */
+    b = trap->bottom + y_off_fixed;
+    if (pixman_fixed_to_int (b) >= height)
+	b = pixman_int_to_fixed (height) - 1;
+    b = pixman_sample_floor_y (b, bpp);
+    /* Draw only when the snapped bottom is not above the snapped top. */
+    if (b >= t)
+    {
+	/* initialize edge walkers */
+	pixman_line_fixed_edge_init (&l, bpp, t, &trap->left, x_off, y_off);
+	pixman_line_fixed_edge_init (&r, bpp, t, &trap->right, x_off, y_off);
+
+	pixman_rasterize_edges (image, &l, &r, t, b);
+    }
+}
+
+/*
+ * pixman_composite_trapezoids()
+ *
+ * All the trapezoids are conceptually rendered to an infinitely big image.
+ * The (0, 0) coordinates of this image are then aligned with the (x, y)
+ * coordinates of the source image, and then both images are aligned with
+ * the (x, y) coordinates of the destination. Then, in principle, compositing
+ * of these three images takes place across the entire destination.
+ *
+ * FIXME: However, there is currently a bug, where we restrict this compositing
+ * to the bounding box of the trapezoids. This is incorrect for operators such
+ * as SRC and IN where blank source pixels do have an effect on the destination.
+ */
+PIXMAN_EXPORT void
+pixman_composite_trapezoids (pixman_op_t		op,
+			     pixman_image_t *		src,
+			     pixman_image_t *		dst,
+			     pixman_format_code_t	mask_format,
+			     int			x_src,
+			     int			y_src,
+			     int			x_dst,
+			     int			y_dst,
+			     int			n_traps,
+			     const pixman_trapezoid_t *	traps)
+{
+    int i;
+
+    if (n_traps <= 0)
+	return;
+
+    _pixman_image_validate (src);
+    _pixman_image_validate (dst);
+    /* Fast path: opaque ADD into an unclipped dst of the mask format needs no mask. */
+    if (op == PIXMAN_OP_ADD &&
+	(src->common.flags & FAST_PATH_IS_OPAQUE)		&&
+	(mask_format == dst->common.extended_format_code)	&&
+	!(dst->common.have_clip_region))
+    {
+	for (i = 0; i < n_traps; ++i)
+	{
+	    const pixman_trapezoid_t *trap = &(traps[i]);
+	    
+	    if (!pixman_trapezoid_valid (trap))
+		continue;
+	    
+	    pixman_rasterize_trapezoid (dst, trap, x_dst, y_dst);
+	}
+    }
+    else
+    {
+	pixman_image_t *tmp;
+	pixman_box32_t box;
+	/* Bounding box of all valid traps, grown outwards to whole pixels. */
+	box.x1 = INT32_MAX;
+	box.y1 = INT32_MAX;
+	box.x2 = INT32_MIN;
+	box.y2 = INT32_MIN;
+	
+	for (i = 0; i < n_traps; ++i)
+	{
+	    const pixman_trapezoid_t *trap = &(traps[i]);
+	    int y1, y2;
+	    
+	    if (!pixman_trapezoid_valid (trap))
+		continue;
+	    
+	    y1 = pixman_fixed_to_int (trap->top);
+	    if (y1 < box.y1)
+		box.y1 = y1;
+	    
+	    y2 = pixman_fixed_to_int (pixman_fixed_ceil (trap->bottom));
+	    if (y2 > box.y2)
+		box.y2 = y2;
+	    
+#define EXTEND_MIN(x)							\
+	    if (pixman_fixed_to_int ((x)) < box.x1)			\
+		box.x1 = pixman_fixed_to_int ((x));
+#define EXTEND_MAX(x)							\
+	    if (pixman_fixed_to_int (pixman_fixed_ceil ((x))) > box.x2)	\
+		box.x2 = pixman_fixed_to_int (pixman_fixed_ceil ((x)));
+	    
+#define EXTEND(x)							\
+	    EXTEND_MIN(x);						\
+	    EXTEND_MAX(x);
+	    
+	    EXTEND(trap->left.p1.x);
+	    EXTEND(trap->left.p2.x);
+	    EXTEND(trap->right.p1.x);
+	    EXTEND(trap->right.p2.x);
+	}
+	/* No valid trapezoid, or an empty bounding box: nothing to draw. */
+	if (box.x1 >= box.x2 || box.y1 >= box.y2)
+	    return;
+	/* Mask image covering only the bounding box; give up if it cannot be created. */
+	if (!(tmp = pixman_image_create_bits (
+		  mask_format, box.x2 - box.x1, box.y2 - box.y1, NULL, -1)))
+	    return;
+	for (i = 0; i < n_traps; ++i)
+	{
+	    const pixman_trapezoid_t *trap = &(traps[i]);
+	    
+	    if (!pixman_trapezoid_valid (trap))
+		continue;
+	    
+	    pixman_rasterize_trapezoid (tmp, trap, - box.x1, - box.y1);
+	}
+	/* Composite src through the mask, restricted to the bounding box. */
+	pixman_image_composite (op, src, tmp, dst,
+				x_src + box.x1, y_src + box.y1,
+				0, 0,
+				x_dst + box.x1, y_dst + box.y1,
+				box.x2 - box.x1, box.y2 - box.y1);
+	
+	pixman_image_unref (tmp);
+    }
+}
+
+static int
+greater_y (const pixman_point_fixed_t *a, const pixman_point_fixed_t *b)
+{
+    /* Order by y first and break ties on x; nonzero when a sorts after b. */
+    return (a->y != b->y) ? (a->y > b->y)
+                          : (a->x > b->x);
+}
+
+/*
+ * Sign test on the cross product of (a - ref) and (b - ref).  The sense
+ * looks inverted because X coordinates have y increasing downwards.
+ */
+static int
+clockwise (const pixman_point_fixed_t *ref,
+	   const pixman_point_fixed_t *a,
+	   const pixman_point_fixed_t *b)
+{
+    pixman_point_fixed_t	to_a, to_b;
+    pixman_fixed_32_32_t	cross;
+
+    to_a.x = a->x - ref->x;
+    to_a.y = a->y - ref->y;
+    to_b.x = b->x - ref->x;
+    to_b.y = b->y - ref->y;
+    cross = (pixman_fixed_32_32_t) to_b.y * to_a.x - (pixman_fixed_32_32_t) to_a.y * to_b.x;
+    return cross < 0;
+}
+
+static void
+triangle_to_trapezoids (const pixman_triangle_t *tri, pixman_trapezoid_t *traps)
+{
+    const pixman_point_fixed_t *top, *left, *right, *tmp;
+    /* Splits tri into exactly two trapezoids, written to traps[0] and traps[1]. */
+    top = &tri->p1;
+    left = &tri->p2;
+    right = &tri->p3;
+    /* After these two swaps, top is the first vertex in greater_y() order. */
+    if (greater_y (top, left))
+    {
+	tmp = left;
+	left = top;
+	top = tmp;
+    }
+
+    if (greater_y (top, right))
+    {
+	tmp = right;
+	right = top;
+	top = tmp;
+    }
+    /* Exchange the other two vertices when clockwise() reports them reversed. */
+    if (clockwise (top, right, left))
+    {
+	tmp = right;
+	right = left;
+	left = tmp;
+    }
+    
+    /*
+     * Two cases:
+     *
+     *		+		+
+     *	       / \             / \
+     *	      /   \           /	  \
+     *	     /     +         +	   \
+     *      /    --           --    \
+     *     /   --               --   \
+     *    / ---                   --- \
+     *	 +--                         --+
+     */
+    /* First trapezoid: from top down to the nearer of the other two vertices. */
+    traps->top = top->y;
+    traps->left.p1 = *top;
+    traps->left.p2 = *left;
+    traps->right.p1 = *top;
+    traps->right.p2 = *right;
+
+    if (right->y < left->y)
+	traps->bottom = right->y;
+    else
+	traps->bottom = left->y;
+
+    traps++;
+    /* Second trapezoid starts as a copy; only its y range and one side change. */
+    *traps = *(traps - 1);
+    
+    if (right->y < left->y)
+    {
+	traps->top = right->y;
+	traps->bottom = left->y;
+	traps->right.p1 = *right;
+	traps->right.p2 = *left;
+    }
+    else
+    {
+	traps->top = left->y;
+	traps->bottom = right->y;
+	traps->left.p1 = *left;
+	traps->left.p2 = *right;
+    }
+}
+
+static pixman_trapezoid_t *
+convert_triangles (int n_tris, const pixman_triangle_t *tris)
+{
+    pixman_trapezoid_t *out = NULL;
+    int t;
+
+    /* Two trapezoids per triangle, in a malloc'ed array the caller frees.
+     * NULL means there was nothing to convert or the allocation failed.
+     */
+    if (n_tris > 0)
+	out = pixman_malloc_ab (n_tris, 2 * sizeof (pixman_trapezoid_t));
+
+    if (out != NULL)
+	for (t = 0; t < n_tris; t++)
+	    triangle_to_trapezoids (tris + t, &out[2 * t]);
+
+    return out;
+}
+
+PIXMAN_EXPORT void
+pixman_composite_triangles (pixman_op_t			op,
+			    pixman_image_t *		src,
+			    pixman_image_t *		dst,
+			    pixman_format_code_t	mask_format,
+			    int				x_src,
+			    int				y_src,
+			    int				x_dst,
+			    int				y_dst,
+			    int				n_tris,
+			    const pixman_triangle_t *	tris)
+{
+    /* Each triangle becomes two trapezoids; NULL means there is nothing to do. */
+    pixman_trapezoid_t *traps = convert_triangles (n_tris, tris);
+
+    if (traps == NULL)
+	return;
+
+    pixman_composite_trapezoids (op, src, dst, mask_format,
+				 x_src, y_src, x_dst, y_dst,
+				 n_tris * 2, traps);
+    free (traps);
+}
+
+PIXMAN_EXPORT void
+pixman_add_triangles (pixman_image_t          *image,
+		      int32_t	               x_off,
+		      int32_t	               y_off,
+		      int	               n_tris,
+		      const pixman_triangle_t *tris)
+{
+    /* Each triangle becomes two trapezoids; NULL means there is nothing to do.
+     * x_off is passed on to pixman_add_trapezoids()'s int16_t parameter. */
+    pixman_trapezoid_t *traps = convert_triangles (n_tris, tris);
+
+    if (traps == NULL)
+	return;
+    pixman_add_trapezoids (image, x_off, y_off,
+			   n_tris * 2, traps);
+    free (traps);
+}
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
new file mode 100644
index 0000000..d2af51a
--- /dev/null
+++ b/pixman/pixman-utils.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 1999 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "pixman-private.h"
+
+#define N_CACHED_FAST_PATHS 8
+/* Recently used fast paths; _pixman_lookup_composite_function keeps the latest in slot 0. */
+typedef struct
+{
+    struct
+    {
+	pixman_implementation_t *	imp;		/* implementation that supplied fast_path */
+	pixman_fast_path_t		fast_path;	/* exact lookup key plus the function found */
+    } cache [N_CACHED_FAST_PATHS];
+} cache_t;
+/* Presumably one cache instance per thread (macro is defined elsewhere) -- confirm. */
+PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache);
+
+pixman_bool_t
+_pixman_lookup_composite_function (pixman_implementation_t     *toplevel,
+				   pixman_op_t			op,
+				   pixman_format_code_t		src_format,
+				   uint32_t			src_flags,
+				   pixman_format_code_t		mask_format,
+				   uint32_t			mask_flags,
+				   pixman_format_code_t		dest_format,
+				   uint32_t			dest_flags,
+				   pixman_implementation_t    **out_imp,
+				   pixman_composite_func_t     *out_func)
+{
+    pixman_implementation_t *imp;
+    cache_t *cache;
+    int i;
+    /* On success stores the match in *out_imp / *out_func and returns TRUE. */
+    /* Check cache for fast paths */
+    cache = PIXMAN_GET_THREAD_LOCAL (fast_path_cache);
+
+    for (i = 0; i < N_CACHED_FAST_PATHS; ++i)
+    {
+	const pixman_fast_path_t *info = &(cache->cache[i].fast_path);
+
+	/* Note that we check for equality here, not whether
+	 * the cached fast path matches. This is to prevent
+	 * us from selecting an overly general fast path
+	 * when a more specific one would work.
+	 */
+	if (info->op == op			&&
+	    info->src_format == src_format	&&
+	    info->mask_format == mask_format	&&
+	    info->dest_format == dest_format	&&
+	    info->src_flags == src_flags	&&
+	    info->mask_flags == mask_flags	&&
+	    info->dest_flags == dest_flags	&&
+	    info->func)
+	{
+	    *out_imp = cache->cache[i].imp;
+	    *out_func = cache->cache[i].fast_path.func;
+
+	    goto update_cache;
+	}
+    }
+    /* Cache miss: scan each implementation's table, following the delegate chain. */
+    for (imp = toplevel; imp != NULL; imp = imp->delegate)
+    {
+	const pixman_fast_path_t *info = imp->fast_paths;
+
+	while (info->op != PIXMAN_OP_NONE)
+	{
+	    if ((info->op == op || info->op == PIXMAN_OP_any)		&&
+		/* Formats */
+		((info->src_format == src_format) ||
+		 (info->src_format == PIXMAN_any))			&&
+		((info->mask_format == mask_format) ||
+		 (info->mask_format == PIXMAN_any))			&&
+		((info->dest_format == dest_format) ||
+		 (info->dest_format == PIXMAN_any))			&&
+		/* Flags */
+		(info->src_flags & src_flags) == info->src_flags	&&
+		(info->mask_flags & mask_flags) == info->mask_flags	&&
+		(info->dest_flags & dest_flags) == info->dest_flags)
+	    {
+		*out_imp = imp;
+		*out_func = info->func;
+
+		/* Set i to the last spot in the cache so that the
+		 * move-to-front code below will work
+		 */
+		i = N_CACHED_FAST_PATHS - 1;
+
+		goto update_cache;
+	    }
+
+	    ++info;
+	}
+    }
+    return FALSE; /* no implementation offers a matching fast path */
+
+update_cache:
+    if (i) /* i == 0: the hit is already in slot 0, nothing to move */
+    {
+	while (i--)
+	    cache->cache[i + 1] = cache->cache[i];
+	/* Slots 0..i-1 moved down one; write the exact lookup key into slot 0. */
+	cache->cache[0].imp = *out_imp;
+	cache->cache[0].fast_path.op = op;
+	cache->cache[0].fast_path.src_format = src_format;
+	cache->cache[0].fast_path.src_flags = src_flags;
+	cache->cache[0].fast_path.mask_format = mask_format;
+	cache->cache[0].fast_path.mask_flags = mask_flags;
+	cache->cache[0].fast_path.dest_format = dest_format;
+	cache->cache[0].fast_path.dest_flags = dest_flags;
+	cache->cache[0].fast_path.func = *out_func;
+    }
+
+    return TRUE;
+}
+
+pixman_bool_t
+_pixman_multiply_overflows_size (size_t a, size_t b)
+{   /* Conservative "a * b exceeds SIZE_MAX" test; b == 0 never overflows. */
+    return b != 0 && a >= SIZE_MAX / b; /* b tested first: no division by zero */
+}
+
+pixman_bool_t
+_pixman_multiply_overflows_int (unsigned int a, unsigned int b)
+{   /* Conservative "a * b exceeds INT32_MAX" test; b == 0 never overflows. */
+    return b != 0 && a >= INT32_MAX / b; /* b tested first: no division by zero */
+}
+
+pixman_bool_t
+_pixman_addition_overflows_int (unsigned int a, unsigned int b)
+{   /* Nonzero when a exceeds INT32_MAX - b, i.e. a + b would pass INT32_MAX. */
+    return a > INT32_MAX - b; /* NOTE(review): the subtraction wraps if b > INT32_MAX */
+}
+
+void *
+pixman_malloc_ab (unsigned int a,
+                  unsigned int b)
+{
+    /* Overflow-checked malloc (a * b); b == 0 must not reach the division. */
+    if (b != 0 && a >= INT32_MAX / b)
+	return NULL;
+    return malloc (a * b);
+}
+
+void *
+pixman_malloc_abc (unsigned int a,
+                   unsigned int b,
+                   unsigned int c)
+{
+    /* Overflow-checked malloc (a * b * c); zero factors skip their division. */
+    if (b != 0 && a >= INT32_MAX / b)
+	return NULL;
+    if (c != 0 && a * b >= INT32_MAX / c)
+	return NULL;
+    return malloc (a * b * c);
+}
+
+/*
+ * This function expands images from ARGB8 format to ARGB16.  To preserve
+ * precision, it needs to know the original source format.  For example, if the
+ * source was PIXMAN_x1r5g5b5 and the red component contained bits 12345, then
+ * the expanded value is 12345123.  To correctly expand this to 16 bits, it
+ * should be 1234512345123451 and not 1234512312345123.
+ */
+void
+pixman_expand (uint64_t *           dst,
+               const uint32_t *     src,
+               pixman_format_code_t format,
+               int                  width)
+{
+    /*
+     * Determine the sizes of each component and the masks and shifts
+     * required to extract them from the source pixel.  The masks start
+     * from an unsigned ~0U because left-shifting a negative int is undefined.
+     */
+    const int a_size = PIXMAN_FORMAT_A (format),
+              r_size = PIXMAN_FORMAT_R (format),
+              g_size = PIXMAN_FORMAT_G (format),
+              b_size = PIXMAN_FORMAT_B (format);
+    const int a_shift = 32 - a_size,
+              r_shift = 24 - r_size,
+              g_shift = 16 - g_size,
+              b_shift =  8 - b_size;
+    const uint8_t a_mask = ~(~0U << a_size),
+                  r_mask = ~(~0U << r_size),
+                  g_mask = ~(~0U << g_size),
+                  b_mask = ~(~0U << b_size);
+    int i;
+    /* Start at the end so that we can do the expansion in place
+     * when src == dst
+     */
+    for (i = width - 1; i >= 0; i--)
+    {
+	const uint32_t pixel = src[i];
+	const uint8_t a = a_size ? (pixel >> a_shift) & a_mask : 0, /* a_shift is 32 without alpha */
+	              r = (pixel >> r_shift) & r_mask,
+	              g = (pixel >> g_shift) & g_mask,
+	              b = (pixel >> b_shift) & b_mask;
+	const uint64_t
+	    a16 = a_size ? unorm_to_unorm (a, a_size, 16) : 0xffff,
+	    r16 = unorm_to_unorm (r, r_size, 16),
+	    g16 = unorm_to_unorm (g, g_size, 16),
+	    b16 = unorm_to_unorm (b, b_size, 16);
+
+	dst[i] = a16 << 48 | r16 << 32 | g16 << 16 | b16;
+    }
+}
+
+/*
+ * Contracting is easier than expanding.  We just need to truncate the
+ * components.
+ */
+void
+pixman_contract (uint32_t *      dst,
+                 const uint64_t *src,
+                 int             width)
+{
+    int i;
+
+    /* Start at the beginning so that we can do the contraction in
+     * place when src == dst
+     */
+    for (i = 0; i < width; i++)
+    {
+	const uint8_t a = src[i] >> 56,
+	              r = src[i] >> 40,
+	              g = src[i] >> 24,
+	              b = src[i] >> 8;
+	/* Widen a first: a plain int shift by 24 overflows once a >= 0x80. */
+	dst[i] = (uint32_t) a << 24 | r << 16 | g << 8 | b;
+    }
+}
+
+uint32_t *
+_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask)
+{
+    return iter->buffer; /* hands back the iterator's buffer as-is; mask is unused */
+}
+
+#define N_TMP_BOXES (16)
+
+pixman_bool_t
+pixman_region16_copy_from_region32 (pixman_region16_t *dst,
+                                    pixman_region32_t *src)
+{
+    int n_boxes, i;
+    pixman_box32_t *boxes32;
+    pixman_box16_t *boxes16;
+    pixman_bool_t retval;
+
+    boxes32 = pixman_region32_rectangles (src, &n_boxes);
+    /* Ask for at least one box: malloc (0) may legitimately return NULL. */
+    boxes16 = pixman_malloc_ab (n_boxes ? n_boxes : 1, sizeof (pixman_box16_t));
+
+    if (!boxes16)
+	return FALSE;
+    /* Narrow each coordinate by plain assignment (no range check is done). */
+    for (i = 0; i < n_boxes; ++i)
+    {
+	boxes16[i].x1 = boxes32[i].x1;
+	boxes16[i].y1 = boxes32[i].y1;
+	boxes16[i].x2 = boxes32[i].x2;
+	boxes16[i].y2 = boxes32[i].y2;
+    }
+
+    pixman_region_fini (dst);
+    retval = pixman_region_init_rects (dst, boxes16, n_boxes);
+    free (boxes16);
+    return retval;
+}
+
+pixman_bool_t
+pixman_region32_copy_from_region16 (pixman_region32_t *dst,
+                                    pixman_region16_t *src)
+{
+    pixman_box32_t stack_boxes[N_TMP_BOXES];
+    pixman_box32_t *wide = stack_boxes;
+    pixman_box16_t *narrow;
+    pixman_bool_t ok;
+    int count, k;
+
+    narrow = pixman_region_rectangles (src, &count);
+
+    /* Up to N_TMP_BOXES boxes fit the on-stack array; more need the heap. */
+    if (count > N_TMP_BOXES)
+    {
+	wide = pixman_malloc_ab (count, sizeof (pixman_box32_t));
+	if (wide == NULL)
+	    return FALSE;
+    }
+
+    for (k = 0; k < count; k++)
+    {
+	wide[k].x1 = narrow[k].x1;
+	wide[k].y1 = narrow[k].y1;
+	wide[k].x2 = narrow[k].x2;
+	wide[k].y2 = narrow[k].y2;
+    }
+
+    /* Discard dst's previous contents, then rebuild it from the copied boxes. */
+    pixman_region32_fini (dst);
+    ok = pixman_region32_init_rects (dst, wide, count);
+
+    if (wide != stack_boxes)
+	free (wide);
+    return ok;
+}
+
+#ifdef DEBUG
+
+void
+_pixman_log_error (const char *function, const char *message)
+{
+    static int n_messages = 0;
+
+    /* Rate limit: only the first ten reports reach stderr. */
+    if (n_messages >= 10)
+	return;
+
+    n_messages++;
+    fprintf (stderr,
+	     "*** BUG ***\n"
+	     "In %s: %s\n"
+	     "Set a breakpoint on '_pixman_log_error' to debug\n\n",
+	     function, message);
+}
+
+#endif
diff --git a/pixman/pixman-version.h.in b/pixman/pixman-version.h.in
new file mode 100644
index 0000000..256b2e6
--- /dev/null
+++ b/pixman/pixman-version.h.in
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Author: Carl D. Worth <cworth@cworth.org>
+ */
+
+#ifndef PIXMAN_VERSION_H__
+#define PIXMAN_VERSION_H__
+
+#ifndef PIXMAN_H__
+#  error pixman-version.h should only be included by pixman.h
+#endif
+
+#define PIXMAN_VERSION_MAJOR @PIXMAN_VERSION_MAJOR@
+#define PIXMAN_VERSION_MINOR @PIXMAN_VERSION_MINOR@
+#define PIXMAN_VERSION_MICRO @PIXMAN_VERSION_MICRO@
+
+#define PIXMAN_VERSION_STRING "@PIXMAN_VERSION_MAJOR@.@PIXMAN_VERSION_MINOR@.@PIXMAN_VERSION_MICRO@"
+/* Pack a version triple into one integer: major * 10000 + minor * 100 + micro. */
+#define PIXMAN_VERSION_ENCODE(major, minor, micro) (	\
+	  ((major) * 10000)				\
+	+ ((minor) *   100)				\
+	+ ((micro) *     1))
+/* The version described by the three component macros, in encoded form. */
+#define PIXMAN_VERSION PIXMAN_VERSION_ENCODE(	\
+	PIXMAN_VERSION_MAJOR,			\
+	PIXMAN_VERSION_MINOR,			\
+	PIXMAN_VERSION_MICRO)
+
+#endif /* PIXMAN_VERSION_H__ */
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
new file mode 100644
index 0000000..6868704
--- /dev/null
+++ b/pixman/pixman-vmx.c
@@ -0,0 +1,1647 @@
+/*
+ * Copyright © 2007 Luca Barbato
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Luca Barbato not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Luca Barbato makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Luca Barbato (lu_zero@gentoo.org)
+ *
+ * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
+ */
+
+#include <config.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include <altivec.h>
+
+#define AVV(x...) {x} /* wraps its arguments in braces; used right after a vector cast */
+
+static force_inline vector unsigned int
+splat_alpha (vector unsigned int pix)
+{   /* copies byte 0 of each 32-bit pixel (taken as its alpha) into all four of its bytes */
+    return vec_perm (pix, pix,
+		     (vector unsigned char)AVV (
+			 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
+			 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
+}
+
+static force_inline vector unsigned int
+pix_multiply (vector unsigned int p, vector unsigned int a)
+{
+    vector unsigned short hi, lo, mod;
+    /* Bytewise p * a / 255 with rounding, computed in 16-bit lanes.  First
+     * half: vec_mergeh interleaves zero bytes with bytes of p and of a. */
+    hi = (vector unsigned short)
+	vec_mergeh ((vector unsigned char)AVV (0),
+		    (vector unsigned char)p);
+
+    mod = (vector unsigned short)
+	vec_mergeh ((vector unsigned char)AVV (0),
+		    (vector unsigned char)a);
+    /* t = p * a + 0x80 in every 16-bit lane */
+    hi = vec_mladd (hi, mod, (vector unsigned short)
+                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
+                         0x0080, 0x0080, 0x0080, 0x0080));
+    /* (t + (t >> 8)) >> 8 yields the rounded quotient of p * a by 255 */
+    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
+
+    hi = vec_sr (hi, vec_splat_u16 (8));
+
+    /* second half: the same steps on the bytes selected by vec_mergel */
+    lo = (vector unsigned short)
+	vec_mergel ((vector unsigned char)AVV (0),
+		    (vector unsigned char)p);
+    mod = (vector unsigned short)
+	vec_mergel ((vector unsigned char)AVV (0),
+		    (vector unsigned char)a);
+
+    lo = vec_mladd (lo, mod, (vector unsigned short)
+                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
+                         0x0080, 0x0080, 0x0080, 0x0080));
+
+    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
+
+    lo = vec_sr (lo, vec_splat_u16 (8));
+    /* narrow both halves back to bytes (vec_packsu saturates unsigned) */
+    return (vector unsigned int)vec_packsu (hi, lo);
+}
+
+static force_inline vector unsigned int
+pix_add (vector unsigned int a, vector unsigned int b)
+{   /* bytewise saturating a + b: vec_adds applied to the unsigned char view */
+    return (vector unsigned int)vec_adds ((vector unsigned char)a,
+                                          (vector unsigned char)b);
+}
+
+/* Returns x * a + y * b: two pix_multiply products joined by pix_add. */
+static force_inline vector unsigned int
+pix_add_mul (vector unsigned int x,
+             vector unsigned int a,
+             vector unsigned int y,
+             vector unsigned int b)
+{
+    /* Each product gets its own named value, x * a before y * b, and
+     * only then are the two combined. */
+    vector unsigned int xa = pix_multiply (x, a);
+    vector unsigned int yb = pix_multiply (y, b);
+    return pix_add (xa, yb);
+}
+
+static force_inline vector unsigned int
+negate (vector unsigned int src)
+{   /* bitwise complement of src, written as the NOR of src with itself */
+    return vec_nor (src, src);
+}
+
+/* OVER: src + dest * ~srca (product via pix_multiply, sum via pix_add) */
+static force_inline vector unsigned int
+over (vector unsigned int src,
+      vector unsigned int srca,
+      vector unsigned int dest)
+{
+    /* what is left of dest once it is scaled by the inverted source alpha */
+    vector unsigned int remaining = pix_multiply (dest, negate (srca));
+
+    /* pix_add performs the bytewise saturating sum */
+    return pix_add (src, remaining);
+}
+
+/* (src IN mask) OVER dest, where IN is a pix_multiply by the mask */
+#define in_over(src, srca, mask, dest)					\
+    over (pix_multiply (src, mask),					\
+          pix_multiply (srca, mask), dest)
+
+/* Permute controls: <name>_mask comes from vec_lvsl, store_mask from vec_lvsr on dest. */
+#define COMPUTE_SHIFT_MASK(source)					\
+    source ## _mask = vec_lvsl (0, source);
+
+#define COMPUTE_SHIFT_MASKS(dest, source)				\
+    dest ## _mask = vec_lvsl (0, dest);					\
+    source ## _mask = vec_lvsl (0, source);				\
+    store_mask = vec_lvsr (0, dest);
+
+#define COMPUTE_SHIFT_MASKC(dest, source, mask)				\
+    mask ## _mask = vec_lvsl (0, mask);					\
+    dest ## _mask = vec_lvsl (0, dest);					\
+    source ## _mask = vec_lvsl (0, source);				\
+    store_mask = vec_lvsr (0, dest);
+
+/* The macros below use, but do not declare, tmp1..tmp4, edges, the *_mask
+ * vectors and v<name>.  STORE_VECTOR reuses the tmp3/tmp4 set by the load
+ * macros, so nothing may overwrite them between a load and its store. */
+/* Load 16 possibly unaligned bytes from source and from dest into v<source> and v<dest>. */
+#define LOAD_VECTORS(dest, source)			  \
+    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
+    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
+    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
+    v ## source = (typeof(v ## source))			  \
+	vec_perm (tmp1, tmp2, source ## _mask);		  \
+    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
+    v ## dest = (typeof(v ## dest))			  \
+	vec_perm (tmp3, tmp4, dest ## _mask);
+/* As LOAD_VECTORS, and additionally loads v<mask>, reusing tmp1/tmp2 for it. */
+#define LOAD_VECTORSC(dest, source, mask)		  \
+    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
+    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
+    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
+    v ## source = (typeof(v ## source))			  \
+	vec_perm (tmp1, tmp2, source ## _mask);		  \
+    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
+    tmp1 = (typeof(tmp1))vec_ld (0, mask);		  \
+    v ## dest = (typeof(v ## dest))			  \
+	vec_perm (tmp3, tmp4, dest ## _mask);		  \
+    tmp2 = (typeof(tmp2))vec_ld (15, mask);		  \
+    v ## mask = (typeof(v ## mask))			  \
+	vec_perm (tmp1, tmp2, mask ## _mask);
+/* As LOAD_VECTORSC, then v<source> is multiplied by the splatted alpha of v<mask>. */
+#define LOAD_VECTORSM(dest, source, mask)				\
+    LOAD_VECTORSC (dest, source, mask)					\
+    v ## source = pix_multiply (v ## source,				\
+                                splat_alpha (v ## mask));
+/* Writes v<dest> back at dest; the other bytes of the two quadwords stored are refilled from tmp3/tmp4. */
+#define STORE_VECTOR(dest)						\
+    edges = vec_perm (tmp4, tmp3, dest ## _mask);			\
+    tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
+    tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
+    vec_st ((vector unsigned int) tmp3, 15, dest);			\
+    vec_st ((vector unsigned int) tmp1, 0, dest);
+
+static void
+vmx_combine_over_u_no_mask (uint32_t *      dest,
+                            const uint32_t *src,
+                            int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* vector loop, 4 pixels per pass: dest = src + dest * ~alpha(src) */
+    for (i = width / 4; i > 0; i--)
+    {
+
+	LOAD_VECTORS (dest, src);
+
+	vdest = over (vsrc, splat_alpha (vsrc), vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_over_u_mask (uint32_t *      dest,
+                         const uint32_t *src,
+                         const uint32_t *mask,
+                         int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: src *= alpha(mask), then dest = src + dest * ~alpha(src) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = over (vsrc, splat_alpha (vsrc), vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+	dest[i] = d;
+    }
+}
+
+/* OVER combiner entry point: selects the row worker by mask presence. */
+static void
+vmx_combine_over_u (pixman_implementation_t *imp, pixman_op_t op,
+                    uint32_t *dest, const uint32_t *src,
+                    const uint32_t *mask, int width)
+{
+    /* imp and op are not consulted here; a null mask routes to the
+     * maskless worker, anything else to the masked one. */
+    if (!mask)
+	vmx_combine_over_u_no_mask (dest, src, width);
+    else
+	vmx_combine_over_u_mask (dest, src, mask, width);
+}
+
+static void
+vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
+                                    const uint32_t *src,
+                                    int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* vector loop, 4 pixels per pass: dest = dest + src * ~alpha(dest) */
+    for (i = width / 4; i > 0; i--)
+    {
+
+	LOAD_VECTORS (dest, src);
+
+	vdest = over (vdest, splat_alpha (vdest), vsrc);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t ia = ALPHA_8 (~dest[i]);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_over_reverse_u_mask (uint32_t *      dest,
+                                 const uint32_t *src,
+                                 const uint32_t *mask,
+                                 int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: src *= alpha(mask), then dest = dest + src * ~alpha(dest) */
+    for (i = width / 4; i > 0; i--)
+    {
+
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = over (vdest, splat_alpha (vdest), vsrc);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t ia = ALPHA_8 (~dest[i]);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	dest[i] = s;
+    }
+}
+
+/* OVER_REVERSE combiner entry point: selects the row worker by mask presence. */
+static void
+vmx_combine_over_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                            uint32_t *dest, const uint32_t *src,
+                            const uint32_t *mask, int width)
+{
+    /* imp and op are not consulted here; a null mask routes to the
+     * maskless worker, anything else to the masked one. */
+    if (!mask)
+	vmx_combine_over_reverse_u_no_mask (dest, src, width);
+    else
+	vmx_combine_over_reverse_u_mask (dest, src, mask, width);
+}
+
+static void
+vmx_combine_in_u_no_mask (uint32_t *      dest,
+                          const uint32_t *src,
+                          int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* vector loop, 4 pixels per pass: dest = src * alpha(dest) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_multiply (vsrc, splat_alpha (vdest));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t a = ALPHA_8 (dest[i]);
+
+	UN8x4_MUL_UN8 (s, a);
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_u_mask (uint32_t *      dest,
+                       const uint32_t *src,
+                       const uint32_t *mask,
+                       int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: src *= alpha(mask), then dest = src * alpha(dest) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_multiply (vsrc, splat_alpha (vdest));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t a = ALPHA_8 (dest[i]);
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	dest[i] = s;
+    }
+}
+
+/* IN combiner entry point: selects the row worker by mask presence. */
+static void
+vmx_combine_in_u (pixman_implementation_t *imp, pixman_op_t op,
+                  uint32_t *dest, const uint32_t *src,
+                  const uint32_t *mask, int width)
+{
+    /* imp and op are not consulted here; a null mask routes to the
+     * maskless worker, anything else to the masked one. */
+    if (!mask)
+	vmx_combine_in_u_no_mask (dest, src, width);
+    else
+	vmx_combine_in_u_mask (dest, src, mask, width);
+}
+
+static void
+vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
+                                  const uint32_t *src,
+                                  int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* vector loop, 4 pixels per pass: dest = dest * alpha(src) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_multiply (vdest, splat_alpha (vsrc));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t d = dest[i];
+	uint32_t a = ALPHA_8 (src[i]);
+
+	UN8x4_MUL_UN8 (d, a);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_in_reverse_u_mask (uint32_t *      dest,
+                               const uint32_t *src,
+                               const uint32_t *mask,
+                               int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: src *= alpha(mask), then dest = dest * alpha(src) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_multiply (vdest, splat_alpha (vsrc));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t d = dest[i];
+	uint32_t a = src[i];
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (a);
+	UN8x4_MUL_UN8 (d, a);
+
+	dest[i] = d;
+    }
+}
+
+/* IN_REVERSE combiner entry point: selects the row worker by mask presence. */
+static void
+vmx_combine_in_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                          uint32_t *dest, const uint32_t *src,
+                          const uint32_t *mask, int width)
+{
+    /* imp and op are not consulted here; a null mask routes to the
+     * maskless worker, anything else to the masked one. */
+    if (!mask)
+	vmx_combine_in_reverse_u_no_mask (dest, src, width);
+    else
+	vmx_combine_in_reverse_u_mask (dest, src, mask, width);
+}
+
+static void
+vmx_combine_out_u_no_mask (uint32_t *      dest,
+                           const uint32_t *src,
+                           int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* vector loop, 4 pixels per pass: dest = src * alpha(~dest) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t a = ALPHA_8 (~dest[i]);
+
+	UN8x4_MUL_UN8 (s, a);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_out_u_mask (uint32_t *      dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: src *= alpha(mask), then dest = src * alpha(~dest) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t a = ALPHA_8 (~dest[i]);
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	dest[i] = s;
+    }
+}
+
+/* OUT combiner entry point: selects the row worker by mask presence. */
+static void
+vmx_combine_out_u (pixman_implementation_t *imp, pixman_op_t op,
+                   uint32_t *dest, const uint32_t *src,
+                   const uint32_t *mask, int width)
+{
+    /* imp and op are not consulted here; a null mask routes to the
+     * maskless worker, anything else to the masked one. */
+    if (!mask)
+	vmx_combine_out_u_no_mask (dest, src, width);
+    else
+	vmx_combine_out_u_mask (dest, src, mask, width);
+}
+
+static void
+vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
+                                   const uint32_t *src,
+                                   int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* vector loop, 4 pixels per pass: dest = dest * alpha(~src) */
+    for (i = width / 4; i > 0; i--)
+    {
+
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t d = dest[i];
+	uint32_t a = ALPHA_8 (~src[i]);
+
+	UN8x4_MUL_UN8 (d, a);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_out_reverse_u_mask (uint32_t *      dest,
+                                const uint32_t *src,
+                                const uint32_t *mask,
+                                int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: src *= alpha(mask), then dest = dest * alpha(~src) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t d = dest[i];
+	uint32_t a = src[i];
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (~a);
+	UN8x4_MUL_UN8 (d, a);
+
+	dest[i] = d;
+    }
+}
+
+/* OUT_REVERSE combiner entry point: selects the row worker by mask presence. */
+static void
+vmx_combine_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                           uint32_t *dest, const uint32_t *src,
+                           const uint32_t *mask, int width)
+{
+    /* imp and op are not consulted here; a null mask routes to the
+     * maskless worker, anything else to the masked one. */
+    if (!mask)
+	vmx_combine_out_reverse_u_no_mask (dest, src, width);
+    else
+	vmx_combine_out_reverse_u_mask (dest, src, mask, width);
+}
+
+static void
+vmx_combine_atop_u_no_mask (uint32_t *      dest,
+                            const uint32_t *src,
+                            int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* vector loop, 4 pixels per pass: dest = src * alpha(dest) + dest * alpha(~src) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+			     vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_atop_u_mask (uint32_t *      dest,
+                         const uint32_t *src,
+                         const uint32_t *mask,
+                         int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: src *= alpha(mask), then dest = src * alpha(dest) + dest * alpha(~src) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+			     vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	dest[i] = s;
+    }
+}
+
+/* ATOP combiner entry point: selects the row worker by mask presence. */
+static void
+vmx_combine_atop_u (pixman_implementation_t *imp, pixman_op_t op,
+                    uint32_t *dest, const uint32_t *src,
+                    const uint32_t *mask, int width)
+{
+    /* imp and op are not consulted here; a null mask routes to the
+     * maskless worker, anything else to the masked one. */
+    if (!mask)
+	vmx_combine_atop_u_no_mask (dest, src, width);
+    else
+	vmx_combine_atop_u_mask (dest, src, mask, width);
+}
+
+static void
+vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
+                                    const uint32_t *src,
+                                    int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* vector loop, 4 pixels per pass: dest = dest * alpha(src) + src * alpha(~dest) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
+			     vsrc, splat_alpha (negate (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t src_a = ALPHA_8 (s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
+                                 const uint32_t *src,
+                                 const uint32_t *mask,
+                                 int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: src *= alpha(mask), then dest = dest * alpha(src) + src * alpha(~dest) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
+			     vsrc, splat_alpha (negate (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t src_a;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_a = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	dest[i] = s;
+    }
+}
+
+/* ATOP_REVERSE combiner entry point: selects the row worker by mask presence. */
+static void
+vmx_combine_atop_reverse_u (pixman_implementation_t *imp, pixman_op_t op,
+                            uint32_t *dest, const uint32_t *src,
+                            const uint32_t *mask, int width)
+{
+    /* imp and op are not consulted here; a null mask routes to the
+     * maskless worker, anything else to the masked one. */
+    if (!mask)
+	vmx_combine_atop_reverse_u_no_mask (dest, src, width);
+    else
+	vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
+}
+
+static void
+vmx_combine_xor_u_no_mask (uint32_t *      dest,
+                           const uint32_t *src,
+                           int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* vector loop, 4 pixels per pass: dest = src * alpha(~dest) + dest * alpha(~src) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+			     vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t src_ia = ALPHA_8 (~s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_xor_u_mask (uint32_t *      dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: src *= alpha(mask), then dest = src * alpha(~dest) + dest * alpha(~src) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+			     vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t src_ia;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	dest[i] = s;
+    }
+}
+
+/* XOR combiner entry point: selects the row worker by mask presence. */
+static void
+vmx_combine_xor_u (pixman_implementation_t *imp, pixman_op_t op,
+                   uint32_t *dest, const uint32_t *src,
+                   const uint32_t *mask, int width)
+{
+    /* imp and op are not consulted here; a null mask routes to the
+     * maskless worker, anything else to the masked one. */
+    if (!mask)
+	vmx_combine_xor_u_no_mask (dest, src, width);
+    else
+	vmx_combine_xor_u_mask (dest, src, mask, width);
+}
+
+static void
+vmx_combine_add_u_no_mask (uint32_t *      dest,
+                           const uint32_t *src,
+                           int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+    /* vector loop, 4 pixels per pass: dest = src + dest (pix_add, bytewise saturating) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_add (vsrc, vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+
+	UN8x4_ADD_UN8x4 (d, s);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_add_u_mask (uint32_t *      dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: src *= alpha(mask), then dest = src + dest (saturating) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_add (vsrc, vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_ADD_UN8x4 (d, s);
+
+	dest[i] = d;
+    }
+}
+
+/* ADD combiner entry point: selects the row worker by mask presence. */
+static void
+vmx_combine_add_u (pixman_implementation_t *imp, pixman_op_t op,
+                   uint32_t *dest, const uint32_t *src,
+                   const uint32_t *mask, int width)
+{
+    /* imp and op are not consulted here; a null mask routes to the
+     * maskless worker, anything else to the masked one. */
+    if (!mask)
+	vmx_combine_add_u_no_mask (dest, src, width);
+    else
+	vmx_combine_add_u_mask (dest, src, mask, width);
+}
+
+static void
+vmx_combine_src_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: dest = src * mask, byte by byte (imp and op are unused) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_multiply (vsrc, vmask);
+
+	STORE_VECTOR (dest);
+
+	mask += 4;
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+
+	UN8x4_MUL_UN8x4 (s, a);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_over_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: dest = src * mask + dest * ~(alpha(src) * mask) (imp and op are unused) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
+
+	STORE_VECTOR (dest);
+
+	mask += 4;
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: dest = dest + (src * mask) * ~alpha(dest) (imp and op are unused) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
+
+	STORE_VECTOR (dest);
+
+	mask += 4;
+	src += 4;
+	dest += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t ida = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_ca (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* vector loop, 4 pixels per pass: dest = (src * mask) * alpha(dest) (imp and op are unused) */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+    /* remaining width % 4 pixels, one per iteration, from the highest index down */
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t da = ALPHA_8 (dest[i]);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, /* not named below; presumably used inside the macros */
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* Installed as combine_32_ca[PIXMAN_OP_IN_REVERSE]; this loop handles width / 4 groups of four pixels. */
+    for (i = width / 4; i > 0; i--)
+    {
+
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;) /* scalar tail: the width % 4 leftover pixels, last one first */
+    {
+	uint32_t a = mask[i]; /* the pointers now sit just past the vector-processed pixels */
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (src[i]);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, a);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_out_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, /* not named below; presumably used inside the macros */
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* Installed as combine_32_ca[PIXMAN_OP_OUT]; this loop handles width / 4 groups of four pixels. */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_multiply (
+	    pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;) /* scalar tail: the width % 4 leftover pixels, last one first */
+    {
+	uint32_t a = mask[i]; /* the pointers now sit just past the vector-processed pixels */
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t da = ALPHA_8 (~d); /* taken from the bitwise complement of the dest pixel */
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, /* not named below; presumably used inside the macros */
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* Installed as combine_32_ca[PIXMAN_OP_OUT_REVERSE]; this loop handles width / 4 groups of four pixels. */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_multiply (
+	    vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;) /* scalar tail: the width % 4 leftover pixels, last one first */
+    {
+	uint32_t a = mask[i]; /* the pointers now sit just past the vector-processed pixels */
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, ~a);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_atop_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask, vsrca;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, /* not named below; presumably used inside the macros */
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* Installed as combine_32_ca[PIXMAN_OP_ATOP]; this loop handles width / 4 groups of four pixels. */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vsrca = splat_alpha (vsrc); /* taken before vsrc is overwritten on the next statement */
+
+	vsrc = pix_multiply (vsrc, vmask);
+	vmask = pix_multiply (vmask, vsrca);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+			     negate (vmask), vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;) /* scalar tail: the width % 4 leftover pixels, last one first */
+    {
+	uint32_t a = mask[i]; /* the pointers now sit just past the vector-processed pixels */
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s); /* read before s is modified below */
+	uint32_t da = ALPHA_8 (d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, /* not named below; presumably used inside the macros */
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* Installed as combine_32_ca[PIXMAN_OP_ATOP_REVERSE]; this loop handles width / 4 groups of four pixels. */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_add_mul (vdest,
+			     pix_multiply (vmask, splat_alpha (vsrc)),
+			     pix_multiply (vsrc, vmask),
+			     negate (splat_alpha (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;) /* scalar tail: the width % 4 leftover pixels, last one first */
+    {
+	uint32_t a = mask[i]; /* the pointers now sit just past the vector-processed pixels */
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s); /* read before s is modified below */
+	uint32_t da = ALPHA_8 (~d); /* despite the name, taken from the complemented dest pixel */
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_xor_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, /* not named below; presumably used inside the macros */
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* Installed as combine_32_ca[PIXMAN_OP_XOR]; this loop handles width / 4 groups of four pixels. */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_add_mul (vdest,
+			     negate (pix_multiply (vmask, splat_alpha (vsrc))),
+			     pix_multiply (vsrc, vmask),
+			     negate (splat_alpha (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;) /* scalar tail: the width % 4 leftover pixels, last one first */
+    {
+	uint32_t a = mask[i]; /* the pointers now sit just past the vector-processed pixels */
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s); /* read before s is modified below */
+	uint32_t da = ALPHA_8 (~d); /* despite the name, taken from the complemented dest pixel */
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_add_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, /* not named below; presumably used inside the macros */
+	dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* Installed as combine_32_ca[PIXMAN_OP_ADD]; this loop handles width / 4 groups of four pixels. */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;) /* scalar tail: the width % 4 leftover pixels, last one first */
+    {
+	uint32_t a = mask[i]; /* the pointers now sit just past the vector-processed pixels */
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_ADD_UN8x4 (s, d);
+
+	dest[i] = s;
+    }
+}
+
+static const pixman_fast_path_t vmx_fast_paths[] =
+{ /* handed to _pixman_implementation_create() by _pixman_implementation_create_vmx() */
+    {   PIXMAN_OP_NONE	}, /* sole entry; presumably the end-of-table marker, i.e. no VMX fast paths */
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
+    /* NOTE(review): imp is used below without a NULL check — confirm the create call cannot fail. */
+    /* Install the VMX combiners in the combine_32 table; ops not assigned here are left untouched. */
+
+    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;
+
+    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;
+    /* Same for the combine_32_ca table; PIXMAN_OP_SRC is only overridden in this table. */
+    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
+
+    return imp;
+}
diff --git a/pixman/pixman.c b/pixman/pixman.c
new file mode 100644
index 0000000..8fb5356
--- /dev/null
+++ b/pixman/pixman.c
@@ -0,0 +1,1140 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+static pixman_implementation_t *global_implementation;
+/* Set at load time by the constructor below when available; otherwise lazily in get_implementation(). */
+#ifdef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR
+static void __attribute__((constructor))
+pixman_constructor (void)
+{
+    global_implementation = _pixman_choose_implementation ();
+}
+#endif
+
+static force_inline pixman_implementation_t *
+get_implementation (void)
+{ /* Returns the file-wide implementation pointer, choosing it first if that has not happened yet. */
+#ifndef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR
+    if (!global_implementation) /* no constructor support: choose on first use */
+	global_implementation = _pixman_choose_implementation ();
+#endif
+    return global_implementation;
+}
+
+typedef struct operator_info_t operator_info_t;
+/* One operator_table row: four operator codes, one of which optimize_operator() picks by opacity. */
+struct operator_info_t
+{
+    uint8_t	opaque_info[4]; /* in PACK order: neither, src, dest, both opaque */
+};
+/* PACK builds one row from four PIXMAN_OP_ name suffixes, each narrowed to uint8_t. */
+#define PACK(neither, src, dest, both)			\
+    {{	    (uint8_t)PIXMAN_OP_ ## neither,		\
+	    (uint8_t)PIXMAN_OP_ ## src,			\
+	    (uint8_t)PIXMAN_OP_ ## dest,		\
+	    (uint8_t)PIXMAN_OP_ ## both		}}
+
+static const operator_info_t operator_table[] =
+{
+    /*    Neither Opaque         Src Opaque             Dst Opaque             Both Opaque */
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (OVER,                  SRC,                   OVER,                  SRC),
+    PACK (OVER_REVERSE,          OVER_REVERSE,          DST,                   DST),
+    PACK (IN,                    IN,                    SRC,                   SRC),
+    PACK (IN_REVERSE,            DST,                   IN_REVERSE,            DST),
+    PACK (OUT,                   OUT,                   CLEAR,                 CLEAR),
+    PACK (OUT_REVERSE,           CLEAR,                 OUT_REVERSE,           CLEAR),
+    PACK (ATOP,                  IN,                    OVER,                  SRC),
+    PACK (ATOP_REVERSE,          OVER_REVERSE,          IN_REVERSE,            DST),
+    PACK (XOR,                   OUT,                   OUT_REVERSE,           CLEAR),
+    PACK (ADD,                   ADD,                   ADD,                   ADD),
+    PACK (SATURATE,              OVER_REVERSE,          DST,                   DST),
+    /* Zero-filled rows: presumably padding so that later rows line up with their pixman_op_t values. */
+    {{ 0 /* 0x0e */ }},
+    {{ 0 /* 0x0f */ }},
+    /* From here on (except the padding rows) all four columns repeat the row's own operator. */
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (DISJOINT_OVER,         DISJOINT_OVER,         DISJOINT_OVER,         DISJOINT_OVER),
+    PACK (DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE),
+    PACK (DISJOINT_IN,           DISJOINT_IN,           DISJOINT_IN,           DISJOINT_IN),
+    PACK (DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE),
+    PACK (DISJOINT_OUT,          DISJOINT_OUT,          DISJOINT_OUT,          DISJOINT_OUT),
+    PACK (DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE),
+    PACK (DISJOINT_ATOP,         DISJOINT_ATOP,         DISJOINT_ATOP,         DISJOINT_ATOP),
+    PACK (DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE),
+    PACK (DISJOINT_XOR,          DISJOINT_XOR,          DISJOINT_XOR,          DISJOINT_XOR),
+
+    {{ 0 /* 0x1c */ }},
+    {{ 0 /* 0x1d */ }},
+    {{ 0 /* 0x1e */ }},
+    {{ 0 /* 0x1f */ }},
+
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (CONJOINT_OVER,         CONJOINT_OVER,         CONJOINT_OVER,         CONJOINT_OVER),
+    PACK (CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE),
+    PACK (CONJOINT_IN,           CONJOINT_IN,           CONJOINT_IN,           CONJOINT_IN),
+    PACK (CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE),
+    PACK (CONJOINT_OUT,          CONJOINT_OUT,          CONJOINT_OUT,          CONJOINT_OUT),
+    PACK (CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE),
+    PACK (CONJOINT_ATOP,         CONJOINT_ATOP,         CONJOINT_ATOP,         CONJOINT_ATOP),
+    PACK (CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE),
+    PACK (CONJOINT_XOR,          CONJOINT_XOR,          CONJOINT_XOR,          CONJOINT_XOR),
+
+    {{ 0 /* 0x2c */ }},
+    {{ 0 /* 0x2d */ }},
+    {{ 0 /* 0x2e */ }},
+    {{ 0 /* 0x2f */ }},
+
+    PACK (MULTIPLY,              MULTIPLY,              MULTIPLY,              MULTIPLY),
+    PACK (SCREEN,                SCREEN,                SCREEN,                SCREEN),
+    PACK (OVERLAY,               OVERLAY,               OVERLAY,               OVERLAY),
+    PACK (DARKEN,                DARKEN,                DARKEN,                DARKEN),
+    PACK (LIGHTEN,               LIGHTEN,               LIGHTEN,               LIGHTEN),
+    PACK (COLOR_DODGE,           COLOR_DODGE,           COLOR_DODGE,           COLOR_DODGE),
+    PACK (COLOR_BURN,            COLOR_BURN,            COLOR_BURN,            COLOR_BURN),
+    PACK (HARD_LIGHT,            HARD_LIGHT,            HARD_LIGHT,            HARD_LIGHT),
+    PACK (SOFT_LIGHT,            SOFT_LIGHT,            SOFT_LIGHT,            SOFT_LIGHT),
+    PACK (DIFFERENCE,            DIFFERENCE,            DIFFERENCE,            DIFFERENCE),
+    PACK (EXCLUSION,             EXCLUSION,             EXCLUSION,             EXCLUSION),
+    PACK (HSL_HUE,               HSL_HUE,               HSL_HUE,               HSL_HUE),
+    PACK (HSL_SATURATION,        HSL_SATURATION,        HSL_SATURATION,        HSL_SATURATION),
+    PACK (HSL_COLOR,             HSL_COLOR,             HSL_COLOR,             HSL_COLOR),
+    PACK (HSL_LUMINOSITY,        HSL_LUMINOSITY,        HSL_LUMINOSITY,        HSL_LUMINOSITY),
+};
+
+/*
+ * Replace op with the operator that operator_table lists for the current
+ * opacity of the source (combined with the mask) and of the destination.
+ */
+static pixman_op_t
+optimize_operator (pixman_op_t     op,
+		   uint32_t        src_flags,
+		   uint32_t        mask_flags,
+		   uint32_t        dst_flags)
+{
+    uint32_t src_bit, dest_bit;
+
+#define OPAQUE_SHIFT 13
+
+    COMPILE_TIME_ASSERT (FAST_PATH_IS_OPAQUE == (1 << OPAQUE_SHIFT));
+
+    /* Column index: bit 0 = source and mask both opaque, bit 1 = destination opaque. */
+    src_bit = ((src_flags & mask_flags) & FAST_PATH_IS_OPAQUE) >> OPAQUE_SHIFT;
+    dest_bit = (dst_flags & FAST_PATH_IS_OPAQUE) >> (OPAQUE_SHIFT - 1);
+
+    /* The columns are laid out in PACK order: neither, src, dest, both. */
+
+    return operator_table[op].opaque_info[dest_bit | src_bit];
+}
+
+/*
+ * Computing composite region
+ */
+static inline pixman_bool_t
+clip_general_image (pixman_region32_t * region,
+                    pixman_region32_t * clip,
+                    int                 dx,
+                    int                 dy)
+{
+    if (pixman_region32_n_rects (region) == 1 &&
+        pixman_region32_n_rects (clip) == 1)
+    {
+	/* One box on each side: clamp the region box to the clip box shifted by (dx, dy). */
+	pixman_box32_t *target = pixman_region32_rectangles (region, NULL);
+	pixman_box32_t *bound = pixman_region32_rectangles (clip, NULL);
+	int edge;
+	edge = bound->x1 + dx;
+	if (target->x1 < edge)
+	    target->x1 = edge;
+	edge = bound->x2 + dx;
+	if (target->x2 > edge)
+	    target->x2 = edge;
+	edge = bound->y1 + dy;
+	if (target->y1 < edge)
+	    target->y1 = edge;
+	edge = bound->y2 + dy;
+	if (target->y2 > edge)
+	    target->y2 = edge;
+	if (target->x1 >= target->x2 || target->y1 >= target->y2)
+	{
+	    pixman_region32_init (region);
+	    return FALSE;
+	}
+	return pixman_region32_not_empty (region);
+    }
+
+    if (!pixman_region32_not_empty (clip))
+	return FALSE;
+    /* General case: shift into clip space, intersect there, then shift the result back. */
+    if (dx || dy)
+	pixman_region32_translate (region, -dx, -dy);
+    if (!pixman_region32_intersect (region, region, clip))
+	return FALSE;
+    if (dx || dy)
+	pixman_region32_translate (region, dx, dy);
+
+    return pixman_region32_not_empty (region);
+}
+
+static inline pixman_bool_t
+clip_source_image (pixman_region32_t * region,
+                   pixman_image_t *    image,
+                   int                 dx,
+                   int                 dy)
+{
+    pixman_region32_t *source_clip = &image->common.clip_region;
+
+    /* Clips on source images are skipped unless clip_sources is enabled
+     * and the clip was set by a client (client_clip): a clip that was not
+     * set by a client is a hierarchy clip, and those never apply to
+     * sources. Skipping leaves the region untouched and reports TRUE.
+     */
+    if (image->common.clip_sources && image->common.client_clip)
+	return clip_general_image (region, source_clip, dx, dy);
+
+    return TRUE;
+}
+
+/*
+ * Returns FALSE if the final region is empty. That is indistinguishable from
+ * an allocation failure, but rendering ignores those anyway.
+ */
+static pixman_bool_t
+pixman_compute_composite_region32 (pixman_region32_t * region,
+                                   pixman_image_t *    src_image,
+                                   pixman_image_t *    mask_image,
+                                   pixman_image_t *    dest_image,
+                                   int32_t             src_x,
+                                   int32_t             src_y,
+                                   int32_t             mask_x,
+                                   int32_t             mask_y,
+                                   int32_t             dest_x,
+                                   int32_t             dest_y,
+                                   int32_t             width,
+                                   int32_t             height)
+{
+    region->extents.x1 = dest_x; /* start from the requested destination rectangle */
+    region->extents.x2 = dest_x + width;
+    region->extents.y1 = dest_y;
+    region->extents.y2 = dest_y + height;
+
+    region->extents.x1 = MAX (region->extents.x1, 0); /* then clamp it to the destination image bounds */
+    region->extents.y1 = MAX (region->extents.y1, 0);
+    region->extents.x2 = MIN (region->extents.x2, dest_image->bits.width);
+    region->extents.y2 = MIN (region->extents.y2, dest_image->bits.height);
+
+    region->data = 0; /* presumably marks the region as just its extents box — confirm in the region code */
+
+    /* Check for empty operation */
+    if (region->extents.x1 >= region->extents.x2 ||
+        region->extents.y1 >= region->extents.y2)
+    {
+	region->extents.x1 = 0;
+	region->extents.x2 = 0;
+	region->extents.y1 = 0;
+	region->extents.y2 = 0;
+	return FALSE;
+    }
+
+    if (dest_image->common.have_clip_region) /* destination clip, applied with no offset */
+    {
+	if (!clip_general_image (region, &dest_image->common.clip_region, 0, 0))
+	    return FALSE;
+    }
+
+    if (dest_image->common.alpha_map) /* restrict to the alpha map's rectangle placed at alpha_origin */
+    {
+	if (!pixman_region32_intersect_rect (region, region,
+					     dest_image->common.alpha_origin_x,
+					     dest_image->common.alpha_origin_y,
+					     dest_image->common.alpha_map->width,
+					     dest_image->common.alpha_map->height))
+	{
+	    return FALSE;
+	}
+	if (!pixman_region32_not_empty (region))
+	    return FALSE;
+	if (dest_image->common.alpha_map->common.have_clip_region)
+	{
+	    if (!clip_general_image (region, &dest_image->common.alpha_map->common.clip_region,
+				     -dest_image->common.alpha_origin_x,
+				     -dest_image->common.alpha_origin_y))
+	    {
+		return FALSE;
+	    }
+	}
+    }
+
+    /* clip against src */
+    if (src_image->common.have_clip_region)
+    {
+	if (!clip_source_image (region, src_image, dest_x - src_x, dest_y - src_y))
+	    return FALSE;
+    }
+    if (src_image->common.alpha_map && src_image->common.alpha_map->common.have_clip_region)
+    {
+	if (!clip_source_image (region, (pixman_image_t *)src_image->common.alpha_map,
+	                        dest_x - (src_x - src_image->common.alpha_origin_x),
+	                        dest_y - (src_y - src_image->common.alpha_origin_y)))
+	{
+	    return FALSE;
+	}
+    }
+    /* clip against mask */
+    if (mask_image && mask_image->common.have_clip_region)
+    {
+	if (!clip_source_image (region, mask_image, dest_x - mask_x, dest_y - mask_y))
+	    return FALSE;
+	/* NOTE(review): unlike the src case, the mask alpha-map clip is only consulted when the mask itself has a clip. */
+	if (mask_image->common.alpha_map && mask_image->common.alpha_map->common.have_clip_region)
+	{
+	    if (!clip_source_image (region, (pixman_image_t *)mask_image->common.alpha_map,
+	                            dest_x - (mask_x - mask_image->common.alpha_origin_x),
+	                            dest_y - (mask_y - mask_image->common.alpha_origin_y)))
+	    {
+		return FALSE;
+	    }
+	}
+    }
+
+    return TRUE;
+}
+
+typedef struct
+{ /* box whose edges are pixman_fixed_48_16_t values */
+    pixman_fixed_48_16_t	x1;
+    pixman_fixed_48_16_t	y1;
+    pixman_fixed_48_16_t	x2;
+    pixman_fixed_48_16_t	y2;
+} box_48_16_t; /* filled in by compute_transformed_extents() */
+
+static pixman_bool_t
+compute_transformed_extents (pixman_transform_t *transform,
+			     const pixman_box32_t *extents,
+			     box_48_16_t *transformed)
+{
+    /* Pull each edge of the integer box in by half of pixman_fixed_1 before
+     * mapping it; with no transform these values are the result as they are.
+     */
+    const pixman_fixed_t half = pixman_fixed_1 / 2;
+    const pixman_fixed_t left = pixman_int_to_fixed (extents->x1) + half;
+    const pixman_fixed_t top = pixman_int_to_fixed (extents->y1) + half;
+    const pixman_fixed_t right = pixman_int_to_fixed (extents->x2) - half;
+    const pixman_fixed_t bottom = pixman_int_to_fixed (extents->y2) - half;
+    box_48_16_t bounds;
+    int corner;
+
+    if (transform)
+    {
+	/* Start from an inverted box so the first corner sets every edge. */
+	bounds.x1 = bounds.y1 = INT64_MAX;
+	bounds.x2 = bounds.y2 = INT64_MIN;
+
+	for (corner = 0; corner < 4; corner++)
+	{
+	    pixman_fixed_48_16_t px, py;
+	    pixman_vector_t point;
+
+	    /* Bit 0 of the corner index selects the x edge, bit 1 the y edge. */
+	    point.vector[0] = (corner & 0x01) ? left : right;
+	    point.vector[1] = (corner & 0x02) ? top : bottom;
+	    point.vector[2] = pixman_fixed_1;
+
+	    /* Give up, leaving *transformed unwritten, if a corner cannot be mapped. */
+	    if (!pixman_transform_point (transform, &point))
+		return FALSE;
+
+	    px = (pixman_fixed_48_16_t)point.vector[0];
+	    py = (pixman_fixed_48_16_t)point.vector[1];
+
+	    bounds.x1 = (px < bounds.x1) ? px : bounds.x1;
+	    bounds.y1 = (py < bounds.y1) ? py : bounds.y1;
+	    bounds.x2 = (px > bounds.x2) ? px : bounds.x2;
+	    bounds.y2 = (py > bounds.y2) ? py : bounds.y2;
+	}
+    }
+    else
+    {
+	/* No transform: the adjusted box is returned unchanged. */
+	bounds.x1 = left;
+	bounds.y1 = top;
+	bounds.x2 = right;
+	bounds.y2 = bottom;
+    }
+
+    /* Publish the result only once every corner has been handled. */
+    *transformed = bounds;
+
+    return TRUE;
+}
+
+#define IS_16BIT(x) (((x) >= INT16_MIN) && ((x) <= INT16_MAX)) /* x lies within [INT16_MIN, INT16_MAX] */
+#define ABS(f)      (((f) < 0)?  (-(f)) : (f)) /* evaluates f more than once */
+#define IS_16_16(f) (((f) >= pixman_min_fixed_48_16 && ((f) <= pixman_max_fixed_48_16))) /* f lies within the pixman_{min,max}_fixed_48_16 bounds */
+
+static pixman_bool_t
+analyze_extent (pixman_image_t       *image,
+		const pixman_box32_t *extents,
+		uint32_t             *flags)
+{
+    pixman_transform_t *transform;
+    pixman_fixed_t x_off, y_off;
+    pixman_fixed_t width, height;
+    pixman_fixed_t *params;
+    box_48_16_t transformed;
+    pixman_box32_t exp_extents;
+    /* Returns FALSE when the extents cannot be handled safely; may OR SAMPLES_COVER_CLIP bits into *flags. */
+    if (!image)
+	return TRUE;
+
+    /* Some compositing functions walk one step
+     * outside the destination rectangle, so we
+     * check here that the expanded-by-one source
+     * extents in destination space fit in 16 bits
+     */
+    if (!IS_16BIT (extents->x1 - 1)		||
+	!IS_16BIT (extents->y1 - 1)		||
+	!IS_16BIT (extents->x2 + 1)		||
+	!IS_16BIT (extents->y2 + 1))
+    {
+	return FALSE;
+    }
+
+    transform = image->common.transform;
+    if (image->common.type == BITS)
+    {
+	/* During repeat mode calculations we might convert the
+	 * width/height of an image to fixed 16.16, so we need
+	 * them to be smaller than 16 bits.
+	 */
+	if (image->bits.width >= 0x7fff	|| image->bits.height >= 0x7fff)
+	    return FALSE;
+
+	if ((image->common.flags & FAST_PATH_ID_TRANSFORM) == FAST_PATH_ID_TRANSFORM &&
+	    extents->x1 >= 0 &&
+	    extents->y1 >= 0 &&
+	    extents->x2 <= image->bits.width &&
+	    extents->y2 <= image->bits.height)
+	{
+	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST; /* shortcut: extents already lie inside the image */
+	    return TRUE;
+	}
+
+	switch (image->common.filter) /* per-filter offset and size, used by the final range check */
+	{
+	case PIXMAN_FILTER_CONVOLUTION:
+	    params = image->common.filter_params;
+	    x_off = - pixman_fixed_e - ((params[0] - pixman_fixed_1) >> 1);
+	    y_off = - pixman_fixed_e - ((params[1] - pixman_fixed_1) >> 1);
+	    width = params[0];
+	    height = params[1];
+	    break;
+
+	case PIXMAN_FILTER_GOOD:
+	case PIXMAN_FILTER_BEST:
+	case PIXMAN_FILTER_BILINEAR:
+	    x_off = - pixman_fixed_1 / 2;
+	    y_off = - pixman_fixed_1 / 2;
+	    width = pixman_fixed_1;
+	    height = pixman_fixed_1;
+	    break;
+
+	case PIXMAN_FILTER_FAST:
+	case PIXMAN_FILTER_NEAREST:
+	    x_off = - pixman_fixed_e;
+	    y_off = - pixman_fixed_e;
+	    width = 0;
+	    height = 0;
+	    break;
+
+	default:
+	    return FALSE;
+	}
+    }
+    else
+    {
+	x_off = 0;
+	y_off = 0;
+	width = 0;
+	height = 0;
+    }
+
+    if (!compute_transformed_extents (transform, extents, &transformed))
+	return FALSE;
+
+    /* Expand the source area by a tiny bit to account for different rounding that
+     * may happen during sampling. Note that (8 * pixman_fixed_e) is very far from
+     * 0.5 so this won't cause the area computed to be overly pessimistic.
+     */
+    transformed.x1 -= 8 * pixman_fixed_e;
+    transformed.y1 -= 8 * pixman_fixed_e;
+    transformed.x2 += 8 * pixman_fixed_e;
+    transformed.y2 += 8 * pixman_fixed_e;
+
+    if (image->common.type == BITS)
+    {
+	if (pixman_fixed_to_int (transformed.x1) >= 0			&&
+	    pixman_fixed_to_int (transformed.y1) >= 0			&&
+	    pixman_fixed_to_int (transformed.x2) < image->bits.width	&&
+	    pixman_fixed_to_int (transformed.y2) < image->bits.height)
+	{
+	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+	}
+
+	if (pixman_fixed_to_int (transformed.x1 - pixman_fixed_1 / 2) >= 0		  &&
+	    pixman_fixed_to_int (transformed.y1 - pixman_fixed_1 / 2) >= 0		  &&
+	    pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2) < image->bits.width &&
+	    pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2) < image->bits.height)
+	{
+	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR; /* same test widened by half a unit on every side */
+	}
+    }
+
+    /* Check we don't overflow when the destination extents are expanded by one.
+     * This ensures that compositing functions can simply walk the source space
+     * using 16.16 variables without worrying about overflow.
+     */
+    exp_extents = *extents;
+    exp_extents.x1 -= 1;
+    exp_extents.y1 -= 1;
+    exp_extents.x2 += 1;
+    exp_extents.y2 += 1;
+
+    if (!compute_transformed_extents (transform, &exp_extents, &transformed)) /* overwrites the earlier result */
+	return FALSE;
+    
+    if (!IS_16_16 (transformed.x1 + x_off - 8 * pixman_fixed_e)	||
+	!IS_16_16 (transformed.y1 + y_off - 8 * pixman_fixed_e)	||
+	!IS_16_16 (transformed.x2 + x_off + 8 * pixman_fixed_e + width)	||
+	!IS_16_16 (transformed.y2 + y_off + 8 * pixman_fixed_e + height))
+    {
+	return FALSE;
+    }
+
+    return TRUE;
+}
+
+/*
+ * Work around GCC bug causing crashes in Mozilla with SSE2
+ *
+ * When using -msse, gcc generates movdqa instructions assuming that
+ * the stack is 16 byte aligned. Unfortunately some applications, such
+ * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
+ * causes the movdqa instructions to fail.
+ *
+ * The __force_align_arg_pointer__ makes gcc generate a prologue that
+ * realigns the stack pointer to 16 bytes.
+ *
+ * On x86-64 this is not necessary because the standard ABI already
+ * calls for a 16 byte aligned stack.
+ *
+ * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
+ */
+#if defined (USE_SSE2) && defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+PIXMAN_EXPORT void
+pixman_image_composite32 (pixman_op_t      op,
+                          pixman_image_t * src,
+                          pixman_image_t * mask,
+                          pixman_image_t * dest,
+                          int32_t          src_x,
+                          int32_t          src_y,
+                          int32_t          mask_x,
+                          int32_t          mask_y,
+                          int32_t          dest_x,
+                          int32_t          dest_y,
+                          int32_t          width,
+                          int32_t          height)
+{
+    pixman_format_code_t src_format, mask_format, dest_format;
+    uint32_t src_flags, mask_flags, dest_flags;
+    pixman_region32_t region;
+    pixman_box32_t extents;
+    pixman_implementation_t *imp;
+    pixman_composite_func_t func;
+    /* Composite src, optionally through mask, onto dest, one clip box at a time. */
+    _pixman_image_validate (src);
+    if (mask)
+	_pixman_image_validate (mask);
+    _pixman_image_validate (dest);
+
+    src_format = src->common.extended_format_code;
+    src_flags = src->common.flags;
+
+    if (mask)
+    {
+	mask_format = mask->common.extended_format_code;
+	mask_flags = mask->common.flags;
+    }
+    else
+    {
+	mask_format = PIXMAN_null;
+	mask_flags = FAST_PATH_IS_OPAQUE;    /* a missing mask is flagged as opaque */
+    }
+
+    dest_format = dest->common.extended_format_code;
+    dest_flags = dest->common.flags;
+
+    /* Check for pixbufs: src and mask share one 32-bit buffer, repeat mode and offset */
+    if ((mask_format == PIXMAN_a8r8g8b8 || mask_format == PIXMAN_a8b8g8r8) &&
+	(src->type == BITS && src->bits.bits == mask->bits.bits)	   &&
+	(src->common.repeat == mask->common.repeat)			   &&
+	(src_x == mask_x && src_y == mask_y))
+    {
+	if (src_format == PIXMAN_x8b8g8r8)
+	    src_format = mask_format = PIXMAN_pixbuf;
+	else if (src_format == PIXMAN_x8r8g8b8)
+	    src_format = mask_format = PIXMAN_rpixbuf;
+    }
+
+    pixman_region32_init (&region);
+
+    if (!pixman_compute_composite_region32 (
+	    &region, src, mask, dest,
+	    src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height))
+    {
+	goto out;
+    }
+
+    extents = *pixman_region32_extents (&region);    /* in dest coordinates */
+
+    extents.x1 -= dest_x - src_x;    /* dest -> src coordinates */
+    extents.y1 -= dest_y - src_y;
+    extents.x2 -= dest_x - src_x;
+    extents.y2 -= dest_y - src_y;
+
+    if (!analyze_extent (src, &extents, &src_flags))
+	goto out;
+
+    extents.x1 -= src_x - mask_x;    /* src -> mask coordinates */
+    extents.y1 -= src_y - mask_y;
+    extents.x2 -= src_x - mask_x;
+    extents.y2 -= src_y - mask_y;
+
+    if (!analyze_extent (mask, &extents, &mask_flags))
+	goto out;
+
+    /* If the clip is within the source samples, and the samples are
+     * opaque, then the source is effectively opaque.
+     */
+#define NEAREST_OPAQUE	(FAST_PATH_SAMPLES_OPAQUE |			\
+			 FAST_PATH_NEAREST_FILTER |			\
+			 FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+#define BILINEAR_OPAQUE	(FAST_PATH_SAMPLES_OPAQUE |			\
+			 FAST_PATH_BILINEAR_FILTER |			\
+			 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR)
+
+    if ((src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+	(src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    {
+	src_flags |= FAST_PATH_IS_OPAQUE;
+    }
+
+    if ((mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+	(mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    {
+	mask_flags |= FAST_PATH_IS_OPAQUE;
+    }
+
+    /*
+     * Check if we can replace our operator by a simpler one
+     * if the src or dest are opaque. The output operator should be
+     * mathematically equivalent to the one passed in.
+     */
+    op = optimize_operator (op, src_flags, mask_flags, dest_flags);
+
+    if (_pixman_lookup_composite_function (
+	    get_implementation (), op,
+	    src_format, src_flags, mask_format, mask_flags, dest_format, dest_flags,
+	    &imp, &func))
+    {
+	pixman_composite_info_t info;
+	const pixman_box32_t *pbox;
+	int n;
+
+	info.op = op;
+	info.src_image = src;
+	info.mask_image = mask;
+	info.dest_image = dest;
+	info.src_flags = src_flags;
+	info.mask_flags = mask_flags;
+	info.dest_flags = dest_flags;
+
+	pbox = pixman_region32_rectangles (&region, &n);
+
+	while (n--)    /* one call of func per rectangle of the region */
+	{
+	    info.src_x = pbox->x1 + src_x - dest_x;    /* boxes are in dest space */
+	    info.src_y = pbox->y1 + src_y - dest_y;
+	    info.mask_x = pbox->x1 + mask_x - dest_x;
+	    info.mask_y = pbox->y1 + mask_y - dest_y;
+	    info.dest_x = pbox->x1;
+	    info.dest_y = pbox->y1;
+	    info.width = pbox->x2 - pbox->x1;
+	    info.height = pbox->y2 - pbox->y1;
+
+	    func (imp, &info);
+
+	    pbox++;
+	}
+    }
+
+out:
+    pixman_region32_fini (&region);
+}
+
+PIXMAN_EXPORT void
+pixman_image_composite (pixman_op_t      op,
+                        pixman_image_t * src,
+                        pixman_image_t * mask,
+                        pixman_image_t * dest,
+                        int16_t          src_x,
+                        int16_t          src_y,
+                        int16_t          mask_x,
+                        int16_t          mask_y,
+                        int16_t          dest_x,
+                        int16_t          dest_y,
+                        uint16_t         width,
+                        uint16_t         height)
+{   /* 16-bit front end: every argument widens losslessly to int32_t. */
+    pixman_image_composite32 (op, src, mask, dest, src_x, src_y, mask_x, mask_y,
+                              dest_x, dest_y, width, height);
+}
+
+/* Hand a blit request (raw source/destination buffers with their strides
+ * and bpp, plus the rectangle) to the current implementation and return
+ * its result unchanged. */
+PIXMAN_EXPORT pixman_bool_t
+pixman_blt (uint32_t *src_bits,
+            uint32_t *dst_bits,
+            int       src_stride,
+            int       dst_stride,
+            int       src_bpp,
+            int       dst_bpp,
+            int       src_x,
+            int       src_y,
+            int       dest_x,
+            int       dest_y,
+            int       width,
+            int       height)
+{
+    return _pixman_implementation_blt (
+	get_implementation(), src_bits, dst_bits, src_stride, dst_stride,
+	src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y, width, height);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_fill (uint32_t *bits,
+             int       stride,
+             int       bpp,
+             int       x,
+             int       y,
+             int       width,
+             int       height,
+             uint32_t  xor)
+{   /* Forward the fill to the current implementation and relay its result. */
+    return _pixman_implementation_fill (get_implementation(), bits, stride, bpp,
+                                        x, y, width, height, xor);
+}
+
+/* Pack the high byte of each 16-bit channel into one a8r8g8b8 word. */
+static uint32_t
+color_to_uint32 (const pixman_color_t *color)
+{
+    /* The uint32_t casts matter: a uint16_t promotes to int, and shifting
+     * an alpha byte >= 0x80 left by 24 would overflow that int (UB). */
+    return ((uint32_t) color->alpha >> 8 << 24) | ((uint32_t) color->red >> 8 << 16) |
+           ((uint32_t) color->green & 0xff00)   | ((uint32_t) color->blue >> 8);
+}
+
+/* Turn @color into the raw pixel value for @format and store it in *@pixel.
+ * Returns FALSE, without touching *@pixel, for any format outside the
+ * 8888 / 565 / a8 / a1 set accepted by the first switch.
+ */
+static pixman_bool_t
+color_to_pixel (pixman_color_t *     color,
+                uint32_t *           pixel,
+                pixman_format_code_t format)
+{
+    uint32_t px = color_to_uint32 (color);	/* a8r8g8b8 to begin with */
+
+    switch (format)
+    {
+    case PIXMAN_a8r8g8b8:
+    case PIXMAN_x8r8g8b8:
+    case PIXMAN_a8b8g8r8:
+    case PIXMAN_x8b8g8r8:
+    case PIXMAN_b8g8r8a8:
+    case PIXMAN_b8g8r8x8:
+    case PIXMAN_r8g8b8a8:
+    case PIXMAN_r8g8b8x8:
+    case PIXMAN_r5g6b5:
+    case PIXMAN_b5g6r5:
+    case PIXMAN_a8:
+    case PIXMAN_a1:
+	break;
+    default:
+	return FALSE;
+    }
+
+    /* Reorder the bytes to match the channel order of the format. */
+    switch (PIXMAN_FORMAT_TYPE (format))
+    {
+    case PIXMAN_TYPE_ABGR:	/* swap the red and blue bytes */
+	px = (px & 0xff00ff00) | ((px & 0x00ff0000) >> 16) | ((px & 0x000000ff) << 16);
+	break;
+    case PIXMAN_TYPE_BGRA:	/* reverse all four bytes */
+	px = (px >> 24) | ((px & 0x00ff0000) >> 8) |
+	     ((px & 0x0000ff00) << 8) | (px << 24);
+	break;
+    case PIXMAN_TYPE_RGBA:	/* rotate alpha down to the low byte */
+	px = (px >> 24) | (px << 8);
+	break;
+    }
+
+    /* Formats narrower than 32 bits keep only part of the word. */
+    if (format == PIXMAN_a1)
+	px >>= 31;
+    else if (format == PIXMAN_a8)
+	px >>= 24;
+    else if (format == PIXMAN_r5g6b5 || format == PIXMAN_b5g6r5)
+	px = CONVERT_8888_TO_0565 (px);
+
+    *pixel = px;
+    return TRUE;
+}
+
+/* Convert each x/y/width/height rectangle to an x1/y1/x2/y2 box and hand
+ * the whole batch to pixman_image_fill_boxes(), returning its result. */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_fill_rectangles (pixman_op_t                 op,
+                              pixman_image_t *            dest,
+                              pixman_color_t *            color,
+                              int                         n_rects,
+                              const pixman_rectangle16_t *rects)
+{
+    pixman_box32_t local[6];
+    pixman_box32_t *boxes = local;
+    pixman_bool_t ok;
+    int i;
+
+    /* Up to six rectangles fit on the stack; larger batches go to the heap. */
+    if (n_rects > 6)
+    {
+        boxes = pixman_malloc_ab (sizeof (pixman_box32_t), n_rects);
+        if (!boxes)
+            return FALSE;
+    }
+
+    for (i = 0; i < n_rects; i++)
+    {
+        const pixman_rectangle16_t *r = &rects[i];
+        boxes[i].x1 = r->x;
+        boxes[i].y1 = r->y;
+        boxes[i].x2 = r->x + r->width;
+        boxes[i].y2 = r->y + r->height;
+    }
+
+    ok = pixman_image_fill_boxes (op, dest, color, n_rects, boxes);
+
+    if (boxes != local)
+        free (boxes);
+
+    return ok;
+}
+
+/* Fill @n_boxes boxes of @dest with @color under @op.  Fills that reduce to
+ * SRC go through pixman_fill() when color_to_pixel() handles the format;
+ * anything else is composited from a temporary solid image.  Returns FALSE
+ * when a region or the solid image cannot be set up.
+ */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_fill_boxes (pixman_op_t           op,
+                         pixman_image_t *      dest,
+                         pixman_color_t *      color,
+                         int                   n_boxes,
+                         const pixman_box32_t *boxes)
+{
+    pixman_image_t *solid;
+    pixman_color_t c;
+    int i;
+
+    _pixman_image_validate (dest);
+
+    /* OVER with a fully opaque color is handled as SRC. */
+    if (color->alpha == 0xffff && op == PIXMAN_OP_OVER)
+        op = PIXMAN_OP_SRC;
+
+    /* CLEAR is handled as SRC with an all-zero color. */
+    if (op == PIXMAN_OP_CLEAR)
+    {
+        c.red = c.green = c.blue = c.alpha = 0;
+        color = &c;
+        op = PIXMAN_OP_SRC;
+    }
+
+    if (op == PIXMAN_OP_SRC)
+    {
+        uint32_t pixel;
+
+        if (color_to_pixel (color, &pixel, dest->bits.format))
+        {
+            pixman_region32_t fill_region;
+            int n_rects, j;
+            pixman_box32_t *rects;
+
+            if (!pixman_region32_init_rects (&fill_region, boxes, n_boxes))
+                return FALSE;
+
+            if (dest->common.have_clip_region &&
+                !pixman_region32_intersect (&fill_region, &fill_region,
+                                            &dest->common.clip_region))
+            {
+                /* Do not leave the initialized region behind on failure. */
+                pixman_region32_fini (&fill_region);
+                return FALSE;
+            }
+
+            rects = pixman_region32_rectangles (&fill_region, &n_rects);
+            for (j = 0; j < n_rects; ++j)
+            {
+                const pixman_box32_t *rect = &(rects[j]);
+                pixman_fill (dest->bits.bits, dest->bits.rowstride,
+                             PIXMAN_FORMAT_BPP (dest->bits.format),
+                             rect->x1, rect->y1, rect->x2 - rect->x1, rect->y2 - rect->y1,
+                             pixel);
+            }
+
+            pixman_region32_fini (&fill_region);
+            return TRUE;
+        }
+    }
+
+    /* General path: composite a solid-color image onto each box. */
+    solid = pixman_image_create_solid_fill (color);
+    if (!solid)
+        return FALSE;
+
+    for (i = 0; i < n_boxes; ++i)
+    {
+        const pixman_box32_t *box = &(boxes[i]);
+
+        pixman_image_composite32 (op, solid, NULL, dest,
+                                  0, 0, 0, 0, box->x1, box->y1,
+                                  box->x2 - box->x1, box->y2 - box->y1);
+    }
+
+    pixman_image_unref (solid);
+    return TRUE;
+}
+
+/**
+ * pixman_version:
+ *
+ * Returns the version of the pixman library encoded in a single
+ * integer as per %PIXMAN_VERSION_ENCODE. The encoding ensures that
+ * later versions compare greater than earlier versions.
+ *
+ * A run-time comparison to check that pixman's version is greater than
+ * or equal to version X.Y.Z could be performed as follows:
+ *
+ * <informalexample><programlisting>
+ * if (pixman_version() >= PIXMAN_VERSION_ENCODE(X,Y,Z)) {...}
+ * </programlisting></informalexample>
+ *
+ * See also pixman_version_string() as well as the compile-time
+ * equivalents %PIXMAN_VERSION and %PIXMAN_VERSION_STRING.
+ *
+ * Return value: the encoded version this library was compiled as.
+ **/
+PIXMAN_EXPORT int
+pixman_version (void)
+{
+    return PIXMAN_VERSION;
+}
+
+/**
+ * pixman_version_string:
+ *
+ * Returns the version of the pixman library as a human-readable string
+ * of the form "X.Y.Z".
+ *
+ * See also pixman_version() as well as the compile-time equivalents
+ * %PIXMAN_VERSION_STRING and %PIXMAN_VERSION.
+ *
+ * Return value: a constant string with the version this library was compiled as.
+ **/
+PIXMAN_EXPORT const char*
+pixman_version_string (void)
+{
+    return PIXMAN_VERSION_STRING;
+}
+
+/**
+ * pixman_format_supported_source:
+ * @format: A pixman_format_code_t format
+ *
+ * Return value: whether @format is a supported format for a pixman
+ * surface used as a source in rendering, i.e. whether it is one of
+ * the format codes listed in the switch below.
+ *
+ * Currently, all pixman_format_code_t values are supported.
+ **/
+PIXMAN_EXPORT pixman_bool_t
+pixman_format_supported_source (pixman_format_code_t format)
+{
+    switch (format)
+    {
+    /* 32 bpp formats */
+    case PIXMAN_a2b10g10r10:
+    case PIXMAN_x2b10g10r10:
+    case PIXMAN_a2r10g10b10:
+    case PIXMAN_x2r10g10b10:
+    case PIXMAN_a8r8g8b8:
+    case PIXMAN_x8r8g8b8:
+    case PIXMAN_a8b8g8r8:
+    case PIXMAN_x8b8g8r8:
+    case PIXMAN_b8g8r8a8:
+    case PIXMAN_b8g8r8x8:
+    case PIXMAN_r8g8b8a8:
+    case PIXMAN_r8g8b8x8:
+    case PIXMAN_x14r6g6b6:
+    /* 24 bpp formats */
+    case PIXMAN_r8g8b8:
+    case PIXMAN_b8g8r8:
+    /* 16 bpp formats */
+    case PIXMAN_r5g6b5:
+    case PIXMAN_b5g6r5:
+    case PIXMAN_a1r5g5b5:
+    case PIXMAN_x1r5g5b5:
+    case PIXMAN_a1b5g5r5:
+    case PIXMAN_x1b5g5r5:
+    case PIXMAN_a4r4g4b4:
+    case PIXMAN_x4r4g4b4:
+    case PIXMAN_a4b4g4r4:
+    case PIXMAN_x4b4g4r4:
+    /* 8bpp formats */
+    case PIXMAN_a8:
+    case PIXMAN_r3g3b2:
+    case PIXMAN_b2g3r3:
+    case PIXMAN_a2r2g2b2:
+    case PIXMAN_a2b2g2r2:
+    case PIXMAN_c8:
+    case PIXMAN_g8:
+    case PIXMAN_x4a4:
+    /* PIXMAN_x4c4 collides with PIXMAN_c8 and PIXMAN_x4g4 collides with
+     * PIXMAN_g8, so neither can have a case label of its own:
+       case PIXMAN_x4c4:
+       case PIXMAN_x4g4:
+     */
+    /* 4bpp formats */
+    case PIXMAN_a4:
+    case PIXMAN_r1g2b1:
+    case PIXMAN_b1g2r1:
+    case PIXMAN_a1r1g1b1:
+    case PIXMAN_a1b1g1r1:
+    case PIXMAN_c4:
+    case PIXMAN_g4:
+    /* 1bpp formats */
+    case PIXMAN_a1:
+    case PIXMAN_g1:
+    /* YUV formats */
+    case PIXMAN_yuy2:
+    case PIXMAN_yv12:
+	return TRUE;
+
+    default:
+	return FALSE;
+    }
+}
+
+/**
+ * pixman_format_supported_destination:
+ * @format: A pixman_format_code_t format
+ *
+ * Return value: whether @format can be used for a pixman surface that is
+ * rendered to.  This is every supported source format except the YUV
+ * ones, which cannot be written to at the moment.
+ **/
+PIXMAN_EXPORT pixman_bool_t
+pixman_format_supported_destination (pixman_format_code_t format)
+{
+    switch (format)
+    {
+    case PIXMAN_yuy2:
+    case PIXMAN_yv12:
+	return FALSE;
+    default:
+	return pixman_format_supported_source (format);
+    }
+}
+
+/* 16-bit region variant of pixman_compute_composite_region32(): same
+ * computation, with the result copied into a pixman_region16_t. */
+PIXMAN_EXPORT pixman_bool_t
+pixman_compute_composite_region (pixman_region16_t * region,
+                                 pixman_image_t *    src_image,
+                                 pixman_image_t *    mask_image,
+                                 pixman_image_t *    dest_image,
+                                 int16_t             src_x,
+                                 int16_t             src_y,
+                                 int16_t             mask_x,
+                                 int16_t             mask_y,
+                                 int16_t             dest_x,
+                                 int16_t             dest_y,
+                                 uint16_t            width,
+                                 uint16_t            height)
+{
+    pixman_region32_t wide;
+    pixman_bool_t ok;
+
+    /* Do the work in 32 bits, then narrow the result into @region. */
+    pixman_region32_init (&wide);
+
+    ok = pixman_compute_composite_region32 (
+	&wide, src_image, mask_image, dest_image,
+	src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height);
+
+    /* The narrowing copy can fail as well. */
+    if (ok && !pixman_region16_copy_from_region32 (region, &wide))
+	ok = FALSE;
+
+    pixman_region32_fini (&wide);
+    return ok;
+}
diff --git a/pixman/pixman.h b/pixman/pixman.h
new file mode 100644
index 0000000..c57092a
--- /dev/null
+++ b/pixman/pixman.h
@@ -0,0 +1,990 @@
+/***********************************************************
+
+Copyright 1987, 1998  The Open Group
+
+Permission to use, copy, modify, distribute, and sell this software and its
+documentation for any purpose is hereby granted without fee, provided that
+the above copyright notice appear in all copies and that both that
+copyright notice and this permission notice appear in supporting
+documentation.
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of The Open Group shall not be
+used in advertising or otherwise to promote the sale, use or other dealings
+in this Software without prior written authorization from The Open Group.
+
+Copyright 1987 by Digital Equipment Corporation, Maynard, Massachusetts.
+
+                        All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of Digital not be
+used in advertising or publicity pertaining to distribution of the
+software without specific, written prior permission.
+
+DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
+DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
+ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+SOFTWARE.
+
+******************************************************************/
+/*
+ * Copyright © 1998, 2004 Keith Packard
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef PIXMAN_H__
+#define PIXMAN_H__
+
+#include <pixman-version.h>
+
+#ifdef  __cplusplus
+#define PIXMAN_BEGIN_DECLS extern "C" {
+#define PIXMAN_END_DECLS }
+#else
+#define PIXMAN_BEGIN_DECLS
+#define PIXMAN_END_DECLS
+#endif
+
+PIXMAN_BEGIN_DECLS
+
+/*
+ * Standard integers
+ */
+
+#if !defined (PIXMAN_DONT_DEFINE_STDINT)
+
+#if defined (_SVR4) || defined (SVR4) || defined (__OpenBSD__) || defined (_sgi) || defined (__sun) || defined (sun) || defined (__digital__) || defined (__HP_cc)
+#  include <inttypes.h>
+/* VS 2010 (_MSC_VER 1600) has stdint.h */
+#elif defined (_MSC_VER) && _MSC_VER < 1600
+typedef __int8 int8_t;
+typedef unsigned __int8 uint8_t;
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#elif defined (_AIX)
+#  include <sys/inttypes.h>
+#else
+#  include <stdint.h>
+#endif
+
+#endif
+
+/*
+ * Boolean
+ */
+typedef int pixman_bool_t;
+
+/* Fixpoint numbers.  pixman_fixed_t is 16.16, so pixman_fixed_1 is 1 << 16;
+ * pixman_int_to_fixed() shifts as uint32_t to keep negative inputs defined.
+ */
+typedef int64_t			pixman_fixed_32_32_t;
+typedef pixman_fixed_32_32_t	pixman_fixed_48_16_t;
+typedef uint32_t		pixman_fixed_1_31_t;
+typedef uint32_t		pixman_fixed_1_16_t;
+typedef int32_t			pixman_fixed_16_16_t;
+typedef pixman_fixed_16_16_t	pixman_fixed_t;
+
+#define pixman_fixed_e			((pixman_fixed_t) 1)
+#define pixman_fixed_1			(pixman_int_to_fixed(1))
+#define pixman_fixed_1_minus_e		(pixman_fixed_1 - pixman_fixed_e)
+#define pixman_fixed_minus_1		(pixman_int_to_fixed(-1))
+#define pixman_fixed_to_int(f)		((int) ((f) >> 16))
+#define pixman_int_to_fixed(i)		((pixman_fixed_t) ((uint32_t) (i) << 16))
+#define pixman_fixed_to_double(f)	((double) ((f) / (double) pixman_fixed_1))
+#define pixman_double_to_fixed(d)	((pixman_fixed_t) ((d) * 65536.0))
+#define pixman_fixed_frac(f)		((f) & pixman_fixed_1_minus_e)
+#define pixman_fixed_floor(f)		((f) & ~pixman_fixed_1_minus_e)
+#define pixman_fixed_ceil(f)		pixman_fixed_floor ((f) + pixman_fixed_1_minus_e)
+#define pixman_fixed_fraction(f)	((f) & pixman_fixed_1_minus_e)
+#define pixman_fixed_mod_2(f)		((f) & (pixman_fixed_1 | pixman_fixed_1_minus_e))
+#define pixman_max_fixed_48_16		((pixman_fixed_48_16_t) 0x7fffffff)
+#define pixman_min_fixed_48_16		(-((pixman_fixed_48_16_t) 1 << 31))
+
+/*
+ * Misc structs
+ */
+typedef struct pixman_color pixman_color_t;
+typedef struct pixman_point_fixed pixman_point_fixed_t;
+typedef struct pixman_line_fixed pixman_line_fixed_t;
+typedef struct pixman_vector pixman_vector_t;
+typedef struct pixman_transform pixman_transform_t;
+
+struct pixman_color
+{
+    uint16_t	red;	/* every channel is 16 bits wide */
+    uint16_t    green;
+    uint16_t    blue;
+    uint16_t    alpha;	/* pixman_image_fill_boxes() treats 0xffff as opaque */
+};
+
+struct pixman_point_fixed
+{
+    pixman_fixed_t	x;
+    pixman_fixed_t	y;
+};
+
+struct pixman_line_fixed
+{
+    pixman_point_fixed_t	p1, p2;
+};
+
+/*
+ * Fixed point matrices
+ */
+
+struct pixman_vector
+{
+    pixman_fixed_t	vector[3];
+};
+
+struct pixman_transform
+{
+    pixman_fixed_t	matrix[3][3];
+};
+
+/* forward declaration (sorry) */
+struct pixman_box16;
+typedef  union pixman_image		pixman_image_t;
+
+void          pixman_transform_init_identity    (struct pixman_transform       *matrix);
+pixman_bool_t pixman_transform_point_3d         (const struct pixman_transform *transform,
+						 struct pixman_vector          *vector);
+pixman_bool_t pixman_transform_point            (const struct pixman_transform *transform,
+						 struct pixman_vector          *vector);
+pixman_bool_t pixman_transform_multiply         (struct pixman_transform       *dst,
+						 const struct pixman_transform *l,
+						 const struct pixman_transform *r);
+void          pixman_transform_init_scale       (struct pixman_transform       *t,
+						 pixman_fixed_t                 sx,
+						 pixman_fixed_t                 sy);
+pixman_bool_t pixman_transform_scale            (struct pixman_transform       *forward,
+						 struct pixman_transform       *reverse,
+						 pixman_fixed_t                 sx,
+						 pixman_fixed_t                 sy);
+void          pixman_transform_init_rotate      (struct pixman_transform       *t,
+						 pixman_fixed_t                 cos,
+						 pixman_fixed_t                 sin);
+pixman_bool_t pixman_transform_rotate           (struct pixman_transform       *forward,
+						 struct pixman_transform       *reverse,
+						 pixman_fixed_t                 c,
+						 pixman_fixed_t                 s);
+void          pixman_transform_init_translate   (struct pixman_transform       *t,
+						 pixman_fixed_t                 tx,
+						 pixman_fixed_t                 ty);
+pixman_bool_t pixman_transform_translate        (struct pixman_transform       *forward,
+						 struct pixman_transform       *reverse,
+						 pixman_fixed_t                 tx,
+						 pixman_fixed_t                 ty);
+pixman_bool_t pixman_transform_bounds           (const struct pixman_transform *matrix,
+						 struct pixman_box16           *b);
+pixman_bool_t pixman_transform_invert           (struct pixman_transform       *dst,
+						 const struct pixman_transform *src);
+pixman_bool_t pixman_transform_is_identity      (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_scale         (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_int_translate (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_inverse       (const struct pixman_transform *a,
+						 const struct pixman_transform *b);
+
+/*
+ * Floating point matrices
+ */
+struct pixman_f_vector
+{
+    double  v[3];
+};
+
+struct pixman_f_transform
+{
+    double  m[3][3];
+};
+
+pixman_bool_t pixman_transform_from_pixman_f_transform (struct pixman_transform         *t,
+							const struct pixman_f_transform *ft);
+void          pixman_f_transform_from_pixman_transform (struct pixman_f_transform       *ft,
+							const struct pixman_transform   *t);
+pixman_bool_t pixman_f_transform_invert                (struct pixman_f_transform       *dst,
+							const struct pixman_f_transform *src);
+pixman_bool_t pixman_f_transform_point                 (const struct pixman_f_transform *t,
+							struct pixman_f_vector          *v);
+void          pixman_f_transform_point_3d              (const struct pixman_f_transform *t,
+							struct pixman_f_vector          *v);
+void          pixman_f_transform_multiply              (struct pixman_f_transform       *dst,
+							const struct pixman_f_transform *l,
+							const struct pixman_f_transform *r);
+void          pixman_f_transform_init_scale            (struct pixman_f_transform       *t,
+							double                           sx,
+							double                           sy);
+pixman_bool_t pixman_f_transform_scale                 (struct pixman_f_transform       *forward,
+							struct pixman_f_transform       *reverse,
+							double                           sx,
+							double                           sy);
+void          pixman_f_transform_init_rotate           (struct pixman_f_transform       *t,
+							double                           cos,
+							double                           sin);
+pixman_bool_t pixman_f_transform_rotate                (struct pixman_f_transform       *forward,
+							struct pixman_f_transform       *reverse,
+							double                           c,
+							double                           s);
+void          pixman_f_transform_init_translate        (struct pixman_f_transform       *t,
+							double                           tx,
+							double                           ty);
+pixman_bool_t pixman_f_transform_translate             (struct pixman_f_transform       *forward,
+							struct pixman_f_transform       *reverse,
+							double                           tx,
+							double                           ty);
+pixman_bool_t pixman_f_transform_bounds                (const struct pixman_f_transform *t,
+							struct pixman_box16             *b);
+void          pixman_f_transform_init_identity         (struct pixman_f_transform       *t);
+
+typedef enum
+{
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_PAD,
+    PIXMAN_REPEAT_REFLECT
+} pixman_repeat_t;
+
+typedef enum
+{
+    PIXMAN_FILTER_FAST,
+    PIXMAN_FILTER_GOOD,
+    PIXMAN_FILTER_BEST,
+    PIXMAN_FILTER_NEAREST,
+    PIXMAN_FILTER_BILINEAR,
+    PIXMAN_FILTER_CONVOLUTION
+} pixman_filter_t;
+
+typedef enum
+{
+    PIXMAN_OP_CLEAR			= 0x00,
+    PIXMAN_OP_SRC			= 0x01,
+    PIXMAN_OP_DST			= 0x02,
+    PIXMAN_OP_OVER			= 0x03,
+    PIXMAN_OP_OVER_REVERSE		= 0x04,
+    PIXMAN_OP_IN			= 0x05,
+    PIXMAN_OP_IN_REVERSE		= 0x06,
+    PIXMAN_OP_OUT			= 0x07,
+    PIXMAN_OP_OUT_REVERSE		= 0x08,
+    PIXMAN_OP_ATOP			= 0x09,
+    PIXMAN_OP_ATOP_REVERSE		= 0x0a,
+    PIXMAN_OP_XOR			= 0x0b,
+    PIXMAN_OP_ADD			= 0x0c,
+    PIXMAN_OP_SATURATE			= 0x0d,
+
+    PIXMAN_OP_DISJOINT_CLEAR		= 0x10,
+    PIXMAN_OP_DISJOINT_SRC		= 0x11,
+    PIXMAN_OP_DISJOINT_DST		= 0x12,
+    PIXMAN_OP_DISJOINT_OVER		= 0x13,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE	= 0x14,
+    PIXMAN_OP_DISJOINT_IN		= 0x15,
+    PIXMAN_OP_DISJOINT_IN_REVERSE	= 0x16,
+    PIXMAN_OP_DISJOINT_OUT		= 0x17,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE	= 0x18,
+    PIXMAN_OP_DISJOINT_ATOP		= 0x19,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE	= 0x1a,
+    PIXMAN_OP_DISJOINT_XOR		= 0x1b,
+
+    PIXMAN_OP_CONJOINT_CLEAR		= 0x20,
+    PIXMAN_OP_CONJOINT_SRC		= 0x21,
+    PIXMAN_OP_CONJOINT_DST		= 0x22,
+    PIXMAN_OP_CONJOINT_OVER		= 0x23,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE	= 0x24,
+    PIXMAN_OP_CONJOINT_IN		= 0x25,
+    PIXMAN_OP_CONJOINT_IN_REVERSE	= 0x26,
+    PIXMAN_OP_CONJOINT_OUT		= 0x27,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE	= 0x28,
+    PIXMAN_OP_CONJOINT_ATOP		= 0x29,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE	= 0x2a,
+    PIXMAN_OP_CONJOINT_XOR		= 0x2b,
+
+    PIXMAN_OP_MULTIPLY                  = 0x30,
+    PIXMAN_OP_SCREEN                    = 0x31,
+    PIXMAN_OP_OVERLAY                   = 0x32,
+    PIXMAN_OP_DARKEN                    = 0x33,
+    PIXMAN_OP_LIGHTEN                   = 0x34,
+    PIXMAN_OP_COLOR_DODGE               = 0x35,
+    PIXMAN_OP_COLOR_BURN                = 0x36,
+    PIXMAN_OP_HARD_LIGHT                = 0x37,
+    PIXMAN_OP_SOFT_LIGHT                = 0x38,
+    PIXMAN_OP_DIFFERENCE                = 0x39,
+    PIXMAN_OP_EXCLUSION                 = 0x3a,
+    PIXMAN_OP_HSL_HUE			= 0x3b,
+    PIXMAN_OP_HSL_SATURATION		= 0x3c,
+    PIXMAN_OP_HSL_COLOR			= 0x3d,
+    PIXMAN_OP_HSL_LUMINOSITY		= 0x3e
+
+#ifdef PIXMAN_USE_INTERNAL_API
+    ,
+    PIXMAN_N_OPERATORS,
+    PIXMAN_OP_NONE = PIXMAN_N_OPERATORS
+#endif
+} pixman_op_t;
+
+/*
+ * Regions
+ */
+typedef struct pixman_region16_data	pixman_region16_data_t;
+typedef struct pixman_box16		pixman_box16_t;
+typedef struct pixman_rectangle16	pixman_rectangle16_t;
+typedef struct pixman_region16		pixman_region16_t;
+
+struct pixman_region16_data {
+    long		size;
+    long		numRects;
+/*  pixman_box16_t	rects[size];   in memory but not explicitly declared */
+};
+
+struct pixman_rectangle16
+{
+    int16_t	x, y;
+    uint16_t	width, height;
+};
+
+struct pixman_box16
+{
+    int16_t x1, y1, x2, y2;
+};
+
+struct pixman_region16
+{
+    pixman_box16_t          extents;
+    pixman_region16_data_t *data;
+};
+
+typedef enum
+{
+    PIXMAN_REGION_OUT,
+    PIXMAN_REGION_IN,
+    PIXMAN_REGION_PART
+} pixman_region_overlap_t;
+
+/* This function exists only to make it possible to preserve
+ * the X ABI - it should go away at first opportunity.
+ */
+void pixman_region_set_static_pointers (pixman_box16_t         *empty_box,
+					pixman_region16_data_t *empty_data,
+					pixman_region16_data_t *broken_data);
+
+/* creation/destruction */
+void                    pixman_region_init               (pixman_region16_t *region);
+void                    pixman_region_init_rect          (pixman_region16_t *region,
+							  int                x,
+							  int                y,
+							  unsigned int       width,
+							  unsigned int       height);
+pixman_bool_t           pixman_region_init_rects         (pixman_region16_t *region,
+							  const pixman_box16_t *boxes,
+							  int                count);
+void                    pixman_region_init_with_extents  (pixman_region16_t *region,
+							  pixman_box16_t    *extents);
+void                    pixman_region_init_from_image    (pixman_region16_t *region,
+							  pixman_image_t    *image);
+void                    pixman_region_fini               (pixman_region16_t *region);
+
+
+/* manipulation */
+void                    pixman_region_translate          (pixman_region16_t *region,
+							  int                x,
+							  int                y);
+pixman_bool_t           pixman_region_copy               (pixman_region16_t *dest,
+							  pixman_region16_t *source);
+pixman_bool_t           pixman_region_intersect          (pixman_region16_t *new_reg,
+							  pixman_region16_t *reg1,
+							  pixman_region16_t *reg2);
+pixman_bool_t           pixman_region_union              (pixman_region16_t *new_reg,
+							  pixman_region16_t *reg1,
+							  pixman_region16_t *reg2);
+pixman_bool_t           pixman_region_union_rect         (pixman_region16_t *dest,
+							  pixman_region16_t *source,
+							  int                x,
+							  int                y,
+							  unsigned int       width,
+							  unsigned int       height);
+pixman_bool_t		pixman_region_intersect_rect     (pixman_region16_t *dest,
+							  pixman_region16_t *source,
+							  int                x,
+							  int                y,
+							  unsigned int       width,
+							  unsigned int       height);
+pixman_bool_t           pixman_region_subtract           (pixman_region16_t *reg_d,
+							  pixman_region16_t *reg_m,
+							  pixman_region16_t *reg_s);
+pixman_bool_t           pixman_region_inverse            (pixman_region16_t *new_reg,
+							  pixman_region16_t *reg1,
+							  pixman_box16_t    *inv_rect);
+pixman_bool_t           pixman_region_contains_point     (pixman_region16_t *region,
+							  int                x,
+							  int                y,
+							  pixman_box16_t    *box);
+pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *region,
+							  pixman_box16_t    *prect);
+pixman_bool_t           pixman_region_not_empty          (pixman_region16_t *region);
+pixman_box16_t *        pixman_region_extents            (pixman_region16_t *region);
+int                     pixman_region_n_rects            (pixman_region16_t *region);
+pixman_box16_t *        pixman_region_rectangles         (pixman_region16_t *region,
+							  int               *n_rects);
+pixman_bool_t           pixman_region_equal              (pixman_region16_t *region1,
+							  pixman_region16_t *region2);
+pixman_bool_t           pixman_region_selfcheck          (pixman_region16_t *region);
+void                    pixman_region_reset              (pixman_region16_t *region,
+							  pixman_box16_t    *box);
+/*
+ * 32 bit regions
+ */
+typedef struct pixman_region32_data	pixman_region32_data_t;
+typedef struct pixman_box32		pixman_box32_t;
+typedef struct pixman_rectangle32	pixman_rectangle32_t;
+typedef struct pixman_region32		pixman_region32_t;
+
+struct pixman_region32_data {
+    long		size;
+    long		numRects;
+/*  pixman_box32_t	rects[size];   in memory but not explicitly declared */
+};
+
+struct pixman_rectangle32
+{
+    int32_t x, y;
+    uint32_t width, height;
+};
+
+struct pixman_box32
+{
+    int32_t x1, y1, x2, y2;
+};
+
+struct pixman_region32
+{
+    pixman_box32_t          extents;
+    pixman_region32_data_t  *data;
+};
+
+/* creation/destruction */
+void                    pixman_region32_init               (pixman_region32_t *region);
+void                    pixman_region32_init_rect          (pixman_region32_t *region,
+							    int                x,
+							    int                y,
+							    unsigned int       width,
+							    unsigned int       height);
+pixman_bool_t           pixman_region32_init_rects         (pixman_region32_t *region,
+							    const pixman_box32_t *boxes,
+							    int                count);
+void                    pixman_region32_init_with_extents  (pixman_region32_t *region,
+							    pixman_box32_t    *extents);
+void                    pixman_region32_init_from_image    (pixman_region32_t *region,
+							    pixman_image_t    *image);
+void                    pixman_region32_fini               (pixman_region32_t *region);
+
+
+/* manipulation */
+void                    pixman_region32_translate          (pixman_region32_t *region,
+							    int                x,
+							    int                y);
+pixman_bool_t           pixman_region32_copy               (pixman_region32_t *dest,
+							    pixman_region32_t *source);
+pixman_bool_t           pixman_region32_intersect          (pixman_region32_t *new_reg,
+							    pixman_region32_t *reg1,
+							    pixman_region32_t *reg2);
+pixman_bool_t           pixman_region32_union              (pixman_region32_t *new_reg,
+							    pixman_region32_t *reg1,
+							    pixman_region32_t *reg2);
+pixman_bool_t		pixman_region32_intersect_rect     (pixman_region32_t *dest,
+							    pixman_region32_t *source,
+							    int                x,
+							    int                y,
+							    unsigned int       width,
+							    unsigned int       height);
+pixman_bool_t           pixman_region32_union_rect         (pixman_region32_t *dest,
+							    pixman_region32_t *source,
+							    int                x,
+							    int                y,
+							    unsigned int       width,
+							    unsigned int       height);
+pixman_bool_t           pixman_region32_subtract           (pixman_region32_t *reg_d,
+							    pixman_region32_t *reg_m,
+							    pixman_region32_t *reg_s);
+pixman_bool_t           pixman_region32_inverse            (pixman_region32_t *new_reg,
+							    pixman_region32_t *reg1,
+							    pixman_box32_t    *inv_rect);
+pixman_bool_t           pixman_region32_contains_point     (pixman_region32_t *region,
+							    int                x,
+							    int                y,
+							    pixman_box32_t    *box);
+pixman_region_overlap_t pixman_region32_contains_rectangle (pixman_region32_t *region,
+							    pixman_box32_t    *prect);
+pixman_bool_t           pixman_region32_not_empty          (pixman_region32_t *region);
+pixman_box32_t *        pixman_region32_extents            (pixman_region32_t *region);
+int                     pixman_region32_n_rects            (pixman_region32_t *region);
+pixman_box32_t *        pixman_region32_rectangles         (pixman_region32_t *region,
+							    int               *n_rects);
+pixman_bool_t           pixman_region32_equal              (pixman_region32_t *region1,
+							    pixman_region32_t *region2);
+pixman_bool_t           pixman_region32_selfcheck          (pixman_region32_t *region);
+void                    pixman_region32_reset              (pixman_region32_t *region,
+							    pixman_box32_t    *box);
+
+
+/* Copy / Fill / Misc */
+pixman_bool_t pixman_blt                (uint32_t           *src_bits,
+					 uint32_t           *dst_bits,
+					 int                 src_stride,
+					 int                 dst_stride,
+					 int                 src_bpp,
+					 int                 dst_bpp,
+					 int                 src_x,
+					 int                 src_y,
+					 int                 dest_x,
+					 int                 dest_y,
+					 int                 width,
+					 int                 height);
+pixman_bool_t pixman_fill               (uint32_t           *bits,
+					 int                 stride,
+					 int                 bpp,
+					 int                 x,
+					 int                 y,
+					 int                 width,
+					 int                 height,
+					 uint32_t            _xor);
+
+int           pixman_version            (void);
+const char*   pixman_version_string     (void);
+
+/*
+ * Images
+ */
+typedef struct pixman_indexed		pixman_indexed_t;
+typedef struct pixman_gradient_stop	pixman_gradient_stop_t;
+
+typedef uint32_t (* pixman_read_memory_func_t) (const void *src, int size);
+typedef void     (* pixman_write_memory_func_t) (void *dst, uint32_t value, int size);
+
+typedef void     (* pixman_image_destroy_func_t) (pixman_image_t *image, void *data);
+
+struct pixman_gradient_stop {
+    pixman_fixed_t x;
+    pixman_color_t color;
+};
+
+#define PIXMAN_MAX_INDEXED  256 /* XXX depth must be <= 8 */
+
+#if PIXMAN_MAX_INDEXED <= 256
+typedef uint8_t pixman_index_type;
+#endif
+
+struct pixman_indexed
+{
+    pixman_bool_t       color;
+    uint32_t		rgba[PIXMAN_MAX_INDEXED];
+    pixman_index_type	ent[32768];
+};
+
+/*
+ * While the protocol is generous in format support, the sample implementation
+ * allows only packed RGB and BGR representations, to simplify software rendering.
+ * Code layout: bpp << 24 | type << 16 | a << 12 | r << 8 | g << 4 | b.
+ */
+#define PIXMAN_FORMAT(bpp,type,a,r,g,b)	(((bpp) << 24) |  \
+					 ((type) << 16) | \
+					 ((a) << 12) |	  \
+					 ((r) << 8) |	  \
+					 ((g) << 4) |	  \
+					 ((b)))
+
+#define PIXMAN_FORMAT_BPP(f)	(((f) >> 24)       )
+#define PIXMAN_FORMAT_TYPE(f)	(((f) >> 16) & 0xff)
+#define PIXMAN_FORMAT_A(f)	(((f) >> 12) & 0x0f)
+#define PIXMAN_FORMAT_R(f)	(((f) >>  8) & 0x0f)
+#define PIXMAN_FORMAT_G(f)	(((f) >>  4) & 0x0f)
+#define PIXMAN_FORMAT_B(f)	(((f)      ) & 0x0f)
+#define PIXMAN_FORMAT_RGB(f)	(((f)      ) & 0xfff)
+#define PIXMAN_FORMAT_VIS(f)	(((f)      ) & 0xffff)
+#define PIXMAN_FORMAT_DEPTH(f)	(PIXMAN_FORMAT_A(f) +	\
+				 PIXMAN_FORMAT_R(f) +	\
+				 PIXMAN_FORMAT_G(f) +	\
+				 PIXMAN_FORMAT_B(f))
+
+#define PIXMAN_TYPE_OTHER	0
+#define PIXMAN_TYPE_A		1
+#define PIXMAN_TYPE_ARGB	2
+#define PIXMAN_TYPE_ABGR	3
+#define PIXMAN_TYPE_COLOR	4
+#define PIXMAN_TYPE_GRAY	5
+#define PIXMAN_TYPE_YUY2	6
+#define PIXMAN_TYPE_YV12	7
+#define PIXMAN_TYPE_BGRA	8
+#define PIXMAN_TYPE_RGBA	9
+
+#define PIXMAN_FORMAT_COLOR(f)				\
+	(PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ARGB ||	\
+	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ABGR ||	\
+	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA ||	\
+	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA)
+
+/* 32bpp formats */
+typedef enum {
+    PIXMAN_a8r8g8b8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8),
+    PIXMAN_x8r8g8b8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8),
+    PIXMAN_a8b8g8r8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8),
+    PIXMAN_x8b8g8r8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8),
+    PIXMAN_b8g8r8a8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,8,8,8,8),
+    PIXMAN_b8g8r8x8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,0,8,8,8),
+    PIXMAN_r8g8b8a8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,8,8,8,8),
+    PIXMAN_r8g8b8x8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,0,8,8,8),
+    PIXMAN_x14r6g6b6 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,6,6,6),
+    PIXMAN_x2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,10,10,10),
+    PIXMAN_a2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,2,10,10,10),
+    PIXMAN_x2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,10,10,10),
+    PIXMAN_a2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,2,10,10,10),
+
+/* 24bpp formats */
+    PIXMAN_r8g8b8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
+    PIXMAN_b8g8r8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8),
+
+/* 16bpp formats */
+    PIXMAN_r5g6b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5),
+    PIXMAN_b5g6r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5),
+
+    PIXMAN_a1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5),
+    PIXMAN_x1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5),
+    PIXMAN_a1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5),
+    PIXMAN_x1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,5,5),
+    PIXMAN_a4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,4,4,4,4),
+    PIXMAN_x4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4),
+    PIXMAN_a4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4),
+    PIXMAN_x4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4),
+
+/* 8bpp formats */
+    PIXMAN_a8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0),
+    PIXMAN_r3g3b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2),
+    PIXMAN_b2g3r3 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2),
+    PIXMAN_a2r2g2b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2),
+    PIXMAN_a2b2g2r2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2),
+
+    PIXMAN_c8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_g8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+    PIXMAN_x4a4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0),
+
+    PIXMAN_x4c4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_x4g4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* 4bpp formats */
+    PIXMAN_a4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0),
+    PIXMAN_r1g2b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1),
+    PIXMAN_b1g2r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1),
+    PIXMAN_a1r1g1b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1),
+    PIXMAN_a1b1g1r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1),
+
+    PIXMAN_c4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_g4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* 1bpp formats */
+    PIXMAN_a1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0),
+
+    PIXMAN_g1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* YUV formats */
+    PIXMAN_yuy2 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_YUY2,0,0,0,0),
+    PIXMAN_yv12 =	 PIXMAN_FORMAT(12,PIXMAN_TYPE_YV12,0,0,0,0)
+} pixman_format_code_t;
+
+/* Querying supported format values. */
+pixman_bool_t pixman_format_supported_destination (pixman_format_code_t format);
+pixman_bool_t pixman_format_supported_source      (pixman_format_code_t format);
+
+/* Constructors */
+pixman_image_t *pixman_image_create_solid_fill       (pixman_color_t               *color);
+pixman_image_t *pixman_image_create_linear_gradient  (pixman_point_fixed_t         *p1,
+						      pixman_point_fixed_t         *p2,
+						      const pixman_gradient_stop_t *stops,
+						      int                           n_stops);
+pixman_image_t *pixman_image_create_radial_gradient  (pixman_point_fixed_t         *inner,
+						      pixman_point_fixed_t         *outer,
+						      pixman_fixed_t                inner_radius,
+						      pixman_fixed_t                outer_radius,
+						      const pixman_gradient_stop_t *stops,
+						      int                           n_stops);
+pixman_image_t *pixman_image_create_conical_gradient (pixman_point_fixed_t         *center,
+						      pixman_fixed_t                angle,
+						      const pixman_gradient_stop_t *stops,
+						      int                           n_stops);
+pixman_image_t *pixman_image_create_bits             (pixman_format_code_t          format,
+						      int                           width,
+						      int                           height,
+						      uint32_t                     *bits,
+						      int                           rowstride_bytes);
+
+/* Destructor */
+pixman_image_t *pixman_image_ref                     (pixman_image_t               *image);
+pixman_bool_t   pixman_image_unref                   (pixman_image_t               *image);
+
+void		pixman_image_set_destroy_function    (pixman_image_t		   *image,
+						      pixman_image_destroy_func_t   function,
+						      void			   *data);
+void *		pixman_image_get_destroy_data        (pixman_image_t		   *image);
+
+/* Set properties */
+pixman_bool_t   pixman_image_set_clip_region         (pixman_image_t               *image,
+						      pixman_region16_t            *region);
+pixman_bool_t   pixman_image_set_clip_region32       (pixman_image_t               *image,
+						      pixman_region32_t            *region);
+void		pixman_image_set_has_client_clip     (pixman_image_t               *image,
+						      pixman_bool_t		    client_clip);
+pixman_bool_t   pixman_image_set_transform           (pixman_image_t               *image,
+						      const pixman_transform_t     *transform);
+void            pixman_image_set_repeat              (pixman_image_t               *image,
+						      pixman_repeat_t               repeat);
+pixman_bool_t   pixman_image_set_filter              (pixman_image_t               *image,
+						      pixman_filter_t               filter,
+						      const pixman_fixed_t         *filter_params,
+						      int                           n_filter_params);
+void		pixman_image_set_source_clipping     (pixman_image_t		   *image,
+						      pixman_bool_t                 source_clipping);
+void            pixman_image_set_alpha_map           (pixman_image_t               *image,
+						      pixman_image_t               *alpha_map,
+						      int16_t                       x,
+						      int16_t                       y);
+void            pixman_image_set_component_alpha     (pixman_image_t               *image,
+						      pixman_bool_t                 component_alpha);
+pixman_bool_t   pixman_image_get_component_alpha     (pixman_image_t               *image);
+void		pixman_image_set_accessors	     (pixman_image_t		   *image,
+						      pixman_read_memory_func_t	    read_func,
+						      pixman_write_memory_func_t    write_func);
+void		pixman_image_set_indexed	     (pixman_image_t		   *image,
+						      const pixman_indexed_t	   *indexed);
+uint32_t       *pixman_image_get_data                (pixman_image_t               *image);
+int		pixman_image_get_width               (pixman_image_t               *image);
+int             pixman_image_get_height              (pixman_image_t               *image);
+int		pixman_image_get_stride              (pixman_image_t               *image); /* in bytes */
+int		pixman_image_get_depth               (pixman_image_t		   *image);
+pixman_format_code_t pixman_image_get_format	     (pixman_image_t		   *image);
+pixman_bool_t	pixman_image_fill_rectangles	     (pixman_op_t		    op,
+						      pixman_image_t		   *image,
+						      pixman_color_t		   *color,
+						      int			    n_rects,
+						      const pixman_rectangle16_t   *rects);
+pixman_bool_t   pixman_image_fill_boxes              (pixman_op_t                   op,
+                                                      pixman_image_t               *dest,
+                                                      pixman_color_t               *color,
+                                                      int                           n_boxes,
+                                                      const pixman_box32_t         *boxes);
+
+/* Composite */
+pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
+					       pixman_image_t    *src_image,
+					       pixman_image_t    *mask_image,
+					       pixman_image_t    *dest_image,
+					       int16_t            src_x,
+					       int16_t            src_y,
+					       int16_t            mask_x,
+					       int16_t            mask_y,
+					       int16_t            dest_x,
+					       int16_t            dest_y,
+					       uint16_t           width,
+					       uint16_t           height);
+void          pixman_image_composite          (pixman_op_t        op,
+					       pixman_image_t    *src,
+					       pixman_image_t    *mask,
+					       pixman_image_t    *dest,
+					       int16_t            src_x,
+					       int16_t            src_y,
+					       int16_t            mask_x,
+					       int16_t            mask_y,
+					       int16_t            dest_x,
+					       int16_t            dest_y,
+					       uint16_t           width,
+					       uint16_t           height);
+void          pixman_image_composite32        (pixman_op_t        op,
+					       pixman_image_t    *src,
+					       pixman_image_t    *mask,
+					       pixman_image_t    *dest,
+					       int32_t            src_x,
+					       int32_t            src_y,
+					       int32_t            mask_x,
+					       int32_t            mask_y,
+					       int32_t            dest_x,
+					       int32_t            dest_y,
+					       int32_t            width,
+					       int32_t            height);
+
+/* Executive Summary: This function is a no-op that only exists
+ * for historical reasons.
+ *
+ * There used to be a bug in the X server where it would rely on
+ * out-of-bounds accesses when it was asked to composite with a
+ * window as the source. It would create a pixman image pointing
+ * to some bogus position in memory, but then set a clip region
+ * to the position where the actual bits were.
+ *
+ * Due to a bug in old versions of pixman, where it would not clip
+ * against the image bounds when a clip region was set, this would
+ * actually work. So when the pixman bug was fixed, a workaround was
+ * added to allow certain out-of-bound accesses. This function disabled
+ * those workarounds.
+ *
+ * Since 0.21.2, pixman doesn't do these workarounds anymore, so now this
+ * function is a no-op.
+ */
+void pixman_disable_out_of_bounds_workaround (void);
+
+/*
+ * Trapezoids
+ */
+typedef struct pixman_edge pixman_edge_t;
+typedef struct pixman_trapezoid pixman_trapezoid_t;
+typedef struct pixman_trap pixman_trap_t;
+typedef struct pixman_span_fix pixman_span_fix_t;
+typedef struct pixman_triangle pixman_triangle_t;
+
+/*
+ * An edge structure.  This represents a single polygon edge
+ * and can be quickly stepped across small or large gaps in the
+ * sample grid
+ */
+struct pixman_edge
+{
+    pixman_fixed_t	x;
+    pixman_fixed_t	e;
+    pixman_fixed_t	stepx;
+    pixman_fixed_t	signdx;
+    pixman_fixed_t	dy;
+    pixman_fixed_t	dx;
+
+    pixman_fixed_t	stepx_small;
+    pixman_fixed_t	stepx_big;
+    pixman_fixed_t	dx_small;
+    pixman_fixed_t	dx_big;
+};
+
+struct pixman_trapezoid
+{
+    pixman_fixed_t	top, bottom;
+    pixman_line_fixed_t	left, right;
+};
+
+struct pixman_triangle
+{
+    pixman_point_fixed_t p1, p2, p3;
+};
+
+/* nonzero iff 't' is well defined and not obviously empty: each side edge has p1.y != p2.y and (int) (bottom - top) > 0; 't' is evaluated several times */
+#define pixman_trapezoid_valid(t)				   \
+    ((t)->left.p1.y != (t)->left.p2.y &&			   \
+     (t)->right.p1.y != (t)->right.p2.y &&			   \
+     (int) ((t)->bottom - (t)->top) > 0)
+
+struct pixman_span_fix
+{
+    pixman_fixed_t	l, r, y;
+};
+
+struct pixman_trap
+{
+    pixman_span_fix_t	top, bot;
+};
+
+pixman_fixed_t pixman_sample_ceil_y        (pixman_fixed_t             y,
+					    int                        bpp);
+pixman_fixed_t pixman_sample_floor_y       (pixman_fixed_t             y,
+					    int                        bpp);
+void           pixman_edge_step            (pixman_edge_t             *e,
+					    int                        n);
+void           pixman_edge_init            (pixman_edge_t             *e,
+					    int                        bpp,
+					    pixman_fixed_t             y_start,
+					    pixman_fixed_t             x_top,
+					    pixman_fixed_t             y_top,
+					    pixman_fixed_t             x_bot,
+					    pixman_fixed_t             y_bot);
+void           pixman_line_fixed_edge_init (pixman_edge_t             *e,
+					    int                        bpp,
+					    pixman_fixed_t             y,
+					    const pixman_line_fixed_t *line,
+					    int                        x_off,
+					    int                        y_off);
+void           pixman_rasterize_edges      (pixman_image_t            *image,
+					    pixman_edge_t             *l,
+					    pixman_edge_t             *r,
+					    pixman_fixed_t             t,
+					    pixman_fixed_t             b);
+void           pixman_add_traps            (pixman_image_t            *image,
+					    int16_t                    x_off,
+					    int16_t                    y_off,
+					    int                        ntrap,
+					    pixman_trap_t             *traps);
+void           pixman_add_trapezoids       (pixman_image_t            *image,
+					    int16_t                    x_off, /* NOTE(review): int16_t, but y_off below is int -- confirm intended */
+					    int                        y_off,
+					    int                        ntraps,
+					    const pixman_trapezoid_t  *traps);
+void           pixman_rasterize_trapezoid  (pixman_image_t            *image,
+					    const pixman_trapezoid_t  *trap,
+					    int                        x_off,
+					    int                        y_off);
+void          pixman_composite_trapezoids (pixman_op_t		       op,
+					   pixman_image_t *	       src,
+					   pixman_image_t *	       dst,
+					   pixman_format_code_t	       mask_format,
+					   int			       x_src,
+					   int			       y_src,
+					   int			       x_dst,
+					   int			       y_dst,
+					   int			       n_traps,
+					   const pixman_trapezoid_t *  traps);
+void          pixman_composite_triangles (pixman_op_t		       op,
+					  pixman_image_t *	       src,
+					  pixman_image_t *	       dst,
+					  pixman_format_code_t	       mask_format,
+					  int			       x_src,
+					  int			       y_src,
+					  int			       x_dst,
+					  int			       y_dst,
+					  int			       n_tris,
+					  const pixman_triangle_t *    tris);
+void	      pixman_add_triangles       (pixman_image_t              *image,
+					  int32_t	               x_off,
+					  int32_t	               y_off,
+					  int	                       n_tris,
+					  const pixman_triangle_t     *tris);
+
+PIXMAN_END_DECLS
+
+#endif /* PIXMAN_H__ */
diff --git a/pixman/refactor b/pixman/refactor
new file mode 100644
index 0000000..52fceab
--- /dev/null
+++ b/pixman/refactor
@@ -0,0 +1,478 @@
+Roadmap
+
+- Move all the fetchers etc. into pixman-image to make pixman-compose.c
+  less intimidating.
+
+  DONE
+
+- Make combiners for unified alpha take a mask argument. That way
+  we won't need two separate paths for unified vs component in the
+  general compositing code.
+
+  DONE, except that the Altivec code needs to be updated. Luca is
+  looking into that.
+
+- Delete separate 'unified alpha' path
+ 
+  DONE
+
+- Split images into their own files
+
+  DONE
+
+- Split the gradient walker code out into its own file
+
+  DONE
+
+- Add scanline getters per image
+
+  DONE
+
+- Generic 64 bit fetcher 
+
+  DONE
+
+- Split fast path tables into their respective architecture dependent
+  files.
+
+See "Render Algorithm" below for rationale
+
+Images will eventually have these virtual functions:
+
+       get_scanline()
+       get_scanline_wide()
+       get_pixel()
+       get_pixel_wide()
+       get_untransformed_pixel()
+       get_untransformed_pixel_wide()
+       get_unfiltered_pixel()
+       get_unfiltered_pixel_wide()
+
+       store_scanline()
+       store_scanline_wide()
+
+1.
+
+Initially we will just have get_scanline() and get_scanline_wide();
+these will be based on the ones in pixman-compose. Hopefully this will
+reduce the complexity in pixman_composite_rect_general().
+
+Note that there are access considerations - the compose function is
+being compiled twice.
+
+
+2.
+
+Split image types into their own source files. Export noop virtual
+reinit() call.  Call this whenever a property of the image changes.
+
+
+3. 
+
+Split the get_scanline() call into smaller functions that are
+initialized by the reinit() call.
+
+The Render Algorithm:
+	(first repeat, then filter, then transform, then clip)
+
+Starting from a destination pixel (x, y), do
+
+	1 x = x - xDst + xSrc
+	  y = y - yDst + ySrc
+
+	2 reject pixel that is outside the clip
+
+	This treats clipping as something that happens after
+	transformation, which I think is correct for client clips. For
+	hierarchy clips it is wrong, but who really cares? Without
+	GraphicsExposes hierarchy clips are basically irrelevant. Yes,
+	you could imagine cases where the pixels of a subwindow of a
+	redirected, transformed window should be treated as
+	transparent. I don't really care
+
+	Basically, I think the render spec should say that pixels that
+	are unavailable due to the hierarchy have undefined content,
+	and that GraphicsExposes are not generated. Ie., basically
+	that using non-redirected windows as sources is fail. This is
+	at least consistent with the current implementation and we can
+	update the spec later if someone makes it work.
+
+	The implication for render is that it should stop passing the
+	hierarchy clip to pixman. In pixman, if a source image has a
+	clip it should be used in computing the composite region and
+	nowhere else, regardless of what "has_client_clip" says. The
+	default should be for there to not be any clip.
+
+	I would really like to get rid of the client clip as well for
+	source images, but unfortunately there is at least one
+	application in the wild that uses them.
+
+	3 Transform pixel: (x, y) = T(x, y)
+
+	4 Call p = GetUntransformedPixel (x, y)
+
+	5 If the image has an alpha map, then
+
+		Call GetUntransformedPixel (x, y) on the alpha map
+		
+		add resulting alpha channel to p
+
+	   return p
+
+	Where GetUntransformedPixel is:
+
+	6 switch (filter)
+	  {
+	  case NEAREST:
+		return GetUnfilteredPixel (x, y);
+		break;
+
+	  case BILINEAR:
+		return GetUnfilteredPixel (...) // 4 times 
+		break;
+
+	  case CONVOLUTION:
+		return GetUnfilteredPixel (...) // as many times as necessary.
+		break;
+	  }
+
+	Where GetUnfilteredPixel (x, y) is
+
+	7 switch (repeat)
+	   {
+	   case REPEAT_NORMAL:
+	   case REPEAT_PAD:
+	   case REPEAT_REFLECT:
+		// adjust x, y as appropriate
+		break;
+
+	   case REPEAT_NONE:
+	        if (x, y) is outside image bounds
+		     return 0;
+		break;
+	   }
+
+	   return GetRawPixel(x, y)
+
+	Where GetRawPixel (x, y) is
+
+	8 Compute the pixel in question, depending on image type.
+
+For gradients, repeat has a totally different meaning, so
+UnfilteredPixel() and RawPixel() must be the same function so that
+gradients can do their own repeat algorithm.
+
+So, the GetRawPixel
+
+	for bits must deal with repeats
+	for gradients must deal with repeats (differently)
+	for solids, should ignore repeats.
+
+	for polygons, when we add them, either ignore repeats or do
+	something similar to bits (in which case, we may want an extra
+	layer of indirection to modify the coordinates).
+
+It is then possible to build things like "get scanline" or "get tile" on
+top of this. In the simplest case, just repeatedly calling GetPixel()
+would work, but specialized get_scanline()s or get_tile()s could be
+plugged in for common cases. 
+
+By not plugging anything in for images with access functions, we only
+have to compile the pixel functions twice, not the scanline functions.
+
+And we can get rid of fetchers for the bizarre formats that no one
+uses. Such as b2g3r3 etc. r1g2b1? Seriously? It is also worth
+considering a generic format based pixel fetcher for these edge cases.
+
+Since the actual routines depend on the image attributes, the images
+must be notified when those change and update their function pointers
+appropriately. So there should probably be a virtual function called
+(* reinit) or something like that.
+
+There will also be wide fetchers for both pixels and lines. The line
+fetcher will just call the wide pixel fetcher. The wide pixel fetcher
+will just call expand, except for 10 bit formats.
+
+Rendering pipeline:
+
+Drawable:
+	0. if (picture has alpha map)
+		0.1. Position alpha map according to the alpha_x/alpha_y
+	        0.2. Where the two drawables intersect, replace the
+		     alpha channel of the source with the one from the
+		     alpha map. Replacement only takes place in the
+		     intersection of the two drawables' geometries.
+	1. Repeat the drawable according to the repeat attribute
+	2. Reconstruct a continuous image according to the filter
+	3. Transform according to the transform attribute
+	4. Position image such that src_x, src_y is over dst_x, dst_y
+	5. Sample once per destination pixel 
+	6. Clip. If a pixel is not within the source clip, then no
+	   compositing takes place at that pixel. (Ie., it's *not*
+	   treated as 0).
+
+	Sampling a drawable: 
+
+	- If the drawable does not have an alpha channel, the pixels in it
+	  are treated as opaque.
+
+	Note on reconstruction:
+
+	- The top left pixel has coordinates (0.5, 0.5) and pixels are
+	  spaced 1 apart.
+
+Gradient:
+	1. Unless gradient type is conical, repeat the underlying (0, 1)
+		gradient according to the repeat attribute
+	2. Integrate the gradient across the plane according to type.
+	3. Transform according to transform attribute
+	4. Position gradient 
+	5. Sample once per destination pixel.
+ 	6. Clip
+
+Solid Fill:
+	1. Repeat has no effect
+	2. Image is already continuous and defined for the entire plane
+	3. Transform has no effect
+	4. Positioning has no effect
+	5. Sample once per destination pixel.
+	6. Clip
+
+Polygon:
+	1. Repeat has no effect
+	2. Image is already continuous and defined on the whole plane
+	3. Transform according to transform attribute
+	4. Position image
+	5. Supersample 15x17 per destination pixel.
+	6. Clip
+
+Possibly interesting additions:
+	- More general transformations, such as warping, or general
+	  shading.
+
+	- Shader image where a function is called to generate the
+          pixel (ie., uploading assembly code).
+
+	- Resampling kernels
+
+	  In principle the polygon image uses a 15x17 box filter for
+	  resampling. If we allow general resampling filters, then we
+	  get all the various antialiasing types for free. 
+
+	  Bilinear downsampling looks terrible and could be much 
+	  improved by a resampling filter. NEAREST reconstruction
+	  combined with a box resampling filter is what GdkPixbuf
+	  does, I believe.
+
+	  Useful for high frequency gradients as well.
+
+	  (Note that the difference between a reconstruction and a
+	  resampling filter is mainly where in the pipeline they
+	  occur. High quality resampling should use a correctly
+	  oriented kernel so it should happen after transformation.
+
+	  An implementation can transform the resampling kernel and
+	  convolve it with the reconstruction if it so desires, but it
+	  will need to deal with the fact that the resampling kernel
+	  will not necessarily be pixel aligned.
+
+	  "Output kernels"
+
+	  One could imagine doing the resampling after compositing,
+	  ie., for each destination pixel sample each source image 16
+	  times, then composite those subpixels individually, then
+	  finally apply a kernel.
+
+	  However, this is effectively the same as full screen
+	  antialiasing, which is a simpler way to think about it. So
+	  resampling kernels may make sense for individual images, but
+	  not as a post-compositing step.
+	  
+	  Fullscreen AA is inefficient without chained compositing
+	  though. Consider an (image scaled up to oversample size IN
+	  some polygon) scaled down to screen size. With the current
+	  implementation, there will be a huge temporary. With chained
+	  compositing, the whole thing ends up being equivalent to the
+	  output kernel from above.
+
+	- Color space conversion
+
+	  The complete model here is that each surface has a color
+	  space associated with it and that the compositing operation
+	  also has one associated with it. Note also that gradients
+	  should have associated colorspaces.
+
+	- Dithering
+
+	  If people dither something that is already dithered, it will
+	  look terrible, but don't do that, then. (Dithering happens
+	  after resampling if at all - what is the relationship
+	  with color spaces? Presumably dithering should happen in linear
+	  intensity space).
+
+	- Floating point surfaces, 16, 32 and possibly 64 bit per
+	  channel.
+
+	Maybe crack:
+
+	- Glyph polygons
+
+	  If glyphs could be given as polygons, they could be
+	  positioned and rasterized more accurately. The glyph
+	  structure would need subpixel positioning though.
+
+	- Luminance vs. coverage for the alpha channel
+
+	  Whether the alpha channel should be interpreted as luminance
+          modulation or as coverage (intensity modulation). This is a
+          bit of a departure from the rendering model though. It could
+	  also be considered whether it should be possible to have 
+	  both channels in the same drawable.
+
+	- Alternative for component alpha
+
+	  - Set component-alpha on the output image.
+
+	    - This means each of the components are sampled
+	      independently and composited in the corresponding
+	      channel only.
+
+	  - Have 3 x oversampled mask
+
+	  - Scale it down by 3 horizontally, with [ 1/3, 1/3, 1/3 ]
+            resampling filter. 
+
+	    Is this equivalent to just using a component alpha mask?
+
+	Incompatible changes:
+
+	- Gradients could be specified with premultiplied colors. (You
+	  can use a mask to get things like gradients from solid red to
+	  transparent red.)
+
+Refactoring pixman
+
+The pixman code is not particularly nice to put it mildly. Among the
+issues are
+
+- inconsistent naming style (fb vs Fb, camelCase vs
+  underscore_naming). Sometimes there is even inconsistency *within*
+  one name.
+
+      fetchProc32 ACCESS(pixman_fetchProcForPicture32)
+
+  may be one of the ugliest names ever created.
+
+  coding style: 
+  	 use the one from cairo except that pixman uses this brace style:
+	 
+		while (blah)
+		{
+		}
+
+	Format do while like this:
+
+	       do 
+	       {
+
+	       } 
+	       while (...);
+
+- PIXMAN_COMPOSITE_RECT_GENERAL() is horribly complex
+
+- switch case logic in pixman-access.c
+
+  Instead it would be better to just store function pointers in the
+  image objects themselves,
+
+  	get_pixel()
+	get_scanline()
+
+- Much of the scanline fetching code is for formats that no one 
+  ever uses. a2r2g2b2 anyone?
+
+  It would probably be worthwhile having a generic fetcher for any
+  pixman format whatsoever.
+
+- Code related to particular image types should be split into individual
+  files.
+
+	pixman-bits-image.c
+	pixman-linear-gradient-image.c
+	pixman-radial-gradient-image.c
+	pixman-solid-image.c
+
+- Fast path code should be split into files based on architecture:
+
+       pixman-mmx-fastpath.c
+       pixman-sse2-fastpath.c
+       pixman-c-fastpath.c
+
+       etc.
+
+  Each of these files should then export a fastpath table, which would
+  be declared in pixman-private.h. This should allow us to get rid
+  of the pixman-mmx.h files.
+
+  The fast path table should describe each fast path. Ie there should
+  be bitfields indicating what things the fast path can handle, rather than
+  like now where it is only allowed to take one format per src/mask/dest. Ie., 
+
+  { 
+    FAST_a8r8g8b8 | FAST_x8r8g8b8,
+    FAST_null,
+    FAST_x8r8g8b8,
+    FAST_repeat_normal | FAST_repeat_none,
+    the_fast_path
+  }
+
+There should then be *one* file that implements pixman_image_composite(). 
+This should do this:
+
+     optimize_operator();
+
+     convert 1x1 repeat to solid (actually this should be done at
+     image creation time).
+     
+     is there a useful fastpath?
+
+There should be a file called pixman-cpu.c that contains all the
+architecture specific stuff to detect what CPU features we have.
+
+Issues that must be kept in mind:
+
+       - we need accessor code to be preserved
+
+       - maybe there should be a "store_scanline" too?
+
+         Is this sufficient?
+
+	 We should preserve the optimization where the
+	 compositing happens directly in the destination
+	 whenever possible.
+
+	- It should be possible to create GPU samplers from the
+	  images.
+
+The "horizontal" classification should be a bit in the image, the
+"vertical" classification should just happen inside the gradient
+file. Note though that
+
+      (a) these will change if the transformation/repeat changes.
+
+      (b) at the moment the optimization for linear gradients
+          takes the source rectangle into account. Presumably
+	  this is to also optimize the case where the gradient
+	  is close enough to horizontal?
+
+Who is responsible for repeats? In principle it should be the scanline
+fetch. Right now NORMAL repeats are handled by walk_composite_region()
+while other repeats are handled by the scanline code.
+
+
+(Random note on filtering: do you filter before or after
+transformation?  Hardware is going to filter after transformation;
+this is also what pixman does currently). It's not completely clear
+what filtering *after* transformation means. One thing that might look
+good would be to do *supersampling*, ie., compute multiple subpixels
+per destination pixel, then average them together.
diff --git a/pixman/solaris-hwcap.mapfile b/pixman/solaris-hwcap.mapfile
new file mode 100644
index 0000000..87efce1
--- /dev/null
+++ b/pixman/solaris-hwcap.mapfile
@@ -0,0 +1,30 @@
+###############################################################################
+#
+# Copyright 2009, Oracle and/or its affiliates. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+#
+# Override the linker's detection of CMOV/MMX/SSE instructions so this
+# library isn't flagged as only usable on CPUs with those ISAs, since it
+# checks at runtime for availability before calling them
+
+hwcap_1 = V0x0 FPU OVERRIDE;
diff --git a/test/Makefile.am b/test/Makefile.am
new file mode 100755
index 0000000..eeb3679
--- /dev/null
+++ b/test/Makefile.am
@@ -0,0 +1,13 @@
+include $(top_srcdir)/test/Makefile.sources
+
+AM_CFLAGS = $(OPENMP_CFLAGS)
+AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS)
+LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm  $(PNG_LIBS)
+INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)
+
+libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)
+
+noinst_LTLIBRARIES = libutils.la
+noinst_PROGRAMS = $(TESTPROGRAMS) $(BENCHMARKS)
+
+TESTS = $(TESTPROGRAMS)
diff --git a/test/Makefile.sources b/test/Makefile.sources
new file mode 100644
index 0000000..99eb705
--- /dev/null
+++ b/test/Makefile.sources
@@ -0,0 +1,36 @@
+# Tests (sorted by expected completion time)
+TESTPROGRAMS =			\
+	a1-trap-test		\
+	pdf-op-test		\
+	region-test		\
+	region-translate-test	\
+	fetch-test		\
+	oob-test		\
+	trap-crasher		\
+	alpha-loop		\
+	scaling-crash-test	\
+	scaling-helpers-test	\
+	gradient-crash-test	\
+	region-contains-test	\
+	alphamap		\
+	stress-test		\
+	composite-traps-test	\
+	blitters-test		\
+	scaling-test		\
+	affine-test		\
+	composite		\
+	$(NULL)
+
+# Benchmarks
+BENCHMARKS =			\
+	lowlevel-blt-bench	\
+	$(NULL)
+
+# Utility functions
+libutils_sources =		\
+	utils.c			\
+	$(NULL)
+
+libutils_headers =		\
+	utils.h			\
+	$(NULL)
diff --git a/test/Makefile.win32 b/test/Makefile.win32
new file mode 100755
index 0000000..307ba0c
--- /dev/null
+++ b/test/Makefile.win32
@@ -0,0 +1,31 @@
+default: all
+
+top_srcdir = ..
+include $(top_srcdir)/test/Makefile.sources
+include $(top_srcdir)/Makefile.win32.common
+
+TEST_LDADD = \
+	$(top_builddir)/pixman/$(CFG_VAR)/$(LIBRARY).lib \
+	$(CFG_VAR)/libutils.lib \
+	$(NULL)
+
+libutils_OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libutils_sources))
+
+SOURCES = $(patsubst %,   %.c,              $(TESTPROGRAMS) $(BENCHMARKS))
+OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(SOURCES))
+TESTS   = $(patsubst %,   $(CFG_VAR)/%.exe, $(TESTPROGRAMS))
+BENCHS  = $(patsubst %,   $(CFG_VAR)/%.exe, $(BENCHMARKS))
+
+all: inform $(TESTS) $(BENCHS)
+
+check: inform $(TESTS)
+	@for test in $(TESTS) ; do ./$$test && echo "PASS: $$test" || echo "FAIL: $$test" ; done
+
+$(CFG_VAR)/libutils.lib: $(libutils_OBJECTS)
+	@$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
+
+$(CFG_VAR)/%.exe: $(CFG_VAR)/%.obj $(TEST_LDADD)
+	@$(LD) $(PIXMAN_LDFLAGS) -OUT:$@ $^
+
+$(top_builddir)/pixman/$(CFG_VAR)/$(LIBRARY).lib:
+	@$(MAKE) -C $(top_builddir)/pixman -f Makefile.win32
diff --git a/test/a1-trap-test.c b/test/a1-trap-test.c
new file mode 100644
index 0000000..6163e7c
--- /dev/null
+++ b/test/a1-trap-test.c
@@ -0,0 +1,50 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 20
+#define HEIGHT 20
+    /* Composite solid red through an a1 mask holding one rasterized trapezoid. */
+    pixman_image_t *src_img;
+    pixman_image_t *mask_img;
+    pixman_image_t *dest_img;
+    pixman_trap_t trap;
+    pixman_color_t red = { 0xffff, 0x0000, 0x0000, 0xffff };
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *mbits = malloc (WIDTH * HEIGHT);
+    /* Mask bytes start as 0x00; every destination byte starts as 0xff. */
+    memset (mbits, 0, WIDTH * HEIGHT);
+    memset (bits, 0xff, WIDTH * HEIGHT * 4);
+    /* Trapezoid: left edge 0.5, right edge 1.5, from y = 0.5 down to y = 1.5. */
+    trap.top.l = pixman_double_to_fixed (0.5);
+    trap.top.r = pixman_double_to_fixed (1.5);
+    trap.top.y = pixman_double_to_fixed (0.5);
+
+    trap.bot.l = pixman_double_to_fixed (0.5);
+    trap.bot.r = pixman_double_to_fixed (1.5);
+    trap.bot.y = pixman_double_to_fixed (1.5);
+
+    mask_img = pixman_image_create_bits (
+	PIXMAN_a1, WIDTH, HEIGHT, mbits, WIDTH);
+    src_img = pixman_image_create_solid_fill (&red);
+    dest_img = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    /* Add the trapezoid to the mask, then draw red through the mask with OVER. */
+    pixman_add_traps (mask_img, 0, 0, 1, &trap);
+
+    pixman_image_composite (PIXMAN_OP_OVER,
+			    src_img, mask_img, dest_img,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    /* Pixel (0, 0) must be red; (1, 0), (0, 1) and (1, 1) must stay 0xffffffff. */
+    assert (bits[0] == 0xffff0000);
+    assert (bits[1] == 0xffffffff);
+    assert (bits[1 * WIDTH + 0] == 0xffffffff);
+    assert (bits[1 * WIDTH + 1] == 0xffffffff);
+    
+    return 0;
+}
diff --git a/test/affine-test.c b/test/affine-test.c
new file mode 100755
index 0000000..a4ceed3
--- /dev/null
+++ b/test/affine-test.c
@@ -0,0 +1,311 @@
+/*
+ * Test program, which can detect some problems with affine transformations
+ * in pixman. Testing is done by running lots of random SRC and OVER
+ * compositing operations on a8r8g8b8, x8r8g8b8, r5g6b5 and a8 color formats
+ * with random scaled, rotated and translated transforms.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH  16
+#define MAX_SRC_HEIGHT 16
+#define MAX_DST_WIDTH  16
+#define MAX_DST_HEIGHT 16
+#define MAX_STRIDE     4
+
+/*
+ * Run one pseudorandom composite seeded by 'testnum'; return the dst CRC32.
+ */
+uint32_t
+test_composite (int      testnum,
+		int      verbose)
+{
+    int                i;
+    pixman_image_t *   src_img;
+    pixman_image_t *   dst_img;
+    pixman_transform_t transform;
+    pixman_region16_t  clip;
+    int                src_width, src_height;
+    int                dst_width, dst_height;
+    int                src_stride, dst_stride;
+    int                src_x, src_y;
+    int                dst_x, dst_y;
+    int                src_bpp;
+    int                dst_bpp;
+    int                w, h;
+    pixman_fixed_t     scale_x = 65536, scale_y = 65536;
+    pixman_fixed_t     translate_x = 0, translate_y = 0;
+    pixman_op_t        op;
+    pixman_repeat_t    repeat = PIXMAN_REPEAT_NONE;
+    pixman_format_code_t src_fmt, dst_fmt;
+    uint32_t *         srcbuf;
+    uint32_t *         dstbuf;
+    uint32_t           crc32;
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    lcg_srand (testnum);
+    /* NOTE: reordering the lcg_rand_* calls below would alter the output CRC. */
+    src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
+    dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
+    op = (lcg_rand_n (2) == 0) ? PIXMAN_OP_SRC : PIXMAN_OP_OVER;
+
+    src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+    src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+    dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
+    dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
+    src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
+    dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
+    /* Strides are multiples of 2; one that is 2 mod 4 is padded to 4 bytes. */
+    if (src_stride & 3)
+	src_stride += 2;
+
+    if (dst_stride & 3)
+	dst_stride += 2;
+
+    src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
+    src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
+    dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
+    dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
+    w = lcg_rand_n (dst_width * 3 / 2 - dst_x);
+    h = lcg_rand_n (dst_height * 3 / 2 - dst_y);
+
+    srcbuf = (uint32_t *)malloc (src_stride * src_height);
+    dstbuf = (uint32_t *)malloc (dst_stride * dst_height);
+
+    for (i = 0; i < src_stride * src_height; i++)
+	*((uint8_t *)srcbuf + i) = lcg_rand_n (256);
+
+    for (i = 0; i < dst_stride * dst_height; i++)
+	*((uint8_t *)dstbuf + i) = lcg_rand_n (256);
+
+    src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ?
+                              PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+    dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ?
+                              PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+    src_img = pixman_image_create_bits (
+        src_fmt, src_width, src_height, srcbuf, src_stride);
+
+    dst_img = pixman_image_create_bits (
+        dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
+
+    image_endian_swap (src_img);
+    image_endian_swap (dst_img);
+
+    pixman_transform_init_identity (&transform);
+
+    if (lcg_rand_n (3) > 0)
+    {
+	scale_x = -65536 * 3 + lcg_rand_N (65536 * 6);
+	if (lcg_rand_n (2))
+	    scale_y = -65536 * 3 + lcg_rand_N (65536 * 6);
+	else
+	    scale_y = scale_x;
+	pixman_transform_init_scale (&transform, scale_x, scale_y);
+    }
+    if (lcg_rand_n (3) > 0)
+    {
+	translate_x = -65536 * 3 + lcg_rand_N (6 * 65536);
+	if (lcg_rand_n (2))
+	    translate_y = -65536 * 3 + lcg_rand_N (6 * 65536);
+	else
+	    translate_y = translate_x;
+	pixman_transform_translate (&transform, NULL, translate_x, translate_y);
+    }
+
+    if (lcg_rand_n (4) > 0)
+    {
+	int c, s, tx = 0, ty = 0;
+	switch (lcg_rand_n (4))
+	{
+	case 0:
+	    /* 90 degrees */
+	    c = 0;
+	    s = pixman_fixed_1;
+	    tx = pixman_int_to_fixed (MAX_SRC_HEIGHT);
+	    break;
+	case 1:
+	    /* 180 degrees */
+	    c = -pixman_fixed_1;
+	    s = 0;
+	    tx = pixman_int_to_fixed (MAX_SRC_WIDTH);
+	    ty = pixman_int_to_fixed (MAX_SRC_HEIGHT);
+	    break;
+	case 2:
+	    /* 270 degrees */
+	    c = 0;
+	    s = -pixman_fixed_1;
+	    ty = pixman_int_to_fixed (MAX_SRC_WIDTH);
+	    break;
+	default:
+	    /* arbitrary rotation */
+	    c = lcg_rand_N (2 * 65536) - 65536;
+	    s = lcg_rand_N (2 * 65536) - 65536;
+	    break;
+	}
+	pixman_transform_rotate (&transform, NULL, c, s);
+	pixman_transform_translate (&transform, NULL, tx, ty);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+	/* Flip random bits; 1U keeps the shift defined when bitnum is 31 */
+	int maxflipcount = 8;
+	while (maxflipcount--)
+	{
+	    int row = lcg_rand_n (2);
+	    int col = lcg_rand_n (3);
+	    int bitnum = lcg_rand_n (32);
+	    transform.matrix[row][col] ^= 1U << bitnum;
+	    if (lcg_rand_n (2))
+		break;
+	}
+    }
+
+    pixman_image_set_transform (src_img, &transform);
+
+    switch (lcg_rand_n (4))
+    {
+    case 0:
+	repeat = PIXMAN_REPEAT_NONE;
+	break;
+
+    case 1:
+	repeat = PIXMAN_REPEAT_NORMAL;
+	break;
+
+    case 2:
+	repeat = PIXMAN_REPEAT_PAD;
+	break;
+
+    case 3:
+	repeat = PIXMAN_REPEAT_REFLECT;
+	break;
+
+    default:
+        break;
+    }
+    pixman_image_set_repeat (src_img, repeat);
+
+    if (lcg_rand_n (2))
+	pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+    else
+	pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    if (verbose)
+    {
+	printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt);
+	printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n",
+	        op, scale_x, scale_y, repeat);
+	printf ("translate_x=%d, translate_y=%d\n",
+	        translate_x, translate_y);
+	printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+	        src_width, src_height, dst_width, dst_height);
+	printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+	        src_x, src_y, dst_x, dst_y);
+	printf ("w=%d, h=%d\n", w, h);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = lcg_rand_n (2) + 1;
+
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = lcg_rand_n (src_width);
+	    clip_boxes[i].y1 = lcg_rand_n (src_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("source clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (src_img, &clip);
+	pixman_image_set_source_clipping (src_img, 1);
+	pixman_region_fini (&clip);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = lcg_rand_n (2) + 1;
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = lcg_rand_n (dst_width);
+	    clip_boxes[i].y1 = lcg_rand_n (dst_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("destination clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (dst_img, &clip);
+	pixman_region_fini (&clip);
+    }
+
+    pixman_image_composite (op, src_img, NULL, dst_img,
+                            src_x, src_y, 0, 0, dst_x, dst_y, w, h);
+
+    if (dst_fmt == PIXMAN_x8r8g8b8)
+    {
+	/* ignore unused part */
+	for (i = 0; i < dst_stride * dst_height / 4; i++)
+	    dstbuf[i] &= 0xFFFFFF;
+    }
+
+    image_endian_swap (dst_img);
+
+    if (verbose)
+    {
+	int j;
+
+	for (i = 0; i < dst_height; i++)
+	{
+	    for (j = 0; j < dst_stride; j++)
+		printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
+
+	    printf ("\n");
+	}
+    }
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+
+    crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height);
+    free (srcbuf);
+    free (dstbuf);
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    pixman_disable_out_of_bounds_workaround ();
+    /* Args 2-3 are presumably the run count and expected CRC; see utils.c. */
+    return fuzzer_test_main ("affine", 8000000, 0x1EF2175A,
+			     test_composite, argc, argv);
+}
diff --git a/test/alpha-loop.c b/test/alpha-loop.c
new file mode 100644
index 0000000..e4d90a9
--- /dev/null
+++ b/test/alpha-loop.c
@@ -0,0 +1,29 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
+#define WIDTH 400
+#define HEIGHT 200
+
+int
+main (int argc, char **argv)
+{
+    uint8_t *alpha = make_random_bytes (WIDTH * HEIGHT);
+    uint32_t *src = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * 4);
+    uint32_t *dest = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * 4);
+
+    pixman_image_t *a = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, (uint32_t *)alpha, WIDTH);
+    pixman_image_t *d = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4);
+    pixman_image_t *s = pixman_image_create_bits (PIXMAN_a2r10g10b10, WIDTH, HEIGHT, src, WIDTH * 4);
+    /* Watchdog: presumably fails the test if it runs longer than 5 seconds. */
+    fail_after (5, "Infinite loop detected: 5 seconds without progress\n");
+    /* Try to make s and a each other's alpha map, i.e. a reference cycle. */
+    pixman_image_set_alpha_map (s, a, 0, 0);
+    pixman_image_set_alpha_map (a, s, 0, 0);
+
+    pixman_image_composite (PIXMAN_OP_SRC, s, NULL, d, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    /* Only s is unreferenced here; a, d and the pixel buffers are not freed. */
+    pixman_image_unref (s);
+
+    return 0;
+}
diff --git a/test/alphamap.c b/test/alphamap.c
new file mode 100644
index 0000000..554b309
--- /dev/null
+++ b/test/alphamap.c
@@ -0,0 +1,256 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
+#define WIDTH 100
+#define HEIGHT 100
+
+static const pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_a8
+};
+
+static const pixman_format_code_t alpha_formats[] =
+{
+    PIXMAN_null,
+    PIXMAN_a8,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_a4r4g4b4
+};
+
+static const int origins[] =
+{
+    0, 10, -100
+};
+
+static const char *
+format_name (pixman_format_code_t format)
+{
+    /* Printable name for each format this test uses; "none" for PIXMAN_null. */
+    switch ((uint32_t) format)
+    {
+    case PIXMAN_a8:          return "a8";
+    case PIXMAN_a2r10g10b10: return "a2r10g10b10";
+    case PIXMAN_a8r8g8b8:    return "a8r8g8b8";
+    case PIXMAN_a4r4g4b4:    return "a4r4g4b4";
+    case PIXMAN_null:        return "none";
+    default:                 break;
+    }
+
+    /* Not one of the handled formats: trip the assert, else return a marker. */
+    assert (0);
+    return "<unknown - bug in alphamap.c>";
+}
+
+static void
+on_destroy (pixman_image_t *image, void *data)
+{
+    uint32_t *bits = pixman_image_get_data (image);
+    /* Destroy callback installed by make_image: frees the image's pixel data. */
+    fence_free (bits);
+}
+
+static pixman_image_t *
+make_image (pixman_format_code_t format)
+{
+    uint32_t *bits;
+    uint8_t bpp = PIXMAN_FORMAT_BPP (format) / 8;
+    pixman_image_t *image;
+    /* Note: bpp here is BYTES per pixel; it sizes the buffer and the stride. */
+    bits = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * bpp);
+
+    image = pixman_image_create_bits (format, WIDTH, HEIGHT, bits, WIDTH * bpp);
+    /* Register on_destroy only when both the image and our buffer exist. */
+    if (image && bits)
+	pixman_image_set_destroy_function (image, on_destroy, NULL);
+
+    return image;
+}
+
+static pixman_image_t *
+create_image (pixman_format_code_t format, pixman_format_code_t alpha_format,
+	      int alpha_origin_x, int alpha_origin_y)
+{
+    pixman_image_t *image = make_image (format);
+    /* An alpha_format of PIXMAN_null means no alpha map is attached. */
+    if (alpha_format != PIXMAN_null)
+    {
+	pixman_image_t *alpha = make_image (alpha_format);
+	/* Attach it at the given origin, then drop our own reference to it. */
+	pixman_image_set_alpha_map (image, alpha,
+				    alpha_origin_x, alpha_origin_y);
+	pixman_image_unref (alpha);
+    }
+
+    return image;
+}
+
+static uint8_t
+get_alpha (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
+{
+    uint8_t *bits;
+    uint8_t r = 0; /* defined result even when assert() is compiled out */
+    /* Alpha at (x, y) as 8 bits; read from the alpha map when one is set. */
+    if (image->common.alpha_map)
+    {
+	if (x - orig_x >= 0 && x - orig_x < WIDTH &&
+	    y - orig_y >= 0 && y - orig_y < HEIGHT)
+	{
+	    image = (pixman_image_t *)image->common.alpha_map;
+
+	    x -= orig_x;
+	    y -= orig_y;
+	}
+	else
+	{
+	    return 0;
+	}
+    }
+
+    bits = (uint8_t *)image->bits.bits;
+    /* Alpha fields narrower than 8 bits are widened by bit replication. */
+    if (image->bits.format == PIXMAN_a8)
+    {
+	r = bits[y * WIDTH + x];
+    }
+    else if (image->bits.format == PIXMAN_a2r10g10b10)
+    {
+	r = ((uint32_t *)bits)[y * WIDTH + x] >> 30;
+	r |= r << 2;
+	r |= r << 4;
+    }
+    else if (image->bits.format == PIXMAN_a8r8g8b8)
+    {
+	r = ((uint32_t *)bits)[y * WIDTH + x] >> 24;
+    }
+    else if (image->bits.format == PIXMAN_a4r4g4b4)
+    {
+	r = ((uint16_t *)bits)[y * WIDTH + x] >> 12;
+	r |= r << 4;
+    }
+    else
+    {
+	assert (0);
+    }
+
+    return r;
+}
+
+#define ARRAY_LENGTH(A) ((int) (sizeof (A) / sizeof ((A) [0])))
+
+static int
+run_test (int s, int d, int sa, int da, int soff, int doff)
+{
+    pixman_format_code_t sf = formats[s];
+    pixman_format_code_t df = formats[d];
+    pixman_format_code_t saf = alpha_formats[sa];
+    pixman_format_code_t daf = alpha_formats[da];
+    pixman_image_t *src, *dst, *orig_dst;
+    pixman_transform_t t1;
+    int j, k;
+    int n_alpha_bits;
+    /* s/d, sa/da and soff/doff index formats[], alpha_formats[] and origins[]. */
+    soff = origins[soff];
+    doff = origins[doff];
+    /* Compare only as many alpha bits as the dest (or its alpha map) holds. */
+    n_alpha_bits = PIXMAN_FORMAT_A (df);
+    if (daf != PIXMAN_null)
+	n_alpha_bits = PIXMAN_FORMAT_A (daf);
+
+
+    src = create_image (sf, saf, soff, soff);
+    orig_dst = create_image (df, daf, doff, doff);
+    dst = create_image (df, daf, doff, doff);
+
+    /* Transformations, repeats and filters on destinations should be ignored,
+     * so just set some random ones.
+     */
+    pixman_transform_init_identity (&t1);
+    pixman_transform_scale (&t1, NULL, pixman_int_to_fixed (100), pixman_int_to_fixed (11));
+    pixman_transform_rotate (&t1, NULL, pixman_double_to_fixed (0.5), pixman_double_to_fixed (0.11));
+    pixman_transform_translate (&t1, NULL, pixman_int_to_fixed (11), pixman_int_to_fixed (17));
+
+    pixman_image_set_transform (dst, &t1);
+    pixman_image_set_filter (dst, PIXMAN_FILTER_BILINEAR, NULL, 0);
+    pixman_image_set_repeat (dst, PIXMAN_REPEAT_REFLECT);
+    /* First copy orig_dst into dst (SRC), then ADD src on top of it. */
+    pixman_image_composite (PIXMAN_OP_SRC, orig_dst, NULL, dst,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    pixman_image_composite (PIXMAN_OP_ADD, src, NULL, dst,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    /* Expected alpha is the sum of src and original dest alpha, capped at 255. */
+    for (j = MAX (doff, 0); j < MIN (HEIGHT, HEIGHT + doff); ++j)
+    {
+	for (k = MAX (doff, 0); k < MIN (WIDTH, WIDTH + doff); ++k)
+	{
+	    uint8_t sa, da, oda, ref;
+	    /* NB: these locals shadow the sa and da parameters. */
+	    sa = get_alpha (src, k, j, soff, soff);
+	    da = get_alpha (dst, k, j, doff, doff);
+	    oda = get_alpha (orig_dst, k, j, doff, doff);
+
+	    if (sa + oda > 255)
+		ref = 255;
+	    else
+		ref = sa + oda;
+
+	    if (da >> (8 - n_alpha_bits) != ref >> (8 - n_alpha_bits))
+	    {
+		printf ("\nWrong alpha value at (%d, %d). Should be 0x%x; got 0x%x. Source was 0x%x, original dest was 0x%x\n",
+			k, j, ref, da, sa, oda);
+
+		printf ("src: %s, alpha: %s, origin %d %d\ndst: %s, alpha: %s, origin: %d %d\n\n",
+			format_name (sf),
+			format_name (saf),
+			soff, soff,
+			format_name (df),
+			format_name (daf),
+			doff, doff);
+		return 1;
+	    }
+	}
+    }
+    /* Detach the alpha maps before releasing the three images. */
+    pixman_image_set_alpha_map (src, NULL, 0, 0);
+    pixman_image_set_alpha_map (dst, NULL, 0, 0);
+    pixman_image_set_alpha_map (orig_dst, NULL, 0, 0);
+
+    pixman_image_unref (src);
+    pixman_image_unref (dst);
+    pixman_image_unref (orig_dst);
+    /* All checked pixels matched (a mismatch returns 1 from the loop above). */
+    return 0;
+}
+
+int
+main (int argc, char **argv)
+{
+    int i, j, a, b, x, y;
+    /* Try every src/dst format, src/dst alpha format and src/dst origin combo. */
+    for (i = 0; i < ARRAY_LENGTH (formats); ++i)
+    {
+	for (j = 0; j < ARRAY_LENGTH (formats); ++j)
+	{
+	    for (a = 0; a < ARRAY_LENGTH (alpha_formats); ++a)
+	    {
+		for (b = 0; b < ARRAY_LENGTH (alpha_formats); ++b)
+		{
+		    for (x = 0; x < ARRAY_LENGTH (origins); ++x)
+		    {
+			for (y = 0; y < ARRAY_LENGTH (origins); ++y)
+			{
+			    if (run_test (i, j, a, b, x, y) != 0)
+				return 1;
+			}
+		    }
+		}
+	    }
+	}
+    }
+
+    return 0;
+}
diff --git a/test/blitters-test.c b/test/blitters-test.c
new file mode 100755
index 0000000..4f931c4
--- /dev/null
+++ b/test/blitters-test.c
@@ -0,0 +1,430 @@
+/*
+ * Test program, which stresses the use of different color formats and
+ * compositing operations.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+static pixman_indexed_t rgb_palette[9];	/* indexed by bpp; main () fills 1..8 */
+static pixman_indexed_t y_palette[9];	/* indexed by bpp; main () fills 1..8 */
+
+/* The first eight formats in the list are by far the most widely
+ * used formats, so we test those more than the others
+ */
+#define N_MOST_LIKELY_FORMATS 8
+
+/* Create random image for testing purposes; the format is picked from the
+ * PIXMAN_null-terminated allowed_formats list and stored in *used_fmt if set. */
+static pixman_image_t *
+create_random_image (pixman_format_code_t *allowed_formats,
+		     int                   max_width,
+		     int                   max_height,
+		     int                   max_extra_stride,
+		     pixman_format_code_t *used_fmt)
+{
+    int n = 0, i, width, height, stride;
+    pixman_format_code_t fmt;
+    uint32_t *buf;
+    pixman_image_t *img;
+
+    while (allowed_formats[n] != PIXMAN_null)
+	n++;
+    /* unless lcg_rand_n (4) yields 0, only pick among the leading entries */
+    if (n > N_MOST_LIKELY_FORMATS && lcg_rand_n (4) != 0)
+	n = N_MOST_LIKELY_FORMATS;
+    fmt = allowed_formats[lcg_rand_n (n)];
+
+    width = lcg_rand_n (max_width) + 1;
+    height = lcg_rand_n (max_height) + 1;
+    stride = (width * PIXMAN_FORMAT_BPP (fmt) + 7) / 8 +
+	lcg_rand_n (max_extra_stride + 1);
+    stride = (stride + 3) & ~3;	/* round up to a multiple of 4 bytes */
+
+    /* do the allocation */
+    buf = aligned_malloc (64, stride * height);
+    /* initialize image with random data */
+    for (i = 0; i < stride * height; i++)
+    {
+	/* generation is biased to having more 0 or 255 bytes as
+	 * they are more likely to be special-cased in code
+	 */
+	*((uint8_t *)buf + i) = lcg_rand_n (4) ? lcg_rand_n (256) :
+	    (lcg_rand_n (2) ? 0 : 255);
+    }
+
+    img = pixman_image_create_bits (fmt, width, height, buf, stride);
+
+    if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR)
+    {
+	pixman_image_set_indexed (img, &(rgb_palette[PIXMAN_FORMAT_BPP (fmt)]));
+    }
+    else if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_GRAY)
+    {
+	pixman_image_set_indexed (img, &(y_palette[PIXMAN_FORMAT_BPP (fmt)]));
+    }
+
+    if (lcg_rand_n (16) == 0)
+	pixman_image_set_filter (img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    image_endian_swap (img);
+
+    if (used_fmt) *used_fmt = fmt;
+    return img;
+}
+
+/* Free random image; returns the crc32 of its data unless fmt is PIXMAN_null */
+static uint32_t
+free_random_image (uint32_t initcrc,
+		   pixman_image_t *img,
+		   pixman_format_code_t fmt)
+{
+    uint32_t crc32 = 0;
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);
+
+    if (fmt != PIXMAN_null)
+    {
+	/* mask unused 'x' part */
+	if (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt) &&
+	    PIXMAN_FORMAT_DEPTH (fmt) != 0)
+	{
+	    int i;
+	    uint32_t mask = (1U << PIXMAN_FORMAT_DEPTH (fmt)) - 1;
+
+	    if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_BGRA ||
+		PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_RGBA)
+	    {
+		mask <<= (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt));
+	    }
+
+	    /* replicate the per-pixel mask over the 32-bit word; stop before
+	     * the shift count reaches 32, which is undefined behavior in C */
+	    for (i = 0; i * PIXMAN_FORMAT_BPP (fmt) < 32; i++)
+		mask |= mask << (i * PIXMAN_FORMAT_BPP (fmt));
+	    for (i = 0; i < stride * height / 4; i++)
+		data[i] &= mask;
+	}
+
+	/* swap endianness in order to provide identical results on both big
+	 * and little endian systems
+	 */
+	image_endian_swap (img);
+	crc32 = compute_crc32 (initcrc, data, stride * height);
+    }
+
+    pixman_image_unref (img);
+    free (data);
+
+    return crc32;
+}
+
+static pixman_op_t op_list[] = {	/* SRC, OVER and ADD each fill two slots */
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+#if 0 /* these use floating point math and are not always bitexact on different platforms */
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+#endif
+};
+
+static pixman_format_code_t img_fmt_list[] = {	/* PIXMAN_null-terminated */
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_a8,
+    PIXMAN_a1,		/* last of the N_MOST_LIKELY_FORMATS leading entries */
+    PIXMAN_r3g3b2,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x14r6g6b6,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_r3g3b2,	/* second occurrence; also listed above */
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_c8,
+    PIXMAN_g8,
+    PIXMAN_x4c4,
+    PIXMAN_x4g4,
+    PIXMAN_c4,
+    PIXMAN_g4,
+    PIXMAN_g1,
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_null
+};
+
+static pixman_format_code_t mask_fmt_list[] = {	/* PIXMAN_null-terminated */
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8,
+    PIXMAN_a4,
+    PIXMAN_a1,
+    PIXMAN_null
+};
+
+
+/* Composite operation with pseudorandom images: every choice is derived from
+ * lcg_srand (testnum), and the crc32 of the destination pixels is returned.
+ */
+uint32_t
+test_composite (int testnum, int verbose)
+{
+    int i;
+    pixman_image_t *src_img = NULL;
+    pixman_image_t *dst_img = NULL;
+    pixman_image_t *mask_img = NULL;
+    int src_width, src_height;
+    int dst_width, dst_height;
+    int src_stride, dst_stride;
+    int src_x, src_y;
+    int dst_x, dst_y;
+    int mask_x, mask_y;
+    int w, h;
+    pixman_op_t op;
+    pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
+    uint32_t *dstbuf, *srcbuf, *maskbuf;
+    uint32_t crc32;
+    int max_width, max_height, max_extra_stride;
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    max_width = max_height = 24 + testnum / 10000;
+    max_extra_stride = 4 + testnum / 1000000;
+
+    if (max_width > 256)
+	max_width = 256;
+
+    if (max_height > 16)
+	max_height = 16;
+
+    if (max_extra_stride > 8)
+	max_extra_stride = 8;
+
+    lcg_srand (testnum);
+
+    op = op_list[lcg_rand_n (sizeof (op_list) / sizeof (op_list[0]))];
+
+    if (lcg_rand_n (8))
+    {
+	/* normal image */
+	src_img = create_random_image (img_fmt_list, max_width, max_height,
+				       max_extra_stride, &src_fmt);
+    }
+    else
+    {
+	/* solid case */
+	src_img = create_random_image (img_fmt_list, 1, 1,
+				       max_extra_stride, &src_fmt);
+
+	pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+    }
+
+    dst_img = create_random_image (img_fmt_list, max_width, max_height,
+				   max_extra_stride, &dst_fmt);
+
+    src_width = pixman_image_get_width (src_img);
+    src_height = pixman_image_get_height (src_img);
+    src_stride = pixman_image_get_stride (src_img);
+
+    dst_width = pixman_image_get_width (dst_img);
+    dst_height = pixman_image_get_height (dst_img);
+    dst_stride = pixman_image_get_stride (dst_img);
+
+    dstbuf = pixman_image_get_data (dst_img);
+    srcbuf = pixman_image_get_data (src_img);
+
+    src_x = lcg_rand_n (src_width);
+    src_y = lcg_rand_n (src_height);
+    dst_x = lcg_rand_n (dst_width);
+    dst_y = lcg_rand_n (dst_height);
+
+    mask_img = NULL;
+    mask_fmt = PIXMAN_null;
+    mask_x = 0;
+    mask_y = 0;
+    maskbuf = NULL;
+
+    if ((src_fmt == PIXMAN_x8r8g8b8 || src_fmt == PIXMAN_x8b8g8r8) &&
+	(lcg_rand_n (4) == 0))
+    {
+	/* PIXBUF: the mask shares the source's buffer, viewed with alpha */
+	mask_fmt = lcg_rand_n (2) ? PIXMAN_a8r8g8b8 : PIXMAN_a8b8g8r8;
+	mask_img = pixman_image_create_bits (mask_fmt,
+	                                     src_width,
+	                                     src_height,
+	                                     srcbuf,
+	                                     src_stride);
+	mask_x = src_x;
+	mask_y = src_y;
+	maskbuf = srcbuf;
+    }
+    else if (lcg_rand_n (2))
+    {
+	if (lcg_rand_n (2))
+	{
+	    mask_img = create_random_image (mask_fmt_list, max_width, max_height,
+					   max_extra_stride, &mask_fmt);
+	}
+	else
+	{
+	    /* solid case */
+	    mask_img = create_random_image (mask_fmt_list, 1, 1,
+					   max_extra_stride, &mask_fmt);
+	    pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
+	}
+
+	if (lcg_rand_n (2))
+	    pixman_image_set_component_alpha (mask_img, 1);
+
+	mask_x = lcg_rand_n (pixman_image_get_width (mask_img));
+	mask_y = lcg_rand_n (pixman_image_get_height (mask_img));
+    }
+
+    /* composite extent, drawn from the room left after (dst_x, dst_y) */
+    w = lcg_rand_n (dst_width - dst_x + 1);
+    h = lcg_rand_n (dst_height - dst_y + 1);
+
+    if (verbose)
+    {
+	printf ("op=%d, src_fmt=%08X, dst_fmt=%08X, mask_fmt=%08X\n",
+	    op, src_fmt, dst_fmt, mask_fmt);
+	printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+	    src_width, src_height, dst_width, dst_height);
+	printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+	    src_x, src_y, dst_x, dst_y);
+	printf ("src_stride=%d, dst_stride=%d\n",
+	    src_stride, dst_stride);
+	printf ("w=%d, h=%d\n", w, h);
+    }
+
+    pixman_image_composite (op, src_img, mask_img, dst_img,
+			    src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
+
+    if (verbose)
+    {
+	int j;
+
+	printf ("---\n");
+	for (i = 0; i < dst_height; i++)
+	{
+	    for (j = 0; j < dst_stride; j++)
+	    {
+		if (j == (dst_width * PIXMAN_FORMAT_BPP (dst_fmt) + 7) / 8)
+		    printf ("| ");
+
+		printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
+	    }
+	    printf ("\n");
+	}
+	printf ("---\n");
+    }
+
+    free_random_image (0, src_img, PIXMAN_null);
+    crc32 = free_random_image (0, dst_img, dst_fmt);
+
+    if (mask_img)
+    {
+	if (srcbuf == maskbuf)	/* PIXBUF: pixels were freed with src_img */
+	    pixman_image_unref(mask_img);
+	else
+	    free_random_image (0, mask_img, PIXMAN_null);
+    }
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    int bpp;
+
+    /* fill the palette slots that create_random_image () looks up by bpp */
+    for (bpp = 1; bpp <= 8; ++bpp)
+    {
+	initialize_palette (rgb_palette + bpp, bpp, TRUE);
+	initialize_palette (y_palette + bpp, bpp, FALSE);
+    }
+
+    return fuzzer_test_main ("blitters", 2000000, 0x29137844,
+			     test_composite, argc, argv);
+}
diff --git a/test/composite-traps-test.c b/test/composite-traps-test.c
new file mode 100755
index 0000000..fa6d8a9
--- /dev/null
+++ b/test/composite-traps-test.c
@@ -0,0 +1,257 @@
+/* Based loosely on scaling-test */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH  48
+#define MAX_SRC_HEIGHT 48
+#define MAX_DST_WIDTH  48
+#define MAX_DST_HEIGHT 48
+#define MAX_STRIDE     4	/* bounds the random per-row padding, in pixels */
+/* formats the source and destination images are drawn from */
+static pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_r5g6b5, PIXMAN_a1, PIXMAN_a4
+};
+/* formats passed to pixman_composite_trapezoids () as mask_format */
+static pixman_format_code_t mask_formats[] =
+{
+    PIXMAN_a1, PIXMAN_a4, PIXMAN_a8,
+};
+/* operators the test chooses from */
+static pixman_op_t operators[] =
+{
+    PIXMAN_OP_OVER, PIXMAN_OP_ADD, PIXMAN_OP_SRC, PIXMAN_OP_IN
+};
+/* element of a true array (not a pointer) selected with lcg_rand_n () */
+#define RANDOM_ELT(array)						\
+    ((array)[lcg_rand_n(ARRAY_LENGTH((array)))])
+
+static void
+destroy_bits (pixman_image_t *image, void *data)
+{
+    fence_free (data);	/* data is the pixel buffer registered with the image */
+}
+
+static pixman_fixed_t
+random_fixed (int n)
+{
+    return lcg_rand_N (n << 16);	/* n << 16: presumably n as 16.16 fixed point */
+}
+
+/* Composite random trapezoids onto a pseudorandom destination image; all
+ * randomness is seeded from testnum and the destination's crc32 is returned.
+ */
+uint32_t
+test_composite (int      testnum,
+		int      verbose)
+{
+    int                i;
+    pixman_image_t *   src_img;
+    pixman_image_t *   dst_img;
+    pixman_region16_t  clip;
+    int                dst_width, dst_height;
+    int                dst_stride;
+    int                dst_x, dst_y;
+    int                dst_bpp;
+    pixman_op_t        op;
+    uint32_t *         dst_bits;
+    uint32_t           crc32;
+    pixman_format_code_t mask_format, dst_format;
+    pixman_trapezoid_t *traps;
+    int src_x, src_y;
+    int n_traps;
+
+    static pixman_color_t colors[] =
+    {
+	{ 0xffff, 0xffff, 0xffff, 0xffff },
+	{ 0x0000, 0x0000, 0x0000, 0x0000 },
+	{ 0xabcd, 0xabcd, 0x0000, 0xabcd },
+	{ 0x0000, 0x0000, 0x0000, 0xffff },
+	{ 0x0101, 0x0101, 0x0101, 0x0101 },
+	{ 0x7777, 0x6666, 0x5555, 0x9999 },
+    };
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    lcg_srand (testnum);
+
+    op = RANDOM_ELT (operators);
+    mask_format = RANDOM_ELT (mask_formats);
+
+    /* Create source image */
+
+    if (lcg_rand_n (4) == 0)
+    {
+	src_img = pixman_image_create_solid_fill (
+	    &(colors[lcg_rand_n (ARRAY_LENGTH (colors))]));
+
+	src_x = 10;
+	src_y = 234;
+    }
+    else
+    {
+	pixman_format_code_t src_format = RANDOM_ELT(formats);
+	int src_bpp = (PIXMAN_FORMAT_BPP (src_format) + 7) / 8;
+	int src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+	int src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+	int src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
+	uint32_t *bits;
+
+	src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
+	src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
+
+	src_stride = (src_stride + 3) & ~3;
+
+	bits = (uint32_t *)make_random_bytes (src_stride * src_height);
+
+	src_img = pixman_image_create_bits (
+	    src_format, src_width, src_height, bits, src_stride);
+
+	pixman_image_set_destroy_function (src_img, destroy_bits, bits);
+
+	if (lcg_rand_n (8) == 0)
+	{
+	    pixman_box16_t clip_boxes[2];
+	    int            n = lcg_rand_n (2) + 1;
+
+	    for (i = 0; i < n; i++)
+	    {
+		clip_boxes[i].x1 = lcg_rand_n (src_width);
+		clip_boxes[i].y1 = lcg_rand_n (src_height);
+		clip_boxes[i].x2 =
+		    clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
+		clip_boxes[i].y2 =
+		    clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
+
+		if (verbose)
+		{
+		    printf ("source clip box: [%d,%d-%d,%d]\n",
+			    clip_boxes[i].x1, clip_boxes[i].y1,
+			    clip_boxes[i].x2, clip_boxes[i].y2);
+		}
+	    }
+
+	    pixman_region_init_rects (&clip, clip_boxes, n);
+	    pixman_image_set_clip_region (src_img, &clip);
+	    pixman_image_set_source_clipping (src_img, 1);
+	    pixman_region_fini (&clip);
+	}
+
+	image_endian_swap (src_img);
+    }
+
+    /* Create destination image */
+    {
+	dst_format = RANDOM_ELT(formats);
+	dst_bpp = (PIXMAN_FORMAT_BPP (dst_format) + 7) / 8;
+	dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
+	dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
+	dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
+	dst_stride = (dst_stride + 3) & ~3;
+
+	dst_bits = (uint32_t *)make_random_bytes (dst_stride * dst_height);
+
+	dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
+	dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
+
+	dst_img = pixman_image_create_bits (
+	    dst_format, dst_width, dst_height, dst_bits, dst_stride);
+
+	image_endian_swap (dst_img);
+    }
+
+    /* Create traps */
+    {
+	int i;
+
+	n_traps = lcg_rand_n (25);
+	traps = fence_malloc (n_traps * sizeof (pixman_trapezoid_t));
+
+	for (i = 0; i < n_traps; ++i)
+	{
+	    pixman_trapezoid_t *t = &(traps[i]);
+	    /* NOTE(review): the "- MAX_DST_* / 2" offsets are raw fixed-point units */
+	    t->top = random_fixed (MAX_DST_HEIGHT) - MAX_DST_HEIGHT / 2;
+	    t->bottom = t->top + random_fixed (MAX_DST_HEIGHT);
+	    t->left.p1.x = random_fixed (MAX_DST_WIDTH) - MAX_DST_WIDTH / 2;
+	    t->left.p1.y = t->top - random_fixed (50);
+	    t->left.p2.x = random_fixed (MAX_DST_WIDTH) - MAX_DST_WIDTH / 2;
+	    t->left.p2.y = t->bottom + random_fixed (50);
+	    t->right.p1.x = t->left.p1.x + random_fixed (MAX_DST_WIDTH);
+	    t->right.p1.y = t->top - random_fixed (50);
+	    t->right.p2.x = t->left.p2.x + random_fixed (MAX_DST_WIDTH);
+	    t->right.p2.y = t->bottom - random_fixed (50);
+	}
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = lcg_rand_n (2) + 1;
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = lcg_rand_n (dst_width);
+	    clip_boxes[i].y1 = lcg_rand_n (dst_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("destination clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (dst_img, &clip);
+	pixman_region_fini (&clip);
+    }
+
+    pixman_composite_trapezoids (op, src_img, dst_img, mask_format,
+				 src_x, src_y, dst_x, dst_y, n_traps, traps);
+
+    if (dst_format == PIXMAN_x8r8g8b8)
+    {
+	/* ignore unused part */
+	for (i = 0; i < dst_stride * dst_height / 4; i++)
+	    dst_bits[i] &= 0xFFFFFF;
+    }
+
+    image_endian_swap (dst_img);
+
+    if (verbose)
+    {
+	int j;
+
+	for (i = 0; i < dst_height; i++)
+	{
+	    for (j = 0; j < dst_stride; j++)
+		printf ("%02X ", *((uint8_t *)dst_bits + i * dst_stride + j));
+
+	    printf ("\n");
+	}
+    }
+
+    crc32 = compute_crc32 (0, dst_bits, dst_stride * dst_height);
+    /* no destroy function was set on dst_img, so its bits are freed here */
+    fence_free (dst_bits);
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+    fence_free (traps);
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main("composite traps", 40000, 0xE3112106,
+			    test_composite, argc, argv);	/* per-test callback */
+}
diff --git a/test/composite.c b/test/composite.c
new file mode 100755
index 0000000..408c363
--- /dev/null
+++ b/test/composite.c
@@ -0,0 +1,920 @@
+/*
+ * Copyright Â© 2005 Eric Anholt
+ * Copyright Â© 2009 Chris Wilson
+ * Copyright Â© 2010 Soeren Sandmann
+ * Copyright Â© 2010 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Eric Anholt not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Eric Anholt makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * ERIC ANHOLT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL ERIC ANHOLT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+#define PIXMAN_USE_INTERNAL_API
+#include <pixman.h>
+#include <stdio.h>
+#include <stdlib.h> /* abort() */
+#include <math.h>
+#include <time.h>
+#include "utils.h"
+
+typedef struct color_t color_t;
+typedef struct format_t format_t;
+typedef struct image_t image_t;
+typedef struct operator_t operator_t;
+/* Color with double-precision channels; the colors[] table uses 0.0 .. 1.0. */
+struct color_t
+{
+    double r, g, b, a;
+};
+/* A pixman format code paired with its printable name. */
+struct format_t
+{
+    pixman_format_code_t format;
+    const char *name;
+};
+
+static const color_t colors[] =	/* each entry is { r, g, b, a } */
+{
+    { 1.0, 1.0, 1.0, 1.0 },
+    { 1.0, 1.0, 1.0, 0.0 },
+    { 0.0, 0.0, 0.0, 1.0 },
+    { 0.0, 0.0, 0.0, 0.0 },
+    { 1.0, 0.0, 0.0, 1.0 },
+    { 0.0, 1.0, 0.0, 1.0 },
+    { 0.0, 0.0, 1.0, 1.0 },
+    { 0.5, 0.0, 0.0, 0.5 },
+};
+
+static uint16_t
+_color_double_to_short (double d)
+{
+    /* Scale by 65536, then subtract the carry bit so that an input of
+     * 1.0 yields 0xffff instead of wrapping around in 16 bits.
+     */
+    const uint32_t scaled = (uint32_t) (d * 65536);
+
+    return scaled - (scaled >> 16);
+}
+
+/* Convert a floating point color_t into the 16-bit channels of *out. */
+static void
+compute_pixman_color (const color_t *color, pixman_color_t *out)
+{
+    out->alpha = _color_double_to_short (color->a);
+    out->blue  = _color_double_to_short (color->b);
+    out->green = _color_double_to_short (color->g);
+    out->red   = _color_double_to_short (color->r);
+}
+
+#define REPEAT 0x01000000	/* flag bit OR-ed into a sizes[] entry */
+#define FLAGS  0xff000000	/* presumably the mask of all flag bits; use not visible here */
+
+static const int sizes[] =
+{
+    0,		/* a size of 0 is reported as "solid" by describe_image () */
+    1,
+    1 | REPEAT,
+    10
+};
+
+static const format_t formats[] =	/* format code plus its stringified name */
+{
+#define P(x) { PIXMAN_##x, #x }
+
+    /* 32 bpp formats */
+    P(a8r8g8b8),
+    P(x8r8g8b8),
+    P(a8b8g8r8),
+    P(x8b8g8r8),
+    P(b8g8r8a8),
+    P(b8g8r8x8),
+    P(r8g8b8a8),
+    P(r8g8b8x8),
+    P(x2r10g10b10),
+    P(x2b10g10r10),
+    P(a2r10g10b10),
+    P(a2b10g10r10),
+
+    /* 24 bpp formats */
+    P(r8g8b8),
+    P(b8g8r8),
+
+    /* 16 bpp formats */
+    P(r5g6b5),
+    P(b5g6r5),
+    P(x1r5g5b5),
+    P(x1b5g5r5),
+    P(a1r5g5b5),
+    P(a1b5g5r5),
+    P(a4b4g4r4),
+    P(x4b4g4r4),
+    P(a4r4g4b4),
+    P(x4r4g4b4),
+
+    /* 8 bpp formats */
+    P(a8),
+    P(r3g3b2),
+    P(b2g3r3),
+    P(a2r2g2b2),
+    P(a2b2g2r2),
+    P(x4a4),
+
+    /* 4 bpp formats */
+    P(a4),
+    P(r1g2b1),
+    P(b1g2r1),
+    P(a1r1g1b1),
+    P(a1b1g1r1),
+
+    /* 1 bpp formats */
+    P(a1)
+#undef P
+};
+
+struct image_t	/* a pixman image together with the parameters describing it */
+{
+    pixman_image_t *image;
+    const format_t *format;
+    const color_t *color;
+    pixman_repeat_t repeat;
+    int size;	/* 0 is reported as "solid" by describe_image () */
+};
+/* A pixman operator code paired with its printable name. */
+struct operator_t
+{
+    pixman_op_t op;
+    const char *name;
+};
+
+static const operator_t operators[] =	/* blend modes, which calc_op () aborts on, are absent */
+{
+#define P(x) { PIXMAN_OP_##x, #x }
+    P(CLEAR),
+    P(SRC),
+    P(DST),
+    P(OVER),
+    P(OVER_REVERSE),
+    P(IN),
+    P(IN_REVERSE),
+    P(OUT),
+    P(OUT_REVERSE),
+    P(ATOP),
+    P(ATOP_REVERSE),
+    P(XOR),
+    P(ADD),
+    P(SATURATE),
+
+    P(DISJOINT_CLEAR),
+    P(DISJOINT_SRC),
+    P(DISJOINT_DST),
+    P(DISJOINT_OVER),
+    P(DISJOINT_OVER_REVERSE),
+    P(DISJOINT_IN),
+    P(DISJOINT_IN_REVERSE),
+    P(DISJOINT_OUT),
+    P(DISJOINT_OUT_REVERSE),
+    P(DISJOINT_ATOP),
+    P(DISJOINT_ATOP_REVERSE),
+    P(DISJOINT_XOR),
+
+    P(CONJOINT_CLEAR),
+    P(CONJOINT_SRC),
+    P(CONJOINT_DST),
+    P(CONJOINT_OVER),
+    P(CONJOINT_OVER_REVERSE),
+    P(CONJOINT_IN),
+    P(CONJOINT_IN_REVERSE),
+    P(CONJOINT_OUT),
+    P(CONJOINT_OUT_REVERSE),
+    P(CONJOINT_ATOP),
+    P(CONJOINT_ATOP_REVERSE),
+    P(CONJOINT_XOR),
+#undef P
+};
+
+static double
+calc_op (pixman_op_t op, double src, double dst, double srca, double dsta)
+{
+#define mult_chan(src, dst, Fa, Fb) MIN ((src) * (Fa) + (dst) * (Fb), 1.0)
+    /* Every operator is src * Fa + dst * Fb clamped to 1.0; only Fa, Fb vary. */
+    double Fa, Fb;
+    /* srca and dsta are the source and destination alphas that feed Fa and Fb. */
+    switch (op)
+    {
+    case PIXMAN_OP_CLEAR:
+    case PIXMAN_OP_DISJOINT_CLEAR:
+    case PIXMAN_OP_CONJOINT_CLEAR:
+	return mult_chan (src, dst, 0.0, 0.0);
+
+    case PIXMAN_OP_SRC:
+    case PIXMAN_OP_DISJOINT_SRC:
+    case PIXMAN_OP_CONJOINT_SRC:
+	return mult_chan (src, dst, 1.0, 0.0);
+
+    case PIXMAN_OP_DST:
+    case PIXMAN_OP_DISJOINT_DST:
+    case PIXMAN_OP_CONJOINT_DST:
+	return mult_chan (src, dst, 0.0, 1.0);
+
+    case PIXMAN_OP_OVER:
+	return mult_chan (src, dst, 1.0, 1.0 - srca);
+
+    case PIXMAN_OP_OVER_REVERSE:
+	return mult_chan (src, dst, 1.0 - dsta, 1.0);
+
+    case PIXMAN_OP_IN:
+	return mult_chan (src, dst, dsta, 0.0);
+
+    case PIXMAN_OP_IN_REVERSE:
+	return mult_chan (src, dst, 0.0, srca);
+
+    case PIXMAN_OP_OUT:
+	return mult_chan (src, dst, 1.0 - dsta, 0.0);
+
+    case PIXMAN_OP_OUT_REVERSE:
+	return mult_chan (src, dst, 0.0, 1.0 - srca);
+
+    case PIXMAN_OP_ATOP:
+	return mult_chan (src, dst, dsta, 1.0 - srca);
+
+    case PIXMAN_OP_ATOP_REVERSE:
+	return mult_chan (src, dst, 1.0 - dsta,  srca);
+
+    case PIXMAN_OP_XOR:
+	return mult_chan (src, dst, 1.0 - dsta, 1.0 - srca);
+
+    case PIXMAN_OP_ADD:
+	return mult_chan (src, dst, 1.0, 1.0);
+
+    case PIXMAN_OP_SATURATE:
+    case PIXMAN_OP_DISJOINT_OVER_REVERSE:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 1.0);
+
+    case PIXMAN_OP_DISJOINT_OVER:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 1.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_IN:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_DISJOINT_IN_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_OUT:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_DISJOINT_OUT_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_ATOP:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_DISJOINT_ATOP_REVERSE:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_DISJOINT_XOR:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_OVER:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, 1.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_OVER_REVERSE:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	return mult_chan (src, dst, Fa, 1.0);
+
+    case PIXMAN_OP_CONJOINT_IN:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, dsta / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_CONJOINT_IN_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, srca / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_OUT:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_CONJOINT_OUT_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_ATOP:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_ATOP_REVERSE:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_XOR:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+    /* the blend modes below are not modelled here: reaching them aborts */
+    case PIXMAN_OP_MULTIPLY:
+    case PIXMAN_OP_SCREEN:
+    case PIXMAN_OP_OVERLAY:
+    case PIXMAN_OP_DARKEN:
+    case PIXMAN_OP_LIGHTEN:
+    case PIXMAN_OP_COLOR_DODGE:
+    case PIXMAN_OP_COLOR_BURN:
+    case PIXMAN_OP_HARD_LIGHT:
+    case PIXMAN_OP_SOFT_LIGHT:
+    case PIXMAN_OP_DIFFERENCE:
+    case PIXMAN_OP_EXCLUSION:
+    case PIXMAN_OP_HSL_HUE:
+    case PIXMAN_OP_HSL_SATURATION:
+    case PIXMAN_OP_HSL_COLOR:
+    case PIXMAN_OP_HSL_LUMINOSITY:
+    default:
+	abort();
+	return 0; /* silence MSVC */
+    }
+#undef mult_chan
+}
+
+/* Reference model of a single composite.  The optional mask is folded into
+ * the source first - channel by channel when component_alpha is set, by the
+ * mask's alpha otherwise - and op is then evaluated per channel by calc_op ().
+ */
+static void
+do_composite (pixman_op_t op,
+	      const color_t *src,
+	      const color_t *mask,
+	      const color_t *dst,
+	      color_t *result,
+	      pixman_bool_t component_alpha)
+{
+    color_t weight;	/* factor applied to each source channel */
+    color_t srcval, srcalpha;
+
+    if (mask == NULL)
+    {
+	/* no mask: a factor of exactly 1.0 leaves the source unchanged */
+	weight.r = weight.g = weight.b = weight.a = 1.0;
+    }
+    else if (component_alpha)
+    {
+	/* component alpha: each mask channel acts on its own channel */
+	weight = *mask;
+    }
+    else
+    {
+	/* unified alpha: only the mask's alpha channel is used */
+	weight.r = weight.g = weight.b = weight.a = mask->a;
+    }
+
+    /* masked source color */
+    srcval.r = src->r * weight.r;
+    srcval.g = src->g * weight.g;
+    srcval.b = src->b * weight.b;
+    srcval.a = src->a * weight.a;
+
+    /* masked source alpha, one value per channel */
+    srcalpha.r = src->a * weight.r;
+    srcalpha.g = src->a * weight.g;
+    srcalpha.b = src->a * weight.b;
+    srcalpha.a = src->a * weight.a;
+
+    /* the destination alpha is shared by all four channels */
+    result->r = calc_op (op, srcval.r, dst->r, srcalpha.r, dst->a);
+    result->g = calc_op (op, srcval.g, dst->g, srcalpha.g, dst->a);
+    result->b = calc_op (op, srcval.b, dst->b, srcalpha.b, dst->a);
+    result->a = calc_op (op, srcval.a, dst->a, srcalpha.a, dst->a);
+}
+
+/* Quantize color to what format can store: channels are rounded to the nearest
+ * representable level, no red bits means black, no alpha bits means opaque.
+ */
+static void
+color_correct (pixman_format_code_t format,
+	       color_t *color)
+{
+#define MASK(x) ((1 << (x)) - 1)
+#define round_pix(pix, m)						\
+    ((int)((pix) * (MASK(m)) + .5) / (double) (MASK(m)))
+
+    const int a_bits = PIXMAN_FORMAT_A (format);
+    const int r_bits = PIXMAN_FORMAT_R (format);
+
+    if (r_bits != 0)
+    {
+	color->r = round_pix (color->r, r_bits);
+	color->g = round_pix (color->g, PIXMAN_FORMAT_G (format));
+	color->b = round_pix (color->b, PIXMAN_FORMAT_B (format));
+    }
+    else
+    {
+	color->r = color->g = color->b = 0.0;
+    }
+
+    color->a = (a_bits != 0) ? round_pix (color->a, a_bits) : 1.0;
+#undef round_pix
+#undef MASK
+}
+
+/* Read the first pixel of image and decode it, according to format, into
+ * channels scaled to [0, 1]; alpha is 1.0 if format has no alpha bits and
+ * the color channels are 0.0 if it has no red bits. */
+static void
+get_pixel (pixman_image_t *image,
+	   pixman_format_code_t format,
+	   color_t *color)
+{
+#define MASK(N) ((1UL << (N))-1)
+    unsigned long rs, gs, bs, as;
+    int a, r, g, b;
+    uint32_t val;
+    /* fetch exactly one 32-bit word: a wider load could run past the bits */
+    val = *(uint32_t *) pixman_image_get_data (image);
+#ifdef WORDS_BIGENDIAN
+    val >>= 8 * sizeof(val) - PIXMAN_FORMAT_BPP (format);
+#endif
+    /* Number of bits in each channel */
+    a = PIXMAN_FORMAT_A (format);
+    r = PIXMAN_FORMAT_R (format);
+    g = PIXMAN_FORMAT_G (format);
+    b = PIXMAN_FORMAT_B (format);
+
+    switch (PIXMAN_FORMAT_TYPE (format))
+    {
+    case PIXMAN_TYPE_ARGB:
+        bs = 0;
+        gs = b + bs;
+        rs = g + gs;
+        as = r + rs;
+	break;
+
+    case PIXMAN_TYPE_ABGR:
+        rs = 0;
+        gs = r + rs;
+        bs = g + gs;
+        as = b + bs;
+	break;
+
+    case PIXMAN_TYPE_BGRA:
+        as = 0;
+	rs = PIXMAN_FORMAT_BPP (format) - (b + g + r);
+        gs = r + rs;
+        bs = g + gs;
+	break;
+
+    case PIXMAN_TYPE_RGBA:
+	as = 0;
+	bs = PIXMAN_FORMAT_BPP (format) - (b + g + r);
+	gs = b + bs;
+	rs = g + gs;
+	break;
+
+    case PIXMAN_TYPE_A:
+        as = 0;
+        rs = 0;
+        gs = 0;
+        bs = 0;
+	break;
+
+    case PIXMAN_TYPE_OTHER:
+    case PIXMAN_TYPE_COLOR:
+    case PIXMAN_TYPE_GRAY:
+    case PIXMAN_TYPE_YUY2:
+    case PIXMAN_TYPE_YV12:
+    default:
+	abort ();
+        as = 0;
+        rs = 0;
+        gs = 0;
+        bs = 0;
+	break;
+    }
+
+    if (MASK (a) != 0)
+	color->a = ((val >> as) & MASK (a)) / (double) MASK (a);
+    else
+	color->a = 1.0;
+
+    if (MASK (r) != 0)
+    {
+	color->r = ((val >> rs) & MASK (r)) / (double) MASK (r);
+	color->g = ((val >> gs) & MASK (g)) / (double) MASK (g);
+	color->b = ((val >> bs) & MASK (b)) / (double) MASK (b);
+    }
+    else
+    {
+	color->r = 0.0;
+	color->g = 0.0;
+	color->b = 0.0;
+    }
+#undef MASK
+}
+
+static double
+eval_diff (color_t *expected, color_t *test, pixman_format_code_t format)
+{
+    /* Largest per-channel error, measured in steps of FORMAT's channel depth. */
+    double worst, err;
+
+    worst = fabs (test->r - expected->r) * ((1 << PIXMAN_FORMAT_R (format)) - 1);
+
+    err = fabs (test->b - expected->b) * ((1 << PIXMAN_FORMAT_B (format)) - 1);
+    worst = MAX (worst, err);
+
+    err = fabs (test->g - expected->g) * ((1 << PIXMAN_FORMAT_G (format)) - 1);
+    worst = MAX (worst, err);
+
+    err = fabs (test->a - expected->a) * ((1 << PIXMAN_FORMAT_A (format)) - 1);
+
+    return MAX (worst, err);
+}
+
+static char *
+describe_image (image_t *info, char *buf)
+{
+    const char *repeat_tag;
+
+    /* A zero size marks a solid-fill image, which has no format to report. */
+    if (!info->size)
+    {
+	sprintf (buf, "solid");
+	return buf;
+    }
+
+    repeat_tag = info->repeat ? "R" : "";
+    sprintf (buf, "%s %dx%d%s",
+	     info->format->name, info->size, info->size, repeat_tag);
+    return buf;
+}
+
+/* Composite SRC (optionally through MASK) onto DST with operator OP, then
+ * compare DST's 0,0 pixel against the color do_composite() predicts.
+ * Prints a report and returns FALSE when eval_diff() exceeds 3.0.
+ */
+static pixman_bool_t
+composite_test (image_t *dst,
+		const operator_t *op,
+		image_t *src,
+		image_t *mask,
+		pixman_bool_t component_alpha)
+{
+    pixman_color_t fill;
+    pixman_rectangle16_t rect;
+    color_t expected, result, tdst, tsrc, tmsk;
+    double diff;
+    pixman_bool_t success = TRUE;
+
+    compute_pixman_color (dst->color, &fill);
+    rect.x = rect.y = 0;
+    rect.width = rect.height = dst->size;
+    pixman_image_fill_rectangles (PIXMAN_OP_SRC, dst->image,
+				  &fill, 1, &rect);
+
+    if (mask != NULL)
+    {
+	pixman_image_set_component_alpha (mask->image, component_alpha);
+	pixman_image_composite (op->op, src->image, mask->image, dst->image,
+				0, 0,
+				0, 0,
+				0, 0,
+				dst->size, dst->size);
+
+	tmsk = *mask->color;
+	if (mask->size)
+	{
+	    color_correct (mask->format->format, &tmsk);
+
+	    if (component_alpha &&
+		PIXMAN_FORMAT_R (mask->format->format) == 0)
+	    {
+		/* Ax component-alpha masks expand alpha into
+		 * all color channels.
+		 */
+		tmsk.r = tmsk.g = tmsk.b = tmsk.a;
+	    }
+	}
+    }
+    else
+    {
+	pixman_image_composite (op->op, src->image, NULL, dst->image,
+				0, 0,
+				0, 0,
+				0, 0,
+				dst->size, dst->size);
+    }
+    get_pixel (dst->image, dst->format->format, &result);
+    /* Predict the result from the nominal colors, each rounded to its format. */
+    tdst = *dst->color;
+    color_correct (dst->format->format, &tdst);
+    tsrc = *src->color;
+    if (src->size)
+	color_correct (src->format->format, &tsrc);
+    do_composite (op->op, &tsrc, mask ? &tmsk : NULL, &tdst,
+		  &expected, component_alpha);
+    color_correct (dst->format->format, &expected);
+
+    diff = eval_diff (&expected, &result, dst->format->format);
+
+    /* FIXME: We should find out what deviation is acceptable. 3.0
+     * is clearly absurd for 2 bit formats for example. On the other
+     * hand currently 1.0 does not work.
+     */
+    if (diff > 3.0)
+    {
+	char buf[40];
+
+	sprintf (buf, "%s %scomposite",
+		 op->name,
+		 component_alpha ? "CA " : "");
+	/* The raw pixel is read as one 32-bit word, then widened for %08lx. */
+	printf ("%s test error of %.4f --\n"
+		"           R    G    B    A\n"
+		"got:       %.2f %.2f %.2f %.2f [%08lx]\n"
+		"expected:  %.2f %.2f %.2f %.2f\n",
+		buf, diff,
+		result.r, result.g, result.b, result.a,
+		(unsigned long) *pixman_image_get_data (dst->image),
+		expected.r, expected.g, expected.b, expected.a);
+
+	if (mask != NULL)
+	{
+	    printf ("src color: %.2f %.2f %.2f %.2f\n"
+		    "msk color: %.2f %.2f %.2f %.2f\n"
+		    "dst color: %.2f %.2f %.2f %.2f\n",
+		    src->color->r, src->color->g,
+		    src->color->b, src->color->a,
+		    mask->color->r, mask->color->g,
+		    mask->color->b, mask->color->a,
+		    dst->color->r, dst->color->g,
+		    dst->color->b, dst->color->a);
+	    printf ("src: %s, ", describe_image (src, buf));
+	    printf ("mask: %s, ", describe_image (mask, buf));
+	    printf ("dst: %s\n\n", describe_image (dst, buf));
+	}
+	else
+	{
+	    printf ("src color: %.2f %.2f %.2f %.2f\n"
+		    "dst color: %.2f %.2f %.2f %.2f\n",
+		    src->color->r, src->color->g,
+		    src->color->b, src->color->a,
+		    dst->color->r, dst->color->g,
+		    dst->color->b, dst->color->a);
+	    printf ("src: %s, ", describe_image (src, buf));
+	    printf ("dst: %s\n\n", describe_image (dst, buf));
+	}
+
+	success = FALSE;
+    }
+
+    return success;
+}
+
+static void
+image_init (image_t *info,
+	    int color,
+	    int format,
+	    int size)
+{
+    pixman_color_t fill;
+
+    info->color = &colors[color];
+    compute_pixman_color (info->color, &fill);
+    /* A sizes[] entry packs the edge length together with flag bits. */
+    info->format = &formats[format];
+    info->size = sizes[size] & ~FLAGS;
+    info->repeat = PIXMAN_REPEAT_NONE;
+    /* Size 0 selects a solid fill; anything else a square bits image. */
+    if (info->size)
+    {
+	pixman_rectangle16_t rect;
+
+	info->image = pixman_image_create_bits (info->format->format,
+						info->size, info->size,
+						NULL, 0);
+
+	rect.x = rect.y = 0;
+	rect.width = rect.height = info->size;
+	pixman_image_fill_rectangles (PIXMAN_OP_SRC, info->image, &fill,
+				      1, &rect);
+	/* Test the flag in the table entry; SIZE itself is only an index. */
+	if (sizes[size] & REPEAT)
+	{
+	    pixman_image_set_repeat (info->image, PIXMAN_REPEAT_NORMAL);
+	    info->repeat = PIXMAN_REPEAT_NORMAL;
+	}
+    }
+    else
+    {
+	info->image = pixman_image_create_solid_fill (&fill);
+    }
+}
+
+static void
+image_fini (image_t *info)
+{
+    pixman_image_unref (info->image); /* drop one reference on INFO's image */
+}
+
+static int
+random_size (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (sizes)); /* used as an index into sizes[] */
+}
+
+static int
+random_color (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (colors)); /* used as an index into colors[] */
+}
+
+static int
+random_format (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (formats)); /* used as an index into formats[] */
+}
+
+static pixman_bool_t
+run_test (uint32_t seed)
+{
+    image_t src, mask, dst;
+    const operator_t *op;
+    int color, format, size;
+    int ca;
+    int ok = FALSE;
+
+    /* Everything below is derived from SEED, so a failure can be replayed. */
+    lcg_srand (seed);
+
+    /* Draw each random value in its own statement, in source order: C does
+     * not specify the order in which call arguments are evaluated.
+     */
+    color = random_color ();
+    format = random_format ();
+    image_init (&dst, color, format, 1);
+    color = random_color ();
+    format = random_format ();
+    size = random_size ();
+    image_init (&src, color, format, size);
+    color = random_color ();
+    format = random_format ();
+    size = random_size ();
+    image_init (&mask, color, format, size);
+
+    op = &(operators [lcg_rand_n (ARRAY_LENGTH (operators))]);
+    ca = lcg_rand_n (3);
+    if (ca == 0)		/* no mask */
+	ok = composite_test (&dst, op, &src, NULL, FALSE);
+    else if (ca == 1)	/* unified-alpha mask */
+	ok = composite_test (&dst, op, &src, &mask, FALSE);
+    else if (ca == 2)	/* component alpha, unless the mask is solid */
+	ok = composite_test (&dst, op, &src, &mask, mask.size ? TRUE : FALSE);
+
+    image_fini (&src);
+    image_fini (&mask);
+    image_fini (&dst);
+    return ok;
+}
+
+int
+main (int argc, char **argv)
+{
+#define N_TESTS (8 * 1024 * 1024)
+    int result = 0;
+    uint32_t i, seed;
+
+    if (argc > 1)
+    {
+	char *end;
+	/* strtoul, not strtol: reported test numbers use all 32 bits. */
+	i = strtoul (argv[1], &end, 0);
+
+	if (end != argv[1])
+	{
+	    if (!run_test (i))
+		return 1;
+	    else
+		return 0;
+	}
+	else
+	{
+	    printf ("Usage:\n\n   %s <number>\n\n", argv[0]);
+	    return -1;
+	}
+    }
+
+    if (getenv ("PIXMAN_RANDOMIZE_TESTS"))
+	seed = get_random_seed();
+    else
+	seed = 1;
+
+#ifdef USE_OPENMP
+#   pragma omp parallel for default(none) shared(result, argv, seed)
+#endif
+    for (i = 0; i <= N_TESTS; ++i)
+    {
+	if (!result && !run_test (i + seed))
+	{
+	    printf ("Test 0x%08X failed.\n", seed + i);
+	    /* Flag failure with 1; a raw test number can truncate to status 0. */
+	    result = 1;
+	}
+    }
+
+    return result;
+}
diff --git a/test/fetch-test.c b/test/fetch-test.c
new file mode 100755
index 0000000..9f80eec
--- /dev/null
+++ b/test/fetch-test.c
@@ -0,0 +1,209 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman.h"
+
+#define SIZE 1024
+
+static pixman_indexed_t mono_palette =
+{
+    0, { 0x00000000, 0x00ffffff }, /* palette attached to the PIXMAN_g1 test case */
+};
+
+
+typedef struct {
+    pixman_format_code_t format;	/* format of the source image */
+    int width, height;			/* size of both source and destination */
+    int stride;				/* rowstride given for the source bits */
+    uint32_t src[SIZE];			/* source image bits */
+    uint32_t dst[SIZE];			/* expected a8r8g8b8 pixels after the fetch */
+    pixman_indexed_t *indexed;		/* handed to pixman_image_set_indexed() */
+} testcase_t;
+
+static testcase_t testcases[] =
+{
+    {
+	PIXMAN_a8r8g8b8, /* same layout as the destination: values pass through */
+	2, 2,
+	8,
+	{ 0x00112233, 0x44556677,
+	  0x8899aabb, 0xccddeeff },
+	{ 0x00112233, 0x44556677,
+	  0x8899aabb, 0xccddeeff },
+	NULL,
+    },
+    {
+	PIXMAN_r8g8b8a8, /* low source byte ends up as the high result byte */
+	2, 2,
+	8,
+	{ 0x11223300, 0x55667744,
+	  0x99aabb88, 0xddeeffcc },
+	{ 0x00112233, 0x44556677,
+	  0x8899aabb, 0xccddeeff },
+	NULL,
+    },
+    {
+	PIXMAN_g1, /* looked up through mono_palette */
+	8, 2,
+	4,
+#ifdef WORDS_BIGENDIAN
+	{
+	    0xaa000000,
+	    0x55000000
+	},
+#else
+	{
+	    0x00000055,
+	    0x000000aa
+	},
+#endif
+	{
+	    0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000,
+	    0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff
+	},
+	&mono_palette,
+    },
+#if 0 /* the PIXMAN_g8 case below is compiled out */
+    {
+	PIXMAN_g8,
+	4, 2,
+	4,
+	{ 0x01234567,
+	  0x89abcdef },
+	{ 0x00010101, 0x00232323, 0x00454545, 0x00676767,
+	  0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, },
+    },
+#endif
+    /* FIXME: make this work on big endian */
+    {
+	PIXMAN_yv12,
+	8, 2,
+	8,
+#ifdef WORDS_BIGENDIAN
+	{
+	    0x00ff00ff, 0x00ff00ff,
+	    0xff00ff00, 0xff00ff00,
+	    0x80ff8000,
+	    0x800080ff
+	},
+#else
+	{
+	    0xff00ff00, 0xff00ff00,
+	    0x00ff00ff, 0x00ff00ff,
+	    0x0080ff80,
+	    0xff800080
+	},
+#endif
+	{
+	    0xff000000, 0xffffffff, 0xffb80000, 0xffffe113,
+	    0xff000000, 0xffffffff, 0xff0023ee, 0xff4affff,
+	    0xffffffff, 0xff000000, 0xffffe113, 0xffb80000,
+	    0xffffffff, 0xff000000, 0xff4affff, 0xff0023ee,
+	},
+    }, /* no initializer for indexed: it is implicitly NULL */
+};
+
+int n_test_cases = sizeof(testcases)/sizeof(testcases[0]); /* number of entries in testcases[] */
+
+
+static uint32_t
+reader (const void *src, int size)
+{
+    /* Read accessor: fetch SIZE bytes (1, 2 or 4) from SRC, zero-extended. */
+    if (size == 1)
+	return *(uint8_t *)src;
+
+    if (size == 2)
+	return *(uint16_t *)src;
+
+    if (size == 4)
+	return *(uint32_t *)src;
+
+    assert(0);	/* any other width is a caller bug */
+    return 0;	/* silence MSVC */
+}
+
+
+static void
+writer (void *src, uint32_t value, int size)
+{
+    /* Write accessor handed to pixman_image_set_accessors().
+     *
+     * Stores the low SIZE bytes of VALUE at SRC; SIZE must be 1, 2 or 4.
+     * Despite its name, SRC is the location being written.
+     */
+
+    if (size == 1)
+	*(uint8_t *)src = value;
+    else if (size == 2)
+	*(uint16_t *)src = value;
+    else if (size == 4)
+	*(uint32_t *)src = value;
+    else
+	assert(0);	/* any other width is a caller bug */
+}
+
+
+int
+main (int argc, char **argv)
+{
+    uint32_t dst[SIZE];
+    int ret = 0;
+    int i, j;
+
+    for (i = 0; i < n_test_cases; ++i)
+    {
+	testcase_t *tc = &testcases[i];
+	/* Pass 0 ('a') uses direct memory access; pass 1 ('b') installs the
+	 * reader()/writer() accessors on both images.
+	 */
+	for (j = 0; j < 2; ++j)
+	{
+	    pixman_image_t *src_img, *dst_img;
+	    int x, y;
+
+	    src_img = pixman_image_create_bits (tc->format,
+						tc->width, tc->height,
+						tc->src, tc->stride);
+	    pixman_image_set_indexed(src_img, tc->indexed);
+
+	    dst_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+						tc->width, tc->height,
+						dst, tc->width*4);
+
+	    if (j != 0)
+	    {
+		pixman_image_set_accessors (src_img, reader, writer);
+		pixman_image_set_accessors (dst_img, reader, writer);
+	    }
+
+	    /* Convert the whole source into the a8r8g8b8 buffer DST. */
+	    pixman_image_composite (PIXMAN_OP_SRC, src_img, NULL, dst_img,
+				    0, 0, 0, 0, 0, 0, tc->width, tc->height);
+
+	    pixman_image_unref (src_img);
+	    pixman_image_unref (dst_img);
+
+	    /* Compare with the expected pixels, reporting every mismatch. */
+	    for (y = 0; y < tc->height; ++y)
+	    {
+		for (x = 0; x < tc->width; ++x)
+		{
+		    int offset = y * tc->width + x;
+
+		    if (dst[offset] == tc->dst[offset])
+			continue;
+		    printf ("test %i%c: pixel mismatch at (x=%d,y=%d): %08x expected, %08x obtained\n",
+			    i + 1, 'a' + j, x, y, tc->dst[offset], dst[offset]);
+		    ret = 1;
+		}
+	    }
+	}
+    }
+
+    return ret;
+}
diff --git a/test/fuzzer-find-diff.pl b/test/fuzzer-find-diff.pl
new file mode 100644
index 0000000..53d9b8d
--- /dev/null
+++ b/test/fuzzer-find-diff.pl
@@ -0,0 +1,68 @@
+#!/usr/bin/env perl
+
+$usage = "Usage:
+  fuzzer-find-diff.pl reference_binary new_binary [number_of_tests_to_run]
+
+The first two input arguments are the commands to run the test programs
+based on fuzzer_test_main() function from 'util.c' (preferably they should
+be statically compiled, this can be achieved via '--disable-shared' pixman
+configure option). The third optional argument is the number of test rounds
+to run (if not specified, then testing runs infinitely or until some problem
+is detected).
+
+Usage examples:
+  fuzzer-find-diff.pl ./blitters-test-with-sse-disabled ./blitters-test 9000000
+  fuzzer-find-diff.pl ./blitters-test \"ssh ppc64_host /path/to/blitters-test\"
+";
+
+$#ARGV >= 1 or die $usage;   # need at least the two commands to compare
+
+$batch_size = 10000;   # number of tests handed to each test_range() call
+
+if ($#ARGV >= 2) {
+    $number_of_tests = int($ARGV[2]);
+} else {
+    $number_of_tests = -1   # non-positive: the main loop runs without a limit
+}
+
+sub test_range {
+    # Find a test in [$min, $max] on which the two binaries disagree.
+    my ($min, $max) = @_;
+    return if `$ARGV[0] $min $max 2>/dev/null` eq `$ARGV[1] $min $max 2>/dev/null`;
+
+    # The bisection below only ever pins the failure on $max, so it would
+    # report $min + 1 when test $min itself is the bad one.  Check it first.
+    return $min if `$ARGV[0] $min 2>/dev/null` ne `$ARGV[1] $min 2>/dev/null`;
+    while ($max != $min + 1) {
+        my $avg = int(($min + $max) / 2);
+        my $res1 = `$ARGV[0] $min $avg 2>/dev/null`;
+        my $res2 = `$ARGV[1] $min $avg 2>/dev/null`;
+        if ($res1 ne $res2) {
+            $max = $avg;
+        } else {
+            $min = $avg;
+        }
+    }
+    return $max;
+}
+
+$base = 1;
+while ($number_of_tests <= 0 || $base <= $number_of_tests) {
+    printf("testing %-12d\r", $base + $batch_size - 1);
+    my $res = test_range($base, $base + $batch_size - 1);
+    if ($res) {
+        printf("Failure: results are different for test %d:\n", $res);
+
+        printf("\n-- ref --\n");
+        print `$ARGV[0] $res`;
+        printf("-- new --\n");
+        print `$ARGV[1] $res`;
+        # The command is passed as data below: it may itself contain '%'.
+        printf("The problematic conditions can be reproduced by running:\n");
+        printf("%s %d\n", $ARGV[1], $res);
+
+        exit(1);
+    }
+    $base += $batch_size;
+}
+printf("Success: %d tests finished\n", $base - 1);
diff --git a/test/gradient-crash-test.c b/test/gradient-crash-test.c
new file mode 100644
index 0000000..c85712d
--- /dev/null
+++ b/test/gradient-crash-test.c
@@ -0,0 +1,158 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 200
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *src_img;
+    pixman_image_t *dest_img;
+    int i, j, k, p;
+
+    typedef struct
+    {
+	pixman_point_fixed_t p0;
+	pixman_point_fixed_t p1;
+    } point_pair_t;
+    pixman_gradient_stop_t onestop[1] =
+	{
+	    { pixman_int_to_fixed (1), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+	};
+
+    pixman_gradient_stop_t subsetstops[2] =
+	{
+	    { pixman_int_to_fixed (1), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+	    { pixman_int_to_fixed (1), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+	};
+
+    pixman_gradient_stop_t stops01[2] =
+	{
+	    { pixman_int_to_fixed (0), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+	    { pixman_int_to_fixed (1), { 0xffff, 0x1111, 0x1111, 0x1111 } }
+	};
+
+    point_pair_t point_pairs [] =
+	{ { { pixman_double_to_fixed (0), 0 },
+	    { pixman_double_to_fixed (WIDTH / 8.), pixman_int_to_fixed (0) } },
+	  { { pixman_double_to_fixed (WIDTH / 2.0), pixman_double_to_fixed (HEIGHT / 2.0) },
+	    { pixman_double_to_fixed (WIDTH / 2.0), pixman_double_to_fixed (HEIGHT / 2.0) } }
+	};
+
+    pixman_transform_t transformations[] = {
+	{
+	    { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
+	      { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) }
+	    }
+	},
+	{
+	    { { pixman_double_to_fixed (1), pixman_double_to_fixed (0), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) }
+	    }
+	},
+	{
+	    { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (2), pixman_double_to_fixed (1.000), pixman_double_to_fixed (1.0) }
+	    }
+	},
+	{
+	    { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (0) }
+	    }
+	},
+	{
+	    { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (2), pixman_double_to_fixed (-1), pixman_double_to_fixed (0) }
+	    }
+	},
+	{
+	    { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (3), },
+	      { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (2), pixman_double_to_fixed (-1), pixman_double_to_fixed (0) }
+	    }
+	},
+    };
+
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+
+    /* The buffer is filled right below, so bail out if malloc failed. */
+    if (dest == NULL)
+	return 1;
+
+    enable_fp_exceptions();
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	dest[i] = 0x4f00004f; /* pale blue */
+
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT,
+					 dest,
+					 WIDTH * 4);
+
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (50.0);
+    /* Every stop list (i) x gradient type (j) x point pair (p) x transform (k) */
+    for (i = 0; i < 3; ++i)
+    {
+	pixman_gradient_stop_t *stops;
+        int num_stops;
+
+	if (i == 0)
+	{
+	    stops = onestop;
+	    num_stops = sizeof(onestop) / sizeof(onestop[0]);
+	}
+	else if (i == 1)
+	{
+	    stops = subsetstops;
+	    num_stops = sizeof(subsetstops) / sizeof(subsetstops[0]);
+	}
+	else
+	{
+	    stops = stops01;
+	    num_stops = sizeof(stops01) / sizeof(stops01[0]);
+	}
+
+	for (j = 0; j < 3; ++j)
+	{
+	    for (p = 0; p < ARRAY_LENGTH (point_pairs); ++p)
+	    {
+		point_pair_t *pair = &(point_pairs[p]);
+
+		if (j == 0)
+		    src_img = pixman_image_create_conical_gradient (&(pair->p0), r_inner,
+								    stops, num_stops);
+		else if (j == 1)
+		    src_img = pixman_image_create_radial_gradient  (&(pair->p0), &(pair->p1),
+								    r_inner, r_outer,
+								    stops, num_stops);
+		else
+		    src_img = pixman_image_create_linear_gradient  (&(pair->p0), &(pair->p1),
+								    stops, num_stops);
+
+		for (k = 0; k < ARRAY_LENGTH (transformations); ++k)
+		{
+		    pixman_image_set_transform (src_img, &transformations[k]);
+		    pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NONE);
+		    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img,
+					    0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+		}
+
+		pixman_image_unref (src_img);
+	    }
+	}
+    }
+
+    pixman_image_unref (dest_img);
+    free (dest);
+
+    return 0;
+}
diff --git a/test/lowlevel-blt-bench.c b/test/lowlevel-blt-bench.c
new file mode 100644
index 0000000..bdafb35
--- /dev/null
+++ b/test/lowlevel-blt-bench.c
@@ -0,0 +1,727 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ * Copyright © 2010 Movial Creative Technologies Oy
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define PIXMAN_USE_INTERNAL_API
+#include <pixman.h>
+
+#include "utils.h"
+
+#define SOLID_FLAG 1	/* image is a 1x1 repeating image instead of a full-size one */
+#define CA_FLAG    2	/* mask gets component alpha enabled */
+
+#define L1CACHE_SIZE (8 * 1024)	/* sizes the width of the L1 test */
+#define L2CACHE_SIZE (128 * 1024)	/* sizes the line count of the L2 test */
+
+#define WIDTH  1920	/* dimensions of the full-size test images */
+#define HEIGHT 1080
+#define BUFSIZE (WIDTH * HEIGHT * 4)	/* byte count used for the buffer memcpy()s */
+#define XWIDTH 256	/* dimensions of the smaller x*_img images */
+#define XHEIGHT 256
+#define TILEWIDTH 32	/* HT/VT/R tiles are 1 .. 2 * TILEWIDTH pixels per side */
+#define TINYWIDTH 8	/* RT tiles are 1 .. 2 * TINYWIDTH pixels per side */
+
+#define EXCLUDE_OVERHEAD 1	/* nonzero: time an "empty" composite pass and subtract it */
+
+uint32_t *dst;	/* pixel buffers backing the destination, source and mask images */
+uint32_t *src;
+uint32_t *mask;
+
+double bandwidth = 0;	/* reference the M result percentage is computed against */
+
+double
+bench_memcpy ()
+{
+    const int64_t work_per_round = 4 * (BUFSIZE - 64);
+    int64_t       remaining, total = 0;
+    double        start, now;
+    int           shift = 0;
+
+    /* Calibrate: copy back and forth until gettime() has advanced by 0.5. */
+    start = gettime ();
+    do
+    {
+	memcpy (dst, src, BUFSIZE - 64);
+	memcpy (src, dst, BUFSIZE - 64);
+	total += work_per_round;
+	now = gettime ();
+    } while (!(now - start > 0.5));
+
+    /* Measure: five times that amount, with a cycling source offset. */
+    total *= 5;
+    start = gettime ();
+    for (remaining = total; remaining > 0; remaining -= work_per_round)
+    {
+	shift = (shift + 1) % 64;
+	memcpy ((char *)dst + 1, (char *)src + shift, BUFSIZE - 64);
+	memcpy ((char *)src + 1, (char *)dst + shift, BUFSIZE - 64);
+    }
+    now = gettime ();
+    return (double)total / (now - start);
+}
+
+static void
+pixman_image_composite_wrapper (pixman_implementation_t *impl, /* unused */
+				pixman_composite_info_t *info)
+{ /* gives pixman_image_composite() the pixman_composite_func_t signature */
+    pixman_image_composite (info->op,
+			    info->src_image, info->mask_image, info->dest_image,
+			    info->src_x, info->src_y,
+			    info->mask_x, info->mask_y,
+			    info->dest_x, info->dest_y,
+			    info->width, info->height);
+}
+
+static void
+pixman_image_composite_empty (pixman_implementation_t *impl, /* unused */
+			      pixman_composite_info_t *info)
+{ /* ignores the requested area and composites a fixed 1x1 pixel at 0,0 */
+    pixman_image_composite (info->op,
+			    info->src_image, info->mask_image, info->dest_image,
+			    0, 0, 0, 0, 0, 0, 1, 1);
+}
+
+static inline void
+call_func (pixman_composite_func_t func,
+	   pixman_op_t             op,
+	   pixman_image_t *        src_image,
+	   pixman_image_t *        mask_image,
+	   pixman_image_t *        dest_image,
+	   int32_t		   src_x,
+	   int32_t		   src_y,
+	   int32_t                 mask_x,
+	   int32_t                 mask_y,
+	   int32_t                 dest_x,
+	   int32_t                 dest_y,
+	   int32_t                 width,
+	   int32_t                 height)
+{
+    pixman_composite_info_t info;
+
+    info.src_image = src_image;
+    info.src_x = src_x;
+    info.src_y = src_y;
+    info.mask_image = mask_image;
+    info.mask_x = mask_x;
+    info.mask_y = mask_y;
+    info.dest_image = dest_image;
+    info.dest_x = dest_x;
+    info.dest_y = dest_y;
+    info.width = width;
+    info.height = height;
+    info.op = op;
+    /* Hand FUNC the packed arguments; no implementation object is passed. */
+    func (0, &info);
+}
+
+void
+noinline
+bench_L  (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func,
+          int                      width,
+          int                      lines_count)
+{
+    volatile int sink;
+    int64_t      iter, word;
+    int          offset = 0;
+    int          sum = 0;
+
+    for (iter = 0; iter < n; iter++)
+    {
+	/* touch both buffers first so the call below finds them in cache */
+	for (word = 0; word < width + 64; word += 16)
+	{
+	    sum += dst[word];
+	    sum += src[word];
+	}
+	offset = (offset + 1) % 64;
+	call_func (func, op, src_img, mask_img, dst_img, offset, 0, offset, 0, 63 - offset, 0, width, lines_count);
+    }
+    sink = sum; /* volatile store; presumably keeps the reads above alive */
+}
+
+void
+noinline
+bench_M (pixman_op_t              op,
+         pixman_image_t *         src_img,
+         pixman_image_t *         mask_img,
+         pixman_image_t *         dst_img,
+         int64_t                  n,
+         pixman_composite_func_t  func)
+{
+    int64_t remaining;
+    int     offset = 0;
+
+    /* Near full-size composites; the source/mask x offset cycles over 0..63. */
+    for (remaining = n; remaining > 0; remaining--)
+    {
+	offset = (offset + 1) % 64;
+	call_func (func, op, src_img, mask_img, dst_img, offset, 0, offset, 0, 1, 0, WIDTH - 64, HEIGHT);
+    }
+}
+
+double
+noinline
+bench_HT (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func)
+{
+    const int max_tile = TILEWIDTH * 2;
+    double    total_pixels = 0;
+    int       col = 0, row = 0;
+    int64_t   iter;
+
+    srand (0); /* fixed seed: the same tile sequence on every call */
+    for (iter = 0; iter < n; iter++)
+    {
+	int w = (rand () % max_tile) + 1;
+	int h = (rand () % max_tile) + 1;
+
+	/* Lay tiles out left to right; wrap to a lower band, then to the top. */
+	if (col + w > WIDTH)
+	{
+	    col = 0;
+	    row += max_tile;
+	}
+	if (row + h > HEIGHT)
+	    row = 0;
+	call_func (func, op, src_img, mask_img, dst_img, col, row, col, row, col, row, w, h);
+	col += w;
+	total_pixels += w * h;
+    }
+    return total_pixels;
+}
+
+double
+noinline
+bench_VT (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func)
+{
+    const int max_tile = TILEWIDTH * 2;
+    double    total_pixels = 0;
+    int       col = 0, row = 0;
+    int64_t   iter;
+
+    srand (0); /* fixed seed: the same tile sequence on every call */
+    for (iter = 0; iter < n; iter++)
+    {
+	int w = (rand () % max_tile) + 1;
+	int h = (rand () % max_tile) + 1;
+
+	/* Stack tiles top to bottom; wrap to the next band, then to the left. */
+	if (row + h > HEIGHT)
+	{
+	    row = 0;
+	    col += max_tile;
+	}
+	if (col + w > WIDTH)
+	    col = 0;
+	call_func (func, op, src_img, mask_img, dst_img, col, row, col, row, col, row, w, h);
+	row += h;
+	total_pixels += w * h;
+    }
+    return total_pixels;
+}
+
+double
+noinline
+bench_R (pixman_op_t              op,
+         pixman_image_t *         src_img,
+         pixman_image_t *         mask_img,
+         pixman_image_t *         dst_img,
+         int64_t                  n,
+         pixman_composite_func_t  func,
+         int                      maxw,
+         int                      maxh)
+{
+    double  total_pixels = 0;
+    int64_t remaining;
+
+    if (!(maxw > TILEWIDTH * 2 && maxh > TILEWIDTH * 2))
+    {
+	printf("error: maxw <= TILEWIDTH * 2 || maxh <= TILEWIDTH * 2\n");
+	return 0; /* the random origins below need a positive modulus */
+    }
+
+    srand (0); /* fixed seed: the same rectangles on every call */
+    for (remaining = n; remaining > 0; remaining--)
+    {
+	int w = (rand () % (TILEWIDTH * 2)) + 1;
+	int h = (rand () % (TILEWIDTH * 2)) + 1;
+	int sx = rand () % (maxw - TILEWIDTH * 2);
+	int sy = rand () % (maxh - TILEWIDTH * 2);
+	int dx = rand () % (maxw - TILEWIDTH * 2);
+	int dy = rand () % (maxh - TILEWIDTH * 2);
+	call_func (func, op, src_img, mask_img, dst_img, sx, sy, sx, sy, dx, dy, w, h);
+	total_pixels += w * h;
+    }
+    return total_pixels;
+}
+
+double
+noinline
+bench_RT (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func,
+          int                      maxw,
+          int                      maxh)
+{
+    double  total_pixels = 0;
+    int64_t remaining;
+
+    if (!(maxw > TINYWIDTH * 2 && maxh > TINYWIDTH * 2))
+    {
+	printf("error: maxw <= TINYWIDTH * 2 || maxh <= TINYWIDTH * 2\n");
+	return 0; /* the random origins below need a positive modulus */
+    }
+
+    srand (0); /* fixed seed: the same rectangles on every call */
+    for (remaining = n; remaining > 0; remaining--)
+    {
+	int w = (rand () % (TINYWIDTH * 2)) + 1;
+	int h = (rand () % (TINYWIDTH * 2)) + 1;
+	int sx = rand () % (maxw - TINYWIDTH * 2);
+	int sy = rand () % (maxh - TINYWIDTH * 2);
+	int dx = rand () % (maxw - TINYWIDTH * 2);
+	int dy = rand () % (maxh - TINYWIDTH * 2);
+	call_func (func, op, src_img, mask_img, dst_img, sx, sy, sx, sy, dx, dy, w, h);
+	total_pixels += w * h;
+    }
+    return total_pixels;
+}
+
+void
+bench_composite (char * testname,
+                 int    src_fmt,
+                 int    src_flags,
+                 int    op,
+                 int    mask_fmt,
+                 int    mask_flags,
+                 int    dst_fmt,
+                 double npix)
+{
+    pixman_image_t *                src_img;
+    pixman_image_t *                dst_img;
+    pixman_image_t *                mask_img;
+    pixman_image_t *                xsrc_img;
+    pixman_image_t *                xdst_img;
+    pixman_image_t *                xmask_img;
+    double                          t1, t2, t3, pix_cnt;
+    int64_t                         n, l1test_width, nlines;
+    double                             bytes_per_pix = 0;
+
+    pixman_composite_func_t func = pixman_image_composite_wrapper;
+
+    if (!(src_flags & SOLID_FLAG))
+    {
+        bytes_per_pix += (src_fmt >> 24) / 8.0;
+        src_img = pixman_image_create_bits (src_fmt,
+                                            WIDTH, HEIGHT,
+                                            src,
+                                            WIDTH * 4);
+        xsrc_img = pixman_image_create_bits (src_fmt,
+                                             XWIDTH, XHEIGHT,
+                                             src,
+                                             XWIDTH * 4);
+    }
+    else
+    {
+        src_img = pixman_image_create_bits (src_fmt,
+                                            1, 1,
+                                            src,
+                                            4);
+        xsrc_img = pixman_image_create_bits (src_fmt,
+                                             1, 1,
+                                             src,
+                                             4);
+        pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+        pixman_image_set_repeat (xsrc_img, PIXMAN_REPEAT_NORMAL);
+    }
+
+    bytes_per_pix += (dst_fmt >> 24) / 8.0;
+    dst_img = pixman_image_create_bits (dst_fmt,
+                                        WIDTH, HEIGHT,
+                                        dst,
+                                        WIDTH * 4);
+
+    mask_img = NULL;
+    xmask_img = NULL;
+    if (!(mask_flags & SOLID_FLAG) && mask_fmt != PIXMAN_null)
+    {
+        bytes_per_pix += (mask_fmt >> 24) / ((op == PIXMAN_OP_SRC) ? 8.0 : 4.0);
+        mask_img = pixman_image_create_bits (mask_fmt,
+                                             WIDTH, HEIGHT,
+                                             mask,
+                                             WIDTH * 4);
+        xmask_img = pixman_image_create_bits (mask_fmt,
+                                             XWIDTH, XHEIGHT,
+                                             mask,
+                                             XWIDTH * 4);
+    }
+    else if (mask_fmt != PIXMAN_null)
+    {
+        mask_img = pixman_image_create_bits (mask_fmt,
+                                             1, 1,
+                                             mask,
+                                             4);
+        xmask_img = pixman_image_create_bits (mask_fmt,
+                                             1, 1,
+                                             mask,
+                                             4 * 4);
+       pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
+       pixman_image_set_repeat (xmask_img, PIXMAN_REPEAT_NORMAL);
+    }
+    if ((mask_flags & CA_FLAG) && mask_fmt != PIXMAN_null)
+    {
+       pixman_image_set_component_alpha (mask_img, 1);
+    }
+    xdst_img = pixman_image_create_bits (dst_fmt,
+                                         XWIDTH, XHEIGHT,
+                                         dst,
+                                         XWIDTH * 4);
+
+
+    printf ("%24s %c", testname, func != pixman_image_composite_wrapper ?
+            '-' : '=');
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    l1test_width = L1CACHE_SIZE / 8 - 64;
+    if (l1test_width < 1)
+	l1test_width = 1;
+    if (l1test_width > WIDTH - 64)
+	l1test_width = WIDTH - 64;
+    n = 1 + npix / (l1test_width * 8);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, 1);
+#endif
+    t2 = gettime ();
+    bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, 1);
+    t3 = gettime ();
+    printf ("  L1:%7.2f", (double)n * l1test_width * 1 /
+            ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    nlines = (L2CACHE_SIZE / l1test_width) /
+	((PIXMAN_FORMAT_BPP(src_fmt) + PIXMAN_FORMAT_BPP(dst_fmt)) / 8);
+    if (nlines < 1)
+	nlines = 1;
+    n = 1 + npix / (l1test_width * nlines);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, nlines);
+#endif
+    t2 = gettime ();
+    bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, nlines);
+    t3 = gettime ();
+    printf ("  L2:%7.2f", (double)n * l1test_width * nlines /
+            ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    n = 1 + npix / (WIDTH * HEIGHT);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    bench_M (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+#endif
+    t2 = gettime ();
+    bench_M (op, src_img, mask_img, dst_img, n, func);
+    t3 = gettime ();
+    printf ("  M:%6.2f (%6.2f%%)",
+        ((double)n * (WIDTH - 64) * HEIGHT / ((t3 - t2) - (t2 - t1))) / 1000000.,
+        ((double)n * (WIDTH - 64) * HEIGHT / ((t3 - t2) - (t2 - t1)) * bytes_per_pix) * (100.0 / bandwidth) );
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    n = 1 + npix / (8 * TILEWIDTH * TILEWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_HT (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_HT (op, src_img, mask_img, dst_img, n, func);
+    t3 = gettime ();
+    printf ("  HT:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    n = 1 + npix / (8 * TILEWIDTH * TILEWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_VT (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_VT (op, src_img, mask_img, dst_img, n, func);
+    t3 = gettime ();
+    printf ("  VT:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    n = 1 + npix / (8 * TILEWIDTH * TILEWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_R (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, WIDTH, HEIGHT);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_R (op, src_img, mask_img, dst_img, n, func, WIDTH, HEIGHT);
+    t3 = gettime ();
+    printf ("  R:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    n = 1 + npix / (16 * TINYWIDTH * TINYWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_RT (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, WIDTH, HEIGHT);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_RT (op, src_img, mask_img, dst_img, n, func, WIDTH, HEIGHT);
+    t3 = gettime ();
+    printf ("  RT:%6.2f (%4.0fKops/s)\n", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000., (double) n / ((t3 - t2) * 1000));
+
+    if (mask_img) {
+	pixman_image_unref (mask_img);
+	pixman_image_unref (xmask_img);
+    }
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+    pixman_image_unref (xsrc_img);
+    pixman_image_unref (xdst_img);
+}
+
+#define PIXMAN_OP_OUT_REV (PIXMAN_OP_OUT_REVERSE)
+
+struct
+{
+    char *testname;    /* label printed by bench_composite and matched against the command-line pattern */
+    int   src_fmt;     /* source pixman format code */
+    int   src_flags;   /* source flags handed to bench_composite; presumably SOLID_FLAG for the "_n_" rows - confirm */
+    int   op;          /* PIXMAN_OP_* operator to benchmark */
+    int   mask_fmt;    /* mask format code, PIXMAN_null when the test has no mask */
+    int   mask_flags;  /* tested against SOLID_FLAG and CA_FLAG in bench_composite */
+    int   dst_fmt;     /* destination pixman format code */
+}
+tests_tbl[] =
+{
+    { "add_8_8_8",             PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
+    { "add_n_8_8",             PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
+    { "add_n_8_8888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "add_n_8_x888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "add_n_8_0565",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "add_n_8_1555",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "add_n_8_4444",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
+    { "add_n_8_2222",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
+    { "add_n_8_2x10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
+    { "add_n_8_2a10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
+    { "add_n_8",               PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "add_n_8888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "add_n_x888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "add_n_0565",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "add_n_1555",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "add_n_4444",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "add_n_2222",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "add_n_2x10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "add_n_2a10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "add_8_8",               PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "add_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "add_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "add_8888_0565",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "add_8888_1555",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "add_8888_4444",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "add_8888_2222",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "add_0565_0565",         PIXMAN_r5g6b5,      0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "add_1555_1555",         PIXMAN_a1r5g5b5,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "add_0565_2x10",         PIXMAN_r5g6b5,      0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "add_2a10_2a10",         PIXMAN_a2r10g10b10, 0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "src_n_2222",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "src_n_0565",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_n_1555",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "src_n_4444",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "src_n_x888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_n_8888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_n_2x10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "src_n_2a10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "src_8888_0565",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_8888_4444",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "src_8888_2222",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "src_8888_2x10",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "src_8888_2a10",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "src_0888_0565",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_0888_8888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_0888_x888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_x888_8888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_0565_0565",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_1555_0565",         PIXMAN_a1r5g5b5,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_0565_1555",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "src_n_8_0565",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_n_8_1555",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "src_n_8_4444",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
+    { "src_n_8_2222",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
+    { "src_n_8_x888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "src_n_8_8888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "src_n_8_2x10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
+    { "src_n_8_2a10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
+    { "src_8888_8_0565",       PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_0888_8_0565",       PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_0888_8_8888",       PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "src_0888_8_x888",       PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "src_x888_8_x888",       PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "src_x888_8_8888",       PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "src_0565_8_0565",       PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_1555_8_0565",       PIXMAN_a1r5g5b5,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_0565_8_1555",       PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "over_n_x888",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "over_n_8888",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "over_n_0565",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "over_n_1555",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "over_8888_0565",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "over_8888_x888",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "over_x888_8_0565",      PIXMAN_x8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "over_x888_8_8888",      PIXMAN_x8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "over_n_8_0565",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "over_n_8_1555",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "over_n_8_4444",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
+    { "over_n_8_2222",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
+    { "over_n_8_x888",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "over_n_8_8888",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "over_n_8_2x10",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
+    { "over_n_8_2a10",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
+    { "over_n_8888_8888_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a8r8g8b8 },
+    { "over_n_8888_x888_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_x8r8g8b8 },
+    { "over_n_8888_0565_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_r5g6b5 },
+    { "over_n_8888_1555_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a1r5g5b5 },
+    { "over_n_8888_4444_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a4r4g4b4 },
+    { "over_n_8888_2222_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a2r2g2b2 },
+    { "over_n_8888_2x10_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_x2r10g10b10 },
+    { "over_n_8888_2a10_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a2r10g10b10 },
+    { "over_8888_n_8888",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_a8r8g8b8 },
+    { "over_8888_n_x888",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_x8r8g8b8 },
+    { "over_8888_n_0565",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_r5g6b5 },
+    { "over_8888_n_1555",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_a1r5g5b5 },
+    { "outrev_n_8_0565",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "outrev_n_8_1555",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "outrev_n_8_x888",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "outrev_n_8_8888",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "outrev_n_8888_0565_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_r5g6b5 },
+    { "outrev_n_8888_1555_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a1r5g5b5 },
+    { "outrev_n_8888_x888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_x8r8g8b8 },
+    { "outrev_n_8888_8888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a8r8g8b8 },
+};
+
+/* Print the legend, measure the reference memcpy bandwidth, then benchmark every
+ * tests_tbl entry whose name contains argv[1] (every entry without an argument or for "all"). */
+int
+main (int argc, char *argv[])
+{
+    double x;
+    size_t i;
+    char *pattern = argc > 1 ? argv[1] : "all";
+
+    src = aligned_malloc (4096, BUFSIZE * 3);   /* carved into src, dst and mask below */
+    if (!src)
+    {
+	fprintf (stderr, "failed to allocate the benchmark buffers\n");
+	return 1;
+    }
+    memset (src, 0xCC, BUFSIZE * 3);
+    dst = src + (BUFSIZE / 4);
+    mask = dst + (BUFSIZE / 4);
+
+    printf ("Benchmark for a set of most commonly used functions\n");
+    printf ("---\n");
+    printf ("All results are presented in millions of pixels per second\n");
+    printf ("L1  - small Xx1 rectangle (fitting L1 cache), always blitted at the same\n");
+    printf ("      memory location with small drift in horizontal direction\n");
+    printf ("L2  - small XxY rectangle (fitting L2 cache), always blitted at the same\n");
+    printf ("      memory location with small drift in horizontal direction\n");
+    printf ("M   - large %dx%d rectangle, always blitted at the same\n", WIDTH - 64, HEIGHT);
+    printf ("      memory location with small drift in horizontal direction\n");
+    printf ("HT  - random rectangles with %dx%d average size are copied from\n",
+            TILEWIDTH, TILEWIDTH);
+    printf ("      one %dx%d buffer to another, traversing from left to right\n",
+            WIDTH, HEIGHT);
+    printf ("      and from top to bottom\n");
+    printf ("VT  - random rectangles with %dx%d average size are copied from\n",
+            TILEWIDTH, TILEWIDTH);
+    printf ("      one %dx%d buffer to another, traversing from top to bottom\n",
+            WIDTH, HEIGHT);
+    printf ("      and from left to right\n");
+    printf ("R   - random rectangles with %dx%d average size are copied from\n",
+            TILEWIDTH, TILEWIDTH);
+    printf ("      random locations of one %dx%d buffer to another\n", WIDTH, HEIGHT);
+    printf ("RT  - as R, but %dx%d average sized rectangles are copied\n", TINYWIDTH, TINYWIDTH);
+    printf ("---\n");
+    bandwidth = x = bench_memcpy ();
+    printf ("reference memcpy speed = %.1fMB/s (%.1fMP/s for 32bpp fills)\n", x / 1000000., x / 4000000);
+    printf ("---\n");
+
+    for (i = 0; i < sizeof(tests_tbl) / sizeof(tests_tbl[0]); i++)
+    {
+	if (strcmp (pattern, "all") == 0 || strstr (tests_tbl[i].testname, pattern))
+	{
+	    bench_composite (tests_tbl[i].testname,
+			     tests_tbl[i].src_fmt, tests_tbl[i].src_flags,
+			     tests_tbl[i].op,
+			     tests_tbl[i].mask_fmt, tests_tbl[i].mask_flags,
+			     tests_tbl[i].dst_fmt, bandwidth/8);
+	}
+    }
+
+    free (src);
+    return 0;
+}
diff --git a/test/oob-test.c b/test/oob-test.c
new file mode 100644
index 0000000..4f9e5a2
--- /dev/null
+++ b/test/oob-test.c
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+
+typedef struct
+{
+    int				width;	/* image width handed to pixman_image_create_bits */
+    int				height;	/* image height; also the row count of the backing buffer */
+    int				stride;	/* bytes per row; make_image allocates stride * height bytes */
+    pixman_format_code_t	format;	/* pixel format of the image */
+    
+} image_info_t;
+
+typedef struct
+{
+    pixman_op_t		op;		/* compositing operator listed for the case */
+    
+    image_info_t	src;		/* description of the source image */
+    image_info_t	dest;		/* description of the destination image */
+
+    int			src_x;		/* source origin of the composite */
+    int			src_y;
+    int			dest_x;		/* destination origin of the composite */
+    int			dest_y;
+    int			width;		/* size of the composited rectangle */
+    int			height;
+} composite_info_t;
+
+const composite_info_t info[] =
+{
+    {
+	PIXMAN_OP_SRC,
+	{  3, 6, 16, PIXMAN_a8r8g8b8 },	/* src: width, height, stride, format */
+	{  5, 7, 20, PIXMAN_x8r8g8b8 },	/* dest: width, height, stride, format */
+	1, 8,				/* src_x, src_y: src_y lies past the 6-row source */
+	1, -1,				/* dest_x, dest_y: dest_y lies above the destination */
+	1, 8				/* width, height */
+    },
+    {
+	PIXMAN_OP_SRC,
+	{ 7, 5, 36, PIXMAN_a8r8g8b8 },
+	{ 6, 5, 28, PIXMAN_x8r8g8b8 },
+	8, 5,				/* src_x, src_y: both outside the 7x5 source */
+	5, 3,				/* dest_x, dest_y */
+	1, 2				/* width, height */
+    },
+    {
+	PIXMAN_OP_OVER,
+	{ 10, 10, 40, PIXMAN_a2b10g10r10 },
+	{ 10, 10, 40, PIXMAN_a2b10g10r10 },
+	0, 0,				/* whole 10x10 image onto itself-sized destination */
+	0, 0,
+	10, 10
+    },
+    {
+	PIXMAN_OP_OVER,
+	{ 10, 10, 40, PIXMAN_x2b10g10r10 },
+	{ 10, 10, 40, PIXMAN_x2b10g10r10 },
+	0, 0,				/* same geometry as the previous case, without alpha bits */
+	0, 0,
+	10, 10
+    },
+};
+
+/* Wrap a malloc'ed (unchecked) buffer holding a fixed byte pattern in a pixman image. */
+static pixman_image_t *
+make_image (const image_info_t *info)
+{
+    int pos, n_bytes = info->height * info->stride;
+    char *bytes = malloc (n_bytes);
+
+    for (pos = 0; pos < n_bytes; ++pos)
+	bytes[pos] = (pos % 255) ^ (((pos % 16) << 4) | (pos & 0xf0));
+    return pixman_image_create_bits (info->format, info->width, info->height, (uint32_t *)bytes, info->stride);
+}
+    
+/* Composite src onto dest with the operator and geometry from @info, then unref both images. */
+static void
+test_composite (const composite_info_t *info)
+{
+    pixman_image_t *src = make_image (&info->src);
+    pixman_image_t *dest = make_image (&info->dest);
+
+    pixman_image_composite (info->op, src, NULL, dest, info->src_x, info->src_y, 0, 0,
+			    info->dest_x, info->dest_y, info->width, info->height);
+    pixman_image_unref (src);
+    pixman_image_unref (dest);
+}
+
+
+
+/* Run every case in info[]; nothing is verified beyond the composites returning. */
+int
+main (int argc, char **argv)
+{
+    const composite_info_t *test, *end = info + sizeof (info) / sizeof (info[0]);
+
+    for (test = info; test != end; ++test)
+	test_composite (test);
+    return 0;
+}
diff --git a/test/pdf-op-test.c b/test/pdf-op-test.c
new file mode 100644
index 0000000..99cb7df
--- /dev/null
+++ b/test/pdf-op-test.c
@@ -0,0 +1,83 @@
+#include <stdlib.h>
+#include "utils.h"
+
+static const pixman_op_t pdf_ops[] =	/* every operator main() composites with */
+{
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY
+};
+
+static const uint32_t pixels[] =	/* used by main() as a8r8g8b8 source, mask and destination values */
+{
+    0x00808080,	/* alpha 0x00, colour channels 0x80 */
+    0x80123456,	/* alpha 0x80 */
+    0x00000000,	/* all bits clear */
+    0xffffffff,	/* all bits set */
+    0x00ffffff,	/* alpha 0x00, colour channels 0xff */
+    0x80808080,	/* every channel 0x80 */
+    0x00123456,	/* alpha 0x00 */
+};
+
+/* Composite every source / mask / destination combination of pixels[] with each
+ * operator in pdf_ops[], after enable_fp_exceptions() has been called. */
+int
+main ()
+{
+    int op_idx, src_idx, mask_idx, dst_idx;
+
+    enable_fp_exceptions();
+
+    for (op_idx = 0; op_idx < ARRAY_LENGTH (pdf_ops); ++op_idx)
+    {
+	pixman_op_t op = pdf_ops[op_idx];
+
+	for (src_idx = 0; src_idx < ARRAY_LENGTH (pixels); ++src_idx)
+	{
+	    pixman_image_t *source = pixman_image_create_bits (
+		PIXMAN_a8r8g8b8, 1, 1, (uint32_t *)&(pixels[src_idx]), 4);
+
+	    /* mask_idx == -1 is the pass without a mask image */
+	    for (mask_idx = -1; mask_idx < ARRAY_LENGTH (pixels); ++mask_idx)
+	    {
+		pixman_image_t *mask_img = NULL;
+
+		if (mask_idx >= 0)
+		    mask_img = pixman_image_create_bits (
+			PIXMAN_a8r8g8b8, 1, 1, (uint32_t *)&(pixels[mask_idx]), 4);
+
+		for (dst_idx = 0; dst_idx < ARRAY_LENGTH (pixels); ++dst_idx)
+		{
+		    /* writable copy of the table entry to back the destination image */
+		    uint32_t dest_pixel = pixels[dst_idx];
+		    pixman_image_t *dest = pixman_image_create_bits (
+			PIXMAN_a8r8g8b8, 1, 1, &dest_pixel, 4);
+
+		    pixman_image_composite (op, source, mask_img, dest,
+					    0, 0, 0, 0, 0, 0, 1, 1);
+
+		    pixman_image_unref (dest);
+		}
+
+		if (mask_img)
+		    pixman_image_unref (mask_img);
+	    }
+
+	    pixman_image_unref (source);
+	}
+    }
+
+    return 0;
+}
diff --git a/test/region-contains-test.c b/test/region-contains-test.c
new file mode 100644
index 0000000..b660fdf
--- /dev/null
+++ b/test/region-contains-test.c
@@ -0,0 +1,170 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+/* Initialise @region, then union lcg_rand_n (64) random rectangles into it.
+ * Positions and sizes are 32-bit random values shifted right by two bits
+ * (positions as signed values, sizes as unsigned ones). */
+static void
+make_random_region (pixman_region32_t *region)
+{
+    int remaining;
+
+    pixman_region32_init (region);
+
+    for (remaining = lcg_rand_n (64); remaining != 0; --remaining)
+    {
+	/* one draw per statement, so the order of the random draws is fixed */
+	int32_t left = (int32_t)lcg_rand_u32() >> 2;
+	int32_t top = (int32_t)lcg_rand_u32() >> 2;
+	uint32_t width = lcg_rand_u32() >> 2;
+	uint32_t height = lcg_rand_u32() >> 2;
+
+	pixman_region32_union_rect (region, region, left, top, width, height);
+    }
+}
+
+static void
+print_box (pixman_box32_t *box)
+{
+    printf ("    %d %d %d %d\n", box->x1, box->y1, box->x2, box->y2);	/* one indented line: x1 y1 x2 y2 */
+}
+
+/* Return a random x (when @x is true) or y coordinate chosen relative to one box
+ * of @region, or to its extents: the box's begin or end edge, its midpoint, or
+ * an edge displaced by a random 32-bit amount.
+ *
+ * The order of the lcg_rand_*() calls determines the generated test cases;
+ * keep it stable.
+ */
+static int32_t
+random_coord (pixman_region32_t *region, pixman_bool_t x)
+{
+    pixman_box32_t *b = NULL;
+    int n_boxes;
+    int begin, end;
+
+    /* non-zero draw: pick one of the region's boxes at random */
+    if (lcg_rand_n (14))
+    {
+	pixman_box32_t *bb = pixman_region32_rectangles (region, &n_boxes);
+
+	if (n_boxes != 0)
+	    b = bb + lcg_rand_n (n_boxes);
+    }
+
+    /* no box picked, or the region has none: use the extents */
+    if (!b)
+	b = pixman_region32_extents (region);
+
+    begin = x ? b->x1 : b->y1;
+    end = x ? b->x2 : b->y2;
+
+    /* finally choose where the coordinate lies relative to [begin, end] */
+    switch (lcg_rand_n (5))
+    {
+    case 0:
+	return begin - lcg_rand_u32();
+    case 1:
+	return end + lcg_rand_u32 ();
+    case 2:
+	return end;
+    case 3:
+	return begin;
+    default:
+	break;
+    }
+
+    return (begin + end) / 2;
+}
+
+/* Fold @v into @crc32 as four bytes in little-endian order: on a host that is
+ * not little endian the value is byte-swapped before its bytes are hashed. */
+static uint32_t
+compute_crc32_u32 (uint32_t crc32, uint32_t v)
+{
+    uint32_t swapped = (v >> 24) | ((v >> 8) & 0x0000ff00) |
+		       ((v << 8) & 0x00ff0000) | (v << 24);
+
+    if (!is_little_endian())
+	v = swapped;
+
+    return compute_crc32 (crc32, &v, sizeof (int32_t));
+}
+
+/* Fold the four coordinates of @box into @crc32, in the order
+ * x1, y1, x2, y2, and return the updated checksum. */
+static uint32_t
+crc32_box32 (uint32_t crc32, pixman_box32_t *box)
+{
+    crc32 = compute_crc32_u32 (compute_crc32_u32 (crc32, box->x1), box->y1);
+    crc32 = compute_crc32_u32 (compute_crc32_u32 (crc32, box->x2), box->y2);
+
+    return crc32;
+}
+
+/* Fuzzer case @i: seed the generator with @i, build a random region and a random
+ * box, query the region at the box's four corners and with the whole box, and
+ * return a CRC over the results; @verbose additionally prints everything. */
+static uint32_t
+test_region_contains_rectangle (int i, int verbose)
+{
+    pixman_box32_t box;
+    pixman_box32_t rbox = { 0, 0, 0, 0 };
+    pixman_region32_t region;
+    uint32_t r, r1, r2, r3, r4, crc32;
+
+    lcg_srand (i);
+    make_random_region (&region);
+
+    box.x1 = random_coord (&region, TRUE);
+    box.x2 = box.x1 + lcg_rand_u32 ();
+    box.y1 = random_coord (&region, FALSE);
+    box.y2 = box.y1 + lcg_rand_u32 ();
+
+    if (verbose)
+    {
+	int n_rects;
+	pixman_box32_t *boxes;
+
+	boxes = pixman_region32_rectangles (&region, &n_rects);
+	printf ("region:\n");
+	while (n_rects--)
+	    print_box (boxes++);
+	printf ("box:\n");
+	print_box (&box);
+    }
+
+    crc32 = 0;
+    r1 = pixman_region32_contains_point (&region, box.x1, box.y1, &rbox);	/* rbox is passed to every point query and hashed afterwards */
+    crc32 = crc32_box32 (crc32, &rbox);
+    r2 = pixman_region32_contains_point (&region, box.x1, box.y2, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r3 = pixman_region32_contains_point (&region, box.x2, box.y1, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r4 = pixman_region32_contains_point (&region, box.x2, box.y2, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+
+    r = pixman_region32_contains_rectangle (&region, &box);
+    r = (i << 8) | (r << 4) | (r1 << 3) | (r2 << 2) | (r3 << 1) | (r4 << 0);	/* pack @i and the five results into one word */
+
+    crc32 = compute_crc32_u32 (crc32, r);
+
+    if (verbose)
+	printf ("results: %d %d %d %d %d\n", (r & 0xf0) >> 4, r1, r2, r3, r4);
+
+    pixman_region32_fini (&region);
+
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main ("region_contains",
+			     1000000,		/* presumably the default number of iterations - confirm in utils */
+			     0xD7C297CC,	/* presumably the expected checksum - confirm in utils */
+			     test_region_contains_rectangle,
+			     argc, argv);
+}
diff --git a/test/region-test.c b/test/region-test.c
new file mode 100644
index 0000000..9d5a41e
--- /dev/null
+++ b/test/region-test.c
@@ -0,0 +1,123 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+/* Regression checks for pixman_region32_* (see the comment above each step), then 100
+ * rounds comparing a random clipped region with the region read back from an a1 image.
+ * NOTE(review): r1 is re-initialised without pixman_region32_fini between the steps. */
+int
+main ()
+{
+    pixman_region32_t r1;
+    pixman_region32_t r2;
+    pixman_region32_t r3;
+    pixman_box32_t boxes[] = {
+	{ 10, 10, 20, 20 },
+	{ 30, 30, 30, 40 },
+	{ 50, 45, 60, 44 },
+    };
+    pixman_box32_t boxes2[] = {
+	{ 2, 6, 7, 6 },
+	{ 4, 1, 6, 7 },
+    };
+    pixman_box32_t boxes3[] = {
+	{ 2, 6, 7, 6 },
+	{ 4, 1, 6, 1 },
+    };
+    int i, j;
+    pixman_box32_t *b;
+    pixman_image_t *image, *fill;
+    pixman_color_t white = {
+	0xffff,
+	0xffff,
+	0xffff,
+	0xffff
+    };
+
+    /* This used to go into an infinite loop before pixman-region.c
+     * was fixed to not use explicit "short" variables
+     */
+    pixman_region32_init_rect (&r1, 0, 0, 20, 64000);
+    pixman_region32_init_rect (&r2, 0, 0, 20, 64000);
+    pixman_region32_init_rect (&r3, 0, 0, 20, 64000);
+
+    pixman_region32_subtract (&r1, &r2, &r3);
+
+    /* This would produce a region containing an empty
+     * rectangle in it. Such regions are considered malformed,
+     * but using an empty rectangle for initialization should
+     * work.
+     */
+    pixman_region32_init_rects (&r1, boxes, 3);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 1);
+    
+    while (i--)
+    {
+	assert (b[i].x1 < b[i].x2);
+	assert (b[i].y1 < b[i].y2);
+    }
+
+    /* This would produce a rectangle containing the bounding box
+     * of the two rectangles. The correct result is to eliminate
+     * the broken rectangle.
+     */
+    pixman_region32_init_rects (&r1, boxes2, 2);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 1);
+
+    assert (b[0].x1 == 4);
+    assert (b[0].y1 == 1);
+    assert (b[0].x2 == 6);
+    assert (b[0].y2 == 7);
+
+    /* This should produce an empty region */
+    pixman_region32_init_rects (&r1, boxes3, 2);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 0);
+
+    fill = pixman_image_create_solid_fill (&white);
+    for (i = 0; i < 100; i++)
+    {
+	int image_size = 128;
+
+	pixman_region32_init (&r1);
+
+	/* Add some random rectangles (C leaves the evaluation order of the four lcg_rand_n arguments unspecified) */
+	for (j = 0; j < 64; j++)
+	    pixman_region32_union_rect (&r1, &r1,
+					lcg_rand_n (image_size),
+					lcg_rand_n (image_size),
+					lcg_rand_n (25),
+					lcg_rand_n (25));
+
+	/* Clip to image size */
+	pixman_region32_init_rect (&r2, 0, 0, image_size, image_size);
+	pixman_region32_intersect (&r1, &r1, &r2);
+	pixman_region32_fini (&r2);
+
+	/* render region to a1 mask */
+	image = pixman_image_create_bits (PIXMAN_a1, image_size, image_size, NULL, 0);
+	pixman_image_set_clip_region32 (image, &r1);
+	pixman_image_composite32 (PIXMAN_OP_SRC,
+				  fill, NULL, image,
+				  0, 0, 0, 0, 0, 0,
+				  image_size, image_size);
+	pixman_region32_init_from_image (&r2, image);
+	pixman_image_unref (image);
+
+	assert (pixman_region32_equal (&r1, &r2));
+	pixman_region32_fini (&r1);
+	pixman_region32_fini (&r2);
+    }
+    pixman_image_unref (fill);
+
+    return 0;
+}
diff --git a/test/region-translate-test.c b/test/region-translate-test.c
new file mode 100644
index 0000000..0e96a5e
--- /dev/null
+++ b/test/region-translate-test.c
@@ -0,0 +1,30 @@
+#include <pixman.h>
+#include <assert.h>
+
+/* Pixman had a bug where 32-bit regions were clipped to 16-bit sizes when
+ * pixman_region32_translate() was called. This test exercises that bug.
+ */
+
+#define LARGE 32000
+
+int
+main (int argc, char **argv)
+{
+  /* The same square built two ways: from a box and from x/y/width/height. */
+  pixman_box32_t square = { -LARGE, -LARGE, LARGE, LARGE };
+  pixman_region32_t moved, fixed;
+
+  pixman_region32_init_rects (&moved, &square, 1);
+  pixman_region32_init_rect (&fixed, -LARGE, -LARGE, 2 * LARGE, 2 * LARGE);
+  assert (pixman_region32_equal (&moved, &fixed));
+
+  /* Translate away and back; the region must be unchanged afterwards. */
+  pixman_region32_translate (&moved, -LARGE, LARGE);
+  pixman_region32_translate (&moved, LARGE, -LARGE);
+  assert (pixman_region32_equal (&moved, &fixed));
+
+  pixman_region32_fini (&moved);
+  pixman_region32_fini (&fixed);
+
+  return 0;
+}
diff --git a/test/scaling-crash-test.c b/test/scaling-crash-test.c
new file mode 100644
index 0000000..40323d4
--- /dev/null
+++ b/test/scaling-crash-test.c
@@ -0,0 +1,217 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "pixman.h"
+
+/*
+ * We have a source image filled with solid color, set NORMAL or PAD repeat,
+ * and some transform which results in nearest neighbour scaling.
+ *
+ * The expected result is either that the destination image is filled with
+ * this solid color or, if the transformation is such that we can't composite
+ * anything at all, that nothing has changed in the destination.
+ *
+ * The surrounding memory of the source image is a different solid color so that
+ * we are sure to get failures if we access it.
+ */
+static int
+run_test (int32_t		dst_width,
+	  int32_t		dst_height,
+	  int32_t		src_width,
+	  int32_t		src_height,
+	  int32_t		src_x,
+	  int32_t		src_y,
+	  int32_t		scale_x,
+	  int32_t		scale_y,
+	  pixman_filter_t	filter,
+	  pixman_repeat_t	repeat)
+{
+    pixman_image_t *   src_img;
+    pixman_image_t *   dst_img;
+    pixman_transform_t transform;
+    uint32_t *         srcbuf;
+    uint32_t *         dstbuf;
+    pixman_box32_t     box = { 0, 0, src_width, src_height };
+    pixman_color_t     color_cc = { 0xcccc, 0xcccc, 0xcccc, 0xcccc };
+    int result;
+    int i;
+
+    static const pixman_fixed_t kernel[] =
+    {
+#define D(f)	(pixman_double_to_fixed (f) + 0x0001)
+
+	pixman_int_to_fixed (5),
+	pixman_int_to_fixed (5),
+	D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+	D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+	D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+	D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+	D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0)
+    };
+
+    result = 0;
+
+    srcbuf = (uint32_t *)malloc ((src_width + 10) * (src_height + 10) * 4);
+    dstbuf = (uint32_t *)malloc (dst_width * dst_height * 4);
+
+    memset (srcbuf, 0x88, (src_width + 10) * (src_height + 10) * 4); /* whole buffer, guard margins included */
+    memset (dstbuf, 0x33, dst_width * dst_height * 4);
+
+    src_img = pixman_image_create_bits (
+        PIXMAN_a8r8g8b8, src_width, src_height,
+	srcbuf + (src_width + 10) * 5 + 5, (src_width + 10) * 4);
+
+    pixman_image_fill_boxes (PIXMAN_OP_SRC, src_img, &color_cc, 1, &box);
+
+    dst_img = pixman_image_create_bits (
+        PIXMAN_a8r8g8b8, dst_width, dst_height, dstbuf, dst_width * 4);
+
+    pixman_transform_init_scale (&transform, scale_x, scale_y);
+    pixman_image_set_transform (src_img, &transform);
+    pixman_image_set_repeat (src_img, repeat);
+    if (filter == PIXMAN_FILTER_CONVOLUTION)
+	pixman_image_set_filter (src_img, filter, kernel, 27);
+    else
+	pixman_image_set_filter (src_img, filter, NULL, 0);
+
+    pixman_image_composite (PIXMAN_OP_SRC, src_img, NULL, dst_img,
+                            src_x, src_y, 0, 0, 0, 0, dst_width, dst_height);
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+
+    for (i = 0; i < dst_width * dst_height; i++)
+    {
+	if (dstbuf[i] != 0xCCCCCCCC && dstbuf[i] != 0x33333333) /* neither source color nor untouched */
+	{
+	    result = 1;
+	    break;
+	}
+    }
+
+    free (srcbuf);
+    free (dstbuf);
+    return result;
+}
+
+typedef struct filter_info_t filter_info_t;
+struct filter_info_t
+{
+    pixman_filter_t value;
+    char name[28];
+};
+
+static const filter_info_t filters[] =
+{
+    { PIXMAN_FILTER_NEAREST, "NEAREST" },
+    { PIXMAN_FILTER_BILINEAR, "BILINEAR" },
+    { PIXMAN_FILTER_CONVOLUTION, "CONVOLUTION" },
+};
+
+typedef struct repeat_info_t repeat_info_t;
+struct repeat_info_t
+{
+    pixman_repeat_t value;
+    char name[28];
+};
+
+
+static const repeat_info_t repeats[] =
+{
+    { PIXMAN_REPEAT_PAD, "PAD" },
+    { PIXMAN_REPEAT_REFLECT, "REFLECT" },
+    { PIXMAN_REPEAT_NORMAL, "NORMAL" }
+};
+
+static int
+do_test (int32_t		dst_size,
+	 int32_t		src_size,
+	 int32_t		src_offs,
+	 int32_t		scale_factor)
+{
+#define N_ELEMENTS(a)	(sizeof (a) / sizeof ((a)[0]))
+    int i, j;
+
+    for (i = 0; i < N_ELEMENTS(filters); ++i)
+    {
+	for (j = 0; j < N_ELEMENTS (repeats); ++j)
+	{
+	    /* horizontal test: images are one pixel high, only x is scaled */
+	    if (run_test (dst_size, 1,
+			  src_size, 1,
+			  src_offs, 0,
+			  scale_factor, 65536,
+			  filters[i].value,
+			  repeats[j].value) != 0)
+	    {
+		printf ("Horizontal test failed with %s filter and repeat mode %s\n",
+			filters[i].name, repeats[j].name);
+
+		return 1;
+	    }
+
+	    /* vertical test: images are one pixel wide, only y is scaled */
+	    if (run_test (1, dst_size,
+			  1, src_size,
+			  0, src_offs,
+			  65536, scale_factor,
+			  filters[i].value,
+			  repeats[j].value) != 0)
+	    {
+		printf ("Vertical test failed with %s filter and repeat mode %s\n",
+			filters[i].name, repeats[j].name);
+
+		return 1;
+	    }
+	}
+    }
+
+    return 0;
+}
+
+int
+main (int argc, char *argv[])
+{
+    int i;
+
+    pixman_disable_out_of_bounds_workaround ();
+
+    /* can potentially crash */
+    assert (do_test (
+		48000, 32767, 1, 65536 * 128) == 0);
+
+    /* can potentially get into a deadloop */
+    assert (do_test (
+		16384, 65536, 32, 32768) == 0);
+
+    /* can potentially access memory outside source image buffer */
+    assert (do_test (
+		10, 10, 0, 1) == 0);
+    assert (do_test (
+		10, 10, 0, 0) == 0);
+
+    for (i = 0; i < 100; ++i)
+    {
+	pixman_fixed_t one_seventh =
+	    (((pixman_fixed_48_16_t)pixman_fixed_1) << 16) / (7 << 16);
+
+	assert (do_test (
+		    1, 7, 3, one_seventh + i - 50) == 0);
+    }
+
+    for (i = 0; i < 100; ++i)
+    {
+	pixman_fixed_t scale =
+	    (((pixman_fixed_48_16_t)pixman_fixed_1) << 16) / (32767 << 16);
+
+	assert (do_test (
+		    1, 32767, 16383, scale + i - 50) == 0);
+    }
+
+    /* can potentially provide invalid results (out of range matrix stuff) */
+    assert (do_test (
+	48000, 32767, 16384, 65536 * 128) == 0);
+
+    return 0;
+}
diff --git a/test/scaling-helpers-test.c b/test/scaling-helpers-test.c
new file mode 100755
index 0000000..33ec47c
--- /dev/null
+++ b/test/scaling-helpers-test.c
@@ -0,0 +1,91 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "utils.h"
+#include "pixman-inlines.h"
+
+/* A trivial reference implementation for
+ * 'bilinear_pad_repeat_get_scanline_bounds'
+ */
+static void
+bilinear_pad_repeat_get_scanline_bounds_ref (int32_t        source_image_width,
+					     pixman_fixed_t vx_,
+					     pixman_fixed_t unit_x,
+					     int32_t *      left_pad,
+					     int32_t *      left_tz,
+					     int32_t *      width,
+					     int32_t *      right_tz,
+					     int32_t *      right_pad)
+{
+    int w = *width;
+    int64_t vx = vx_;
+    *left_pad = 0;
+    *left_tz = 0;
+    *width = 0;
+    *right_tz = 0;
+    *right_pad = 0;
+    while (--w >= 0)
+    {
+	if (vx < 0)
+	{
+	    if (vx + pixman_fixed_1 < 0)
+		*left_pad += 1;
+	    else
+		*left_tz += 1;
+	}
+	else if (vx + pixman_fixed_1 >= pixman_int_to_fixed (source_image_width))
+	{
+	    if (vx >= pixman_int_to_fixed (source_image_width))
+		*right_pad += 1;
+	    else
+		*right_tz += 1;
+	}
+	else
+	{
+	    *width += 1;
+	}
+	vx += unit_x;
+    }
+}
+
+int
+main (void)
+{
+    int i;
+    for (i = 0; i < 10000; i++)
+    {
+	int32_t left_pad1, left_tz1, width1, right_tz1, right_pad1;
+	int32_t left_pad2, left_tz2, width2, right_tz2, right_pad2;
+	pixman_fixed_t vx = lcg_rand_N(10000 << 16) - (3000 << 16);
+	int32_t width = lcg_rand_N(10000);
+	int32_t source_image_width = lcg_rand_N(10000) + 1;
+	pixman_fixed_t unit_x = lcg_rand_N(10 << 16) + 1;
+	width1 = width2 = width;
+
+	bilinear_pad_repeat_get_scanline_bounds_ref (source_image_width,
+						     vx,
+						     unit_x,
+						     &left_pad1,
+						     &left_tz1,
+						     &width1,
+						     &right_tz1,
+						     &right_pad1);
+
+	bilinear_pad_repeat_get_scanline_bounds (source_image_width,
+						 vx,
+						 unit_x,
+						 &left_pad2,
+						 &left_tz2,
+						 &width2,
+						 &right_tz2,
+						 &right_pad2);
+
+	assert (left_pad1 == left_pad2);
+	assert (left_tz1 == left_tz2);
+	assert (width1 == width2);
+	assert (right_tz1 == right_tz2);
+	assert (right_pad1 == right_pad2);
+    }
+
+    return 0;
+}
diff --git a/test/scaling-test.c b/test/scaling-test.c
new file mode 100755
index 0000000..82370f7
--- /dev/null
+++ b/test/scaling-test.c
@@ -0,0 +1,368 @@
+/*
+ * Test program, which can detect some problems with nearest neighbour
+ * and bilinear scaling in pixman. Testing is done by running lots
+ * of random SRC, OVER and ADD compositing operations on a8r8g8b8,
+ * x8r8g8b8 and r5g6b5 color formats.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH  48
+#define MAX_SRC_HEIGHT 8
+#define MAX_DST_WIDTH  48
+#define MAX_DST_HEIGHT 8
+#define MAX_STRIDE     4
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int      testnum,
+		int      verbose)
+{
+    int                i;
+    pixman_image_t *   src_img;
+    pixman_image_t *   mask_img;
+    pixman_image_t *   dst_img;
+    pixman_transform_t transform;
+    pixman_region16_t  clip;
+    int                src_width, src_height;
+    int                mask_width, mask_height;
+    int                dst_width, dst_height;
+    int                src_stride, mask_stride, dst_stride;
+    int                src_x, src_y;
+    int                mask_x, mask_y;
+    int                dst_x, dst_y;
+    int                src_bpp;
+    int                mask_bpp = 1;
+    int                dst_bpp;
+    int                w, h;
+    pixman_fixed_t     scale_x = 65536, scale_y = 65536;
+    pixman_fixed_t     translate_x = 0, translate_y = 0;
+    pixman_fixed_t     mask_scale_x = 65536, mask_scale_y = 65536;
+    pixman_fixed_t     mask_translate_x = 0, mask_translate_y = 0;
+    pixman_op_t        op;
+    pixman_repeat_t    repeat = PIXMAN_REPEAT_NONE;
+    pixman_repeat_t    mask_repeat = PIXMAN_REPEAT_NONE;
+    pixman_format_code_t src_fmt, dst_fmt;
+    uint32_t *         srcbuf;
+    uint32_t *         dstbuf;
+    uint32_t *         maskbuf;
+    uint32_t           crc32;
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    lcg_srand (testnum);
+
+    src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
+    dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
+    switch (lcg_rand_n (3))
+    {
+    case 0:
+	op = PIXMAN_OP_SRC;
+	break;
+    case 1:
+	op = PIXMAN_OP_OVER;
+	break;
+    default:
+	op = PIXMAN_OP_ADD;
+	break;
+    }
+
+    src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+    src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+
+    if (lcg_rand_n (2))
+    {
+	mask_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+	mask_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+    }
+    else
+    {
+	mask_width = mask_height = 1;
+    }
+
+    dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
+    dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
+    src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
+    mask_stride = mask_width * mask_bpp + lcg_rand_n (MAX_STRIDE) * mask_bpp;
+    dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
+
+    if (src_stride & 3)
+	src_stride += 2;
+
+    if (mask_stride & 1)
+	mask_stride += 1;
+    if (mask_stride & 2)
+	mask_stride += 2;
+
+    if (dst_stride & 3)
+	dst_stride += 2;
+
+    src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
+    src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
+    mask_x = -(mask_width / 4) + lcg_rand_n (mask_width * 3 / 2);
+    mask_y = -(mask_height / 4) + lcg_rand_n (mask_height * 3 / 2);
+    dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
+    dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
+    w = lcg_rand_n (dst_width * 3 / 2 - dst_x);
+    h = lcg_rand_n (dst_height * 3 / 2 - dst_y);
+
+    srcbuf = (uint32_t *)malloc (src_stride * src_height);
+    maskbuf = (uint32_t *)malloc (mask_stride * mask_height);
+    dstbuf = (uint32_t *)malloc (dst_stride * dst_height);
+
+    for (i = 0; i < src_stride * src_height; i++)
+	*((uint8_t *)srcbuf + i) = lcg_rand_n (256);
+
+    for (i = 0; i < mask_stride * mask_height; i++)
+	*((uint8_t *)maskbuf + i) = lcg_rand_n (256);
+
+    for (i = 0; i < dst_stride * dst_height; i++)
+	*((uint8_t *)dstbuf + i) = lcg_rand_n (256);
+
+    src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ?
+                              PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+    dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ?
+                              PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+    src_img = pixman_image_create_bits (
+        src_fmt, src_width, src_height, srcbuf, src_stride);
+
+    mask_img = pixman_image_create_bits (
+        PIXMAN_a8, mask_width, mask_height, maskbuf, mask_stride);
+
+    dst_img = pixman_image_create_bits (
+        dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
+
+    image_endian_swap (src_img);
+    image_endian_swap (dst_img);
+
+    if (lcg_rand_n (4) > 0)
+    {
+	scale_x = -32768 * 3 + lcg_rand_N (65536 * 5);
+	scale_y = -32768 * 3 + lcg_rand_N (65536 * 5);
+	translate_x = lcg_rand_N (65536);
+	translate_y = lcg_rand_N (65536);
+	pixman_transform_init_scale (&transform, scale_x, scale_y);
+	pixman_transform_translate (&transform, NULL, translate_x, translate_y);
+	pixman_image_set_transform (src_img, &transform);
+    }
+
+    if (lcg_rand_n (2) > 0)
+    {
+	mask_scale_x = -32768 * 3 + lcg_rand_N (65536 * 5);
+	mask_scale_y = -32768 * 3 + lcg_rand_N (65536 * 5);
+	mask_translate_x = lcg_rand_N (65536);
+	mask_translate_y = lcg_rand_N (65536);
+	pixman_transform_init_scale (&transform, mask_scale_x, mask_scale_y);
+	pixman_transform_translate (&transform, NULL, mask_translate_x, mask_translate_y);
+	pixman_image_set_transform (mask_img, &transform);
+    }
+
+    switch (lcg_rand_n (4))
+    {
+    case 0:
+	mask_repeat = PIXMAN_REPEAT_NONE;
+	break;
+
+    case 1:
+	mask_repeat = PIXMAN_REPEAT_NORMAL;
+	break;
+
+    case 2:
+	mask_repeat = PIXMAN_REPEAT_PAD;
+	break;
+
+    case 3:
+	mask_repeat = PIXMAN_REPEAT_REFLECT;
+	break;
+
+    default:
+        break;
+    }
+    pixman_image_set_repeat (mask_img, mask_repeat);
+
+    switch (lcg_rand_n (4))
+    {
+    case 0:
+	repeat = PIXMAN_REPEAT_NONE;
+	break;
+
+    case 1:
+	repeat = PIXMAN_REPEAT_NORMAL;
+	break;
+
+    case 2:
+	repeat = PIXMAN_REPEAT_PAD;
+	break;
+
+    case 3:
+	repeat = PIXMAN_REPEAT_REFLECT;
+	break;
+
+    default:
+        break;
+    }
+    pixman_image_set_repeat (src_img, repeat);
+
+    if (lcg_rand_n (2))
+	pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+    else
+	pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    if (lcg_rand_n (2))
+	pixman_image_set_filter (mask_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+    else
+	pixman_image_set_filter (mask_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    if (verbose)
+    {
+	printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt);
+	printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n",
+	        op, scale_x, scale_y, repeat);
+	printf ("translate_x=%d, translate_y=%d\n",
+	        translate_x, translate_y);
+	printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+	        src_width, src_height, dst_width, dst_height);
+	printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+	        src_x, src_y, dst_x, dst_y);
+	printf ("w=%d, h=%d\n", w, h);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = lcg_rand_n (2) + 1;
+
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = lcg_rand_n (src_width);
+	    clip_boxes[i].y1 = lcg_rand_n (src_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("source clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (src_img, &clip);
+	pixman_image_set_source_clipping (src_img, 1);
+	pixman_region_fini (&clip);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = lcg_rand_n (2) + 1;
+
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = lcg_rand_n (mask_width);
+	    clip_boxes[i].y1 = lcg_rand_n (mask_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + lcg_rand_n (mask_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + lcg_rand_n (mask_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("mask clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (mask_img, &clip);
+	pixman_image_set_source_clipping (mask_img, 1);
+	pixman_region_fini (&clip);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = lcg_rand_n (2) + 1;
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = lcg_rand_n (dst_width);
+	    clip_boxes[i].y1 = lcg_rand_n (dst_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("destination clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (dst_img, &clip);
+	pixman_region_fini (&clip);
+    }
+
+    if (lcg_rand_n (2) == 0)
+	pixman_image_composite (op, src_img, NULL, dst_img,
+                            src_x, src_y, 0, 0, dst_x, dst_y, w, h);
+    else
+	pixman_image_composite (op, src_img, mask_img, dst_img,
+                            src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
+
+    if (dst_fmt == PIXMAN_x8r8g8b8)
+    {
+	/* ignore unused part */
+	for (i = 0; i < dst_stride * dst_height / 4; i++)
+	    dstbuf[i] &= 0xFFFFFF;
+    }
+
+    image_endian_swap (dst_img);
+
+    if (verbose)
+    {
+	int j;
+	
+	for (i = 0; i < dst_height; i++)
+	{
+	    for (j = 0; j < dst_stride; j++)
+		printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
+
+	    printf ("\n");
+	}
+    }
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (mask_img);
+    pixman_image_unref (dst_img);
+
+    crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height);
+    free (srcbuf);
+    free (maskbuf);
+    free (dstbuf);
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    pixman_disable_out_of_bounds_workaround ();
+
+    return fuzzer_test_main("scaling", 8000000, 0x80DF1CB2,
+			    test_composite, argc, argv);
+}
diff --git a/test/stress-test.c b/test/stress-test.c
new file mode 100755
index 0000000..571420a
--- /dev/null
+++ b/test/stress-test.c
@@ -0,0 +1,872 @@
+#include <stdio.h>
+#include "utils.h"
+#include <sys/types.h>
+
+#if 0
+#define fence_malloc malloc
+#define fence_free free
+#define make_random_bytes malloc
+#endif
+
+static const pixman_format_code_t image_formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_r5g6b5,
+    PIXMAN_r3g3b2,
+    PIXMAN_a8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x14r6g6b6,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_a8,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_c8,
+    PIXMAN_g8,
+    PIXMAN_x4c4,
+    PIXMAN_x4g4,
+    PIXMAN_c4,
+    PIXMAN_g4,
+    PIXMAN_g1,
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_a1
+};
+
+static pixman_filter_t filters[] =
+{
+    PIXMAN_FILTER_NEAREST,
+    PIXMAN_FILTER_BILINEAR,
+    PIXMAN_FILTER_FAST,
+    PIXMAN_FILTER_GOOD,
+    PIXMAN_FILTER_BEST,
+    PIXMAN_FILTER_CONVOLUTION
+};
+
+static int
+get_size (void)
+{
+    switch (lcg_rand_n (28))
+    {
+    case 0:
+	return 1;
+
+    case 1:
+	return 2;
+
+    default:
+    case 2:
+	return lcg_rand_n (200);
+
+    case 4:
+	return lcg_rand_n (2000) + 1000;
+
+    case 5:
+	return 65535;
+
+    case 6:
+	return 65536;
+
+    case 7:
+	return lcg_rand_N (64000) + 63000;
+    }
+}
+
+static void
+destroy (pixman_image_t *image, void *data)
+{
+    if (image->type == BITS && image->bits.free_me != image->bits.bits)
+    {
+	uint32_t *bits;
+
+	if (image->bits.bits != (void *)0x01)
+	{
+	    bits = image->bits.bits;
+
+	    if (image->bits.rowstride < 0)
+		bits -= (- image->bits.rowstride * (image->bits.height - 1));
+
+	    fence_free (bits);
+	}
+    }
+
+    free (data);
+}
+
+static uint32_t
+real_reader (const void *src, int size)
+{
+    switch (size)
+    {
+    case 1:
+	return *(uint8_t *)src;
+    case 2:
+	return *(uint16_t *)src;
+    case 4:
+	return *(uint32_t *)src;
+    default:
+	assert (0);
+	return 0; /* silence MSVC */
+    }
+}
+
+static void
+real_writer (void *src, uint32_t value, int size)
+{
+    switch (size)
+    {
+    case 1:
+	*(uint8_t *)src = value;
+	break;
+
+    case 2:
+	*(uint16_t *)src = value;
+	break;
+
+    case 4:
+	*(uint32_t *)src = value;
+	break;
+
+    default:
+	assert (0);
+	break;
+    }
+}
+
+static uint32_t
+fake_reader (const void *src, int size)
+{
+    uint32_t r = lcg_rand_u32 (); /* returns random data; src is never dereferenced */
+
+    assert (size == 1 || size == 2 || size == 4);
+    return r & (0xffffffffu >> (32 - size * 8)); /* low size*8 bits; 1 << 32 would be undefined */
+}
+
+static void
+fake_writer (void *src, uint32_t value, int size)
+{
+    assert (size == 1 || size == 2 || size == 4);
+}
+
+static int32_t
+log_rand (void)
+{
+    uint32_t mask;
+
+    mask = (1 << lcg_rand_n (31)) - 1;
+
+    return (lcg_rand () & mask) - (mask >> 1);
+}
+
+static pixman_image_t *
+create_random_bits_image (void)
+{
+    pixman_format_code_t format;
+    pixman_indexed_t *indexed;
+    pixman_image_t *image;
+    int width, height, stride;
+    uint32_t *bits;
+    pixman_read_memory_func_t read_func = NULL;
+    pixman_write_memory_func_t write_func = NULL;
+    pixman_filter_t filter;
+    pixman_fixed_t *coefficients = NULL;
+    int n_coefficients = 0;
+
+    /* format */
+    format = image_formats[lcg_rand_n (ARRAY_LENGTH (image_formats))];
+
+    indexed = NULL;
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
+    {
+	indexed = malloc (sizeof (pixman_indexed_t));
+
+	initialize_palette (indexed, PIXMAN_FORMAT_BPP (format), TRUE);
+    }
+    else if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
+    {
+	indexed = malloc (sizeof (pixman_indexed_t));
+
+	initialize_palette (indexed, PIXMAN_FORMAT_BPP (format), FALSE);
+    }
+    else
+    {
+	indexed = NULL;
+    }
+
+    /* size */
+    width = get_size ();
+    height = get_size ();
+
+    if ((uint64_t)width * height > 200000)
+    {
+	if (lcg_rand_n(2) == 0)
+	    height = 200000 / width;
+	else
+	    width = 200000 / height;
+    }
+
+    if (height == 0)
+	height = 1;
+    if (width == 0)
+	width = 1;
+
+    /* bits */
+    switch (lcg_rand_n (7))
+    {
+    default:
+    case 0:
+	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = (uint32_t *)make_random_bytes (height * stride);
+	break;
+
+    case 1:
+	stride = 0;
+	bits = NULL;
+	break;
+
+    case 2: /* Zero-filled */
+	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = fence_malloc (height * stride);
+	if (!bits)
+	    return NULL;
+	memset (bits, 0, height * stride);
+	break;
+
+    case 3: /* Filled with 0xFF */
+	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = fence_malloc (height * stride);
+	if (!bits)
+	    return NULL;
+	memset (bits, 0xff, height * stride);
+	break;
+
+    case 4: /* bits is a bad pointer, has read/write functions */
+	stride = 232;
+	bits = (void *)0x01;
+	read_func = fake_reader;
+	write_func = fake_writer;
+	break;
+
+    case 5: /* bits is a real pointer, has read/write functions */
+	stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = fence_malloc (height * stride);
+	if (!bits)
+	    return NULL;
+	memset (bits, 0xff, height * stride);
+	read_func = real_reader;
+	write_func = real_writer;
+	break;
+
+    case 6: /* bits is a real pointer, stride is negative */
+	stride = (width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17));
+	stride = (stride + 3) & (~3);
+	bits = (uint32_t *)make_random_bytes (height * stride);
+	if (!bits)
+	    return NULL;
+	bits += ((height - 1) * stride) / 4;
+	stride = - stride;
+	break;
+    }
+
+    /* Filter */
+    filter = filters[lcg_rand_n (ARRAY_LENGTH (filters))];
+    if (filter == PIXMAN_FILTER_CONVOLUTION)
+    {
+	int width = lcg_rand_n (17);
+	int height = lcg_rand_n (19);
+
+	n_coefficients = width * height + 2;
+	coefficients = malloc (n_coefficients * sizeof (pixman_fixed_t));
+
+	if (coefficients)
+	{
+	    int i;
+
+	    for (i = 0; i < width * height; ++i)
+		coefficients[i + 2] = lcg_rand_u32();
+
+	    coefficients[0] = width << 16;
+	    coefficients[1] = height << 16;
+	}
+	else
+	{
+	    filter = PIXMAN_FILTER_BEST;
+	}
+    }
+
+    /* Finally create the image */
+    image = pixman_image_create_bits (format, width, height, bits, stride);
+    if (image)
+    {
+	pixman_image_set_indexed (image, indexed);
+	pixman_image_set_destroy_function (image, destroy, indexed);
+	pixman_image_set_accessors (image, read_func, write_func);
+	pixman_image_set_filter (image, filter, coefficients, n_coefficients);
+    }
+    free (coefficients); /* the image keeps its own copy; was leaked on every path */
+    return image;
+}
+
+static pixman_repeat_t repeats[] =
+{
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_REFLECT,
+    PIXMAN_REPEAT_PAD
+};
+
+static uint32_t
+absolute (int32_t i)
+{
+    return i < 0? -i : i;
+}
+
+static void
+set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map)
+{
+    pixman_repeat_t repeat;
+
+    /* Randomize the properties that are generic to all image types */
+
+    /* Repeat */
+    repeat = repeats[lcg_rand_n (ARRAY_LENGTH (repeats))];
+    pixman_image_set_repeat (image, repeat);
+
+    /* Alpha map */
+    if (allow_alpha_map && lcg_rand_n (3) == 0)
+    {
+	pixman_image_t *alpha_map;
+	int16_t x, y;
+
+	alpha_map = create_random_bits_image ();
+
+	if (alpha_map)
+	{
+	    set_general_properties (alpha_map, FALSE); /* FALSE: no alpha map on an alpha map */
+
+	    x = lcg_rand_N (100000) - 65536;
+	    y = lcg_rand_N (100000) - 65536;
+
+	    pixman_image_set_alpha_map (image, alpha_map, x, y);
+
+	    pixman_image_unref (alpha_map);
+	}
+    }
+
+    /* Component alpha */
+    pixman_image_set_component_alpha (image, lcg_rand_n (3) == 0);
+
+    /* Clip region */
+    if (lcg_rand_n (8) != 0)
+    {
+	pixman_region32_t region;
+	int i, n_rects;
+
+	pixman_region32_init (&region);
+
+	switch (lcg_rand_n (10))
+	{
+	case 0:
+	    n_rects = 0;
+	    break;
+
+	case 1: case 2: case 3:
+	    n_rects = 1;
+	    break;
+
+	case 4: case 5:
+	    n_rects = 2;
+	    break;
+
+	case 6: case 7:
+	    n_rects = 3;
+	    break; /* was missing: fell through and the 3 was overwritten */
+	default:
+	    n_rects = lcg_rand_n (100);
+	    break;
+	}
+
+	for (i = 0; i < n_rects; ++i)
+	{
+	    uint32_t width, height;
+	    int x, y;
+
+	    x = log_rand();
+	    y = log_rand();
+	    width = absolute (log_rand ()) + 1;
+	    height = absolute (log_rand ()) + 1;
+
+	    pixman_region32_union_rect (
+		&region, &region, x, y, width, height);
+	}
+
+	pixman_image_set_clip_region32 (image, &region);
+
+	pixman_region32_fini (&region);
+    }
+
+    /* Whether source clipping is enabled */
+    pixman_image_set_source_clipping (image, !!lcg_rand_n (2));
+
+    /* Client clip */
+    pixman_image_set_has_client_clip (image, !!lcg_rand_n (2));
+
+    /* Transform */
+    if (lcg_rand_n (5) < 2)
+    {
+	pixman_transform_t xform;
+	int i, j, k;
+	uint32_t tx, ty, sx, sy;
+	uint32_t c, s;
+
+	memset (&xform, 0, sizeof xform);
+	xform.matrix[0][0] = pixman_fixed_1;
+	xform.matrix[1][1] = pixman_fixed_1;
+	xform.matrix[2][2] = pixman_fixed_1;
+
+	for (k = 0; k < 3; ++k)
+	{
+	    switch (lcg_rand_n (4))
+	    {
+	    case 0:
+		/* rotation */
+		c = lcg_rand_N (2 * 65536) - 65536;
+		s = lcg_rand_N (2 * 65536) - 65536;
+		pixman_transform_rotate (&xform, NULL, c, s);
+		break;
+
+	    case 1:
+		/* translation */
+		tx = lcg_rand_u32();
+		ty = lcg_rand_u32();
+		pixman_transform_translate (&xform, NULL, tx, ty);
+		break;
+
+	    case 2:
+		/* scale */
+		sx = lcg_rand_u32();
+		sy = lcg_rand_u32();
+		pixman_transform_scale (&xform, NULL, sx, sy);
+		break;
+
+	    case 3:
+		if (lcg_rand_n (16) == 0)
+		{
+		    /* random */
+		    for (i = 0; i < 3; ++i)
+			for (j = 0; j < 3; ++j)
+			    xform.matrix[i][j] = lcg_rand_u32();
+		    break;
+		}
+		else if (lcg_rand_n (16) == 0)
+		{
+		    /* zero */
+		    memset (&xform, 0, sizeof xform);
+		}
+		break;
+	    }
+	}
+
+	pixman_image_set_transform (image, &xform);
+    }
+}
+
+static pixman_color_t
+random_color (void)
+{
+    pixman_color_t color =
+    {
+	lcg_rand() & 0xffff,
+	lcg_rand() & 0xffff,
+	lcg_rand() & 0xffff,
+	lcg_rand() & 0xffff,
+    };
+
+    return color;
+}
+
+
+static pixman_image_t *
+create_random_solid_image (void)
+{
+    pixman_color_t color = random_color();
+    pixman_image_t *image = pixman_image_create_solid_fill (&color);
+
+    return image;
+}
+
+static pixman_gradient_stop_t *
+create_random_stops (int *n_stops)
+{
+    pixman_fixed_t step;
+    pixman_fixed_t s;
+    int i;
+    pixman_gradient_stop_t *stops;
+
+    *n_stops = lcg_rand_n (50) + 1;
+
+    step = pixman_fixed_1 / *n_stops;
+
+    stops = malloc (*n_stops * sizeof (pixman_gradient_stop_t));
+    if (!stops)
+	return NULL; /* every caller already checks for NULL */
+    for (i = 0, s = 0; i < (*n_stops) - 1; ++i)
+    {
+	stops[i].x = s;
+	stops[i].color = random_color();
+
+	s += step;
+    }
+
+    stops[*n_stops - 1].x = pixman_fixed_1; /* last stop is pinned to exactly 1.0 */
+    stops[*n_stops - 1].color = random_color();
+
+    return stops;
+}
+
+static pixman_point_fixed_t
+create_random_point (void)
+{
+    pixman_point_fixed_t p;
+
+    p.x = log_rand ();
+    p.y = log_rand ();
+
+    return p;
+}
+
+static pixman_image_t *
+create_random_linear_image (void)
+{
+    int n_stops;
+    pixman_gradient_stop_t *stops;
+    pixman_point_fixed_t p1, p2;
+    pixman_image_t *result;
+
+    stops = create_random_stops (&n_stops);
+    if (!stops)
+	return NULL;
+
+    p1 = create_random_point ();
+    p2 = create_random_point ();
+
+    result = pixman_image_create_linear_gradient (&p1, &p2, stops, n_stops);
+
+    free (stops);
+
+    return result;
+}
+
+static pixman_image_t *
+create_random_radial_image (void)
+{
+    int n_stops;
+    pixman_gradient_stop_t *stops;
+    pixman_point_fixed_t inner_c, outer_c;
+    pixman_fixed_t inner_r, outer_r;
+    pixman_image_t *result;
+
+    inner_c = create_random_point();
+    outer_c = create_random_point();
+    inner_r = lcg_rand();
+    outer_r = lcg_rand();
+
+    stops = create_random_stops (&n_stops);
+
+    if (!stops)
+	return NULL;
+
+    result = pixman_image_create_radial_gradient (
+	&inner_c, &outer_c, inner_r, outer_r, stops, n_stops);
+
+    free (stops);
+
+    return result;
+}
+
+/* Create a conical gradient image with a random centre, angle and stops.
+ *
+ * Returns NULL when create_random_stops() returns NULL; otherwise returns
+ * the result of pixman_image_create_conical_gradient().
+ */
+static pixman_image_t *
+create_random_conical_image (void)
+{
+    pixman_point_fixed_t center;
+    pixman_fixed_t start_angle;
+    pixman_gradient_stop_t *stop_array;
+    pixman_image_t *gradient;
+    int stop_count;
+
+    /* Draw order: centre, angle, and only then the stops. */
+    center = create_random_point();
+    start_angle = lcg_rand();
+
+    stop_array = create_random_stops (&stop_count);
+    if (stop_array == NULL)
+	return NULL;
+
+    gradient = pixman_image_create_conical_gradient (
+	&center, start_angle, stop_array, stop_count);
+
+    free (stop_array);
+
+    return gradient;
+}
+
+/* Create an image of a randomly chosen kind (bits, solid, linear, radial
+ * or conical) and, when creation succeeded, apply set_general_properties()
+ * to it. Returns the image, or NULL if the chosen creator returned NULL.
+ */
+static pixman_image_t *
+create_random_image (void)
+{
+    pixman_image_t *image;
+    uint32_t kind = lcg_rand_n (5);
+
+    if (kind == 1)
+	image = create_random_solid_image ();
+    else if (kind == 2)
+	image = create_random_linear_image ();
+    else if (kind == 3)
+	image = create_random_radial_image ();
+    else if (kind == 4)
+	image = create_random_conical_image ();
+    else
+	image = create_random_bits_image (); /* 0 and any other value */
+
+    if (image)
+	set_general_properties (image, TRUE);
+
+    return image;
+}
+
+/* Table of compositing operators that run_test() indexes with a random
+ * number. PIXMAN_OP_SRC, PIXMAN_OP_OVER and PIXMAN_OP_ADD each appear
+ * twice (once in the leading three entries and once in the main run), so
+ * they occupy two slots of the table each.
+ */
+static const pixman_op_t op_list[] =
+{
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+};
+
+/* Run one randomized composite operation.
+ *
+ * seed:    value the random generator is seeded with before anything is
+ *          created, so a given seed reproduces the same test.
+ * verbose: when nonzero, the seed may be printed (see 'mod').
+ * mod:     with verbose set, the seed is printed when mod is 0 or when
+ *          seed is a multiple of mod.
+ *
+ * The composite call is skipped if any of the three images could not be
+ * created; whichever images exist are unreferenced before returning.
+ */
+static void
+run_test (uint32_t seed, pixman_bool_t verbose, uint32_t mod)
+{
+    pixman_image_t *source, *mask, *dest;
+    pixman_op_t op;
+
+    if (verbose)
+    {
+	if (mod == 0 || (seed % mod) == 0)
+	    printf ("Seed 0x%08x\n", seed);
+    }
+	    
+    lcg_srand (seed);
+
+    /* Creation order matters for reproducibility: source, mask, dest. */
+    source = create_random_image ();
+    mask   = create_random_image ();
+    dest   = create_random_bits_image ();
+
+    if (source && mask && dest)
+    {
+	/* create_random_image() already applied the general properties to
+	 * source and mask; dest comes straight from
+	 * create_random_bits_image(), so they are applied here.
+	 */
+	set_general_properties (dest, TRUE);
+
+	op = op_list [lcg_rand_n (ARRAY_LENGTH (op_list))];
+
+	/* NOTE(review): the log_rand() calls below are arguments of a
+	 * single call, and C leaves their evaluation order unspecified.
+	 * If log_rand() draws from the shared generator (it is defined
+	 * outside this excerpt), the coordinates for a given seed may
+	 * differ between compilers.
+	 */
+	pixman_image_composite32 (op,
+				  source, mask, dest,
+				  log_rand(), log_rand(),
+				  log_rand(), log_rand(),
+				  log_rand(), log_rand(),
+				  absolute (log_rand()),
+				  absolute (log_rand()));
+    }
+    if (source)
+	pixman_image_unref (source);
+    if (mask)
+	pixman_image_unref (mask);
+    if (dest)
+	pixman_image_unref (dest);
+}
+
+/* Parse the whole string 's' as an unsigned integer and store it in '*i'.
+ *
+ * The base is auto-detected by strtoul() (base argument 0), so decimal,
+ * octal ("0...") and hexadecimal ("0x...") spellings are accepted.
+ *
+ * Returns TRUE and writes '*i' when at least one character was converted
+ * and nothing follows the number; returns FALSE and leaves '*i' untouched
+ * otherwise.
+ *
+ * strtoul() is used rather than strtol() so that seeds of 0x80000000 and
+ * above (as printed by "Seed 0x%08x") can be read back even where 'long'
+ * is only 32 bits wide; strtol() would saturate at LONG_MAX there.
+ */
+static pixman_bool_t
+get_int (char *s, uint32_t *i)
+{
+    char *end;
+    unsigned long p;
+
+    p = strtoul (s, &end, 0);
+
+    if (end == s || *end != 0)
+	return FALSE;
+
+    /* Values wider than 32 bits are reduced modulo 2^32. */
+    *i = (uint32_t) p;
+    return TRUE;
+}
+
+/* Print the option summary and terminate with the exit status used for
+ * all command line errors.
+ */
+static void
+usage_and_exit (void)
+{
+    printf ("Options:\n\n"
+	    "-n <number>        Number of tests to run\n"
+	    "-s <seed> 	        Seed of first test (ignored if PIXMAN_RANDOMIZE_TESTS is set)\n"
+	    "-v                 Print out seeds\n"
+	    "-v <n>             Print out every n'th seed\n\n");
+
+    exit (-1);
+}
+
+/* Entry point: parse the options, then call run_test() for n_tests
+ * consecutive seeds starting at 'seed'.
+ *
+ *   -v [n]   verbose; with a number, print only every n'th seed
+ *   -s seed  first seed; also selects the non-threaded loop
+ *   -n num   number of tests (default 8000)
+ *   -h       print usage
+ *
+ * The VERBOSE environment variable also enables verbose output, and
+ * PIXMAN_RANDOMIZE_TESTS replaces the seed with get_random_seed().
+ *
+ * Returns 0 after the loop; usage errors exit with status -1.
+ */
+int
+main (int argc, char **argv)
+{
+    int verbose = FALSE;
+    uint32_t seed = 1;
+    uint32_t n_tests = 0xffffffff;
+    uint32_t mod = 0;
+    pixman_bool_t use_threads = TRUE;
+    uint32_t i;
+    int arg;
+
+    pixman_disable_out_of_bounds_workaround ();
+
+    enable_fp_exceptions();
+
+    if (getenv ("VERBOSE") != NULL)
+	verbose = TRUE;
+
+    for (arg = 1; arg < argc; ++arg)
+    {
+	if (strcmp (argv[arg], "-v") == 0)
+	{
+	    verbose = TRUE;
+
+	    /* The number after -v is optional: consume the next argument
+	     * only when it really is a number, so that "-v -n 100" does
+	     * not swallow "-n".
+	     */
+	    if (arg + 1 < argc && get_int (argv[arg + 1], &mod))
+		arg++;
+	}
+	else if (strcmp (argv[arg], "-s") == 0 && arg + 1 < argc)
+	{
+	    if (!get_int (argv[arg + 1], &seed))
+	    {
+		printf ("Invalid seed '%s'\n\n", argv[arg + 1]);
+		usage_and_exit ();
+	    }
+
+	    use_threads = FALSE;
+	    arg++;
+	}
+	else if (strcmp (argv[arg], "-n") == 0 && arg + 1 < argc)
+	{
+	    if (!get_int (argv[arg + 1], &n_tests))
+	    {
+		printf ("Invalid number of tests '%s'\n\n", argv[arg + 1]);
+		usage_and_exit ();
+	    }
+
+	    arg++;
+	}
+	else
+	{
+	    if (strcmp (argv[arg], "-h") != 0)
+		printf ("Unknown option '%s'\n\n", argv[arg]);
+
+	    usage_and_exit ();
+	}
+    }
+
+    /* 0xffffffff doubles as the "not specified" marker. */
+    if (n_tests == 0xffffffff)
+	n_tests = 8000;
+
+    if (getenv ("PIXMAN_RANDOMIZE_TESTS"))
+    {
+	seed = get_random_seed();
+	printf ("First seed: 0x%08x\n", seed);
+    }
+
+    /* The loops count tests rather than comparing against seed + n_tests:
+     * that sum can wrap around 2^32 (notably with a randomized seed), in
+     * which case the comparison would be false immediately and no test
+     * would run. 'seed + i' wraps harmlessly instead.
+     */
+    if (use_threads)
+    {
+#ifdef USE_OPENMP
+#   pragma omp parallel for default(none) shared(verbose, n_tests, mod, seed)
+#endif
+	for (i = 0; i < n_tests; ++i)
+	    run_test (seed + i, verbose, mod);
+    }
+    else
+    {
+	for (i = 0; i < n_tests; ++i)
+	    run_test (seed + i, verbose, mod);
+    }
+
+    return 0;
+}
diff --git a/test/trap-crasher.c b/test/trap-crasher.c
new file mode 100755
index 0000000..7485e62
--- /dev/null
+++ b/test/trap-crasher.c
@@ -0,0 +1,27 @@
+#include <stdlib.h>
+#include <pixman.h>
+
+/* Regression program: feeds one trapezoid with extreme fixed-point
+ * coordinates to pixman_add_trapezoids() on a 1x1 a8 image. The test
+ * passes if the call returns without crashing.
+ *
+ * Returns 0 on success and 1 if the destination image cannot be created.
+ */
+int
+main (void)
+{
+    pixman_image_t *dst;
+    /* NOTE(review): presumably top/bottom followed by the left and right
+     * edge lines, per the pixman_trapezoid_t layout in pixman.h.
+     */
+    pixman_trapezoid_t traps[1] = {
+	{
+	    2147483646,
+	    2147483647,
+	    {
+		{ 0, 0 },
+		{ 0, 2147483647 }
+	    },
+	    {
+		{ 65536, 0 },
+		{ 0, 2147483647 }
+	    }
+	},
+    };
+
+    dst = pixman_image_create_bits (PIXMAN_a8, 1, 1, NULL, -1);
+    if (!dst)
+	return 1;
+
+    pixman_add_trapezoids (dst, 0, 0, sizeof (traps)/sizeof (traps[0]), traps);
+
+    pixman_image_unref (dst);
+    return (0);
+}
diff --git a/test/utils.c b/test/utils.c
new file mode 100755
index 0000000..adabd75
--- /dev/null
+++ b/test/utils.c
@@ -0,0 +1,704 @@
+#define _GNU_SOURCE
+
+#include "utils.h"
+#include <signal.h>
+
+#ifdef HAVE_GETTIMEOFDAY
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifdef HAVE_FENV_H
+#include <fenv.h>
+#endif
+
+#ifdef HAVE_LIBPNG
+#include <png.h>
+#endif
+
+/* Random number seed
+ */
+
+uint32_t lcg_seed;
+
+/*----------------------------------------------------------------------------*\
+ *  CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
+ *
+ *  This program generates the CRC-32 values for the files named in the
+ *  command-line arguments.  These are the same CRC-32 values used by GZIP,
+ *  PKZIP, and ZMODEM.  The Crc32_ComputeBuf () can also be detached and
+ *  used independently.
+ *
+ *  THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE.
+ *
+ *  Based on the byte-oriented implementation "File Verification Using CRC"
+ *  by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67.
+ *
+ *  v1.0.0: original release.
+ *  v1.0.1: fixed printf formats.
+ *  v1.0.2: fixed something else.
+ *  v1.0.3: replaced CRC constant table by generator function.
+ *  v1.0.4: reformatted code, made ANSI C.  1994-12-05.
+ *  v2.0.0: rewrote to use memory buffer & static table, 2006-04-29.
+\*----------------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------------*\
+ *  NAME:
+ *     compute_crc32 () - computes the CRC-32 value of a memory buffer
+ *  DESCRIPTION:
+ *     Computes or accumulates the CRC-32 value for a memory buffer.
+ *     'in_crc32' gives a previously accumulated CRC-32 value, which allows
+ *     a CRC to be generated for multiple sequential buffer-fuls of data.
+ *     'in_crc32' must be zero for the first buffer.
+ *  ARGUMENTS:
+ *     in_crc32 - accumulated CRC-32 value, must be 0 on first call
+ *     buf      - buffer to compute CRC-32 value for
+ *     buf_len  - number of bytes in buffer
+ *  RETURNS:
+ *     crc32 - computed CRC-32 value
+ *  ERRORS:
+ *     (no errors are possible)
+\*----------------------------------------------------------------------------*/
+
+uint32_t
+compute_crc32 (uint32_t    in_crc32,
+	       const void *buf,
+	       size_t      buf_len)
+{
+    /* Byte-indexed lookup table for the table-driven CRC-32 update. */
+    static const uint32_t crc_table[256] = {
+	0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
+	0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+	0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
+	0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+	0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9,
+	0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+	0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
+	0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+	0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
+	0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+	0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
+	0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+	0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
+	0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+	0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
+	0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+	0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
+	0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+	0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
+	0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+	0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
+	0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+	0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
+	0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+	0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
+	0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+	0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
+	0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+	0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
+	0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+	0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
+	0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+	0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
+	0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+	0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
+	0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+	0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
+	0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+	0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
+	0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+	0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
+	0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+	0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+    };
+
+    const unsigned char *cursor = buf;
+    const unsigned char *end = cursor + buf_len;
+    uint32_t             state;
+
+    /* The running value is kept inverted; it is inverted on entry and
+     * again on exit, so chained calls can pass the previous result back
+     * in as 'in_crc32'.
+     */
+    state = in_crc32 ^ 0xFFFFFFFF;
+
+    while (cursor != end)
+    {
+	state = (state >> 8) ^ crc_table[(state ^ *cursor) & 0xFF];
+	cursor++;
+    }
+
+    return (state ^ 0xFFFFFFFF);
+}
+
+/* Returns nonzero when the lowest-addressed byte of a 16-bit value holds
+ * its least significant byte, i.e. on a little-endian host.
+ */
+pixman_bool_t
+is_little_endian (void)
+{
+    volatile uint16_t probe = 0x1234;
+    volatile uint8_t *first_byte = (volatile uint8_t *)&probe;
+
+    return (*first_byte == 0x34);
+}
+
+/* perform endian conversion of pixel data
+ */
+/* Rewrite the pixel data of 'img' in place, row by row:
+ *   1 bpp:        the bit order inside every byte is reversed
+ *   4 bpp:        the two nibbles of every byte are exchanged
+ *   16/24/32 bpp: the bytes of every complete 2/3/4-byte group are reversed
+ * Nothing is done on little-endian hosts or for 8 bpp images; any other
+ * depth trips an assertion.
+ */
+void
+image_endian_swap (pixman_image_t *img)
+{
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);
+    int bpp = PIXMAN_FORMAT_BPP (pixman_image_get_format (img));
+    int row;
+
+    /* swap bytes only on big endian systems; 8 bpp needs no swapping */
+    if (is_little_endian() || bpp == 8)
+	return;
+
+    for (row = 0; row < height; row++)
+    {
+	uint8_t *line = (uint8_t *)data + stride * row;
+	int x, k;
+
+	if (bpp == 1)
+	{
+	    for (x = 0; x < stride; x++)
+	    {
+		uint8_t in = line[x];
+		uint8_t out = 0;
+
+		/* Mirror the byte: bit k moves to bit 7 - k. */
+		for (k = 0; k < 8; k++)
+		{
+		    if (in & (1 << k))
+			out |= (uint8_t)(0x80 >> k);
+		}
+
+		line[x] = out;
+	    }
+	}
+	else if (bpp == 4)
+	{
+	    for (x = 0; x < stride; x++)
+		line[x] = (line[x] >> 4) | (line[x] << 4);
+	}
+	else if (bpp == 16 || bpp == 24 || bpp == 32)
+	{
+	    int unit = bpp / 8;
+
+	    /* Only complete groups are touched; leftover bytes at the end
+	     * of the row stay as they are.
+	     */
+	    for (x = 0; x + unit <= stride; x += unit)
+	    {
+		for (k = 0; k < unit / 2; k++)
+		{
+		    uint8_t tmp = line[x + k];
+
+		    line[x + k] = line[x + unit - 1 - k];
+		    line[x + unit - 1 - k] = tmp;
+		}
+	    }
+	}
+	else
+	{
+	    assert (FALSE);
+	}
+    }
+}
+
+/* Number of inaccessible pages placed before and after the payload of a
+ * fence_malloc() allocation.
+ */
+#define N_LEADING_PROTECTED	10
+#define N_TRAILING_PROTECTED	10
+
+/* Bookkeeping record that fence_malloc() stores at the start of the first
+ * page of a mapping and that fence_free() reads back.
+ */
+typedef struct
+{
+    void *addr;		/* address returned by mmap() */
+    uint32_t len;	/* length requested by the caller */
+    uint8_t *trailing;	/* start of the trailing protected pages */
+    int n_bytes;	/* size of the whole mapping, passed to munmap() */
+} info_t;
+
+#if defined(HAVE_MPROTECT) && defined(HAVE_GETPAGESIZE) && defined(HAVE_SYS_MMAN_H) && defined(HAVE_MMAP)
+
+/* This is apparently necessary on at least OS X */
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+/* Allocate 'len' bytes surrounded by inaccessible pages, so that reads or
+ * writes that stray far enough outside the buffer fault immediately.
+ *
+ * Layout of the mapping, in page units:
+ *
+ *   [info_t page][N_LEADING_PROTECTED][payload, rounded up][N_TRAILING_PROTECTED]
+ *
+ * The returned pointer is the start of the payload pages. Because the
+ * payload is rounded up to whole pages, an overrun smaller than the
+ * rounding slack is not detected.
+ *
+ * Aborts if 'len' is negative. Returns NULL if the request is too large
+ * for the bookkeeping fields, or if mmap() or mprotect() fails.
+ */
+void *
+fence_malloc (int64_t len)
+{
+    unsigned long page_size = getpagesize();
+    unsigned long page_mask = page_size - 1;
+    uint64_t fence_bytes =
+	(uint64_t) page_size * (N_LEADING_PROTECTED + N_TRAILING_PROTECTED + 2);
+    uint32_t n_payload_bytes;
+    uint32_t n_bytes;
+    uint8_t *initial_page;
+    uint8_t *leading_protected;
+    uint8_t *trailing_protected;
+    uint8_t *payload;
+    uint8_t *addr;
+
+    if (len < 0)
+	abort();
+
+    /* The sizes below live in 32-bit variables and info_t keeps the
+     * mapping size in an int. Refuse requests that do not fit instead of
+     * silently truncating them into a mapping smaller than 'len'.
+     */
+    if ((uint64_t) len > 0x7fffffff - fence_bytes - page_mask)
+    {
+	printf ("fence_malloc: request of %lld bytes is too large\n",
+		(long long int)len);
+	return NULL;
+    }
+
+    n_payload_bytes = ((uint64_t) len + page_mask) & ~(uint64_t) page_mask;
+    n_bytes = (fence_bytes + n_payload_bytes) & ~(uint64_t) page_mask;
+
+    addr = mmap (NULL, n_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
+		 -1, 0);
+
+    if (addr == MAP_FAILED)
+    {
+	printf ("mmap failed on %lld %u\n", (long long int)len, n_bytes);
+	return NULL;
+    }
+
+    /* Round up to a page boundary, then carve the mapping into regions. */
+    initial_page = (uint8_t *)(((unsigned long)addr + page_mask) & ~page_mask);
+    leading_protected = initial_page + page_size;
+    payload = leading_protected + N_LEADING_PROTECTED * page_size;
+    trailing_protected = payload + n_payload_bytes;
+
+    /* Record what fence_free() needs to unmap the whole region. */
+    ((info_t *)initial_page)->addr = addr;
+    ((info_t *)initial_page)->len = len;
+    ((info_t *)initial_page)->trailing = trailing_protected;
+    ((info_t *)initial_page)->n_bytes = n_bytes;
+
+    if ((mprotect (leading_protected, N_LEADING_PROTECTED * page_size,
+		  PROT_NONE) == -1) ||
+	(mprotect (trailing_protected, N_TRAILING_PROTECTED * page_size,
+		  PROT_NONE) == -1))
+    {
+	munmap (addr, n_bytes);
+	return NULL;
+    }
+
+    return payload;
+}
+
+/* Release a buffer obtained from fence_malloc().
+ *
+ * The info_t record is located by stepping back from the payload over the
+ * leading protected pages and one further page; the whole mapping
+ * recorded there is then unmapped.
+ *
+ * A NULL 'data' is ignored, matching the free()-based fallback, so the
+ * NULL that fence_malloc() returns on failure can be passed in safely.
+ */
+void
+fence_free (void *data)
+{
+    uint32_t page_size = getpagesize();
+    uint8_t *payload = data;
+    uint8_t *leading_protected;
+    uint8_t *initial_page;
+    info_t *info;
+
+    if (!data)
+	return;
+
+    leading_protected = payload - N_LEADING_PROTECTED * page_size;
+    initial_page = leading_protected - page_size;
+    info = (info_t *)initial_page;
+
+    munmap (info->addr, info->n_bytes);
+}
+
+#else
+
+/* Fallback used when the mmap()/mprotect() facilities are unavailable:
+ * a plain malloc() without any guard pages.
+ */
+void *
+fence_malloc (int64_t len)
+{
+    void *memory = malloc (len);
+
+    return memory;
+}
+
+/* Fallback counterpart of the malloc()-based fence_malloc(): releases
+ * 'data' with free().
+ */
+void
+fence_free (void *data)
+{
+    free (data);
+}
+
+#endif
+
+/* Allocate 'n_bytes' bytes with fence_malloc() and fill them with the low
+ * eight bits of successive lcg_rand() results.
+ *
+ * Returns the buffer, or NULL if fence_malloc() returned NULL.
+ */
+uint8_t *
+make_random_bytes (int n_bytes)
+{
+    uint8_t *buffer = fence_malloc (n_bytes);
+    int filled = 0;
+
+    if (buffer == NULL)
+	return NULL;
+
+    while (filled < n_bytes)
+    {
+	buffer[filled] = lcg_rand () & 0xff;
+	filled++;
+    }
+
+    return buffer;
+}
+
+#ifdef HAVE_LIBPNG
+
+/* Convert 'n_pixels' 32-bit pixels in place: each pixel is split into its
+ * a/r/g/b channels (a in bits 24-31, b in bits 0-7), the colour channels
+ * are divided by alpha when alpha is non-zero, and the result is written
+ * back as the four bytes R, G, B, A in memory order.
+ */
+static void
+pngify_pixels (uint32_t *pixels, int n_pixels)
+{
+    int idx;
+
+    for (idx = 0; idx < n_pixels; ++idx)
+    {
+	uint32_t argb = pixels[idx];
+	uint8_t *bytes = (uint8_t *)&pixels[idx];
+	uint8_t alpha = (argb >> 24) & 0xff;
+	uint8_t red = (argb >> 16) & 0xff;
+	uint8_t green = (argb >> 8) & 0xff;
+	uint8_t blue = argb & 0xff;
+
+	/* Undo the alpha multiplication; alpha 0 leaves the colours as is. */
+	if (alpha != 0)
+	{
+	    red = (red * 255) / alpha;
+	    green = (green * 255) / alpha;
+	    blue = (blue * 255) / alpha;
+	}
+
+	bytes[0] = red;
+	bytes[1] = green;
+	bytes[2] = blue;
+	bytes[3] = alpha;
+    }
+}
+
+/* Write 'image' to 'filename' as an 8-bit RGBA PNG.
+ *
+ * The image is first composited (PIXMAN_OP_SRC) into a temporary a8r8g8b8
+ * copy, whose pixels are converted with pngify_pixels() and handed to
+ * libpng one row at a time.
+ *
+ * Returns TRUE on success. Returns FALSE if the file cannot be opened, an
+ * allocation fails, the temporary image or the libpng structures cannot
+ * be created, or fclose() reports an error. All temporary resources are
+ * released on every path.
+ */
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename)
+{
+    int width = pixman_image_get_width (image);
+    int height = pixman_image_get_height (image);
+    int stride = width * 4;
+    uint32_t *data = NULL;
+    png_bytep *row_pointers = NULL;
+    pixman_image_t *copy = NULL;
+    png_struct *write_struct = NULL;
+    png_info *info_struct = NULL;
+    pixman_bool_t result = FALSE;
+    FILE *f;
+    int i;
+
+    /* Open the file before allocating anything, so that a failure here
+     * has nothing to clean up.
+     */
+    f = fopen (filename, "wb");
+    if (!f)
+	return FALSE;
+
+    data = malloc (height * stride);
+    row_pointers = malloc (height * sizeof (png_bytep));
+    if (!data || !row_pointers)
+	goto out;
+
+    copy = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8, width, height, data, stride);
+    if (!copy)
+	goto out;
+
+    pixman_image_composite32 (
+	PIXMAN_OP_SRC, image, NULL, copy, 0, 0, 0, 0, 0, 0, width, height);
+
+    pngify_pixels (data, height * width);
+
+    for (i = 0; i < height; ++i)
+	row_pointers[i] = (png_bytep)(data + i * width);
+
+    if (!(write_struct = png_create_write_struct (
+	      PNG_LIBPNG_VER_STRING, NULL, NULL, NULL)))
+	goto out;
+
+    if (!(info_struct = png_create_info_struct (write_struct)))
+	goto out;
+
+    png_init_io (write_struct, f);
+
+    png_set_IHDR (write_struct, info_struct, width, height,
+		  8, PNG_COLOR_TYPE_RGB_ALPHA,
+		  PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
+		  PNG_FILTER_TYPE_BASE);
+
+    png_write_info (write_struct, info_struct);
+
+    png_write_image (write_struct, row_pointers);
+
+    png_write_end (write_struct, NULL);
+
+    result = TRUE;
+
+out:
+    if (write_struct)
+	png_destroy_write_struct (&write_struct, &info_struct);
+
+    /* A failing fclose() can mean buffered data was not written. */
+    if (fclose (f) != 0)
+	result = FALSE;
+
+    if (copy)
+	pixman_image_unref (copy);
+    free (row_pointers);
+    free (data);
+    return result;
+}
+
+#else /* no libpng */
+
+/* Stub used when libpng is not available: nothing is written and FALSE
+ * is always returned.
+ */
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename)
+{
+    return FALSE;
+}
+
+#endif
+
+/*
+ * A function, which can be used as a core part of the test programs,
+ * intended to detect various problems with the help of fuzzing input
+ * to pixman API (according to some templates, aka "smart" fuzzing).
+ * Some general information about such testing can be found here:
+ * http://en.wikipedia.org/wiki/Fuzz_testing
+ *
+ * It may help detecting:
+ *  - crashes on bad handling of valid or reasonably invalid input to
+ *    pixman API.
+ *  - deviations from the behavior of older pixman releases.
+ *  - deviations from the behavior of the same pixman release, but
+ *    configured in a different way (for example with SIMD optimizations
+ *    disabled), or running on a different OS or hardware.
+ *
+ * The test is performed by calling a callback function a huge number
+ * of times. The callback function is expected to run some snippet of
+ * pixman code with pseudorandom variations to the data fed to the
+ * pixman API. The result of running each callback function should be
+ * some deterministic value which depends on test number (test number
+ * can be used as a seed for PRNG). When 'verbose' argument is nonzero,
+ * callback function is expected to print to stdout some information
+ * about what it does.
+ *
+ * Return values from many small tests are accumulated together and
+ * used as final checksum, which can be compared to some expected
+ * value. Running the tests not individually, but in a batch helps
+ * to reduce process start overhead and also allows to parallelize
+ * testing and utilize multiple CPU cores.
+ *
+ * The resulting executable can be run without any arguments. In
+ * this case it runs a batch of tests starting from 1 and up to
+ * 'default_number_of_iterations'. The resulting checksum is
+ * compared with 'expected_checksum' and FAIL or PASS verdict
+ * depends on the result of this comparison.
+ *
+ * If the executable is run with 2 numbers provided as command line
+ * arguments, they specify the starting and ending numbers for a test
+ * batch.
+ *
+ * If the executable is run with only one number provided as a command
+ * line argument, then this number is used to call the callback function
+ * once, and also with verbose flag set.
+ */
+/* Parameters:
+ *   test_name                    - name printed in the pass/fail message
+ *   default_number_of_iterations - last test number of the default batch
+ *   expected_checksum            - checksum the default batch must produce
+ *   test_function                - callback run once per test number
+ *   argc, argv                   - command line of the test program
+ *
+ * Returns 1 if the range given on the command line is invalid or the
+ * default batch does not match 'expected_checksum', and 0 otherwise.
+ */
+int
+fuzzer_test_main (const char *test_name,
+		  int         default_number_of_iterations,
+		  uint32_t    expected_checksum,
+		  uint32_t    (*test_function)(int testnum, int verbose),
+		  int         argc,
+		  const char *argv[])
+{
+    int i, n1 = 1, n2 = 0;
+    uint32_t checksum = 0;
+    int verbose = getenv ("VERBOSE") != NULL;
+
+    if (argc >= 3)
+    {
+	/* Two numbers: explicit first and last test number. */
+	n1 = atoi (argv[1]);
+	n2 = atoi (argv[2]);
+	if (n2 < n1)
+	{
+	    printf ("invalid test range\n");
+	    return 1;
+	}
+    }
+    else if (argc >= 2)
+    {
+	/* One number: run just that test, with the callback's verbose
+	 * flag set, and print its checksum.
+	 */
+	n2 = atoi (argv[1]);
+	checksum = test_function (n2, 1);
+	printf ("%d: checksum=%08X\n", n2, checksum);
+	return 0;
+    }
+    else
+    {
+	n1 = 1;
+	n2 = default_number_of_iterations;
+    }
+
+#ifdef USE_OPENMP
+    #pragma omp parallel for reduction(+:checksum) default(none) \
+					shared(n1, n2, test_function, verbose)
+#endif
+    for (i = n1; i <= n2; i++)
+    {
+	/* In batch mode the callback always runs non-verbose; the VERBOSE
+	 * environment variable only controls the per-test line below.
+	 */
+	uint32_t crc = test_function (i, 0);
+	if (verbose)
+	    printf ("%d: %08X\n", i, crc);
+	checksum += crc;
+    }
+
+    /* The pass/fail verdict is only given for the default batch range. */
+    if (n1 == 1 && n2 == default_number_of_iterations)
+    {
+	if (checksum == expected_checksum)
+	{
+	    printf ("%s test passed (checksum=%08X)\n",
+		    test_name, checksum);
+	}
+	else
+	{
+	    printf ("%s test failed! (checksum=%08X, expected %08X)\n",
+		    test_name, checksum, expected_checksum);
+	    return 1;
+	}
+    }
+    else
+    {
+	printf ("%d-%d: checksum=%08X\n", n1, n2, checksum);
+    }
+
+    return 0;
+}
+
+/* Return a time stamp in seconds: the gettimeofday() wall clock when that
+ * function is available, otherwise the clock() value divided by
+ * CLOCKS_PER_SEC.
+ */
+double
+gettime (void)
+{
+#ifdef HAVE_GETTIMEOFDAY
+    struct timeval now;
+    int64_t microseconds;
+
+    gettimeofday (&now, NULL);
+
+    microseconds = (int64_t)now.tv_sec * 1000000 + now.tv_usec;
+    return (double)microseconds / 1000000.;
+#else
+    clock_t ticks = clock();
+
+    return (double)ticks / (double)CLOCKS_PER_SEC;
+#endif
+}
+
+/* Derive a seed from the current time: the first four bytes of the
+ * double returned by gettime() seed the generator, and the next
+ * lcg_rand_u32() value is returned.
+ *
+ * The bytes are copied with memcpy() because reading a double through a
+ * uint32_t pointer violates C's aliasing rules.
+ */
+uint32_t
+get_random_seed (void)
+{
+    double d = gettime();
+    uint32_t bits;
+
+    memcpy (&bits, &d, sizeof (bits));
+
+    lcg_srand (bits);
+
+    return lcg_rand_u32 ();
+}
+
+/* Message printed by on_alarm(); set by fail_after(). */
+static const char *global_msg;
+
+/* SIGALRM handler: print the stored message and exit with status 1.
+ * NOTE(review): printf() and exit() are not async-signal-safe; this is
+ * presumably tolerated because the process is being abandoned anyway.
+ */
+static void
+on_alarm (int signo)
+{
+    const char *message = global_msg;
+
+    printf ("%s\n", message);
+    exit (1);
+}
+
+/* Arrange for the process to print 'msg' and exit with status 1 if it is
+ * still running after 'seconds' seconds. Does nothing unless both
+ * sigaction() and alarm() are available.
+ *
+ * 'msg' is stored by pointer, so it must stay valid until the alarm
+ * fires.
+ */
+void
+fail_after (int seconds, const char *msg)
+{
+#if defined(HAVE_SIGACTION) && defined(HAVE_ALARM)
+    struct sigaction action;
+
+    global_msg = msg;
+
+    memset (&action, 0, sizeof (action));
+    action.sa_handler = on_alarm;
+
+    /* Install the handler before arming the timer, so the signal can
+     * never arrive while the default SIGALRM disposition (terminate
+     * without a message) is still in place.
+     */
+    sigaction (SIGALRM, &action, NULL);
+
+    alarm (seconds);
+#else
+    (void) seconds;
+    (void) msg;
+#endif
+}
+
+/* Turn floating point exceptions into traps where feenableexcept() is
+ * available; a no-op otherwise.
+ */
+void
+enable_fp_exceptions (void)
+{
+#if defined(HAVE_FENV_H) && defined(HAVE_FEENABLEEXCEPT)
+    /* FE_INEXACT is deliberately left out because it is raised quite
+     * commonly. Overflow and underflow could arguably be treated the
+     * same way, but the test suite currently passes with them enabled
+     * and it is useful to know if they start occurring.
+     */
+    const int trapped =
+	FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW | FE_UNDERFLOW;
+
+    feenableexcept (trapped);
+#endif
+}
+
+/* Allocate 'size' bytes, aligned to 'align' when posix_memalign() is
+ * available. Without it the request falls back to plain malloc() and
+ * 'align' is ignored.
+ *
+ * Returns the allocation, or NULL on failure.
+ */
+void *
+aligned_malloc (size_t align, size_t size)
+{
+#ifdef HAVE_POSIX_MEMALIGN
+    void *block;
+
+    if (posix_memalign (&block, align, size) == 0)
+	return block;
+
+    return NULL;
+#else
+    return malloc (size);
+#endif
+}
+
+/* Reduce a 24-bit colour 'c' to a 15-bit value.
+ *
+ * With a nonzero 'is_rgb' the top five bits of each 8-bit channel are
+ * packed into bits 0-4, 5-9 and 10-14. Otherwise the three channels are
+ * combined with the weights 153, 301 and 58 (which sum to 512) and the
+ * sum is shifted right by two.
+ *
+ * Both arguments are parenthesised so that any expression can be passed;
+ * 'c' is evaluated more than once.
+ */
+#define CONVERT_15(c, is_rgb)						\
+    ((is_rgb) ?								\
+     ((((c) >> 3) & 0x001f) |						\
+      (((c) >> 6) & 0x03e0) |						\
+      (((c) >> 9) & 0x7c00)) :						\
+     (((((c) >> 16) & 0xff) * 153 +					\
+       (((c) >>  8) & 0xff) * 301 +					\
+       (((c)      ) & 0xff) * 58) >> 2))
+
+/* Fill 'palette' with random contents that still round-trip.
+ *
+ * palette: table to initialize; 'ent' maps a 15-bit colour to an index
+ *          and 'rgba' maps an index to a colour.
+ * depth:   number of index bits; indices 0 .. (1 << depth) - 1 are set up.
+ * is_rgb:  selects the CONVERT_15 variant used for the 15-bit colour.
+ *
+ * NOTE(review): '1 << depth' is an int shift, so depth must stay well
+ * below the width of int -- presumably callers only pass small depths.
+ */
+void
+initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb)
+{
+    int i;
+    uint32_t mask = (1 << depth) - 1;
+
+    /* Start with a random index in every colour -> index slot. */
+    for (i = 0; i < 32768; ++i)
+	palette->ent[i] = lcg_rand() & mask;
+
+    memset (palette->rgba, 0, sizeof (palette->rgba));
+
+    for (i = 0; i < mask + 1; ++i)
+    {
+	uint32_t rgba24;
+ 	pixman_bool_t retry;
+	uint32_t i15;
+
+	/* We filled the rgb->index map with random numbers, but we
+	 * do need the ability to round trip, that is if some indexed
+	 * color expands to an argb24, then the 15 bit version of that
+	 * color must map back to the index. Anything else, we don't
+	 * care about too much.
+	 */
+	do
+	{
+	    uint32_t old_idx;
+
+	    rgba24 = lcg_rand();
+	    i15 = CONVERT_15 (rgba24, is_rgb);
+
+	    /* Draw again if the slot for this 15-bit colour currently
+	     * points at an index whose stored colour converts to the
+	     * same 15-bit value, i.e. the slot is already in use.
+	     */
+	    old_idx = palette->ent[i15];
+	    if (CONVERT_15 (palette->rgba[old_idx], is_rgb) == i15)
+		retry = 1;
+	    else
+		retry = 0;
+	} while (retry);
+
+	palette->rgba[i] = rgba24;
+	palette->ent[i15] = i;
+    }
+
+    /* Verify the round trip index -> colour -> 15-bit colour -> index. */
+    for (i = 0; i < mask + 1; ++i)
+    {
+	assert (palette->ent[CONVERT_15 (palette->rgba[i], is_rgb)] == i);
+    }
+}
diff --git a/test/utils.h b/test/utils.h
new file mode 100755
index 0000000..b23925c
--- /dev/null
+++ b/test/utils.h
@@ -0,0 +1,154 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <assert.h>
+#include "pixman-private.h" /* For 'inline' definition */
+
+/* Number of elements of array A, as an int. A must be a real array, not
+ * a pointer, because the count is derived from sizeof.
+ */
+#define ARRAY_LENGTH(A) ((int) (sizeof (A) / sizeof ((A) [0])))
+
+/* A primitive pseudorandom number generator,
+ * taken from POSIX.1-2001 example
+ */
+
+extern uint32_t lcg_seed;
+#ifdef USE_OPENMP
+#pragma omp threadprivate(lcg_seed)
+#endif
+
+/* Advance the generator state by one step and return a 15-bit value
+ * (0 .. 32767) taken from bits 16-30 of the new state.
+ */
+static inline uint32_t
+lcg_rand (void)
+{
+    uint32_t next = lcg_seed * 1103515245 + 12345;
+
+    lcg_seed = next;
+    return (next >> 16) & 0x7fff;
+}
+
+/* Set the generator state to 'seed'; the sequence returned by lcg_rand()
+ * afterwards depends only on this value.
+ */
+static inline void
+lcg_srand (uint32_t seed)
+{
+    lcg_seed = seed;
+}
+
+/* Return one lcg_rand() result reduced modulo 'max'. 'max' must not be
+ * zero; since lcg_rand() yields 15 bits, the result never exceeds 32767.
+ */
+static inline uint32_t
+lcg_rand_n (int max)
+{
+    uint32_t draw = lcg_rand ();
+
+    return draw % max;
+}
+
+/* Return a 30-bit random number reduced modulo 'max'. The first
+ * lcg_rand() result supplies the low 15 bits and the second one the high
+ * 15 bits. 'max' must not be zero.
+ */
+static inline uint32_t
+lcg_rand_N (int max)
+{
+    uint32_t low = lcg_rand ();
+    uint32_t high = lcg_rand ();
+    uint32_t combined = low | (high << 15);
+
+    return combined % max;
+}
+
+/* Build a full 32-bit random number from three lcg_rand() results.
+ *
+ * Each result is 15 bits wide. The first is shifted right by 5 and lands
+ * in bits 0-9, the second is shifted left by 6 into bits 6-20 and the
+ * third left by 17 into bits 17-31; the overlapping parts are mixed by
+ * XOR.
+ */
+static inline uint32_t
+lcg_rand_u32 (void)
+{
+    uint32_t low_part = lcg_rand() >> 5;
+    uint32_t middle_part = lcg_rand() << 6;
+    uint32_t high_part = lcg_rand() << 17;
+
+    return (high_part ^ middle_part ^ low_part);
+}
+
+/* CRC 32 computation
+ */
+uint32_t
+compute_crc32 (uint32_t    in_crc32,
+	       const void *buf,
+	       size_t      buf_len);
+
+/* Returns TRUE if running on a little endian system */
+pixman_bool_t
+is_little_endian (void);
+
+/* perform endian conversion of pixel data
+ */
+void
+image_endian_swap (pixman_image_t *img);
+
+/* Allocate memory that is bounded by protected pages,
+ * so that out-of-bounds access will cause segfaults
+ */
+void *
+fence_malloc (int64_t len);
+
+void
+fence_free (void *data);
+
+/* Generate n_bytes random bytes in fence_malloced memory */
+uint8_t *
+make_random_bytes (int n_bytes);
+
+/* Return current time in seconds */
+double
+gettime (void);
+
+uint32_t
+get_random_seed (void);
+
+/* main body of the fuzzer test */
+int
+fuzzer_test_main (const char *test_name,
+		  int         default_number_of_iterations,
+		  uint32_t    expected_checksum,
+		  uint32_t    (*test_function)(int testnum, int verbose),
+		  int         argc,
+		  const char *argv[]);
+
+void
+fail_after (int seconds, const char *msg);
+
+/* If possible, enable traps for floating point exceptions */
+void enable_fp_exceptions(void);
+
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename);
+
+/* A pair of macros which can help to detect corruption of
+ * floating point registers after a function call. This may
+ * happen if _mm_empty() call is forgotten in MMX/SSE2 fast
+ * path code, or ARM NEON assembly optimized function forgets
+ * to save/restore d8-d15 registers before use.
+ */
+
+/* Declares eight static volatile constants and copies each of them into
+ * a local 'canary' double. As it expands to declarations, it has to be
+ * placed where declarations are allowed, before the code under test.
+ */
+#define FLOAT_REGS_CORRUPTION_DETECTOR_START()                 \
+    static volatile double frcd_volatile_constant1 = 123451;   \
+    static volatile double frcd_volatile_constant2 = 123452;   \
+    static volatile double frcd_volatile_constant3 = 123453;   \
+    static volatile double frcd_volatile_constant4 = 123454;   \
+    static volatile double frcd_volatile_constant5 = 123455;   \
+    static volatile double frcd_volatile_constant6 = 123456;   \
+    static volatile double frcd_volatile_constant7 = 123457;   \
+    static volatile double frcd_volatile_constant8 = 123458;   \
+    double frcd_canary_variable1 = frcd_volatile_constant1;    \
+    double frcd_canary_variable2 = frcd_volatile_constant2;    \
+    double frcd_canary_variable3 = frcd_volatile_constant3;    \
+    double frcd_canary_variable4 = frcd_volatile_constant4;    \
+    double frcd_canary_variable5 = frcd_volatile_constant5;    \
+    double frcd_canary_variable6 = frcd_volatile_constant6;    \
+    double frcd_canary_variable7 = frcd_volatile_constant7;    \
+    double frcd_canary_variable8 = frcd_volatile_constant8;
+
+/* Asserts that every canary still equals the constant it was copied
+ * from. Must be used in the same scope as the matching START macro.
+ */
+#define FLOAT_REGS_CORRUPTION_DETECTOR_FINISH()                \
+    assert (frcd_canary_variable1 == frcd_volatile_constant1); \
+    assert (frcd_canary_variable2 == frcd_volatile_constant2); \
+    assert (frcd_canary_variable3 == frcd_volatile_constant3); \
+    assert (frcd_canary_variable4 == frcd_volatile_constant4); \
+    assert (frcd_canary_variable5 == frcd_volatile_constant5); \
+    assert (frcd_canary_variable6 == frcd_volatile_constant6); \
+    assert (frcd_canary_variable7 == frcd_volatile_constant7); \
+    assert (frcd_canary_variable8 == frcd_volatile_constant8);
+
+/* Try to get an aligned memory chunk */
+void *
+aligned_malloc (size_t align, size_t size);
+
+void
+initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb);
-- 
2.7.4