pixman: Branch 'master' - 35 commits

Søren Sandmann Pedersen sandmann at kemper.freedesktop.org
Mon Aug 10 23:07:57 PDT 2009


 pixman/pixman-combine.c.template |   22 -
 pixman/pixman-combine.h.template |  257 +++++++-------
 pixman/pixman-fast-path.c        |   23 -
 pixman/pixman-mmx.c              |   66 +--
 pixman/pixman-sse2.c             |   32 -
 pixman/pixman-utils.c            |   10 
 pixman/pixman-vmx.c              |  707 ++++++++++++++++-----------------------
 test/Makefile.am                 |    2 
 test/blitters-test-bisect.rb     |   43 ++
 test/blitters-test.c             |  638 +++++++++++++++++++++++++++++++++++
 10 files changed, 1176 insertions(+), 624 deletions(-)

New commits:
commit d6016d406a649f7a95bec2a477dfd89ba280188d
Merge: 93923c6... e084351...
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Tue Aug 11 02:04:40 2009 -0400

    Merge branch 'blitter-test'

commit e084351b13faad6a3ba67808b5721957b51d16f0
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 00:45:53 2009 -0400

    Update CRC value in blitters-test.
    
    At this point, the SIMD, SSE2, MMX and general implementations all
    agree.

diff --git a/test/blitters-test.c b/test/blitters-test.c
index cc829b5..4838e81 100644
--- a/test/blitters-test.c
+++ b/test/blitters-test.c
@@ -623,7 +623,7 @@ main (int argc, char *argv[])
 	    /* Predefined value for running with all the fastpath functions
 	       disabled. It needs to be updated every time when changes are
 	       introduced to this program or behavior of pixman changes! */
-	    if (crc == 0x4895C7B0)
+	    if (crc == 0xFE1244BF)
 	    {
 		printf ("blitters test passed\n");
 	    }
commit ba5c5325e77b36374d3be22bd92816c332a321bb
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 00:25:56 2009 -0400

    Various formatting fixes

diff --git a/test/blitters-test.c b/test/blitters-test.c
index 350210a..cc829b5 100644
--- a/test/blitters-test.c
+++ b/test/blitters-test.c
@@ -91,8 +91,8 @@ lcg_rand_n (int max)
 
 static uint32_t
 compute_crc32 (uint32_t    in_crc32,
-		  const void *buf,
-		  size_t      buf_len)
+	       const void *buf,
+	       size_t      buf_len)
 {
     static const uint32_t crc_table[256] = {
 	0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
@@ -144,7 +144,7 @@ compute_crc32 (uint32_t    in_crc32,
     unsigned char *       byte_buf;
     size_t                i;
 
-    /** accumulate crc32 for buffer **/
+    /* accumulate crc32 for buffer */
     crc32 = in_crc32 ^ 0xFFFFFFFF;
     byte_buf = (unsigned char*) buf;
 
@@ -165,7 +165,8 @@ image_endian_swap (pixman_image_t *img, int bpp)
 
     /* swap bytes only on big endian systems */
     volatile uint16_t endian_check_var = 0x1234;
-    if (*(volatile uint8_t *)&endian_check_var != 0x12) return;
+    if (*(volatile uint8_t *)&endian_check_var != 0x12)
+	return;
 
     for (i = 0; i < height; i++)
     {
@@ -198,6 +199,7 @@ image_endian_swap (pixman_image_t *img, int bpp)
 	    {
 		char t1 = line_data[j + 0];
 		char t2 = line_data[j + 1];
+
 		line_data[j + 1] = t1;
 		line_data[j + 0] = t2;
 	    }
@@ -208,6 +210,7 @@ image_endian_swap (pixman_image_t *img, int bpp)
 		char t1 = line_data[j + 0];
 		char t2 = line_data[j + 1];
 		char t3 = line_data[j + 2];
+
 		line_data[j + 2] = t1;
 		line_data[j + 1] = t2;
 		line_data[j + 0] = t3;
@@ -220,6 +223,7 @@ image_endian_swap (pixman_image_t *img, int bpp)
 		char t2 = line_data[j + 1];
 		char t3 = line_data[j + 2];
 		char t4 = line_data[j + 3];
+
 		line_data[j + 3] = t1;
 		line_data[j + 2] = t2;
 		line_data[j + 1] = t3;
@@ -234,22 +238,24 @@ image_endian_swap (pixman_image_t *img, int bpp)
 
 /* Create random image for testing purposes */
 static pixman_image_t *
-create_random_image (
-	pixman_format_code_t *allowed_formats,
-	int max_width,
-	int max_height,
-	int max_extra_stride,
-	pixman_format_code_t *used_fmt)
+create_random_image (pixman_format_code_t *allowed_formats,
+		     int                   max_width,
+		     int                   max_height,
+		     int                   max_extra_stride,
+		     pixman_format_code_t *used_fmt)
 {
     int n = 0, i, width, height, stride;
     pixman_format_code_t fmt;
     uint32_t *buf;
     pixman_image_t *img;
-    while (allowed_formats[n] != -1) n++;
+
+    while (allowed_formats[n] != -1)
+	n++;
     fmt = allowed_formats[lcg_rand_n (n)];
     width = lcg_rand_n (max_width) + 1;
     height = lcg_rand_n (max_height) + 1;
-    stride = (width * PIXMAN_FORMAT_BPP (fmt) + 7) / 8 + lcg_rand_n (max_extra_stride + 1);
+    stride = (width * PIXMAN_FORMAT_BPP (fmt) + 7) / 8 +
+	lcg_rand_n (max_extra_stride + 1);
     stride = (stride + 3) & ~3;
 
     /* do the allocation */
@@ -259,13 +265,13 @@ create_random_image (
     for (i = 0; i < stride * height; i++)
     {
 	/* generation is biased to having more 0 or 255 bytes as
-	   they are more likely to be special-cased in code */
+	 * they are more likely to be special-cased in code
+	 */
 	*((uint8_t *)buf + i) = lcg_rand_n (4) ? lcg_rand_n (256) :
 	    (lcg_rand_n (2) ? 0 : 255);
     }
 
-    img = pixman_image_create_bits (
-	fmt, width, height, buf, stride);
+    img = pixman_image_create_bits (fmt, width, height, buf, stride);
 
     image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt));
 
@@ -275,10 +281,9 @@ create_random_image (
 
 /* Free random image, and optionally update crc32 based on its data */
 static uint32_t
-free_random_image (
-    uint32_t initcrc,
-    pixman_image_t *img,
-    pixman_format_code_t fmt)
+free_random_image (uint32_t initcrc,
+		   pixman_image_t *img,
+		   pixman_format_code_t fmt)
 {
     uint32_t crc32 = 0;
     int stride = pixman_image_get_stride (img);
@@ -294,19 +299,24 @@ free_random_image (
 	    int i;
 	    uint32_t *data = pixman_image_get_data (img);
 	    uint32_t mask = (1 << PIXMAN_FORMAT_DEPTH (fmt)) - 1;
+
 	    for (i = 0; i < 32; i++)
 		mask |= mask << (i * PIXMAN_FORMAT_BPP (fmt));
 
 	    for (i = 0; i < stride * height / 4; i++)
 		data[i] &= mask;
 	}
+
 	/* swap endiannes in order to provide identical results on both big
-	   and litte endian systems */
+	 * and litte endian systems
+	 */
 	image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt));
 	crc32 = compute_crc32 (initcrc, data, stride * height);
     }
+
     pixman_image_unref (img);
     free (data);
+
     return crc32;
 }
 
@@ -370,6 +380,7 @@ static pixman_op_t op_list[] = {
     PIXMAN_OP_HSL_LUMINOSITY,
 #endif
 };
+
 static pixman_format_code_t img_fmt_list[] = {
     PIXMAN_a8r8g8b8,
     PIXMAN_x8r8g8b8,
@@ -419,6 +430,7 @@ static pixman_format_code_t img_fmt_list[] = {
     PIXMAN_a1,
     -1
 };
+
 static pixman_format_code_t mask_fmt_list[] = {
     PIXMAN_a8r8g8b8,
     PIXMAN_a8,
@@ -452,9 +464,15 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 
     max_width = max_height = 24 + testnum / 10000;
     max_extra_stride = 4 + testnum / 1000000;
-    if (max_width > 256) max_width = 256;
-    if (max_height > 16) max_height = 16;
-    if (max_extra_stride > 8) max_extra_stride = 8;
+
+    if (max_width > 256)
+	max_width = 256;
+
+    if (max_height > 16)
+	max_height = 16;
+
+    if (max_extra_stride > 8)
+	max_extra_stride = 8;
 
     lcg_srand (testnum);
 
@@ -464,21 +482,23 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     {
 	/* normal image */
 	src_img = create_random_image (img_fmt_list, max_width, max_height,
-				      max_extra_stride, &src_fmt);
+				       max_extra_stride, &src_fmt);
     }
     else
     {
 	/* solid case */
 	src_img = create_random_image (img_fmt_list, 1, 1,
-				      max_extra_stride, &src_fmt);
+				       max_extra_stride, &src_fmt);
+
 	pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
     }
 
     dst_img = create_random_image (img_fmt_list, max_width, max_height,
-				  max_extra_stride, &dst_fmt);
+				   max_extra_stride, &dst_fmt);
 
     mask_img = NULL;
     mask_fmt = -1;
+
     if (lcg_rand_n (2))
     {
 	if (lcg_rand_n (2))
@@ -493,6 +513,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 					   max_extra_stride, &mask_fmt);
 	    pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
 	}
+
 	if (lcg_rand_n (2))
 	    pixman_image_set_component_alpha (mask_img, 1);
     }
@@ -500,15 +521,18 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     src_width = pixman_image_get_width (src_img);
     src_height = pixman_image_get_height (src_img);
     src_stride = pixman_image_get_stride (src_img);
+
     dst_width = pixman_image_get_width (dst_img);
     dst_height = pixman_image_get_height (dst_img);
     dst_stride = pixman_image_get_stride (dst_img);
+
     dstbuf = pixman_image_get_data (dst_img);
 
     src_x = lcg_rand_n (src_width);
     src_y = lcg_rand_n (src_height);
     dst_x = lcg_rand_n (dst_width);
     dst_y = lcg_rand_n (dst_height);
+
     w = lcg_rand_n (dst_width - dst_x + 1);
     h = lcg_rand_n (dst_height - dst_y + 1);
 
@@ -531,6 +555,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     if (verbose)
     {
 	int j;
+
 	printf ("---\n");
 	for (i = 0; i < dst_height; i++)
 	{
@@ -538,6 +563,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 	    {
 		if (j == (dst_width * PIXMAN_FORMAT_BPP (dst_fmt) + 7) / 8)
 		    printf ("| ");
+
 		printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
 	    }
 	    printf ("\n");
@@ -547,7 +573,10 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 
     free_random_image (initcrc, src_img, -1);
     crc32 = free_random_image (initcrc, dst_img, dst_fmt);
-    if (mask_img) free_random_image (initcrc, mask_img, -1);
+
+    if (mask_img)
+	free_random_image (initcrc, mask_img, -1);
+
     return crc32;
 }
 
commit cc5c59b3f25dc595ce17a876e89ca549bb477e46
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Wed Aug 5 16:28:10 2009 -0400

    Add the ability to print intermediate CRC values
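
    A usage sketch (the binary path is an assumption; the environment
    variable check comes from the diff below):

        VERBOSE=1 ./blitters-test 1 100

    This prints the accumulated CRC after each test number, making it
    easy to spot the first iteration where two builds diverge.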

diff --git a/test/blitters-test.c b/test/blitters-test.c
index 27fa956..350210a 100644
--- a/test/blitters-test.c
+++ b/test/blitters-test.c
@@ -556,6 +556,7 @@ main (int argc, char *argv[])
 {
     int i, n1 = 1, n2 = 0;
     uint32_t crc = 0;
+    int verbose = getenv ("VERBOSE") != NULL;
 
     if (argc >= 3)
     {
@@ -582,6 +583,9 @@ main (int argc, char *argv[])
 	for (i = n1; i <= n2; i++)
 	{
 	    crc = test_composite (crc, i, 0);
+
+	    if (verbose)
+		printf ("%d: %08X\n", i, crc);
 	}
 	printf ("crc32=%08X\n", crc);
 
commit 0bc4adae3eb758a2cd9026397c284bd6cc7bcd65
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Wed Aug 5 15:53:33 2009 -0400

    Reenable commented-out tests in blitters-test.
    
    The crashes and valgrind issues are all fixed at this point.

diff --git a/test/blitters-test.c b/test/blitters-test.c
index 6ce877c..27fa956 100644
--- a/test/blitters-test.c
+++ b/test/blitters-test.c
@@ -314,7 +314,6 @@ static pixman_op_t op_list[] = {
     PIXMAN_OP_SRC,
     PIXMAN_OP_OVER,
     PIXMAN_OP_ADD,
-#if 0
     PIXMAN_OP_CLEAR,
     PIXMAN_OP_SRC,
     PIXMAN_OP_DST,
@@ -331,9 +330,7 @@ static pixman_op_t op_list[] = {
     PIXMAN_OP_SATURATE,
     PIXMAN_OP_DISJOINT_CLEAR,
     PIXMAN_OP_DISJOINT_SRC,
-#if 0 /* using this crashes the test */
     PIXMAN_OP_DISJOINT_DST,
-#endif
     PIXMAN_OP_DISJOINT_OVER,
     PIXMAN_OP_DISJOINT_OVER_REVERSE,
     PIXMAN_OP_DISJOINT_IN,
@@ -345,9 +342,7 @@ static pixman_op_t op_list[] = {
     PIXMAN_OP_DISJOINT_XOR,
     PIXMAN_OP_CONJOINT_CLEAR,
     PIXMAN_OP_CONJOINT_SRC,
-#if 0 /* using this crashes the test */
     PIXMAN_OP_CONJOINT_DST,
-#endif
     PIXMAN_OP_CONJOINT_OVER,
     PIXMAN_OP_CONJOINT_OVER_REVERSE,
     PIXMAN_OP_CONJOINT_IN,
@@ -374,7 +369,6 @@ static pixman_op_t op_list[] = {
     PIXMAN_OP_HSL_COLOR,
     PIXMAN_OP_HSL_LUMINOSITY,
 #endif
-#endif
 };
 static pixman_format_code_t img_fmt_list[] = {
     PIXMAN_a8r8g8b8,
@@ -382,7 +376,6 @@ static pixman_format_code_t img_fmt_list[] = {
     PIXMAN_r5g6b5,
     PIXMAN_r3g3b2,
     PIXMAN_a8,
-#if 0
     PIXMAN_a8b8g8r8,
     PIXMAN_x8b8g8r8,
     PIXMAN_b8g8r8a8,
@@ -391,12 +384,10 @@ static pixman_format_code_t img_fmt_list[] = {
     PIXMAN_b8g8r8,
     PIXMAN_r5g6b5,
     PIXMAN_b5g6r5,
-#if 0 /* using these makes valgrind complain */
     PIXMAN_x2r10g10b10,
     PIXMAN_a2r10g10b10,
     PIXMAN_x2b10g10r10,
     PIXMAN_a2b10g10r10,
-#endif
     PIXMAN_a1r5g5b5,
     PIXMAN_x1r5g5b5,
     PIXMAN_a1b5g5r5,
@@ -426,16 +417,13 @@ static pixman_format_code_t img_fmt_list[] = {
     PIXMAN_a1r1g1b1,
     PIXMAN_a1b1g1r1,
     PIXMAN_a1,
-#endif
     -1
 };
 static pixman_format_code_t mask_fmt_list[] = {
     PIXMAN_a8r8g8b8,
     PIXMAN_a8,
-#if 0
     PIXMAN_a4,
     PIXMAN_a1,
-#endif
     -1
 };
 
@@ -505,10 +493,8 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 					   max_extra_stride, &mask_fmt);
 	    pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
 	}
-#if 0 /* using this crashes the test */
 	if (lcg_rand_n (2))
 	    pixman_image_set_component_alpha (mask_img, 1);
-#endif
     }
 
     src_width = pixman_image_get_width (src_img);
commit 9ee18806a944ddde36dc1b045f89f02d025cbe48
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Mon Aug 3 00:01:01 2009 +0300

    One more update to blitters-test - use aligned memory
    allocations in order to make the reproduction of
    alignment-sensitive bugs more deterministic.
    Also, testing of masks is reenabled.
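
    A minimal sketch of the idea, using glibc's memalign() from
    <malloc.h> as the commit below does (the helper name is made up):

        #include <malloc.h>
        #include <stdint.h>

        /* With plain malloc() the buffer start varies from run to run,
         * so an alignment-sensitive fast path may or may not be taken
         * for a given test number.  Pinning every buffer to a 64-byte
         * boundary makes the alignment seen by each test deterministic,
         * and therefore reproducible.
         */
        static uint32_t *
        alloc_image_buffer (int stride, int height)
        {
            return (uint32_t *) memalign (64, stride * height);
        }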

diff --git a/test/blitters-test.c b/test/blitters-test.c
index f57d500..6ce877c 100644
--- a/test/blitters-test.c
+++ b/test/blitters-test.c
@@ -24,6 +24,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include <malloc.h>
 #include "pixman.h"
 
 /* A primitive pseudorandom number generator, taken from POSIX.1-2001 example */
@@ -252,7 +253,7 @@ create_random_image (
     stride = (stride + 3) & ~3;
 
     /* do the allocation */
-    buf = (uint32_t *)malloc (stride * height);
+    buf = (uint32_t *)memalign (64, stride * height);
 
     /* initialize image with random data */
     for (i = 0; i < stride * height; i++)
@@ -431,8 +432,10 @@ static pixman_format_code_t img_fmt_list[] = {
 static pixman_format_code_t mask_fmt_list[] = {
     PIXMAN_a8r8g8b8,
     PIXMAN_a8,
+#if 0
     PIXMAN_a4,
     PIXMAN_a1,
+#endif
     -1
 };
 
@@ -459,9 +462,9 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     uint32_t crc32;
     int max_width, max_height, max_extra_stride;
 
-    max_width = max_height = 24 + testnum / 100000;
+    max_width = max_height = 24 + testnum / 10000;
     max_extra_stride = 4 + testnum / 1000000;
-    if (max_width > 64) max_width = 64;
+    if (max_width > 256) max_width = 256;
     if (max_height > 16) max_height = 16;
     if (max_extra_stride > 8) max_extra_stride = 8;
 
@@ -488,7 +491,6 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 
     mask_img = NULL;
     mask_fmt = -1;
-#if 0
     if (lcg_rand_n (2))
     {
 	if (lcg_rand_n (2))
@@ -508,7 +510,6 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 	    pixman_image_set_component_alpha (mask_img, 1);
 #endif
     }
-#endif
 
     src_width = pixman_image_get_width (src_img);
     src_height = pixman_image_get_height (src_img);
commit 4fc0f9dd456bb4ad1f47e1733b02a3b491f425ed
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Sat Aug 1 02:20:12 2009 +0300

    HACK: updated test to better cover new NEON optimizations

diff --git a/test/blitters-test.c b/test/blitters-test.c
index 1306040..f57d500 100644
--- a/test/blitters-test.c
+++ b/test/blitters-test.c
@@ -310,6 +310,10 @@ free_random_image (
 }
 
 static pixman_op_t op_list[] = {
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+#if 0
     PIXMAN_OP_CLEAR,
     PIXMAN_OP_SRC,
     PIXMAN_OP_DST,
@@ -369,10 +373,15 @@ static pixman_op_t op_list[] = {
     PIXMAN_OP_HSL_COLOR,
     PIXMAN_OP_HSL_LUMINOSITY,
 #endif
+#endif
 };
 static pixman_format_code_t img_fmt_list[] = {
     PIXMAN_a8r8g8b8,
     PIXMAN_x8r8g8b8,
+    PIXMAN_r5g6b5,
+    PIXMAN_r3g3b2,
+    PIXMAN_a8,
+#if 0
     PIXMAN_a8b8g8r8,
     PIXMAN_x8b8g8r8,
     PIXMAN_b8g8r8a8,
@@ -416,6 +425,7 @@ static pixman_format_code_t img_fmt_list[] = {
     PIXMAN_a1r1g1b1,
     PIXMAN_a1b1g1r1,
     PIXMAN_a1,
+#endif
     -1
 };
 static pixman_format_code_t mask_fmt_list[] = {
@@ -449,7 +459,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     uint32_t crc32;
     int max_width, max_height, max_extra_stride;
 
-    max_width = max_height = 12 + testnum / 100000;
+    max_width = max_height = 24 + testnum / 100000;
     max_extra_stride = 4 + testnum / 1000000;
     if (max_width > 64) max_width = 64;
     if (max_height > 16) max_height = 16;
@@ -478,6 +488,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 
     mask_img = NULL;
     mask_fmt = -1;
+#if 0
     if (lcg_rand_n (2))
     {
 	if (lcg_rand_n (2))
@@ -497,6 +508,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 	    pixman_image_set_component_alpha (mask_img, 1);
 #endif
     }
+#endif
 
     src_width = pixman_image_get_width (src_img);
     src_height = pixman_image_get_height (src_img);
@@ -570,7 +582,7 @@ main (int argc, char *argv[])
     else
     {
 	n1 = 1;
-	n2 = 3000000;
+	n2 = 2000000;
     }
 
     if (n2 < 0)
@@ -586,12 +598,12 @@ main (int argc, char *argv[])
 	}
 	printf ("crc32=%08X\n", crc);
 
-	if (n2 == 3000000)
+	if (n2 == 2000000)
 	{
 	    /* Predefined value for running with all the fastpath functions
 	       disabled. It needs to be updated every time when changes are
 	       introduced to this program or behavior of pixman changes! */
-	    if (crc == 0x1A025829)
+	    if (crc == 0x4895C7B0)
 	    {
 		printf ("blitters test passed\n");
 	    }
commit 67769ad5bf15450d0fd0d83643e3533a9f563916
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Wed Jul 22 01:29:51 2009 +0300

    Test program for stressing the use of different formats and operators
    
    The code and overall method are mostly based on scaling-test. This
    one focuses on stressing as many different color formats and types
    of composition operations as possible.
    
    This is an initial implementation which may need more tuning. Also,
    not all color format and operator combinations are actually used.
    
    When CPU-specific optimizations are disabled, this test produces
    identical, deterministic results on x86, PPC and ARM.
    
    The blitters-test-bisect.rb script now works in non-stop mode,
    running until it finds a problem. This makes it possible to run it
    overnight, for example, in order to test many more variants of
    pixman calls and increase the chances of detecting problems in
    pixman. Just like with scaling-test, running the blitters-test
    binary with no command line arguments runs a small predefined number
    of tests and compares the checksum with a reference value for quick
    verification.
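
    The core of the method is the driver loop in the new test's main()
    (a condensed sketch; the run_tests wrapper is hypothetical, while
    test_composite comes from the test/blitters-test.c added below):

        /* Each test number seeds the PRNG, so test i always performs
         * the same composite on the same pseudorandom images.  The
         * CRC32 of every destination buffer is chained into one running
         * checksum, so a single wrong pixel anywhere changes the final
         * value.
         */
        static uint32_t
        run_tests (int n1, int n2)
        {
            uint32_t crc = 0;
            int i;

            for (i = n1; i <= n2; i++)
                crc = test_composite (crc, i, 0);

            printf ("crc32=%08X\n", crc);
            return crc;
        }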

diff --git a/test/Makefile.am b/test/Makefile.am
index 324cb72..c56f62d 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -4,6 +4,7 @@ INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman
 TESTPROGRAMS =			\
 	region-test		\
 	scaling-test		\
+	blitters-test		\
 	fetch-test		\
 	oob-test		\
 	window-test		\
@@ -12,6 +13,7 @@ TESTPROGRAMS =			\
 fetch_test_LDADD = $(TEST_LDADD)
 region_test_LDADD = $(TEST_LDADD)
 scaling_test_LDADD = $(TEST_LDADD)
+blitters_test_LDADD = $(TEST_LDADD)
 trap_crasher_LDADD = $(TEST_LDADD)
 oob_test_LDADD = $(TEST_LDADD)
 window_test_LDADD = $(TEST_LDADD)
diff --git a/test/blitters-test-bisect.rb b/test/blitters-test-bisect.rb
new file mode 100644
index 0000000..62ff782
--- /dev/null
+++ b/test/blitters-test-bisect.rb
@@ -0,0 +1,43 @@
+#!/usr/bin/env ruby
+
+if not ARGV[0] or not ARGV[1] then
+    printf("Please provide two 'blitters-test' static binaries in the command line.\n\n")
+    printf("The first should be linked with a correct reference pixman library.\n")
+    printf("The second binary should be linked with a pixman library which needs to be tested.\n")
+    exit(0)
+end
+
+def test_range(min, max)
+    if `#{ARGV[0]} #{min} #{max} 2>/dev/null` == `#{ARGV[1]} #{min} #{max} 2>/dev/null` then
+        return
+    end
+    while max != min + 1 do
+        avg = ((min + max) / 2).to_i
+        res1 = `#{ARGV[0]} #{min} #{avg} 2>/dev/null`
+        res2 = `#{ARGV[1]} #{min} #{avg} 2>/dev/null`
+        if res1 != res2 then
+            max = avg
+        else
+            min = avg
+        end
+    end
+    return max
+end
+
+base = 1
+while true do
+    # run infinitely, processing 100000 test cases per iteration
+    printf("running tests %d-%d\n", base, base + 100000 - 1);
+    res = test_range(base, base + 100000 - 1)
+    if res then
+        printf("-- ref --\n")
+        printf("%s\n", `#{ARGV[0]} -#{res}`)
+        printf("-- new --\n")
+        printf("%s\n", `#{ARGV[1]} -#{res}`)
+
+        printf("\nFailed test %d, you can reproduce the problematic conditions by running\n", res)
+        printf("#{ARGV[1]} -%d\n", res)
+        exit(1)
+    end
+    base += 100000
+end
diff --git a/test/blitters-test.c b/test/blitters-test.c
new file mode 100644
index 0000000..1306040
--- /dev/null
+++ b/test/blitters-test.c
@@ -0,0 +1,606 @@
+/*
+ * Test program, which stresses the use of different color formats and
+ * compositing operations.
+ *
+ * Just run it without any command line arguments, and it will report either
+ *   "blitters test passed" - everything is ok
+ *   "blitters test failed!" - there is some problem
+ *
+ * In the case of failure, finding the problem involves the following steps:
+ * 1. Get the reference 'blitters-test' binary. It makes sense to disable all
+ *    the cpu specific optimizations in pixman and also configure it with
+ *    '--disable-shared' option. Those who are paranoid can also tweak the
+ *    sources to disable all fastpath functions. The resulting binary
+ *    can be renamed to something like 'blitters-test.ref'.
+ * 2. Compile the buggy binary (also with the '--disable-shared' option).
+ * 3. Run 'ruby blitters-test-bisect.rb ./blitters-test.ref ./blitters-test'
+ * 4. Look at the information about failed case (destination buffer content
+ *    will be shown) and try to figure out what is wrong. Loading
+ *    test program in gdb, specifying failed test number in the command
+ *    line with '-' character prepended and setting breakpoint on
+ *    'pixman_image_composite' function can provide detailed information
+ *    about function arguments
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman.h"
+
+/* A primitive pseudorandom number generator, taken from POSIX.1-2001 example */
+
+static uint32_t lcg_seed;
+
+static inline uint32_t
+lcg_rand (void)
+{
+    lcg_seed = lcg_seed * 1103515245 + 12345;
+    return ((uint32_t)(lcg_seed / 65536) % 32768);
+}
+
+static inline void
+lcg_srand (uint32_t seed)
+{
+    lcg_seed = seed;
+}
+
+static inline uint32_t
+lcg_rand_n (int max)
+{
+    return lcg_rand () % max;
+}
+
+/*----------------------------------------------------------------------------*\
+ *  CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
+ *
+ *  This program generates the CRC-32 values for the files named in the
+ *  command-line arguments.  These are the same CRC-32 values used by GZIP,
+ *  PKZIP, and ZMODEM.  The Crc32_ComputeBuf () can also be detached and
+ *  used independently.
+ *
+ *  THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE.
+ *
+ *  Based on the byte-oriented implementation "File Verification Using CRC"
+ *  by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67.
+ *
+ *  v1.0.0: original release.
+ *  v1.0.1: fixed printf formats.
+ *  v1.0.2: fixed something else.
+ *  v1.0.3: replaced CRC constant table by generator function.
+ *  v1.0.4: reformatted code, made ANSI C.  1994-12-05.
+ *  v2.0.0: rewrote to use memory buffer & static table, 2006-04-29.
+\*----------------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------------*\
+ *  NAME:
+ *     Crc32_ComputeBuf () - computes the CRC-32 value of a memory buffer
+ *  DESCRIPTION:
+ *     Computes or accumulates the CRC-32 value for a memory buffer.
+ *     The 'inCrc32' gives a previously accumulated CRC-32 value to allow
+ *     a CRC to be generated for multiple sequential buffer-fuls of data.
+ *     The 'inCrc32' for the first buffer must be zero.
+ *  ARGUMENTS:
+ *     inCrc32 - accumulated CRC-32 value, must be 0 on first call
+ *     buf     - buffer to compute CRC-32 value for
+ *     bufLen  - number of bytes in buffer
+ *  RETURNS:
+ *     crc32 - computed CRC-32 value
+ *  ERRORS:
+ *     (no errors are possible)
+\*----------------------------------------------------------------------------*/
+
+static uint32_t
+compute_crc32 (uint32_t    in_crc32,
+		  const void *buf,
+		  size_t      buf_len)
+{
+    static const uint32_t crc_table[256] = {
+	0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
+	0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+	0x09B64C2B, 0x7EB17CBD,	0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
+	0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+	0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,	0x14015C4F, 0x63066CD9,
+	0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+	0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
+	0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+	0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
+	0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+	0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
+	0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+	0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
+	0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+	0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
+	0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+	0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
+	0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+	0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
+	0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+	0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
+	0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+	0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
+	0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+	0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
+	0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+	0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
+	0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+	0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
+	0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+	0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
+	0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+	0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
+	0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+	0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
+	0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+	0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
+	0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+	0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
+	0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+	0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
+	0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+	0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+    };
+
+    uint32_t              crc32;
+    unsigned char *       byte_buf;
+    size_t                i;
+
+    /** accumulate crc32 for buffer **/
+    crc32 = in_crc32 ^ 0xFFFFFFFF;
+    byte_buf = (unsigned char*) buf;
+
+    for (i = 0; i < buf_len; i++)
+	crc32 = (crc32 >> 8) ^ crc_table[(crc32 ^ byte_buf[i]) & 0xFF];
+
+    return (crc32 ^ 0xFFFFFFFF);
+}
+
+/* perform endian conversion of pixel data */
+static void
+image_endian_swap (pixman_image_t *img, int bpp)
+{
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);;
+    int i, j;
+
+    /* swap bytes only on big endian systems */
+    volatile uint16_t endian_check_var = 0x1234;
+    if (*(volatile uint8_t *)&endian_check_var != 0x12) return;
+
+    for (i = 0; i < height; i++)
+    {
+	uint8_t *line_data = (uint8_t *)data + stride * i;
+	/* swap bytes only for 16, 24 and 32 bpp for now */
+	switch (bpp)
+	{
+	case 1:
+	    for (j = 0; j < stride; j++)
+	    {
+		line_data[j] =
+		    ((line_data[j] & 0x80) >> 7) |
+		    ((line_data[j] & 0x40) >> 5) |
+		    ((line_data[j] & 0x20) >> 3) |
+		    ((line_data[j] & 0x10) >> 1) |
+		    ((line_data[j] & 0x08) << 1) |
+		    ((line_data[j] & 0x04) << 3) |
+		    ((line_data[j] & 0x02) << 5) |
+		    ((line_data[j] & 0x01) << 7);
+	    }
+	    break;
+	case 4:
+	    for (j = 0; j < stride; j++)
+	    {
+		line_data[j] = (line_data[j] >> 4) | (line_data[j] << 4);
+	    }
+	    break;
+	case 16:
+	    for (j = 0; j + 2 <= stride; j += 2)
+	    {
+		char t1 = line_data[j + 0];
+		char t2 = line_data[j + 1];
+		line_data[j + 1] = t1;
+		line_data[j + 0] = t2;
+	    }
+	    break;
+	case 24:
+	    for (j = 0; j + 3 <= stride; j += 3)
+	    {
+		char t1 = line_data[j + 0];
+		char t2 = line_data[j + 1];
+		char t3 = line_data[j + 2];
+		line_data[j + 2] = t1;
+		line_data[j + 1] = t2;
+		line_data[j + 0] = t3;
+	    }
+	    break;
+	case 32:
+	    for (j = 0; j + 4 <= stride; j += 4)
+	    {
+		char t1 = line_data[j + 0];
+		char t2 = line_data[j + 1];
+		char t3 = line_data[j + 2];
+		char t4 = line_data[j + 3];
+		line_data[j + 3] = t1;
+		line_data[j + 2] = t2;
+		line_data[j + 1] = t3;
+		line_data[j + 0] = t4;
+	    }
+	    break;
+	default:
+	    break;
+	}
+    }
+}
+
+/* Create random image for testing purposes */
+static pixman_image_t *
+create_random_image (
+	pixman_format_code_t *allowed_formats,
+	int max_width,
+	int max_height,
+	int max_extra_stride,
+	pixman_format_code_t *used_fmt)
+{
+    int n = 0, i, width, height, stride;
+    pixman_format_code_t fmt;
+    uint32_t *buf;
+    pixman_image_t *img;
+    while (allowed_formats[n] != -1) n++;
+    fmt = allowed_formats[lcg_rand_n (n)];
+    width = lcg_rand_n (max_width) + 1;
+    height = lcg_rand_n (max_height) + 1;
+    stride = (width * PIXMAN_FORMAT_BPP (fmt) + 7) / 8 + lcg_rand_n (max_extra_stride + 1);
+    stride = (stride + 3) & ~3;
+
+    /* do the allocation */
+    buf = (uint32_t *)malloc (stride * height);
+
+    /* initialize image with random data */
+    for (i = 0; i < stride * height; i++)
+    {
+	/* generation is biased to having more 0 or 255 bytes as
+	   they are more likely to be special-cased in code */
+	*((uint8_t *)buf + i) = lcg_rand_n (4) ? lcg_rand_n (256) :
+	    (lcg_rand_n (2) ? 0 : 255);
+    }
+
+    img = pixman_image_create_bits (
+	fmt, width, height, buf, stride);
+
+    image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt));
+
+    if (used_fmt) *used_fmt = fmt;
+    return img;
+}
+
+/* Free random image, and optionally update crc32 based on its data */
+static uint32_t
+free_random_image (
+    uint32_t initcrc,
+    pixman_image_t *img,
+    pixman_format_code_t fmt)
+{
+    uint32_t crc32 = 0;
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);;
+
+    if (fmt != -1)
+    {
+	/* mask unused 'x' part */
+	if (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt) &&
+	    PIXMAN_FORMAT_DEPTH (fmt) != 0)
+	{
+	    int i;
+	    uint32_t *data = pixman_image_get_data (img);
+	    uint32_t mask = (1 << PIXMAN_FORMAT_DEPTH (fmt)) - 1;
+	    for (i = 0; i < 32; i++)
+		mask |= mask << (i * PIXMAN_FORMAT_BPP (fmt));
+
+	    for (i = 0; i < stride * height / 4; i++)
+		data[i] &= mask;
+	}
+	/* swap endiannes in order to provide identical results on both big
+	   and litte endian systems */
+	image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt));
+	crc32 = compute_crc32 (initcrc, data, stride * height);
+    }
+    pixman_image_unref (img);
+    free (data);
+    return crc32;
+}
+
+static pixman_op_t op_list[] = {
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+#if 0 /* using this crashes the test */
+    PIXMAN_OP_DISJOINT_DST,
+#endif
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+#if 0 /* using this crashes the test */
+    PIXMAN_OP_CONJOINT_DST,
+#endif
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+#if 0 /* these use floating point math and are not always bitexact on different platforms */
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+#endif
+};
+static pixman_format_code_t img_fmt_list[] = {
+    PIXMAN_a8r8g8b8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+#if 0 /* using these makes valgrind complain */
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+#endif
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_a8,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+#if 0 /* using these crashes the test */
+    PIXMAN_c8,
+    PIXMAN_g8,
+    PIXMAN_x4c4,
+    PIXMAN_x4g4,
+    PIXMAN_c4,
+    PIXMAN_g4,
+    PIXMAN_g1,
+#endif
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_a1,
+    -1
+};
+static pixman_format_code_t mask_fmt_list[] = {
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8,
+    PIXMAN_a4,
+    PIXMAN_a1,
+    -1
+};
+
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (uint32_t initcrc, int testnum, int verbose)
+{
+    int i;
+    pixman_image_t *src_img = NULL;
+    pixman_image_t *dst_img = NULL;
+    pixman_image_t *mask_img = NULL;
+    int src_width, src_height;
+    int dst_width, dst_height;
+    int src_stride, dst_stride;
+    int src_x, src_y;
+    int dst_x, dst_y;
+    int w, h;
+    int op;
+    pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
+    uint32_t *dstbuf;
+    uint32_t crc32;
+    int max_width, max_height, max_extra_stride;
+
+    max_width = max_height = 12 + testnum / 100000;
+    max_extra_stride = 4 + testnum / 1000000;
+    if (max_width > 64) max_width = 64;
+    if (max_height > 16) max_height = 16;
+    if (max_extra_stride > 8) max_extra_stride = 8;
+
+    lcg_srand (testnum);
+
+    op = op_list[lcg_rand_n (sizeof (op_list) / sizeof (op_list[0]))];
+
+    if (lcg_rand_n (8))
+    {
+	/* normal image */
+	src_img = create_random_image (img_fmt_list, max_width, max_height,
+				      max_extra_stride, &src_fmt);
+    }
+    else
+    {
+	/* solid case */
+	src_img = create_random_image (img_fmt_list, 1, 1,
+				      max_extra_stride, &src_fmt);
+	pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+    }
+
+    dst_img = create_random_image (img_fmt_list, max_width, max_height,
+				  max_extra_stride, &dst_fmt);
+
+    mask_img = NULL;
+    mask_fmt = -1;
+    if (lcg_rand_n (2))
+    {
+	if (lcg_rand_n (2))
+	{
+	    mask_img = create_random_image (mask_fmt_list, max_width, max_height,
+					   max_extra_stride, &mask_fmt);
+	}
+	else
+	{
+	    /* solid case */
+	    mask_img = create_random_image (mask_fmt_list, 1, 1,
+					   max_extra_stride, &mask_fmt);
+	    pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
+	}
+#if 0 /* using this crashes the test */
+	if (lcg_rand_n (2))
+	    pixman_image_set_component_alpha (mask_img, 1);
+#endif
+    }
+
+    src_width = pixman_image_get_width (src_img);
+    src_height = pixman_image_get_height (src_img);
+    src_stride = pixman_image_get_stride (src_img);
+    dst_width = pixman_image_get_width (dst_img);
+    dst_height = pixman_image_get_height (dst_img);
+    dst_stride = pixman_image_get_stride (dst_img);
+    dstbuf = pixman_image_get_data (dst_img);
+
+    src_x = lcg_rand_n (src_width);
+    src_y = lcg_rand_n (src_height);
+    dst_x = lcg_rand_n (dst_width);
+    dst_y = lcg_rand_n (dst_height);
+    w = lcg_rand_n (dst_width - dst_x + 1);
+    h = lcg_rand_n (dst_height - dst_y + 1);
+
+    if (verbose)
+    {
+	printf ("op=%d, src_fmt=%08X, dst_fmt=%08X, mask_fmt=%08X\n",
+	    op, src_fmt, dst_fmt, mask_fmt);
+	printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+	    src_width, src_height, dst_width, dst_height);
+	printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+	    src_x, src_y, dst_x, dst_y);
+	printf ("src_stride=%d, dst_stride=%d\n",
+	    src_stride, dst_stride);
+	printf ("w=%d, h=%d\n", w, h);
+    }
+
+    pixman_image_composite (op, src_img, mask_img, dst_img,
+			    src_x, src_y, src_x, src_y, dst_x, dst_y, w, h);
+
+    if (verbose)
+    {
+	int j;
+	printf ("---\n");
+	for (i = 0; i < dst_height; i++)
+	{
+	    for (j = 0; j < dst_stride; j++)
+	    {
+		if (j == (dst_width * PIXMAN_FORMAT_BPP (dst_fmt) + 7) / 8)
+		    printf ("| ");
+		printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
+	    }
+	    printf ("\n");
+	}
+	printf ("---\n");
+    }
+
+    free_random_image (initcrc, src_img, -1);
+    crc32 = free_random_image (initcrc, dst_img, dst_fmt);
+    if (mask_img) free_random_image (initcrc, mask_img, -1);
+    return crc32;
+}
+
+int
+main (int argc, char *argv[])
+{
+    int i, n1 = 1, n2 = 0;
+    uint32_t crc = 0;
+
+    if (argc >= 3)
+    {
+	n1 = atoi (argv[1]);
+	n2 = atoi (argv[2]);
+    }
+    else if (argc >= 2)
+    {
+	n2 = atoi (argv[1]);
+    }
+    else
+    {
+	n1 = 1;
+	n2 = 3000000;
+    }
+
+    if (n2 < 0)
+    {
+	crc = test_composite (0, abs (n2), 1);
+	printf ("crc32=%08X\n", crc);
+    }
+    else
+    {
+	for (i = n1; i <= n2; i++)
+	{
+	    crc = test_composite (crc, i, 0);
+	}
+	printf ("crc32=%08X\n", crc);
+
+	if (n2 == 3000000)
+	{
+	    /* Predefined value for running with all the fastpath functions
+	       disabled. It needs to be updated every time when changes are
+	       introduced to this program or behavior of pixman changes! */
+	    if (crc == 0x1A025829)
+	    {
+		printf ("blitters test passed\n");
+	    }
+	    else
+	    {
+		printf ("blitters test failed!\n");
+		return 1;
+	    }
+	}
+    }
+    return 0;
+}
commit 51f597ad3258dd85b4620ac2bf0df8ca2e0ed182
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 12:00:07 2009 -0400

    Delete commented out code in pixman-vmx.c

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index b52803d..06325a7 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1607,82 +1607,6 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
     }
 }
 
-#if 0
-void
-vmx_composite_over_n_8888 (pixman_operator_t op,
-                           pixman_image_t *  src_image,
-                           pixman_image_t *  mask_image,
-                           pixman_image_t *  dst_image,
-                           int16_t           src_x,
-                           int16_t           src_y,
-                           int16_t           mask_x,
-                           int16_t           mask_y,
-                           int16_t           dest_x,
-                           int16_t           dest_y,
-                           uint16_t          width,
-                           uint16_t          height)
-{
-    uint32_t src;
-    uint32_t    *dst_line, *dst;
-    int dst_stride;
-
-    _pixman_image_get_solid (src_image, dst_image, src);
-
-    if (src >> 24 == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	/* XXX vmx_combine_over_u (dst, src, width); */
-    }
-}
-
-void
-vmx_composite_over_n_0565 (pixman_operator_t op,
-                           pixman_image_t *  src_image,
-                           pixman_image_t *  mask_image,
-                           pixman_image_t *  dst_image,
-                           int16_t           src_x,
-                           int16_t           src_y,
-                           int16_t           mask_x,
-                           int16_t           mask_y,
-                           int16_t           dest_x,
-                           int16_t           dest_y,
-                           uint16_t          width,
-                           uint16_t          height)
-{
-    uint32_t src;
-    uint16_t    *dst_line, *dst;
-    uint16_t w;
-    int dst_stride;
-
-    _pixman_image_get_solid (src_image, dst_image, src);
-
-    if (src >> 24 == 0)
-	return;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	vmx_combine_over_u565 (dst, src, width);
-    }
-}
-
-static const pixman_fast_path_t vmx_fast_path_array[] =
-{
-    { PIXMAN_OP_NONE },
-};
-
-const pixman_fast_path_t *const vmx_fast_paths = vmx_fast_path_array;
-
-#endif
-
 pixman_implementation_t *
 _pixman_implementation_create_vmx (void)
 {
commit a590eabead0a0c405a7293d8689b9992de5a689b
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:53:50 2009 -0400

    Misc formatting fixes for pixman-vmx.c

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index c09bac6..b52803d 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -45,24 +45,24 @@ static force_inline vector unsigned int
 pix_multiply (vector unsigned int p, vector unsigned int a)
 {
     vector unsigned short hi, lo, mod;
-    
+
     /* unpack to short */
     hi = (vector unsigned short)
 	vec_mergeh ((vector unsigned char)AVV (0),
 		    (vector unsigned char)p);
-    
+
     mod = (vector unsigned short)
 	vec_mergeh ((vector unsigned char)AVV (0),
 		    (vector unsigned char)a);
-    
+
     hi = vec_mladd (hi, mod, (vector unsigned short)
                     AVV (0x0080, 0x0080, 0x0080, 0x0080,
                          0x0080, 0x0080, 0x0080, 0x0080));
-    
+
     hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
-    
+
     hi = vec_sr (hi, vec_splat_u16 (8));
-    
+
     /* unpack to short */
     lo = (vector unsigned short)
 	vec_mergel ((vector unsigned char)AVV (0),
@@ -70,15 +70,15 @@ pix_multiply (vector unsigned int p, vector unsigned int a)
     mod = (vector unsigned short)
 	vec_mergel ((vector unsigned char)AVV (0),
 		    (vector unsigned char)a);
-    
+
     lo = vec_mladd (lo, mod, (vector unsigned short)
                     AVV (0x0080, 0x0080, 0x0080, 0x0080,
                          0x0080, 0x0080, 0x0080, 0x0080));
-    
+
     lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
-    
+
     lo = vec_sr (lo, vec_splat_u16 (8));
-    
+
     return (vector unsigned int)vec_packsu (hi, lo);
 }
 
@@ -191,31 +191,31 @@ vmx_combine_over_u_no_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKS (dest, src);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
+
 	LOAD_VECTORS (dest, src);
-	
+
 	vdest = over (vsrc, splat_alpha (vsrc), vdest);
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t ia = ALPHA_8 (~s);
-	
+
 	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
-	
+
 	dest[i] = d;
     }
 }
@@ -230,35 +230,34 @@ vmx_combine_over_u_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSM (dest, src, mask);
-	
+
 	vdest = over (vsrc, splat_alpha (vsrc), vdest);
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t m = ALPHA_8 (mask[i]);
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t ia;
-	
+
 	UN8x4_MUL_UN8 (s, m);
-	
+
 	ia = ALPHA_8 (~s);
-	
+
 	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
 	dest[i] = d;
     }
@@ -287,29 +286,29 @@ vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKS (dest, src);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
+
 	LOAD_VECTORS (dest, src);
-	
+
 	vdest = over (vdest, splat_alpha (vdest), vsrc);
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t ia = ALPHA_8 (~dest[i]);
-	
+
 	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
 	dest[i] = s;
     }
@@ -325,33 +324,33 @@ vmx_combine_over_reverse_u_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
+
 	LOAD_VECTORSM (dest, src, mask);
-	
+
 	vdest = over (vdest, splat_alpha (vdest), vsrc);
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t m = ALPHA_8 (mask[i]);
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t ia = ALPHA_8 (~dest[i]);
-	
+
 	UN8x4_MUL_UN8 (s, m);
-	
+
 	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
 	dest[i] = s;
     }
@@ -380,28 +379,27 @@ vmx_combine_in_u_no_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKS (dest, src);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORS (dest, src);
-	
+
 	vdest = pix_multiply (vsrc, splat_alpha (vdest));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
-	
 	uint32_t s = src[i];
 	uint32_t a = ALPHA_8 (dest[i]);
+
 	UN8x4_MUL_UN8 (s, a);
 	dest[i] = s;
     }
@@ -417,33 +415,32 @@ vmx_combine_in_u_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSM (dest, src, mask);
-	
+
 	vdest = pix_multiply (vsrc, splat_alpha (vdest));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t m = ALPHA_8 (mask[i]);
 	uint32_t s = src[i];
 	uint32_t a = ALPHA_8 (dest[i]);
-	
+
 	UN8x4_MUL_UN8 (s, m);
-	
 	UN8x4_MUL_UN8 (s, a);
+
 	dest[i] = s;
     }
 }
@@ -471,28 +468,29 @@ vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKS (dest, src);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORS (dest, src);
-	
+
 	vdest = pix_multiply (vdest, splat_alpha (vsrc));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t d = dest[i];
 	uint32_t a = ALPHA_8 (src[i]);
+
 	UN8x4_MUL_UN8 (d, a);
+
 	dest[i] = d;
     }
 }
@@ -507,34 +505,33 @@ vmx_combine_in_reverse_u_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSM (dest, src, mask);
-	
+
 	vdest = pix_multiply (vdest, splat_alpha (vsrc));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t m = ALPHA_8 (mask[i]);
 	uint32_t d = dest[i];
 	uint32_t a = src[i];
-	
+
 	UN8x4_MUL_UN8 (a, m);
-	
 	a = ALPHA_8 (a);
 	UN8x4_MUL_UN8 (d, a);
+
 	dest[i] = d;
     }
 }
@@ -562,28 +559,29 @@ vmx_combine_out_u_no_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKS (dest, src);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORS (dest, src);
-	
+
 	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t s = src[i];
 	uint32_t a = ALPHA_8 (~dest[i]);
+
 	UN8x4_MUL_UN8 (s, a);
+
 	dest[i] = s;
     }
 }
@@ -598,33 +596,32 @@ vmx_combine_out_u_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSM (dest, src, mask);
-	
+
 	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t m = ALPHA_8 (mask[i]);
 	uint32_t s = src[i];
 	uint32_t a = ALPHA_8 (~dest[i]);
-	
+
 	UN8x4_MUL_UN8 (s, m);
-	
 	UN8x4_MUL_UN8 (s, a);
+
 	dest[i] = s;
     }
 }
@@ -652,28 +649,30 @@ vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKS (dest, src);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
+
 	LOAD_VECTORS (dest, src);
-	
+
 	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t d = dest[i];
 	uint32_t a = ALPHA_8 (~src[i]);
+
 	UN8x4_MUL_UN8 (d, a);
+
 	dest[i] = d;
     }
 }
@@ -688,34 +687,33 @@ vmx_combine_out_reverse_u_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSM (dest, src, mask);
-	
+
 	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t m = ALPHA_8 (mask[i]);
 	uint32_t d = dest[i];
 	uint32_t a = src[i];
-	
+
 	UN8x4_MUL_UN8 (a, m);
-	
 	a = ALPHA_8 (~a);
 	UN8x4_MUL_UN8 (d, a);
+
 	dest[i] = d;
     }
 }
@@ -743,32 +741,32 @@ vmx_combine_atop_u_no_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKS (dest, src);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORS (dest, src);
-	
+
 	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
 			     vdest, splat_alpha (negate (vsrc)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t dest_a = ALPHA_8 (d);
 	uint32_t src_ia = ALPHA_8 (~s);
-	
+
 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
 	dest[i] = s;
     }
 }
@@ -783,25 +781,24 @@ vmx_combine_atop_u_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSM (dest, src, mask);
-	
+
 	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
 			     vdest, splat_alpha (negate (vsrc)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t m = ALPHA_8 (mask[i]);
@@ -809,12 +806,13 @@ vmx_combine_atop_u_mask (uint32_t *      dest,
 	uint32_t d = dest[i];
 	uint32_t dest_a = ALPHA_8 (d);
 	uint32_t src_ia;
-	
+
 	UN8x4_MUL_UN8 (s, m);
-	
+
 	src_ia = ALPHA_8 (~s);
-	
+
 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
 	dest[i] = s;
     }
 }
@@ -842,32 +840,32 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKS (dest, src);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORS (dest, src);
-	
+
 	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
 			     vsrc, splat_alpha (negate (vdest)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t src_a = ALPHA_8 (s);
 	uint32_t dest_ia = ALPHA_8 (~d);
-	
+
 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
 	dest[i] = s;
     }
 }
@@ -882,25 +880,24 @@ vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSM (dest, src, mask);
-	
+
 	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
 			     vsrc, splat_alpha (negate (vdest)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t m = ALPHA_8 (mask[i]);
@@ -908,12 +905,13 @@ vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
 	uint32_t d = dest[i];
 	uint32_t src_a;
 	uint32_t dest_ia = ALPHA_8 (~d);
-	
+
 	UN8x4_MUL_UN8 (s, m);
-	
+
 	src_a = ALPHA_8 (s);
-	
+
 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
 	dest[i] = s;
     }
 }
@@ -941,32 +939,32 @@ vmx_combine_xor_u_no_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKS (dest, src);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORS (dest, src);
-	
+
 	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
 			     vdest, splat_alpha (negate (vsrc)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t src_ia = ALPHA_8 (~s);
 	uint32_t dest_ia = ALPHA_8 (~d);
-	
+
 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
 	dest[i] = s;
     }
 }
@@ -981,25 +979,24 @@ vmx_combine_xor_u_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSM (dest, src, mask);
-	
+
 	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
 			     vdest, splat_alpha (negate (vsrc)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t m = ALPHA_8 (mask[i]);
@@ -1007,12 +1004,13 @@ vmx_combine_xor_u_mask (uint32_t *      dest,
 	uint32_t d = dest[i];
 	uint32_t src_ia;
 	uint32_t dest_ia = ALPHA_8 (~d);
-	
+
 	UN8x4_MUL_UN8 (s, m);
-	
+
 	src_ia = ALPHA_8 (~s);
-	
+
 	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
 	dest[i] = s;
     }
 }
@@ -1040,27 +1038,28 @@ vmx_combine_add_u_no_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKS (dest, src);
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORS (dest, src);
-	
+
 	vdest = pix_add (vsrc, vdest);
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
+
 	UN8x4_ADD_UN8x4 (d, s);
+
 	dest[i] = d;
     }
 }
@@ -1075,33 +1074,32 @@ vmx_combine_add_u_mask (uint32_t *      dest,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, src_mask, mask_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSM (dest, src, mask);
-	
+
 	vdest = pix_add (vsrc, vdest);
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t m = ALPHA_8 (mask[i]);
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
-	
+
 	UN8x4_MUL_UN8 (s, m);
-	
 	UN8x4_ADD_UN8x4 (d, s);
+
 	dest[i] = d;
     }
 }
@@ -1132,28 +1130,30 @@ vmx_combine_src_ca (pixman_implementation_t *imp,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSC (dest, src, mask);
-	
+
 	vdest = pix_multiply (vsrc, vmask);
-	
+
 	STORE_VECTOR (dest);
-	
+
 	mask += 4;
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
 	uint32_t s = src[i];
+
 	UN8x4_MUL_UN8x4 (s, a);
+
 	dest[i] = s;
     }
 }
@@ -1176,7 +1176,6 @@ vmx_combine_over_ca (pixman_implementation_t *imp,
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-
 	LOAD_VECTORSC (dest, src, mask);
 
 	vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
@@ -1283,6 +1282,7 @@ vmx_combine_in_ca (pixman_implementation_t *imp,
 
 	UN8x4_MUL_UN8x4 (s, a);
 	UN8x4_MUL_UN8 (s, da);
+
 	dest[i] = s;
     }
 }
@@ -1299,31 +1299,33 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
+
 	LOAD_VECTORSC (dest, src, mask);
-	
+
 	vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
 	uint32_t d = dest[i];
 	uint32_t sa = ALPHA_8 (src[i]);
+
 	UN8x4_MUL_UN8 (a, sa);
 	UN8x4_MUL_UN8x4 (d, a);
+
 	dest[i] = d;
     }
 }
@@ -1364,8 +1366,10 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t da = ALPHA_8 (~d);
+
 	UN8x4_MUL_UN8x4 (s, a);
 	UN8x4_MUL_UN8 (s, da);
+
 	dest[i] = s;
     }
 }
@@ -1508,6 +1512,7 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
 	UN8x4_MUL_UN8x4 (s, a);
 	UN8x4_MUL_UN8 (a, sa);
 	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
+
 	dest[i] = d;
     }
 }
@@ -1524,27 +1529,26 @@ vmx_combine_xor_ca (pixman_implementation_t *imp,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSC (dest, src, mask);
-	
+
 	vdest = pix_add_mul (vdest,
 			     negate (pix_multiply (vmask, splat_alpha (vsrc))),
 			     pix_multiply (vsrc, vmask),
 			     negate (splat_alpha (vdest)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
@@ -1552,10 +1556,11 @@ vmx_combine_xor_ca (pixman_implementation_t *imp,
 	uint32_t d = dest[i];
 	uint32_t sa = ALPHA_8 (s);
 	uint32_t da = ALPHA_8 (~d);
-	
+
 	UN8x4_MUL_UN8x4 (s, a);
 	UN8x4_MUL_UN8 (a, sa);
 	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
 	dest[i] = d;
     }
 }
@@ -1572,32 +1577,32 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSC (dest, src, mask);
-	
+
 	vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
-	
+
 	UN8x4_MUL_UN8x4 (s, a);
 	UN8x4_ADD_UN8x4 (s, d);
+
 	dest[i] = s;
     }
 }
@@ -1620,14 +1625,13 @@ vmx_composite_over_n_8888 (pixman_operator_t op,
     uint32_t src;
     uint32_t    *dst_line, *dst;
     int dst_stride;
-    
+
     _pixman_image_get_solid (src_image, dst_image, src);
-    
+
     if (src >> 24 == 0)
 	return;
-    
+
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    
     while (height--)
     {
 	dst = dst_line;
@@ -1654,14 +1658,14 @@ vmx_composite_over_n_0565 (pixman_operator_t op,
     uint16_t    *dst_line, *dst;
     uint16_t w;
     int dst_stride;
-    
+
     _pixman_image_get_solid (src_image, dst_image, src);
-    
+
     if (src >> 24 == 0)
 	return;
-    
+
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    
+
     while (height--)
     {
 	dst = dst_line;
@@ -1684,10 +1688,9 @@ _pixman_implementation_create_vmx (void)
 {
     pixman_implementation_t *fast = _pixman_implementation_create_fast_path ();
     pixman_implementation_t *imp = _pixman_implementation_create (fast);
-    
+
     /* Set up function pointers */
-    
-    /* SSE code patch for fbcompose.c */
+
     imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
     imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
@@ -1697,9 +1700,9 @@ _pixman_implementation_create_vmx (void)
     imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
     imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;
-    
+
     imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;
-    
+
     imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
     imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
@@ -1711,7 +1714,6 @@ _pixman_implementation_create_vmx (void)
     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
     imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
     imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
-    
+
     return imp;
 }
-
commit 0ebb587e2460024fb306597799ae4974441511ec
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:48:22 2009 -0400

    In vmx_combine_atop_reverse_ca(), extract alpha after inverting

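Why the extraction order matters here: assuming ALPHA_8 is the usual
top-byte extraction from pixman-combine.h ((x) >> A_SHIFT with A_SHIFT
= 24), only the invert-then-extract order yields a clean 8-bit factor.
A minimal stand-alone sketch:

    #include <stdio.h>
    #include <stdint.h>

    #define A_SHIFT 24
    #define ALPHA_8(x) ((x) >> A_SHIFT)

    int main (void)
    {
        uint32_t d = 0x80ff00ff;        /* a8r8g8b8 pixel, alpha 0x80 */

        /* invert first, then extract: clean 8-bit inverse alpha */
        uint32_t good = ALPHA_8 (~d);   /* 0x0000007f */

        /* extract first, then invert: high bytes full of 0xFF */
        uint32_t bad = ~ALPHA_8 (d);    /* 0xffffff7f */

        printf ("%08x %08x\n", good, bad);
        return 0;
    }

Feeding the extract-then-invert form into macros that expect a plain
8-bit multiplier is exactly what smears 0xFF into the rgb channels.
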
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 1651ec3..c09bac6 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1477,38 +1477,37 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSC (dest, src, mask);
-	
+
 	vdest = pix_add_mul (vdest,
 			     pix_multiply (vmask, splat_alpha (vsrc)),
 			     pix_multiply (vsrc, vmask),
 			     negate (splat_alpha (vdest)));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t sa = ALPHA_8 (s);
-	uint32_t da = ALPHA_8 (d);
-	
+	uint32_t da = ALPHA_8 (~d);
+
 	UN8x4_MUL_UN8x4 (s, a);
 	UN8x4_MUL_UN8 (a, sa);
-	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, ~da);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
 	dest[i] = d;
     }
 }
commit 3d2f00783f2972ba5311937057ea8d452f942a36
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:46:09 2009 -0400

    Really fix vmx_combine_over_reverse_ca()
    
    The inverse destination alpha is just one component, not four.

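The one-component versus four-component distinction, modelled as plain
C for illustration (mul, mul_un8 and mul_un8x4 are stand-ins I am
using here; only the UN8x4_* names are pixman's):

    #include <stdio.h>
    #include <stdint.h>

    /* rounded (x * a) / 255 for one channel */
    static uint8_t mul (uint8_t x, uint8_t a)
    {
        uint32_t t = x * a + 0x80;
        return (t + (t >> 8)) >> 8;
    }

    /* UN8x4_MUL_UN8: one 8-bit factor applied to all four channels */
    static uint32_t mul_un8 (uint32_t x, uint8_t a)
    {
        uint32_t r = 0;
        int i;
        for (i = 0; i < 32; i += 8)
            r |= (uint32_t) mul (x >> i, a) << i;   /* low byte of x >> i */
        return r;
    }

    /* UN8x4_MUL_UN8x4: each channel scaled by the matching channel of a */
    static uint32_t mul_un8x4 (uint32_t x, uint32_t a)
    {
        uint32_t r = 0;
        int i;
        for (i = 0; i < 32; i += 8)
            r |= (uint32_t) mul (x >> i, a >> i) << i;
        return r;
    }

    int main (void)
    {
        /* an inverse alpha like 0x7f is a single component; the
         * four-component multiply treats its zero upper bytes as
         * per-channel factors and wipes out g, r and a */
        printf ("%08x %08x\n",
                mul_un8   (0xffffffff, 0x7f),    /* 0x7f7f7f7f */
                mul_un8x4 (0xffffffff, 0x7f));   /* 0x0000007f */
        return 0;
    }
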
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 587ade5..1651ec3 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1238,8 +1238,10 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t ida = ALPHA_8 (~d);
+
 	UN8x4_MUL_UN8x4 (s, a);
-	UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ida, d);
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
+
 	dest[i] = s;
     }
 }
@@ -1458,6 +1460,7 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
 	UN8x4_MUL_UN8x4 (s, a);
 	UN8x4_MUL_UN8 (a, sa);
 	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
 	dest[i] = d;
     }
 }
commit 2f62a4f46c1e99ddb1b7ca6d5db9410d12f32e63
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:40:42 2009 -0400

    Fix vmx_combine_out_reverse_ca()
    
    The source alpha is just one component, not four.

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 6ad2017..587ade5 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1380,33 +1380,34 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSC (dest, src, mask);
-	
+
 	vdest = pix_multiply (
 	    vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t sa = ALPHA_8 (s);
-	UN8x4_MUL_UN8x4 (a, sa);
+
+	UN8x4_MUL_UN8 (a, sa);
 	UN8x4_MUL_UN8x4 (d, ~a);
+
 	dest[i] = d;
     }
 }
commit 7e58323385e442fb2cea207780db5e30be88be96
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:38:03 2009 -0400

    Fix vmx_combine_over_reverse_ca()
    
    Destination alpha must be extracted after inverting; otherwise we
    end up with 0xFFs in the rgb channels.

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 018653e..6ad2017 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1221,26 +1221,25 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
     /* printf("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSC (dest, src, mask);
-	
+
 	vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	mask += 4;
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
-	uint32_t da = ALPHA_8 (d);
+	uint32_t ida = ALPHA_8 (~d);
 	UN8x4_MUL_UN8x4 (s, a);
-	UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ~da, d);
+	UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ida, d);
 	dest[i] = s;
     }
 }
commit 2382bd9e2724944a05ce8a581e9ddc31e299a0c6
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:35:20 2009 -0400

    Multiply with the alpha of dest, not inverse alpha

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index fbea24e..018653e 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1437,7 +1437,7 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
 	vsrc = pix_multiply (vsrc, vmask);
 	vmask = pix_multiply (vmask, vsrca);
 
-	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
 			     negate (vmask), vdest);
 
 	STORE_VECTOR (dest);
commit 498df0f0bf2437130ed305fb757ae0fae90bebb7
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:32:31 2009 -0400

    Fix vmx_combine_atop_ca()
    
    It didn't compute the mask correctly before.

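For reference, component-alpha ATOP substitutes src * mask for the
source and mask * src.a for the source alpha in the plain ATOP
equation, so each channel should come out as

    dest = (src * mask) * dest.a + dest * (1 - mask * src.a)

with every product rounded the same way as the one-component macros
(this is my reading of the scalar loop below, not an authoritative
statement of the operator). The old vector code built the complemented
factor from mask * mask.a, the mask's own alpha, which is the
miscomputation being fixed.
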
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 0038752..fbea24e 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1215,7 +1215,7 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
     /* printf("%s\n",__PRETTY_FUNCTION__); */
@@ -1421,30 +1421,32 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
                      int                      width)
 {
     int i;
-    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned int vdest, vsrc, vmask, vsrca;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSC (dest, src, mask);
-	
-	vdest = pix_add_mul (pix_multiply (vsrc, vmask), splat_alpha (vdest),
-			     vdest,
-			     negate (pix_multiply (vmask,
-						   splat_alpha (vmask))));
-	
+
+	vsrca = splat_alpha (vsrc);
+
+	vsrc = pix_multiply (vsrc, vmask);
+	vmask = pix_multiply (vmask, vsrca);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+			     negate (vmask), vdest);
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
@@ -1452,7 +1454,7 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
 	uint32_t d = dest[i];
 	uint32_t sa = ALPHA_8 (s);
 	uint32_t da = ALPHA_8 (d);
-	
+
 	UN8x4_MUL_UN8x4 (s, a);
 	UN8x4_MUL_UN8 (a, sa);
 	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
commit 9650cd7432ef03c05895df04940e2ab6245f2618
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:26:23 2009 -0400

    Fix vmx_combine_over_ca().
    
    In the non-vector code, the mask needs to be multiplied with source
    alpha.

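In the same notation, component-alpha OVER is

    dest = src * mask + dest * (1 - mask * src.a)

so once s has been multiplied by the mask, the factor that gets
complemented must be mask * src.a rather than the bare mask; that is
exactly the UN8x4_MUL_UN8 (a, sa) step this commit adds to the scalar
loop.
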
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 511c8cb..0038752 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1170,30 +1170,35 @@ vmx_combine_over_ca (pixman_implementation_t *imp,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
+
 	LOAD_VECTORSC (dest, src, mask);
-	
+
 	vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
-	
+
 	STORE_VECTOR (dest);
-	
+
 	mask += 4;
 	src += 4;
 	dest += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s);
+
 	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
 	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
+
 	dest[i] = d;
     }
 }
commit 38b9589fe6b14c822a2a4000df364d132e390f7a
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:21:43 2009 -0400

    In vmx_combine_out_ca() multiply with the alpha of the negated vdest.

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 16db95a..511c8cb 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1342,7 +1342,8 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
     {
 	LOAD_VECTORSC (dest, src, mask);
 
-	vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
+	vdest = pix_multiply (
+	    pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));
 
 	STORE_VECTOR (dest);
 
commit de180baba3a3e7eedeb09ff7d5f4d3eff3ffc6f4
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:16:31 2009 -0400

    Fix vmx_combine_out_ca()
    
    It should multiply with just the destination alpha channel, not all
    four channels.

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 3468ad5..16db95a 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1334,24 +1334,23 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSC (dest, src, mask);
-	
+
 	vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
@@ -1359,7 +1358,7 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
 	uint32_t d = dest[i];
 	uint32_t da = ALPHA_8 (~d);
 	UN8x4_MUL_UN8x4 (s, a);
-	UN8x4_MUL_UN8x4 (s, da);
+	UN8x4_MUL_UN8 (s, da);
 	dest[i] = s;
     }
 }
commit 5191421d1f143cca76afa1f4fbffa68f89a5d393
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 11:07:16 2009 -0400

    Do the full four-component IN computation in vmx_combine_in_ca().

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 5978a07..3468ad5 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1252,30 +1252,30 @@ vmx_combine_in_ca (pixman_implementation_t *imp,
     vector unsigned int vdest, vsrc, vmask;
     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 	dest_mask, mask_mask, src_mask, store_mask;
-    
+
     COMPUTE_SHIFT_MASKC (dest, src, mask);
-    
+
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
     for (i = width / 4; i > 0; i--)
     {
-	
 	LOAD_VECTORSC (dest, src, mask);
-	
+
 	vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
-	
+
 	STORE_VECTOR (dest);
-	
+
 	src += 4;
 	dest += 4;
 	mask += 4;
     }
-    
+
     for (i = width % 4; --i >= 0;)
     {
 	uint32_t a = mask[i];
 	uint32_t s = src[i];
 	uint32_t da = ALPHA_8 (dest[i]);
-	UN8x4_MUL_UN8 (s, a);
+
+	UN8x4_MUL_UN8x4 (s, a);
 	UN8x4_MUL_UN8 (s, da);
 	dest[i] = s;
     }
commit 27fb8378fdae930475cf4528c539a78bfbd751c5
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 10:54:16 2009 -0400

    Fix bug in vmx_combine_xor_ca()
    
    The destination needs to be inverted before the alpha channel is
    extracted; otherwise, the RGB channels of da will be 0xff.

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 7d947d1..5978a07 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -1542,11 +1542,11 @@ vmx_combine_xor_ca (pixman_implementation_t *imp,
 	uint32_t s = src[i];
 	uint32_t d = dest[i];
 	uint32_t sa = ALPHA_8 (s);
-	uint32_t da = ALPHA_8 (d);
+	uint32_t da = ALPHA_8 (~d);
 	
 	UN8x4_MUL_UN8x4 (s, a);
 	UN8x4_MUL_UN8 (a, sa);
-	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, ~da);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
 	dest[i] = d;
     }
 }
commit c750667d7ac542dfa922a7970961b7095b44b8d3
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Fri Aug 7 01:07:01 2009 -0400

    Make pix_multiply bit-exact

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 6fc3cde..7d947d1 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -95,56 +95,12 @@ pix_add_mul (vector unsigned int x,
              vector unsigned int y,
              vector unsigned int b)
 {
-    vector unsigned short hi, lo, mod, hiy, loy, mody;
-    
-    hi = (vector unsigned short)
-	vec_mergeh ((vector unsigned char)AVV (0),
-		    (vector unsigned char)x);
-    mod = (vector unsigned short)
-	vec_mergeh ((vector unsigned char)AVV (0),
-		    (vector unsigned char)a);
-    hiy = (vector unsigned short)
-	vec_mergeh ((vector unsigned char)AVV (0),
-		    (vector unsigned char)y);
-    mody = (vector unsigned short)
-	vec_mergeh ((vector unsigned char)AVV (0),
-		    (vector unsigned char)b);
-    
-    hi = vec_mladd (hi, mod, (vector unsigned short)
-                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
-                         0x0080, 0x0080, 0x0080, 0x0080));
-    
-    hi = vec_mladd (hiy, mody, hi);
-    
-    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
-    
-    hi = vec_sr (hi, vec_splat_u16 (8));
-    
-    lo = (vector unsigned short)
-	vec_mergel ((vector unsigned char)AVV (0),
-		    (vector unsigned char)x);
-    mod = (vector unsigned short)
-	vec_mergel ((vector unsigned char)AVV (0),
-		    (vector unsigned char)a);
-    
-    loy = (vector unsigned short)
-	vec_mergel ((vector unsigned char)AVV (0),
-		    (vector unsigned char)y);
-    mody = (vector unsigned short)
-	vec_mergel ((vector unsigned char)AVV (0),
-		    (vector unsigned char)b);
-    
-    lo = vec_mladd (lo, mod, (vector unsigned short)
-                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
-                         0x0080, 0x0080, 0x0080, 0x0080));
-    
-    lo = vec_mladd (loy, mody, lo);
-    
-    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
-    
-    lo = vec_sr (lo, vec_splat_u16 (8));
-    
-    return (vector unsigned int)vec_packsu (hi, lo);
+    vector unsigned int t1, t2;
+
+    t1 = pix_multiply (x, a);
+    t2 = pix_multiply (y, b);
+
+    return pix_add (t1, t2);
 }
 
 static force_inline vector unsigned int
@@ -161,7 +117,7 @@ over (vector unsigned int src,
 {
     vector unsigned char tmp = (vector unsigned char)
 	pix_multiply (dest, negate (srca));
-    
+
     tmp = vec_adds ((vector unsigned char)src, tmp);
     return (vector unsigned int)tmp;
 }
commit 6243a0a015043f39531b98b9e8c4167f8bd47d82
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Thu Aug 6 23:50:32 2009 -0400

    Change the SSE2 versions of pix_add_multiply() to produce bit-exact results.

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 7057288..727ad42 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -264,19 +264,14 @@ pix_add_multiply_2x128 (__m128i* src_lo,
                         __m128i* ret_lo,
                         __m128i* ret_hi)
 {
-    __m128i lo, hi;
-    __m128i mul_lo, mul_hi;
+    __m128i t1_lo, t1_hi;
+    __m128i t2_lo, t2_hi;
 
-    lo = _mm_mullo_epi16 (*src_lo, *alpha_dst_lo);
-    hi = _mm_mullo_epi16 (*src_hi, *alpha_dst_hi);
-    mul_lo = _mm_mullo_epi16 (*dst_lo, *alpha_src_lo);
-    mul_hi = _mm_mullo_epi16 (*dst_hi, *alpha_src_hi);
-    lo = _mm_adds_epu16 (lo, mask_0080);
-    hi = _mm_adds_epu16 (hi, mask_0080);
-    lo = _mm_adds_epu16 (lo, mul_lo);
-    hi = _mm_adds_epu16 (hi, mul_hi);
-    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
-    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
+    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
+    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
+
+    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
+    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
 }
 
 static force_inline void
@@ -457,11 +452,10 @@ pix_add_multiply_1x64 (__m64* src,
                        __m64* dst,
                        __m64* alpha_src)
 {
-    return _mm_mulhi_pu16 (
-	_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alpha_dst),
-				      mask_x0080),
-		       _mm_mullo_pi16 (*dst, *alpha_src)),
-	mask_x0101);
+    __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst);
+    __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src);
+
+    return _mm_adds_pu8 (t1, t2);
 }
 
 static force_inline __m64
commit 404f4a6f3e71de5e411cb3bb1107d8ffb7c52e62
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Thu Aug 6 23:52:11 2009 -0400

    Fix a couple of alpha==0 vs src==0 issues in pixman-sse2.c

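The distinction matters because src == 0 means every channel is zero,
while sa == 0 only says the alpha byte is zero; rgb may still be
non-zero and, depending on the operator, still contribute. For IN,
zero source alpha is not a no-op at all. Sketching
sse2_composite_in_n_8_8 with an opaque a8 destination (0..255
arithmetic, my reading of the fast path):

    sa   = 0x00;    /* solid source with alpha 0      */
    dest = 0xff;
    /* IN: dest = dest * sa / 255 = 0x00              */

The early return that this commit removes left dest at 0xff instead of
clearing it.
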
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index e2a39a8..7057288 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -4857,8 +4857,6 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     sa = src >> 24;
-    if (sa == 0)
-	return;
 
     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
 
@@ -5073,8 +5071,6 @@ sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     sa = src >> 24;
-    if (sa == 0)
-	return;
 
     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
 
commit d9f80370a4d2ab54688e75256b3ea4267d8cc602
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Thu Aug 6 23:05:36 2009 -0400

    Rename mmx_composite_add_8888_8_8() to mmx_composite_add_n_8_8().

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 63c6e7e..7dcc1dc 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -2652,8 +2652,6 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     sa = src >> 24;
-    if (sa == 0)
-	return;
 
     vsrc = load8888 (src);
     vsrca = expand_alpha (vsrc);
@@ -2773,19 +2771,19 @@ mmx_composite_in_8_8 (pixman_implementation_t *imp,
 }
 
 static void
-mmx_composite_add_8888_8_8 (pixman_implementation_t *imp,
-                            pixman_op_t              op,
-                            pixman_image_t *         src_image,
-                            pixman_image_t *         mask_image,
-                            pixman_image_t *         dst_image,
-                            int32_t                  src_x,
-                            int32_t                  src_y,
-                            int32_t                  mask_x,
-                            int32_t                  mask_y,
-                            int32_t                  dest_x,
-                            int32_t                  dest_y,
-                            int32_t                  width,
-                            int32_t                  height)
+mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
+			 pixman_op_t              op,
+			 pixman_image_t *         src_image,
+			 pixman_image_t *         mask_image,
+			 pixman_image_t *         dst_image,
+			 int32_t                  src_x,
+			 int32_t                  src_y,
+			 int32_t                  mask_x,
+			 int32_t                  mask_y,
+			 int32_t                  dest_x,
+			 int32_t                  dest_y,
+			 int32_t                  width,
+			 int32_t                  height)
 {
     uint8_t     *dst_line, *dst;
     uint8_t     *mask_line, *mask;
@@ -2801,7 +2799,8 @@ mmx_composite_add_8888_8_8 (pixman_implementation_t *imp,
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     sa = src >> 24;
-    if (sa == 0)
+
+    if (src == 0)
 	return;
 
     vsrc = load8888 (src);
@@ -3278,7 +3277,7 @@ static const pixman_fast_path_t mmx_fast_paths[] =
     { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, mmx_composite_add_8888_8888,   0 },
     { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, mmx_composite_add_8888_8888,   0 },
     { PIXMAN_OP_ADD, PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       mmx_composite_add_8000_8000,   0 },
-    { PIXMAN_OP_ADD, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       mmx_composite_add_8888_8_8,    0 },
+    { PIXMAN_OP_ADD, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       mmx_composite_add_n_8_8,    0 },
 
     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8r8g8b8, mmx_composite_src_n_8_8888, 0 },
     { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8r8g8b8, mmx_composite_src_n_8_8888, 0 },
commit 04619c3636697684fdd9ada9842845f6c8dd3914
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Thu Aug 6 22:46:50 2009 -0400

    Fix a couple more alpha==0 vs src==0 bugs in pixman-mmx.c

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index de672e6..63c6e7e 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -1110,7 +1110,7 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp,
 
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
-    if (src >> 24 == 0)
+    if (src == 0)
 	return;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
@@ -1189,7 +1189,7 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp,
 
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
-    if (src >> 24 == 0)
+    if (src == 0)
 	return;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
commit a075a870fd7e1fa70ae176d5089c695011667388
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Thu Aug 6 22:42:25 2009 -0400

    Make pix_add_mul() in pixman-mmx.c produce exact results.
    
    Previously this routine would compute (x * a + y * b) / 255. Now it
    computes (x * a) / 255 + (y * b) / 255, so that the results are
    bitwise equivalent to the non-mmx versions.

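A concrete case where the two expressions disagree. The sketch below
models pixman's rounded divide by 255 (the t += 0x80;
t = (t + (t >> 8)) >> 8 sequence used throughout the scalar macros):

    #include <stdio.h>
    #include <stdint.h>

    /* pixman-style approximate divide by 255 */
    static uint32_t div255 (uint32_t v)
    {
        v += 0x80;
        return (v + (v >> 8)) >> 8;
    }

    int main (void)
    {
        uint32_t x = 128, a = 128, y = 128, b = 128;

        printf ("%u vs %u\n",
                div255 (x * a + y * b),            /* 129 */
                div255 (x * a) + div255 (y * b));  /* 128 */
        return 0;
    }

Folding both products into one rounding step can differ by one from
rounding each product separately, which is why the MMX, SSE2 and VMX
paths all switch to the per-product form to match the general code.
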
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 60b2b11..de672e6 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -400,25 +400,18 @@ pack_565 (__m64 pixel, __m64 target, int pos)
 static force_inline __m64
 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
 {
-    x = _mm_mullo_pi16 (x, a);
-    y = _mm_mullo_pi16 (y, b);
-    x = _mm_adds_pu16 (x, MC (4x0080));
-    x = _mm_adds_pu16 (x, y);
-    x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8));
-    x = _mm_srli_pi16 (x, 8);
+    x = pix_multiply (x, a);
+    y = pix_multiply (y, b);
 
-    return x;
+    return pix_add (x, y);
 }
 
 #else
 
 #define pix_add_mul(x, a, y, b)	 \
-    ( x = _mm_mullo_pi16 (x, a), \
-      y = _mm_mullo_pi16 (y, b), \
-      x = _mm_adds_pu16 (x, MC (4x0080)), \
-      x = _mm_adds_pu16 (x, y), \
-      x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)), \
-      _mm_srli_pi16 (x, 8) )
+    ( x = pix_multiply (x, a),	 \
+      y = pix_multiply (y, b),	 \
+      pix_add (x, y) )
 
 #endif
 
commit f7463ffafb8876c1f47ed9c527df33d45255e16c
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Thu Aug 6 20:29:44 2009 -0400

    Rewrite the two-component arithmetic macros.
    
    Previously they were not bit-for-bit equivalent to the one-component
    versions. The new code is also simpler and easier to read because it
    factors out some common sub-macros.
    
    The x * a + y * b macro now only uses four multiplications - the
    previous version used eight.

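What the new sub-macros buy, as a stand-alone sketch (assuming the
8-bit instantiation of the template: comp4_t = uint32_t, G_SHIFT = 8,
RB_MASK = 0x00ff00ff, RB_ONE_HALF = 0x00800080):

    #include <stdio.h>
    #include <stdint.h>

    #define G_SHIFT     8
    #define RB_MASK     0x00ff00ffu
    #define RB_ONE_HALF 0x00800080u

    /* the new sub-macro written as a function: scales the two
     * channels packed into one "rb" word by a single 8-bit factor,
     * using just one integer multiply for both */
    static uint32_t unc_rb_mul_unc (uint32_t x, uint32_t a)
    {
        uint32_t t = (x & RB_MASK) * a;
        t += RB_ONE_HALF;
        x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;
        return x & RB_MASK;
    }

    int main (void)
    {
        /* scale red = 0xff and blue = 0x10 by a = 0x80 in one go */
        printf ("%08x\n", unc_rb_mul_unc (0x00ff0010, 0x80)); /* 0x00800008 */
        return 0;
    }

One such multiply covers two channels, so x * a + y * b over all four
channels is two rb-pairs times two products, i.e. the four
multiplications the message mentions, where the old open-coded version
spent eight.
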
diff --git a/pixman/pixman-combine.h.template b/pixman/pixman-combine.h.template
index 2b4bb60..2f6392f 100644
--- a/pixman/pixman-combine.h.template
+++ b/pixman/pixman-combine.h.template
@@ -43,19 +43,56 @@
  */
 
 /*
+ * x_rb = (x_rb * a) / 255
+ */
+#define UNc_rb_MUL_UNc(x, a, t)						\
+    do									\
+    {									\
+	t  = ((x) & RB_MASK) * (a);					\
+	t += RB_ONE_HALF;						\
+	x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
+	x &= RB_MASK;							\
+    } while (0)
+
+/*
+ * x_rb = min (x_rb + y_rb, 255)
+ */
+#define UNc_rb_ADD_UNc_rb(x, y, t)					\
+    do									\
+    {									\
+	t = ((x) + (y));						\
+	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);		\
+	x = (t & RB_MASK);						\
+    } while (0)
+
+/*
+ * x_rb = (x_rb * a_rb) / 255
+ */
+#define UNc_rb_MUL_UNc_rb(x, a, t)					\
+    do									\
+    {									\
+	t  = (x & MASK) * (a & MASK);					\
+	t |= (x & R_MASK) * ((a >> R_SHIFT) & MASK);			\
+	t += RB_ONE_HALF;						\
+	t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
+	x = t & RB_MASK;						\
+    } while (0)
+
+/*
  * x_c = (x_c * a) / 255
  */
 #define UNcx4_MUL_UNc(x, a)						\
     do									\
     {									\
-	comp4_t t = ((x & RB_MASK) * a) + RB_ONE_HALF;                  \
-	t = (t + ((t >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE;  \
-	t &= RB_MASK;                                                   \
-                                                                        \
-	x = (((x >> COMPONENT_SIZE) & RB_MASK) * a) + RB_ONE_HALF;      \
-	x = (x + ((x >> COMPONENT_SIZE) & RB_MASK));                    \
-	x &= RB_MASK << COMPONENT_SIZE;                                 \
-	x += t;                                                         \
+	comp4_t r1, r2, t;						\
+									\
+	r1 = (x);							\
+	UNc_rb_MUL_UNc (r1, a, t);					\
+									\
+	r2 = (x) >> G_SHIFT;						\
+	UNc_rb_MUL_UNc (r2, a, t);					\
+									\
+	x = r1 | (r2 << G_SHIFT);					\
     } while (0)
 
 /*
@@ -64,33 +101,19 @@
 #define UNcx4_MUL_UNc_ADD_UNcx4(x, a, y)				\
     do									\
     {									\
-	/* multiply and divide: trunc((i + 128)*257/65536) */           \
-	comp4_t t = ((x & RB_MASK) * a) + RB_ONE_HALF;                  \
-	t = (t + ((t >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE;  \
-	t &= RB_MASK;                                                   \
-                                                                        \
-	/* add */                                                       \
-	t += y & RB_MASK;                                               \
-                                                                        \
-	/* saturate */                                                  \
-	t |= RB_MASK_PLUS_ONE - ((t >> COMPONENT_SIZE) & RB_MASK);      \
-	t &= RB_MASK;                                                   \
-                                                                        \
-	/* multiply and divide */                                       \
-	x = (((x >> COMPONENT_SIZE) & RB_MASK) * a) + RB_ONE_HALF;      \
-	x = (x + ((x >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE;  \
-	x &= RB_MASK;                                                   \
-                                                                        \
-	/* add */                                                       \
-	x += (y >> COMPONENT_SIZE) & RB_MASK;                           \
-                                                                        \
-	/* saturate */                                                  \
-	x |= RB_MASK_PLUS_ONE - ((x >> COMPONENT_SIZE) & RB_MASK);      \
-	x &= RB_MASK;                                                   \
-                                                                        \
-	/* recombine */                                                 \
-	x <<= COMPONENT_SIZE;                                           \
-	x += t;                                                         \
+	comp4_t r1, r2, r3, t;						\
+									\
+	r1 = (x);							\
+	r2 = (y) & RB_MASK;						\
+	UNc_rb_MUL_UNc (r1, a, t);					\
+	UNc_rb_ADD_UNc_rb (r1, r2, t);					\
+									\
+	r2 = (x) >> G_SHIFT;						\
+	r3 = ((y) >> G_SHIFT) & RB_MASK;				\
+	UNc_rb_MUL_UNc (r2, a, t);					\
+	UNc_rb_ADD_UNc_rb (r2, r3, t);					\
+									\
+	x = r1 | (r2 << G_SHIFT);					\
     } while (0)
 
 /*
@@ -99,32 +122,21 @@
 #define UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc(x, a, y, b)			\
     do									\
     {									\
-	comp4_t t;                                                      \
-	comp4_t r = (x >> A_SHIFT) * a + (y >> A_SHIFT) * b + ONE_HALF; \
-	r += (r >> G_SHIFT);                                            \
-	r >>= G_SHIFT;                                                  \
-                                                                        \
-	t = (x & G_MASK) * a + (y & G_MASK) * b;                        \
-	t += (t >> G_SHIFT) + (ONE_HALF << G_SHIFT);                    \
-	t >>= R_SHIFT;                                                  \
-                                                                        \
-	t |= r << R_SHIFT;                                              \
-	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);             \
-	t &= RB_MASK;                                                   \
-	t <<= G_SHIFT;                                                  \
-                                                                        \
-	r = ((x >> R_SHIFT) & MASK) * a +                               \
-	    ((y >> R_SHIFT) & MASK) * b + ONE_HALF;                     \
-	r += (r >> G_SHIFT);                                            \
-	r >>= G_SHIFT;                                                  \
-                                                                        \
-	x = (x & MASK) * a + (y & MASK) * b + ONE_HALF;                 \
-	x += (x >> G_SHIFT);                                            \
-	x >>= G_SHIFT;                                                  \
-	x |= r << R_SHIFT;                                              \
-	x |= RB_MASK_PLUS_ONE - ((x >> G_SHIFT) & RB_MASK);             \
-	x &= RB_MASK;                                                   \
-	x |= t;                                                         \
+	comp4_t r1, r2, r3, t;						\
+									\
+	r1 = x;								\
+	r2 = y;								\
+	UNc_rb_MUL_UNc (r1, a, t);					\
+	UNc_rb_MUL_UNc (r2, b, t);					\
+	UNc_rb_ADD_UNc_rb (r1, r2, t);					\
+									\
+	r2 = (x >> G_SHIFT);						\
+	r3 = (y >> G_SHIFT);						\
+	UNc_rb_MUL_UNc (r2, a, t);					\
+	UNc_rb_MUL_UNc (r3, b, t);					\
+	UNc_rb_ADD_UNc_rb (r2, r3, t);					\
+									\
+	x = r1 | (r2 << G_SHIFT);					\
     } while (0)
 
 /*
@@ -133,19 +145,17 @@
 #define UNcx4_MUL_UNcx4(x, a)						\
     do									\
     {									\
-	comp4_t t;                                                      \
-	comp4_t r = (x & MASK) * (a & MASK);                            \
-	r |= (x & R_MASK) * ((a >> R_SHIFT) & MASK);                    \
-	r += RB_ONE_HALF;                                               \
-	r = (r + ((r >> G_SHIFT) & RB_MASK)) >> G_SHIFT;                \
-	r &= RB_MASK;                                                   \
-                                                                        \
-	x >>= G_SHIFT;                                                  \
-	t = (x & MASK) * ((a >> G_SHIFT) & MASK);                       \
-	t |= (x & R_MASK) * (a >> A_SHIFT);                             \
-	t += RB_ONE_HALF;                                               \
-	t = t + ((t >> G_SHIFT) & RB_MASK);                             \
-	x = r | (t & AG_MASK);                                          \
+	comp4_t r1, r2, r3, t;						\
+									\
+	r1 = x;								\
+	r2 = a;								\
+	UNc_rb_MUL_UNc_rb (r1, r2, t);					\
+									\
+	r2 = x >> G_SHIFT;						\
+	r3 = a >> G_SHIFT;						\
+	UNc_rb_MUL_UNc_rb (r2, r3, t);					\
+									\
+	x = r1 | (r2 << G_SHIFT);					\
     } while (0)
 
 /*
@@ -154,26 +164,21 @@
 #define UNcx4_MUL_UNcx4_ADD_UNcx4(x, a, y)				\
     do									\
     {									\
-	comp4_t t;                                                      \
-	comp4_t r = (x & MASK) * (a & MASK);                            \
-	r |= (x & R_MASK) * ((a >> R_SHIFT) & MASK);                    \
-	r += RB_ONE_HALF;                                               \
-	r = (r + ((r >> G_SHIFT) & RB_MASK)) >> G_SHIFT;                \
-	r &= RB_MASK;                                                   \
-	r += y & RB_MASK;                                               \
-	r |= RB_MASK_PLUS_ONE - ((r >> G_SHIFT) & RB_MASK);             \
-	r &= RB_MASK;                                                   \
-                                                                        \
-	x >>= G_SHIFT;                                                  \
-	t = (x & MASK) * ((a >> G_SHIFT) & MASK);                       \
-	t |= (x & R_MASK) * (a >> A_SHIFT);                             \
-	t += RB_ONE_HALF;                                               \
-	t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;                \
-	t &= RB_MASK;                                                   \
-	t += (y >> G_SHIFT) & RB_MASK;                                  \
-	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);             \
-	t &= RB_MASK;                                                   \
-	x = r | (t << G_SHIFT);                                         \
+	comp4_t r1, r2, r3, t;						\
+									\
+	r1 = x;								\
+	r2 = a;								\
+	UNc_rb_MUL_UNc_rb (r1, r2, t);					\
+	r2 = y & RB_MASK;						\
+	UNc_rb_ADD_UNc_rb (r1, r2, t);					\
+									\
+	r2 = (x >> G_SHIFT);						\
+	r3 = (a >> G_SHIFT);						\
+	UNc_rb_MUL_UNc_rb (r2, r3, t);					\
+	r3 = (y >> G_SHIFT) & RB_MASK;					\
+	UNc_rb_ADD_UNc_rb (r2, r3, t);					\
+									\
+	x = r1 | (r2 << G_SHIFT);					\
     } while (0)
 
 /*
@@ -182,33 +187,23 @@
 #define UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc(x, a, y, b)			\
     do									\
     {									\
-	comp4_t t;                                                      \
-	comp4_t r = (x >> A_SHIFT) * (a >> A_SHIFT) +                   \
-	    (y >> A_SHIFT) * b;						\
-	r += (r >> G_SHIFT) + ONE_HALF;                                 \
-	r >>= G_SHIFT;                                                  \
-        								\
-	t = (x & G_MASK) * ((a >> G_SHIFT) & MASK) + (y & G_MASK) * b;  \
-	t += (t >> G_SHIFT) + (ONE_HALF << G_SHIFT);                    \
-	t >>= R_SHIFT;                                                  \
-        								\
-	t |= r << R_SHIFT;                                              \
-	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);             \
-	t &= RB_MASK;                                                   \
-	t <<= G_SHIFT;                                                  \
-									\
-	r = ((x >> R_SHIFT) & MASK) * ((a >> R_SHIFT) & MASK) +         \
-	    ((y >> R_SHIFT) & MASK) * b + ONE_HALF;                     \
-	r += (r >> G_SHIFT);                                            \
-	r >>= G_SHIFT;                                                  \
-        								\
-	x = (x & MASK) * (a & MASK) + (y & MASK) * b + ONE_HALF;        \
-	x += (x >> G_SHIFT);                                            \
-	x >>= G_SHIFT;                                                  \
-	x |= r << R_SHIFT;                                              \
-	x |= RB_MASK_PLUS_ONE - ((x >> G_SHIFT) & RB_MASK);             \
-	x &= RB_MASK;                                                   \
-	x |= t;                                                         \
+	comp4_t r1, r2, r3, t;						\
+									\
+	r1 = x;								\
+	r2 = a;								\
+	UNc_rb_MUL_UNc_rb (r1, r2, t);					\
+	r2 = y;								\
+	UNc_rb_MUL_UNc (r2, b, t);					\
+	UNc_rb_ADD_UNc_rb (r1, r2, t);					\
+									\
+	r2 = x >> G_SHIFT;						\
+	r3 = a >> G_SHIFT;						\
+	UNc_rb_MUL_UNc_rb (r2, r3, t);					\
+	r3 = y >> G_SHIFT;						\
+	UNc_rb_MUL_UNc (r3, b, t);					\
+	UNc_rb_ADD_UNc_rb (r2, r3, t);					\
+									\
+	x = r1 | (r2 << G_SHIFT);					\
     } while (0)
 
 /*
@@ -217,13 +212,15 @@
 #define UNcx4_ADD_UNcx4(x, y)						\
     do									\
     {									\
-	comp4_t t;                                                      \
-	comp4_t r = (x & RB_MASK) + (y & RB_MASK);                      \
-	r |= RB_MASK_PLUS_ONE - ((r >> G_SHIFT) & RB_MASK);             \
-	r &= RB_MASK;                                                   \
-        								\
-	t = ((x >> G_SHIFT) & RB_MASK) + ((y >> G_SHIFT) & RB_MASK);    \
-	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);             \
-	r |= (t & RB_MASK) << G_SHIFT;                                  \
-	x = r;                                                          \
+	comp4_t r1, r2, r3, t;						\
+									\
+	r1 = x & RB_MASK;						\
+	r2 = y & RB_MASK;						\
+	UNc_rb_ADD_UNc_rb (r1, r2, t);					\
+									\
+	r2 = (x >> G_SHIFT) & RB_MASK;					\
+	r3 = (y >> G_SHIFT) & RB_MASK;					\
+	UNc_rb_ADD_UNc_rb (r2, r3, t);					\
+									\
+	x = r1 | (r2 << G_SHIFT);					\
     } while (0)
commit 04ae08992f6381a8ffb50d8cba37753fdb58e3bf
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Thu Aug 6 20:41:04 2009 -0400

    Fix a bunch of srca == 0 checks that should be src == 0 in pixman-mmx.c

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 7d4ec4f..60b2b11 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -1285,7 +1285,7 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     srca = src >> 24;
-    if (srca == 0)
+    if (src == 0)
 	return;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
@@ -1772,7 +1772,7 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     srca = src >> 24;
-    if (srca == 0)
+    if (src == 0)
 	return;
 
     srcsrc = (uint64_t)src << 32 | src;
@@ -2054,7 +2054,7 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     srca = src >> 24;
-    if (srca == 0)
+    if (src == 0)
     {
 	pixman_fill_mmx (dst_image->bits.bits, dst_image->bits.rowstride,
 			 PIXMAN_FORMAT_BPP (dst_image->bits.format),
@@ -2189,7 +2189,7 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     srca = src >> 24;
-    if (srca == 0)
+    if (src == 0)
 	return;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
@@ -2548,7 +2548,7 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
     srca = src >> 24;
-    if (srca == 0)
+    if (src == 0)
 	return;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
commit 8bb58a3ce83d6b9c1f6796ce8e62450bdaa52cf0
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Wed Aug 5 21:24:50 2009 -0400

    Don't run fast paths if the format requires wide compositing.
    
    This could happen because the wide formats would still be considered
    solid if the image was 1x1 and repeating.

diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index 1396fb7..961c6c6 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -663,7 +663,8 @@ _pixman_run_fast_path (const pixman_fast_path_t *paths,
 	if (has_fast_path && src->type == BITS)
 	{
 	    has_fast_path = !src->bits.read_func &&
-	                    !src->bits.write_func;
+	                    !src->bits.write_func &&
+		            !PIXMAN_FORMAT_IS_WIDE (src->bits.format);
 	}
     }
 
@@ -674,9 +675,10 @@ _pixman_run_fast_path (const pixman_fast_path_t *paths,
 	                !mask->common.alpha_map &&
 	                !mask->bits.read_func &&
 	                !mask->bits.write_func &&
-			mask->common.filter != PIXMAN_FILTER_CONVOLUTION &&
-			mask->common.repeat != PIXMAN_REPEAT_PAD &&
-			mask->common.repeat != PIXMAN_REPEAT_REFLECT;
+			 mask->common.filter != PIXMAN_FILTER_CONVOLUTION &&
+			 mask->common.repeat != PIXMAN_REPEAT_PAD &&
+			 mask->common.repeat != PIXMAN_REPEAT_REFLECT &&
+	                 !PIXMAN_FORMAT_IS_WIDE (mask->bits.format);
     }
 
     if (has_fast_path)
commit d937b622389797a8c605b2cc50c24ca759dc57d2
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Wed Aug 5 21:16:14 2009 -0400

    Fix bug in combine_mask_alpha_ca()
    
    If the mask was 0xffffffff, the source would end up being shifted
    twice by A_SHIFT.

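What the fixed branch does (with the usual 8-bit shifts, G_SHIFT = 8,
R_SHIFT = 16, A_SHIFT = 24) is replicate a source alpha that is
already sitting in the low byte into all four channels:

    x = 0x000000ff;         /* src alpha, already shifted down */
    x |= x << G_SHIFT;      /* 0x0000ffff                      */
    x |= x << R_SHIFT;      /* 0xffffffff                      */

The extra x = x >> A_SHIFT shifted that low byte away first, so x
collapsed to 0 and a fully opaque source produced a fully transparent
mask.
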
diff --git a/pixman/pixman-combine.c.template b/pixman/pixman-combine.c.template
index f707fe9..c129980 100644
--- a/pixman/pixman-combine.c.template
+++ b/pixman/pixman-combine.c.template
@@ -78,7 +78,6 @@ combine_mask_alpha_ca (const comp4_t *src, comp4_t *mask)
 
     if (a == ~0)
     {
-	x = x >> A_SHIFT;
 	x |= x << G_SHIFT;
 	x |= x << R_SHIFT;
 	*(mask) = x;
commit 0d576b965c34a6d89b00f7b93dba6a7b8737c731
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Wed Aug 5 20:40:36 2009 -0400

    Fix another case of changing the solid source.
    
    This time in fast_composite_over_n_8888_8888_ca().

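The pattern behind this fix and the 0565 one further below: the
UN8x4_* macros are statement macros that write their result back into
their first argument, so passing them the loop-invariant solid src
scales it in place and corrupts every later pixel. A toy reproduction
(SCALE_IN_PLACE is a hypothetical stand-in, not a pixman macro):

    #include <stdio.h>
    #include <stdint.h>

    /* hypothetical stand-in for an in-place UN8x4_* style macro */
    #define SCALE_IN_PLACE(x, a) \
        do { (x) = (uint32_t) (((uint64_t) (x) * (a)) / 255); } while (0)

    int main (void)
    {
        uint32_t src = 0xff0000ff;   /* "constant" solid source */
        int i;

        for (i = 0; i < 2; i++)
        {
            uint32_t s = src;        /* the fix: scale a copy...  */
            SCALE_IN_PLACE (s, 128); /* ...so src itself survives */

            /* the bug was the moral equivalent of
             * SCALE_IN_PLACE (src, 128); after the first pixel,
             * src would stay darkened for the rest of the image */

            printf ("pixel %d: src=%08x s=%08x\n", i, src, s);
        }
        return 0;
    }
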
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index ce97800..7f80578 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -423,7 +423,7 @@ fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                     int32_t                  width,
                                     int32_t                  height)
 {
-    uint32_t src, srca;
+    uint32_t src, srca, s;
     uint32_t    *dst_line, *dst, d;
     uint32_t    *mask_line, *mask, ma;
     int dst_stride, mask_stride;
@@ -459,11 +459,12 @@ fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
 	    else if (ma)
 	    {
 		d = *dst;
+		s = src;
 
-		UN8x4_MUL_UN8x4 (src, ma);
+		UN8x4_MUL_UN8x4 (s, ma);
 		UN8x4_MUL_UN8 (ma, srca);
 		ma = ~ma;
-		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, src);
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
 
 		*dst = d;
 	    }
commit 8b82cbb69197f9c367069a77ba992f3163d40230
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Wed Aug 5 20:31:41 2009 -0400

    Fix incorrect optimization in combine_over_ca().
    
    Previously the code assumed that an alpha of 0 meant that no change
    would take place. This is incorrect because an alpha of 0 can happen
    as the result of the source having alpha=0, but rgb != 0.

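A concrete failure case, assuming combine_mask_ca works as its name
suggests (s becomes s * m and m becomes m * s.alpha; the helper is not
shown in this mail): take s = 0x00ff0000 with mask m = 0xffffffff.
The source alpha is 0, so the combined m becomes 0 and a = ~m == ~0,
yet s itself is still 0x00ff0000. The correct store is

    d = d * 0xff / 0xff + s = d + 0x00ff0000    (per channel, saturating)

so dest's red channel must change; the old a != ~0 shortcut skipped
the store and silently dropped that contribution.
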
diff --git a/pixman/pixman-combine.c.template b/pixman/pixman-combine.c.template
index 59ea1e1..f707fe9 100644
--- a/pixman/pixman-combine.c.template
+++ b/pixman/pixman-combine.c.template
@@ -1610,17 +1610,14 @@ combine_over_ca (pixman_implementation_t *imp,
 	combine_mask_ca (&s, &m);
 
 	a = ~m;
-	if (a != ~0)
+	if (a)
 	{
-	    if (a)
-	    {
-		comp4_t d = *(dest + i);
-		UNcx4_MUL_UNcx4_ADD_UNcx4 (d, a, s);
-		s = d;
-	    }
-
-	    *(dest + i) = s;
+	    comp4_t d = *(dest + i);
+	    UNcx4_MUL_UNcx4_ADD_UNcx4 (d, a, s);
+	    s = d;
 	}
+
+	*(dest + i) = s;
     }
 }
 
commit ec8b36f01030fd2fa67595f2aef4ca568b060899
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Wed Aug 5 18:18:37 2009 -0400

    Don't change the constant source in fast_composite_over_n_8888_0565.

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 40b7f9c..ce97800 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -619,11 +619,11 @@ fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                     int32_t                  width,
                                     int32_t                  height)
 {
-    uint32_t src, srca;
-    uint16_t src16;
-    uint16_t    *dst_line, *dst;
-    uint32_t d;
-    uint32_t    *mask_line, *mask, ma;
+    uint32_t  src, srca, s;
+    uint16_t  src16;
+    uint16_t *dst_line, *dst;
+    uint32_t  d;
+    uint32_t *mask_line, *mask, ma;
     int dst_stride, mask_stride;
     uint16_t w;
 
@@ -667,10 +667,12 @@ fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 		d = *dst;
 		d = CONVERT_0565_TO_0888 (d);
 
-		UN8x4_MUL_UN8x4 (src, ma);
+		s = src;
+
+		UN8x4_MUL_UN8x4 (s, ma);
 		UN8x4_MUL_UN8 (ma, srca);
 		ma = ~ma;
-		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, src);
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
 
 		*dst = CONVERT_8888_TO_0565 (d);
 	    }
commit de8fff746bfa80278f85859bef2dc0ab166f7a69
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Wed Aug 5 16:17:52 2009 -0400

    Fix bugs in combine_over_reverse_ca().
    
    The computation cannot be optimized away when alpha is 0 because that
    can happen when the source has alpha zero and rgb non-zero.

diff --git a/pixman/pixman-combine.c.template b/pixman/pixman-combine.c.template
index 4a0e2bd..59ea1e1 100644
--- a/pixman/pixman-combine.c.template
+++ b/pixman/pixman-combine.c.template
@@ -1644,10 +1644,8 @@ combine_over_reverse_ca (pixman_implementation_t *imp,
 	    comp4_t s = *(src + i);
 	    comp4_t m = *(mask + i);
 
-	    combine_mask_value_ca (&s, &m);
-
-	    if (a != MASK)
-		UNcx4_MUL_UNc_ADD_UNcx4 (s, a, d);
+	    UNcx4_MUL_UNcx4 (s, m);
+	    UNcx4_MUL_UNc_ADD_UNcx4 (s, a, d);
 
 	    *(dest + i) = s;
 	}

