pixman: Branch 'master' - 12 commits

Wed Mar 17 08:34:35 PDT 2010

configure.ac              |   82 ++++++++++++++
 pixman/Makefile.am        |    2 
 pixman/pixman-access.c    |   22 ++-
 pixman/pixman-compiler.h  |   63 ++++++++++-
 pixman/pixman-fast-path.c |  255 ++++++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-image.c     |   30 +++--
 pixman/pixman-private.h   |   10 +
 pixman/pixman.c           |  121 ++++++++++++++++-----
 test/blitters-test.c      |    2 
 9 files changed, 528 insertions(+), 59 deletions(-)

New commits:
commit 265ea1fb4d05a920323f23a02f9dc379312bbdae
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Wed Mar 17 10:50:42 2010 -0400

    Specialize the fast_composite_scaled_nearest_* scalers to positive x units
    
    This avoids a test in the inner loop, which improves performance
    especially for tiled sources.
    
    On x86-32, I get these results:
    
    Before:
    op=1, src_fmt=20028888, dst_fmt=20028888, speed=306.96 MPix/s (73.18 FPS)
    op=1, src_fmt=20028888, dst_fmt=10020565, speed=102.67 MPix/s (24.48 FPS)
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=324.85 MPix/s (77.45 FPS)
    
    After:
    op=1, src_fmt=20028888, dst_fmt=20028888, speed=332.19 MPix/s (79.20 FPS)
    op=1, src_fmt=20028888, dst_fmt=10020565, speed=110.41 MPix/s (26.32 FPS)
    op=1, src_fmt=10020565, dst_fmt=10020565, speed=363.28 MPix/s (86.61 FPS)

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 5b8ff5c..bf5b298 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1485,13 +1485,21 @@ fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementat
 	    x1 = vx >> 16;									\
 	    vx += unit_x;									\
 	    if (do_repeat)									\
-		repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);					\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
 	    s1 = src[x1];									\
 												\
 	    x2 = vx >> 16;									\
 	    vx += unit_x;									\
 	    if (do_repeat)									\
-		repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);					\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
 	    s2 = src[x2];									\
 												\
 	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
@@ -1537,7 +1545,11 @@ fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementat
 	    x1 = vx >> 16;									\
 	    vx += unit_x;									\
 	    if (do_repeat)									\
-		repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);					\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
 	    s1 = src[x1];									\
 												\
 	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
@@ -1806,7 +1818,7 @@ static const pixman_fast_path_t c_fast_paths[] =
 #define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
     {   PIXMAN_OP_ ## op,						\
 	PIXMAN_ ## s,							\
-	SCALED_NEAREST_FLAGS | HAS_NORMAL_REPEAT_FLAGS | FAST_PATH_16BIT_SAFE,	\
+	SCALED_NEAREST_FLAGS | HAS_NORMAL_REPEAT_FLAGS | FAST_PATH_16BIT_SAFE | FAST_PATH_X_UNIT_POSITIVE, \
 	PIXMAN_null, 0,							\
 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
 	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
commit 9cd1051523493e0926b146f05cdde34158391602
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Wed Mar 17 10:35:34 2010 -0400

    Add a FAST_PATH_X_UNIT_POSITIVE flag
    
    This is the common case for a lot of transformed images. If the unit
    were negative, the transformation would be a reflection which is
    fairly rare.

diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
index df5b457..9b44aa9 100644
--- a/pixman/pixman-image.c
+++ b/pixman/pixman-image.c
@@ -301,15 +301,21 @@ compute_image_info (pixman_image_t *image)
     /* Transform */
     if (!image->common.transform)
     {
-	flags |= FAST_PATH_ID_TRANSFORM;
+	flags |= (FAST_PATH_ID_TRANSFORM | FAST_PATH_X_UNIT_POSITIVE);
     }
-    else if (image->common.transform->matrix[0][1] == 0 &&
-	     image->common.transform->matrix[1][0] == 0 &&
-	     image->common.transform->matrix[2][0] == 0 &&
-	     image->common.transform->matrix[2][1] == 0 &&
-	     image->common.transform->matrix[2][2] == pixman_fixed_1)
+    else
     {
-	flags |= FAST_PATH_SCALE_TRANSFORM;
+	if (image->common.transform->matrix[0][1] == 0 &&
+	    image->common.transform->matrix[1][0] == 0 &&
+	    image->common.transform->matrix[2][0] == 0 &&
+	    image->common.transform->matrix[2][1] == 0 &&
+	    image->common.transform->matrix[2][2] == pixman_fixed_1)
+	{
+	    flags |= FAST_PATH_SCALE_TRANSFORM;
+	}
+
+	if (image->common.transform->matrix[0][0] > 0)
+	    flags |= FAST_PATH_X_UNIT_POSITIVE;
     }
 
     /* Alpha map */
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 0cf9113..d5767af 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -582,6 +582,7 @@ _pixman_choose_implementation (void);
 #define FAST_PATH_NO_NONE_REPEAT		(1 << 15)
 #define FAST_PATH_SAMPLES_COVER_CLIP		(1 << 16)
 #define FAST_PATH_16BIT_SAFE			(1 << 17)
+#define FAST_PATH_X_UNIT_POSITIVE		(1 << 18)
 
 #define _FAST_PATH_STANDARD_FLAGS					\
     (FAST_PATH_ID_TRANSFORM		|				\
commit a5b51bb03c5c1258d7558efa13eca6c570e34ce6
Author: Alexander Larsson <alexl at redhat.com>
Date:   Wed Mar 17 11:58:05 2010 +0100

    Use the right format for the OVER_8888_565 fast path

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 6607a47..5b8ff5c 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1836,7 +1836,7 @@ static const pixman_fast_path_t c_fast_paths[] =
     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, x888_x888),
     SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, x888_x888),
 
-    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, r5g6b5, 8888_565),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
 
 #define NEAREST_FAST_PATH(op,s,d)		\
     {   PIXMAN_OP_ ## op,			\
commit 3b92b711d031a7752e06d0a5f688f4c54f50a1e6
Author: Alexander Larsson <alexl at redhat.com>
Date:   Fri Mar 12 15:45:04 2010 +0100

    Add specialized fast nearest scalers
    
    This is a macroized version of SRC/OVER repeat normal/unneeded nearest
    neighbour scaling instantiated for some common 8888 and 565 formats.
    
    Based on work by Siarhei Siamashka

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 4d26b0f..6607a47 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -27,6 +27,7 @@
 #include <config.h>
 #endif
 #include <string.h>
+#include <stdlib.h>
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 
@@ -1373,6 +1374,208 @@ repeat (pixman_repeat_t repeat, int *c, int size)
     return TRUE;
 }
 
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions, one that handles repeat normal,
+ * and one without repeat handling that only works if the src region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instructions latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
+
+#define GET_8888_ALPHA(s) ((s) >> 24)
+ /* This is not actually used since we don't have an OVER with
+    565 source, but it is needed to build. */
+#define GET_0565_ALPHA(s) 0xff
+
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,					\
+		     src_type_t, dst_type_t, OP, do_repeat)					\
+static void											\
+fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementation_t *imp,	\
+							      pixman_op_t              op,      \
+							      pixman_image_t *         src_image, \
+							      pixman_image_t *         mask_image, \
+							      pixman_image_t *         dst_image, \
+							      int32_t                  src_x,   \
+							      int32_t                  src_y,   \
+							      int32_t                  mask_x,  \
+							      int32_t                  mask_y,  \
+							      int32_t                  dst_x,   \
+							      int32_t                  dst_y,   \
+							      int32_t                  width,   \
+							      int32_t                  height)  \
+{												\
+    dst_type_t *dst_line;									\
+    src_type_t *src_first_line;									\
+    uint32_t   d;										\
+    src_type_t s1, s2;										\
+    uint8_t   a1, a2;										\
+    int       w;										\
+    int       x1, x2, y;									\
+    pixman_fixed_t orig_vx;									\
+    pixman_fixed_t max_vx, max_vy;								\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+												\
+    src_type_t *src;										\
+    dst_type_t *dst;										\
+    int       src_stride, dst_stride;								\
+												\
+    if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
+	abort();										\
+												\
+    PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1);	\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
+    v.vector[0] -= pixman_fixed_e;								\
+    v.vector[1] -= pixman_fixed_e;								\
+												\
+    vx = v.vector[0];										\
+    vy = v.vector[1];										\
+												\
+    if (do_repeat)										\
+    {												\
+	/* Clamp repeating positions inside the actual samples */				\
+	max_vx = src_image->bits.width << 16;							\
+	max_vy = src_image->bits.height << 16;							\
+												\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
+	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+    }												\
+												\
+    orig_vx = vx;										\
+												\
+    while (--height >= 0)									\
+    {												\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+												\
+	y = vy >> 16;										\
+	vy += unit_y;										\
+	if (do_repeat)										\
+	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+												\
+	src = src_first_line + src_stride * y;							\
+												\
+	w = width;										\
+	vx = orig_vx;										\
+	while ((w -= 2) >= 0)									\
+	{											\
+	    x1 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (do_repeat)									\
+		repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);					\
+	    s1 = src[x1];									\
+												\
+	    x2 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (do_repeat)									\
+		repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);					\
+	    s2 = src[x2];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+												\
+		if (a2 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+		}										\
+		else if (s2)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    a2 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		    *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		    *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+	    }											\
+	}											\
+												\
+	if (w & 1)										\
+	{											\
+	    x1 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (do_repeat)									\
+		repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);					\
+	    s1 = src[x1];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		    *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+	    }											\
+	}											\
+    }												\
+}
+
+FAST_NEAREST(x888_x888_none, 8888, 8888, uint32_t, uint32_t, SRC, /*repeat: */ 0);
+FAST_NEAREST(x888_x888_normal, 8888, 8888, uint32_t, uint32_t, SRC, /*repeat: */ 1);
+FAST_NEAREST(x888_x888_none, 8888, 8888, uint32_t, uint32_t, OVER, /*repeat: */ 0);
+FAST_NEAREST(x888_x888_normal, 8888, 8888, uint32_t, uint32_t, OVER, /*repeat: */ 1);
+FAST_NEAREST(x888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, /*repeat: */ 0);
+FAST_NEAREST(x888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, /*repeat: */ 1);
+FAST_NEAREST(565_565_none, 0565, 0565, uint16_t, uint16_t, SRC, /*repeat: */ 0);
+FAST_NEAREST(565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, /*repeat: */ 1);
+FAST_NEAREST(8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, /*repeat: */ 0);
+FAST_NEAREST(8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, /*repeat: */ 1);
+
 static force_inline uint32_t
 fetch_nearest (pixman_repeat_t src_repeat,
 	       pixman_format_code_t format,
@@ -1595,6 +1798,46 @@ static const pixman_fast_path_t c_fast_paths[] =
      FAST_PATH_NO_ACCESSORS	|					\
      FAST_PATH_NO_WIDE_FORMAT)
 
+#define HAS_NORMAL_REPEAT_FLAGS						\
+    (FAST_PATH_NO_REFLECT_REPEAT |					\
+     FAST_PATH_NO_PAD_REPEAT     |					\
+     FAST_PATH_NO_NONE_REPEAT)
+
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | HAS_NORMAL_REPEAT_FLAGS | FAST_PATH_16BIT_SAFE,	\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    },									\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, x888_x888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, x888_x888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, x888_565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, x888_565),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, x888_x888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, r5g6b5, 8888_565),
+
 #define NEAREST_FAST_PATH(op,s,d)		\
     {   PIXMAN_OP_ ## op,			\
 	PIXMAN_ ## s, SCALED_NEAREST_FLAGS,	\
commit 5750408e48259f42373a5233231104d9bd3eb35a
Author: Alexander Larsson <alexl at redhat.com>
Date:   Fri Mar 12 15:41:01 2010 +0100

    Add FAST_PATH_SAMPLES_COVER_CLIP and FAST_PATH_16BIT_SAFE
    
    FAST_PATH_SAMPLES_COVER_CLIP:
    
    This is set if the source sample grid, unrepeated but transformed,
    completely covers the clip destination. If this is set
    you can use a simple scaler that doesn't have to care about the repeat
    mode.
    
    FAST_PATH_16BIT_SAFE:
    
    This signifies two things:
    1) The size of the src/mask fits in a 16.16 fixed point, so something like:
    
        max_vx = src_image->bits.width << 16;
    
        Is allowed and is guaranteed to not overflow max_vx
    
    2) When stepping the source space we're guaranteed to never overflow
       a 16.16 fixed point variable, even if we step one extra step
       in the destination space. This means that a loop doing:
    
       x = vx >> 16;
       vx += unit_x;
       d = src_row[x];
    
       will never overflow vx causing x to be negative.
    
       And additionally, if you track vx like above and apply NORMAL repeat
       after the vx addition with something like:
    
       while (vx >= max_vx) vx -= max_vx;
    
       This will never overflow the vx even on the final increment that
       takes vx one past the end of where we will read, which makes the
       repeat loop safe.

diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 65314b9..0cf9113 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -580,6 +580,8 @@ _pixman_choose_implementation (void);
 #define FAST_PATH_IS_OPAQUE			(1 << 13)
 #define FAST_PATH_NEEDS_WORKAROUND		(1 << 14)
 #define FAST_PATH_NO_NONE_REPEAT		(1 << 15)
+#define FAST_PATH_SAMPLES_COVER_CLIP		(1 << 16)
+#define FAST_PATH_16BIT_SAFE			(1 << 17)
 
 #define _FAST_PATH_STANDARD_FLAGS					\
     (FAST_PATH_ID_TRANSFORM		|				\
diff --git a/pixman/pixman.c b/pixman/pixman.c
index 68483a0..56c9536 100644
--- a/pixman/pixman.c
+++ b/pixman/pixman.c
@@ -479,24 +479,75 @@ walk_region_internal (pixman_implementation_t *imp,
     }
 }
 
-static force_inline pixman_bool_t
-image_covers (pixman_image_t *image,
-              pixman_box32_t *extents,
-              int             x,
-              int             y)
+#define IS_16BIT(x) (((x) >= INT16_MIN) && ((x) <= INT16_MAX))
+
+static force_inline uint32_t
+compute_src_extents_flags (pixman_image_t *image,
+			   pixman_box32_t *extents,
+			   int             x,
+			   int             y)
 {
-    if (image->common.type == BITS &&
-	image->common.repeat == PIXMAN_REPEAT_NONE)
+    pixman_box16_t extents16;
+    uint32_t flags;
+
+    flags = FAST_PATH_COVERS_CLIP;
+
+    if (image->common.type != BITS)
+	return flags;
+
+    if (image->common.repeat == PIXMAN_REPEAT_NONE &&
+	(x > extents->x1 || y > extents->y1 ||
+	 x + image->bits.width < extents->x2 ||
+	 y + image->bits.height < extents->y2))
+    {
+	flags &= ~FAST_PATH_COVERS_CLIP;
+    }
+
+    if (IS_16BIT (extents->x1 - x) &&
+	IS_16BIT (extents->y1 - y) &&
+	IS_16BIT (extents->x2 - x) &&
+	IS_16BIT (extents->y2 - y))
     {
-	if (x > extents->x1 || y > extents->y1 ||
-	    x + image->bits.width < extents->x2 ||
-	    y + image->bits.height < extents->y2)
+	extents16.x1 = extents->x1 - x;
+	extents16.y1 = extents->y1 - y;
+	extents16.x2 = extents->x2 - x;
+	extents16.y2 = extents->y2 - y;
+
+	if (!image->common.transform ||
+	    pixman_transform_bounds (image->common.transform, &extents16))
 	{
-	    return FALSE;
+	    if (extents16.x1 >= 0  && extents16.y1 >= 0 &&
+		extents16.x2 <= image->bits.width &&
+		extents16.y2 <= image->bits.height)
+	    {
+		flags |= FAST_PATH_SAMPLES_COVER_CLIP;
+	    }
 	}
     }
 
-    return TRUE;
+    if (IS_16BIT (extents->x1 - x - 1) &&
+	IS_16BIT (extents->y1 - y - 1) &&
+	IS_16BIT (extents->x2 - x + 1) &&
+	IS_16BIT (extents->y2 - y + 1))
+    {
+	extents16.x1 = extents->x1 - x - 1;
+	extents16.y1 = extents->y1 - y - 1;
+	extents16.x2 = extents->x2 - x + 1;
+	extents16.y2 = extents->y2 - y + 1;
+
+	if (/* src space expanded by one in dest space fits in 16 bit */
+	    (!image->common.transform ||
+	     pixman_transform_bounds (image->common.transform, &extents16)) &&
+	    /* And src image size can be used as 16.16 fixed point */
+	    image->bits.width < 0x7fff &&
+	    image->bits.height < 0x7fff)
+	{
+	    /* Then we're "16bit safe" */
+	    flags |= FAST_PATH_16BIT_SAFE;
+	}
+    }
+
+    return flags;
 }
 
 #define N_CACHED_FAST_PATHS 8
@@ -588,11 +639,10 @@ do_composite (pixman_implementation_t *imp,
     
     extents = pixman_region32_extents (&region);
     
-    if (image_covers (src, extents, dest_x - src_x, dest_y - src_y))
-	src_flags |= FAST_PATH_COVERS_CLIP;
-    
-    if (mask && image_covers (mask, extents, dest_x - mask_x, dest_y - mask_y))
-	mask_flags |= FAST_PATH_COVERS_CLIP;
+    src_flags |= compute_src_extents_flags (src, extents, dest_x - src_x, dest_y - src_y);
+
+    if (mask)
+	mask_flags |= compute_src_extents_flags (mask, extents, dest_x - mask_x, dest_y - mask_y);
 
     /*
      * Check if we can replace our operator by a simpler one
commit cba6fbbddce5edfd8e28ef570c493b044761f870
Author: Alexander Larsson <alexl at redhat.com>
Date:   Fri Mar 12 15:40:07 2010 +0100

    Add FAST_PATH_NO_NONE_REPEAT flag

diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
index d09d193..df5b457 100644
--- a/pixman/pixman-image.c
+++ b/pixman/pixman-image.c
@@ -335,16 +335,20 @@ compute_image_info (pixman_image_t *image)
     /* Repeat mode */
     switch (image->common.repeat)
     {
+    case PIXMAN_REPEAT_NONE:
+	flags |= FAST_PATH_NO_REFLECT_REPEAT | FAST_PATH_NO_PAD_REPEAT;
+	break;
+
     case PIXMAN_REPEAT_REFLECT:
-	flags |= FAST_PATH_NO_PAD_REPEAT;
+	flags |= FAST_PATH_NO_PAD_REPEAT | FAST_PATH_NO_NONE_REPEAT;
 	break;
 
     case PIXMAN_REPEAT_PAD:
-	flags |= FAST_PATH_NO_REFLECT_REPEAT;
+	flags |= FAST_PATH_NO_REFLECT_REPEAT | FAST_PATH_NO_NONE_REPEAT;
 	break;
 
     default:
-	flags |= (FAST_PATH_NO_REFLECT_REPEAT | FAST_PATH_NO_PAD_REPEAT);
+	flags |= FAST_PATH_NO_REFLECT_REPEAT | FAST_PATH_NO_PAD_REPEAT | FAST_PATH_NO_NONE_REPEAT;
 	break;
     }
 
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index bc41249..65314b9 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -579,6 +579,7 @@ _pixman_choose_implementation (void);
 #define FAST_PATH_SIMPLE_REPEAT			(1 << 12)
 #define FAST_PATH_IS_OPAQUE			(1 << 13)
 #define FAST_PATH_NEEDS_WORKAROUND		(1 << 14)
+#define FAST_PATH_NO_NONE_REPEAT		(1 << 15)
 
 #define _FAST_PATH_STANDARD_FLAGS					\
     (FAST_PATH_ID_TRANSFORM		|				\
commit 7ec023ede155b9dacf574c4323740ef981802aa9
Author: Alexander Larsson <alexl at redhat.com>
Date:   Tue Mar 16 14:18:29 2010 +0100

    Add CONVERT_8888_TO_8888 and CONVERT_0565_TO_0565 macros
    
    These are useful for macroization

diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index d0bec39..bc41249 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -706,6 +706,10 @@ pixman_region16_copy_from_region32 (pixman_region16_t *dst,
 
 #define CONVERT_0565_TO_8888(s) (CONVERT_0565_TO_0888(s) | 0xff000000)
 
+/* Trivial versions that are useful in macros */
+#define CONVERT_8888_TO_8888(s) (s)
+#define CONVERT_0565_TO_0565(s) (s)
+
 #define PIXMAN_FORMAT_IS_WIDE(f)					\
     (PIXMAN_FORMAT_A (f) > 8 ||						\
      PIXMAN_FORMAT_R (f) > 8 ||						\
commit c903d03052e1c34478556964338959b34928a388
Author: Alexander Larsson <alexl at redhat.com>
Date:   Fri Mar 12 16:23:42 2010 +0100

    Add CONVERT_0565_TO_8888 macro
    
    This lets us simplify some fast paths since we get a consistent
    naming that always has 8888 and gets some value for alpha.

diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 9dcdca7..d0bec39 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -704,6 +704,8 @@ pixman_region16_copy_from_region32 (pixman_region16_t *dst,
      ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |			\
      ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)))
 
+#define CONVERT_0565_TO_8888(s) (CONVERT_0565_TO_0888(s) | 0xff000000)
+
 #define PIXMAN_FORMAT_IS_WIDE(f)					\
     (PIXMAN_FORMAT_A (f) > 8 ||						\
      PIXMAN_FORMAT_R (f) > 8 ||						\
commit de27f45ddd46fc48ec9598f2f177155328d55580
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Mon Mar 15 11:51:09 2010 -0400

    Ensure that only the low 4 bits of 4 bit pixels are stored.
    
    In some cases we end up trying to use the STORE_4 macro with an 8 bit
    value, which resulted in other pixels getting overwritten. Fix this
    by always masking the value down to its low 4 bits.
    
    This fixes blitters-test on big-endian machines.

diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c
index 389cf2a..fa0a267 100644
--- a/pixman/pixman-access.c
+++ b/pixman/pixman-access.c
@@ -2445,9 +2445,12 @@ store_scanline_x4a4 (bits_image_t *  image,
     do									\
     {									\
 	int bo = 4 * (o);						\
-	STORE_8 (img, l, bo, (bo & 4 ?					\
-			      (FETCH_8 (img, l, bo) & 0xf0) | (v) :	\
-			      (FETCH_8 (img, l, bo) & 0x0f) | ((v) << 4))); \
+	int v4 = (v) & 0x0f;						\
+									\
+	STORE_8 (img, l, bo, (						\
+		     bo & 4 ?						\
+		     (FETCH_8 (img, l, bo) & 0xf0) | (v4) :		\
+		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4)));	\
     } while (0)
 #else
 
@@ -2455,9 +2458,12 @@ store_scanline_x4a4 (bits_image_t *  image,
     do									\
     {									\
 	int bo = 4 * (o);						\
-	STORE_8 (img, l, bo, (bo & 4 ?					\
-			      (FETCH_8 (img, l, bo) & 0x0f) | ((v) << 4) : \
-			      (FETCH_8 (img, l, bo) & 0xf0) | (v)));	\
+	int v4 = (v) & 0x0f;						\
+									\
+	STORE_8 (img, l, bo, (						\
+		     bo & 4 ?						\
+		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4) :	\
+		     (FETCH_8 (img, l, bo) & 0xf0) | (v4)));		\
     } while (0)
 #endif
 
@@ -2484,11 +2490,11 @@ store_scanline_r1g2b1 (bits_image_t *  image,
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     int i;
-    
+
     for (i = 0; i < width; ++i)
     {
 	uint32_t pixel;
-	
+
 	SPLIT (values[i]);
 	pixel = (((r >> 4) & 0x8) |
 	         ((g >> 5) & 0x6) |
diff --git a/test/blitters-test.c b/test/blitters-test.c
index c11917d..5e33031 100644
--- a/test/blitters-test.c
+++ b/test/blitters-test.c
@@ -482,7 +482,7 @@ main (int argc, char *argv[])
 	    /* Predefined value for running with all the fastpath functions
 	       disabled. It needs to be updated every time when changes are
 	       introduced to this program or behavior of pixman changes! */
-	    if (crc == 0xEF7A1179)
+	    if (crc == 0xA058F792)
 	    {
 		printf ("blitters test passed\n");
 	    }
commit 6532f8488abffb89501cb76de7d80b8ab2d49aed
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Tue Mar 16 08:17:10 2010 -0400

    Fix contact address in configure.ac

diff --git a/configure.ac b/configure.ac
index 0bf5658..4dccfda 100644
--- a/configure.ac
+++ b/configure.ac
@@ -58,7 +58,7 @@ m4_define([pixman_micro], 11)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 
-AC_INIT(pixman, pixman_version, "sandmann at daimi.au.dk", pixman)
+AC_INIT(pixman, pixman_version, "pixman at lists.freedesktop.org", pixman)
 AM_INIT_AUTOMAKE([foreign dist-bzip2])
 
 # Suppress verbose compile lines
commit 7c9f121efe7ee6afafad8b294974f5498054559b
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Tue Mar 16 12:23:50 2010 -0400

    Add PIXMAN_DEFINE_THREAD_LOCAL() and PIXMAN_GET_THREAD_LOCAL() macros
    
    These macros hide the various types of thread local support. On Linux
    and Unix, they expand to just __thread. On Microsoft Visual C++, they
    expand to __declspec(thread).
    
    On OS X and other systems that don't have __thread, they expand to a
    complicated concoction that uses pthread_once() and
    pthread_get/setspecific() to get thread local variables.

diff --git a/pixman/pixman-compiler.h b/pixman/pixman-compiler.h
index 5aeef86..a4e3f88 100644
--- a/pixman/pixman-compiler.h
+++ b/pixman/pixman-compiler.h
@@ -70,11 +70,62 @@
 #endif
 
 /* TLS */
-#if (defined (__GNUC__) && ((__GNUC__ == 3 && __GNUC_MINOR >= 3) || __GNUC__ > 3)) || defined(__SUNPRO_C)
-#    define THREAD_LOCAL __thread
-#elif defined (_MSC_VER)
-#    define THREAD_LOCAL __declspec(thread)
+#if defined(TOOLCHAIN_SUPPORTS__THREAD)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static __thread type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    (&name)
+
+#elif defined(_MSC_VER)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static __declspec(thread) type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    (&name)
+
+#elif defined(HAVE_PTHREAD_SETSPECIFIC)
+
+#include <pthread.h>
+
+#  define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static pthread_once_t tls_ ## name ## _once_control = PTHREAD_ONCE_INIT; \
+    static pthread_key_t tls_ ## name ## _key;				\
+									\
+    static void								\
+    tls_ ## name ## _make_key (void)					\
+    {									\
+	pthread_key_create (&tls_ ## name ## _key, NULL);		\
+    }									\
+									\
+    static type *							\
+    tls_ ## name ## _alloc (key)					\
+    {									\
+	type *value = malloc (sizeof (type));				\
+	if (value)							\
+	    pthread_setspecific (key, value);				\
+	return value;							\
+    }									\
+									\
+    static force_inline type *						\
+    tls_ ## name ## _get (key)						\
+    {									\
+	type *value = NULL;						\
+	if (pthread_once (&tls_ ## name ## _once_control,		\
+			  tls_ ## name ## _make_key) == 0)		\
+	{								\
+	    value = pthread_getspecific (tls_ ## name ## _key);		\
+	    if (!value)							\
+		value = tls_ ## name ## _alloc (key);			\
+	}								\
+	return value;							\
+    }
+
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    tls_ ## name ## _get (tls_ ## name ## _key)
+
 #else
-#    warning "unknown compiler"
-#    define THREAD_LOCAL __thread
+
+#    error "Unknown thread local support for this system"
+
 #endif
diff --git a/pixman/pixman.c b/pixman/pixman.c
index c71617e..68483a0 100644
--- a/pixman/pixman.c
+++ b/pixman/pixman.c
@@ -499,6 +499,15 @@ image_covers (pixman_image_t *image,
     return TRUE;
 }
 
+#define N_CACHED_FAST_PATHS 8
+
+typedef struct
+{
+    pixman_fast_path_t cache [N_CACHED_FAST_PATHS];
+} cache_t;
+
+PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache);
+
 static void
 do_composite (pixman_implementation_t *imp,
 	      pixman_op_t	       op,
@@ -514,8 +523,6 @@ do_composite (pixman_implementation_t *imp,
 	      int		       width,
 	      int		       height)
 {
-#define N_CACHED_FAST_PATHS 8
-    static THREAD_LOCAL pixman_fast_path_t tls_cache[N_CACHED_FAST_PATHS];
     pixman_format_code_t src_format, mask_format, dest_format;
     uint32_t src_flags, mask_flags, dest_flags;
     pixman_region32_t region;
@@ -527,8 +534,8 @@ do_composite (pixman_implementation_t *imp,
     uint32_t *dest_bits;
     int dest_dx, dest_dy;
     pixman_bool_t need_workaround;
-    pixman_fast_path_t *cache;
     const pixman_fast_path_t *info;
+    cache_t *cache;
     int i;
 
     src_format = src->common.extended_format_code;
@@ -597,11 +604,11 @@ do_composite (pixman_implementation_t *imp,
 	return;
 
     /* Check cache for fast paths */
-    cache = tls_cache;
+    cache = PIXMAN_GET_THREAD_LOCAL (fast_path_cache);
 
     for (i = 0; i < N_CACHED_FAST_PATHS; ++i)
     {
-	info = &(cache[i]);
+	info = &(cache->cache[i]);
 
 	/* Note that we check for equality here, not whether
 	 * the cached fast path matches. This is to prevent
@@ -677,16 +684,16 @@ found:
 	pixman_composite_func_t func = info->func;
 	
 	while (i--)
-	    cache[i + 1] = cache[i];
-
-	cache[0].op = op;
-	cache[0].src_format = src_format;
-	cache[0].src_flags = src_flags;
-	cache[0].mask_format = mask_format;
-	cache[0].mask_flags = mask_flags;
-	cache[0].dest_format = dest_format;
-	cache[0].dest_flags = dest_flags;
-	cache[0].func = func;
+	    cache->cache[i + 1] = cache->cache[i];
+
+	cache->cache[0].op = op;
+	cache->cache[0].src_format = src_format;
+	cache->cache[0].src_flags = src_flags;
+	cache->cache[0].mask_format = mask_format;
+	cache->cache[0].mask_flags = mask_flags;
+	cache->cache[0].dest_format = dest_format;
+	cache->cache[0].dest_flags = dest_flags;
+	cache->cache[0].func = func;
     }
 
 out:
commit 6b9c54820015f69e667ed54441e83042c9a84cc1
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Tue Mar 16 11:01:08 2010 -0400

    Add checks for various types of thread local storage.
    
    OS X does not support __thread, so we have to check for it before
    using it.  It does however support pthread_get/setspecific(), so if we
    don't have __thread, check if those are available.

diff --git a/configure.ac b/configure.ac
index fed97b1..0bf5658 100644
--- a/configure.ac
+++ b/configure.ac
@@ -523,6 +523,86 @@ if test x$have_posix_memalign = xyes; then
    AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
 fi
 
+dnl =====================================
+dnl Thread local storage
+dnl Prefer the __thread keyword; otherwise fall back to POSIX thread-specific data.
+support_for__thread=no
+
+AC_MSG_CHECKING(for __thread)
+AC_COMPILE_IFELSE([
+__thread int x ;
+int main () { return 0; }
+], support_for__thread=yes)
+
+if test $support_for__thread = yes; then
+   AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread])
+fi
+
+AC_MSG_RESULT($support_for__thread)
+
+dnl posix tls: only probed when __thread is unavailable
+
+if test $support_for__thread = no; then
+
+support_for_pthread_setspecific=no
+
+AC_MSG_CHECKING(for pthread_setspecific)
+
+save_LDFLAGS=$LDFLAGS
+dnl Append so that user-supplied LDFLAGS still take part in the link test
+LDFLAGS="$LDFLAGS -pthread"
+
+AC_LINK_IFELSE([
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+    pthread_key_create (&key, NULL);
+}
+
+int
+main (void)
+{
+    void *value = NULL;
+
+    if (pthread_once (&once_control, make_key) != 0)
+    {
+	value = NULL;
+    }
+    else
+    {
+	value = pthread_getspecific (key);
+	if (!value)
+	{
+	    value = malloc (100);
+	    pthread_setspecific (key, value);
+	}
+    }
+
+    return 0;
+}
+], support_for_pthread_setspecific=yes);
+
+LDFLAGS=$save_LDFLAGS
+
+if test $support_for_pthread_setspecific = yes; then
+   PTHREAD_LDFLAGS="-pthread"
+   AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
+fi
+
+AC_MSG_RESULT($support_for_pthread_setspecific);
+
+fi
+
+AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
+AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
+AC_SUBST(PTHREAD_LDFLAGS)
+
 AC_OUTPUT([pixman-1.pc
            pixman-1-uninstalled.pc
            Makefile
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 8ac6827..5a0e7a9 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -1,5 +1,5 @@
 lib_LTLIBRARIES = libpixman-1.la
-libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined
+libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined @PTHREAD_LDFLAGS@ 
 libpixman_1_la_LIBADD = @DEP_LIBS@ -lm
 libpixman_1_la_SOURCES =			\
 	pixman.h				\