pixman: Branch 'master' - 6 commits

Tue Sep 21 03:38:36 PDT 2010

pixman/Makefile.am        |    1 
 pixman/pixman-fast-path.c |  268 +--------------------------
 pixman/pixman-fast-path.h |  443 ++++++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-sse2.c      |  115 +++++++++++
 4 files changed, 570 insertions(+), 257 deletions(-)

New commits:
commit 517a77a992255cb6dae7e74bc6f6b9ac21003ac1
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Wed Sep 8 09:16:12 2010 +0300

    SSE2 optimization for scaled over_8888_8888 operation with nearest filter
    
    This is the first demo implementation, it should be possible to
    generalize it later to cover more operations with less lines of code.
    
    It should be also possible to introduce the use of '__builtin_constant_p'
    gcc builtin function for an efficient way of checking if 'unit_x' is known
    to be zero at compile time (when processing padding pixels for NONE, or
    PAD repeat).
    
    Benchmarks from Intel Core i7 860:
    
    == before (nearest OVER) ==
    op=3, src_fmt=20028888, dst_fmt=20028888, speed=142.01 MPix/s
    
    == after (nearest OVER) ==
    op=3, src_fmt=20028888, dst_fmt=20028888, speed=314.99 MPix/s
    
    == performance of nonscaled operation as a reference ==
    op=3, src_fmt=20028888, dst_fmt=20028888, speed=652.09 MPix/s

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 3dd7967..33d71ee 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -35,6 +35,7 @@
 #include <emmintrin.h> /* for SSE2 intrinsics */
 #include "pixman-private.h"
 #include "pixman-combine32.h"
+#include "pixman-fast-path.h"
 
 #if defined(_MSC_VER) && defined(_M_AMD64)
 /* Windows 64 doesn't allow MMX to be used, so
@@ -6346,6 +6347,107 @@ sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
     _mm_empty ();
 }
 
+/* A variant of 'core_combine_over_u_sse2' with minor tweaks */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
+                                             const uint32_t* ps,
+                                             int32_t         w,
+                                             pixman_fixed_t  vx,
+                                             pixman_fixed_t  unit_x,
+                                             pixman_fixed_t  max_vx)
+{
+    uint32_t s, d;
+    const uint32_t* pm = NULL;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps + (vx >> 16), pm);
+	vx += unit_x;
+
+	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	if (pm)
+	    pm++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i tmp;
+	uint32_t tmp1, tmp2, tmp3, tmp4;
+
+	tmp1 = ps[vx >> 16];
+	vx += unit_x;
+	tmp2 = ps[vx >> 16];
+	vx += unit_x;
+	tmp3 = ps[vx >> 16];
+	vx += unit_x;
+	tmp4 = ps[vx >> 16];
+	vx += unit_x;
+
+	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+	if (is_opaque (xmm_src_hi))
+	{
+	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
+	}
+	else if (!is_zero (xmm_src_hi))
+	{
+	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	    expand_alpha_2x128 (
+		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst_lo, &xmm_dst_hi);
+
+	    /* rebuid the 4 pixel data and save*/
+	    save_128_aligned ((__m128i*)pd,
+			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	}
+
+	w -= 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps + (vx >> 16), pm);
+	vx += unit_x;
+
+	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	if (pm)
+	    pm++;
+
+	w--;
+    }
+    _mm_empty ();
+}
+
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, COVER);
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, NONE);
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, PAD);
+
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
     /* PIXMAN_OP_OVER */
@@ -6429,6 +6531,19 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
 
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
     { PIXMAN_OP_NONE },
 };
 
commit abc90dad57f03bf9293fc825835c6f0fddc6771b
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Thu Sep 16 18:25:40 2010 +0300

    NONE repeat support for fast scaling with nearest filter
    
    Implemented very similar to PAD repeat.
    
    And gcc also seems to be able to completely eliminate the
    code responsible for left and right padding pixels for OVER
    operation with NONE repeat.

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 5b10d65..3bf03d5 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1388,18 +1388,23 @@ fast_composite_src_memcpy (pixman_implementation_t *imp,
 }
 
 FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER);
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE);
 FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD);
 FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL);
 FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER);
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE);
 FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD);
 FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL);
 FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER);
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE);
 FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL);
 FAST_NEAREST (565_565_cover, 0565, 0565, uint16_t, uint16_t, SRC, COVER);
+FAST_NEAREST (565_565_none, 0565, 0565, uint16_t, uint16_t, SRC, NONE);
 FAST_NEAREST (565_565_pad, 0565, 0565, uint16_t, uint16_t, SRC, PAD);
 FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL);
 FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER);
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE);
 FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL);
 
diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h
index 7c14379..a6b5414 100644
--- a/pixman/pixman-fast-path.h
+++ b/pixman/pixman-fast-path.h
@@ -309,7 +309,8 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
     }												\
 												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
+	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
     {												\
 	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
 					&width, &left_pad, &right_pad);				\
@@ -343,6 +344,28 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 			        right_pad, 0, 0, 0);						\
 	    }											\
 	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
+	{											\
+	    static src_type_t zero = 0;								\
+	    if (y < 0 || y >= src_image->bits.height)						\
+	    {											\
+		scanline_func (dst, &zero, left_pad + width + right_pad, 0, 0, 0);		\
+		continue;									\
+	    }											\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (dst, &zero, left_pad, 0, 0, 0);					\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst + left_pad, src, width, vx, unit_x, 0);			\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (dst + left_pad + width, &zero, right_pad, 0, 0, 0);		\
+	    }											\
+	}											\
 	else											\
 	{											\
 	    src = src_first_line + src_stride * y;						\
@@ -390,6 +413,17 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
     }
 
+#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
 #define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\
     {   PIXMAN_OP_ ## op,						\
 	PIXMAN_ ## s,							\
@@ -402,6 +436,7 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 /* Prefer the use of 'cover' variant, because it is faster */
 #define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
     SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),			\
     SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
     SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
 
commit 45833d5b198507e9e69b918459eaaf6088e5de00
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Thu Sep 16 17:10:40 2010 +0300

    PAD repeat support for fast scaling with nearest filter
    
    When processing pixels from the left and right padding, the same
    scanline function is used with 'unit_x' set to 0.
    
    Actually appears that gcc can handle this quite efficiently. When
    using 'restrict' keyword, it is able to optimize the whole operation
    performed on left or right padding pixels to a small unrolled loop
    (the code is reduced to a simple fill implementation):
    
        9b30:       89 08                   mov    %ecx,(%rax)
        9b32:       89 48 04                mov    %ecx,0x4(%rax)
        9b35:       48 83 c0 08             add    $0x8,%rax
        9b39:       49 39 c0                cmp    %rax,%r8
        9b3c:       75 f2                   jne    9b30
    
    Without 'restrict' keyword, there is one instruction more: reloading
    source pixel data from memory in the beginning of each iteration. That
    is slower, but also acceptable.

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index c060749..5b10d65 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1388,14 +1388,19 @@ fast_composite_src_memcpy (pixman_implementation_t *imp,
 }
 
 FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER);
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD);
 FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL);
 FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER);
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD);
 FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL);
 FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER);
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL);
 FAST_NEAREST (565_565_cover, 0565, 0565, uint16_t, uint16_t, SRC, COVER);
+FAST_NEAREST (565_565_pad, 0565, 0565, uint16_t, uint16_t, SRC, PAD);
 FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL);
 FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER);
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL);
 
 static force_inline uint32_t
diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h
index 287b753..7c14379 100644
--- a/pixman/pixman-fast-path.h
+++ b/pixman/pixman-fast-path.h
@@ -58,6 +58,63 @@ repeat (pixman_repeat_t repeat, int *c, int size)
     return TRUE;
 }
 
+/*
+ * For each scanline fetched from source image with PAD repeat:
+ * - calculate how many pixels need to be padded on the left side
+ * - calculate how many pixels need to be padded on the right side
+ * - update width to only count pixels which are fetched from the image
+ * All this information is returned via 'width', 'left_pad', 'right_pad'
+ * arguments. The code is assuming that 'unit_x' is positive.
+ *
+ * Note: 64-bit math is used in order to avoid potential overflows, which
+ *       is probably excessive in many cases. This particular function
+ *       may need its own correctness test and performance tuning.
+ */
+static force_inline void
+pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+				pixman_fixed_t  vx,
+				pixman_fixed_t  unit_x,
+				int32_t *       width,
+				int32_t *       left_pad,
+				int32_t *       right_pad)
+{
+    int64_t max_vx = (int64_t) source_image_width << 16;
+    int64_t tmp;
+    if (vx < 0)
+    {
+	tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
+	if (tmp > *width)
+	{
+	    *left_pad = *width;
+	    *width = 0;
+	}
+	else
+	{
+	    *left_pad = (int32_t) tmp;
+	    *width -= (int32_t) tmp;
+	}
+    }
+    else
+    {
+	*left_pad = 0;
+    }
+    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
+    if (tmp < 0)
+    {
+	*right_pad = *width;
+	*width = 0;
+    }
+    else if (tmp >= *width)
+    {
+	*right_pad = 0;
+    }
+    else
+    {
+	*right_pad = *width - (int32_t) tmp;
+	*width = (int32_t) tmp;
+    }
+}
+
 /* A macroified version of specialized nearest scalers for some
  * common 8888 and 565 formats. It supports SRC and OVER ops.
  *
@@ -213,6 +270,7 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
     pixman_vector_t v;										\
     pixman_fixed_t vx, vy;									\
     pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, right_pad;								\
 												\
     src_type_t *src;										\
     dst_type_t *dst;										\
@@ -251,6 +309,13 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
     }												\
 												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+    {												\
+	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
+					&width, &left_pad, &right_pad);				\
+	vx += left_pad * unit_x;								\
+    }												\
+												\
     while (--height >= 0)									\
     {												\
 	dst = dst_line;										\
@@ -260,10 +325,29 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 	vy += unit_y;										\
 	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
 	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
-												\
-	src = src_first_line + src_stride * y;							\
-												\
-	scanline_func (dst, src, width, vx, unit_x, max_vx);					\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (dst, src, left_pad, 0, 0, 0);					\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst + left_pad, src, width, vx, unit_x, 0);			\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (dst + left_pad + width, src + src_image->bits.width - 1,		\
+			        right_pad, 0, 0, 0);						\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    src = src_first_line + src_stride * y;						\
+	    scanline_func (dst, src, width, vx, unit_x, max_vx);				\
+	}											\
     }												\
 }
 
@@ -295,6 +379,17 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
     }
 
+#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
 #define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\
     {   PIXMAN_OP_ ## op,						\
 	PIXMAN_ ## s,							\
@@ -307,6 +402,7 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 /* Prefer the use of 'cover' variant, because it is faster */
 #define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
     SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
     SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
 
 #endif
commit 3db0cc5c75a4a764726059511fa6d67082fbeb64
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Fri Sep 17 16:22:25 2010 +0300

    Introduce a fake PIXMAN_REPEAT_COVER constant
    
    We need to implement a true PIXMAN_REPEAT_NONE support later (padding
    the source with zero pixels). So it's better not to use PIXMAN_REPEAT_NONE
    for handling FAST_PATH_SAMPLES_COVER_CLIP special case.

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 12036a9..c060749 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1387,15 +1387,15 @@ fast_composite_src_memcpy (pixman_implementation_t *imp,
     }
 }
 
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE);
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER);
 FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL);
-FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE);
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER);
 FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL);
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE);
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL);
-FAST_NEAREST (565_565_none, 0565, 0565, uint16_t, uint16_t, SRC, NONE);
+FAST_NEAREST (565_565_cover, 0565, 0565, uint16_t, uint16_t, SRC, COVER);
 FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL);
-FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE);
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER);
 FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL);
 
 static force_inline uint32_t
diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h
index 7babd66..287b753 100644
--- a/pixman/pixman-fast-path.h
+++ b/pixman/pixman-fast-path.h
@@ -28,6 +28,8 @@
 
 #include "pixman-private.h"
 
+#define PIXMAN_REPEAT_COVER -1
+
 static force_inline pixman_bool_t
 repeat (pixman_repeat_t repeat, int *c, int size)
 {
@@ -216,12 +218,6 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
     dst_type_t *dst;										\
     int       src_stride, dst_stride;								\
 												\
-    if (PIXMAN_REPEAT_ ## repeat_mode != PIXMAN_REPEAT_NORMAL		&&			\
-	PIXMAN_REPEAT_ ## repeat_mode != PIXMAN_REPEAT_NONE)					\
-    {												\
-	abort();										\
-    }												\
-												\
     PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1);	\
     /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
      * transformed from destination space to source space */					\
@@ -305,7 +301,7 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,
 	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
 	PIXMAN_null, 0,							\
 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
     }
 
 /* Prefer the use of 'cover' variant, because it is faster */
commit e9b0740af76853f58df72cd40cd7cb4e2ac7261b
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Thu Sep 16 13:02:18 2010 +0300

    Nearest scaling fast path macro split into two parts
    
    Scanline processing is now split into a separate function. This provides
    an easy way of overriding it with a platform specific implementation,
    which may use SIMD optimizations. Only basic C data types are used as
    the arguments for this function, so it may be implemented entirely in
    assembly or be generated by some JIT engine.
    
    Also as a result of this split, the complexity of code is reduced a
    bit and now it should be easier to introduce support for the currently
    missing NONE, PAD and REFLECT repeat types.

diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h
index f289288..7babd66 100644
--- a/pixman/pixman-fast-path.h
+++ b/pixman/pixman-fast-path.h
@@ -76,98 +76,24 @@ repeat (pixman_repeat_t repeat, int *c, int size)
     565 source, but it is needed to build. */
 #define GET_0565_ALPHA(s) 0xff
 
-#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,					\
-		     src_type_t, dst_type_t, OP, repeat_mode)					\
-static void											\
-fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementation_t *imp,	\
-							      pixman_op_t              op,      \
-							      pixman_image_t *         src_image, \
-							      pixman_image_t *         mask_image, \
-							      pixman_image_t *         dst_image, \
-							      int32_t                  src_x,   \
-							      int32_t                  src_y,   \
-							      int32_t                  mask_x,  \
-							      int32_t                  mask_y,  \
-							      int32_t                  dst_x,   \
-							      int32_t                  dst_y,   \
-							      int32_t                  width,   \
-							      int32_t                  height)  \
+#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
+			      src_type_t, dst_type_t, OP, repeat_mode)				\
+static force_inline void									\
+scanline_func_name (dst_type_t     *dst,							\
+		    src_type_t     *src,							\
+		    int32_t         w,								\
+		    pixman_fixed_t  vx,								\
+		    pixman_fixed_t  unit_x,							\
+		    pixman_fixed_t  max_vx)							\
 {												\
-    dst_type_t *dst_line;									\
-    src_type_t *src_first_line;									\
-    uint32_t   d;										\
-    src_type_t s1, s2;										\
-    uint8_t   a1, a2;										\
-    int       w;										\
-    int       x1, x2, y;									\
-    pixman_fixed_t orig_vx;									\
-    pixman_fixed_t max_vx, max_vy;								\
-    pixman_vector_t v;										\
-    pixman_fixed_t vx, vy;									\
-    pixman_fixed_t unit_x, unit_y;								\
-												\
-    src_type_t *src;										\
-    dst_type_t *dst;										\
-    int       src_stride, dst_stride;								\
-												\
-    if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
-	abort();										\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode != PIXMAN_REPEAT_NORMAL		&&			\
-	PIXMAN_REPEAT_ ## repeat_mode != PIXMAN_REPEAT_NONE)					\
-    {												\
-	abort();										\
-    }												\
-												\
-    PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1);	\
-    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
-     * transformed from destination space to source space */					\
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
-												\
-    /* reference point is the center of the pixel */						\
-    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
-    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
-    v.vector[2] = pixman_fixed_1;								\
-												\
-    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
-	return;											\
-												\
-    unit_x = src_image->common.transform->matrix[0][0];						\
-    unit_y = src_image->common.transform->matrix[1][1];						\
+	uint32_t   d;										\
+	src_type_t s1, s2;									\
+	uint8_t    a1, a2;									\
+	int        x1, x2;									\
 												\
-    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
-    v.vector[0] -= pixman_fixed_e;								\
-    v.vector[1] -= pixman_fixed_e;								\
-												\
-    vx = v.vector[0];										\
-    vy = v.vector[1];										\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
-    {												\
-	/* Clamp repeating positions inside the actual samples */				\
-	max_vx = src_image->bits.width << 16;							\
-	max_vy = src_image->bits.height << 16;							\
-												\
-	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
-	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
-    }												\
-												\
-    orig_vx = vx;										\
+	if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
+	    abort();										\
 												\
-    while (--height >= 0)									\
-    {												\
-	dst = dst_line;										\
-	dst_line += dst_stride;									\
-												\
-	y = vy >> 16;										\
-	vy += unit_y;										\
-	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
-												\
-	src = src_first_line + src_stride * y;							\
-												\
-	w = width;										\
-	vx = orig_vx;										\
 	while ((w -= 2) >= 0)									\
 	{											\
 	    x1 = vx >> 16;									\
@@ -258,9 +184,103 @@ fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementat
 		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
 	    }											\
 	}											\
+}
+
+#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,		\
+			      repeat_mode)							\
+static void											\
+fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp,		\
+						   pixman_op_t              op,			\
+						   pixman_image_t *         src_image,		\
+						   pixman_image_t *         mask_image,		\
+						   pixman_image_t *         dst_image,		\
+						   int32_t                  src_x,		\
+						   int32_t                  src_y,		\
+						   int32_t                  mask_x,		\
+						   int32_t                  mask_y,		\
+						   int32_t                  dst_x,		\
+						   int32_t                  dst_y,		\
+						   int32_t                  width,		\
+						   int32_t                  height)		\
+{												\
+    dst_type_t *dst_line;									\
+    src_type_t *src_first_line;									\
+    int       y;										\
+    pixman_fixed_t max_vx = max_vx; /* suppress uninitialized variable warning */		\
+    pixman_fixed_t max_vy;									\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+												\
+    src_type_t *src;										\
+    dst_type_t *dst;										\
+    int       src_stride, dst_stride;								\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode != PIXMAN_REPEAT_NORMAL		&&			\
+	PIXMAN_REPEAT_ ## repeat_mode != PIXMAN_REPEAT_NONE)					\
+    {												\
+	abort();										\
+    }												\
+												\
+    PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1);	\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
+    v.vector[0] -= pixman_fixed_e;								\
+    v.vector[1] -= pixman_fixed_e;								\
+												\
+    vx = v.vector[0];										\
+    vy = v.vector[1];										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
+    {												\
+	/* Clamp repeating positions inside the actual samples */				\
+	max_vx = src_image->bits.width << 16;							\
+	max_vy = src_image->bits.height << 16;							\
+												\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
+	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+    }												\
+												\
+    while (--height >= 0)									\
+    {												\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+												\
+	y = vy >> 16;										\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+												\
+	src = src_first_line + src_stride * y;							\
+												\
+	scanline_func (dst, src, width, vx, unit_x, max_vx);					\
     }												\
 }
 
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,				\
+		     src_type_t, dst_type_t, OP, repeat_mode)				\
+    FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
+			  SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,		\
+			  OP, repeat_mode)						\
+    FAST_NEAREST_MAINLOOP(scale_func_name##_##OP,					\
+			  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
+			  src_type_t, dst_type_t, repeat_mode)
+
+
 #define SCALED_NEAREST_FLAGS						\
     (FAST_PATH_SCALE_TRANSFORM	|					\
      FAST_PATH_NO_ALPHA_MAP	|					\
@@ -268,7 +288,7 @@ fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementat
      FAST_PATH_NO_ACCESSORS	|					\
      FAST_PATH_NO_WIDE_FORMAT)
 
-#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
+#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)			\
     {   PIXMAN_OP_ ## op,						\
 	PIXMAN_ ## s,							\
 	(SCALED_NEAREST_FLAGS		|				\
@@ -277,7 +297,9 @@ fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementat
 	PIXMAN_null, 0,							\
 	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
 	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
-    },									\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\
     {   PIXMAN_OP_ ## op,						\
 	PIXMAN_ ## s,							\
 	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
@@ -286,4 +308,9 @@ fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementat
 	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
     }
 
+/* Prefer the use of 'cover' variant, because it is faster */
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
+
 #endif
commit 066ce191a6d3bb970b5024c070193cac4c130418
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Thu Sep 16 12:31:27 2010 +0300

    Nearest scaling fast path macros moved to 'pixman-fast-path.h'
    
    These macros with some modifications can can be reused later by
    various platform specific implementations, introducing SIMD
    optimizations for nearest scaling fast paths.

diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index a9de19f..ca31301 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -21,6 +21,7 @@ libpixman_1_la_SOURCES =			\
 	pixman-general.c			\
 	pixman.c				\
 	pixman-fast-path.c			\
+	pixman-fast-path.h			\
 	pixman-solid-fill.c			\
 	pixman-conical-gradient.c		\
 	pixman-linear-gradient.c		\
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index f03752f..12036a9 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -30,6 +30,7 @@
 #include <stdlib.h>
 #include "pixman-private.h"
 #include "pixman-combine32.h"
+#include "pixman-fast-path.h"
 
 static force_inline uint32_t
 fetch_24 (uint8_t *a)
@@ -1386,239 +1387,6 @@ fast_composite_src_memcpy (pixman_implementation_t *imp,
     }
 }
 
-static force_inline pixman_bool_t
-repeat (pixman_repeat_t repeat, int *c, int size)
-{
-    if (repeat == PIXMAN_REPEAT_NONE)
-    {
-	if (*c < 0 || *c >= size)
-	    return FALSE;
-    }
-    else if (repeat == PIXMAN_REPEAT_NORMAL)
-    {
-	while (*c >= size)
-	    *c -= size;
-	while (*c < 0)
-	    *c += size;
-    }
-    else if (repeat == PIXMAN_REPEAT_PAD)
-    {
-	*c = CLIP (*c, 0, size - 1);
-    }
-    else /* REFLECT */
-    {
-	*c = MOD (*c, size * 2);
-	if (*c >= size)
-	    *c = size * 2 - *c - 1;
-    }
-    return TRUE;
-}
-
-/* A macroified version of specialized nearest scalers for some
- * common 8888 and 565 formats. It supports SRC and OVER ops.
- *
- * There are two repeat versions, one that handles repeat normal,
- * and one without repeat handling that only works if the src region
- * used is completely covered by the pre-repeated source samples.
- *
- * The loops are unrolled to process two pixels per iteration for better
- * performance on most CPU architectures (superscalar processors
- * can issue several operations simultaneously, other processors can hide
- * instructions latencies by pipelining operations). Unrolling more
- * does not make much sense because the compiler will start running out
- * of spare registers soon.
- */
-
-#define GET_8888_ALPHA(s) ((s) >> 24)
- /* This is not actually used since we don't have an OVER with
-    565 source, but it is needed to build. */
-#define GET_0565_ALPHA(s) 0xff
-
-#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,					\
-		     src_type_t, dst_type_t, OP, repeat_mode)					\
-static void											\
-fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementation_t *imp,	\
-							      pixman_op_t              op,      \
-							      pixman_image_t *         src_image, \
-							      pixman_image_t *         mask_image, \
-							      pixman_image_t *         dst_image, \
-							      int32_t                  src_x,   \
-							      int32_t                  src_y,   \
-							      int32_t                  mask_x,  \
-							      int32_t                  mask_y,  \
-							      int32_t                  dst_x,   \
-							      int32_t                  dst_y,   \
-							      int32_t                  width,   \
-							      int32_t                  height)  \
-{												\
-    dst_type_t *dst_line;									\
-    src_type_t *src_first_line;									\
-    uint32_t   d;										\
-    src_type_t s1, s2;										\
-    uint8_t   a1, a2;										\
-    int       w;										\
-    int       x1, x2, y;									\
-    pixman_fixed_t orig_vx;									\
-    pixman_fixed_t max_vx, max_vy;								\
-    pixman_vector_t v;										\
-    pixman_fixed_t vx, vy;									\
-    pixman_fixed_t unit_x, unit_y;								\
-												\
-    src_type_t *src;										\
-    dst_type_t *dst;										\
-    int       src_stride, dst_stride;								\
-												\
-    if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
-	abort();										\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode != PIXMAN_REPEAT_NORMAL		&&			\
-	PIXMAN_REPEAT_ ## repeat_mode != PIXMAN_REPEAT_NONE)					\
-    {												\
-	abort();										\
-    }												\
-												\
-    PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1);	\
-    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
-     * transformed from destination space to source space */					\
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
-												\
-    /* reference point is the center of the pixel */						\
-    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
-    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
-    v.vector[2] = pixman_fixed_1;								\
-												\
-    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
-	return;											\
-												\
-    unit_x = src_image->common.transform->matrix[0][0];						\
-    unit_y = src_image->common.transform->matrix[1][1];						\
-												\
-    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
-    v.vector[0] -= pixman_fixed_e;								\
-    v.vector[1] -= pixman_fixed_e;								\
-												\
-    vx = v.vector[0];										\
-    vy = v.vector[1];										\
-												\
-    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
-    {												\
-	/* Clamp repeating positions inside the actual samples */				\
-	max_vx = src_image->bits.width << 16;							\
-	max_vy = src_image->bits.height << 16;							\
-												\
-	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
-	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
-    }												\
-												\
-    orig_vx = vx;										\
-												\
-    while (--height >= 0)									\
-    {												\
-	dst = dst_line;										\
-	dst_line += dst_stride;									\
-												\
-	y = vy >> 16;										\
-	vy += unit_y;										\
-	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
-												\
-	src = src_first_line + src_stride * y;							\
-												\
-	w = width;										\
-	vx = orig_vx;										\
-	while ((w -= 2) >= 0)									\
-	{											\
-	    x1 = vx >> 16;									\
-	    vx += unit_x;									\
-	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	    {											\
-		/* This works because we know that unit_x is positive */			\
-		while (vx >= max_vx)								\
-		    vx -= max_vx;								\
-	    }											\
-	    s1 = src[x1];									\
-												\
-	    x2 = vx >> 16;									\
-	    vx += unit_x;									\
-	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
-	    {											\
-		/* This works because we know that unit_x is positive */			\
-		while (vx >= max_vx)								\
-		    vx -= max_vx;								\
-	    }											\
-	    s2 = src[x2];									\
-												\
-	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
-	    {											\
-		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
-		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
-												\
-		if (a1 == 0xff)									\
-		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-		}										\
-		else if (s1)									\
-		{										\
-		    d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
-		    a1 ^= 0xff;									\
-		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
-		}										\
-		dst++;										\
-												\
-		if (a2 == 0xff)									\
-		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
-		}										\
-		else if (s2)									\
-		{										\
-		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2);				\
-		    a2 ^= 0xff;									\
-		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
-		}										\
-		dst++;										\
-	    }											\
-	    else /* PIXMAN_OP_SRC */								\
-	    {											\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
-	    }											\
-	}											\
-												\
-	if (w & 1)										\
-	{											\
-	    x1 = vx >> 16;									\
-	    s1 = src[x1];									\
-												\
-	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
-	    {											\
-		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
-												\
-		if (a1 == 0xff)									\
-		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-		}										\
-		else if (s1)									\
-		{										\
-		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
-		    a1 ^= 0xff;									\
-		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
-		}										\
-		dst++;										\
-	    }											\
-	    else /* PIXMAN_OP_SRC */								\
-	    {											\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-	    }											\
-	}											\
-    }												\
-}
-
 FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE);
 FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL);
 FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE);
@@ -1859,30 +1627,6 @@ static const pixman_fast_path_t c_fast_paths[] =
     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
 
-#define SCALED_NEAREST_FLAGS						\
-    (FAST_PATH_SCALE_TRANSFORM	|					\
-     FAST_PATH_NO_ALPHA_MAP	|					\
-     FAST_PATH_NEAREST_FILTER	|					\
-     FAST_PATH_NO_ACCESSORS	|					\
-     FAST_PATH_NO_WIDE_FORMAT)
-
-#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	(SCALED_NEAREST_FLAGS		|				\
-	 FAST_PATH_NORMAL_REPEAT	|				\
-	 FAST_PATH_X_UNIT_POSITIVE),					\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
-    },									\
-    {   PIXMAN_OP_ ## op,						\
-	PIXMAN_ ## s,							\
-	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
-	PIXMAN_null, 0,							\
-	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
-	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
-    }
     SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
     SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
     SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h
new file mode 100644
index 0000000..f289288
--- /dev/null
+++ b/pixman/pixman-fast-path.h
@@ -0,0 +1,289 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright Â© 2000 SuSE, Inc.
+ * Copyright Â© 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifndef PIXMAN_FAST_PATH_H__
+#define PIXMAN_FAST_PATH_H__
+
+#include "pixman-private.h"
+
+static force_inline pixman_bool_t
+repeat (pixman_repeat_t repeat, int *c, int size)
+{
+    if (repeat == PIXMAN_REPEAT_NONE)
+    {
+	if (*c < 0 || *c >= size)
+	    return FALSE;
+    }
+    else if (repeat == PIXMAN_REPEAT_NORMAL)
+    {
+	while (*c >= size)
+	    *c -= size;
+	while (*c < 0)
+	    *c += size;
+    }
+    else if (repeat == PIXMAN_REPEAT_PAD)
+    {
+	*c = CLIP (*c, 0, size - 1);
+    }
+    else /* REFLECT */
+    {
+	*c = MOD (*c, size * 2);
+	if (*c >= size)
+	    *c = size * 2 - *c - 1;
+    }
+    return TRUE;
+}
+
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions, one that handles repeat normal,
+ * and one without repeat handling that only works if the src region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instructions latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
+
+#define GET_8888_ALPHA(s) ((s) >> 24)
+ /* This is not actually used since we don't have an OVER with
+    565 source, but it is needed to build. */
+#define GET_0565_ALPHA(s) 0xff
+
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,					\
+		     src_type_t, dst_type_t, OP, repeat_mode)					\
+static void											\
+fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementation_t *imp,	\
+							      pixman_op_t              op,      \
+							      pixman_image_t *         src_image, \
+							      pixman_image_t *         mask_image, \
+							      pixman_image_t *         dst_image, \
+							      int32_t                  src_x,   \
+							      int32_t                  src_y,   \
+							      int32_t                  mask_x,  \
+							      int32_t                  mask_y,  \
+							      int32_t                  dst_x,   \
+							      int32_t                  dst_y,   \
+							      int32_t                  width,   \
+							      int32_t                  height)  \
+{												\
+    dst_type_t *dst_line;									\
+    src_type_t *src_first_line;									\
+    uint32_t   d;										\
+    src_type_t s1, s2;										\
+    uint8_t   a1, a2;										\
+    int       w;										\
+    int       x1, x2, y;									\
+    pixman_fixed_t orig_vx;									\
+    pixman_fixed_t max_vx, max_vy;								\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+												\
+    src_type_t *src;										\
+    dst_type_t *dst;										\
+    int       src_stride, dst_stride;								\
+												\
+    if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
+	abort();										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode != PIXMAN_REPEAT_NORMAL		&&			\
+	PIXMAN_REPEAT_ ## repeat_mode != PIXMAN_REPEAT_NONE)					\
+    {												\
+	abort();										\
+    }												\
+												\
+    PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1);	\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
+    v.vector[0] -= pixman_fixed_e;								\
+    v.vector[1] -= pixman_fixed_e;								\
+												\
+    vx = v.vector[0];										\
+    vy = v.vector[1];										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
+    {												\
+	/* Clamp repeating positions inside the actual samples */				\
+	max_vx = src_image->bits.width << 16;							\
+	max_vy = src_image->bits.height << 16;							\
+												\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
+	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+    }												\
+												\
+    orig_vx = vx;										\
+												\
+    while (--height >= 0)									\
+    {												\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+												\
+	y = vy >> 16;										\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+												\
+	src = src_first_line + src_stride * y;							\
+												\
+	w = width;										\
+	vx = orig_vx;										\
+	while ((w -= 2) >= 0)									\
+	{											\
+	    x1 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s1 = src[x1];									\
+												\
+	    x2 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s2 = src[x2];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+												\
+		if (a2 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+		}										\
+		else if (s2)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2);				\
+		    a2 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+	    }											\
+	}											\
+												\
+	if (w & 1)										\
+	{											\
+	    x1 = vx >> 16;									\
+	    s1 = src[x1];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+	    }											\
+	}											\
+    }												\
+}
+
+#define SCALED_NEAREST_FLAGS						\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_NEAREST_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NO_WIDE_FORMAT)
+
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    },									\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
+#endif