pixman: Branch 'master' - 4 commits

Tue Dec 18 11:10:54 PST 2012

pixman/pixman-bits-image.c |    2 -
 pixman/pixman-combine32.h  |   47 +++++++++++++++++++++++++++++++++++++++
 pixman/pixman-fast-path.c  |   38 ++++++++++++++++----------------
 pixman/pixman-inlines.h    |   30 ++++++++++++-------------
 pixman/pixman-mmx.c        |   20 ++++++++--------
 pixman/pixman-private.h    |   53 ++++++++++++++++++++++++++++++++++-----------
 pixman/pixman-sse2.c       |    8 +++---
 pixman/pixman.c            |    2 -
 test/lowlevel-blt-bench.c  |    1 
 9 files changed, 139 insertions(+), 62 deletions(-)

New commits:
commit 2734071d7bee699401dc8c98d5c2ef0e2dbb0c91
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Thu Dec 6 17:13:16 2012 +0200

    ARM: make use of UQADD8 instruction even in generic C code paths
    
    ARMv6 has the UQADD8 instruction, which implements unsigned saturated
    addition for 8-bit values packed in 32-bit registers. It is very
    useful for the UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros
    (which would otherwise need a lot of arithmetic operations to simulate
    this operation). Since most of the major ARM Linux distros are built
    for ARMv7, we are much less dependent on runtime CPU detection and can
    get practical benefits from conditional compilation here for a lot of
    users.
    
    The results of cairo-perf-trace benchmark on ARM Cortex-A15 with pixman
    compiled by gcc 4.7.2 and PIXMAN_DISABLE set to "arm-simd arm-neon":
    
    Speedups
    ========
    image    firefox-talos-gfx  (29938.22 0.12%) ->  (27814.76 0.51%) : 1.08x speedup
    image    firefox-asteroids  (23241.11 0.07%) ->  (21795.19 0.07%) : 1.07x speedup
    image firefox-canvas-alpha (174519.85 0.08%) -> (164788.64 0.20%) : 1.06x speedup
    image              poppler   (9464.46 1.61%) ->   (8991.53 0.14%) : 1.05x speedup

diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
index 875dde3..cdd56a6 100644
--- a/pixman/pixman-combine32.h
+++ b/pixman/pixman-combine32.h
@@ -20,6 +20,47 @@
 #define BLUE_8(x) ((x) & MASK)
 
 /*
+ * ARMv6 has UQADD8 instruction, which implements unsigned saturated
+ * addition for 8-bit values packed in 32-bit registers. It is very useful
+ * for UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros (which would
+ * otherwise need a lot of arithmetic operations to simulate this operation).
+ * Since most of the major ARM linux distros are built for ARMv7, we are
+ * much less dependent on runtime CPU detection and can get practical
+ * benefits from conditional compilation here for a lot of users.
+ */
+
+#if defined(USE_GCC_INLINE_ASM) && defined(__arm__) && \
+    !defined(__aarch64__) && (!defined(__thumb__) || defined(__thumb2__))
+#if defined(__ARM_ARCH_6__)   || defined(__ARM_ARCH_6J__)  || \
+    defined(__ARM_ARCH_6K__)  || defined(__ARM_ARCH_6Z__)  || \
+    defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \
+    defined(__ARM_ARCH_6M__)  || defined(__ARM_ARCH_7__)   || \
+    defined(__ARM_ARCH_7A__)  || defined(__ARM_ARCH_7R__)  || \
+    defined(__ARM_ARCH_7M__)  || defined(__ARM_ARCH_7EM__)
+
+static force_inline uint32_t
+un8x4_add_un8x4 (uint32_t x, uint32_t y)
+{
+    uint32_t t;
+    asm ("uqadd8 %0, %1, %2" : "=r" (t) : "%r" (x), "r" (y));
+    return t;
+}
+
+#define UN8x4_ADD_UN8x4(x, y) \
+    ((x) = un8x4_add_un8x4 ((x), (y)))
+
+#define UN8_rb_ADD_UN8_rb(x, y, t) \
+    ((t) = un8x4_add_un8x4 ((x), (y)), (x) = (t))
+
+#define ADD_UN8(x, y, t) \
+    ((t) = (x), un8x4_add_un8x4 ((t), (y)))
+
+#endif
+#endif
+
+/*****************************************************************************/
+
+/*
  * Helper macros.
  */
 
@@ -29,9 +70,11 @@
 #define DIV_UN8(a, b)							\
     (((uint16_t) (a) * MASK + ((b) / 2)) / (b))
 
+#ifndef ADD_UN8
 #define ADD_UN8(x, y, t)				     \
     ((t) = (x) + (y),					     \
      (uint32_t) (uint8_t) ((t) | (0 - ((t) >> G_SHIFT))))
+#endif
 
 #define DIV_ONE_UN8(x)							\
     (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT)
@@ -56,6 +99,7 @@
 /*
  * x_rb = min (x_rb + y_rb, 255)
  */
+#ifndef UN8_rb_ADD_UN8_rb
 #define UN8_rb_ADD_UN8_rb(x, y, t)					\
     do									\
     {									\
@@ -63,6 +107,7 @@
 	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);		\
 	x = (t & RB_MASK);						\
     } while (0)
+#endif
 
 /*
  * x_rb = (x_rb * a_rb) / 255
@@ -208,6 +253,7 @@
 /*
   x_c = min(x_c + y_c, 255)
 */
+#ifndef UN8x4_ADD_UN8x4
 #define UN8x4_ADD_UN8x4(x, y)						\
     do									\
     {									\
@@ -223,3 +269,4 @@
 									\
 	x = r1__ | (r2__ << G_SHIFT);					\
     } while (0)
+#endif
commit f9a41703b2d46c988b9e4e378d27396f718006ae
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Mon Dec 3 03:01:21 2012 +0200

    Faster conversion from a8r8g8b8 to r5g6b5 in C code
    
    This change reduces 3 shifts, 3 ANDs and 2 ORs (total 8 arithmetic
    operations) to 3 shifts, 2 ANDs and 2 ORs (total 7 arithmetic
    operations).
    
    We get garbage in the high 16 bits of the result, which might need
    to be cleared when casting to uint16_t (this would bring us back to
    a total of 8 arithmetic operations). However, if the result of the
    a8r8g8b8->r5g6b5 conversion is immediately stored to memory, no
    extra instructions for clearing these garbage bits are needed.
    
    This allows the a8r8g8b8->r5g6b5 conversion code to be compiled
    into 4 instructions for ARM instead of 5 (assuming a good optimizing
    compiler). As an additional bonus, the resulting code has no
    pipeline stalls on ARM11.
    
    The change in benchmark results for 'lowlevel-blt-bench src_8888_0565'
    with PIXMAN_DISABLE="arm-simd arm-neon mips-dspr2 mmx sse2" and pixman
    compiled by gcc-4.7.2:
    
        MIPS 74K        480MHz  :  40.44 MPix/s ->  40.13 MPix/s
        ARM11           700MHz  :  50.28 MPix/s ->  62.85 MPix/s
        ARM Cortex-A8  1000MHz  : 124.38 MPix/s -> 141.85 MPix/s
        ARM Cortex-A15 1700MHz  : 281.07 MPix/s -> 303.29 MPix/s
        Intel Core i7  2800MHz  : 515.92 MPix/s -> 531.16 MPix/s
    
    The same trick was used in xomap (X server for Nokia N800/N810):
        http://repository.maemo.org/pool/diablo/free/x/xorg-server/
        xorg-server_1.3.99.0~git20070321-0osso20083801.tar.gz

diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index ce71bbd..ea447aa 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -887,9 +887,13 @@ pixman_list_move_to_front (pixman_list_t *list, pixman_link_t *link)
 static force_inline uint16_t
 convert_8888_to_0565 (uint32_t s)
 {
-    return ((((s) >> 3) & 0x001f) |
-            (((s) >> 5) & 0x07e0) |
-            (((s) >> 8) & 0xf800));
+    /* The following code can be compiled into just 4 instructions on ARM */
+    uint32_t a, b;
+    a = (s >> 3) & 0x1F001F;
+    b = s & 0xFC00;
+    a |= a >> 5;
+    a |= b >> 5;
+    return (uint16_t)a;
 }
 
 static force_inline uint32_t
commit 3922e90c400fca3ac43dc77b8dd0c0591e7e4fbc
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Mon Dec 3 02:50:20 2012 +0200

    Change CONVERT_XXXX_TO_YYYY macros into inline functions
    
    It is easier and safer to modify their code if the calculations
    need temporary variables, and such temporary variables will be
    needed soon.

diff --git a/pixman/pixman-bits-image.c b/pixman/pixman-bits-image.c
index 86d80c3..75a39a1 100644
--- a/pixman/pixman-bits-image.c
+++ b/pixman/pixman-bits-image.c
@@ -1114,7 +1114,7 @@ convert_a8 (const uint8_t *row, int x)
 static force_inline uint32_t
 convert_r5g6b5 (const uint8_t *row, int x)
 {
-    return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
+    return convert_0565_to_0888 (*((uint16_t *)row + x));
 }
 
 #define MAKE_SEPARABLE_CONVOLUTION_FETCHER(name, format, repeat_mode)  \
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 3429758..420a3d8 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -507,15 +507,15 @@ fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
 		else
 		{
 		    d = *dst;
-		    d = over (src, CONVERT_0565_TO_0888 (d));
+		    d = over (src, convert_0565_to_0888 (d));
 		}
-		*dst = CONVERT_8888_TO_0565 (d);
+		*dst = convert_8888_to_0565 (d);
 	    }
 	    else if (m)
 	    {
 		d = *dst;
-		d = over (in (src, m), CONVERT_0565_TO_0888 (d));
-		*dst = CONVERT_8888_TO_0565 (d);
+		d = over (in (src, m), convert_0565_to_0888 (d));
+		*dst = convert_8888_to_0565 (d);
 	    }
 	    dst++;
 	}
@@ -541,7 +541,7 @@ fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
     if (src == 0)
 	return;
 
-    src16 = CONVERT_8888_TO_0565 (src);
+    src16 = convert_8888_to_0565 (src);
 
     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
@@ -566,14 +566,14 @@ fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 		else
 		{
 		    d = *dst;
-		    d = over (src, CONVERT_0565_TO_0888 (d));
-		    *dst = CONVERT_8888_TO_0565 (d);
+		    d = over (src, convert_0565_to_0888 (d));
+		    *dst = convert_8888_to_0565 (d);
 		}
 	    }
 	    else if (ma)
 	    {
 		d = *dst;
-		d = CONVERT_0565_TO_0888 (d);
+		d = convert_0565_to_0888 (d);
 
 		s = src;
 
@@ -582,7 +582,7 @@ fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
 		ma = ~ma;
 		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
 
-		*dst = CONVERT_8888_TO_0565 (d);
+		*dst = convert_8888_to_0565 (d);
 	    }
 	    dst++;
 	}
@@ -729,9 +729,9 @@ fast_composite_over_8888_0565 (pixman_implementation_t *imp,
 		else
 		{
 		    d = *dst;
-		    d = over (s, CONVERT_0565_TO_0888 (d));
+		    d = over (s, convert_0565_to_0888 (d));
 		}
-		*dst = CONVERT_8888_TO_0565 (d);
+		*dst = convert_8888_to_0565 (d);
 	    }
 	    dst++;
 	}
@@ -762,7 +762,7 @@ fast_composite_src_x888_0565 (pixman_implementation_t *imp,
 	while (w--)
 	{
 	    s = *src++;
-	    *dst = CONVERT_8888_TO_0565 (s);
+	    *dst = convert_8888_to_0565 (s);
 	    dst++;
 	}
     }
@@ -838,13 +838,13 @@ fast_composite_add_0565_0565 (pixman_implementation_t *imp,
 	    if (s)
 	    {
 		d = *dst;
-		s = CONVERT_0565_TO_8888 (s);
+		s = convert_0565_to_8888 (s);
 		if (d)
 		{
-		    d = CONVERT_0565_TO_8888 (d);
+		    d = convert_0565_to_8888 (d);
 		    UN8x4_ADD_UN8x4 (s, d);
 		}
-		*dst = CONVERT_8888_TO_0565 (s);
+		*dst = convert_8888_to_0565 (s);
 	    }
 	    dst++;
 	}
@@ -1094,7 +1094,7 @@ fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
 
     if (srca == 0xff)
     {
-	src565 = CONVERT_8888_TO_0565 (src);
+	src565 = convert_8888_to_0565 (src);
 	while (height--)
 	{
 	    dst = dst_line;
@@ -1142,8 +1142,8 @@ fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
 		}
 		if (bitcache & bitmask)
 		{
-		    d = over (src, CONVERT_0565_TO_0888 (*dst));
-		    *dst = CONVERT_8888_TO_0565 (d);
+		    d = over (src, convert_0565_to_0888 (*dst));
+		    *dst = convert_8888_to_0565 (d);
 		}
 		bitmask = UPDATE_BITMASK (bitmask);
 		dst++;
@@ -1176,7 +1176,7 @@ fast_composite_solid_fill (pixman_implementation_t *imp,
     else if (dest_image->bits.format == PIXMAN_r5g6b5 ||
              dest_image->bits.format == PIXMAN_b5g6r5)
     {
-	src = CONVERT_8888_TO_0565 (src);
+	src = convert_8888_to_0565 (src);
     }
 
     pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
diff --git a/pixman/pixman-inlines.h b/pixman/pixman-inlines.h
index 7f2e404..ab4def0 100644
--- a/pixman/pixman-inlines.h
+++ b/pixman/pixman-inlines.h
@@ -314,36 +314,36 @@ scanline_func_name (dst_type_t       *dst,							\
 												\
 		if (a1 == 0xff)									\
 		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
 		}										\
 		else if (s1)									\
 		{										\
-		    d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
+		    d = convert_ ## DST_FORMAT ## _to_8888 (*dst);				\
+		    s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1);				\
 		    a1 ^= 0xff;									\
 		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\
 		}										\
 		dst++;										\
 												\
 		if (a2 == 0xff)									\
 		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2);			\
 		}										\
 		else if (s2)									\
 		{										\
-		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2);				\
+		    d = convert_## DST_FORMAT ## _to_8888 (*dst);				\
+		    s2 = convert_## SRC_FORMAT ## _to_8888 (s2);				\
 		    a2 ^= 0xff;									\
 		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\
 		}										\
 		dst++;										\
 	    }											\
 	    else /* PIXMAN_OP_SRC */								\
 	    {											\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
+		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2);			\
 	    }											\
 	}											\
 												\
@@ -358,21 +358,21 @@ scanline_func_name (dst_type_t       *dst,							\
 												\
 		if (a1 == 0xff)									\
 		{										\
-		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
 		}										\
 		else if (s1)									\
 		{										\
-		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
-		    s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);				\
+		    d = convert_## DST_FORMAT ## _to_8888 (*dst);				\
+		    s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1);				\
 		    a1 ^= 0xff;									\
 		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
-		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\
 		}										\
 		dst++;										\
 	    }											\
 	    else /* PIXMAN_OP_SRC */								\
 	    {											\
-		*dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
 	    }											\
 	}											\
 }
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index aef468a..5a7ea44 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -2230,7 +2230,7 @@ mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
 	while (w && (uintptr_t)dst & 7)
 	{
 	    s = *src++;
-	    *dst = CONVERT_8888_TO_0565 (s);
+	    *dst = convert_8888_to_0565 (s);
 	    dst++;
 	    w--;
 	}
@@ -2253,7 +2253,7 @@ mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
 	while (w)
 	{
 	    s = *src++;
-	    *dst = CONVERT_8888_TO_0565 (s);
+	    *dst = convert_8888_to_0565 (s);
 	    dst++;
 	    w--;
 	}
@@ -3136,13 +3136,13 @@ mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
 	    if (s)
 	    {
 		d = *dst;
-		s = CONVERT_0565_TO_8888 (s);
+		s = convert_0565_to_8888 (s);
 		if (d)
 		{
-		    d = CONVERT_0565_TO_8888 (d);
+		    d = convert_0565_to_8888 (d);
 		    UN8x4_ADD_UN8x4 (s, d);
 		}
-		*dst = CONVERT_8888_TO_0565 (s);
+		*dst = convert_8888_to_0565 (s);
 	    }
 	    dst++;
 	    w--;
@@ -3174,13 +3174,13 @@ mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
 	    if (s)
 	    {
 		d = *dst;
-		s = CONVERT_0565_TO_8888 (s);
+		s = convert_0565_to_8888 (s);
 		if (d)
 		{
-		    d = CONVERT_0565_TO_8888 (d);
+		    d = convert_0565_to_8888 (d);
 		    UN8x4_ADD_UN8x4 (s, d);
 		}
-		*dst = CONVERT_8888_TO_0565 (s);
+		*dst = convert_8888_to_0565 (s);
 	    }
 	    dst++;
 	}
@@ -3824,7 +3824,7 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
     {
 	uint16_t s = *src++;
 
-	*dst++ = CONVERT_0565_TO_8888 (s);
+	*dst++ = convert_0565_to_8888 (s);
 	w--;
     }
 
@@ -3847,7 +3847,7 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
     {
 	uint16_t s = *src++;
 
-	*dst++ = CONVERT_0565_TO_8888 (s);
+	*dst++ = convert_0565_to_8888 (s);
 	w--;
     }
 
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 99125a1..ce71bbd 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -884,22 +884,47 @@ pixman_list_move_to_front (pixman_list_t *list, pixman_link_t *link)
 
 /* Conversion between 8888 and 0565 */
 
-#define CONVERT_8888_TO_0565(s)						\
-    ((((s) >> 3) & 0x001f) |						\
-     (((s) >> 5) & 0x07e0) |						\
-     (((s) >> 8) & 0xf800))
+static force_inline uint16_t
+convert_8888_to_0565 (uint32_t s)
+{
+    return ((((s) >> 3) & 0x001f) |
+            (((s) >> 5) & 0x07e0) |
+            (((s) >> 8) & 0xf800));
+}
 
-#define CONVERT_0565_TO_0888(s)						\
-    (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) |			\
-     ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |			\
-     ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)))
+static force_inline uint32_t
+convert_0565_to_0888 (uint16_t s)
+{
+    return (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) |
+            ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |
+            ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)));
+}
 
-#define CONVERT_0565_TO_8888(s) (CONVERT_0565_TO_0888(s) | 0xff000000)
+static force_inline uint32_t
+convert_0565_to_8888 (uint16_t s)
+{
+    return convert_0565_to_0888 (s) | 0xff000000;
+}
 
 /* Trivial versions that are useful in macros */
-#define CONVERT_8888_TO_8888(s) (s)
-#define CONVERT_x888_TO_8888(s) ((s) | 0xff000000)
-#define CONVERT_0565_TO_0565(s) (s)
+
+static force_inline uint32_t
+convert_8888_to_8888 (uint32_t s)
+{
+    return s;
+}
+
+static force_inline uint32_t
+convert_x888_to_8888 (uint32_t s)
+{
+    return s | 0xff000000;
+}
+
+static force_inline uint16_t
+convert_0565_to_0565 (uint16_t s)
+{
+    return s;
+}
 
 #define PIXMAN_FORMAT_IS_WIDE(f)					\
     (PIXMAN_FORMAT_A (f) > 8 ||						\
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 7e980c9..b1cb73e 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -2881,7 +2881,7 @@ sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
 	while (w && (uintptr_t)dst & 15)
 	{
 	    s = *src++;
-	    *dst = CONVERT_8888_TO_0565 (s);
+	    *dst = convert_8888_to_0565 (s);
 	    dst++;
 	    w--;
 	}
@@ -2901,7 +2901,7 @@ sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
 	while (w)
 	{
 	    s = *src++;
-	    *dst = CONVERT_8888_TO_0565 (s);
+	    *dst = convert_8888_to_0565 (s);
 	    dst++;
 	    w--;
 	}
@@ -5970,7 +5970,7 @@ sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
     {
 	uint16_t s = *src++;
 
-	*dst++ = CONVERT_0565_TO_8888 (s);
+	*dst++ = convert_0565_to_8888 (s);
 	w--;
     }
 
@@ -5995,7 +5995,7 @@ sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
     {
 	uint16_t s = *src++;
 
-	*dst++ = CONVERT_0565_TO_8888 (s);
+	*dst++ = convert_0565_to_8888 (s);
 	w--;
     }
 
diff --git a/pixman/pixman.c b/pixman/pixman.c
index 0661f41..d3ca3d8 100644
--- a/pixman/pixman.c
+++ b/pixman/pixman.c
@@ -828,7 +828,7 @@ color_to_pixel (const pixman_color_t *color,
 	c = c >> 24;
     else if (format == PIXMAN_r5g6b5 ||
              format == PIXMAN_b5g6r5)
-	c = CONVERT_8888_TO_0565 (c);
+	c = convert_8888_to_0565 (c);
 
 #if 0
     printf ("color: %x %x %x %x\n", color->alpha, color->red, color->green, color->blue);
commit e4519360c15772ac51038b9f86e3f730f06cfb65
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Mon Dec 3 05:44:36 2012 +0200

    test: add "src_0565_8888" to lowlevel-blt-bench

diff --git a/test/lowlevel-blt-bench.c b/test/lowlevel-blt-bench.c
index 3afa926..2f97b7b 100644
--- a/test/lowlevel-blt-bench.c
+++ b/test/lowlevel-blt-bench.c
@@ -616,6 +616,7 @@ tests_tbl[] =
     { "src_n_2x10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
     { "src_n_2a10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
     { "src_8888_0565",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_0565_8888",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "src_8888_4444",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
     { "src_8888_2222",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
     { "src_8888_2x10",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },