pixman: Branch 'master' - 5 commits

Thu Feb 3 00:24:12 PST 2011

pixman/pixman-private.h   |   18 +-
 pixman/pixman-sse2.c      |  352 +++++++++++++++++++++++++++++++++++++++++-----
 test/lowlevel-blt-bench.c |    1 
 3 files changed, 327 insertions(+), 44 deletions(-)

New commits:
commit 7fd4897730412977f730b850e6e697156fb3734b
Author: SÃ¸ren Sandmann Pedersen <ssp at redhat.com>
Date:   Sun Jan 23 16:53:26 2011 -0500

    Add SSE2 fetcher for 0565
    
    Before:
    
    add_0565_0565 = L1:  61.08  L2:  61.03  M: 60.57 ( 10.95%)  HT: 46.85  VT: 45.25  R: 39.99  RT: 20.41 ( 233Kops/s)
    
    After:
    
    add_0565_0565 = L1:  77.84  L2:  76.25  M: 75.38 ( 13.71%)  HT: 55.99  VT: 54.56  R: 45.41  RT: 21.95 ( 255Kops/s)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index e28b986..91adc05 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -6082,6 +6082,52 @@ sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
 }
 
 static uint32_t *
+sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint16_t *src = (uint16_t *)iter->bits;
+    __m128i ff000000 = mask_ff000000;
+
+    iter->bits += iter->stride;
+
+    while (w && ((unsigned long)dst) & 0x0f)
+    {
+	uint16_t s = *src++;
+
+	*dst++ = CONVERT_0565_TO_8888 (s);
+	w--;
+    }
+
+    while (w >= 8)
+    {
+	__m128i lo, hi, s;
+
+	s = _mm_loadu_si128 ((__m128i *)src);
+
+	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
+	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
+
+	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
+	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
+
+	dst += 8;
+	src += 8;
+	w -= 8;
+    }
+
+    while (w)
+    {
+	uint16_t s = *src++;
+
+	*dst++ = CONVERT_0565_TO_8888 (s);
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
 {
     int w = iter->width;
@@ -6136,6 +6182,7 @@ typedef struct
 static const fetcher_info_t fetchers[] =
 {
     { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
+    { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
     { PIXMAN_a8,		sse2_fetch_a8 },
     { PIXMAN_null }
 };
commit 8414aa76c20732a6ed29a2d80175936570c5e592
Author: SÃ¸ren Sandmann Pedersen <ssp at redhat.com>
Date:   Fri Dec 31 00:57:46 2010 -0500

    Improve performance of sse2_combine_over_u()
    
    Split this function into two, one that has a mask, and one that
    doesn't. This is a fairly substantial speed-up in many cases.
    
    New output of lowlevel-blt-bench over_x888_8_0565:
    
    over_x888_8_0565 =  L1:  63.76  L2:  62.75  M: 59.37 ( 21.55%)  HT: 45.89  VT: 43.55  R: 34.51  RT: 16.80 ( 201Kops/s)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index d5ea14b..e28b986 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -610,82 +610,174 @@ combine4 (const __m128i *ps, const __m128i *pm)
 }
 
 static force_inline void
-core_combine_over_u_sse2 (uint32_t*       pd,
-                          const uint32_t* ps,
-                          const uint32_t* pm,
-                          int             w)
+core_combine_over_u_sse2_mask (uint32_t *	  pd,
+			       const uint32_t*    ps,
+			       const uint32_t*    pm,
+			       int                w)
 {
     uint32_t s, d;
 
-    __m128i xmm_dst_lo, xmm_dst_hi;
-    __m128i xmm_src_lo, xmm_src_hi;
-    __m128i xmm_alpha_lo, xmm_alpha_hi;
-
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
     {
 	d = *pd;
 	s = combine1 (ps, pm);
 
-	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
 	ps++;
-	if (pm)
-	    pm++;
+	pm++;
 	w--;
     }
 
     while (w >= 4)
     {
-	/* I'm loading unaligned because I'm not sure about
-	 * the address alignment.
-	 */
-	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	__m128i mask = load_128_unaligned ((__m128i *)pm);
 
-	if (is_opaque (xmm_src_hi))
+	if (!is_zero (mask))
 	{
-	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
-	}
-	else if (!is_zero (xmm_src_hi))
-	{
-	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+	    __m128i src;
+	    __m128i src_hi, src_lo;
+	    __m128i mask_hi, mask_lo;
+	    __m128i alpha_hi, alpha_lo;
 
-	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
-	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	    src = load_128_unaligned ((__m128i *)ps);
 
-	    expand_alpha_2x128 (
-		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+	    if (is_opaque (_mm_and_si128 (src, mask)))
+	    {
+		save_128_aligned ((__m128i *)pd, src);
+	    }
+	    else
+	    {
+		__m128i dst = load_128_aligned ((__m128i *)pd);
+		__m128i dst_hi, dst_lo;
 
-	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
-			&xmm_alpha_lo, &xmm_alpha_hi,
-			&xmm_dst_lo, &xmm_dst_hi);
+		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
+		unpack_128_2x128 (src, &src_lo, &src_hi);
 
-	    /* rebuid the 4 pixel data and save*/
-	    save_128_aligned ((__m128i*)pd,
-			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+		pix_multiply_2x128 (&src_lo, &src_hi,
+				    &mask_lo, &mask_hi,
+				    &src_lo, &src_hi);
+
+		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+		expand_alpha_2x128 (src_lo, src_hi,
+				    &alpha_lo, &alpha_hi);
+
+		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+		save_128_aligned (
+		    (__m128i *)pd,
+		    pack_2x128_128 (dst_lo, dst_hi));
+	    }
 	}
 
-	w -= 4;
+	pm += 4;
 	ps += 4;
 	pd += 4;
-	if (pm)
-	    pm += 4;
+	w -= 4;
     }
-
     while (w)
     {
 	d = *pd;
 	s = combine1 (ps, pm);
 
-	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
 	ps++;
-	if (pm)
-	    pm++;
+	pm++;
 
 	w--;
     }
 }
 
 static force_inline void
+core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
+				  const uint32_t*    ps,
+				  int                w)
+{
+    uint32_t s, d;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+	d = *pd;
+	s = *ps;
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i src;
+	__m128i src_hi, src_lo, dst_hi, dst_lo;
+	__m128i alpha_hi, alpha_lo;
+
+	src = load_128_unaligned ((__m128i *)ps);
+
+	if (!is_zero (src))
+	{
+	    if (is_opaque (src))
+	    {
+		save_128_aligned ((__m128i *)pd, src);
+	    }
+	    else
+	    {
+		__m128i dst = load_128_aligned ((__m128i *)pd);
+
+		unpack_128_2x128 (src, &src_lo, &src_hi);
+		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+		expand_alpha_2x128 (src_lo, src_hi,
+				    &alpha_lo, &alpha_hi);
+		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+		save_128_aligned (
+		    (__m128i *)pd,
+		    pack_2x128_128 (dst_lo, dst_hi));
+	    }
+	}
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+    }
+    while (w)
+    {
+	d = *pd;
+	s = *ps;
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+
+	w--;
+    }
+}
+
+static force_inline void
+core_combine_over_u_sse2 (uint32_t*       pd,
+                          const uint32_t* ps,
+                          const uint32_t* pm,
+                          int             w)
+{
+    if (pm)
+	core_combine_over_u_sse2_mask (pd, ps, pm, w);
+    else
+	core_combine_over_u_sse2_no_mask (pd, ps, w);
+}
+
+static force_inline void
 core_combine_over_reverse_u_sse2 (uint32_t*       pd,
                                   const uint32_t* ps,
                                   const uint32_t* pm,
commit 08e855f15cba24aac83145b994069d0bb50be5a1
Author: SÃ¸ren Sandmann Pedersen <ssp at redhat.com>
Date:   Sun Jan 23 16:17:17 2011 -0500

    Add SSE2 fetcher for a8
    
    New output of lowlevel-blt-bench over_x888_8_0565:
    
    over_x888_8_0565 =  L1:  57.85  L2:  56.80  M: 54.14 ( 19.50%)  HT: 42.64  VT: 40.56  R: 32.67  RT: 16.22 ( 195Kops/s)
    
    Based in part on code by Steve Snyder from
    
        https://bugs.freedesktop.org/show_bug.cgi?id=21173

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 10a3dd0..d5ea14b 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5989,6 +5989,52 @@ sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
+static uint32_t *
+sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+
+    iter->bits += iter->stride;
+
+    while (w && (((unsigned long)dst) & 15))
+    {
+        *dst++ = *(src++) << 24;
+        w--;
+    }
+
+    while (w >= 16)
+    {
+	xmm0 = _mm_loadu_si128((__m128i *)src);
+
+	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
+	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
+	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
+	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
+	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
+	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
+
+	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
+	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
+	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
+	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
+
+	dst += 16;
+	src += 16;
+	w -= 16;
+    }
+
+    while (w)
+    {
+	*dst++ = *(src++) << 24;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
 typedef struct
 {
     pixman_format_code_t	format;
@@ -5997,7 +6043,8 @@ typedef struct
 
 static const fetcher_info_t fetchers[] =
 {
-    { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
+    { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
+    { PIXMAN_a8,		sse2_fetch_a8 },
     { PIXMAN_null }
 };
 
commit 2b6b0cf3591ce4438f7e0571c7a762972a999cd8
Author: SÃ¸ren Sandmann Pedersen <ssp at redhat.com>
Date:   Wed Jan 12 06:38:54 2011 -0500

    Add SSE2 fetcher for x8r8g8b8
    
    New output of lowlevel-blt-bench over_x888_8_0565:
    
    over_x888_8_0565 =  L1:  55.68  L2:  55.11  M: 52.83 ( 19.04%)  HT: 39.62  VT: 37.70  R: 30.88  RT: 14.62 ( 174Kops/s)
    
    The fetcher is looked up in a table, so that other fetchers can easily
    be added.
    
    See also https://bugs.freedesktop.org/show_bug.cgi?id=20709

diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 664260b..f5d0ba1 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -183,6 +183,9 @@ union pixman_image
 };
 
 typedef struct pixman_iter_t pixman_iter_t;
+typedef uint32_t *(* pixman_iter_get_scanline_t) (pixman_iter_t *iter, const uint32_t *mask);
+typedef void      (* pixman_iter_write_back_t)   (pixman_iter_t *iter);
+
 typedef enum
 {
     ITER_NARROW =		(1 << 0),
@@ -209,13 +212,16 @@ typedef enum
 
 struct pixman_iter_t
 {
-    uint32_t *(* get_scanline) (pixman_iter_t *iter, const uint32_t *mask);
-    void      (* write_back)   (pixman_iter_t *iter);
+    pixman_iter_get_scanline_t	get_scanline;
+    pixman_iter_write_back_t	write_back;
+
+    pixman_image_t *		image;
+    uint32_t *			buffer;
+    int				x, y;
+    int				width;
 
-    pixman_image_t *    image;
-    uint32_t *          buffer;
-    int                 x, y;
-    int                 width;
+    uint8_t *			bits;
+    int				stride;
 };
 
 void
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index ae55456..10a3dd0 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5953,6 +5953,94 @@ sse2_fill (pixman_implementation_t *imp,
     return TRUE;
 }
 
+static uint32_t *
+sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    __m128i ff000000 = mask_ff000000;
+    uint32_t *dst = iter->buffer;
+    uint32_t *src = (uint32_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && ((unsigned long)dst) & 0x0f)
+    {
+	*dst++ = (*src++) | 0xff000000;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	save_128_aligned (
+	    (__m128i *)dst, _mm_or_si128 (
+		load_128_unaligned ((__m128i *)src), ff000000));
+
+	dst += 4;
+	src += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	*dst++ = (*src++) | 0xff000000;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+typedef struct
+{
+    pixman_format_code_t	format;
+    pixman_iter_get_scanline_t	get_scanline;
+} fetcher_info_t;
+
+static const fetcher_info_t fetchers[] =
+{
+    { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
+    { PIXMAN_null }
+};
+
+static void
+sse2_src_iter_init (pixman_implementation_t *imp,
+		    pixman_iter_t *iter,
+		    pixman_image_t *image,
+		    int x, int y, int width, int height,
+		    uint8_t *buffer, iter_flags_t flags)
+{
+#define FLAGS								\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+
+    if ((flags & ITER_NARROW)				&&
+	(image->common.flags & FLAGS) == FLAGS		&&
+	x >= 0 && y >= 0				&&
+	x + width <= image->bits.width			&&
+	y + height <= image->bits.height)
+    {
+	const fetcher_info_t *f;
+
+	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+	{
+	    if (image->common.extended_format_code == f->format)
+	    {
+		uint8_t *b = (uint8_t *)image->bits.bits;
+		int s = image->bits.rowstride * 4;
+
+		iter->bits = b + s * y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
+		iter->stride = s;
+		iter->width = width;
+		iter->buffer = (uint32_t *)buffer;
+
+		iter->get_scanline = f->get_scanline;
+		return;
+	    }
+	}
+    }
+
+    _pixman_implementation_src_iter_init (
+	imp->delegate, iter, image, x, y, width, height, buffer, flags);
+}
+
 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
 __attribute__((__force_align_arg_pointer__))
 #endif
@@ -6020,6 +6108,8 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
     imp->blt = sse2_blt;
     imp->fill = sse2_fill;
 
+    imp->src_iter_init = sse2_src_iter_init;
+
     return imp;
 }
 
commit 13aed37758d1af5b5bc2a80d886b764d4c45827e
Author: SÃ¸ren Sandmann Pedersen <ssp at redhat.com>
Date:   Sat Jan 22 17:13:19 2011 -0500

    Add a test for over_x888_8_0565 in lowlevel_blt_bench().
    
    The next few commits will speed this up quite a bit.
    
    Current output:
    
    ---
    reference memcpy speed = 2217.5MB/s (554.4MP/s for 32bpp fills)
    ---
    over_x888_8_0565 =  L1:  54.67  L2:  54.01  M: 52.33 ( 18.88%)  HT: 37.19  VT: 35.54  R: 29.40  RT: 13.63 ( 162Kops/s)

diff --git a/test/lowlevel-blt-bench.c b/test/lowlevel-blt-bench.c
index f7ebb1f..67c845f 100644
--- a/test/lowlevel-blt-bench.c
+++ b/test/lowlevel-blt-bench.c
@@ -618,6 +618,7 @@ tests_tbl[] =
     { "over_n_1555",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
     { "over_8888_0565",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "over_8888_x888",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "over_x888_8_0565",      PIXMAN_x8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_r5g6b5 },
     { "over_n_8_0565",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_r5g6b5 },
     { "over_n_8_1555",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
     { "over_n_8_4444",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },