pixman: Branch 'master'

Søren Sandmann Pedersen sandmann at kemper.freedesktop.org
Wed Sep 2 16:33:52 PDT 2009


 pixman/pixman-sse2.c |  228 ++++++++++++++++++++++++---------------------------
 1 file changed, 108 insertions(+), 120 deletions(-)

New commits:
commit 632125d4108f9a53d625a6b997832fa45a295807
Author: Søren Sandmann Pedersen <sandmann at redhat.com>
Date:   Tue Jun 2 08:27:33 2009 -0400

    Enable the x888_8_8888 sse2 fast path.
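
For context, the operation this fast path implements is dest = (src IN mask) OVER dest: the undefined alpha byte of the x8r8g8b8 source is forced to opaque, the a8 mask supplies per-pixel coverage, and the result is blended over the destination. Below is a minimal scalar sketch of the per-pixel math; mul_un8 mirrors the rounding of pixman's MUL_UN8 macro, and the function names are illustrative, not pixman API.

#include <stdint.h>

/* (a * b + 127) / 255 without a division; this mirrors the rounding
 * of pixman's MUL_UN8 macro. */
static uint8_t
mul_un8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t)a * b + 0x80;

    return (uint8_t)((t + (t >> 8)) >> 8);
}

/* Scalar reference for one pixel of over_x888_8_8888: force the x888
 * source opaque, scale it by the a8 mask (IN), then blend the result
 * over the destination (OVER). */
static uint32_t
over_x888_8_8888_pixel (uint32_t src, uint8_t mask, uint32_t dst)
{
    uint32_t s = 0xff000000 | src;  /* x888 -> a888, alpha = 0xff */
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint8_t sc = (uint8_t)(s >> shift);
        uint8_t dc = (uint8_t)(dst >> shift);

        /* Since the source is opaque, (src IN mask) has alpha == mask,
         * so OVER reduces to sc * mask + dc * (255 - mask). */
        uint8_t oc = mul_un8 (sc, mask) + mul_un8 (dc, 255 - mask);

        result |= (uint32_t)oc << shift;
    }

    return result;
}

The special cases in the SSE2 code below fall out of this math directly: mask == 0xff stores the opaque source unchanged, and mask == 0 leaves the destination pixel alone.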

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 34a1ac9..984614e 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5422,9 +5422,7 @@ sse2_composite_copy_area (pixman_implementation_t *imp,
                      src_x, src_y, dest_x, dest_y, width, height);
 }
 
-#if 0
-/* This code are buggy in MMX version, now the bug was translated to SSE2 version */
-void
+static void
 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                  pixman_op_t              op,
                                  pixman_image_t *         src_image,
@@ -5459,125 +5457,118 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
 
     while (height--)
     {
-	src = src_line;
-	src_line += src_stride;
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
-
-	w = width;
-
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)mask);
-
-	while (w && (unsigned long)dst & 15)
-	{
-	    s = 0xff000000 | *src++;
-	    m = (uint32_t) *mask++;
-	    d = *dst;
-
-	    __m64 ms = unpack_32_1x64 (s);
-
-	    if (m != 0xff)
-	    {
-		ms = in_over_1x64 (ms,
-		                   mask_x00ff,
-		                   expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
-		                   unpack_32_1x64 (d));
-	    }
-
-	    *dst++ = pack_1x64_32 (ms);
-	    w--;
-	}
-
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)mask);
-
-	while (w >= 4)
-	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-	    cache_prefetch_next ((__m128i*)mask);
-
-	    m = *(uint32_t*) mask;
-	    xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
-
-	    if (m == 0xffffffff)
-	    {
-		save_128_aligned ((__m128i*)dst, xmm_src);
-	    }
-	    else
-	    {
-		xmm_dst = load_128_aligned ((__m128i*)dst);
-
-		xmm_mask = _mm_unpacklo_epi16 (
-		    unpack_32_1x128 (m), _mm_setzero_si128 ());
-
-		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
-
-		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
-					&xmm_mask_lo, &xmm_mask_hi);
-
-		in_over_2x128 (xmm_src_lo, xmm_src_hi,
-			       mask_00ff, mask_00ff,
-			       xmm_mask_lo, xmm_mask_hi,
-			       &xmm_dst_lo, &xmm_dst_hi);
-
-		save_128_aligned (
-		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
-	    }
-
-	    src += 4;
-	    dst += 4;
-	    mask += 4;
-	    w -= 4;
-	}
-
-	while (w)
-	{
-	    m = (uint32_t) *mask++;
-
-	    if (m)
-	    {
-		s = 0xff000000 | *src;
-
-		if (m == 0xff)
-		{
-		    *dst = s;
-		}
-		else
-		{
-		    d = *dst;
-
-		    *dst = pack_1x64_32 (
-			in_over_1x64 (
-			    unpack_32_1x64 (s),
-			    mask_x00ff,
-			    expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
-			    unpack_32_1x64 (d)));
-		}
-
-	    }
-
-	    src++;
-	    dst++;
-	    w--;
-	}
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        /* call prefetch hints to optimize cache loads */
+        cache_prefetch ((__m128i*)src);
+        cache_prefetch ((__m128i*)dst);
+        cache_prefetch ((__m128i*)mask);
+
+        while (w && (unsigned long)dst & 15)
+        {
+            s = 0xff000000 | *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+            __m64 ms = unpack_32_1x64 (s);
+
+            if (m != 0xff)
+            {
+		__m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+		__m64 md = unpack_32_1x64 (d);
+
+                ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
+            }
+
+            *dst++ = pack_1x64_32 (ms);
+            w--;
+        }
+
+        /* call prefetch hints to optimize cache loads */
+        cache_prefetch ((__m128i*)src);
+        cache_prefetch ((__m128i*)dst);
+        cache_prefetch ((__m128i*)mask);
+
+        while (w >= 4)
+        {
+            /* prefetch the next cache lines before they are needed */
+            cache_prefetch_next ((__m128i*)src);
+            cache_prefetch_next ((__m128i*)dst);
+            cache_prefetch_next ((__m128i*)mask);
+
+            m = *(uint32_t*) mask;
+            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+            if (m == 0xffffffff)
+            {
+                save_128_aligned ((__m128i*)dst, xmm_src);
+            }
+            else
+            {
+                xmm_dst = load_128_aligned ((__m128i*)dst);
+
+                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        while (w)
+        {
+            m = (uint32_t) *mask++;
+
+            if (m)
+            {
+                s = 0xff000000 | *src;
+
+                if (m == 0xff)
+                {
+                    *dst = s;
+                }
+                else
+                {
+		    __m64 ma, md, ms;
+
+                    d = *dst;
+
+		    ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+		    md = unpack_32_1x64 (d);
+		    ms = unpack_32_1x64 (s);
+
+                    *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
+                }
+
+            }
+
+            src++;
+            dst++;
+            w--;
+        }
     }
 
     _mm_empty ();
 }
 
-#endif
-
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   sse2_composite_over_n_8_0565,       0 },
@@ -5595,13 +5586,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888,       0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888,       0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888,       0 },
-#if 0
-    /* FIXME: This code are buggy in MMX version, now the bug was translated to SSE2 version */
     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
-#endif
     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
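
The enabled function follows the head/body/tail structure used throughout pixman-sse2.c: scalar pixels until dst reaches 16-byte alignment, then four pixels per iteration with aligned 128-bit stores, then a scalar tail. Here is a stripped-down sketch of that skeleton, with the x888 -> 8888 alpha fill standing in for the full blend and the prefetch hints omitted; process_row is an illustrative name, not pixman API.

#include <stdint.h>
#include <emmintrin.h>  /* SSE2 intrinsics */

static void
process_row (uint32_t *dst, const uint32_t *src, int w)
{
    const __m128i alpha = _mm_set1_epi32 ((int)0xff000000);

    /* Head: one pixel at a time until dst is 16-byte aligned,
     * so the main loop can use aligned stores. */
    while (w && ((uintptr_t)dst & 15))
    {
        *dst++ = 0xff000000 | *src++;
        w--;
    }

    /* Body: four pixels per iteration; the source may still be
     * misaligned, so it is fetched with an unaligned load. */
    while (w >= 4)
    {
        __m128i s = _mm_loadu_si128 ((const __m128i *)src);

        _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (s, alpha));

        src += 4;
        dst += 4;
        w -= 4;
    }

    /* Tail: at most three remaining pixels. */
    while (w)
    {
        *dst++ = 0xff000000 | *src++;
        w--;
    }
}

The same shape is visible in the patch above: the first while loop handles the unaligned head with 1x64 operations, the w >= 4 loop does the unpacked 2x128 math behind an aligned store, and the final while loop drains the remainder.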

