pixman: Branch 'master' - 4 commits

Mon Apr 18 13:30:38 PDT 2011

configure.ac                 |   49 +++++++-----
 demos/tri-test.c             |    2 
 pixman/pixman-arm-neon-asm.S |  169 +++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-neon.c     |    4 +
 pixman/pixman-trap.c         |   23 ++++-
 test/composite-traps-test.c  |    2 
 6 files changed, 220 insertions(+), 29 deletions(-)

New commits:
commit b455496890f7f941d561c284aca14783300bedd6
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Fri Mar 11 07:52:57 2011 -0500

    Offset rendering in pixman_composite_trapezoids() by (x_dst, y_dst)
    
    Previously, this function would do coordinate calculations in such a
    way that (x_dst, y_dst) would only affect the alignment of the source
    image, but not of the traps, which would always be considered to be in
    absolute destination coordinates. This is unlike the
    pixman_image_composite() function which also registers the mask to the
    destination.
    
    This patch makes it so that traps are also offset by (x_dst, y_dst).
    
    Also add a comment explaining how this function is supposed to
    operate, and update tri-test.c and composite-traps-test.c to deal with
    the new semantics.

diff --git a/demos/tri-test.c b/demos/tri-test.c
index ff4779e..a71869a 100644
--- a/demos/tri-test.c
+++ b/demos/tri-test.c
@@ -36,7 +36,7 @@ main (int argc, char **argv)
 				dest_img,
 				PIXMAN_a8,
 				200, 200,
-				35, 5,
+				-5, 5,
 				ARRAY_LENGTH (tris), tris);
     show_image (dest_img);
     
diff --git a/pixman/pixman-trap.c b/pixman/pixman-trap.c
index 2957a2b..c99f03e 100644
--- a/pixman/pixman-trap.c
+++ b/pixman/pixman-trap.c
@@ -387,6 +387,19 @@ pixman_rasterize_trapezoid (pixman_image_t *          image,
     }
 }
 
+/*
+ * pixman_composite_trapezoids()
+ *
+ * All the trapezoids are conceptually rendered to an infinitely big image.
+ * The (0, 0) coordinates of this image are then aligned with the (x, y)
+ * coordinates of the source image, and then both images are aligned with
+ * the (x, y) coordinates of the destination. Then, in principle, compositing
+ * of these three images takes place across the entire destination.
+ *
+ * FIXME: However, there is currently a bug, where we restrict this compositing
+ * to the bounding box of the trapezoids. This is incorrect for operators such
+ * as SRC and IN where blank source pixels do have an effect on the destination.
+ */
 PIXMAN_EXPORT void
 pixman_composite_trapezoids (pixman_op_t		op,
 			     pixman_image_t *		src,
@@ -419,14 +432,13 @@ pixman_composite_trapezoids (pixman_op_t		op,
 	    if (!pixman_trapezoid_valid (trap))
 		continue;
 	    
-	    pixman_rasterize_trapezoid (dst, trap, 0, 0);
+	    pixman_rasterize_trapezoid (dst, trap, x_dst, y_dst);
 	}
     }
     else
     {
 	pixman_image_t *tmp;
 	pixman_box32_t box;
-	int x_rel, y_rel;
 	
 	box.x1 = INT32_MAX;
 	box.y1 = INT32_MAX;
@@ -482,11 +494,10 @@ pixman_composite_trapezoids (pixman_op_t		op,
 	    pixman_rasterize_trapezoid (tmp, trap, - box.x1, - box.y1);
 	}
 	
-	x_rel = box.x1 + x_src - x_dst;
-	y_rel = box.y1 + y_src - y_dst;
-	
 	pixman_image_composite (op, src, tmp, dst,
-				x_rel, y_rel, 0, 0, box.x1, box.y1,
+				x_src + box.x1, y_src + box.y1,
+				0, 0,
+				x_dst + box.x1, y_dst + box.y1,
 				box.x2 - box.x1, box.y2 - box.y1);
 	
 	pixman_image_unref (tmp);
diff --git a/test/composite-traps-test.c b/test/composite-traps-test.c
index cf30281..fa6d8a9 100644
--- a/test/composite-traps-test.c
+++ b/test/composite-traps-test.c
@@ -252,6 +252,6 @@ test_composite (int      testnum,
 int
 main (int argc, const char *argv[])
 {
-    return fuzzer_test_main("composite traps", 40000, 0xA34F95C7,
+    return fuzzer_test_main("composite traps", 40000, 0xE3112106,
 			    test_composite, argc, argv);
 }
commit e75e6a4ef5c5a8ac8b0e8464f08f83fd2b6e86ed
Author: Søren Sandmann Pedersen <ssp at redhat.com>
Date:   Sat Apr 2 23:24:48 2011 -0400

    ARM: Add 'neon_composite_over_n_8888_0565_ca' fast path
    
    This improves the performance of the firefox-talos-gfx benchmark with
    the image16 backend. Benchmark on an 800 MHz ARM Cortex A8:
    
    Before:
    
    [ # ]  backend                         test   min(s) median(s) stddev. count
    [  0]  image16            firefox-talos-gfx  121.773  122.218   0.15%    6/6
    
    After:
    
    [ # ]  backend                         test   min(s) median(s) stddev. count
    [  0]  image16            firefox-talos-gfx   85.247   85.563   0.22%    6/6
    
    V2: Slightly better instruction scheduling based on comments from Taekyun Kim.
    V3: Eliminate all stalls from the inner loop. Also based on comments from Taekyun Kim.

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 5e9fda3..833f18c 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1426,6 +1426,175 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
+     *         mask in          {d24, d25, d26}       [B, G, R]
+     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
+     *         updated mask in  {d24, d25, d26}       [B, G, R]
+     */
+    vmull.u8    q0,  d24, d8
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q9,  d11, d25
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vraddhn.u16 d24, q12, q11
+    vraddhn.u16 d25, q9,  q8
+    /*
+     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+     * and put data into d16 - blue, d17 - green, d18 - red
+     */
+       vshrn.u16   d17, q2,  #3
+       vshrn.u16   d18, q2,  #8
+    vraddhn.u16 d26, q13, q6
+       vsli.u16    q2,  q2,  #5
+       vsri.u8     d18, d18, #5
+       vsri.u8     d17, d17, #6
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in d16 - blue, d17 - green, d18 - red
+     */
+    vmvn.8      q12, q12
+       vshrn.u16   d16, q2,  #2
+    vmvn.8      d26, d26
+    vmull.u8    q6,  d16, d24
+    vmull.u8    q7,  d17, d25
+    vmull.u8    q11, d18, d26
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q14, q7,  #8
+    vrshr.u16   q15, q11, #8
+    vraddhn.u16 d16, q10, q6
+    vraddhn.u16 d17, q14, q7
+    vraddhn.u16 d18, q15, q11
+    vqadd.u8    q8,  q0,  q8
+    vqadd.u8    d18, d2,  d18
+    /*
+     * convert the results in d16, d17, d18 to r5g6b5 and store
+     * them into {d28, d29}
+     */
+    vshll.u8    q14, d18, #8
+    vshll.u8    q10, d17, #8
+    vshll.u8    q15, d16, #8
+    vsri.u16    q14, q10, #5
+    vsri.u16    q14, q15, #11
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+    fetch_mask_pixblock
+        vrshr.u16   q10, q6, #8
+        vrshr.u16   q14, q7, #8
+    vld1.16     {d4, d5}, [DST_R, :128]!
+        vrshr.u16   q15, q11, #8
+        vraddhn.u16 d16, q10, q6
+        vraddhn.u16 d17, q14, q7
+        vraddhn.u16 d22, q15, q11
+            /* process_pixblock_head */
+            /*
+             * 'combine_mask_ca' replacement
+             *
+             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
+             *         mask in          {d24, d25, d26}       [B, G, R]
+             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
+             *         updated mask in  {d24, d25, d26}       [B, G, R]
+             */
+            vmull.u8    q1,  d25, d9
+        vqadd.u8    q8,  q0, q8
+            vmull.u8    q0,  d24, d8
+        vqadd.u8    d22, d2, d22
+            vmull.u8    q6,  d26, d10
+        /*
+         * convert the result in d16, d17, d22 to r5g6b5 and store
+         * it into {d28, d29}
+         */
+        vshll.u8    q14, d22, #8
+        vshll.u8    q10, d17, #8
+        vshll.u8    q15, d16, #8
+            vmull.u8    q9,  d11, d25
+        vsri.u16    q14, q10, #5
+            vmull.u8    q12, d11, d24
+            vmull.u8    q13, d11, d26
+        vsri.u16    q14, q15, #11
+    cache_preload 8, 8
+            vrshr.u16   q8,  q0,  #8
+            vrshr.u16   q10, q1,  #8
+            vrshr.u16   q11, q6,  #8
+            vraddhn.u16 d0,  q0,  q8
+            vraddhn.u16 d1,  q1,  q10
+            vraddhn.u16 d2,  q6,  q11
+            vrshr.u16   q11, q12, #8
+            vrshr.u16   q8,  q9,  #8
+            vrshr.u16   q6,  q13, #8
+            vraddhn.u16 d25, q9,  q8
+                /*
+                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
+	         * 8-bit format and put data into d16 - blue, d17 - green,
+	         * d18 - red
+                 */
+                vshrn.u16   d17, q2,  #3
+                vshrn.u16   d18, q2,  #8
+            vraddhn.u16 d24, q12, q11
+            vraddhn.u16 d26, q13, q6
+                vsli.u16    q2,  q2,  #5
+                vsri.u8     d18, d18, #5
+                vsri.u8     d17, d17, #6
+            /*
+             * 'combine_over_ca' replacement
+             *
+             * output: updated dest in d16 - blue, d17 - green, d18 - red
+             */
+            vmvn.8      q12, q12
+                vshrn.u16   d16, q2,  #2
+            vmvn.8      d26, d26
+            vmull.u8    q7,  d17, d25
+            vmull.u8    q6,  d16, d24
+            vmull.u8    q11, d18, d26
+    vst1.16     {d28, d29}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_0565_ca_init, \
+    pixman_composite_over_n_8888_0565_ca_cleanup, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
 .macro pixman_composite_in_n_8_process_pixblock_head
     /* expecting source data in {d0, d1, d2, d3} */
     /* and destination data in {d4, d5, d6, d7} */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 0a10ca1..77875ad 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -80,6 +80,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888,
                                       uint8_t, 1, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca,
                                       uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_0565_ca,
+				      uint32_t, 1, uint16_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8,
                                       uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
@@ -282,6 +284,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5,   neon_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5,   neon_composite_over_n_8888_0565_ca),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    a8r8g8b8, neon_composite_over_8888_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, neon_composite_over_8888_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    r5g6b5,   neon_composite_over_8888_n_0565),
commit 1670b952143284f480c39ff087b5694a64eb7db3
Author: Gilles Espinasse <g.esp at free.fr>
Date:   Tue Apr 12 22:44:56 2011 +0200

    Fix OpenMP not supported case
    
    PIXMAN_LINK_WITH_ENV did not fail unless -Wall -Werror is used.
    So even when the compiler did not support OpenMP, USE_OPENMP was defined.
    Fix that by running the second OpenMP test only when the first AC_OPENMP check finds OpenMP supported.
    
    configure tested in the cases :
    gcc without libgomp support, no openmp option, --enable-openmp and --disable-openmp
    gcc with libgomp support, no openmp option, --enable-openmp and --disable-openmp
    
    Not tested with autoconf versions that do not provide AC_OPENMP (<2.62)
    
    Warn when --enable-openmp is requested but no support is found
    
    Signed-off-by: Gilles Espinasse <g.esp at free.fr>

diff --git a/configure.ac b/configure.ac
index 0b526d6..09a4948 100644
--- a/configure.ac
+++ b/configure.ac
@@ -192,35 +192,43 @@ dnl =========================================================================
 dnl OpenMP for the test suite?
 dnl
 
-# Check for OpenMP support (only supported by autoconf >=2.62)
+# Check for OpenMP support only when autoconf support that (require autoconf >=2.62)
 OPENMP_CFLAGS=
 m4_ifdef([AC_OPENMP], [AC_OPENMP])
 
-m4_define([openmp_test_program],[dnl
-#include <stdio.h>
+if test "x$enable_openmp" = "xyes" && test "x$ac_cv_prog_c_openmp" = "xunsupported" ; then
+  AC_MSG_WARN([OpenMP support requested but found unsupported])
+fi
 
-extern unsigned int lcg_seed;
-#pragma omp threadprivate(lcg_seed)
-unsigned int lcg_seed;
+dnl May not fail to link without -Wall -Werror added
+dnl So try to link only when openmp is supported
+dnl ac_cv_prog_c_openmp is not defined when --disable-openmp is used
+if test "x$ac_cv_prog_c_openmp" != "xunsupported" && test "x$ac_cv_prog_c_openmp" != "x"; then
+  m4_define([openmp_test_program],[dnl
+  #include <stdio.h>
 
-unsigned function(unsigned a, unsigned b)
-{
+  extern unsigned int lcg_seed;
+  #pragma omp threadprivate(lcg_seed)
+  unsigned int lcg_seed;
+
+  unsigned function(unsigned a, unsigned b)
+  {
 	lcg_seed ^= b;
 	return ((a + b) ^ a ) + lcg_seed;
-}
+  }
 
-int main(int argc, char **argv)
-{
+  int main(int argc, char **argv)
+  {
 	int i;
 	int n1 = 0, n2 = argc;
 	unsigned checksum = 0;
 	int verbose = argv != NULL;
 	unsigned (*test_function)(unsigned, unsigned);
 	test_function = function;
-    #pragma omp parallel for reduction(+:checksum) default(none) \
+	#pragma omp parallel for reduction(+:checksum) default(none) \
 					shared(n1, n2, test_function, verbose)
 	for (i = n1; i < n2; i++)
-    	{
+	{
 		unsigned crc = test_function (i, 0);
 		if (verbose)
 			printf ("%d: %08X\n", i, crc);
@@ -228,18 +236,17 @@ int main(int argc, char **argv)
 	}
 	printf("%u\n", checksum);
 	return 0;
-}
-])
+  }
+  ])
 
-PIXMAN_LINK_WITH_ENV(
+  PIXMAN_LINK_WITH_ENV(
 	[CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
 	[openmp_test_program],
 	[have_openmp=yes],
 	[have_openmp=no])
-if test "x$have_openmp" = "xyes"; then
-   AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
-else
-   OPENMP_CFLAGS=""
+  if test "x$have_openmp" = "xyes" ; then
+    AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
+  fi
 fi
 AC_SUBST(OPENMP_CFLAGS)
 
commit b9e8f7fb7494e4ee4be56d1555632233a494b28e
Author: Gilles Espinasse <g.esp at free.fr>
Date:   Tue Apr 12 22:44:25 2011 +0200

    Fix missing AC_MSG_RESULT value from Werror test
    
    Use the correct variable name
    
    Signed-off-by: Gilles Espinasse <g.esp at free.fr>

diff --git a/configure.ac b/configure.ac
index db9a883..0b526d6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -119,7 +119,7 @@ for w in -Werror -errwarn; do
 		[CFLAGS=$w],
 		[int main(int c, char **v) { (void)c; (void)v; return 0; }],
 		[WERROR=$w; yesno=yes], [yesno=no])
-	AC_MSG_RESULT($_yesno)
+	AC_MSG_RESULT($yesno)
     fi
 done