pixman: Branch 'master' - 2 commits

Mon Dec 6 18:54:38 PST 2010

 pixman/pixman-arm-neon-asm.S |   57 +++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-neon.c     |    5 +++
 pixman/pixman-general.c      |    6 ++--
 3 files changed, 65 insertions(+), 3 deletions(-)

New commits:
commit 3d094997b1820719d15cec7dc633ed37e1912bfc
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Tue Nov 30 00:31:06 2010 +0200

    Fix for potential unaligned memory accesses
    
    The temporary scanline buffer allocated on the stack was declared
    as a uint8_t array. As a result, the compiler was free to select
    any arbitrary alignment for it (even though there is typically
    no reason to use really weird alignments here and the stack is
    normally at least 4-byte aligned on most platforms). Having
    improper alignment is non-portable and can impact performance
    or even make the code misbehave depending on the target platform.
    
    Using the uint64_t type for this array should ensure that any possible
    memory accesses done by pixman code are going to be handled correctly
    (pixman-combine64.c can access this buffer via a uint64_t * pointer).
    
    An alignment-related problem was reported in:
    http://lists.freedesktop.org/archives/pixman/2010-November/000747.html

diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index 4d234a0..8130f16 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -56,8 +56,8 @@ general_composite_rect  (pixman_implementation_t *imp,
                          int32_t                  width,
                          int32_t                  height)
 {
-    uint8_t stack_scanline_buffer[SCANLINE_BUFFER_LENGTH * 3];
-    uint8_t *scanline_buffer = stack_scanline_buffer;
+    uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8];
+    uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
     uint8_t *src_buffer, *mask_buffer, *dest_buffer;
     fetch_scanline_t fetch_src = NULL, fetch_mask = NULL, fetch_dest = NULL;
     pixman_combine_32_func_t compose;
@@ -255,7 +255,7 @@ general_composite_rect  (pixman_implementation_t *imp,
 	}
     }
 
-    if (scanline_buffer != stack_scanline_buffer)
+    if (scanline_buffer != (uint8_t *) stack_scanline_buffer)
 	free (scanline_buffer);
 }
 
commit 985e59a82fa5e644cb6516dc174ab3f79f1448df
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Thu Nov 25 02:28:29 2010 +0200

    ARM: added 'neon_src_rpixbuf_8888' fast path
    
    With this optimization added, pixman-assisted conversion from
    non-premultiplied to premultiplied alpha format is now fully
    NEON-optimized (both with and without swapping the R/B color
    components in the process).

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index cf014fa..4dddde1 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2072,6 +2072,63 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d28, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d30, q13, q10
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    fetch_src_pixblock
+        vraddhn.u16 d28, q11, q8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d30, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
     /* mask is in d15 */
     convert_0565_to_x888 q4, d2, d1, d0
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index e3eca2b..c28c481 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -52,6 +52,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
                                    uint8_t, 3, uint16_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_rpixbuf_8888,
+                                   uint32_t, 1, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
@@ -249,6 +251,9 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     x8r8g8b8, neon_composite_src_0888_8888_rev),
     PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     r5g6b5,   neon_composite_src_0888_0565_rev),
     PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8r8g8b8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8b8g8r8, neon_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8r8g8b8, neon_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8b8g8r8, neon_composite_src_pixbuf_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       neon_composite_over_n_8_8),
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   neon_composite_over_n_8_0565),
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   neon_composite_over_n_8_0565),