pixman: Branch 'master' - 10 commits

Siarhei Siamashka siamashka at kemper.freedesktop.org
Wed Nov 10 08:52:05 PST 2010


 pixman/pixman-arm-common.h   |   40 +++++++
 pixman/pixman-arm-neon-asm.S |  100 ++++++++++++++-----
 pixman/pixman-arm-neon-asm.h |  216 ++++++++++++++++++++++++++++++++++++++++---
 pixman/pixman-arm-neon.c     |   30 +++++
 pixman/pixman-arm-simd-asm.S |   70 +++++++++++++
 pixman/pixman-arm-simd.c     |    7 +
 6 files changed, 423 insertions(+), 40 deletions(-)

New commits:
commit d8fe87a6262ee661af8fb0d46bab223e4ab3d88e
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Mon Oct 4 01:56:59 2010 +0300

    ARM: optimization for scaled src_0565_0565 with nearest filter
    
    The performance improvement is only in the ballpark of 5% when
    compared against C code built with a reasonably good compiler
    (gcc 4.5.1). But gcc 4.4 produces approximately 30% slower code
    here, so assembly optimization makes sense to avoid dependency
    on the compiler quality and/or optimization options.
    
    Benchmark from ARM11:
        == before ==
        op=1, src_fmt=10020565, dst_fmt=10020565, speed=34.86 MPix/s
    
        == after ==
        op=1, src_fmt=10020565, dst_fmt=10020565, speed=36.62 MPix/s
    
    Benchmark from ARM Cortex-A8:
        == before ==
        op=1, src_fmt=10020565, dst_fmt=10020565, speed=89.55 MPix/s
    
        == after ==
        op=1, src_fmt=10020565, dst_fmt=10020565, speed=94.91 MPix/s

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a3d2d40..7567700 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -328,3 +329,72 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
 	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
 	bx	lr
 .endfunc
+
+/*
+ * Note: This function is only using armv4t instructions (not even armv6),
+ *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
+ *       be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), it needs to be changed to use 16-byte
+ * aligned writes using STM instruction.
+ */
+pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
+	W	.req	r0
+	DST	.req	r1
+	SRC	.req	r2
+	VX	.req	r3
+	UNIT_X	.req	ip
+	TMP1	.req	r4
+	TMP2	.req	r5
+	VXMASK	.req	r6
+
+	ldr	UNIT_X, [sp]
+	push	{r4, r5, r6, r7}
+	mvn	VXMASK, #1
+
+	/* define helper macro */
+	.macro	scale_2_pixels
+		ldrh	TMP1, [SRC, TMP1]
+		and	TMP2, VXMASK, VX, lsr #15
+		add	VX, VX, UNIT_X
+		strh	TMP1, [DST], #2
+
+		ldrh	TMP2, [SRC, TMP2]
+		and	TMP1, VXMASK, VX, lsr #15
+		add	VX, VX, UNIT_X
+		strh	TMP2, [DST], #2
+	.endm
+
+	/* now do the scaling */
+	and	TMP1, VXMASK, VX, lsr #15
+	add	VX, VX, UNIT_X
+	subs	W, #4
+	blt	2f
+1: /* main loop, process 4 pixels per iteration */
+	scale_2_pixels
+	scale_2_pixels
+	subs	W, W, #4
+	bge	1b
+2:
+	tst	W, #2
+	beq	2f
+	scale_2_pixels
+2:
+	tst	W, #1
+	ldrneh	TMP1, [SRC, TMP1]
+	strneh	TMP1, [DST], #2
+	/* cleanup helper macro */
+	.purgem	scale_2_pixels
+	.unreq	DST
+	.unreq	SRC
+	.unreq	W
+	.unreq	VX
+	.unreq	UNIT_X
+	.unreq	TMP1
+	.unreq	TMP2
+	.unreq	VXMASK
+	/* return */
+	pop	{r4, r5, r6, r7}
+	bx	lr
+.endfunc
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index d466a31..3b05007 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -29,6 +29,7 @@
 
 #include "pixman-private.h"
 #include "pixman-arm-common.h"
+#include "pixman-fast-path.h"
 
 #if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
 
@@ -386,6 +387,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
                                       uint8_t, 1, uint32_t, 1)
 
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+                                        uint16_t, uint16_t)
+
 static const pixman_fast_path_t arm_simd_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
@@ -404,6 +408,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
 
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
     { PIXMAN_OP_NONE },
 };
 
commit b8007d042354fd9bd15711d9921e6f1ebb1c3c22
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Tue Nov 2 16:12:42 2010 +0200

    ARM: NEON optimization for scaled src_0565_8888 with nearest filter
    
    Benchmark from ARM Cortex-A8 @720MHz:
        == before ==
        op=1, src_fmt=10020565, dst_fmt=20028888, speed=8.99 MPix/s
    
        == after ==
        op=1, src_fmt=10020565, dst_fmt=20028888, speed=76.98 MPix/s
    
        == unscaled ==
        op=1, src_fmt=10020565, dst_fmt=20028888, speed=137.78 MPix/s

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 4d6b03b..91ec27d 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2009,3 +2009,13 @@ generate_composite_function_nearest_scanline \
     pixman_composite_src_8888_0565_process_pixblock_head, \
     pixman_composite_src_8888_0565_process_pixblock_tail, \
     pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index bf921e6..2f82069 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -103,6 +103,8 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
                                         uint32_t, uint16_t)
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC,
                                         uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC,
+                                        uint16_t, uint32_t)
 
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
@@ -291,6 +293,14 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
 
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
+    /* Note: NONE repeat is not supported yet */
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+
     { PIXMAN_OP_NONE },
 };
 
commit 2e855a2b4a2bb7b3d2ed1826cb4426d14080ca67
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Tue Nov 2 15:25:51 2010 +0200

    ARM: NEON optimization for scaled src_8888_0565 with nearest filter
    
    Benchmark from ARM Cortex-A8 @720MHz:
        == before ==
        op=1, src_fmt=20028888, dst_fmt=10020565, speed=42.51 MPix/s
    
        == after ==
        op=1, src_fmt=20028888, dst_fmt=10020565, speed=55.61 MPix/s
    
        == unscaled ==
        op=1, src_fmt=20028888, dst_fmt=10020565, speed=117.99 MPix/s

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index ea3337a..4d6b03b 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1999,3 +1999,13 @@ generate_composite_function_nearest_scanline \
     4,  /* dst_r_basereg */ \
     0,  /* src_basereg   */ \
     24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index a200ed1..bf921e6 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -101,6 +101,8 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
                                         uint32_t, uint32_t)
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
                                         uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC,
+                                        uint32_t, uint16_t)
 
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
@@ -284,6 +286,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
 
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
+
     { PIXMAN_OP_NONE },
 };
 
commit 4a09e472b8fbfae3e67d05a26ecc9c8a17225053
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Tue Nov 2 14:39:02 2010 +0200

    ARM: NEON optimization for scaled over_8888_0565 with nearest filter
    
    Benchmark from ARM Cortex-A8 @720MHz:
        == before ==
        op=3, src_fmt=20028888, dst_fmt=10020565, speed=10.29 MPix/s
    
        == after ==
        op=3, src_fmt=20028888, dst_fmt=10020565, speed=36.36 MPix/s
    
        == unscaled ==
        op=3, src_fmt=20028888, dst_fmt=10020565, speed=79.40 MPix/s

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index b009578..ea3337a 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1985,3 +1985,17 @@ generate_composite_function_nearest_scanline \
     pixman_composite_over_8888_8888_process_pixblock_head, \
     pixman_composite_over_8888_8888_process_pixblock_tail, \
     pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 9c2838f..a200ed1 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -99,6 +99,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
 
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
                                         uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
+                                        uint32_t, uint16_t)
 
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
@@ -279,6 +281,9 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
 
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
+
     { PIXMAN_OP_NONE },
 };
 
commit 67a4991f3341d38bc3477c8f99f2ef581cd609e3
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Tue Nov 2 14:29:57 2010 +0200

    ARM: NEON optimization for scaled over_8888_8888 with nearest filter
    
    Benchmark from ARM Cortex-A8 @720MHz:
        == before ==
        op=3, src_fmt=20028888, dst_fmt=20028888, speed=12.73 MPix/s
    
        == after ==
        op=3, src_fmt=20028888, dst_fmt=20028888, speed=28.75 MPix/s
    
        == unscaled ==
        op=3, src_fmt=20028888, dst_fmt=20028888, speed=53.03 MPix/s

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 029709b..b009578 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1973,3 +1973,15 @@ generate_composite_function \
     10, /* dst_r_basereg */ \
     15, /* src_basereg   */ \
     0   /* mask_basereg  */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index be5d403..9c2838f 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -97,6 +97,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_0565,
 PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
                                         uint16_t, 1, uint8_t, 1, uint16_t, 1)
 
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
+                                        uint32_t, uint32_t)
+
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
                                    int32_t   h,
@@ -271,6 +274,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565),
 
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
+
     { PIXMAN_OP_NONE },
 };
 
commit 0b56244ac81f2bb2402629f8720c7e22893a24df
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Tue Nov 2 19:16:46 2010 +0200

    ARM: performance tuning of NEON nearest scaled pixel fetcher
    
    Interleaving the use of NEON registers helps to avoid some stalls
    in NEON pipeline and provides a small performance improvement.

diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index d3b506d..c75bdc3 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -241,6 +241,30 @@
 .endif
 .endm
 
+.macro pixld2_s elem_size, reg1, reg2, mem_operand
+.if elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    sub     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg2&[0]}, [TMP2, :32]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[1]}, [TMP1, :32]
+    vld1.32 {d&reg2&[1]}, [TMP2, :32]
+.else
+    pixld1_s elem_size, reg1, mem_operand
+    pixld1_s elem_size, reg2, mem_operand
+.endif
+.endm
+
 .macro pixld0_s elem_size, reg1, idx, mem_operand
 .if elem_size == 16
     mov     TMP1, VX, asr #16
@@ -257,14 +281,11 @@
 
 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
 .if numbytes == 32
-    pixld1_s elem_size, %(basereg+4), mem_operand
-    pixld1_s elem_size, %(basereg+5), mem_operand
-    pixld1_s elem_size, %(basereg+6), mem_operand
-    pixld1_s elem_size, %(basereg+7), mem_operand
+    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
+    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
     pixdeinterleave elem_size, %(basereg+4)
 .elseif numbytes == 16
-    pixld1_s elem_size, %(basereg+2), mem_operand
-    pixld1_s elem_size, %(basereg+3), mem_operand
+    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
 .elseif numbytes == 8
     pixld1_s elem_size, %(basereg+1), mem_operand
 .elseif numbytes == 4
commit 6e76af0d4b60ab74b309994926f28c532c5af155
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Tue Nov 2 14:26:13 2010 +0200

    ARM: macro template in C code to simplify using scaled fast paths
    
    This template can be used to instantiate scaled fast path functions
    by providing main loop code and calling NEON assembly optimized
    scanline processing functions from it. Another macro can be used
    to simplify adding entries to fast path tables.

diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 8d432b1..2cff6c8 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -26,6 +26,8 @@
 #ifndef PIXMAN_ARM_COMMON_H
 #define PIXMAN_ARM_COMMON_H
 
+#include "pixman-fast-path.h"
+
 /* Define some macros which can expand into proxy functions between
  * ARM assembly optimized functions and the rest of pixman fast path API.
  *
@@ -270,4 +272,42 @@ cputype##_composite_##name (pixman_implementation_t *imp,               \
                                              mask_line, mask_stride);   \
 }
 
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op,             \
+                                               src_type, dst_type)            \
+void                                                                          \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
+                                                       int32_t        w,      \
+                                                       dst_type *     dst,    \
+                                                       src_type *     src,    \
+                                                       pixman_fixed_t vx,     \
+                                                       pixman_fixed_t unit_x);\
+                                                                              \
+static force_inline void                                                      \
+scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
+                                                   src_type *       ps,       \
+                                                   int32_t          w,        \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx)   \
+{                                                                             \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
+                                                                  vx, unit_x);\
+}                                                                             \
+                                                                              \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, COVER)                             \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, NONE)                              \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, PAD)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+
 #endif
commit 88014a0e6ffaa22b3ac363c2c73b72530cdba0cc
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Mon Nov 1 10:03:59 2010 +0200

    ARM: nearest scaling support for NEON scanline compositing functions
    
    Now it is possible to generate scanline processing functions
    for the case when the source image is scaled with NEAREST filter.
    
    Only 16bpp and 32bpp pixel formats are supported for now. But the
    others can be also added later when needed. All the existing NEON
    fast path functions should be quite easy to reuse for implementing
    fast paths which can work with scaled source images.

diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index aa5e9bd..d3b506d 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -205,6 +205,100 @@
 .endif
 .endm
 
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[0]}, [TMP1, :16]
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[1]}, [TMP2, :16]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[2]}, [TMP1, :16]
+    vld1.16 {d&reg1&[3]}, [TMP2, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    vld1.32 {d&reg1&[1]}, [TMP2, :32]
+.else
+    .error "unsupported"
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32
+    pixld1_s elem_size, %(basereg+4), mem_operand
+    pixld1_s elem_size, %(basereg+5), mem_operand
+    pixld1_s elem_size, %(basereg+6), mem_operand
+    pixld1_s elem_size, %(basereg+7), mem_operand
+    pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16
+    pixld1_s elem_size, %(basereg+2), mem_operand
+    pixld1_s elem_size, %(basereg+3), mem_operand
+.elseif numbytes == 8
+    pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4
+    .if elem_size == 32
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .elseif elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 4, mem_operand
+        pixld0_s elem_size, %(basereg+0), 5, mem_operand
+        pixld0_s elem_size, %(basereg+0), 6, mem_operand
+        pixld0_s elem_size, %(basereg+0), 7, mem_operand
+    .endif
+.elseif numbytes == 2
+    .if elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .endif
+.elseif numbytes == 1
+    pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0
+    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif
+.endm
+
 .macro vuzp8 reg1, reg2
     vuzp.8 d&reg1, d&reg2
 .endm
@@ -792,7 +886,8 @@ fname:
  * A simplified variant of function generation template for a single
  * scanline processing (for implementing pixman combine functions)
  */
-.macro generate_composite_function_single_scanline fname, \
+.macro generate_composite_function_scanline        use_nearest_scaling, \
+                                                   fname, \
                                                    src_bpp_, \
                                                    mask_bpp_, \
                                                    dst_w_bpp_, \
@@ -830,23 +925,44 @@ fname:
     .set src_basereg, src_basereg_
     .set mask_basereg, mask_basereg_
 
+.if use_nearest_scaling != 0
+    /*
+     * Assign symbolic names to registers for nearest scaling
+     */
+    W           .req        r0
+    DST_W       .req        r1
+    SRC         .req        r2
+    VX          .req        r3
+    UNIT_X      .req        ip
+    MASK        .req        lr
+    TMP1        .req        r4
+    TMP2        .req        r5
+    DST_R       .req        r6
+
     .macro pixld_src x:vararg
-        pixld x
-    .endm
-    .macro fetch_src_pixblock
-        pixld_src   pixblock_size, src_bpp, \
-                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+        pixld_s x
     .endm
 
-/*
- * Assign symbolic names to registers
- */
+    ldr         UNIT_X, [sp]
+    push        {r4-r6, lr}
+    .if mask_bpp != 0
+    ldr         MASK, [sp, #(16 + 4)]
+    .endif
+.else
+    /*
+     * Assign symbolic names to registers
+     */
     W           .req        r0      /* width (is updated during processing) */
     DST_W       .req        r1      /* destination buffer pointer for writes */
     SRC         .req        r2      /* source buffer pointer */
     DST_R       .req        ip      /* destination buffer pointer for reads */
     MASK        .req        r3      /* mask pointer */
 
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+.endif
+
 .if (((flags) & FLAG_DST_READWRITE) != 0)
     .set dst_r_bpp, dst_w_bpp
 .else
@@ -858,6 +974,11 @@ fname:
     .set DEINTERLEAVE_32BPP_ENABLED, 0
 .endif
 
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+
     init
     mov         DST_R, DST_W
 
@@ -896,7 +1017,11 @@ fname:
                             process_pixblock_tail_head
 
     cleanup
-    bx         lr  /* exit */
+.if use_nearest_scaling != 0
+    pop         {r4-r6, pc}  /* exit */
+.else
+    bx          lr  /* exit */
+.endif
 8:
     /* Process the remaining trailing pixels in the scanline (dst unaligned) */
     process_trailing_pixels 0, 0, \
@@ -905,19 +1030,44 @@ fname:
                             process_pixblock_tail_head
 
     cleanup
-    bx          lr  /* exit */
 
-    .purgem     fetch_src_pixblock
-    .purgem     pixld_src
+.if use_nearest_scaling != 0
+    pop         {r4-r6, pc}  /* exit */
+
+    .unreq      DST_R
+    .unreq      SRC
+    .unreq      W
+    .unreq      VX
+    .unreq      UNIT_X
+    .unreq      TMP1
+    .unreq      TMP2
+    .unreq      DST_W
+    .unreq      MASK
+
+.else
+    bx          lr  /* exit */
 
     .unreq      SRC
     .unreq      MASK
     .unreq      DST_R
     .unreq      DST_W
     .unreq      W
+.endif
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
     .endfunc
 .endm
 
+.macro generate_composite_function_single_scanline x:vararg
+    generate_composite_function_scanline 0, x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+    generate_composite_function_scanline 1, x
+.endm
+
 /* Default prologue/epilogue, nothing special needs to be done */
 
 .macro default_init
commit 324712e48cf04df3cfcfc463fb221fcdf96e020a
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Mon Nov 1 05:10:34 2010 +0200

    ARM: NEON: source image pixel fetcher can be overridden now
    
    Added a special macro 'pixld_src' which is now responsible for fetching
    pixels from the source image. Right now it just passes all its arguments
    directly to 'pixld' macro, but it can be used in the future to provide
    a special pixel fetcher for implementing nearest scaling.
    
    The 'pixld_src' has a lot of arguments which define its behavior. But
    for each particular fast path implementation, we already know NEON
    registers allocation and how many pixels are processed in a single block.
    That's why a higher level macro 'fetch_src_pixblock' is also introduced
    (it's easier to use because it has no arguments) and used everywhere
    in 'pixman-arm-neon-asm.S' instead of VLD instructions.
    
    This patch does not introduce any functional changes and the resulting code
    in the compiled object file is exactly the same.

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 87b8045..029709b 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -253,7 +253,7 @@
     vld1.16     {d4, d5}, [DST_R, :128]!
         vqadd.u8    q9, q0, q11
     vshrn.u16   d6, q2, #8
-    vld4.8      {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     vshrn.u16   d7, q2, #3
     vsli.u16    q2, q2, #5
         vshll.u8    q14, d16, #8
@@ -295,7 +295,7 @@
     pixman_composite_over_8888_0565_process_pixblock_tail
     vst1.16     {d28, d29}, [DST_W, :128]!
     vld1.16     {d4, d5}, [DST_R, :128]!
-    vld4.32     {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     pixman_composite_over_8888_0565_process_pixblock_head
     cache_preload 8, 8
 .endm
@@ -433,7 +433,7 @@ generate_composite_function \
         vsri.u16    q14, q8, #5
                                     PF add PF_X, PF_X, #8
                                     PF tst PF_CTL, #0xF
-    vld4.8      {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
                                     PF addne PF_X, PF_X, #8
                                     PF subne PF_CTL, PF_CTL, #1
         vsri.u16    q14, q9, #11
@@ -478,7 +478,7 @@ generate_composite_function \
 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
     pixman_composite_src_0565_8888_process_pixblock_tail
     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
-    vld1.16    {d0, d1}, [SRC]!
+    fetch_src_pixblock
     pixman_composite_src_0565_8888_process_pixblock_head
     cache_preload 8, 8
 .endm
@@ -505,7 +505,7 @@ generate_composite_function \
 .endm
 
 .macro pixman_composite_add_8_8_process_pixblock_tail_head
-    vld1.8      {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
                                     PF add PF_X, PF_X, #32
                                     PF tst PF_CTL, #0xF
     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
@@ -537,7 +537,7 @@ generate_composite_function \
 /******************************************************************************/
 
 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
-    vld1.32     {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
                                     PF add PF_X, PF_X, #8
                                     PF tst PF_CTL, #0xF
     vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
@@ -613,7 +613,7 @@ generate_composite_function_single_scanline \
                                     PF cmp PF_X, ORIG_W
         vraddhn.u16 d30, q12, q10
         vraddhn.u16 d31, q13, q11
-    vld4.8      {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
     vmvn.8      d22, d3
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
@@ -667,7 +667,7 @@ generate_composite_function_single_scanline \
         vraddhn.u16 d31, q13, q11
         vqadd.u8    q14, q0, q14
         vqadd.u8    q15, q1, q15
-    vld4.8      {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
     vmvn.8      d22, d3
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
@@ -887,7 +887,7 @@ generate_composite_function \
 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
     vld1.16     {d4, d5}, [DST_R, :128]!
     pixman_composite_over_n_8_0565_process_pixblock_tail
-    vld4.8      {d8, d9, d10, d11}, [SRC]!
+    fetch_src_pixblock
     cache_preload 8, 8
     vld1.8      {d24}, [MASK]!
     pixman_composite_over_n_8_0565_process_pixblock_head
@@ -919,7 +919,7 @@ generate_composite_function \
 
 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
     vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
-    vld1.16 {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     cache_preload 16, 16
 .endm
 
@@ -1065,7 +1065,7 @@ generate_composite_function \
 
 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
     vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
-    vld1.32 {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     cache_preload 8, 8
 .endm
 
@@ -1096,7 +1096,7 @@ generate_composite_function \
 
 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
     vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
-    vld1.32 {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     vorr     q0, q0, q2
     vorr     q1, q1, q2
     cache_preload 8, 8
@@ -1395,7 +1395,7 @@ generate_composite_function \
     vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
     vld1.8      {d24, d25, d26, d27}, [MASK]!
-    vld1.8      {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     cache_preload 32, 32
     pixman_composite_add_8_8_8_process_pixblock_head
 .endm
@@ -1448,7 +1448,7 @@ generate_composite_function \
     vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
     vld4.8      {d24, d25, d26, d27}, [MASK]!
-    vld4.8      {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     cache_preload 8, 8
     pixman_composite_add_8888_8888_8888_process_pixblock_head
 .endm
@@ -1517,7 +1517,7 @@ generate_composite_function_single_scanline \
 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
     pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
-    vld4.8     {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     cache_preload 8, 8
     vld4.8     {d12, d13, d14, d15}, [MASK]!
     pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
@@ -1554,7 +1554,7 @@ generate_composite_function_single_scanline \
 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
     pixman_composite_over_8888_n_8888_process_pixblock_tail
-    vld4.8     {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     cache_preload 8, 8
     pixman_composite_over_8888_n_8888_process_pixblock_head
     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
@@ -1588,7 +1588,7 @@ generate_composite_function \
 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
     pixman_composite_over_8888_n_8888_process_pixblock_tail
-    vld4.8     {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     cache_preload 8, 8
     vld4.8     {d12, d13, d14, d15}, [MASK]!
     pixman_composite_over_8888_n_8888_process_pixblock_head
@@ -1630,7 +1630,7 @@ generate_composite_function_single_scanline \
 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
     pixman_composite_over_8888_n_8888_process_pixblock_tail
-    vld4.8     {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
     cache_preload 8, 8
     vld1.8     {d15}, [MASK]!
     pixman_composite_over_8888_n_8888_process_pixblock_head
@@ -1662,7 +1662,7 @@ generate_composite_function \
 
 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
     vst3.8 {d0, d1, d2}, [DST_W]!
-    vld3.8 {d0, d1, d2}, [SRC]!
+    fetch_src_pixblock
     cache_preload 8, 8
 .endm
 
@@ -1692,7 +1692,7 @@ generate_composite_function \
 
 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
     vst4.8 {d0, d1, d2, d3}, [DST_W]!
-    vld3.8 {d0, d1, d2}, [SRC]!
+    fetch_src_pixblock
     vswp   d0, d2
     cache_preload 8, 8
 .endm
@@ -1731,7 +1731,7 @@ generate_composite_function \
 
 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
         vshll.u8    q14, d0, #8
-    vld3.8 {d0, d1, d2}, [SRC]!
+    fetch_src_pixblock
         vsri.u16    q14, q8, #5
         vsri.u16    q14, q9, #11
     vshll.u8    q8, d1, #8
@@ -1777,7 +1777,7 @@ generate_composite_function \
         vswp        d3, d31
         vrshr.u16   q12, q9, #8
         vrshr.u16   q13, q10, #8
-    vld4.8 {d0, d1, d2, d3}, [SRC]!
+    fetch_src_pixblock
         vraddhn.u16 d30, q11, q8
                                     PF add PF_X, PF_X, #8
                                     PF tst PF_CTL, #0xF
@@ -1851,7 +1851,7 @@ generate_composite_function \
 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
     vld1.8     {d15}, [MASK]!
     pixman_composite_over_0565_8_0565_process_pixblock_tail
-    vld1.16    {d8, d9}, [SRC]!
+    fetch_src_pixblock
     vld1.16    {d10, d11}, [DST_R, :128]!
     cache_preload 8, 8
     pixman_composite_over_0565_8_0565_process_pixblock_head
@@ -1903,7 +1903,7 @@ generate_composite_function \
 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
     vld1.8     {d15}, [MASK]!
     pixman_composite_add_0565_8_0565_process_pixblock_tail
-    vld1.16    {d8, d9}, [SRC]!
+    fetch_src_pixblock
     vld1.16    {d10, d11}, [DST_R, :128]!
     cache_preload 8, 8
     pixman_composite_add_0565_8_0565_process_pixblock_head
@@ -1951,7 +1951,7 @@ generate_composite_function \
 
 /* TODO: expand macros and do better instructions scheduling */
 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
-    vld1.8     {d15}, [SRC]!
+    fetch_src_pixblock
     pixman_composite_out_reverse_8_0565_process_pixblock_tail
     vld1.16    {d10, d11}, [DST_R, :128]!
     cache_preload 8, 8
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index dec73d7..aa5e9bd 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -335,7 +335,7 @@ local skip1
     tst         DST_R, #lowbit
     beq         1f
 .endif
-    pixld       (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
     pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
 .if dst_r_bpp > 0
     pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
@@ -397,7 +397,7 @@ local skip1
 .if pixblock_size > chunk_size
     tst         W, #chunk_size
     beq         1f
-    pixld       chunk_size, src_bpp, src_basereg, SRC
+    pixld_src   chunk_size, src_bpp, src_basereg, SRC
     pixld       chunk_size, mask_bpp, mask_basereg, MASK
 .if dst_aligned_flag != 0
     pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
@@ -531,6 +531,13 @@ fname:
     .set src_basereg, src_basereg_
     .set mask_basereg, mask_basereg_
 
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
 /*
  * Assign symbolic names to registers
  */
@@ -696,8 +703,7 @@ fname:
     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
     pixld_a     pixblock_size, dst_r_bpp, \
                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
-    pixld       pixblock_size, src_bpp, \
-                (src_basereg - pixblock_size * src_bpp / 64), SRC
+    fetch_src_pixblock
     pixld       pixblock_size, mask_bpp, \
                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
     PF add      PF_X, PF_X, #pixblock_size
@@ -739,8 +745,7 @@ fname:
     beq         1f
     pixld       pixblock_size, dst_r_bpp, \
                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
-    pixld       pixblock_size, src_bpp, \
-                (src_basereg - pixblock_size * src_bpp / 64), SRC
+    fetch_src_pixblock
     pixld       pixblock_size, mask_bpp, \
                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
     process_pixblock_head
@@ -761,6 +766,9 @@ fname:
     cleanup
     pop         {r4-r12, pc}  /* exit */
 
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
     .unreq      SRC
     .unreq      MASK
     .unreq      DST_R
@@ -821,6 +829,15 @@ fname:
     .set dst_r_basereg, dst_r_basereg_
     .set src_basereg, src_basereg_
     .set mask_basereg, mask_basereg_
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+
 /*
  * Assign symbolic names to registers
  */
@@ -857,8 +874,7 @@ fname:
     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
     pixld_a     pixblock_size, dst_r_bpp, \
                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
-    pixld       pixblock_size, src_bpp, \
-                (src_basereg - pixblock_size * src_bpp / 64), SRC
+    fetch_src_pixblock
     pixld       pixblock_size, mask_bpp, \
                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
     process_pixblock_head
@@ -891,6 +907,9 @@ fname:
     cleanup
     bx          lr  /* exit */
 
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
     .unreq      SRC
     .unreq      MASK
     .unreq      DST_R
commit cb3f1830257a56f56abf7d50a8b34e215c616aec
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Tue Nov 2 22:53:55 2010 +0200

    ARM: fix 'vld1.8'->'vld1.32' typo in add_8888_8888 NEON fast path
    
    This was mostly harmless and had no effect on little endian systems.
    But a wrong vector element size is at least inconsistent and can also
    theoretically cause problems on big endian ARM systems.

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index e4db5cd..87b8045 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -537,13 +537,13 @@ generate_composite_function \
 /******************************************************************************/
 
 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
-    vld1.8      {d0, d1, d2, d3}, [SRC]!
+    vld1.32     {d0, d1, d2, d3}, [SRC]!
                                     PF add PF_X, PF_X, #8
                                     PF tst PF_CTL, #0xF
-    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
                                     PF addne PF_X, PF_X, #8
                                     PF subne PF_CTL, PF_CTL, #1
-        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
                                     PF cmp PF_X, ORIG_W
                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]


More information about the xorg-commit mailing list