pixman: Branch 'master' - 10 commits

Siarhei Siamashka siamashka at kemper.freedesktop.org
Sat Mar 12 11:47:09 PST 2011


 pixman/pixman-arm-common.h   |   45 ++++++
 pixman/pixman-arm-neon-asm.S |  292 ++++++++++++++++++++++++++++++-------------
 pixman/pixman-arm-neon-asm.h |   17 ++
 pixman/pixman-arm-neon.c     |   56 ++------
 pixman/pixman-arm-simd-asm.S |   66 +++++++--
 pixman/pixman-arm-simd.c     |    9 +
 6 files changed, 348 insertions(+), 137 deletions(-)

New commits:
commit 70a923882ca24664344ba91a649e7aa12c3063f7
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Wed Mar 9 13:55:48 2011 +0200

    ARM: a bit faster NEON bilinear scaling for r5g6b5 source images
    
    Instruction scheduling was improved in the code responsible for fetching r5g6b5
    pixels and converting them to the intermediate x8r8g8b8 color format used in
    the interpolation part of the code. A lot of NEON stalls still remain; these
    can be resolved later by the use of pipelining.
    
    Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s
              op=1, src=10020565, dst=20020888, speed=36.82 MPix/s
      after:  op=1, src=10020565, dst=10020565, speed=41.35 MPix/s
              op=1, src=10020565, dst=20020888, speed=49.16 MPix/s

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 2b6875b..71b30ac 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2430,6 +2430,101 @@ fname:
     convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
 .endm
 
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+    bilinear_load_8888 reg1, reg2, tmp1
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    bilinear_load_8888 reg3, reg4, tmp2
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP2, asl #1
+    add       TMP2, BOTTOM, TMP2, asl #1
+    add       TMP3, TOP, TMP4, asl #1
+    add       TMP4, BOTTOM, TMP4, asl #1
+    vld1.32   {acc2lo[0]}, [TMP1]
+    vld1.32   {acc2hi[0]}, [TMP3]
+    vld1.32   {acc2lo[1]}, [TMP2]
+    vld1.32   {acc2hi[1]}, [TMP4]
+    convert_0565_to_x888 acc2, reg3, reg2, reg1
+    vzip.u8   reg1, reg3
+    vzip.u8   reg2, reg4
+    vzip.u8   reg3, reg4
+    vzip.u8   reg1, reg2
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP2, asl #1
+    add       TMP2, BOTTOM, TMP2, asl #1
+    add       TMP3, TOP, TMP4, asl #1
+    add       TMP4, BOTTOM, TMP4, asl #1
+    vld1.32   {xacc2lo[0]}, [TMP1]
+    vld1.32   {xacc2hi[0]}, [TMP3]
+    vld1.32   {xacc2lo[1]}, [TMP2]
+    vld1.32   {xacc2hi[1]}, [TMP4]
+    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP2, asl #1
+    add       TMP2, BOTTOM, TMP2, asl #1
+    add       TMP3, TOP, TMP4, asl #1
+    add       TMP4, BOTTOM, TMP4, asl #1
+    vld1.32   {yacc2lo[0]}, [TMP1]
+    vzip.u8   xreg1, xreg3
+    vld1.32   {yacc2hi[0]}, [TMP3]
+    vzip.u8   xreg2, xreg4
+    vld1.32   {yacc2lo[1]}, [TMP2]
+    vzip.u8   xreg3, xreg4
+    vld1.32   {yacc2hi[1]}, [TMP4]
+    vzip.u8   xreg1, xreg2
+    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+    vmull.u8  xacc1, xreg1, d28
+    vzip.u8   yreg1, yreg3
+    vmlal.u8  xacc1, xreg2, d29
+    vzip.u8   yreg2, yreg4
+    vmull.u8  xacc2, xreg3, d28
+    vzip.u8   yreg3, yreg4
+    vmlal.u8  xacc2, xreg4, d29
+    vzip.u8   yreg1, yreg2
+    vmull.u8  yacc1, yreg1, d28
+    vmlal.u8  yacc1, yreg2, d29
+    vmull.u8  yacc2, yreg3, d28
+    vmlal.u8  yacc2, yreg4, d29
+.endm
+
 .macro bilinear_store_8888 numpix, tmp1, tmp2
 .if numpix == 4
     vst1.32   {d0, d1}, [OUT]!
@@ -2477,12 +2572,8 @@ fname:
 .endm
 
 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
-    bilinear_load_&src_fmt d0, d1, d2
-    vmull.u8  q1, d0, d28
-    vmlal.u8  q1, d1, d29
-    bilinear_load_&src_fmt d20, d21, d22
-    vmull.u8  q11, d20, d28
-    vmlal.u8  q11, d21, d29
+    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23
     vshr.u16  q15, q12, #8
     vadd.u16  q12, q12, q13
     vshll.u16 q0, d2, #8
@@ -2498,18 +2589,9 @@ fname:
 .endm
 
 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
-    bilinear_load_&src_fmt d0, d1, d2
-    vmull.u8  q1, d0, d28
-    vmlal.u8  q1, d1, d29
-    bilinear_load_&src_fmt d20, d21, d22
-    vmull.u8  q11, d20, d28
-    vmlal.u8  q11, d21, d29
-    bilinear_load_&src_fmt d4, d5, d6
-    vmull.u8  q3, d4, d28
-    vmlal.u8  q3, d5, d29
-    bilinear_load_&src_fmt d16, d17, d18
-    vmull.u8  q9, d16, d28
-    vmlal.u8  q9, d17, d29
+    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23 \
+                q3, q9,  d4, d5, d16, d17, d18, d19
     pld       [TMP1, PF_OFFS]
     vshr.u16  q15, q12, #8
     vadd.u16  q12, q12, q13
commit fe99673719091d4a880d031add1369332a75731b
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Wed Mar 9 13:27:41 2011 +0200

    ARM: NEON optimization for bilinear scaled 'src_0565_0565'
    
    Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=10020565, dst=10020565, speed=3.30 MPix/s
      after:  op=1, src=10020565, dst=10020565, speed=32.29 MPix/s

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 9245db9..2b6875b 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2631,3 +2631,6 @@ generate_bilinear_scanline_func \
 
 generate_bilinear_scanline_func \
     pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 18e26eb..0a10ca1 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -133,6 +133,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
                                          uint32_t, uint16_t)
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
                                          uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
+                                         uint16_t, uint16_t)
 
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
@@ -358,6 +360,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
 
     SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
 
     { PIXMAN_OP_NONE },
 };
commit 29003c3befe2159396d181ef9ac1caaadcabf382
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Wed Mar 9 13:21:53 2011 +0200

    ARM: NEON optimization for bilinear scaled 'src_0565_x888'
    
    Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=10020565, dst=20020888, speed=3.39 MPix/s
      after:  op=1, src=10020565, dst=20020888, speed=36.82 MPix/s

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index f0b42ca..9245db9 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2628,3 +2628,6 @@ generate_bilinear_scanline_func \
 
 generate_bilinear_scanline_func \
     pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index ba6de66..18e26eb 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -131,6 +131,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
                                          uint32_t, uint32_t)
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
                                          uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
+                                         uint16_t, uint32_t)
 
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
@@ -355,6 +357,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
 
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
+
     { PIXMAN_OP_NONE },
 };
 
commit 2ee27e7d79637da9173ee1bf3423e5a81534ccb4
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Wed Mar 9 11:53:04 2011 +0200

    ARM: NEON optimization for bilinear scaled 'src_8888_0565'
    
    Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=20028888, dst=10020565, speed=6.56 MPix/s
      after:  op=1, src=20028888, dst=10020565, speed=61.65 MPix/s

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 52dc444..f0b42ca 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2625,3 +2625,6 @@ pixman_asm_function fname
 
 generate_bilinear_scanline_func \
     pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 98ad5f2..ba6de66 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -129,6 +129,8 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
 
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
                                          uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
+                                         uint32_t, uint16_t)
 
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
@@ -350,6 +352,9 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
 
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+
     { PIXMAN_OP_NONE },
 };
 
commit 11a0c5badbc59ce967707ef836313cc98f8aec4e
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Wed Mar 9 11:46:48 2011 +0200

    ARM: use common macro template for bilinear scaled 'src_8888_8888'
    
    This is a cleanup of old and now duplicated code. The performance improvement
    comes mostly from the newly enabled use of software prefetch, but instruction
    scheduling is also slightly better.
    
    Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=20028888, dst=20028888, speed=53.24 MPix/s
      after:  op=1, src=20028888, dst=20028888, speed=74.36 MPix/s

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index f3784f5..52dc444 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2405,194 +2405,6 @@ generate_composite_function_nearest_scanline \
 fname:
 .endm
 
-.macro bilinear_interpolate_last_pixel
-    mov       TMP1, X, asr #16
-    mov       TMP2, X, asr #16
-    add       TMP1, TOP, TMP1, asl #2
-    add       TMP2, BOTTOM, TMP2, asl #2
-    vld1.32   {d0}, [TMP1]
-    vshr.u16  d30, d24, #8
-    vld1.32   {d1}, [TMP2]
-    vmull.u8  q1, d0, d28
-    vmlal.u8  q1, d1, d29
-    /* 5 cycles bubble */
-    vshll.u16 q0, d2, #8
-    vmlsl.u16 q0, d2, d30
-    vmlal.u16 q0, d3, d30
-    /* 5 cycles bubble */
-    vshrn.u32 d0, q0, #16
-    /* 3 cycles bubble */
-    vmovn.u16 d0, q0
-    /* 1 cycle bubble */
-    vst1.32   {d0[0]}, [OUT, :32]!
-.endm
-
-.macro bilinear_interpolate_two_pixels
-    mov       TMP1, X, asr #16
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    add       TMP2, BOTTOM, TMP2, asl #2
-    vld1.32   {d0}, [TMP1]
-    vld1.32   {d1}, [TMP2]
-    vmull.u8  q1, d0, d28
-    vmlal.u8  q1, d1, d29
-    mov       TMP1, X, asr #16
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    add       TMP2, BOTTOM, TMP2, asl #2
-    vld1.32   {d20}, [TMP1]
-    vld1.32   {d21}, [TMP2]
-    vmull.u8  q11, d20, d28
-    vmlal.u8  q11, d21, d29
-    vshr.u16  q15, q12, #8
-    vadd.u16  q12, q12, q13
-    vshll.u16 q0, d2, #8
-    vmlsl.u16 q0, d2, d30
-    vmlal.u16 q0, d3, d30
-    vshll.u16 q10, d22, #8
-    vmlsl.u16 q10, d22, d31
-    vmlal.u16 q10, d23, d31
-    vshrn.u32 d30, q0, #16
-    vshrn.u32 d31, q10, #16
-    vmovn.u16 d0, q15
-    vst1.32   {d0}, [OUT]!
-.endm
-
-.macro bilinear_interpolate_four_pixels
-    mov       TMP1, X, asr #16
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    add       TMP2, BOTTOM, TMP2, asl #2
-    vld1.32   {d0}, [TMP1]
-    vld1.32   {d1}, [TMP2]
-    vmull.u8  q1, d0, d28
-    vmlal.u8  q1, d1, d29
-    mov       TMP1, X, asr #16
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    add       TMP2, BOTTOM, TMP2, asl #2
-    vld1.32   {d20}, [TMP1]
-    vld1.32   {d21}, [TMP2]
-    vmull.u8  q11, d20, d28
-    vmlal.u8  q11, d21, d29
-    vshr.u16  q15, q12, #8
-    vadd.u16  q12, q12, q13
-    vshll.u16 q0, d2, #8
-    vmlsl.u16 q0, d2, d30
-    vmlal.u16 q0, d3, d30
-    vshll.u16 q10, d22, #8
-    vmlsl.u16 q10, d22, d31
-    vmlal.u16 q10, d23, d31
-    mov       TMP1, X, asr #16
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    add       TMP2, BOTTOM, TMP2, asl #2
-    vld1.32   {d4}, [TMP1]
-    vld1.32   {d5}, [TMP2]
-    vmull.u8  q3, d4, d28
-    vmlal.u8  q3, d5, d29
-    mov       TMP1, X, asr #16
-    mov       TMP2, X, asr #16
-    add       X, X, UX
-    add       TMP1, TOP, TMP1, asl #2
-    add       TMP2, BOTTOM, TMP2, asl #2
-    vld1.32   {d16}, [TMP1]
-    vld1.32   {d17}, [TMP2]
-    vmull.u8  q9, d16, d28
-    vmlal.u8  q9, d17, d29
-    vshr.u16  q15, q12, #8
-    vadd.u16  q12, q12, q13
-    vshll.u16 q2, d6, #8
-    vmlsl.u16 q2, d6, d30
-    vmlal.u16 q2, d7, d30
-    vshll.u16 q8, d18, #8
-    vmlsl.u16 q8, d18, d31
-    vmlal.u16 q8, d19, d31
-    vshrn.u32 d0, q0, #16
-    vshrn.u32 d1, q10, #16
-    vshrn.u32 d4, q2, #16
-    vshrn.u32 d5, q8, #16
-    vmovn.u16 d0, q0
-    vmovn.u16 d1, q2
-    vst1.32   {d0, d1}, [OUT]!
-.endm
-
-
-/*
- * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t *       out,
- *                                                const uint32_t * top,
- *                                                const uint32_t * bottom,
- *                                                int              wt,
- *                                                int              wb,
- *                                                pixman_fixed_t   x,
- *                                                pixman_fixed_t   ux,
- *                                                int              width)
- */
-
-pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
-    OUT       .req      r0
-    TOP       .req      r1
-    BOTTOM    .req      r2
-    WT        .req      r3
-    WB        .req      r4
-    X         .req      r5
-    UX        .req      r6
-    WIDTH     .req      ip
-    TMP1      .req      r3
-    TMP2      .req      r4
-
-    mov       ip, sp
-    push      {r4, r5, r6, r7}
-    ldmia     ip, {WB, X, UX, WIDTH}
-
-    cmp       WIDTH, #0
-    ble       3f
-    vdup.u16  q12, X
-    vdup.u16  q13, UX
-    vdup.u8   d28, WT
-    vdup.u8   d29, WB
-    vadd.u16  d25, d25, d26
-    vadd.u16  q13, q13, q13
-
-    subs      WIDTH, WIDTH, #4
-    blt       1f
-0:
-    bilinear_interpolate_four_pixels
-    subs      WIDTH, WIDTH, #4
-    bge       0b
-1:
-    tst       WIDTH, #2
-    beq       2f
-    bilinear_interpolate_two_pixels
-2:
-    tst       WIDTH, #1
-    beq       3f
-    bilinear_interpolate_last_pixel
-3:
-    pop       {r4, r5, r6, r7}
-    bx        lr
-
-    .unreq    OUT
-    .unreq    TOP
-    .unreq    BOTTOM
-    .unreq    WT
-    .unreq    WB
-    .unreq    X
-    .unreq    UX
-    .unreq    WIDTH
-    .unreq    TMP1
-    .unreq    TMP2
-.endfunc
-
-.purgem bilinear_interpolate_last_pixel
-.purgem bilinear_interpolate_two_pixels
-.purgem bilinear_interpolate_four_pixels
-
 /*
  * Bilinear scaling support code which tries to provide pixel fetching, color
  * format conversion, and interpolation as separate macros which can be used
@@ -2810,3 +2622,6 @@ pixman_asm_function fname
 .endfunc
 
 .endm
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28
commit 34098dba6763afd3636a14f9c2a079ab08f23b2d
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Wed Mar 9 11:34:15 2011 +0200

    ARM: NEON: common macro template for bilinear scanline scalers
    
    This makes it possible to generate bilinear scanline scaling functions targeting
    various source and destination color formats. Right now the a8r8g8b8/x8r8g8b8
    and r5g6b5 color formats are supported. More formats can be added if needed.

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index c168e10..f3784f5 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2588,3 +2588,225 @@ pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
     .unreq    TMP1
     .unreq    TMP2
 .endfunc
+
+.purgem bilinear_interpolate_last_pixel
+.purgem bilinear_interpolate_two_pixels
+.purgem bilinear_interpolate_four_pixels
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP2, asl #2
+    add       TMP2, BOTTOM, TMP2, asl #2
+    vld1.32   {reg1}, [TMP1]
+    vld1.32   {reg2}, [TMP2]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP2, asl #1
+    add       TMP2, BOTTOM, TMP2, asl #1
+    vld1.32   {reg2[0]}, [TMP1]
+    vld1.32   {reg2[1]}, [TMP2]
+    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+    vst1.32   {d0, d1}, [OUT]!
+.elseif numpix == 2
+    vst1.32   {d0}, [OUT]!
+.elseif numpix == 1
+    vst1.32   {d0[0]}, [OUT, :32]!
+.else
+    .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+    vuzp.u8 d0, d1
+    vuzp.u8 d2, d3
+    vuzp.u8 d1, d3
+    vuzp.u8 d0, d2
+    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+    vst1.16   {d2}, [OUT]!
+.elseif numpix == 2
+    vst1.32   {d2[0]}, [OUT]!
+.elseif numpix == 1
+    vst1.16   {d2[0]}, [OUT]!
+.else
+    .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    bilinear_load_&src_fmt d0, d1, d2
+    vmull.u8  q1, d0, d28
+    vmlal.u8  q1, d1, d29
+    vshr.u16  d30, d24, #8
+    /* 4 cycles bubble */
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    /* 5 cycles bubble */
+    vshrn.u32 d0, q0, #16
+    /* 3 cycles bubble */
+    vmovn.u16 d0, q0
+    /* 1 cycle bubble */
+    bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    bilinear_load_&src_fmt d0, d1, d2
+    vmull.u8  q1, d0, d28
+    vmlal.u8  q1, d1, d29
+    bilinear_load_&src_fmt d20, d21, d22
+    vmull.u8  q11, d20, d28
+    vmlal.u8  q11, d21, d29
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshrn.u32 d30, q0, #16
+    vshrn.u32 d31, q10, #16
+    vmovn.u16 d0, q15
+    bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_load_&src_fmt d0, d1, d2
+    vmull.u8  q1, d0, d28
+    vmlal.u8  q1, d1, d29
+    bilinear_load_&src_fmt d20, d21, d22
+    vmull.u8  q11, d20, d28
+    vmlal.u8  q11, d21, d29
+    bilinear_load_&src_fmt d4, d5, d6
+    vmull.u8  q3, d4, d28
+    vmlal.u8  q3, d5, d29
+    bilinear_load_&src_fmt d16, d17, d18
+    vmull.u8  q9, d16, d28
+    vmlal.u8  q9, d17, d29
+    pld       [TMP1, PF_OFFS]
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshr.u16  q15, q12, #8
+    vshll.u16 q2, d6, #8
+    vmlsl.u16 q2, d6, d30
+    vmlal.u16 q2, d7, d30
+    vshll.u16 q8, d18, #8
+    pld       [TMP2, PF_OFFS]
+    vmlsl.u16 q8, d18, d31
+    vmlal.u16 q8, d19, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q10, #16
+    vshrn.u32 d4, q2, #16
+    vshrn.u32 d5, q8, #16
+    vmovn.u16 d0, q0
+    vmovn.u16 d1, q2
+    bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions.
+ *
+ * TODO: use software pipelining and aligned writes to the destination buffer
+ *       in order to improve performance
+ *
+ * Bilinear scanline scaler macro template uses the following arguments:
+ *  fname             - name of the function to generate
+ *  src_fmt           - source color format (8888 or 0565)
+ *  dst_fmt           - destination color format (8888 or 0565)
+ *  bpp_shift         - (1 << bpp_shift) is the size of source pixel in bytes
+ *  prefetch_distance - prefetch in the source image by that many
+ *                      pixels ahead
+ */
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+                                       bpp_shift, prefetch_distance
+
+pixman_asm_function fname
+    OUT       .req      r0
+    TOP       .req      r1
+    BOTTOM    .req      r2
+    WT        .req      r3
+    WB        .req      r4
+    X         .req      r5
+    UX        .req      r6
+    WIDTH     .req      ip
+    TMP1      .req      r3
+    TMP2      .req      r4
+    PF_OFFS   .req      r7
+    TMP3      .req      r8
+    TMP4      .req      r9
+
+    mov       ip, sp
+    push      {r4, r5, r6, r7, r8, r9}
+    mov       PF_OFFS, #prefetch_distance
+    ldmia     ip, {WB, X, UX, WIDTH}
+    mul       PF_OFFS, PF_OFFS, UX
+
+    cmp       WIDTH, #0
+    ble       3f
+
+    vdup.u16  q12, X
+    vdup.u16  q13, UX
+    vdup.u8   d28, WT
+    vdup.u8   d29, WB
+    vadd.u16  d25, d25, d26
+    vadd.u16  q13, q13, q13
+
+    subs      WIDTH, WIDTH, #4
+    blt       1f
+    mov       PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
+0:
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    bge       0b
+1:
+    tst       WIDTH, #2
+    beq       2f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+2:
+    tst       WIDTH, #1
+    beq       3f
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+3:
+    pop       {r4, r5, r6, r7, r8, r9}
+    bx        lr
+
+    .unreq    OUT
+    .unreq    TOP
+    .unreq    BOTTOM
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+    .unreq    PF_OFFS
+    .unreq    TMP3
+    .unreq    TMP4
+.endfunc
+
+.endm
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index 24fa361..97adc6a 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -1158,3 +1158,20 @@ fname:
     vsri.u16    out, tmp1, #5
     vsri.u16    out, tmp2, #11
 .endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
+ * value from 'in' is lost
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+    vshl.u16    out0, in,   #5  /* G top 6 bits */
+    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
+    vsri.u16    in,   in,   #5  /* R is ready in top bits */
+    vsri.u16    out0, out0, #6  /* G is ready in top bits */
+    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
+    vshr.u16    out1, in,   #8  /* R is in place */
+    vsri.u16    out0, tmp,  #8  /* G & B is in place */
+    vzip.u16    out0, out1      /* everything is in place */
+.endm
commit 66f4ee1b3bccf4516433d61dbf2035551a712fa2
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Wed Mar 9 10:59:46 2011 +0200

    ARM: new bilinear fast path template macro in 'pixman-arm-common.h'
    
    It can be reused in different ARM NEON bilinear scaling fast path functions.

diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 9b1322b..c3bf986 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -361,4 +361,49 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \
     SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
     SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
 
+/*****************************************************************************/
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op,     \
+                                                src_type, dst_type)           \
+void                                                                          \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
+                                                dst_type *       dst,         \
+                                                const src_type * top,         \
+                                                const src_type * bottom,      \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   x,           \
+                                                pixman_fixed_t   ux,          \
+                                                int              width);      \
+                                                                              \
+static force_inline void                                                      \
+scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
+                                                dst_type *       dst,         \
+                                                const uint32_t * mask,        \
+                                                const src_type * src_top,     \
+                                                const src_type * src_bottom,  \
+                                                int32_t          w,           \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   vx,          \
+                                                pixman_fixed_t   unit_x,      \
+                                                pixman_fixed_t   max_vx,      \
+                                                pixman_bool_t    zero_src)    \
+{                                                                             \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
+	return;                                                               \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
+                            dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+                                                                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, COVER, FALSE, FALSE)     \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, NONE, FALSE, FALSE)      \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, PAD, FALSE, FALSE)
+
 #endif
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index c7c0254..98ad5f2 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -127,6 +127,9 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565,
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
                                            OVER, uint16_t, uint16_t)
 
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
+                                         uint32_t, uint32_t)
+
 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
                                    int32_t   h,
@@ -232,47 +235,6 @@ pixman_blt_neon (uint32_t *src_bits,
     }
 }
 
-void
-pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t *       out,
-                                                        const uint32_t * top,
-                                                        const uint32_t * bottom,
-                                                        int              wt,
-                                                        int              wb,
-                                                        pixman_fixed_t   x,
-                                                        pixman_fixed_t   ux,
-                                                        int              width);
-
-static force_inline void
-scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t *       dst,
-					     const uint32_t * mask,
-					     const uint32_t * src_top,
-					     const uint32_t * src_bottom,
-					     int32_t          w,
-					     int              wt,
-					     int              wb,
-					     pixman_fixed_t   vx,
-					     pixman_fixed_t   unit_x,
-					     pixman_fixed_t   max_vx,
-					     pixman_bool_t    zero_src)
-{
-    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top,
-                                                            src_bottom, wt, wb,
-                                                            vx, unit_x, w);
-}
-
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC,
-			       scaled_bilinear_scanline_neon_8888_8888_SRC,
-			       uint32_t, uint32_t, uint32_t,
-			       COVER, FALSE, FALSE)
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC,
-			       scaled_bilinear_scanline_neon_8888_8888_SRC,
-			       uint32_t, uint32_t, uint32_t,
-			       PAD, FALSE, FALSE)
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC,
-			       scaled_bilinear_scanline_neon_8888_8888_SRC,
-			       uint32_t, uint32_t, uint32_t,
-			       NONE, FALSE, FALSE)
-
 static const pixman_fast_path_t arm_neon_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
commit 5921c17639fe5fdc595c850e3347281c1c8746ba
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Sun Mar 6 22:16:32 2011 +0200

    ARM: assembly optimized nearest scaled 'src_8888_8888'
    
    Benchmark on ARM Cortex-A8 r1p3 @500MHz, 32-bit LPDDR @166MHz:
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=20028888, dst=20028888, speed=44.36 MPix/s
      after:  op=1, src=20028888, dst=20028888, speed=39.79 MPix/s
    
    Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=20028888, dst=20028888, speed=102.36 MPix/s
      after:  op=1, src=20028888, dst=20028888, speed=163.12 MPix/s

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a9775e2..858c690 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -433,3 +433,6 @@ pixman_asm_function fname
 
 generate_nearest_scanline_func \
     pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 6bbc109..a66f8df 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -389,6 +389,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
 
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
                                         uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
+                                        uint32_t, uint32_t)
 
 static const pixman_fast_path_t arm_simd_fast_paths[] =
 {
@@ -411,6 +413,13 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
 
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+
     { PIXMAN_OP_NONE },
 };
 
commit f3e17872f5522e25da8e32de83e62bee8cc198d7
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Mon Mar 7 03:10:43 2011 +0200

    ARM: common macro for nearest scaling fast paths
    
    The code of nearest scaled 'src_0565_0565' function was generalized
    and moved to a common macro, so that it can be reused for other
    fast paths.

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index dd1366d..a9775e2 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -331,15 +331,29 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
 .endfunc
 
 /*
- * Note: This function is only using armv4t instructions (not even armv6),
+ * Note: This code is only using armv5te instructions (not even armv6),
  *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
  *       be split into a few variants, tuned for each microarchitecture.
  *
  * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
  * have efficient write combining), it needs to be changed to use 16-byte
  * aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ *  fname                     - name of the function to generate
+ *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes
+ *  t                         - type suffix for LDR/STR instructions
+ *  prefetch_distance         - prefetch in the source image by that many
+ *                              pixels ahead
+ *  prefetch_braking_distance - stop prefetching when that many pixels are
+ *                              remaining before the end of scanline
  */
-pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t,      \
+                                      prefetch_distance,        \
+                                      prefetch_braking_distance
+
+pixman_asm_function fname
 	W	.req	r0
 	DST	.req	r1
 	SRC	.req	r2
@@ -352,35 +366,29 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
 
 	ldr	UNIT_X, [sp]
 	push	{r4, r5, r6, r7}
-	mvn	VXMASK, #1
+	mvn	VXMASK, #((1 << bpp_shift) - 1)
 
 	/* define helper macro */
 	.macro	scale_2_pixels
-		ldrh	TMP1, [SRC, TMP1]
-		and	TMP2, VXMASK, VX, lsr #15
+		ldr&t	TMP1, [SRC, TMP1]
+		and	TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
 		add	VX, VX, UNIT_X
-		strh	TMP1, [DST], #2
+		str&t	TMP1, [DST], #(1 << bpp_shift)
 
-		ldrh	TMP2, [SRC, TMP2]
-		and	TMP1, VXMASK, VX, lsr #15
+		ldr&t	TMP2, [SRC, TMP2]
+		and	TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
 		add	VX, VX, UNIT_X
-		strh	TMP2, [DST], #2
+		str&t	TMP2, [DST], #(1 << bpp_shift)
 	.endm
 
-	/*
-	 * stop prefetch before reaching the end of scanline (a good behaving
-	 * value selected based on some benchmarks with short scanlines)
-	 */
-	#define PREFETCH_BRAKING_DISTANCE 32
-
 	/* now do the scaling */
-	and	TMP1, VXMASK, VX, lsr #15
+	and	TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
 	add	VX, VX, UNIT_X
-	subs	W, #(8 + PREFETCH_BRAKING_DISTANCE)
+	subs	W, W, #(8 + prefetch_braking_distance)
 	blt	2f
-	/* set prefetch distance to 80 pixels ahead */
-	add	PF_OFFS, VX, UNIT_X, lsl #6
-	add	PF_OFFS, PF_OFFS, UNIT_X, lsl #4
+	/* calculate prefetch offset */
+	mov	PF_OFFS, #prefetch_distance
+	mla	PF_OFFS, UNIT_X, PF_OFFS, VX
 1:	/* main loop, process 8 pixels per iteration with prefetch */
 	subs	W, W, #8
 	add	PF_OFFS, UNIT_X, lsl #3
@@ -388,10 +396,10 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
 	scale_2_pixels
 	scale_2_pixels
 	scale_2_pixels
-	pld	[SRC, PF_OFFS, lsr #15]
+	pld	[SRC, PF_OFFS, lsr #(16 - bpp_shift)]
 	bge	1b
 2:
-	subs	W, #(4 - 8 - PREFETCH_BRAKING_DISTANCE)
+	subs	W, W, #(4 - 8 - prefetch_braking_distance)
 	blt	2f
 1:	/* process the remaining pixels */
 	scale_2_pixels
@@ -404,8 +412,8 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
 	scale_2_pixels
 2:
 	tst	W, #1
-	ldrneh	TMP1, [SRC, TMP1]
-	strneh	TMP1, [DST], #2
+	ldrne&t	TMP1, [SRC, TMP1]
+	strne&t	TMP1, [DST]
 	/* cleanup helper macro */
 	.purgem	scale_2_pixels
 	.unreq	DST
@@ -421,3 +429,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
 	pop	{r4, r5, r6, r7}
 	bx	lr
 .endfunc
+.endm
+
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
commit bb3d1b67fd0f42ae00af811c624ea1c44541034d
Author: Siarhei Siamashka <siarhei.siamashka at nokia.com>
Date:   Sun Mar 6 16:17:12 2011 +0200

    ARM: use prefetch in nearest scaled 'src_0565_0565'
    
    Benchmark on ARM Cortex-A8 r1p3 @500MHz, 32-bit LPDDR @166MHz:
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=10020565, dst=10020565, speed=75.02 MPix/s
      after:  op=1, src=10020565, dst=10020565, speed=73.63 MPix/s
    
    Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
     Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
      before: op=1, src=10020565, dst=10020565, speed=176.12 MPix/s
      after:  op=1, src=10020565, dst=10020565, speed=267.50 MPix/s

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 7567700..dd1366d 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -348,6 +348,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
 	TMP1	.req	r4
 	TMP2	.req	r5
 	VXMASK	.req	r6
+	PF_OFFS	.req	r7
 
 	ldr	UNIT_X, [sp]
 	push	{r4, r5, r6, r7}
@@ -366,12 +367,33 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
 		strh	TMP2, [DST], #2
 	.endm
 
+	/*
+	 * stop prefetch before reaching the end of scanline (a good behaving
+	 * value selected based on some benchmarks with short scanlines)
+	 */
+	#define PREFETCH_BRAKING_DISTANCE 32
+
 	/* now do the scaling */
 	and	TMP1, VXMASK, VX, lsr #15
 	add	VX, VX, UNIT_X
-	subs	W, #4
+	subs	W, #(8 + PREFETCH_BRAKING_DISTANCE)
+	blt	2f
+	/* set prefetch distance to 80 pixels ahead */
+	add	PF_OFFS, VX, UNIT_X, lsl #6
+	add	PF_OFFS, PF_OFFS, UNIT_X, lsl #4
+1:	/* main loop, process 8 pixels per iteration with prefetch */
+	subs	W, W, #8
+	add	PF_OFFS, UNIT_X, lsl #3
+	scale_2_pixels
+	scale_2_pixels
+	scale_2_pixels
+	scale_2_pixels
+	pld	[SRC, PF_OFFS, lsr #15]
+	bge	1b
+2:
+	subs	W, #(4 - 8 - PREFETCH_BRAKING_DISTANCE)
 	blt	2f
-1: /* main loop, process 4 pixels per iteration */
+1:	/* process the remaining pixels */
 	scale_2_pixels
 	scale_2_pixels
 	subs	W, W, #4
@@ -394,6 +416,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
 	.unreq	TMP1
 	.unreq	TMP2
 	.unreq	VXMASK
+	.unreq	PF_OFFS
 	/* return */
 	pop	{r4, r5, r6, r7}
 	bx	lr


More information about the xorg-commit mailing list