pixman: Branch 'master' - 5 commits

Mon Apr 21 18:39:18 PDT 2014

pixman/pixman-arm-neon-asm.h |   20 --
 pixman/pixman-arm-simd-asm.S |  370 +++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd-asm.h |   89 ++++++++--
 pixman/pixman-arm-simd.c     |   15 +
 4 files changed, 464 insertions(+), 30 deletions(-)

New commits:
commit 5f661ee719be25c3aa0eb0d45e0db23a37e76468
Author: Pekka Paalanen <pekka.paalanen at collabora.co.uk>
Date:   Thu Apr 10 09:41:38 2014 +0300

    ARM: use pixman_asm_function in internal headers
    
    The two ARM headers contained open-coded copies of pixman_asm_function;
    replace them with uses of pixman_asm_function itself.
    
    Since it seems customary that ARM headers do not use CPP include guards,
    rely on the .S files to #include "pixman-arm-asm.h" first. They all
    do now.
    
    v2: Fix a build failure on rpi by adding one #include.

diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index d0d92d7..bdcf6a9 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -631,14 +631,8 @@ local skip1
                                    src_basereg_   = 0, \
                                    mask_basereg_  = 24
 
-    .func fname
-    .global fname
-    /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
+    pixman_asm_function fname
+
     push        {r4-r12, lr}        /* save all registers */
 
 /*
@@ -945,14 +939,8 @@ fname:
                                                    src_basereg_   = 0, \
                                                    mask_basereg_  = 24
 
-    .func fname
-    .global fname
-    /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
-    .hidden fname
-    .type fname, %function
-#endif
-fname:
+    pixman_asm_function fname
+
     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 /*
  * Make some macro arguments globally visible and accessible
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 0baf87a..8de060a 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -596,13 +596,7 @@
                                    process_tail, \
                                    process_inner_loop
 
- .func fname
- .global fname
- /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
+    pixman_asm_function fname
 
 /*
  * Make some macro arguments globally visible and accessible
@@ -714,7 +708,6 @@
     SCRATCH     .req    r12
     ORIG_W      .req    r14 /* width (pixels) */
 
-fname:
         push    {r4-r11, lr}        /* save all registers */
 
         subs    Y, Y, #1
commit ab587b444cf9ac1ee421b9942befc509b52cb109
Author: Ben Avison <bavison at riscosopen.org>
Date:   Wed Apr 9 16:25:32 2014 +0300

    ARMv6: Add fast path for in_reverse_8888_8888
    
    Benchmark results, "before" is the patch
    * upstream/master 4b76bbfda670f9ede67d0449f3640605e1fc4df0
    + ARMv6: Support for very variable-hungry composite operations
    + ARMv6: Add fast path for over_n_8888_8888_ca
    and "after" contains the additional patches on top:
    + ARMv6: Add fast path flag to force no preload of destination buffer
    + ARMv6: Add fast path for in_reverse_8888_8888 (this patch)
    
    lowlevel-blt-bench, in_reverse_8888_8888, 100 iterations:
    
           Before          After
          Mean StdDev     Mean StdDev   Confidence   Change
    L1    21.1   0.07     32.3   0.08    100.00%     +52.9%
    L2    11.6   0.29     18.0   0.52    100.00%     +54.4%
    M     10.5   0.01     16.1   0.03    100.00%     +54.1%
    HT     8.2   0.02     12.0   0.04    100.00%     +45.9%
    VT     8.1   0.02     11.7   0.04    100.00%     +44.5%
    R      8.1   0.02     11.3   0.04    100.00%     +39.7%
    RT     4.8   0.04      6.1   0.09    100.00%     +27.3%
    
    At most 12 outliers rejected per test per set.
    
    cairo-perf-trace with trimmed traces, 30 iterations:
    
                                        Before          After
                                       Mean StdDev     Mean StdDev   Confidence   Change
    t-firefox-paintball.trace          18.0   0.01     14.1   0.01    100.00%     +27.4%
    t-firefox-chalkboard.trace         36.7   0.03     36.0   0.02    100.00%      +1.9%
    t-firefox-canvas-alpha.trace       20.7   0.22     20.3   0.22    100.00%      +1.9%
    t-swfdec-youtube.trace              7.8   0.03      7.8   0.03    100.00%      +0.9%
    t-firefox-talos-gfx.trace          25.8   0.44     25.6   0.29     93.87%      +0.7%  (insignificant)
    t-firefox-talos-svg.trace          20.6   0.04     20.6   0.03    100.00%      +0.2%
    t-firefox-fishbowl.trace           21.2   0.04     21.1   0.02    100.00%      +0.2%
    t-xfce4-terminal-a1.trace           4.8   0.01      4.8   0.01     98.85%      +0.2%  (insignificant)
    t-swfdec-giant-steps.trace         14.9   0.03     14.9   0.02     99.99%      +0.2%
    t-poppler-reseau.trace             22.4   0.11     22.4   0.08     86.52%      +0.2%  (insignificant)
    t-gnome-system-monitor.trace       17.3   0.03     17.2   0.03     99.74%      +0.2%
    t-firefox-scrolling.trace          24.8   0.12     24.8   0.11     70.15%      +0.1%  (insignificant)
    t-firefox-particles.trace          27.5   0.18     27.5   0.21     48.33%      +0.1%  (insignificant)
    t-grads-heat-map.trace              4.4   0.04      4.4   0.04     16.61%      +0.0%  (insignificant)
    t-firefox-fishtank.trace           13.2   0.01     13.2   0.01      7.64%      +0.0%  (insignificant)
    t-firefox-canvas.trace             18.0   0.05     18.0   0.05      1.31%      -0.0%  (insignificant)
    t-midori-zoomed.trace               8.0   0.01      8.0   0.01     78.22%      -0.0%  (insignificant)
    t-firefox-planet-gnome.trace       10.9   0.02     10.9   0.02     64.81%      -0.0%  (insignificant)
    t-gvim.trace                       33.2   0.21     33.2   0.18     38.61%      -0.1%  (insignificant)
    t-firefox-canvas-swscroll.trace    32.2   0.09     32.2   0.11     73.17%      -0.1%  (insignificant)
    t-firefox-asteroids.trace          11.1   0.01     11.1   0.01    100.00%      -0.2%
    t-evolution.trace                  13.0   0.05     13.0   0.05     91.99%      -0.2%  (insignificant)
    t-gnome-terminal-vim.trace         19.9   0.14     20.0   0.14     97.38%      -0.4%  (insignificant)
    t-poppler.trace                     9.8   0.06      9.8   0.04     99.91%      -0.5%
    t-chromium-tabs.trace               4.9   0.02      4.9   0.02    100.00%      -0.6%
    
    At most 6 outliers rejected per test per set.
    
    Cairo perf reports the running time, but the change is computed for
    operations per second instead (inverse of running time).
    
    Confidence is based on Welch's t-test. Absolute changes less than 1%
    can be accounted as measurement errors, even if statistically
    significant.
    
    There was a question of why FLAG_NO_PRELOAD_DST is used. It makes
    lowlevel-blt-bench results worse except for L1, but improves some
    Cairo trace benchmarks.
    
    "Ben Avison" <bavison at riscosopen.org> wrote:
    
    > The thing with the lowlevel-blt-bench benchmarks for the more
    > sophisticated composite types (as a general rule, anything that involves
    > branches at the per-pixel level) is that they are only profiling the case
    > where you have mid-level alpha values in the source/mask/destination.
    > Real-world images typically have a disproportionate number of fully
    > opaque and fully transparent pixels, which is why when there's a
    > discrepancy between which implementation performs best with cairo-perf
    > trace versus lowlevel-blt-bench, I usually favour the Cairo winner.
    >
    > The results of removing FLAG_NO_PRELOAD_DST (in other words, adding
    > preload of the destination buffer) are easy to explain in the
    > lowlevel-blt-bench results. In the L1 case, the destination buffer is
    > already in the L1 cache, so adding the preloads is simply adding extra
    > instruction cycles that have no effect on memory operations. The "in"
    > compositing operator depends upon the alpha of both source and
    > destination, so if you use uniform mid-alpha, then you actually do need
    > to read your destination pixels, so you benefit from preloading them. But
    > for fully opaque or fully transparent source pixels, you don't need to
    > read the corresponding destination pixel - it'll either be left alone or
    > overwritten. Since the ARM11 doesn't use write-allocate cacheing, both of
    > these cases avoid both the time taken to load the extra cachelines, as
    > well as increasing the efficiency of the cache for other data. If you
    > examine the source images being used by the Cairo test, you'll probably
    > find they mostly use transparent or opaque pixels.
    
    v4, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
    	Rebased, re-benchmarked on Raspberry Pi, commit message.
    
    v5, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
    	Rebased, re-benchmarked on Raspberry Pi due to a fix to
    	"ARMv6: Add fast path for over_n_8888_8888_ca" patch.

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index b6f9a39..bc02ebb 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -956,3 +956,106 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro in_reverse_8888_8888_init  /* one-time setup for the in_reverse_8888_8888 fast path */
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080          /* rounding term added by the MLAs in the 1pixel macro */
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK        /* 0x80+0x80 carries out of bytes 0 and 2 only, giving GE=0101 */
+        /* Offset the source pointer: we only need the alpha bytes */
+        add     SRC, SRC, #3               /* the head macro then steps SRC by 4 per pixel */
+        line_saved_regs  ORIG_W            /* presumably registers ORIG_W for per-line spill/restore (helper lives in pixman-arm-simd-asm.h) - confirm */
+.endm
+
+.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3  /* load the source alpha byte of each of the numbytes/4 pixels */
+        ldrb    ORIG_W, [SRC], #4          /* 1st pixel: zero-extended byte at SRC (init biased SRC by 3), then SRC += 4 */
+ .if numbytes >= 8
+        ldrb    WK&reg1, [SRC], #4         /* 2nd pixel */
+  .if numbytes == 16
+        ldrb    WK&reg2, [SRC], #4         /* 3rd pixel */
+        ldrb    WK&reg3, [SRC], #4         /* 4th pixel */
+  .endif
+ .endif
+        add     DST, DST, #numbytes        /* advance DST now; the tail macro reaches the pixels via negative offsets / DB addressing */
+.endm
+
+.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload  /* adapter: only numbytes and firstreg are forwarded */
+        in_reverse_8888_8888_head  numbytes, firstreg, %(firstreg+1), %(firstreg+2)  /* %(...) is evaluated by the assembler to give consecutive WK register numbers */
+.endm
+
+.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only  /* d = d * alpha / 255 per channel (alpha in s); is_only=0 adds shortcuts for alpha 0 and 0xFF */
+ .if is_only != 1
+        movs    s, ORIG_W                  /* s = this pixel's alpha; Z set if it is 0 */
+  .if offset != 0
+        ldrb    ORIG_W, [SRC, #offset]     /* fetch the next pixel's alpha; LDRB leaves the flags from MOVS intact */
+  .endif
+        beq     01f                        /* alpha 0: result is 0 */
+        teq     STRIDE_M, #0xFF            /* tests STRIDE_M directly; the tail macro passes STRIDE_M as s for every is_only=0 use */
+        beq     02f                        /* alpha 0xFF: d is left as loaded */
+ .endif
+        uxtb16  SCRATCH, d                 /* rb_dest */
+        uxtb16  d, d, ror #8               /* ag_dest */
+        mla     SCRATCH, SCRATCH, s, MASK  /* t = channel * alpha + 0x80 in each halfword (MASK is 0x00800080 from init) */
+        mla     d, d, s, MASK
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8  /* t += t >> 8 in each halfword; result is then in the high byte */
+        uxtab16 d, d, d, ror #8
+        mov     SCRATCH, SCRATCH, ror #8   /* bring the rb results down into bytes 0 and 2 */
+        sel     d, SCRATCH, d              /* with GE=0101: bytes 0,2 from SCRATCH (rb), bytes 1,3 from d (ag) */
+        b       02f
+ .if offset == 0
+48:     /* Last mov d,#0 of the set - used as part of shortcut for
+         * source values all 0 */
+ .endif
+01:     mov     d, #0
+02:
+.endm
+
+.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4  /* classify the alphas loaded by head, then blend and store numbytes of destination */
+ .if numbytes == 4
+        teq     ORIG_W, ORIG_W, asr #32    /* EQ if ORIG_W is 0 or -1; C = bit 31 of ORIG_W */
+        ldrne   WK&reg1, [DST, #-4]        /* destination pixel is only read when no shortcut applies */
+ .elseif numbytes == 8
+        teq     ORIG_W, WK&reg1            /* EQ only if both alphas are equal */
+        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
+        ldmnedb DST, {WK&reg1-WK&reg2}     /* overwrites the alpha copies in WK; 1pixel re-reads alphas from SRC */
+ .else
+        teq     ORIG_W, WK&reg1            /* chain of compares: stays EQ only while all four alphas match */
+        teqeq   ORIG_W, WK&reg2
+        teqeq   ORIG_W, WK&reg3
+        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
+        ldmnedb DST, {WK&reg1-WK&reg4}     /* overwrites the alpha copies in WK; 1pixel re-reads alphas from SRC */
+ .endif
+        cmnne   DST, #0   /* clear C if NE */
+        bcs     49f       /* no writes to dest if source all -1. NOTE(review): head loads with LDRB (zero-extended), so bit 31 and hence C look always clear here and this branch never taken; opaque pixels are still handled by the code below - confirm */
+        beq     48f       /* set dest to all 0 if source all 0 */
+ .if numbytes == 4
+        in_reverse_8888_8888_1pixel  ORIG_W, WK&reg1, 0, 1  /* is_only=1: alpha already in ORIG_W, no per-pixel shortcut tests */
+        str     WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -4, 0  /* offset = SRC-relative address of the NEXT pixel's alpha (SRC was advanced by head) */
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, 0, 0   /* offset 0: last pixel, nothing more to fetch; also defines label 48 */
+        stmdb   DST, {WK&reg1-WK&reg2}
+ .else
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -12, 0  /* offset = SRC-relative address of the NEXT pixel's alpha (SRC was advanced by head) */
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, -8, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg3, -4, 0
+        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg4, 0, 0    /* offset 0: last pixel, nothing more to fetch; also defines label 48 */
+        stmdb   DST, {WK&reg1-WK&reg4}
+ .endif
+49:
+.endm
+
+.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg  /* adapter: cond is ignored */
+        in_reverse_8888_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)  /* four consecutive WK register numbers starting at firstreg */
+.endm
+
+generate_composite_function /* instantiate the in_reverse_8888_8888 entry point from the macros above */ \
+    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 /* presumably src, mask, dst bits per pixel - confirm against the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
+    2, /* prefetch distance */ \
+    in_reverse_8888_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    in_reverse_8888_8888_process_head, \
+    in_reverse_8888_8888_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index dd6b907..c17ce5a 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -46,6 +46,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
 
 PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
                                  uint32_t, 1)
@@ -241,6 +243,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
 
+    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, a8r8g8b8, armv6_composite_in_reverse_8888_8888),
+    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, x8r8g8b8, armv6_composite_in_reverse_8888_8888),
+    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, a8b8g8r8, armv6_composite_in_reverse_8888_8888),
+    PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, x8b8g8r8, armv6_composite_in_reverse_8888_8888),
+
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
commit 68d2f7b486a9ccc877a2214f7f5ef562e2846581
Author: Ben Avison <bavison at riscosopen.org>
Date:   Wed Apr 9 16:25:31 2014 +0300

    ARMv6: Add fast path flag to force no preload of destination buffer

diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 1bb8b45..0baf87a 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -78,6 +78,8 @@
 .set FLAG_PROCESS_PRESERVES_SCRATCH, 64
 .set FLAG_PROCESS_PRESERVES_WK0,     0
 .set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
+.set FLAG_PRELOAD_DST,               0
+.set FLAG_NO_PRELOAD_DST,            256
 
 /*
  * Number of bytes by which to adjust preload offset of destination
@@ -445,7 +447,7 @@
         preload_middle  src_bpp, SRC, 0
         preload_middle  mask_bpp, MASK, 0
   .endif
-  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
+  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
         /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
          * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
          * preloads for, to achieve staggered prefetches for multiple channels, because there are
@@ -480,7 +482,9 @@
  .endif
         preload_trailing  src_bpp, src_bpp_shift, SRC
         preload_trailing  mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
         preload_trailing  dst_r_bpp, dst_bpp_shift, DST
+ .endif
         add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
         /* The remainder of the line is handled identically to the medium case */
         medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
@@ -779,7 +783,9 @@ fname:
         newline
         preload_leading_step1  src_bpp, WK1, SRC
         preload_leading_step1  mask_bpp, WK2, MASK
+  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
         preload_leading_step1  dst_r_bpp, WK3, DST
+  .endif
         
         ands    WK0, DST, #15
         beq     154f
@@ -787,7 +793,9 @@ fname:
 
         preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
         preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
+  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
         preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
+  .endif
 
         leading_15bytes  process_head, process_tail
         
@@ -827,7 +835,9 @@ fname:
         newline
         preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
         preload_line 0, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
         preload_line 0, dst_r_bpp, dst_bpp_shift, DST
+ .endif
         
         sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
         ands    WK0, DST, #15
@@ -856,7 +866,9 @@ fname:
         newline
         preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
         preload_line 1, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
         preload_line 1, dst_r_bpp, dst_bpp_shift, DST
+ .endif
         
  .if dst_w_bpp == 8
         tst     DST, #3
commit 4ad769cbec47ca0df43dc586f689b1968bbc942f
Author: Ben Avison <bavison at riscosopen.org>
Date:   Wed Apr 9 16:25:30 2014 +0300

    ARMv6: Add fast path for over_n_8888_8888_ca
    
    Benchmark results, "before" is
    * upstream/master 4b76bbfda670f9ede67d0449f3640605e1fc4df0
    "after" contains the additional patches on top:
    + ARMv6: Support for very variable-hungry composite operations
    + ARMv6: Add fast path for over_n_8888_8888_ca (this patch)
    
    lowlevel-blt-bench, over_n_8888_8888_ca, 100 iterations:
    
           Before          After
          Mean StdDev     Mean StdDev   Confidence   Change
    L1     2.7   0.00     16.1   0.06    100.00%    +500.7%
    L2     2.4   0.01     14.1   0.15    100.00%    +489.9%
    M      2.3   0.00     14.3   0.01    100.00%    +510.2%
    HT     2.2   0.00      9.7   0.03    100.00%    +345.0%
    VT     2.2   0.00      9.4   0.02    100.00%    +333.4%
    R      2.2   0.01      9.5   0.03    100.00%    +331.6%
    RT     1.9   0.01      5.5   0.07    100.00%    +192.7%
    
    At most 1 outliers rejected per test per set.
    
    cairo-perf-trace with trimmed traces, 30 iterations:
    
                                        Before          After
                                       Mean StdDev     Mean StdDev   Confidence   Change
    t-firefox-talos-gfx.trace          33.1   0.42     25.8   0.44    100.00%     +28.6%
    t-firefox-scrolling.trace          31.4   0.11     24.8   0.12    100.00%     +26.3%
    t-gnome-terminal-vim.trace         22.4   0.10     19.9   0.14    100.00%     +12.5%
    t-evolution.trace                  13.9   0.07     13.0   0.05    100.00%      +6.5%
    t-firefox-planet-gnome.trace       11.6   0.02     10.9   0.02    100.00%      +6.5%
    t-gvim.trace                       34.0   0.21     33.2   0.21    100.00%      +2.4%
    t-chromium-tabs.trace               4.9   0.02      4.9   0.02    100.00%      +1.0%
    t-poppler.trace                     9.8   0.05      9.8   0.06    100.00%      +0.7%
    t-firefox-canvas-swscroll.trace    32.3   0.10     32.2   0.09    100.00%      +0.4%
    t-firefox-paintball.trace          18.1   0.01     18.0   0.01    100.00%      +0.3%
    t-poppler-reseau.trace             22.5   0.09     22.4   0.11     99.29%      +0.3%
    t-firefox-canvas.trace             18.1   0.06     18.0   0.05     99.29%      +0.2%
    t-xfce4-terminal-a1.trace           4.8   0.01      4.8   0.01     99.77%      +0.2%
    t-firefox-fishbowl.trace           21.2   0.03     21.2   0.04    100.00%      +0.2%
    t-gnome-system-monitor.trace       17.3   0.03     17.3   0.03     99.54%      +0.1%
    t-firefox-asteroids.trace          11.1   0.01     11.1   0.01    100.00%      +0.1%
    t-midori-zoomed.trace               8.0   0.01      8.0   0.01     99.98%      +0.1%
    t-grads-heat-map.trace              4.4   0.04      4.4   0.04     34.08%      +0.1%  (insignificant)
    t-firefox-talos-svg.trace          20.6   0.03     20.6   0.04     54.06%      +0.0%  (insignificant)
    t-firefox-fishtank.trace           13.2   0.01     13.2   0.01     52.81%      -0.0%  (insignificant)
    t-swfdec-giant-steps.trace         14.9   0.02     14.9   0.03     85.50%      -0.1%  (insignificant)
    t-firefox-chalkboard.trace         36.6   0.02     36.7   0.03    100.00%      -0.2%
    t-firefox-canvas-alpha.trace       20.7   0.32     20.7   0.22     55.76%      -0.3%  (insignificant)
    t-swfdec-youtube.trace              7.8   0.02      7.8   0.03    100.00%      -0.5%
    t-firefox-particles.trace          27.4   0.16     27.5   0.18     99.94%      -0.6%
    
    At most 4 outliers rejected per test per set.
    
    Cairo perf reports the running time, but the change is computed for
    operations per second instead (inverse of running time).
    
    Confidence is based on Welch's t-test. Absolute changes less than 1%
    can be accounted as measurement errors, even if statistically
    significant.
    
    v4, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
    	Use pixman_asm_function instead of startfunc.
    	Rebased. Re-benchmarked on Raspberry Pi.
    	Commit message.
    
    v5, Ben Avison <bavison at riscosopen.org> :
    	Fixed the bug exposed in blitters-test 4928372.
    	15 hours of testing, compared to the 45 minutes to hit
    	the bug originally.
        Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
    	Squash the fix, re-benchmark on Raspberry Pi.

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index dd6f788..b6f9a39 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -37,6 +37,7 @@
 	.altmacro
 	.p2align 2
 
+#include "pixman-arm-asm.h"
 #include "pixman-arm-simd-asm.h"
 
 /* A head macro should do all processing which results in an output of up to
@@ -689,3 +690,269 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro over_white_8888_8888_ca_init  /* setup for the variant reached from over_n_8888_8888_ca when its first stack word is -1 */
+        HALF    .req    SRC               /* register aliases used by the macros below; removed again by the cleanup macro */
+        TMP0    .req    STRIDE_D
+        TMP1    .req    STRIDE_S
+        TMP2    .req    STRIDE_M
+        TMP3    .req    ORIG_W
+        WK4     .req    SCRATCH
+        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
+        ldr     SCRATCH, =0x800080        /* operand for the UADD8 below */
+        mov     HALF, #0x80               /* rounding term accumulated by the SMLAxy multiplies in the combine macro */
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, SCRATCH, SCRATCH /* 0x80+0x80 carries out of bytes 0 and 2 only */
+        .set DST_PRELOAD_BIAS, 8          /* added to the offset argument of process_inner_loop in the header; reset to 0 by cleanup */
+.endm
+
+.macro over_white_8888_8888_ca_cleanup  /* undo the assembler-level state set by over_white_8888_8888_ca_init */
+        .set DST_PRELOAD_BIAS, 0          /* back to the default declared in pixman-arm-simd-asm.h */
+        .unreq  HALF                      /* drop the aliases so later functions can define their own */
+        .unreq  TMP0
+        .unreq  TMP1
+        .unreq  TMP2
+        .unreq  TMP3
+        .unreq  WK4
+.endm
+
+.macro over_white_8888_8888_ca_combine  m, d  /* d = saturate(m + d * TMP0 / 255) per channel; callers set TMP0 = ~m first; corrupts TMP0-TMP3 */
+        uxtb16  TMP1, TMP0                /* rb_notmask */
+        uxtb16  TMP2, d                   /* rb_dest; 1 stall follows */
+        smlatt  TMP3, TMP2, TMP1, HALF    /* red */
+        smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
+        uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
+        uxtb16  TMP1, d, ror #8           /* ag_dest; 1 stall follows */
+        smlatt  d, TMP1, TMP0, HALF       /* alpha */
+        smlabb  TMP1, TMP1, TMP0, HALF    /* green */
+        pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
+        pkhbt   TMP1, TMP1, d, lsl #16    /* ag */
+        uxtab16 TMP0, TMP0, TMP0, ror #8  /* t += t >> 8 in each halfword; result is then in the high byte */
+        uxtab16 TMP1, TMP1, TMP1, ror #8
+        mov     TMP0, TMP0, ror #8        /* bring the rb results down into bytes 0 and 2 */
+        sel     d, TMP0, TMP1             /* with GE=0101: bytes 0,2 from TMP0 (rb), bytes 1,3 from TMP1 (ag) */
+        uqadd8  d, d, m                   /* d is a late result */
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_head  /* fetch one mask pixel and one destination pixel */
+        pixld   , 4, 1, MASK, 0           /* presumably 4 bytes from MASK into WK1 (the tail reads WK1 as the mask) - confirm pixld in the header */
+        pixld   , 4, 3, DST, 0            /* presumably 4 bytes from DST into WK3 (the tail blends and stores WK3) - confirm */
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_tail  /* blend one pixel: mask in WK1, destination in WK3 */
+        mvn     TMP0, WK1                 /* TMP0 = ~mask, consumed by the combine macro */
+        teq     WK1, WK1, asr #32         /* EQ if mask is 0 or 0xFFFFFFFF; C = mask bit 31 tells them apart */
+        bne     01f                       /* mixed mask: general blend */
+        bcc     03f                       /* mask 0: skip the store entirely */
+        mov     WK3, WK1                  /* mask all ones: result is the mask value itself */
+        b       02f
+01:     over_white_8888_8888_ca_combine WK1, WK3
+02:     pixst   , 4, 3, DST               /* presumably stores WK3 to DST - confirm pixst in the header */
+03:
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_head  /* fetch two mask pixels; the destination is fetched later, in the tail */
+        pixld   , 8, 1, MASK, 0           /* presumably 8 bytes from MASK into WK1-WK2 (the tail reads both as masks) - confirm pixld in the header */
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_tail  /* blend two pixels: masks in WK1-WK2, destination pair in WK3-WK4 */
+        pixld   , 8, 3, DST               /* presumably 8 bytes from DST into WK3-WK4 - confirm pixld in the header */
+        mvn     TMP0, WK1                 /* TMP0 = ~mask1 for the combine macro */
+        teq     WK1, WK1, asr #32         /* EQ if mask1 is 0 or 0xFFFFFFFF; C = mask1 bit 31 */
+        bne     01f                       /* mixed mask1: general blend */
+        movcs   WK3, WK1                  /* mask1 all ones: result is the mask value */
+        bcs     02f
+        teq     WK2, #0                   /* mask1 is 0 here: if mask2 is 0 as well there is nothing to store */
+        beq     05f
+        b       02f
+01:     over_white_8888_8888_ca_combine WK1, WK3
+02:     mvn     TMP0, WK2                 /* same classification again for the second pixel */
+        teq     WK2, WK2, asr #32
+        bne     03f
+        movcs   WK4, WK2                  /* all ones: take the mask; if mask2 is 0 (C clear) WK4 keeps the loaded destination */
+        b       04f
+03:     over_white_8888_8888_ca_combine WK2, WK4
+04:     pixst   , 8, 3, DST               /* presumably stores WK3-WK4 to DST - confirm pixst in the header */
+05:
+.endm
+
+.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload  /* only numbytes is used; work proceeds one or two pixels at a time */
+ .if numbytes == 4
+        over_white_8888_8888_ca_1pixel_head
+ .else
+  .if numbytes == 16
+        over_white_8888_8888_ca_2pixels_head  /* 16 bytes: the first pixel pair is completed entirely inside the head */
+        over_white_8888_8888_ca_2pixels_tail
+  .endif
+        over_white_8888_8888_ca_2pixels_head  /* the last (or only) pair is finished by process_tail */
+ .endif
+.endm
+
+.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg  /* finish whatever process_head left pending; only numbytes is used */
+ .if numbytes == 4
+        over_white_8888_8888_ca_1pixel_tail
+ .else
+        over_white_8888_8888_ca_2pixels_tail  /* covers both the 8- and 16-byte cases: one pixel pair remains */
+ .endif
+.endm
+
+generate_composite_function /* instantiate the over_white entry point; it is the branch target of the over_n_8888_8888_ca stub */ \
+    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 /* presumably src, mask, dst bits per pixel - confirm against the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
+    2, /* prefetch distance */ \
+    over_white_8888_8888_ca_init, \
+    nop_macro, /* newline */ \
+    over_white_8888_8888_ca_cleanup, \
+    over_white_8888_8888_ca_process_head, \
+    over_white_8888_8888_ca_process_tail
+
+
+.macro over_n_8888_8888_ca_init
+        /* Set up constants. RB_SRC and AG_SRC are in registers;
+         * RB_FLDS, A_SRC, and the two HALF values need to go on the
+         * stack (and the full SRC value is already there) */
+        ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET] /* the solid source value (the 1pixel tail reloads it from here as SRC) */
+        mov     WK0, #0x00FF0000
+        orr     WK0, WK0, #0xFF        /* RB_FLDS (0x00FF00FF) */
+        mov     WK1, #0x80             /* HALF default value */
+        mov     WK2, SCRATCH, lsr #24  /* A_SRC */
+        orr     WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
+        push    {WK0-WK3}              /* ascending addresses: RB_FLDS, HALF default, A_SRC, HALF alternate */
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16 /* account for the 16 bytes just pushed */
+        uxtb16  SRC, SCRATCH           /* bytes 0 and 2 of the source; named RB_SRC below */
+        uxtb16  STRIDE_S, SCRATCH, ror #8 /* bytes 1 and 3 of the source; named AG_SRC below */
+
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, WK3, WK3
+
+        .unreq  WK0                    /* re-map the WK aliases for the duration of this function; cleanup restores r8-r11 */
+        .unreq  WK1
+        .unreq  WK2
+        .unreq  WK3
+        WK0     .req    Y
+        WK1     .req    STRIDE_D
+        RB_SRC  .req    SRC
+        AG_SRC  .req    STRIDE_S
+        WK2     .req    STRIDE_M
+        RB_FLDS .req    r8       /* the reloaded constants have to be at consecutive registers starting at an even one */
+        A_SRC   .req    r8       /* shares r8 with RB_FLDS: each LDRD in the 1pixel tail selects which one is live */
+        HALF    .req    r9
+        WK3     .req    r10
+        WK4     .req    r11
+        WK5     .req    SCRATCH
+        WK6     .req    ORIG_W
+
+        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8888_8888_ca_cleanup  /* undo over_n_8888_8888_ca_init: stack, offset symbol and register aliases */
+        add     sp, sp, #16            /* drop the four constants pushed by init */
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
+
+        .unreq  WK0
+        .unreq  WK1
+        .unreq  RB_SRC
+        .unreq  AG_SRC
+        .unreq  WK2
+        .unreq  RB_FLDS
+        .unreq  A_SRC
+        .unreq  HALF
+        .unreq  WK3
+        .unreq  WK4
+        .unreq  WK5
+        .unreq  WK6
+        WK0     .req    r8             /* put WK0-WK3 back on r8-r11 for the code that follows */
+        WK1     .req    r9
+        WK2     .req    r10
+        WK3     .req    r11
+.endm
+
+.macro over_n_8888_8888_ca_1pixel_head  /* fetch one mask pixel and one destination pixel */
+        pixld   , 4, 6, MASK, 0        /* presumably 4 bytes from MASK into WK6 (the tail reads WK6 as the mask) - confirm pixld in the header */
+        pixld   , 4, 0, DST, 0         /* presumably 4 bytes from DST into WK0 (the tail blends and stores WK0) - confirm */
+.endm
+
+.macro over_n_8888_8888_ca_1pixel_tail  /* blend one pixel: mask in WK6, destination in WK0; stores WK0 unless the mask is 0 */
+        ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8] /* third and fourth words pushed by init: A_SRC and the 0x00800080 HALF */
+        uxtb16  WK1, WK6                 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
+        teq     WK6, WK6, asr #32        /* Zc if transparent, ZC if opaque */
+        bne     20f                      /* mixed mask: hard case */
+        bcc     40f                      /* mask 0: nothing to store */
+        /* Mask is fully opaque (all channels) */
+        ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
+        eors    A_SRC, A_SRC, #0xFF      /* A_SRC = 255 - alpha; EQ if the source alpha is 0xFF */
+        bne     10f
+        /* Source is also opaque - same as src_8888_8888 */
+        mov     WK0, WK6
+        b       30f
+10:     /* Same as over_8888_8888 */
+        mul_8888_8 WK0, A_SRC, WK5, HALF /* helper defined elsewhere; presumably scales WK0 by A_SRC/255 using WK5 as scratch - confirm */
+        uqadd8  WK0, WK0, WK6            /* saturating per-byte add of the source */
+        b       30f
+20:     /* No simplifications possible - do it the hard way */
+        uxtb16  WK2, WK6, ror #8         /* ag_mask */
+        mla     WK3, WK1, A_SRC, HALF    /* rb_mul; 2 cycles */
+        mla     WK4, WK2, A_SRC, HALF    /* ag_mul; 2 cycles */
+        ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET] /* first two words pushed by init: 0x00FF00FF and the 0x80 HALF; replaces A_SRC in r8 */
+        uxtb16  WK5, WK0                 /* rb_dest */
+        uxtab16 WK3, WK3, WK3, ror #8    /* t += t >> 8 in each halfword; result is then in the high byte */
+        uxtb16  WK6, WK0, ror #8         /* ag_dest */
+        uxtab16 WK4, WK4, WK4, ror #8
+        smlatt  WK0, RB_SRC, WK1, HALF   /* red1 */
+        smlabb  WK1, RB_SRC, WK1, HALF   /* blue1 */
+        bic     WK3, RB_FLDS, WK3, lsr #8 /* 255 - (mask * A_SRC / 255) in each halfword, for r and b */
+        bic     WK4, RB_FLDS, WK4, lsr #8 /* the same for a and g */
+        pkhbt   WK1, WK1, WK0, lsl #16   /* rb1 */
+        smlatt  WK0, WK5, WK3, HALF      /* red2 */
+        smlabb  WK3, WK5, WK3, HALF      /* blue2 */
+        uxtab16 WK1, WK1, WK1, ror #8
+        smlatt  WK5, AG_SRC, WK2, HALF   /* alpha1 */
+        pkhbt   WK3, WK3, WK0, lsl #16   /* rb2 */
+        smlabb  WK0, AG_SRC, WK2, HALF   /* green1 */
+        smlatt  WK2, WK6, WK4, HALF      /* alpha2 */
+        smlabb  WK4, WK6, WK4, HALF      /* green2 */
+        pkhbt   WK0, WK0, WK5, lsl #16   /* ag1 */
+        uxtab16 WK3, WK3, WK3, ror #8
+        pkhbt   WK4, WK4, WK2, lsl #16   /* ag2 */
+        uxtab16 WK0, WK0, WK0, ror #8
+        uxtab16 WK4, WK4, WK4, ror #8
+        mov     WK1, WK1, ror #8         /* bring the rb results down into bytes 0 and 2 ready for SEL */
+        mov     WK3, WK3, ror #8
+        sel     WK2, WK1, WK0            /* recombine source*mask */
+        sel     WK1, WK3, WK4            /* recombine dest*(1-source_alpha*mask) */
+        uqadd8  WK0, WK1, WK2            /* followed by 1 stall */
+30:     /* The destination buffer is already in the L1 cache, so
+         * there's little point in amalgamating writes */
+        pixst   , 4, 0, DST
+40:
+.endm
+
+.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload  /* only numbytes is used; pixels are handled strictly one at a time */
+ .rept (numbytes / 4) - 1                /* fully process every pixel except the last one here */
+        over_n_8888_8888_ca_1pixel_head
+        over_n_8888_8888_ca_1pixel_tail
+ .endr
+        over_n_8888_8888_ca_1pixel_head  /* last pixel: loads only; process_tail completes it */
+.endm
+
+.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg  /* arguments unused: always exactly one pixel is left pending by process_head */
+        over_n_8888_8888_ca_1pixel_tail
+.endm
+
+pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6  /* public entry: dispatch stub placed directly in front of the generated helper */
+        ldr     ip, [sp]               /* first stack-passed argument - presumably the solid source colour; confirm against the C binding */
+        cmp     ip, #-1                /* is it 0xFFFFFFFF? */
+        beq     pixman_composite_over_white_8888_8888_ca_asm_armv6 /* plain branch: the white variant returns straight to our caller */
+        /* else drop through... */
+ .endfunc
+generate_composite_function /* general case; emitted immediately after the stub so the stub can fall into it */ \
+    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 /* presumably src, mask, dst bits per pixel - confirm against the macro definition */ \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
+    2, /* prefetch distance */ \
+    over_n_8888_8888_ca_init, \
+    nop_macro, /* newline */ \
+    over_n_8888_8888_ca_cleanup, \
+    over_n_8888_8888_ca_process_head, \
+    over_n_8888_8888_ca_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 852a113..1bb8b45 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -80,6 +80,12 @@
 .set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
 
 /*
+ * Number of bytes by which to adjust preload offset of destination
+ * buffer (allows preload instruction to be moved before the load(s))
+ */
+.set DST_PRELOAD_BIAS, 0
+
+/*
  * Offset into stack where mask and source pointer/stride can be accessed.
  */
 #ifdef DEBUG_PARAMS
@@ -462,11 +468,11 @@
  .if dst_r_bpp > 0
         tst     DST, #16
         bne     111f
-        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
+        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
         b       112f
 111:
  .endif
-        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
+        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
 112:
         /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
  .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 8fbc439..dd6b907 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -56,6 +56,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
                                       uint8_t, 1, uint32_t, 1)
 
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8888_8888_ca,
+                                      uint32_t, 1, uint32_t, 1)
+
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
                                         uint16_t, uint16_t)
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
@@ -238,6 +241,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
 
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
     PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
 
commit 73d2f8b61ae5e320a7795c29b6041b7885cf2953
Author: Ben Avison <bavison at riscosopen.org>
Date:   Wed Apr 9 16:25:29 2014 +0300

    ARMv6: Support for very variable-hungry composite operations
    
    Previously, the variable ARGS_STACK_OFFSET was available to extract values
    from function arguments during the init macro. Now this changes dynamically
    around stack operations in the function as a whole so that arguments can be
    accessed at any point. It is also joined by LOCALS_STACK_OFFSET, which
    allows access to space reserved on the stack during the init macro.
    
    On top of this, composite macros now have the option of using all of the WK0-WK3
    registers rather than just the subset they were told to use; this requires the
    pixel count to be spilled to the stack over the leading pixels at the start
    of each line. Thus, at best, each composite operation can use 11 registers,
    plus any pointer registers not required for the composite type, plus as much
    stack space as it needs, divided up into constants and variables as necessary.

diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 24b1ad2..852a113 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -76,6 +76,8 @@
 .set FLAG_SPILL_LINE_VARS,           48
 .set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
 .set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+.set FLAG_PROCESS_PRESERVES_WK0,     0
+.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
 
 /*
  * Offset into stack where mask and source pointer/stride can be accessed.
@@ -87,6 +89,11 @@
 #endif
 
 /*
+ * Offset into stack where space allocated during init macro can be accessed.
+ */
+.set LOCALS_STACK_OFFSET,     0
+
+/*
  * Constants for selecting preferable prefetch type.
  */
 .set PREFETCH_TYPE_NONE,       0
@@ -359,23 +366,41 @@
 
 
 .macro test_bits_1_0_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of X, which holds the leading byte count copied from WK0 by leading_15bytes */
+ .else
         movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
+ .endif
 .endm
 
 .macro test_bits_3_2_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of X, which holds the leading byte count copied from WK0 by leading_15bytes */
+ .else
         movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
+ .endif
 .endm
 
 .macro leading_15bytes  process_head, process_tail
         /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
+ .set DECREMENT_X, 1  /* forwarded as the last argument of conditional_process1/2 below (replaces the former literal 1) */
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+  .set DECREMENT_X, 0  /* X is adjusted once, up front, by the sub below instead */
+        sub     X, X, WK0, lsr #dst_bpp_shift  /* X -= leading byte count converted to pixels */
+        str     X, [sp, #LINE_SAVED_REG_COUNT*4]  /* spill the adjusted X to the stack for the duration of the leading pixels */
+        mov     X, WK0  /* X now carries the byte count tested by test_bits_*_ptr, leaving WK0 free */
+ .endif
         /* Use unaligned loads in all cases for simplicity */
  .if dst_w_bpp == 8
-        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
+        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
  .elseif dst_w_bpp == 16
         test_bits_1_0_ptr
-        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
+        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
+ .endif
+        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]  /* reload the X value spilled at the top of this macro */
  .endif
-        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
 .endm
 
 .macro test_bits_3_2_pix
@@ -705,6 +730,13 @@ fname:
 #endif
 
         init
+
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+        /* Reserve a word in which to store X during leading pixels */
+        sub     sp, sp, #4
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
+ .endif
         
         lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
         sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
@@ -734,6 +766,8 @@ fname:
   .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
         /* This is stmdb sp!,{} */
         .word   0xE92D0000 | LINE_SAVED_REGS
+   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
   .endif
 151:    /* New line */
         newline
@@ -767,6 +801,10 @@ fname:
 
 157:    /* Check for another line */
         end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+  .endif
  .endif
 
  .ltorg
@@ -776,6 +814,8 @@ fname:
  .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
         /* This is stmdb sp!,{} */
         .word   0xE92D0000 | LINE_SAVED_REGS
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .endif
 161:    /* New line */
         newline
@@ -841,12 +881,22 @@ fname:
 
 177:    /* Check for another line */
         end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
 
 197:
  .if (flags) & FLAG_SPILL_LINE_VARS
         add     sp, sp, #LINE_SAVED_REG_COUNT*4
  .endif
 198:
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
+  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
+        add     sp, sp, #4
+ .endif
+
         cleanup
 
 #ifdef DEBUG_PARAMS