pixman: Branch 'master' - 5 commits
Søren Sandmann Pedersen
sandmann at kemper.freedesktop.org
Mon Apr 21 18:39:18 PDT 2014
pixman/pixman-arm-neon-asm.h | 20 --
pixman/pixman-arm-simd-asm.S | 370 +++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-simd-asm.h | 89 ++++++++--
pixman/pixman-arm-simd.c | 15 +
4 files changed, 464 insertions(+), 30 deletions(-)
New commits:
commit 5f661ee719be25c3aa0eb0d45e0db23a37e76468
Author: Pekka Paalanen <pekka.paalanen at collabora.co.uk>
Date: Thu Apr 10 09:41:38 2014 +0300
ARM: use pixman_asm_function in internal headers
The two ARM headers contained open-coded copies of pixman_asm_function;
replace them with the macro itself.
Since it seems customary that ARM headers do not use CPP include guards,
rely on the .S files to #include "pixman-arm-asm.h" first. They all
do now.
v2: Fix a build failure on rpi by adding one #include.
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index d0d92d7..bdcf6a9 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -631,14 +631,8 @@ local skip1
src_basereg_ = 0, \
mask_basereg_ = 24
- .func fname
- .global fname
- /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
-fname:
+ pixman_asm_function fname
+
push {r4-r12, lr} /* save all registers */
/*
@@ -945,14 +939,8 @@ fname:
src_basereg_ = 0, \
mask_basereg_ = 24
- .func fname
- .global fname
- /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
-fname:
+ pixman_asm_function fname
+
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
* Make some macro arguments globally visible and accessible
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 0baf87a..8de060a 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -596,13 +596,7 @@
process_tail, \
process_inner_loop
- .func fname
- .global fname
- /* For ELF format also set function visibility to hidden */
-#ifdef __ELF__
- .hidden fname
- .type fname, %function
-#endif
+ pixman_asm_function fname
/*
* Make some macro arguments globally visible and accessible
@@ -714,7 +708,6 @@
SCRATCH .req r12
ORIG_W .req r14 /* width (pixels) */
-fname:
push {r4-r11, lr} /* save all registers */
subs Y, Y, #1
commit ab587b444cf9ac1ee421b9942befc509b52cb109
Author: Ben Avison <bavison at riscosopen.org>
Date: Wed Apr 9 16:25:32 2014 +0300
ARMv6: Add fast path for in_reverse_8888_8888
Benchmark results, "before" is the patch
* upstream/master 4b76bbfda670f9ede67d0449f3640605e1fc4df0
+ ARMv6: Support for very variable-hungry composite operations
+ ARMv6: Add fast path for over_n_8888_8888_ca
and "after" contains the additional patches on top:
+ ARMv6: Add fast path flag to force no preload of destination buffer
+ ARMv6: Add fast path for in_reverse_8888_8888 (this patch)
lowlevel-blt-bench, in_reverse_8888_8888, 100 iterations:
Before After
Mean StdDev Mean StdDev Confidence Change
L1 21.1 0.07 32.3 0.08 100.00% +52.9%
L2 11.6 0.29 18.0 0.52 100.00% +54.4%
M 10.5 0.01 16.1 0.03 100.00% +54.1%
HT 8.2 0.02 12.0 0.04 100.00% +45.9%
VT 8.1 0.02 11.7 0.04 100.00% +44.5%
R 8.1 0.02 11.3 0.04 100.00% +39.7%
RT 4.8 0.04 6.1 0.09 100.00% +27.3%
At most 12 outliers rejected per test per set.
cairo-perf-trace with trimmed traces, 30 iterations:
Before After
Mean StdDev Mean StdDev Confidence Change
t-firefox-paintball.trace 18.0 0.01 14.1 0.01 100.00% +27.4%
t-firefox-chalkboard.trace 36.7 0.03 36.0 0.02 100.00% +1.9%
t-firefox-canvas-alpha.trace 20.7 0.22 20.3 0.22 100.00% +1.9%
t-swfdec-youtube.trace 7.8 0.03 7.8 0.03 100.00% +0.9%
t-firefox-talos-gfx.trace 25.8 0.44 25.6 0.29 93.87% +0.7% (insignificant)
t-firefox-talos-svg.trace 20.6 0.04 20.6 0.03 100.00% +0.2%
t-firefox-fishbowl.trace 21.2 0.04 21.1 0.02 100.00% +0.2%
t-xfce4-terminal-a1.trace 4.8 0.01 4.8 0.01 98.85% +0.2% (insignificant)
t-swfdec-giant-steps.trace 14.9 0.03 14.9 0.02 99.99% +0.2%
t-poppler-reseau.trace 22.4 0.11 22.4 0.08 86.52% +0.2% (insignificant)
t-gnome-system-monitor.trace 17.3 0.03 17.2 0.03 99.74% +0.2%
t-firefox-scrolling.trace 24.8 0.12 24.8 0.11 70.15% +0.1% (insignificant)
t-firefox-particles.trace 27.5 0.18 27.5 0.21 48.33% +0.1% (insignificant)
t-grads-heat-map.trace 4.4 0.04 4.4 0.04 16.61% +0.0% (insignificant)
t-firefox-fishtank.trace 13.2 0.01 13.2 0.01 7.64% +0.0% (insignificant)
t-firefox-canvas.trace 18.0 0.05 18.0 0.05 1.31% -0.0% (insignificant)
t-midori-zoomed.trace 8.0 0.01 8.0 0.01 78.22% -0.0% (insignificant)
t-firefox-planet-gnome.trace 10.9 0.02 10.9 0.02 64.81% -0.0% (insignificant)
t-gvim.trace 33.2 0.21 33.2 0.18 38.61% -0.1% (insignificant)
t-firefox-canvas-swscroll.trace 32.2 0.09 32.2 0.11 73.17% -0.1% (insignificant)
t-firefox-asteroids.trace 11.1 0.01 11.1 0.01 100.00% -0.2%
t-evolution.trace 13.0 0.05 13.0 0.05 91.99% -0.2% (insignificant)
t-gnome-terminal-vim.trace 19.9 0.14 20.0 0.14 97.38% -0.4% (insignificant)
t-poppler.trace 9.8 0.06 9.8 0.04 99.91% -0.5%
t-chromium-tabs.trace 4.9 0.02 4.9 0.02 100.00% -0.6%
At most 6 outliers rejected per test per set.
Cairo perf reports the running time, but the change is computed for
operations per second instead (inverse of running time).
Confidence is based on Welch's t-test. Absolute changes of less than 1%
can be attributed to measurement error, even if statistically
significant.
There was a question of why FLAG_NO_PRELOAD_DST is used. It makes
lowlevel-blt-bench results worse except for L1, but improves some
Cairo trace benchmarks.
"Ben Avison" <bavison at riscosopen.org> wrote:
> The thing with the lowlevel-blt-bench benchmarks for the more
> sophisticated composite types (as a general rule, anything that involves
> branches at the per-pixel level) is that they are only profiling the case
> where you have mid-level alpha values in the source/mask/destination.
> Real-world images typically have a disproportionate number of fully
> opaque and fully transparent pixels, which is why when there's a
> discrepancy between which implementation performs best with cairo-perf
> trace versus lowlevel-blt-bench, I usually favour the Cairo winner.
>
> The results of removing FLAG_NO_PRELOAD_DST (in other words, adding
> preload of the destination buffer) are easy to explain in the
> lowlevel-blt-bench results. In the L1 case, the destination buffer is
> already in the L1 cache, so adding the preloads is simply adding extra
> instruction cycles that have no effect on memory operations. The "in"
> compositing operator depends upon the alpha of both source and
> destination, so if you use uniform mid-alpha, then you actually do need
> to read your destination pixels, so you benefit from preloading them. But
> for fully opaque or fully transparent source pixels, you don't need to
> read the corresponding destination pixel - it'll either be left alone or
> overwritten. Since the ARM11 doesn't use write-allocate cacheing, both of
> these cases avoid both the time taken to load the extra cachelines, as
> well as increasing the efficiency of the cache for other data. If you
> examine the source images being used by the Cairo test, you'll probably
> find they mostly use transparent or opaque pixels.
v4, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
Rebased, re-benchmarked on Raspberry Pi, commit message.
v5, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
Rebased, re-benchmarked on Raspberry Pi due to a fix to
"ARMv6: Add fast path for over_n_8888_8888_ca" patch.
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index b6f9a39..bc02ebb 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -956,3 +956,106 @@ generate_composite_function \
/******************************************************************************/
+.macro in_reverse_8888_8888_init
+ /* Hold loop invariant in MASK */
+ ldr MASK, =0x00800080
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, MASK, MASK
+ /* Offset the source pointer: we only need the alpha bytes */
+ add SRC, SRC, #3
+ line_saved_regs ORIG_W
+.endm
+
+.macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
+ ldrb ORIG_W, [SRC], #4
+ .if numbytes >= 8
+ ldrb WK&reg1, [SRC], #4
+ .if numbytes == 16
+ ldrb WK&reg2, [SRC], #4
+ ldrb WK&reg3, [SRC], #4
+ .endif
+ .endif
+ add DST, DST, #numbytes
+.endm
+
+.macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
+.endm
+
+.macro in_reverse_8888_8888_1pixel s, d, offset, is_only
+ .if is_only != 1
+ movs s, ORIG_W
+ .if offset != 0
+ ldrb ORIG_W, [SRC, #offset]
+ .endif
+ beq 01f
+ teq STRIDE_M, #0xFF
+ beq 02f
+ .endif
+ uxtb16 SCRATCH, d /* rb_dest */
+ uxtb16 d, d, ror #8 /* ag_dest */
+ mla SCRATCH, SCRATCH, s, MASK
+ mla d, d, s, MASK
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+ uxtab16 d, d, d, ror #8
+ mov SCRATCH, SCRATCH, ror #8
+ sel d, SCRATCH, d
+ b 02f
+ .if offset == 0
+48: /* Last mov d,#0 of the set - used as part of shortcut for
+ * source values all 0 */
+ .endif
+01: mov d, #0
+02:
+.endm
+
+.macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
+ .if numbytes == 4
+ teq ORIG_W, ORIG_W, asr #32
+ ldrne WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+ teq ORIG_W, WK&reg1
+ teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
+ ldmnedb DST, {WK&reg1-WK&reg2}
+ .else
+ teq ORIG_W, WK&reg1
+ teqeq ORIG_W, WK&reg2
+ teqeq ORIG_W, WK&reg3
+ teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
+ ldmnedb DST, {WK&reg1-WK&reg4}
+ .endif
+ cmnne DST, #0 /* clear C if NE */
+ bcs 49f /* no writes to dest if source all -1 */
+ beq 48f /* set dest to all 0 if source all 0 */
+ .if numbytes == 4
+ in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
+ str WK&reg1, [DST, #-4]
+ .elseif numbytes == 8
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
+ stmdb DST, {WK&reg1-WK&reg2}
+ .else
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
+ in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
+ stmdb DST, {WK&reg1-WK&reg4}
+ .endif
+49:
+.endm
+
+.macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
+ in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
+.endm
+
+generate_composite_function \
+ pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
+ 2, /* prefetch distance */ \
+ in_reverse_8888_8888_init, \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ in_reverse_8888_8888_process_head, \
+ in_reverse_8888_8888_process_tail
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index dd6b907..c17ce5a 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -46,6 +46,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, in_reverse_8888_8888,
+ uint32_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888,
uint32_t, 1)
@@ -241,6 +243,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, a8r8g8b8, armv6_composite_in_reverse_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8r8g8b8, null, x8r8g8b8, armv6_composite_in_reverse_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, a8b8g8r8, armv6_composite_in_reverse_8888_8888),
+ PIXMAN_STD_FAST_PATH (IN_REVERSE, a8b8g8r8, null, x8b8g8r8, armv6_composite_in_reverse_8888_8888),
+
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
commit 68d2f7b486a9ccc877a2214f7f5ef562e2846581
Author: Ben Avison <bavison at riscosopen.org>
Date: Wed Apr 9 16:25:31 2014 +0300
ARMv6: Add fast path flag to force no preload of destination buffer
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 1bb8b45..0baf87a 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -78,6 +78,8 @@
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
.set FLAG_PROCESS_PRESERVES_WK0, 0
.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
+.set FLAG_PRELOAD_DST, 0
+.set FLAG_NO_PRELOAD_DST, 256
/*
* Number of bytes by which to adjust preload offset of destination
@@ -445,7 +447,7 @@
preload_middle src_bpp, SRC, 0
preload_middle mask_bpp, MASK, 0
.endif
- .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
+ .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
/* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
* destination prefetches are 32-byte aligned. It's also the easiest channel to offset
* preloads for, to achieve staggered prefetches for multiple channels, because there are
@@ -480,7 +482,9 @@
.endif
preload_trailing src_bpp, src_bpp_shift, SRC
preload_trailing mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_trailing dst_r_bpp, dst_bpp_shift, DST
+ .endif
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
@@ -779,7 +783,9 @@ fname:
newline
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_leading_step1 dst_r_bpp, WK3, DST
+ .endif
ands WK0, DST, #15
beq 154f
@@ -787,7 +793,9 @@ fname:
preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
+ .endif
leading_15bytes process_head, process_tail
@@ -827,7 +835,9 @@ fname:
newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_line 0, dst_r_bpp, dst_bpp_shift, DST
+ .endif
sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
ands WK0, DST, #15
@@ -856,7 +866,9 @@ fname:
newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
+ .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_line 1, dst_r_bpp, dst_bpp_shift, DST
+ .endif
.if dst_w_bpp == 8
tst DST, #3
commit 4ad769cbec47ca0df43dc586f689b1968bbc942f
Author: Ben Avison <bavison at riscosopen.org>
Date: Wed Apr 9 16:25:30 2014 +0300
ARMv6: Add fast path for over_n_8888_8888_ca
Benchmark results, "before" is
* upstream/master 4b76bbfda670f9ede67d0449f3640605e1fc4df0
"after" contains the additional patches on top:
+ ARMv6: Support for very variable-hungry composite operations
+ ARMv6: Add fast path for over_n_8888_8888_ca (this patch)
lowlevel-blt-bench, over_n_8888_8888_ca, 100 iterations:
Before After
Mean StdDev Mean StdDev Confidence Change
L1 2.7 0.00 16.1 0.06 100.00% +500.7%
L2 2.4 0.01 14.1 0.15 100.00% +489.9%
M 2.3 0.00 14.3 0.01 100.00% +510.2%
HT 2.2 0.00 9.7 0.03 100.00% +345.0%
VT 2.2 0.00 9.4 0.02 100.00% +333.4%
R 2.2 0.01 9.5 0.03 100.00% +331.6%
RT 1.9 0.01 5.5 0.07 100.00% +192.7%
At most 1 outliers rejected per test per set.
cairo-perf-trace with trimmed traces, 30 iterations:
Before After
Mean StdDev Mean StdDev Confidence Change
t-firefox-talos-gfx.trace 33.1 0.42 25.8 0.44 100.00% +28.6%
t-firefox-scrolling.trace 31.4 0.11 24.8 0.12 100.00% +26.3%
t-gnome-terminal-vim.trace 22.4 0.10 19.9 0.14 100.00% +12.5%
t-evolution.trace 13.9 0.07 13.0 0.05 100.00% +6.5%
t-firefox-planet-gnome.trace 11.6 0.02 10.9 0.02 100.00% +6.5%
t-gvim.trace 34.0 0.21 33.2 0.21 100.00% +2.4%
t-chromium-tabs.trace 4.9 0.02 4.9 0.02 100.00% +1.0%
t-poppler.trace 9.8 0.05 9.8 0.06 100.00% +0.7%
t-firefox-canvas-swscroll.trace 32.3 0.10 32.2 0.09 100.00% +0.4%
t-firefox-paintball.trace 18.1 0.01 18.0 0.01 100.00% +0.3%
t-poppler-reseau.trace 22.5 0.09 22.4 0.11 99.29% +0.3%
t-firefox-canvas.trace 18.1 0.06 18.0 0.05 99.29% +0.2%
t-xfce4-terminal-a1.trace 4.8 0.01 4.8 0.01 99.77% +0.2%
t-firefox-fishbowl.trace 21.2 0.03 21.2 0.04 100.00% +0.2%
t-gnome-system-monitor.trace 17.3 0.03 17.3 0.03 99.54% +0.1%
t-firefox-asteroids.trace 11.1 0.01 11.1 0.01 100.00% +0.1%
t-midori-zoomed.trace 8.0 0.01 8.0 0.01 99.98% +0.1%
t-grads-heat-map.trace 4.4 0.04 4.4 0.04 34.08% +0.1% (insignificant)
t-firefox-talos-svg.trace 20.6 0.03 20.6 0.04 54.06% +0.0% (insignificant)
t-firefox-fishtank.trace 13.2 0.01 13.2 0.01 52.81% -0.0% (insignificant)
t-swfdec-giant-steps.trace 14.9 0.02 14.9 0.03 85.50% -0.1% (insignificant)
t-firefox-chalkboard.trace 36.6 0.02 36.7 0.03 100.00% -0.2%
t-firefox-canvas-alpha.trace 20.7 0.32 20.7 0.22 55.76% -0.3% (insignificant)
t-swfdec-youtube.trace 7.8 0.02 7.8 0.03 100.00% -0.5%
t-firefox-particles.trace 27.4 0.16 27.5 0.18 99.94% -0.6%
At most 4 outliers rejected per test per set.
Cairo perf reports the running time, but the change is computed for
operations per second instead (inverse of running time).
Confidence is based on Welch's t-test. Absolute changes of less than 1%
can be attributed to measurement error, even if statistically
significant.
v4, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
Use pixman_asm_function instead of startfunc.
Rebased. Re-benchmarked on Raspberry Pi.
Commit message.
v5, Ben Avison <bavison at riscosopen.org> :
Fixed the bug exposed in blitters-test 4928372.
15 hours of testing, compared to the 45 minutes to hit
the bug originally.
Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
Squash the fix, re-benchmark on Raspberry Pi.
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index dd6f788..b6f9a39 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -37,6 +37,7 @@
.altmacro
.p2align 2
+#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"
/* A head macro should do all processing which results in an output of up to
@@ -689,3 +690,269 @@ generate_composite_function \
/******************************************************************************/
+.macro over_white_8888_8888_ca_init
+ HALF .req SRC
+ TMP0 .req STRIDE_D
+ TMP1 .req STRIDE_S
+ TMP2 .req STRIDE_M
+ TMP3 .req ORIG_W
+ WK4 .req SCRATCH
+ line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
+ ldr SCRATCH, =0x800080
+ mov HALF, #0x80
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, SCRATCH, SCRATCH
+ .set DST_PRELOAD_BIAS, 8
+.endm
+
+.macro over_white_8888_8888_ca_cleanup
+ .set DST_PRELOAD_BIAS, 0
+ .unreq HALF
+ .unreq TMP0
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
+ .unreq WK4
+.endm
+
+.macro over_white_8888_8888_ca_combine m, d
+ uxtb16 TMP1, TMP0 /* rb_notmask */
+ uxtb16 TMP2, d /* rb_dest; 1 stall follows */
+ smlatt TMP3, TMP2, TMP1, HALF /* red */
+ smlabb TMP2, TMP2, TMP1, HALF /* blue */
+ uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
+ uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
+ smlatt d, TMP1, TMP0, HALF /* alpha */
+ smlabb TMP1, TMP1, TMP0, HALF /* green */
+ pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
+ pkhbt TMP1, TMP1, d, lsl #16 /* ag */
+ uxtab16 TMP0, TMP0, TMP0, ror #8
+ uxtab16 TMP1, TMP1, TMP1, ror #8
+ mov TMP0, TMP0, ror #8
+ sel d, TMP0, TMP1
+ uqadd8 d, d, m /* d is a late result */
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_head
+ pixld , 4, 1, MASK, 0
+ pixld , 4, 3, DST, 0
+.endm
+
+.macro over_white_8888_8888_ca_1pixel_tail
+ mvn TMP0, WK1
+ teq WK1, WK1, asr #32
+ bne 01f
+ bcc 03f
+ mov WK3, WK1
+ b 02f
+01: over_white_8888_8888_ca_combine WK1, WK3
+02: pixst , 4, 3, DST
+03:
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_head
+ pixld , 8, 1, MASK, 0
+.endm
+
+.macro over_white_8888_8888_ca_2pixels_tail
+ pixld , 8, 3, DST
+ mvn TMP0, WK1
+ teq WK1, WK1, asr #32
+ bne 01f
+ movcs WK3, WK1
+ bcs 02f
+ teq WK2, #0
+ beq 05f
+ b 02f
+01: over_white_8888_8888_ca_combine WK1, WK3
+02: mvn TMP0, WK2
+ teq WK2, WK2, asr #32
+ bne 03f
+ movcs WK4, WK2
+ b 04f
+03: over_white_8888_8888_ca_combine WK2, WK4
+04: pixst , 8, 3, DST
+05:
+.endm
+
+.macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 4
+ over_white_8888_8888_ca_1pixel_head
+ .else
+ .if numbytes == 16
+ over_white_8888_8888_ca_2pixels_head
+ over_white_8888_8888_ca_2pixels_tail
+ .endif
+ over_white_8888_8888_ca_2pixels_head
+ .endif
+.endm
+
+.macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
+ .if numbytes == 4
+ over_white_8888_8888_ca_1pixel_tail
+ .else
+ over_white_8888_8888_ca_2pixels_tail
+ .endif
+.endm
+
+generate_composite_function \
+ pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
+ 2, /* prefetch distance */ \
+ over_white_8888_8888_ca_init, \
+ nop_macro, /* newline */ \
+ over_white_8888_8888_ca_cleanup, \
+ over_white_8888_8888_ca_process_head, \
+ over_white_8888_8888_ca_process_tail
+
+
+.macro over_n_8888_8888_ca_init
+ /* Set up constants. RB_SRC and AG_SRC are in registers;
+ * RB_FLDS, A_SRC, and the two HALF values need to go on the
+ * stack (and the full SRC value is already there) */
+ ldr SCRATCH, [sp, #ARGS_STACK_OFFSET]
+ mov WK0, #0x00FF0000
+ orr WK0, WK0, #0xFF /* RB_FLDS (0x00FF00FF) */
+ mov WK1, #0x80 /* HALF default value */
+ mov WK2, SCRATCH, lsr #24 /* A_SRC */
+ orr WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
+ push {WK0-WK3}
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
+ uxtb16 SRC, SCRATCH
+ uxtb16 STRIDE_S, SCRATCH, ror #8
+
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+ uadd8 SCRATCH, WK3, WK3
+
+ .unreq WK0
+ .unreq WK1
+ .unreq WK2
+ .unreq WK3
+ WK0 .req Y
+ WK1 .req STRIDE_D
+ RB_SRC .req SRC
+ AG_SRC .req STRIDE_S
+ WK2 .req STRIDE_M
+ RB_FLDS .req r8 /* the reloaded constants have to be at consecutive registers starting at an even one */
+ A_SRC .req r8
+ HALF .req r9
+ WK3 .req r10
+ WK4 .req r11
+ WK5 .req SCRATCH
+ WK6 .req ORIG_W
+
+ line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8888_8888_ca_cleanup
+ add sp, sp, #16
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
+
+ .unreq WK0
+ .unreq WK1
+ .unreq RB_SRC
+ .unreq AG_SRC
+ .unreq WK2
+ .unreq RB_FLDS
+ .unreq A_SRC
+ .unreq HALF
+ .unreq WK3
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ WK0 .req r8
+ WK1 .req r9
+ WK2 .req r10
+ WK3 .req r11
+.endm
+
+.macro over_n_8888_8888_ca_1pixel_head
+ pixld , 4, 6, MASK, 0
+ pixld , 4, 0, DST, 0
+.endm
+
+.macro over_n_8888_8888_ca_1pixel_tail
+ ldrd A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
+ uxtb16 WK1, WK6 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
+ teq WK6, WK6, asr #32 /* Zc if transparent, ZC if opaque */
+ bne 20f
+ bcc 40f
+ /* Mask is fully opaque (all channels) */
+ ldr WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
+ eors A_SRC, A_SRC, #0xFF
+ bne 10f
+ /* Source is also opaque - same as src_8888_8888 */
+ mov WK0, WK6
+ b 30f
+10: /* Same as over_8888_8888 */
+ mul_8888_8 WK0, A_SRC, WK5, HALF
+ uqadd8 WK0, WK0, WK6
+ b 30f
+20: /* No simplifications possible - do it the hard way */
+ uxtb16 WK2, WK6, ror #8 /* ag_mask */
+ mla WK3, WK1, A_SRC, HALF /* rb_mul; 2 cycles */
+ mla WK4, WK2, A_SRC, HALF /* ag_mul; 2 cycles */
+ ldrd RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
+ uxtb16 WK5, WK0 /* rb_dest */
+ uxtab16 WK3, WK3, WK3, ror #8
+ uxtb16 WK6, WK0, ror #8 /* ag_dest */
+ uxtab16 WK4, WK4, WK4, ror #8
+ smlatt WK0, RB_SRC, WK1, HALF /* red1 */
+ smlabb WK1, RB_SRC, WK1, HALF /* blue1 */
+ bic WK3, RB_FLDS, WK3, lsr #8
+ bic WK4, RB_FLDS, WK4, lsr #8
+ pkhbt WK1, WK1, WK0, lsl #16 /* rb1 */
+ smlatt WK0, WK5, WK3, HALF /* red2 */
+ smlabb WK3, WK5, WK3, HALF /* blue2 */
+ uxtab16 WK1, WK1, WK1, ror #8
+ smlatt WK5, AG_SRC, WK2, HALF /* alpha1 */
+ pkhbt WK3, WK3, WK0, lsl #16 /* rb2 */
+ smlabb WK0, AG_SRC, WK2, HALF /* green1 */
+ smlatt WK2, WK6, WK4, HALF /* alpha2 */
+ smlabb WK4, WK6, WK4, HALF /* green2 */
+ pkhbt WK0, WK0, WK5, lsl #16 /* ag1 */
+ uxtab16 WK3, WK3, WK3, ror #8
+ pkhbt WK4, WK4, WK2, lsl #16 /* ag2 */
+ uxtab16 WK0, WK0, WK0, ror #8
+ uxtab16 WK4, WK4, WK4, ror #8
+ mov WK1, WK1, ror #8
+ mov WK3, WK3, ror #8
+ sel WK2, WK1, WK0 /* recombine source*mask */
+ sel WK1, WK3, WK4 /* recombine dest*(1-source_alpha*mask) */
+ uqadd8 WK0, WK1, WK2 /* followed by 1 stall */
+30: /* The destination buffer is already in the L1 cache, so
+ * there's little point in amalgamating writes */
+ pixst , 4, 0, DST
+40:
+.endm
+
+.macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .rept (numbytes / 4) - 1
+ over_n_8888_8888_ca_1pixel_head
+ over_n_8888_8888_ca_1pixel_tail
+ .endr
+ over_n_8888_8888_ca_1pixel_head
+.endm
+
+.macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg
+ over_n_8888_8888_ca_1pixel_tail
+.endm
+
+pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
+ ldr ip, [sp]
+ cmp ip, #-1
+ beq pixman_composite_over_white_8888_8888_ca_asm_armv6
+ /* else drop through... */
+ .endfunc
+generate_composite_function \
+ pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
+ 2, /* prefetch distance */ \
+ over_n_8888_8888_ca_init, \
+ nop_macro, /* newline */ \
+ over_n_8888_8888_ca_cleanup, \
+ over_n_8888_8888_ca_process_head, \
+ over_n_8888_8888_ca_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 852a113..1bb8b45 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -80,6 +80,12 @@
.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
/*
+ * Number of bytes by which to adjust preload offset of destination
+ * buffer (allows preload instruction to be moved before the load(s))
+ */
+.set DST_PRELOAD_BIAS, 0
+
+/*
* Offset into stack where mask and source pointer/stride can be accessed.
*/
#ifdef DEBUG_PARAMS
@@ -462,11 +468,11 @@
.if dst_r_bpp > 0
tst DST, #16
bne 111f
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16
+ process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
b 112f
111:
.endif
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0
+ process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 8fbc439..dd6b907 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -56,6 +56,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8888_8888_ca,
+ uint32_t, 1, uint32_t, 1)
+
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
uint16_t, uint16_t)
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
@@ -238,6 +241,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, armv6_composite_over_n_8888_8888_ca),
+
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
commit 73d2f8b61ae5e320a7795c29b6041b7885cf2953
Author: Ben Avison <bavison at riscosopen.org>
Date: Wed Apr 9 16:25:29 2014 +0300
ARMv6: Support for very variable-hungry composite operations
Previously, the variable ARGS_STACK_OFFSET was available to extract values
from function arguments during the init macro. Now this changes dynamically
around stack operations in the function as a whole so that arguments can be
accessed at any point. It is also joined by LOCALS_STACK_OFFSET, which
allows access to space reserved on the stack during the init macro.
On top of this, composite macros now have the option of using all of the WK0-WK3
registers rather than just the subset they were told to use; this requires the
pixel count to be spilled to the stack over the leading pixels at the start
of each line. Thus, at best, each composite operation can use 11 registers,
plus any pointer registers not required for the composite type, plus as much
stack space as it needs, divided up into constants and variables as necessary.
diff --git a/pixman/pixman-arm-simd-asm.h b/pixman/pixman-arm-simd-asm.h
index 24b1ad2..852a113 100644
--- a/pixman/pixman-arm-simd-asm.h
+++ b/pixman/pixman-arm-simd-asm.h
@@ -76,6 +76,8 @@
.set FLAG_SPILL_LINE_VARS, 48
.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+.set FLAG_PROCESS_PRESERVES_WK0, 0
+.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
/*
* Offset into stack where mask and source pointer/stride can be accessed.
@@ -87,6 +89,11 @@
#endif
/*
+ * Offset into stack where space allocated during init macro can be accessed.
+ */
+.set LOCALS_STACK_OFFSET, 0
+
+/*
* Constants for selecting preferable prefetch type.
*/
.set PREFETCH_TYPE_NONE, 0
@@ -359,23 +366,41 @@
.macro test_bits_1_0_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */
+ .else
movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */
+ .endif
.endm
.macro test_bits_3_2_ptr
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */
+ .else
movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */
+ .endif
.endm
.macro leading_15bytes process_head, process_tail
/* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
+ .set DECREMENT_X, 1
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ .set DECREMENT_X, 0
+ sub X, X, WK0, lsr #dst_bpp_shift
+ str X, [sp, #LINE_SAVED_REG_COUNT*4]
+ mov X, WK0
+ .endif
/* Use unaligned loads in all cases for simplicity */
.if dst_w_bpp == 8
- conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
+ conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
.elseif dst_w_bpp == 16
test_bits_1_0_ptr
- conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, 1
+ conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
+ .endif
+ conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
.endif
- conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm
.macro test_bits_3_2_pix
@@ -705,6 +730,13 @@ fname:
#endif
init
+
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ /* Reserve a word in which to store X during leading pixels */
+ sub sp, sp, #4
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
+ .endif
lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */
sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
@@ -734,6 +766,8 @@ fname:
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
151: /* New line */
newline
@@ -767,6 +801,10 @@ fname:
157: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+ .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
.endif
.ltorg
@@ -776,6 +814,8 @@ fname:
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
161: /* New line */
newline
@@ -841,12 +881,22 @@ fname:
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
+ .endif
197:
.if (flags) & FLAG_SPILL_LINE_VARS
add sp, sp, #LINE_SAVED_REG_COUNT*4
.endif
198:
+ .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
+ .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
+ .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
+ add sp, sp, #4
+ .endif
+
cleanup
#ifdef DEBUG_PARAMS
More information about the xorg-commit
mailing list