pixman: Branch 'master'

Søren Sandmann Pedersen sandmann at kemper.freedesktop.org
Thu May 1 12:12:07 PDT 2014


 pixman/pixman-arm-simd-asm.S |   77 +++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |    7 +++
 2 files changed, 84 insertions(+)

New commits:
commit 91f32ce961bc85f98b3372b95681ad8918d24b18
Author: Ben Avison <bavison at riscosopen.org>
Date:   Thu Apr 24 13:39:06 2014 +0300

    ARMv6: Add fast path for src_x888_0565
    
    Benchmark results: "before" is upstream/master
    5f661ee719be25c3aa0eb0d45e0db23a37e76468, and "after" is the same
    commit with this patch applied on top.
    
    lowlevel-blt-bench, src_8888_0565, 100 iterations:
    
           Before          After
          Mean StdDev     Mean StdDev   Confidence   Change
    L1    25.9   0.20    115.6   0.70    100.00%    +347.1%
    L2    14.4   0.23     52.7   3.48    100.00%    +265.0%
    M     14.1   0.01     79.8   0.17    100.00%    +465.9%
    HT    10.2   0.03     32.9   0.31    100.00%    +221.2%
    VT     9.8   0.03     29.8   0.25    100.00%    +203.4%
    R      9.4   0.03     27.8   0.18    100.00%    +194.7%
    RT     4.6   0.04     10.9   0.29    100.00%    +135.9%
    
    At most 19 outliers rejected per test per set.
    
    cairo-perf-trace results with trimmed traces showed no significant difference.
    
    A system-wide perf_3.10 profile on Raspbian shows significant
    differences in the X server CPU usage. The following were measured from
    a 130x62 char lxterminal running 'dmesg' every 0.5 seconds for roughly
    30 seconds. These profiles are libpixman.so symbols only.
    
    Before:
    
    Samples: 63K of event 'cpu-clock', Event count (approx.): 2941348112, DSO: libpixman-1.so.0.33.1
     37.77%  Xorg  [.] fast_fetch_r5g6b5
     14.39%  Xorg  [.] pixman_composite_over_n_8_8888_asm_armv6
      8.51%  Xorg  [.] fast_write_back_r5g6b5
      7.38%  Xorg  [.] pixman_composite_src_8888_8888_asm_armv6
      4.39%  Xorg  [.] pixman_composite_add_8_8_asm_armv6
      3.69%  Xorg  [.] pixman_composite_src_n_8888_asm_armv6
      2.53%  Xorg  [.] _pixman_image_validate
      2.35%  Xorg  [.] pixman_image_composite32
    
    After:
    
    Samples: 31K of event 'cpu-clock', Event count (approx.): 3619782704, DSO: libpixman-1.so.0.33.1
     22.36%  Xorg  [.] pixman_composite_over_n_8_8888_asm_armv6
     13.59%  Xorg  [.] pixman_composite_src_x888_0565_asm_armv6
     12.75%  Xorg  [.] pixman_composite_src_8888_8888_asm_armv6
      6.79%  Xorg  [.] pixman_composite_add_8_8_asm_armv6
      5.95%  Xorg  [.] pixman_composite_src_n_8888_asm_armv6
      4.12%  Xorg  [.] pixman_image_composite32
      3.69%  Xorg  [.] _pixman_image_validate
      3.65%  Xorg  [.] _pixman_bits_image_setup_accessors
    
    Before, fast_fetch_r5g6b5 + fast_write_back_r5g6b5 took 46% of the
    samples in libpixman, and probably incurred some memcpy() load, too.
    After, pixman_composite_src_x888_0565_asm_armv6 takes 14%. Note that
    the sample counts differ greatly between the two profiles, because
    less time is spent in Pixman after the patch and the running time
    was not exactly the same.
    
    Furthermore, in the above test, the CPU idle function accounted for
    9% of the samples before, and 15% after.
    
    v4, Pekka Paalanen <pekka.paalanen at collabora.co.uk> :
    	Re-benchmarked on Raspberry Pi, commit message.

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index bc02ebb..7b0727b 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -304,6 +304,83 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro src_x888_0565_init
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x001F001F
+        line_saved_regs  STRIDE_S, ORIG_W
+.endm
+
+.macro src_x888_0565_1pixel  s, d
+        and     WK&d, MASK, WK&s, lsr #3           @ 00000000000rrrrr00000000000bbbbb
+        and     STRIDE_S, WK&s, #0xFC00            @ 0000000000000000gggggg0000000000
+        orr     WK&d, WK&d, WK&d, lsr #5           @ 00000000000-----rrrrr000000bbbbb
+        orr     WK&d, WK&d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
+        /* Top 16 bits are discarded during the following STRH */
+.endm
+
+.macro src_x888_0565_2pixels  slo, shi, d, tmp
+        and     SCRATCH, WK&shi, #0xFC00           @ 0000000000000000GGGGGG0000000000
+        and     WK&tmp, MASK, WK&shi, lsr #3       @ 00000000000RRRRR00000000000BBBBB
+        and     WK&shi, MASK, WK&slo, lsr #3       @ 00000000000rrrrr00000000000bbbbb
+        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB
+        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5    @ 00000000000-----RRRRRGGGGGGBBBBB
+        and     SCRATCH, WK&slo, #0xFC00           @ 0000000000000000gggggg0000000000
+        orr     WK&shi, WK&shi, WK&shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb
+        orr     WK&shi, WK&shi, SCRATCH, lsr #5    @ 00000000000-----rrrrrggggggbbbbb
+        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
+.endm
+
+.macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        WK4     .req    STRIDE_S
+        WK5     .req    STRIDE_M
+        WK6     .req    WK3
+        WK7     .req    ORIG_W
+ .if numbytes == 16
+        pixld   , 16, 4, SRC, 0
+        src_x888_0565_2pixels  4, 5, 0, 0
+        pixld   , 8, 4, SRC, 0
+        src_x888_0565_2pixels  6, 7, 1, 1
+        pixld   , 8, 6, SRC, 0
+ .else
+        pixld   , numbytes*2, 4, SRC, 0
+ .endif
+.endm
+
+.macro src_x888_0565_process_tail   cond, numbytes, firstreg
+ .if numbytes == 16
+        src_x888_0565_2pixels  4, 5, 2, 2
+        src_x888_0565_2pixels  6, 7, 3, 4
+ .elseif numbytes == 8
+        src_x888_0565_2pixels  4, 5, 1, 1
+        src_x888_0565_2pixels  6, 7, 2, 2
+ .elseif numbytes == 4
+        src_x888_0565_2pixels  4, 5, 1, 1
+ .else
+        src_x888_0565_1pixel  4, 1
+ .endif
+ .if numbytes == 16
+        pixst   , numbytes, 0, DST
+ .else
+        pixst   , numbytes, 1, DST
+ .endif
+        .unreq  WK4
+        .unreq  WK5
+        .unreq  WK6
+        .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+    3, /* prefetch distance */ \
+    src_x888_0565_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    src_x888_0565_process_head, \
+    src_x888_0565_process_tail
+
+/******************************************************************************/
+
 .macro add_8_8_8pixels  cond, dst1, dst2
         uqadd8&cond  WK&dst1, WK&dst1, MASK
         uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index c17ce5a..fa1ab5c 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -41,6 +41,8 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8,
                                    uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888,
                                    uint16_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_0565,
+                                   uint32_t, 1, uint16_t, 1)
 
 PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
                                    uint8_t, 1, uint8_t, 1)
@@ -224,6 +226,11 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888),
     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888),
 
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, armv6_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, armv6_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, armv6_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, armv6_composite_src_x888_0565),
+
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),


More information about the xorg-commit mailing list