pixman: Branch 'master' - 2 commits

Nemanja Lukic nlukic at kemper.freedesktop.org
Wed Feb 27 05:43:40 PST 2013


 pixman/pixman-mips-dspr2-asm.S |  587 +++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mips-dspr2.c     |   17 +
 2 files changed, 604 insertions(+)

New commits:
commit 5feda20fc39407879993ed4a6d861ef7f78d9432
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Wed Feb 27 14:40:51 2013 +0100

    MIPS: DSPr2: Added more fast-paths for SRC operation:
     - src_0888_8888_rev
     - src_0888_0565_rev
    
    Performance numbers before/after on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Reference (before):
            src_0888_8888_rev =  L1:  51.88  L2:  42.00  M: 19.04 ( 88.50%)  HT: 15.27  VT: 14.62  R: 14.13  RT:  7.12 (  45Kops/s)
            src_0888_0565_rev =  L1:  31.96  L2:  30.90  M: 22.60 ( 75.03%)  HT: 15.32  VT: 15.11  R: 14.49  RT:  6.64 (  43Kops/s)
    
    Optimized:
            src_0888_8888_rev =  L1: 222.73  L2: 113.70  M: 20.97 ( 97.35%)  HT: 18.31  VT: 17.14  R: 16.71  RT:  9.74 (  54Kops/s)
            src_0888_0565_rev =  L1: 100.37  L2:  74.27  M: 29.43 ( 97.63%)  HT: 22.92  VT: 21.59  R: 20.52  RT: 10.56 (  56Kops/s)
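
For readers not on MIPS, here is a minimal C model of the per-pixel conversion these two fast paths are expected to perform, assuming little-endian byte order (matching the __MIPSEL__ guard in the patch); the function names below are illustrative only, not pixman API:

    #include <stdint.h>

    /* b8g8r8 source -> x8r8g8b8 destination (src_0888_8888_rev) */
    static void
    src_0888_8888_rev_ref (uint32_t *dst, const uint8_t *src, int w)
    {
        while (w--)
        {
            /* memory byte order is R, G, B, per the comments in the asm below */
            uint8_t r = src[0], g = src[1], b = src[2];

            *dst++ = 0xff000000u | (r << 16) | (g << 8) | b;
            src += 3;
        }
    }

    /* b8g8r8 source -> r5g6b5 destination (src_0888_0565_rev) */
    static void
    src_0888_0565_rev_ref (uint16_t *dst, const uint8_t *src, int w)
    {
        while (w--)
        {
            uint8_t r = src[0], g = src[1], b = src[2];

            *dst++ = (uint16_t) (((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
            src += 3;
        }
    }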

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 299f739..3adbb2a 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -310,6 +310,395 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips)
 
 END(pixman_composite_src_x888_8888_asm_mips)
 
+#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+LEAF_MIPS_DSPR2(pixman_composite_src_0888_8888_rev_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (b8g8r8)
+ * a2 - w
+ */
+
+    beqz              a2, 6f
+     nop
+
+    lui               t8, 0xff00
+    srl               t9, a2, 2   /* t9 = how many multiples of 4 src pixels */
+    beqz              t9, 4f      /* branch if less than 4 src pixels */
+     nop
+
+    li                t0, 0x1
+    li                t1, 0x2
+    li                t2, 0x3
+    andi              t3, a1, 0x3
+    beq               t3, t0, 1f
+     nop
+    beq               t3, t1, 2f
+     nop
+    beq               t3, t2, 3f
+     nop
+
+0:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 0(a1)            /* t0 = R2 | B1 | G1 | R1 */
+    lw                t1, 4(a1)            /* t1 = G3 | R3 | B2 | G2 */
+    lw                t2, 8(a1)            /* t2 = B4 | G4 | R4 | B3 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = B1 | R2 | R1 | G1 */
+    wsbh              t1, t1               /* t1 = R3 | G3 | G2 | B2 */
+    wsbh              t2, t2               /* t2 = G4 | B4 | B3 | R4 */
+
+    packrl.ph         t3, t1, t0           /* t3 = G2 | B2 | B1 | R2 */
+    packrl.ph         t4, t0, t0           /* t4 = R1 | G1 | B1 | R2 */
+    rotr              t3, t3, 16           /* t3 = B1 | R2 | G2 | B2 */
+    or                t3, t3, t8           /* t3 = FF | R2 | G2 | B2 */
+    srl               t4, t4, 8            /* t4 =  0 | R1 | G1 | B1 */
+    or                t4, t4, t8           /* t4 = FF | R1 | G1 | B1 */
+    packrl.ph         t5, t2, t1           /* t5 = B3 | R4 | R3 | G3 */
+    rotr              t5, t5, 24           /* t5 = R4 | R3 | G3 | B3 */
+    or                t5, t5, t8           /* t5 = FF | R3 | G3 | B3 */
+    rotr              t2, t2, 16           /* t2 = B3 | R4 | G4 | B4 */
+    or                t2, t2, t8           /* t2 = FF | R4 | G4 | B4 */
+
+    sw                t4, 0(a0)
+    sw                t3, 4(a0)
+    sw                t5, 8(a0)
+    sw                t2, 12(a0)
+    b                 0b
+     addiu            a0, a0, 16
+
+1:
+    lbu               t6, 0(a1)            /* t6 =  0 |  0 |  0 | R1 */
+    lhu               t7, 1(a1)            /* t7 =  0 |  0 | B1 | G1 */
+    sll               t6, t6, 16           /* t6 =  0 | R1 |  0 | 0  */
+    wsbh              t7, t7               /* t7 =  0 |  0 | G1 | B1 */
+    or                t7, t6, t7           /* t7 =  0 | R1 | G1 | B1 */
+11:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 3(a1)            /* t0 = R3 | B2 | G2 | R2 */
+    lw                t1, 7(a1)            /* t1 = G4 | R4 | B3 | G3 */
+    lw                t2, 11(a1)           /* t2 = B5 | G5 | R5 | B4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = B2 | R3 | R2 | G2 */
+    wsbh              t1, t1               /* t1 = R4 | G4 | G3 | B3 */
+    wsbh              t2, t2               /* t2 = G5 | B5 | B4 | R5 */
+
+    packrl.ph         t3, t1, t0           /* t3 = G3 | B3 | B2 | R3 */
+    packrl.ph         t4, t2, t1           /* t4 = B4 | R5 | R4 | G4 */
+    rotr              t0, t0, 24           /* t0 = R3 | R2 | G2 | B2 */
+    rotr              t3, t3, 16           /* t3 = B2 | R3 | G3 | B3 */
+    rotr              t4, t4, 24           /* t4 = R5 | R4 | G4 | B4 */
+    or                t7, t7, t8           /* t7 = FF | R1 | G1 | B1 */
+    or                t0, t0, t8           /* t0 = FF | R2 | G2 | B2 */
+    or                t3, t3, t8           /* t3 = FF | R3 | G3 | B3 */
+    or                t4, t4, t8           /* t4 = FF | R4 | G4 | B4 */
+
+    sw                t7, 0(a0)
+    sw                t0, 4(a0)
+    sw                t3, 8(a0)
+    sw                t4, 12(a0)
+    rotr              t7, t2, 16           /* t7 = xx | R5 | G5 | B5 */
+    b                 11b
+     addiu            a0, a0, 16
+
+2:
+    lhu               t7, 0(a1)            /* t7 =  0 |  0 | G1 | R1 */
+    wsbh              t7, t7               /* t7 =  0 |  0 | R1 | G1 */
+21:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 2(a1)            /* t0 = B2 | G2 | R2 | B1 */
+    lw                t1, 6(a1)            /* t1 = R4 | B3 | G3 | R3 */
+    lw                t2, 10(a1)           /* t2 = G5 | R5 | B4 | G4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = G2 | B2 | B1 | R2 */
+    wsbh              t1, t1               /* t1 = B3 | R4 | R3 | G3 */
+    wsbh              t2, t2               /* t2 = R5 | G5 | G4 | B4 */
+
+    precr_sra.ph.w    t7, t0, 0            /* t7 = R1 | G1 | B1 | R2 */
+    rotr              t0, t0, 16           /* t0 = B1 | R2 | G2 | B2 */
+    packrl.ph         t3, t2, t1           /* t3 = G4 | B4 | B3 | R4 */
+    rotr              t1, t1, 24           /* t1 = R4 | R3 | G3 | B3 */
+    srl               t7, t7, 8            /* t7 =  0 | R1 | G1 | B1 */
+    rotr              t3, t3, 16           /* t3 = B3 | R4 | G4 | B4 */
+    or                t7, t7, t8           /* t7 = FF | R1 | G1 | B1 */
+    or                t0, t0, t8           /* t0 = FF | R2 | G2 | B2 */
+    or                t1, t1, t8           /* t1 = FF | R3 | G3 | B3 */
+    or                t3, t3, t8           /* t3 = FF | R4 | G4 | B4 */
+
+    sw                t7, 0(a0)
+    sw                t0, 4(a0)
+    sw                t1, 8(a0)
+    sw                t3, 12(a0)
+    srl               t7, t2, 16           /* t7 =  0 |  0 | R5 | G5 */
+    b                 21b
+     addiu            a0, a0, 16
+
+3:
+    lbu               t7, 0(a1)            /* t7 =  0 |  0 |  0 | R1 */
+31:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 1(a1)            /* t0 = G2 | R2 | B1 | G1 */
+    lw                t1, 5(a1)            /* t1 = B3 | G3 | R3 | B2 */
+    lw                t2, 9(a1)            /* t2 = R5 | B4 | G4 | R4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = R2 | G2 | G1 | B1 */
+    wsbh              t1, t1               /* t1 = G3 | B3 | B2 | R3 */
+    wsbh              t2, t2               /* t2 = B4 | R5 | R4 | G4 */
+
+    precr_sra.ph.w    t7, t0, 0            /* t7 = xx | R1 | G1 | B1 */
+    packrl.ph         t3, t1, t0           /* t3 = B2 | R3 | R2 | G2 */
+    rotr              t1, t1, 16           /* t1 = B2 | R3 | G3 | B3 */
+    rotr              t4, t2, 24           /* t4 = R5 | R4 | G4 | B4 */
+    rotr              t3, t3, 24           /* t3 = R3 | R2 | G2 | B2 */
+    or                t7, t7, t8           /* t7 = FF | R1 | G1 | B1 */
+    or                t3, t3, t8           /* t3 = FF | R2 | G2 | B2 */
+    or                t1, t1, t8           /* t1 = FF | R3 | G3 | B3 */
+    or                t4, t4, t8           /* t4 = FF | R4 | G4 | B4 */
+
+    sw                t7, 0(a0)
+    sw                t3, 4(a0)
+    sw                t1, 8(a0)
+    sw                t4, 12(a0)
+    srl               t7, t2, 16           /* t7 =  0 |  0 | xx | R5 */
+    b                 31b
+     addiu            a0, a0, 16
+
+4:
+    beqz              a2, 6f
+     nop
+5:
+    lbu               t0, 0(a1)            /* t0 =  0 | 0 | 0 | R */
+    lbu               t1, 1(a1)            /* t1 =  0 | 0 | 0 | G */
+    lbu               t2, 2(a1)            /* t2 =  0 | 0 | 0 | B */
+    addiu             a1, a1, 3
+
+    sll               t0, t0, 16           /* t0 =  0 | R | 0 | 0 */
+    sll               t1, t1, 8            /* t1 =  0 | 0 | G | 0 */
+
+    or                t2, t2, t1           /* t2 =  0 | 0 | G | B */
+    or                t2, t2, t0           /* t2 =  0 | R | G | B */
+    or                t2, t2, t8           /* t2 = FF | R | G | B */
+
+    sw                t2, 0(a0)
+    addiu             a2, a2, -1
+    bnez              a2, 5b
+     addiu            a0, a0, 4
+6:
+    j                 ra
+     nop
+
+END(pixman_composite_src_0888_8888_rev_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_0888_0565_rev_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (b8g8r8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0, v1
+    beqz              a2, 6f
+     nop
+
+    li                t6, 0xf800f800
+    li                t7, 0x07e007e0
+    li                t8, 0x001F001F
+    srl               t9, a2, 2   /* t9 = how many multiples of 4 src pixels */
+    beqz              t9, 4f      /* branch if less than 4 src pixels */
+     nop
+
+    li                t0, 0x1
+    li                t1, 0x2
+    li                t2, 0x3
+    andi              t3, a1, 0x3
+    beq               t3, t0, 1f
+     nop
+    beq               t3, t1, 2f
+     nop
+    beq               t3, t2, 3f
+     nop
+
+0:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 0(a1)            /* t0 = R2 | B1 | G1 | R1 */
+    lw                t1, 4(a1)            /* t1 = G3 | R3 | B2 | G2 */
+    lw                t2, 8(a1)            /* t2 = B4 | G4 | R4 | B3 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = B1 | R2 | R1 | G1 */
+    wsbh              t1, t1               /* t1 = R3 | G3 | G2 | B2 */
+    wsbh              t2, t2               /* t2 = G4 | B4 | B3 | R4 */
+
+    packrl.ph         t3, t1, t0           /* t3 = G2 | B2 | B1 | R2 */
+    packrl.ph         t4, t0, t0           /* t4 = R1 | G1 | B1 | R2 */
+    rotr              t3, t3, 16           /* t3 = B1 | R2 | G2 | B2 */
+    srl               t4, t4, 8            /* t4 =  0 | R1 | G1 | B1 */
+    packrl.ph         t5, t2, t1           /* t5 = B3 | R4 | R3 | G3 */
+    rotr              t5, t5, 24           /* t5 = R4 | R3 | G3 | B3 */
+    rotr              t2, t2, 16           /* t2 = B3 | R4 | G4 | B4 */
+
+    CONVERT_2x8888_TO_2x0565 t4, t3, t4, t3, t6, t7, t8, v0, v1
+    CONVERT_2x8888_TO_2x0565 t5, t2, t5, t2, t6, t7, t8, v0, v1
+
+    sh                t4, 0(a0)
+    sh                t3, 2(a0)
+    sh                t5, 4(a0)
+    sh                t2, 6(a0)
+    b                 0b
+     addiu            a0, a0, 8
+
+1:
+    lbu               t4, 0(a1)            /* t4 =  0 |  0 |  0 | R1 */
+    lhu               t5, 1(a1)            /* t5 =  0 |  0 | B1 | G1 */
+    sll               t4, t4, 16           /* t4 =  0 | R1 |  0 | 0  */
+    wsbh              t5, t5               /* t5 =  0 |  0 | G1 | B1 */
+    or                t5, t4, t5           /* t5 =  0 | R1 | G1 | B1 */
+11:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 3(a1)            /* t0 = R3 | B2 | G2 | R2 */
+    lw                t1, 7(a1)            /* t1 = G4 | R4 | B3 | G3 */
+    lw                t2, 11(a1)           /* t2 = B5 | G5 | R5 | B4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = B2 | R3 | R2 | G2 */
+    wsbh              t1, t1               /* t1 = R4 | G4 | G3 | B3 */
+    wsbh              t2, t2               /* t2 = G5 | B5 | B4 | R5 */
+
+    packrl.ph         t3, t1, t0           /* t3 = G3 | B3 | B2 | R3 */
+    packrl.ph         t4, t2, t1           /* t4 = B4 | R5 | R4 | G4 */
+    rotr              t0, t0, 24           /* t0 = R3 | R2 | G2 | B2 */
+    rotr              t3, t3, 16           /* t3 = B2 | R3 | G3 | B3 */
+    rotr              t4, t4, 24           /* t4 = R5 | R4 | G4 | B4 */
+
+    CONVERT_2x8888_TO_2x0565 t5, t0, t5, t0, t6, t7, t8, v0, v1
+    CONVERT_2x8888_TO_2x0565 t3, t4, t3, t4, t6, t7, t8, v0, v1
+
+    sh                t5, 0(a0)
+    sh                t0, 2(a0)
+    sh                t3, 4(a0)
+    sh                t4, 6(a0)
+    rotr              t5, t2, 16           /* t5 = xx | R5 | G5 | B5 */
+    b                 11b
+     addiu            a0, a0, 8
+
+2:
+    lhu               t5, 0(a1)            /* t5 =  0 |  0 | G1 | R1 */
+    wsbh              t5, t5               /* t5 =  0 |  0 | R1 | G1 */
+21:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 2(a1)            /* t0 = B2 | G2 | R2 | B1 */
+    lw                t1, 6(a1)            /* t1 = R4 | B3 | G3 | R3 */
+    lw                t2, 10(a1)           /* t2 = G5 | R5 | B4 | G4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = G2 | B2 | B1 | R2 */
+    wsbh              t1, t1               /* t1 = B3 | R4 | R3 | G3 */
+    wsbh              t2, t2               /* t2 = R5 | G5 | G4 | B4 */
+
+    precr_sra.ph.w    t5, t0, 0            /* t5 = R1 | G1 | B1 | R2 */
+    rotr              t0, t0, 16           /* t0 = B1 | R2 | G2 | B2 */
+    packrl.ph         t3, t2, t1           /* t3 = G4 | B4 | B3 | R4 */
+    rotr              t1, t1, 24           /* t1 = R4 | R3 | G3 | B3 */
+    srl               t5, t5, 8            /* t5 =  0 | R1 | G1 | B1 */
+    rotr              t3, t3, 16           /* t3 = B3 | R4 | G4 | B4 */
+
+    CONVERT_2x8888_TO_2x0565 t5, t0, t5, t0, t6, t7, t8, v0, v1
+    CONVERT_2x8888_TO_2x0565 t1, t3, t1, t3, t6, t7, t8, v0, v1
+
+    sh                t5, 0(a0)
+    sh                t0, 2(a0)
+    sh                t1, 4(a0)
+    sh                t3, 6(a0)
+    srl               t5, t2, 16           /* t5 =  0 |  0 | R5 | G5 */
+    b                 21b
+     addiu            a0, a0, 8
+
+3:
+    lbu               t5, 0(a1)            /* t5 =  0 |  0 |  0 | R1 */
+31:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 1(a1)            /* t0 = G2 | R2 | B1 | G1 */
+    lw                t1, 5(a1)            /* t1 = B3 | G3 | R3 | B2 */
+    lw                t2, 9(a1)            /* t2 = R5 | B4 | G4 | R4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = R2 | G2 | G1 | B1 */
+    wsbh              t1, t1               /* t1 = G3 | B3 | B2 | R3 */
+    wsbh              t2, t2               /* t2 = B4 | R5 | R4 | G4 */
+
+    precr_sra.ph.w    t5, t0, 0            /* t5 = xx | R1 | G1 | B1 */
+    packrl.ph         t3, t1, t0           /* t3 = B2 | R3 | R2 | G2 */
+    rotr              t1, t1, 16           /* t1 = B2 | R3 | G3 | B3 */
+    rotr              t4, t2, 24           /* t4 = R5 | R4 | G4 | B4 */
+    rotr              t3, t3, 24           /* t3 = R3 | R2 | G2 | B2 */
+
+    CONVERT_2x8888_TO_2x0565 t5, t3, t5, t3, t6, t7, t8, v0, v1
+    CONVERT_2x8888_TO_2x0565 t1, t4, t1, t4, t6, t7, t8, v0, v1
+
+    sh                t5, 0(a0)
+    sh                t3, 2(a0)
+    sh                t1, 4(a0)
+    sh                t4, 6(a0)
+    srl               t5, t2, 16           /* t5 =  0 |  0 | xx | R5 */
+    b                 31b
+     addiu            a0, a0, 8
+
+4:
+    beqz              a2, 6f
+     nop
+5:
+    lbu               t0, 0(a1)            /* t0 =  0 | 0 | 0 | R */
+    lbu               t1, 1(a1)            /* t1 =  0 | 0 | 0 | G */
+    lbu               t2, 2(a1)            /* t2 =  0 | 0 | 0 | B */
+    addiu             a1, a1, 3
+
+    sll               t0, t0, 16           /* t0 =  0 | R | 0 | 0 */
+    sll               t1, t1, 8            /* t1 =  0 | 0 | G | 0 */
+
+    or                t2, t2, t1           /* t2 =  0 | 0 | G | B */
+    or                t2, t2, t0           /* t2 =  0 | R | G | B */
+
+    CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5
+
+    sh                t3, 0(a0)
+    addiu             a2, a2, -1
+    bnez              a2, 5b
+     addiu            a0, a0, 2
+6:
+    RESTORE_REGS_FROM_STACK 0, v0, v1
+    j                 ra
+     nop
+
+END(pixman_composite_src_0888_0565_rev_asm_mips)
+#endif
+
 LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index cdc71cd..1ea2445 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -48,6 +48,12 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
                                     uint8_t, 3, uint8_t, 3)
+#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_8888_rev,
+                                    uint8_t, 3, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_0565_rev,
+                                    uint8_t, 3, uint16_t, 1)
+#endif
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565,
@@ -282,6 +288,10 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888),
     PIXMAN_STD_FAST_PATH (SRC, r8g8b8,   null, r8g8b8,   mips_composite_src_0888_0888),
+#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, x8r8g8b8, mips_composite_src_0888_8888_rev),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, r5g6b5,   mips_composite_src_0888_0565_rev),
+#endif
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8r8g8b8, mips_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   x8r8g8b8, mips_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8b8g8r8, mips_composite_src_n_8_8888),
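
As a quick usage illustration (not part of the commit), a SRC composite from a b8g8r8 image to an r5g6b5 image, as sketched below, is the kind of operation the new table entries above should route to mips_composite_src_0888_0565_rev on a little-endian DSPr2 build; the buffer sizes and function name are hypothetical:

    #include <pixman.h>
    #include <stdint.h>

    /* 64x1 scanlines; pixman strides are in bytes and bits must be 32-bit aligned */
    static uint32_t src_bits[64 * 3 / 4];   /* b8g8r8, 192-byte stride */
    static uint32_t dst_bits[64 * 2 / 4];   /* r5g6b5, 128-byte stride */

    void
    blit_example (void)
    {
        pixman_image_t *src = pixman_image_create_bits (PIXMAN_b8g8r8, 64, 1,
                                                        src_bits, 64 * 3);
        pixman_image_t *dst = pixman_image_create_bits (PIXMAN_r5g6b5, 64, 1,
                                                        dst_bits, 64 * 2);

        pixman_image_composite32 (PIXMAN_OP_SRC, src, NULL, dst,
                                  0, 0, 0, 0, 0, 0, 64, 1);

        pixman_image_unref (src);
        pixman_image_unref (dst);
    }
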
commit 43914d68d1c87a9da6f53e6b0a12941c97bb0e5d
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Wed Feb 27 14:39:45 2013 +0100

    MIPS: DSPr2: Added more fast-paths for OVER operation:
     - over_8888_0565
     - over_n_8_8
    
    Performance numbers before/after on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Reference (before):
            over_8888_0565 =  L1:  14.30  L2:  13.22  M: 10.43 ( 41.56%)  HT: 12.51  VT: 12.95  R: 11.82  RT:  7.34 (  49Kops/s)
                over_n_8_8 =  L1:  12.77  L2:  16.93  M: 15.03 ( 29.94%)  HT: 10.78  VT: 10.72  R: 10.29  RT:  4.92 (  33Kops/s)
    
    Optimized:
            over_8888_0565 =  L1:  26.03  L2:  22.92  M: 15.68 ( 62.43%)  HT: 16.19  VT: 16.27  R: 14.93  RT:  8.60 (  52Kops/s)
                over_n_8_8 =  L1:  62.00  L2:  55.17  M: 40.29 ( 80.23%)  HT: 26.77  VT: 25.64  R: 24.13  RT: 10.01 (  47Kops/s)
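
Both new paths implement the usual premultiplied OVER equation, dest = src + dest * (1 - alpha). A minimal C model of the over_n_8_8 case (solid source, a8 mask, a8 destination) is sketched below; the helper follows pixman's MUL_UN8 rounding, but the function names are illustrative only. over_8888_0565 applies the same per-channel math, with a 0565 -> 8888 expansion before the blend and a repack to 0565 after:

    #include <stdint.h>

    /* rounded 8-bit multiply, approximately a * b / 255 (pixman MUL_UN8 style) */
    static inline uint8_t
    mul_un8 (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t) a * b + 0x80;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    /* solid a8r8g8b8 source OVER an a8 destination, through an a8 mask */
    static void
    over_n_8_8_ref (uint8_t *dst, uint32_t src, const uint8_t *mask, int w)
    {
        uint8_t srca = src >> 24;

        while (w--)
        {
            uint8_t  s   = mul_un8 (srca, *mask++);   /* source alpha scaled by mask */
            uint8_t  d   = mul_un8 (*dst, 255 - s);   /* dest * (1 - s) */
            uint16_t sum = (uint16_t) s + d;

            *dst++ = sum > 255 ? 255 : (uint8_t) sum; /* saturate, like addu_s.qb */
        }
    }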

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index ddfacef..299f739 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -658,6 +658,126 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
 
 END(pixman_composite_over_n_8888_0565_ca_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8_asm_mips)
+/*
+ * a0 - dst  (a8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li                t9, 0x00ff00ff
+    beqz              a3, 3f
+     nop
+    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */
+    beqz              v0, 1f      /* branch if less than 4 src pixels */
+     nop
+
+    srl               t8, a1, 24           /* t8 = source alpha */
+    replv.ph          t8, t8               /* replicate srca into both halfwords */
+
+0:
+    beqz              v0, 1f
+     addiu            v0, v0, -1
+    lbu               t0, 0(a2)
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+    lbu               t4, 0(a0)
+    lbu               t5, 1(a0)
+    lbu               t6, 2(a0)
+    lbu               t7, 3(a0)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr_sra.ph.w    t5, t4, 0
+    precr_sra.ph.w    t7, t6, 0
+
+    precr.qb.ph       t0, t3, t1
+    precr.qb.ph       t1, t7, t5
+
+    muleu_s.ph.qbl    t2, t0, t8           /* mask * srca, two left bytes */
+    muleu_s.ph.qbr    t3, t0, t8           /* mask * srca, two right bytes */
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t0, t2, t3           /* t0 = srca * mask for 4 pixels */
+    not               t6, t0               /* t6 = 255 - srca * mask */
+
+    preceu.ph.qbl     t7, t6
+    preceu.ph.qbr     t6, t6
+
+    muleu_s.ph.qbl    t2, t1, t7           /* dst * (255 - srca * mask), left pair */
+    muleu_s.ph.qbr    t3, t1, t6           /* dst * (255 - srca * mask), right pair */
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t1, t2, t3
+
+    addu_s.qb         t2, t0, t1           /* srca * mask + dst * (1 - srca * mask) */
+
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4
+
+1:
+    beqz              a3, 3f
+     nop
+    srl               t8, a1, 24
+2:
+    lbu               t0, 0(a2)
+    lbu               t1, 0(a0)
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+    not               t3, t2
+    andi              t3, t3, 0x00ff
+
+
+    mul               t4, t1, t3
+    shra_r.ph         t5, t4, 8
+    andi              t5, t5, 0x00ff
+    addq.ph           t4, t4, t5
+    shra_r.ph         t4, t4, 8
+    andi              t4, t4, 0x00ff
+
+    addu_s.qb         t2, t2, t4
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1
+
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j                 ra
+     nop
+
+END(pixman_composite_over_n_8_8_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8888_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
@@ -1342,6 +1462,84 @@ LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips)
 
 END(pixman_composite_over_8888_8888_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
+    li           t4, 0x00ff00ff
+    li           s3, 0xf800f800
+    li           s4, 0x07e007e0
+    li           s5, 0x001F001F
+    beqz         a2, 3f
+     nop
+    addiu        t1, a2, -1
+    beqz         t1, 2f
+     nop
+1:
+    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw           t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lhu          t2, 0(a0) /* t2 = destination (r5g6b5) */
+    lhu          t3, 2(a0) /* t3 = destination (r5g6b5) */
+    addiu        a1, a1, 8
+
+    not          t5, t0
+    srl          t5, t5, 24 /* t5 = 255 - alpha of first source pixel */
+    not          t6, t1
+    srl          t6, t6, 24 /* t6 = 255 - alpha of second source pixel */
+
+    or           t7, t5, t6
+    beqz         t7, 11f    /* both source pixels opaque: skip the blend */
+     or          t8, t0, t1
+    beqz         t8, 12f    /* both source pixels zero: skip the store */
+
+    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, s4, s5, t7, t8, t9, s2
+    MIPS_2xUN8x4_MUL_2xUN8   s0, s1, t5, t6, t7, t8, t4, t9, t2, t3, s2, s0, s1
+
+    addu_s.qb    t0, t7, t0
+    addu_s.qb    t1, t8, t1
+11:
+    CONVERT_2x8888_TO_2x0565 t0, t1, t7, t8, s3, s4, s5, t2, t3
+    sh           t7, 0(a0)
+    sh           t8, 2(a0)
+12:
+    addiu        a2, a2, -2
+    addiu        t1, a2, -1
+    bgtz         t1, 1b
+     addiu       a0, a0, 4
+2:
+    beqz         a2, 3f
+     nop
+
+    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lhu          t1, 0(a0) /* t1 = destination (r5g6b5) */
+    addiu        a1, a1, 4
+
+    not          t2, t0
+    srl          t2, t2, 24
+
+    beqz         t2, 21f
+     nop
+    beqz         t0, 3f
+
+    CONVERT_1x0565_TO_1x8888 t1, s0, t8, t9
+    MIPS_UN8x4_MUL_UN8       s0, t2, t3, t4, t5, t6, t7
+
+    addu_s.qb    t0, t3, t0
+21:
+    CONVERT_1x8888_TO_1x0565 t0, s0, t8, t9
+    sh           s0, 0(a0)
+
+3:
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+    j            ra
+     nop
+
+END(pixman_composite_over_8888_0565_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_over_n_0565_asm_mips)
 /*
  * a0 - dst  (r5g6b5)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index e14e1c4..cdc71cd 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -50,6 +50,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
                                     uint8_t, 3, uint8_t, 3)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565,
+                                    uint32_t, 1, uint16_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
                                     uint8_t, 1, uint8_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888,
@@ -67,6 +69,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
                                        uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca,
                                        uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8,
+                                       uint8_t, 1, uint8_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
                                        uint8_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
@@ -290,6 +294,7 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5,   mips_composite_over_n_8888_0565_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5,   mips_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       mips_composite_over_n_8_8),
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8r8g8b8, mips_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8r8g8b8, mips_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8b8g8r8, mips_composite_over_n_8_8888),
@@ -318,6 +323,8 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, mips_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, mips_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, mips_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   mips_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   mips_composite_over_8888_0565),
     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       mips_composite_add_n_8_8),
     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, mips_composite_add_n_8_8888),
     PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, mips_composite_add_n_8_8888),
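
And a matching usage sketch (again hypothetical, not from the commit) for the new over_n_8_8 binding: a solid source composited OVER an a8 destination through an a8 mask, which the fast-path entry above should pick up on a DSPr2 build:

    #include <pixman.h>
    #include <stdint.h>

    static uint32_t mask_bits[64 / 4];   /* 64x1 a8 mask */
    static uint32_t a8_dst_bits[64 / 4]; /* 64x1 a8 destination */

    void
    over_example (void)
    {
        pixman_color_t  half = { 0, 0, 0, 0x8000 };   /* 16-bit channels: ~50% alpha */
        pixman_image_t *src  = pixman_image_create_solid_fill (&half);
        pixman_image_t *mask = pixman_image_create_bits (PIXMAN_a8, 64, 1, mask_bits, 64);
        pixman_image_t *dst  = pixman_image_create_bits (PIXMAN_a8, 64, 1, a8_dst_bits, 64);

        pixman_image_composite32 (PIXMAN_OP_OVER, src, mask, dst,
                                  0, 0, 0, 0, 0, 0, 64, 1);

        pixman_image_unref (src);
        pixman_image_unref (mask);
        pixman_image_unref (dst);
    }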

