pixman: Branch 'master' - 5 commits
Matt Turner
mattst88 at kemper.freedesktop.org
Sat May 26 17:33:33 PDT 2012
pixman/loongson-mmintrin.h | 21 +++
pixman/pixman-fast-path.c | 44 +++++++
pixman/pixman-mmx.c | 282 ++++++++++++++++++++++++++++++++++++++++-----
test/lowlevel-blt-bench.c | 1
4 files changed, 318 insertions(+), 30 deletions(-)
New commits:
commit 62c4bdc94f82d1e4c5dc0e58b5903382d74f3883
Author: Matt Turner <mattst88 at gmail.com>
Date: Fri May 18 01:37:07 2012 -0400
mmx: add over_reverse_n_8888
Loongson:
over_reverse_n_8888 = L1: 16.04 L2: 15.35 M: 10.20 ( 27.96%) HT: 10.95 VT: 10.45 R: 9.18 RT: 6.99 ( 76Kops/s)
over_reverse_n_8888 = L1: 27.40 L2: 26.67 M: 16.97 ( 45.78%) HT: 16.66 VT: 15.38 R: 14.15 RT: 9.44 ( 97Kops/s)
image poppler 34.106 35.500 1.48% 6/6
image poppler 29.598 30.835 1.70% 6/6
ARM/iwMMXt:
over_reverse_n_8888 = L1: 15.63 L2: 14.33 M: 10.83 ( 27.55%) HT: 9.78 VT: 9.91 R: 9.49 RT: 6.96 ( 69Kops/s)
over_reverse_n_8888 = L1: 22.79 L2: 19.40 M: 13.76 ( 34.19%) HT: 11.66 VT: 11.86 R: 11.17 RT: 7.85 ( 75Kops/s)
image poppler 38.040 38.606 1.10% 6/6
image poppler 31.686 32.278 0.80% 5/6
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index a692837..bb125bf 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -3435,6 +3435,75 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
_mm_empty ();
}
+static void
+mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint32_t src;
+ uint32_t *dst_line, *dst;
+ int32_t w;
+ int dst_stride;
+ __m64 vsrc;
+
+ CHECKPOINT ();
+
+ src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+ if (src == 0)
+ return;
+
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+ vsrc = load8888 (&src);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ w = width;
+
+ CHECKPOINT ();
+
+ while (w && (unsigned long)dst & 7)
+ {
+ __m64 vdest = load8888 (dst);
+
+ store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
+
+ w--;
+ dst++;
+ }
+
+ while (w >= 2)
+ {
+ __m64 vdest = *(__m64 *)dst;
+ __m64 dest0 = expand8888 (vdest, 0);
+ __m64 dest1 = expand8888 (vdest, 1);
+
+
+ dest0 = over (dest0, expand_alpha (dest0), vsrc);
+ dest1 = over (dest1, expand_alpha (dest1), vsrc);
+
+ *(__m64 *)dst = pack8888 (dest0, dest1);
+
+ dst += 2;
+ w -= 2;
+ }
+
+ CHECKPOINT ();
+
+ if (w)
+ {
+ __m64 vdest = load8888 (dst);
+
+ store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
+ }
+ }
+
+ _mm_empty ();
+}
+
static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
@@ -3663,6 +3732,9 @@ static const pixman_fast_path_t mmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
+
PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ),
PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
diff --git a/test/lowlevel-blt-bench.c b/test/lowlevel-blt-bench.c
index 8a39a46..b44b9f8 100644
--- a/test/lowlevel-blt-bench.c
+++ b/test/lowlevel-blt-bench.c
@@ -661,6 +661,7 @@ tests_tbl[] =
{ "outrev_n_8888_1555_ca", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a1r5g5b5 },
{ "outrev_n_8888_x888_ca", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_x8r8g8b8 },
{ "outrev_n_8888_8888_ca", PIXMAN_a8r8g8b8, 1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a8r8g8b8 },
+ { "over_reverse_n_8888", PIXMAN_a8r8g8b8, 0, PIXMAN_OP_OVER_REVERSE, PIXMAN_null, 0, PIXMAN_a8r8g8b8 },
};
int
commit 17acc7a4c707db4804b6bf47db30883745049fdb
Author: Matt Turner <mattst88 at gmail.com>
Date: Thu May 17 23:27:59 2012 -0400
mmx: add add_0565_0565
Loongson:
add_0565_0565 = L1: 15.37 L2: 14.91 M: 11.83 ( 16.06%) HT: 10.53 VT: 10.15 R: 9.74 RT: 6.19 ( 68Kops/s)
add_0565_0565 = L1: 45.06 L2: 46.71 M: 27.45 ( 38.00%) HT: 23.76 VT: 22.84 R: 18.96 RT: 9.79 ( 104Kops/s)
ARM/iwMMXt:
add_0565_0565 = L1: 12.87 L2: 11.58 M: 10.11 ( 12.50%) HT: 9.06 VT: 8.66 R: 7.70 RT: 5.62 ( 58Kops/s)
add_0565_0565 = L1: 31.14 L2: 28.87 M: 22.46 ( 28.60%) HT: 18.61 VT: 17.04 R: 15.21 RT: 9.35 ( 90Kops/s)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 70dd4e0..a692837 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -3077,6 +3077,90 @@ mmx_composite_add_8_8 (pixman_implementation_t *imp,
}
static void
+mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint16_t *dst_line, *dst;
+ uint32_t d;
+ uint16_t *src_line, *src;
+ uint32_t s;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ CHECKPOINT ();
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 7)
+ {
+ s = *src++;
+ if (s)
+ {
+ d = *dst;
+ s = CONVERT_0565_TO_8888 (s);
+ if (d)
+ {
+ d = CONVERT_0565_TO_8888 (d);
+ UN8x4_ADD_UN8x4 (s, d);
+ }
+ *dst = CONVERT_8888_TO_0565 (s);
+ }
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m64 vdest = *(__m64 *)dst;
+ __m64 vsrc = ldq_u ((__m64 *)src);
+ __m64 vd0, vd1;
+ __m64 vs0, vs1;
+
+ expand_4xpacked565 (vdest, &vd0, &vd1, 0);
+ expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
+
+ vd0 = _mm_adds_pu8 (vd0, vs0);
+ vd1 = _mm_adds_pu8 (vd1, vs1);
+
+ *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
+
+ dst += 4;
+ src += 4;
+ w -= 4;
+ }
+
+ while (w--)
+ {
+ s = *src++;
+ if (s)
+ {
+ d = *dst;
+ s = CONVERT_0565_TO_8888 (s);
+ if (d)
+ {
+ d = CONVERT_0565_TO_8888 (d);
+ UN8x4_ADD_UN8x4 (s, d);
+ }
+ *dst = CONVERT_8888_TO_0565 (s);
+ }
+ dst++;
+ }
+ }
+
+ _mm_empty ();
+}
+
+static void
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
{
@@ -3579,6 +3663,8 @@ static const pixman_fast_path_t mmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),
+ PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ),
+ PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
commit d551dc049498d17ab879fd67d47508cafaaede06
Author: Matt Turner <mattst88 at gmail.com>
Date: Thu May 17 23:29:51 2012 -0400
fast: add add_0565_0565 function
I'll need this code for header and tail alignment loops in MMX, so I
might as well implement a fast path here.
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 0a134ed..e79b069 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -810,6 +810,48 @@ fast_composite_add_8_8 (pixman_implementation_t *imp,
}
static void
+fast_composite_add_0565_0565 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint16_t *dst_line, *dst;
+ uint32_t d;
+ uint16_t *src_line, *src;
+ uint32_t s;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w--)
+ {
+ s = *src++;
+ if (s)
+ {
+ d = *dst;
+ s = CONVERT_0565_TO_8888 (s);
+ if (d)
+ {
+ d = CONVERT_0565_TO_8888 (d);
+ UN8x4_ADD_UN8x4 (s, d);
+ }
+ *dst = CONVERT_8888_TO_0565 (s);
+ }
+ dst++;
+ }
+ }
+}
+
+static void
fast_composite_add_8888_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
{
@@ -1836,6 +1878,8 @@ static const pixman_fast_path_t c_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
+ PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, fast_composite_add_0565_0565),
+ PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, fast_composite_add_0565_0565),
PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
commit f8dc0e98343c7936a37a3624721c5782e7ac309c
Author: Matt Turner <mattst88 at gmail.com>
Date: Thu May 17 13:22:18 2012 -0400
mmx: implement expand_4x565 in terms of expand_4xpacked565
Loongson:
over_n_0565 = L1: 38.57 L2: 38.88 M: 30.01 ( 20.97%) HT: 23.60 VT: 23.88 R: 21.95 RT: 11.65 ( 113Kops/s)
over_n_0565 = L1: 56.28 L2: 55.90 M: 34.20 ( 23.82%) HT: 25.66 VT: 26.60 R: 23.78 RT: 11.80 ( 115Kops/s)
over_8888_0565 = L1: 35.89 L2: 36.11 M: 21.56 ( 45.47%) HT: 18.33 VT: 17.90 R: 16.27 RT: 9.07 ( 98Kops/s)
over_8888_0565 = L1: 40.91 L2: 41.06 M: 23.13 ( 48.46%) HT: 19.24 VT: 18.71 R: 16.82 RT: 9.18 ( 99Kops/s)
over_n_8_0565 = L1: 28.92 L2: 29.12 M: 21.42 ( 30.00%) HT: 18.37 VT: 17.75 R: 16.15 RT: 8.79 ( 91Kops/s)
over_n_8_0565 = L1: 32.32 L2: 32.13 M: 22.44 ( 31.27%) HT: 19.15 VT: 18.66 R: 16.62 RT: 8.86 ( 92Kops/s)
over_n_8888_0565_ca = L1: 29.33 L2: 29.22 M: 18.99 ( 66.69%) HT: 16.69 VT: 16.22 R: 14.63 RT: 8.42 ( 88Kops/s)
over_n_8888_0565_ca = L1: 34.97 L2: 34.14 M: 20.32 ( 71.73%) HT: 17.67 VT: 17.19 R: 15.23 RT: 8.50 ( 89Kops/s)
ARM/iwMMXt:
over_n_0565 = L1: 29.70 L2: 30.53 M: 24.47 ( 14.84%) HT: 22.28 VT: 21.72 R: 21.13 RT: 12.58 ( 105Kops/s)
over_n_0565 = L1: 41.42 L2: 40.00 M: 30.95 ( 19.13%) HT: 27.06 VT: 27.28 R: 23.43 RT: 14.44 ( 114Kops/s)
over_8888_0565 = L1: 12.73 L2: 11.53 M: 9.07 ( 16.47%) HT: 9.00 VT: 9.25 R: 8.44 RT: 7.27 ( 76Kops/s)
over_8888_0565 = L1: 23.72 L2: 21.76 M: 15.89 ( 29.51%) HT: 14.36 VT: 14.05 R: 12.44 RT: 8.94 ( 86Kops/s)
over_n_8_0565 = L1: 6.80 L2: 7.15 M: 6.37 ( 7.90%) HT: 6.58 VT: 6.24 R: 6.49 RT: 5.94 ( 59Kops/s)
over_n_8_0565 = L1: 12.06 L2: 11.02 M: 10.16 ( 13.43%) HT: 9.57 VT: 8.49 R: 9.10 RT: 6.86 ( 69Kops/s)
over_n_8888_0565_ca = L1: 7.62 L2: 7.01 M: 6.27 ( 20.52%) HT: 6.00 VT: 6.07 R: 5.68 RT: 5.53 ( 57Kops/s)
over_n_8888_0565_ca = L1: 13.54 L2: 11.96 M: 9.76 ( 30.66%) HT: 9.72 VT: 8.45 R: 9.37 RT: 6.85 ( 67Kops/s)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index af34755..70dd4e0 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -529,12 +529,14 @@ expand565 (__m64 pixel, int pos)
* AARRGGBBRRGGBB
*/
static force_inline void
-expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1)
+expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
- __m64 t0, t1, alpha = _mm_cmpeq_pi32 (_mm_setzero_si64 (), _mm_setzero_si64 ());
+ __m64 t0, t1, alpha = _mm_setzero_si64 ();;
__m64 r = _mm_and_si64 (vin, MC (expand_565_r));
__m64 g = _mm_and_si64 (vin, MC (expand_565_g));
__m64 b = _mm_and_si64 (vin, MC (expand_565_b));
+ if (full_alpha)
+ alpha = _mm_cmpeq_pi32 (alpha, alpha);
/* Replicate high bits into empty low bits. */
r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
@@ -567,6 +569,17 @@ expandx888 (__m64 in, int pos)
return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}
+static force_inline void
+expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
+{
+ __m64 v0, v1;
+ expand_4xpacked565 (vin, &v0, &v1, full_alpha);
+ *vout0 = expand8888 (v0, 0);
+ *vout1 = expand8888 (v0, 1);
+ *vout2 = expand8888 (v1, 0);
+ *vout3 = expand8888 (v1, 1);
+}
+
static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
@@ -1442,11 +1455,14 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp,
while (w >= 4)
{
__m64 vdest = *(__m64 *)dst;
+ __m64 v0, v1, v2, v3;
- __m64 v0 = over (vsrc, vsrca, expand565 (vdest, 0));
- __m64 v1 = over (vsrc, vsrca, expand565 (vdest, 1));
- __m64 v2 = over (vsrc, vsrca, expand565 (vdest, 2));
- __m64 v3 = over (vsrc, vsrca, expand565 (vdest, 3));
+ expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+ v0 = over (vsrc, vsrca, v0);
+ v1 = over (vsrc, vsrca, v1);
+ v2 = over (vsrc, vsrca, v2);
+ v3 = over (vsrc, vsrca, v3);
*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
@@ -1862,16 +1878,19 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
while (w >= 4)
{
__m64 vdest = *(__m64 *)dst;
+ __m64 v0, v1, v2, v3;
+
+ expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
__m64 vsrc0 = load8888 ((src + 0));
__m64 vsrc1 = load8888 ((src + 1));
__m64 vsrc2 = load8888 ((src + 2));
__m64 vsrc3 = load8888 ((src + 3));
- __m64 v0 = over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0));
- __m64 v1 = over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1));
- __m64 v2 = over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2));
- __m64 v3 = over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3));
+ v0 = over (vsrc0, expand_alpha (vsrc0), v0);
+ v1 = over (vsrc1, expand_alpha (vsrc1), v1);
+ v2 = over (vsrc2, expand_alpha (vsrc2), v2);
+ v3 = over (vsrc3, expand_alpha (vsrc3), v3);
*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
@@ -2409,19 +2428,21 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
else if (m0 | m1 | m2 | m3)
{
__m64 vdest = *(__m64 *)dst;
+ __m64 v0, v1, v2, v3;
+
+ expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
__m64 vm0 = to_m64 (m0);
- __m64 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0),
- expand565 (vdest, 0));
+ v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
+
__m64 vm1 = to_m64 (m1);
- __m64 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1),
- expand565 (vdest, 1));
+ v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
+
__m64 vm2 = to_m64 (m2);
- __m64 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2),
- expand565 (vdest, 2));
+ v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
+
__m64 vm3 = to_m64 (m3);
- __m64 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3),
- expand565 (vdest, 3));
+ v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);;
}
@@ -2530,11 +2551,19 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
else if (s0 | s1 | s2 | s3)
{
__m64 vdest = *(__m64 *)dst;
+ __m64 v0, v1, v2, v3;
- __m64 v0 = over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0));
- __m64 v1 = over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1));
- __m64 v2 = over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2));
- __m64 v3 = over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3));
+ __m64 vsrc0 = load8888 (&s0);
+ __m64 vsrc1 = load8888 (&s1);
+ __m64 vsrc2 = load8888 (&s2);
+ __m64 vsrc3 = load8888 (&s3);
+
+ expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+ v0 = over_rev_non_pre (vsrc0, v0);
+ v1 = over_rev_non_pre (vsrc1, v1);
+ v2 = over_rev_non_pre (vsrc2, v2);
+ v3 = over_rev_non_pre (vsrc3, v3);
*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
}
@@ -2710,11 +2739,14 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
if ((m0 | m1 | m2 | m3))
{
__m64 vdest = *(__m64 *)q;
+ __m64 v0, v1, v2, v3;
+
+ expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
- __m64 v0 = in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0));
- __m64 v1 = in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1));
- __m64 v2 = in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2));
- __m64 v3 = in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3));
+ v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
+ v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
+ v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
+ v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
}
@@ -3382,7 +3414,7 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
__m64 vsrc = ldq_u ((__m64 *)src);
__m64 mm0, mm1;
- expand_4xpacked565 (vsrc, &mm0, &mm1);
+ expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
*(__m64 *)(dst + 0) = mm0;
*(__m64 *)(dst + 2) = mm1;
commit 51681a052f9e1d0970a79187974da77d9bf69450
Author: Matt Turner <mattst88 at gmail.com>
Date: Sun May 13 20:39:05 2012 -0400
mmx: add and use expand_4xpacked565 function
Loongson:
add_0565_0565 = L1: 14.39 L2: 13.98 M: 11.28 ( 15.22%) HT: 10.11 VT: 9.74 R: 9.39 RT: 6.05 ( 67Kops/s)
add_0565_0565 = L1: 15.37 L2: 14.91 M: 11.83 ( 16.06%) HT: 10.53 VT: 10.15 R: 9.74 RT: 6.19 ( 68Kops/s)
ARM/iwMMXt:
add_0565_0565 = L1: 11.12 L2: 10.40 M: 8.82 ( 10.65%) HT: 7.98 VT: 7.41 R: 7.57 RT: 5.21 ( 54Kops/s)
add_0565_0565 = L1: 12.87 L2: 11.58 M: 10.11 ( 12.50%) HT: 9.06 VT: 8.66 R: 7.70 RT: 5.62 ( 58Kops/s)
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 8295ba0..1a114fe 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -77,6 +77,17 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
return ret;
}
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("pcmpeqw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
@@ -150,6 +161,16 @@ _mm_shuffle_pi16 (__m64 __m, int64_t __n)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi16 (__m64 __m, int64_t __count)
+{
+ __m64 ret;
+ asm("psllh %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m), "f" (*(__m64 *)&__count)
+ );
+ return ret;
+}
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int64_t __count)
{
__m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index eb02d1a..af34755 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -185,6 +185,9 @@ typedef struct
mmxdatafield mmx_565_b;
mmxdatafield mmx_packed_565_rb;
mmxdatafield mmx_packed_565_g;
+ mmxdatafield mmx_expand_565_g;
+ mmxdatafield mmx_expand_565_b;
+ mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
mmxdatafield mmx_mask_0;
mmxdatafield mmx_mask_1;
@@ -216,6 +219,9 @@ static const mmx_data_t c =
MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
+ MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0),
+ MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f),
+ MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
@@ -518,6 +524,34 @@ expand565 (__m64 pixel, int pos)
return _mm_srli_pi16 (pixel, 8);
}
+/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
+ *
+ * AARRGGBBRRGGBB
+ */
+static force_inline void
+expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1)
+{
+ __m64 t0, t1, alpha = _mm_cmpeq_pi32 (_mm_setzero_si64 (), _mm_setzero_si64 ());
+ __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
+ __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
+ __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
+
+ /* Replicate high bits into empty low bits. */
+ r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
+ g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
+ b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
+
+ r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
+ g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
+ b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */
+
+ t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */
+ t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */
+
+ *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */
+ *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */
+}
+
static force_inline __m64
expand8888 (__m64 in, int pos)
{
@@ -3346,14 +3380,12 @@ mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
while (w >= 4)
{
__m64 vsrc = ldq_u ((__m64 *)src);
+ __m64 mm0, mm1;
- __m64 mm0 = expand565 (vsrc, 0);
- __m64 mm1 = expand565 (vsrc, 1);
- __m64 mm2 = expand565 (vsrc, 2);
- __m64 mm3 = expand565 (vsrc, 3);
+ expand_4xpacked565 (vsrc, &mm0, &mm1);
- *(__m64 *)(dst + 0) = _mm_or_si64 (pack8888 (mm0, mm1), MC (ff000000));
- *(__m64 *)(dst + 2) = _mm_or_si64 (pack8888 (mm2, mm3), MC (ff000000));
+ *(__m64 *)(dst + 0) = mm0;
+ *(__m64 *)(dst + 2) = mm1;
dst += 4;
src += 4;
More information about the xorg-commit
mailing list