pixman: Branch 'master' - 2 commits
Siarhei Siamashka
siamashka at kemper.freedesktop.org
Mon Dec 10 10:07:48 PST 2012
pixman/pixman-combine32.c | 58 ++++++++++++++++++++++++++++++++++++++++------
test/utils-prng.c | 12 +++++++++
2 files changed, 63 insertions(+), 7 deletions(-)
New commits:
commit fdab3c1b6cd9c5e197ec3f6bc0a03da32880e317
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date: Sat Dec 8 15:16:51 2012 +0200
test: Work around unaligned MOVDQA bug (http://gcc.gnu.org/PR55614)
Just use SSE2 intrinsics to do unaligned memory accesses as
a workaround for this gcc bug related to vector extensions.
diff --git a/test/utils-prng.c b/test/utils-prng.c
index 7c2dd6a..967b898 100644
--- a/test/utils-prng.c
+++ b/test/utils-prng.c
@@ -27,6 +27,10 @@
#include "utils.h"
#include "utils-prng.h"
+#if defined(GCC_VECTOR_EXTENSIONS_SUPPORTED) && defined(__SSE2__)
+#include <xmmintrin.h>
+#endif
+
void smallprng_srand_r (smallprng_t *x, uint32_t seed)
{
uint32_t i;
@@ -77,6 +81,14 @@ store_rand_128_data (void *addr, prng_rand_128_data_t *d, int aligned)
*(uint8x16 *)addr = d->vb;
return;
}
+ else
+ {
+#ifdef __SSE2__
+ /* workaround for http://gcc.gnu.org/PR55614 */
+ _mm_storeu_si128 (addr, _mm_loadu_si128 ((__m128i *)d));
+ return;
+#endif
+ }
#endif
/* we could try something better for unaligned writes (packed attribute),
* but GCC is not very reliable: http://gcc.gnu.org/PR55454 */
commit 2bc59006d7fe91abf68a2061ad86c06e1b2964ab
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date: Fri Nov 30 12:00:47 2012 +0200
Improve performance of combine_over_u
The generic C over_u combiner can be a lot faster with the
addition of special shortcuts for 0xFF and 0x00 alpha/mask
values. These shortcuts are already implemented in the C and SSE2 fast paths.
Profiling the run of cairo-perf-trace benchmarks with the PIXMAN_DISABLE
environment variable set to "fast mmx sse2" on Intel Core i7:
=== before ===
37.32% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_over_u
21.37% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_no_repeat_8888
13.51% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_affine_none_a8r8g8b8
2.96% cairo-perf-trac libpixman-1.so.0.29.1 [.] radial_compute_color
2.74% cairo-perf-trac libpixman-1.so.0.29.1 [.] fetch_scanline_a8
2.71% cairo-perf-trac libpixman-1.so.0.29.1 [.] fetch_scanline_x8r8g8b8
2.17% cairo-perf-trac libpixman-1.so.0.29.1 [.] _pixman_gradient_walker_pixel
1.86% cairo-perf-trac libcairo.so.2.11200.0 [.] _cairo_tor_scan_converter_generate
1.57% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_affine_pad_a8r8g8b8
0.97% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_in_reverse_u
0.96% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_over_ca
=== after ===
28.79% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_no_repeat_8888
18.44% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_affine_none_a8r8g8b8
15.54% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_over_u
3.94% cairo-perf-trac libpixman-1.so.0.29.1 [.] radial_compute_color
3.69% cairo-perf-trac libpixman-1.so.0.29.1 [.] fetch_scanline_a8
3.69% cairo-perf-trac libpixman-1.so.0.29.1 [.] fetch_scanline_x8r8g8b8
2.94% cairo-perf-trac libpixman-1.so.0.29.1 [.] _pixman_gradient_walker_pixel
2.52% cairo-perf-trac libcairo.so.2.11200.0 [.] _cairo_tor_scan_converter_generate
2.08% cairo-perf-trac libpixman-1.so.0.29.1 [.] bits_image_fetch_bilinear_affine_pad_a8r8g8b8
1.31% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_in_reverse_u
1.29% cairo-perf-trac libpixman-1.so.0.29.1 [.] combine_over_ca
diff --git a/pixman/pixman-combine32.c b/pixman/pixman-combine32.c
index 54cc877..3ac7576 100644
--- a/pixman/pixman-combine32.c
+++ b/pixman/pixman-combine32.c
@@ -196,14 +196,58 @@ combine_over_u (pixman_implementation_t *imp,
{
int i;
- for (i = 0; i < width; ++i)
+ if (!mask)
{
- uint32_t s = combine_mask (src, mask, i);
- uint32_t d = *(dest + i);
- uint32_t ia = ALPHA_8 (~s);
-
- UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
- *(dest + i) = d;
+ for (i = 0; i < width; ++i)
+ {
+ uint32_t s = *(src + i);
+ uint32_t a = ALPHA_8 (s);
+ if (a == 0xFF)
+ {
+ *(dest + i) = s;
+ }
+ else if (s)
+ {
+ uint32_t d = *(dest + i);
+ uint32_t ia = a ^ 0xFF;
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+ *(dest + i) = d;
+ }
+ }
+ }
+ else
+ {
+ for (i = 0; i < width; ++i)
+ {
+ uint32_t m = ALPHA_8 (*(mask + i));
+ if (m == 0xFF)
+ {
+ uint32_t s = *(src + i);
+ uint32_t a = ALPHA_8 (s);
+ if (a == 0xFF)
+ {
+ *(dest + i) = s;
+ }
+ else if (s)
+ {
+ uint32_t d = *(dest + i);
+ uint32_t ia = a ^ 0xFF;
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+ *(dest + i) = d;
+ }
+ }
+ else if (m)
+ {
+ uint32_t s = *(src + i);
+ if (s)
+ {
+ uint32_t d = *(dest + i);
+ UN8x4_MUL_UN8 (s, m);
+ UN8x4_MUL_UN8_ADD_UN8x4 (d, ALPHA_8 (~s), s);
+ *(dest + i) = d;
+ }
+ }
+ }
}
}
More information about the xorg-commit
mailing list