pixman: Branch 'master' - 19 commits
Søren Sandmann Pedersen
sandmann at kemper.freedesktop.org
Sun Jun 8 16:57:42 PDT 2008
configure.ac | 33 +
pixman/Makefile.am | 21 -
pixman/combine.h.inc | 215 ++++++++++
pixman/combine.inc | 199 ---------
pixman/combine.pl | 53 +-
pixman/pixman-pict.c | 70 +++
pixman/pixman-vmx.c | 1068 +++++++++++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-vmx.h | 308 ++++++++++++++
8 files changed, 1740 insertions(+), 227 deletions(-)
New commits:
commit fb8f17fdf1eaec4ab8edba1486bfa83c0965d738
Merge: 9267b0b... 1063933...
Author: Søren Sandmann <sandmann at redhat.com>
Date: Sun Jun 8 19:55:43 2008 -0400
Merge branch 'vmx'
commit 1063933bacb8b5d06b42b7b06a116339ce7c1f0c
Author: Søren Sandmann <sandmann at redhat.com>
Date: Sun Jun 8 19:55:35 2008 -0400
Rename pixman-combine.h -> pixman-combine32.h
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index de9327c..377f958 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -28,9 +28,9 @@ libpixmanincludedir = $(includedir)/pixman-1/
libpixmaninclude_HEADERS = pixman.h pixman-version.h
noinst_LTLIBRARIES =
-pixman-combine32.c : combine.inc pixman-combine.h combine.pl
+pixman-combine32.c : combine.inc pixman-combine32.h combine.pl
$(PERL) $(srcdir)/combine.pl 8 < $(srcdir)/combine.inc > $@ || ($(RM) $@; exit 1)
-pixman-combine.h : combine.h.inc combine.pl
+pixman-combine32.h : combine.h.inc combine.pl
$(PERL) $(srcdir)/combine.pl 8 < $(srcdir)/combine.h.inc > $@ || ($(RM) $@; exit 1)
pixman-combine64.c : combine.inc pixman-combine64.h combine.pl
@@ -39,7 +39,7 @@ pixman-combine64.h : combine.h.inc combine.pl
$(PERL) $(srcdir)/combine.pl 16 < $(srcdir)/combine.h.inc > $@ || ($(RM) $@; exit 1)
EXTRA_DIST = Makefile.win32 combine.inc combine.pl
-CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine.h pixman-combine64.h
+CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-combine64.h
# mmx code
if USE_MMX
@@ -58,7 +58,7 @@ noinst_LTLIBRARIES += libpixman-vmx.la
libpixman_vmx_la_SOURCES = \
pixman-vmx.c \
pixman-vmx.h \
- pixman-combine.h
+ pixman-combine32.h
libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
libpixman_vmx_la_LIBADD = $(DEP_LIBS)
libpixman_1_la_LIBADD += libpixman-vmx.la
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 76f3592..8c8a2a3 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -27,7 +27,7 @@
#include <config.h>
#include "pixman-vmx.h"
-#include "pixman-combine.h"
+#include "pixman-combine32.h"
#include <altivec.h>
#ifdef __GNUC__
commit 567b4c255050ee3cc2dd0c03fb091d1f981332eb
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Sat Jun 7 19:38:01 2008 +0200
Use sigaction instead of signal to restore the previous handler
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index 948c666..e49a864 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -1923,15 +1923,19 @@ pixman_bool_t pixman_have_vmx (void) {
#else
#include <signal.h>
-static void vmx_test (int sig) {
+static void vmx_test(int sig, siginfo_t *si, void *unused) {
have_vmx = FALSE;
}
pixman_bool_t pixman_have_vmx (void) {
+ struct sigaction sa, osa;
if (!initialized) {
- signal(SIGILL, vmx_test);
+ sa.sa_flags = SA_SIGINFO;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_sigaction = vmx_test;
+ sigaction(SIGILL, &sa, &osa);
asm volatile ( "vor 0, 0, 0" );
- signal(SIGILL, SIG_DFL);
+ sigaction(SIGILL, &osa, NULL);
initialized = TRUE;
}
return have_vmx;
commit 7ef19261ee5bb4c78ca55533c67e1f267faed61e
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Sat Jun 7 19:28:10 2008 +0200
Use combine macros from the generated header
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index f0770a6..de9327c 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -56,8 +56,9 @@ endif
if USE_VMX
noinst_LTLIBRARIES += libpixman-vmx.la
libpixman_vmx_la_SOURCES = \
- pixman-vmx.c \
- pixman-vmx.h
+ pixman-vmx.c \
+ pixman-vmx.h \
+ pixman-combine.h
libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
libpixman_vmx_la_LIBADD = $(DEP_LIBS)
libpixman_1_la_LIBADD += libpixman-vmx.la
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 87dc4d1..76f3592 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -27,172 +27,13 @@
#include <config.h>
#include "pixman-vmx.h"
+#include "pixman-combine.h"
#include <altivec.h>
#ifdef __GNUC__
# define inline __inline__ __attribute__ ((__always_inline__))
#endif
-#define Alpha(x) ((x) >> 24)
-
-/*
- x_c = (x_c * a) / 255
-*/
-#define FbByteMul(x, a) do { \
- uint32_t t = ((x & 0xff00ff) * a) + 0x800080; \
- t = (t + ((t >> 8) & 0xff00ff)) >> 8; \
- t &= 0xff00ff; \
- \
- x = (((x >> 8) & 0xff00ff) * a) + 0x800080; \
- x = (x + ((x >> 8) & 0xff00ff)); \
- x &= 0xff00ff00; \
- x += t; \
- } while (0)
-
-/*
- x_c = (x_c * a) / 255 + y
-*/
-#define FbByteMulAdd(x, a, y) do { \
- uint32_t t = ((x & 0xff00ff) * a) + 0x800080; \
- t = (t + ((t >> 8) & 0xff00ff)) >> 8; \
- t &= 0xff00ff; \
- t += y & 0xff00ff; \
- t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
- t &= 0xff00ff; \
- \
- x = (((x >> 8) & 0xff00ff) * a) + 0x800080; \
- x = (x + ((x >> 8) & 0xff00ff)) >> 8; \
- x &= 0xff00ff; \
- x += (y >> 8) & 0xff00ff; \
- x |= 0x1000100 - ((x >> 8) & 0xff00ff); \
- x &= 0xff00ff; \
- x <<= 8; \
- x += t; \
- } while (0)
-
-/*
- x_c = (x_c * a + y_c * b) / 255
-*/
-#define FbByteAddMul(x, a, y, b) do { \
- uint32_t t; \
- uint32_t r = (x >> 24) * a + (y >> 24) * b + 0x80; \
- r += (r >> 8); \
- r >>= 8; \
- \
- t = (x & 0xff00) * a + (y & 0xff00) * b; \
- t += (t >> 8) + 0x8000; \
- t >>= 16; \
- \
- t |= r << 16; \
- t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
- t &= 0xff00ff; \
- t <<= 8; \
- \
- r = ((x >> 16) & 0xff) * a + ((y >> 16) & 0xff) * b + 0x80; \
- r += (r >> 8); \
- r >>= 8; \
- \
- x = (x & 0xff) * a + (y & 0xff) * b + 0x80; \
- x += (x >> 8); \
- x >>= 8; \
- x |= r << 16; \
- x |= 0x1000100 - ((x >> 8) & 0xff00ff); \
- x &= 0xff00ff; \
- x |= t; \
- } while (0)
-
-/*
- x_c = (x_c * a_c) / 255
-*/
-#define FbByteMulC(x, a) do { \
- uint32_t t; \
- uint32_t r = (x & 0xff) * (a & 0xff); \
- r |= (x & 0xff0000) * ((a >> 16) & 0xff); \
- r += 0x800080; \
- r = (r + ((r >> 8) & 0xff00ff)) >> 8; \
- r &= 0xff00ff; \
- \
- x >>= 8; \
- t = (x & 0xff) * ((a >> 8) & 0xff); \
- t |= (x & 0xff0000) * (a >> 24); \
- t += 0x800080; \
- t = t + ((t >> 8) & 0xff00ff); \
- x = r | (t & 0xff00ff00); \
- \
- } while (0)
-
-/*
- x_c = (x_c * a) / 255 + y
-*/
-#define FbByteMulAddC(x, a, y) do { \
- uint32_t t; \
- uint32_t r = (x & 0xff) * (a & 0xff); \
- r |= (x & 0xff0000) * ((a >> 16) & 0xff); \
- r += 0x800080; \
- r = (r + ((r >> 8) & 0xff00ff)) >> 8; \
- r &= 0xff00ff; \
- r += y & 0xff00ff; \
- r |= 0x1000100 - ((r >> 8) & 0xff00ff); \
- r &= 0xff00ff; \
- \
- x >>= 8; \
- t = (x & 0xff) * ((a >> 8) & 0xff); \
- t |= (x & 0xff0000) * (a >> 24); \
- t += 0x800080; \
- t = (t + ((t >> 8) & 0xff00ff)) >> 8; \
- t &= 0xff00ff; \
- t += (y >> 8) & 0xff00ff; \
- t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
- t &= 0xff00ff; \
- x = r | (t << 8); \
- } while (0)
-
-/*
- x_c = (x_c * a_c + y_c * b) / 255
-*/
-#define FbByteAddMulC(x, a, y, b) do { \
- uint32_t t; \
- uint32_t r = (x >> 24) * (a >> 24) + (y >> 24) * b; \
- r += (r >> 8) + 0x80; \
- r >>= 8; \
- \
- t = (x & 0xff00) * ((a >> 8) & 0xff) + (y & 0xff00) * b; \
- t += (t >> 8) + 0x8000; \
- t >>= 16; \
- \
- t |= r << 16; \
- t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
- t &= 0xff00ff; \
- t <<= 8; \
- \
- r = ((x >> 16) & 0xff) * ((a >> 16) & 0xff) + ((y >> 16) & 0xff) * b + 0x80; \
- r += (r >> 8); \
- r >>= 8; \
- \
- x = (x & 0xff) * (a & 0xff) + (y & 0xff) * b + 0x80; \
- x += (x >> 8); \
- x >>= 8; \
- x |= r << 16; \
- x |= 0x1000100 - ((x >> 8) & 0xff00ff); \
- x &= 0xff00ff; \
- x |= t; \
- } while (0)
-
-/*
- x_c = min(x_c + y_c, 255)
-*/
-#define FbByteAdd(x, y) do { \
- uint32_t t; \
- uint32_t r = (x & 0xff00ff) + (y & 0xff00ff); \
- r |= 0x1000100 - ((r >> 8) & 0xff00ff); \
- r &= 0xff00ff; \
- \
- t = ((x >> 8) & 0xff00ff) + ((y >> 8) & 0xff00ff); \
- t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
- r |= (t & 0xff00ff) << 8; \
- x = r; \
- } while (0)
-
static inline vector unsigned int
splat_alpha (vector unsigned int pix) {
return vec_perm (pix, pix,
commit 795fd8a4c0f9417fb92beaff8595064c573b7652
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Sat Jun 7 19:25:09 2008 +0200
Split combine.inc generated files in source and header
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index effa959..f0770a6 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -28,14 +28,18 @@ libpixmanincludedir = $(includedir)/pixman-1/
libpixmaninclude_HEADERS = pixman.h pixman-version.h
noinst_LTLIBRARIES =
-pixman-combine32.c : combine.inc combine.pl
+pixman-combine32.c : combine.inc pixman-combine.h combine.pl
$(PERL) $(srcdir)/combine.pl 8 < $(srcdir)/combine.inc > $@ || ($(RM) $@; exit 1)
+pixman-combine.h : combine.h.inc combine.pl
+ $(PERL) $(srcdir)/combine.pl 8 < $(srcdir)/combine.h.inc > $@ || ($(RM) $@; exit 1)
-pixman-combine64.c : combine.inc combine.pl
+pixman-combine64.c : combine.inc pixman-combine64.h combine.pl
$(PERL) $(srcdir)/combine.pl 16 < $(srcdir)/combine.inc > $@ || ($(RM) $@; exit 1)
+pixman-combine64.h : combine.h.inc combine.pl
+ $(PERL) $(srcdir)/combine.pl 16 < $(srcdir)/combine.h.inc > $@ || ($(RM) $@; exit 1)
EXTRA_DIST = Makefile.win32 combine.inc combine.pl
-CLEANFILES = pixman-combine32.c pixman-combine64.c
+CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine.h pixman-combine64.h
# mmx code
if USE_MMX
diff --git a/pixman/combine.h.inc b/pixman/combine.h.inc
new file mode 100644
index 0000000..7dd97ae
--- /dev/null
+++ b/pixman/combine.h.inc
@@ -0,0 +1,215 @@
+
+#define COMPONENT_SIZE
+#define MASK
+#define ONE_HALF
+
+#define G_SHIFT
+#define B_SHIFT
+#define A_SHIFT
+#define G_MASK
+#define B_MASK
+#define A_MASK
+
+#define RB_MASK
+#define AG_MASK
+#define RB_ONE_HALF
+#define RB_MASK_PLUS_ONE
+
+#define Alpha(x) ((x) >> A_SHIFT)
+
+/*
+ * Helper macros.
+ */
+
+#define IntMult(a,b,t) ( (t) = (a) * (b) + ONE_HALF, ( ( ( (t)>>G_SHIFT ) + (t) )>>G_SHIFT ) )
+#define IntDiv(a,b) (((comp2_t) (a) * MASK) / (b))
+
+#define GetComp(v,i) ((comp2_t) (comp1_t) ((v) >> i))
+
+#define Add(x,y,i,t) ((t) = GetComp(x,i) + GetComp(y,i), \
+ (comp4_t) ((comp1_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i))
+
+#define FbGen(x,y,i,ax,ay,t,u,v) ((t) = (IntMult(GetComp(y,i),ay,(u)) + \
+ IntMult(GetComp(x,i),ax,(v))), \
+ (comp4_t) ((comp1_t) ((t) | \
+ (0 - ((t) >> G_SHIFT)))) << (i))
+
+/*
+ The methods below use some tricks to be able to do two color
+ components at the same time.
+*/
+
+/*
+ x_c = (x_c * a) / 255
+*/
+#define FbByteMul(x, a) do { \
+ comp4_t t = ((x & RB_MASK) * a) + RB_ONE_HALF; \
+ t = (t + ((t >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE; \
+ t &= RB_MASK; \
+ \
+ x = (((x >> COMPONENT_SIZE) & RB_MASK) * a) + RB_ONE_HALF; \
+ x = (x + ((x >> COMPONENT_SIZE) & RB_MASK)); \
+ x &= RB_MASK << COMPONENT_SIZE; \
+ x += t; \
+ } while (0)
+
+/*
+ x_c = (x_c * a) / 255 + y
+*/
+#define FbByteMulAdd(x, a, y) do { \
+ comp4_t t = ((x & RB_MASK) * a) + RB_ONE_HALF; \
+ t = (t + ((t >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE; \
+ t &= RB_MASK; \
+ t += y & RB_MASK; \
+ t |= RB_MASK_PLUS_ONE - ((t >> COMPONENT_SIZE) & RB_MASK); \
+ t &= RB_MASK; \
+ \
+ x = (((x >> COMPONENT_SIZE) & RB_MASK) * a) + RB_ONE_HALF; \
+ x = (x + ((x >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE; \
+ x &= RB_MASK; \
+ x += (y >> COMPONENT_SIZE) & RB_MASK; \
+ x |= RB_MASK_PLUS_ONE - ((x >> COMPONENT_SIZE) & RB_MASK); \
+ x &= RB_MASK; \
+ x <<= COMPONENT_SIZE; \
+ x += t; \
+ } while (0)
+
+/*
+ x_c = (x_c * a + y_c * b) / 255
+*/
+#define FbByteAddMul(x, a, y, b) do { \
+ comp4_t t; \
+ comp4_t r = (x >> A_SHIFT) * a + (y >> A_SHIFT) * b + ONE_HALF; \
+ r += (r >> G_SHIFT); \
+ r >>= G_SHIFT; \
+ \
+ t = (x & G_MASK) * a + (y & G_MASK) * b; \
+ t += (t >> G_SHIFT) + (ONE_HALF << G_SHIFT); \
+ t >>= B_SHIFT; \
+ \
+ t |= r << B_SHIFT; \
+ t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \
+ t &= RB_MASK; \
+ t <<= G_SHIFT; \
+ \
+ r = ((x >> B_SHIFT) & MASK) * a + \
+ ((y >> B_SHIFT) & MASK) * b + ONE_HALF; \
+ r += (r >> G_SHIFT); \
+ r >>= G_SHIFT; \
+ \
+ x = (x & MASK) * a + (y & MASK) * b + ONE_HALF; \
+ x += (x >> G_SHIFT); \
+ x >>= G_SHIFT; \
+ x |= r << B_SHIFT; \
+ x |= RB_MASK_PLUS_ONE - ((x >> G_SHIFT) & RB_MASK); \
+ x &= RB_MASK; \
+ x |= t; \
+ } while (0)
+
+/*
+ x_c = (x_c * a + y_c *b) / 256
+*/
+#define FbByteAddMul_256(x, a, y, b) do { \
+ comp4_t t = (x & RB_MASK) * a + (y & RB_MASK) * b; \
+ t >>= G_SHIFT; \
+ t &= RB_MASK; \
+ \
+ x = ((x >> G_SHIFT) & RB_MASK) * a + \
+ ((y >> G_SHIFT) & RB_MASK) * b; \
+ x &= AG_MASK; \
+ x += t; \
+ } while (0)
+
+/*
+ x_c = (x_c * a_c) / 255
+*/
+#define FbByteMulC(x, a) do { \
+ comp4_t t; \
+ comp4_t r = (x & MASK) * (a & MASK); \
+ r |= (x & B_MASK) * ((a >> B_SHIFT) & MASK); \
+ r += RB_ONE_HALF; \
+ r = (r + ((r >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \
+ r &= RB_MASK; \
+ \
+ x >>= G_SHIFT; \
+ t = (x & MASK) * ((a >> G_SHIFT) & MASK); \
+ t |= (x & B_MASK) * (a >> A_SHIFT); \
+ t += RB_ONE_HALF; \
+ t = t + ((t >> G_SHIFT) & RB_MASK); \
+ x = r | (t & AG_MASK); \
+ } while (0)
+
+/*
+ x_c = (x_c * a) / 255 + y
+*/
+#define FbByteMulAddC(x, a, y) do { \
+ comp4_t t; \
+ comp4_t r = (x & MASK) * (a & MASK); \
+ r |= (x & B_MASK) * ((a >> B_SHIFT) & MASK); \
+ r += RB_ONE_HALF; \
+ r = (r + ((r >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \
+ r &= RB_MASK; \
+ r += y & RB_MASK; \
+ r |= RB_MASK_PLUS_ONE - ((r >> G_SHIFT) & RB_MASK); \
+ r &= RB_MASK; \
+ \
+ x >>= G_SHIFT; \
+ t = (x & MASK) * ((a >> G_SHIFT) & MASK); \
+ t |= (x & B_MASK) * (a >> A_SHIFT); \
+ t += RB_ONE_HALF; \
+ t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \
+ t &= RB_MASK; \
+ t += (y >> G_SHIFT) & RB_MASK; \
+ t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \
+ t &= RB_MASK; \
+ x = r | (t << G_SHIFT); \
+ } while (0)
+
+/*
+ x_c = (x_c * a_c + y_c * b) / 255
+*/
+#define FbByteAddMulC(x, a, y, b) do { \
+ comp4_t t; \
+ comp4_t r = (x >> A_SHIFT) * (a >> A_SHIFT) + \
+ (y >> A_SHIFT) * b; \
+ r += (r >> G_SHIFT) + ONE_HALF; \
+ r >>= G_SHIFT; \
+ \
+ t = (x & G_MASK) * ((a >> G_SHIFT) & MASK) + (y & G_MASK) * b; \
+ t += (t >> G_SHIFT) + (ONE_HALF << G_SHIFT); \
+ t >>= B_SHIFT; \
+ \
+ t |= r << B_SHIFT; \
+ t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \
+ t &= RB_MASK; \
+ t <<= G_SHIFT; \
+ \
+ r = ((x >> B_SHIFT) & MASK) * ((a >> B_SHIFT) & MASK) + \
+ ((y >> B_SHIFT) & MASK) * b + ONE_HALF; \
+ r += (r >> G_SHIFT); \
+ r >>= G_SHIFT; \
+ \
+ x = (x & MASK) * (a & MASK) + (y & MASK) * b + ONE_HALF; \
+ x += (x >> G_SHIFT); \
+ x >>= G_SHIFT; \
+ x |= r << B_SHIFT; \
+ x |= RB_MASK_PLUS_ONE - ((x >> G_SHIFT) & RB_MASK); \
+ x &= RB_MASK; \
+ x |= t; \
+ } while (0)
+
+/*
+ x_c = min(x_c + y_c, 255)
+*/
+#define FbByteAdd(x, y) do { \
+ comp4_t t; \
+ comp4_t r = (x & RB_MASK) + (y & RB_MASK); \
+ r |= RB_MASK_PLUS_ONE - ((r >> G_SHIFT) & RB_MASK); \
+ r &= RB_MASK; \
+ \
+ t = ((x >> G_SHIFT) & RB_MASK) + ((y >> G_SHIFT) & RB_MASK); \
+ t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \
+ r |= (t & RB_MASK) << G_SHIFT; \
+ x = r; \
+ } while (0)
+
diff --git a/pixman/combine.inc b/pixman/combine.inc
index 63a3fe1..9f88dee 100644
--- a/pixman/combine.inc
+++ b/pixman/combine.inc
@@ -6,204 +6,7 @@
#include "pixman-private.h"
-#define Alpha(x) ((x) >> A_SHIFT)
-
-/*
- * Helper macros.
- */
-
-#define IntMult(a,b,t) ( (t) = (a) * (b) + ONE_HALF, ( ( ( (t)>>G_SHIFT ) + (t) )>>G_SHIFT ) )
-#define IntDiv(a,b) (((comp2_t) (a) * MASK) / (b))
-
-#define GetComp(v,i) ((comp2_t) (comp1_t) ((v) >> i))
-
-#define Add(x,y,i,t) ((t) = GetComp(x,i) + GetComp(y,i), \
- (comp4_t) ((comp1_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i))
-
-#define FbGen(x,y,i,ax,ay,t,u,v) ((t) = (IntMult(GetComp(y,i),ay,(u)) + \
- IntMult(GetComp(x,i),ax,(v))), \
- (comp4_t) ((comp1_t) ((t) | \
- (0 - ((t) >> G_SHIFT)))) << (i))
-
-/*
- The methods below use some tricks to be able to do two color
- components at the same time.
-*/
-
-/*
- x_c = (x_c * a) / 255
-*/
-#define FbByteMul(x, a) do { \
- comp4_t t = ((x & RB_MASK) * a) + RB_ONE_HALF; \
- t = (t + ((t >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE; \
- t &= RB_MASK; \
- \
- x = (((x >> COMPONENT_SIZE) & RB_MASK) * a) + RB_ONE_HALF; \
- x = (x + ((x >> COMPONENT_SIZE) & RB_MASK)); \
- x &= RB_MASK << COMPONENT_SIZE; \
- x += t; \
- } while (0)
-
-/*
- x_c = (x_c * a) / 255 + y
-*/
-#define FbByteMulAdd(x, a, y) do { \
- comp4_t t = ((x & RB_MASK) * a) + RB_ONE_HALF; \
- t = (t + ((t >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE; \
- t &= RB_MASK; \
- t += y & RB_MASK; \
- t |= RB_MASK_PLUS_ONE - ((t >> COMPONENT_SIZE) & RB_MASK); \
- t &= RB_MASK; \
- \
- x = (((x >> COMPONENT_SIZE) & RB_MASK) * a) + RB_ONE_HALF; \
- x = (x + ((x >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE; \
- x &= RB_MASK; \
- x += (y >> COMPONENT_SIZE) & RB_MASK; \
- x |= RB_MASK_PLUS_ONE - ((x >> COMPONENT_SIZE) & RB_MASK); \
- x &= RB_MASK; \
- x <<= COMPONENT_SIZE; \
- x += t; \
- } while (0)
-
-/*
- x_c = (x_c * a + y_c * b) / 255
-*/
-#define FbByteAddMul(x, a, y, b) do { \
- comp4_t t; \
- comp4_t r = (x >> A_SHIFT) * a + (y >> A_SHIFT) * b + ONE_HALF; \
- r += (r >> G_SHIFT); \
- r >>= G_SHIFT; \
- \
- t = (x & G_MASK) * a + (y & G_MASK) * b; \
- t += (t >> G_SHIFT) + (ONE_HALF << G_SHIFT); \
- t >>= B_SHIFT; \
- \
- t |= r << B_SHIFT; \
- t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \
- t &= RB_MASK; \
- t <<= G_SHIFT; \
- \
- r = ((x >> B_SHIFT) & MASK) * a + \
- ((y >> B_SHIFT) & MASK) * b + ONE_HALF; \
- r += (r >> G_SHIFT); \
- r >>= G_SHIFT; \
- \
- x = (x & MASK) * a + (y & MASK) * b + ONE_HALF; \
- x += (x >> G_SHIFT); \
- x >>= G_SHIFT; \
- x |= r << B_SHIFT; \
- x |= RB_MASK_PLUS_ONE - ((x >> G_SHIFT) & RB_MASK); \
- x &= RB_MASK; \
- x |= t; \
- } while (0)
-
-/*
- x_c = (x_c * a + y_c *b) / 256
-*/
-#define FbByteAddMul_256(x, a, y, b) do { \
- comp4_t t = (x & RB_MASK) * a + (y & RB_MASK) * b; \
- t >>= G_SHIFT; \
- t &= RB_MASK; \
- \
- x = ((x >> G_SHIFT) & RB_MASK) * a + \
- ((y >> G_SHIFT) & RB_MASK) * b; \
- x &= AG_MASK; \
- x += t; \
- } while (0)
-
-/*
- x_c = (x_c * a_c) / 255
-*/
-#define FbByteMulC(x, a) do { \
- comp4_t t; \
- comp4_t r = (x & MASK) * (a & MASK); \
- r |= (x & B_MASK) * ((a >> B_SHIFT) & MASK); \
- r += RB_ONE_HALF; \
- r = (r + ((r >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \
- r &= RB_MASK; \
- \
- x >>= G_SHIFT; \
- t = (x & MASK) * ((a >> G_SHIFT) & MASK); \
- t |= (x & B_MASK) * (a >> A_SHIFT); \
- t += RB_ONE_HALF; \
- t = t + ((t >> G_SHIFT) & RB_MASK); \
- x = r | (t & AG_MASK); \
- } while (0)
-
-/*
- x_c = (x_c * a) / 255 + y
-*/
-#define FbByteMulAddC(x, a, y) do { \
- comp4_t t; \
- comp4_t r = (x & MASK) * (a & MASK); \
- r |= (x & B_MASK) * ((a >> B_SHIFT) & MASK); \
- r += RB_ONE_HALF; \
- r = (r + ((r >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \
- r &= RB_MASK; \
- r += y & RB_MASK; \
- r |= RB_MASK_PLUS_ONE - ((r >> G_SHIFT) & RB_MASK); \
- r &= RB_MASK; \
- \
- x >>= G_SHIFT; \
- t = (x & MASK) * ((a >> G_SHIFT) & MASK); \
- t |= (x & B_MASK) * (a >> A_SHIFT); \
- t += RB_ONE_HALF; \
- t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \
- t &= RB_MASK; \
- t += (y >> G_SHIFT) & RB_MASK; \
- t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \
- t &= RB_MASK; \
- x = r | (t << G_SHIFT); \
- } while (0)
-
-/*
- x_c = (x_c * a_c + y_c * b) / 255
-*/
-#define FbByteAddMulC(x, a, y, b) do { \
- comp4_t t; \
- comp4_t r = (x >> A_SHIFT) * (a >> A_SHIFT) + \
- (y >> A_SHIFT) * b; \
- r += (r >> G_SHIFT) + ONE_HALF; \
- r >>= G_SHIFT; \
- \
- t = (x & G_MASK) * ((a >> G_SHIFT) & MASK) + (y & G_MASK) * b; \
- t += (t >> G_SHIFT) + (ONE_HALF << G_SHIFT); \
- t >>= B_SHIFT; \
- \
- t |= r << B_SHIFT; \
- t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \
- t &= RB_MASK; \
- t <<= G_SHIFT; \
- \
- r = ((x >> B_SHIFT) & MASK) * ((a >> B_SHIFT) & MASK) + \
- ((y >> B_SHIFT) & MASK) * b + ONE_HALF; \
- r += (r >> G_SHIFT); \
- r >>= G_SHIFT; \
- \
- x = (x & MASK) * (a & MASK) + (y & MASK) * b + ONE_HALF; \
- x += (x >> G_SHIFT); \
- x >>= G_SHIFT; \
- x |= r << B_SHIFT; \
- x |= RB_MASK_PLUS_ONE - ((x >> G_SHIFT) & RB_MASK); \
- x &= RB_MASK; \
- x |= t; \
- } while (0)
-
-/*
- x_c = min(x_c + y_c, 255)
-*/
-#define FbByteAdd(x, y) do { \
- comp4_t t; \
- comp4_t r = (x & RB_MASK) + (y & RB_MASK); \
- r |= RB_MASK_PLUS_ONE - ((r >> G_SHIFT) & RB_MASK); \
- r &= RB_MASK; \
- \
- t = ((x >> G_SHIFT) & RB_MASK) + ((y >> G_SHIFT) & RB_MASK); \
- t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \
- r |= (t & RB_MASK) << G_SHIFT; \
- x = r; \
- } while (0)
-
+#include "pixman-combine.h"
/*
* There are two ways of handling alpha -- either as a single unified value or
diff --git a/pixman/combine.pl b/pixman/combine.pl
index ba13d6c..7258ff3 100644
--- a/pixman/combine.pl
+++ b/pixman/combine.pl
@@ -27,30 +27,38 @@ print "/* WARNING: This file is generated by combine.pl from combine.inc.\n";
print " Please edit one of those files rather than this one. */\n";
print "\n";
-# Mask and 1/2 value for a single component.
-print "#define COMPONENT_SIZE ", $size, "\n";
-print "#define MASK ", mask($mask), "\n";
-print "#define ONE_HALF ", mask($one_half), "\n";
-print "\n";
-
-# Shifts and masks for green, blue, and alpha.
-print "#define G_SHIFT ", $size, "\n";
-print "#define B_SHIFT ", $size * 2, "\n";
-print "#define A_SHIFT ", $size * 3, "\n";
-print "#define G_MASK ", mask($mask . $zero_mask), "\n";
-print "#define B_MASK ", mask($mask . $zero_mask x 2), "\n";
-print "#define A_MASK ", mask($mask . $zero_mask x 3), "\n";
-print "\n";
+print "#line 1 \"combine.inc\"\n";
-# Special values for dealing with red + blue at the same time.
-print "#define RB_MASK ", mask($mask . $zero_mask . $mask), "\n";
-print "#define AG_MASK ", mask($mask . $zero_mask . $mask . $zero_mask), "\n";
-print "#define RB_ONE_HALF ", mask($one_half . $zero_mask . $one_half), "\n";
-print "#define RB_MASK_PLUS_ONE ", mask("1" . $zero_mask x 2 . "1" . $zero_mask), "\n";
-print "\n";
+$mask_ = mask($mask);
+$one_half_ = mask($one_half);
+$g_mask = mask($mask . $zero_mask);
+$b_mask = mask($mask . $zero_mask x 2);
+$a_mask = mask($mask . $zero_mask x 3);
+$rb_mask = mask($mask . $zero_mask . $mask);
+$ag_mask = mask($mask . $zero_mask . $mask . $zero_mask);
+$rb_one_half = mask($one_half . $zero_mask . $one_half);
+$rb_mask_plus_one = mask("1" . $zero_mask x 2 . "1" . $zero_mask);
-print "#line 1 \"combine.inc\"\n";
while (<STDIN>) {
+ # Mask and 1/2 value for a single component.
+ s/#define COMPONENT_SIZE\b/$& $size/;
+ s/#define MASK\b/$& $mask_/;
+ s/#define ONE_HALF\b/$& $one_half_/;
+
+ # Shifts and masks for green, blue, and alpha.
+ s/#define G_SHIFT\b/$& $size/;
+ s/#define B_SHIFT\b/$& $size * 2/;
+ s/#define A_SHIFT\b/$& $size * 3/;
+ s/#define G_MASK\b/$& $g_mask/;
+ s/#define B_MASK\b/$& $b_mask/;
+ s/#define A_MASK\b/$& $a_mask/;
+
+ # Special values for dealing with red + blue at the same time.
+ s/#define RB_MASK\b/$& $rb_mask/;
+ s/#define AG_MASK\b/$& $ag_mask/;
+ s/#define RB_ONE_HALF\b/$& $rb_one_half/;
+ s/#define RB_MASK_PLUS_ONE\b/$& $rb_mask_plus_one/;
+
# Add 32/64 suffix to combining function types.
s/\bCombineFuncC\b/CombineFuncC$pixel_size/;
s/\bCombineFuncU\b/CombineFuncU$pixel_size/;
@@ -65,5 +73,8 @@ while (<STDIN>) {
# Change the function table name for the 64-bit version.
s/pixman_composeFunctions/pixman_composeFunctions64/ if $size == 16;
+ # Change the header for the 64-bit version
+ s/pixman-combine.h/pixman-combine64.h/ if $size == 16;
+
print;
}
commit 8ef3f49a9580fb148c2e5f567c0aafddd4b0f136
Merge: 27b753c... 9a6d3a1...
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Sun Jun 1 16:37:52 2008 +0200
Fixup
diff --cc configure.ac
index 3b73d7f,988bee1..6365c4c
--- a/configure.ac
+++ b/configure.ac
@@@ -229,42 -237,17 +237,50 @@@ dnl ===================================
AC_SUBST(MMX_CFLAGS)
AC_SUBST(SSE_CFLAGS)
+dnl Check for VMX/Altivec
+if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
+ VMX_CFLAGS="-faltivec"
+else
+ VMX_CFLAGS="-maltivec -mabi=altivec"
+fi
+
+have_vmx_intrinsics=no
+AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$CFLAGS $VMX_CFLAGS"
+AC_COMPILE_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 3.4 for sane altivec support"
+#endif
+#include <altivec.h>
+int main () {
+ vector unsigned int v = vec_splat_u32 (1);
+ v = vec_sub (v, v);
+ return 0;
+}], have_vmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+AC_MSG_RESULT($have_vmx_intrinsics)
+
+if test $have_vmx_intrinsics = yes ; then
+ AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
+else
+ VMX_CFLAGS=
+fi
+AC_SUBST(VMX_CFLAGS)
+
+AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
+
- dnl ===========================================================================
+ AC_ARG_ENABLE(gtk,
+ [AC_HELP_STRING([--disable-gtk],
+ [disable tests using GTK+])],
+ [disable_gtk=yes], [disable_gtk=no])
+
+ if test $disable_gtk = no ; then
+ PKG_CHECK_MODULES(GTK, [gtk+-2.0], [HAVE_GTK=yes], [HAVE_GTK=no])
+ else
+ HAVE_GTK=no
+ fi
- PKG_CHECK_MODULES(GTK, [gtk+-2.0], [HAVE_GTK=yes], [HAVE_GTK=no])
AM_CONDITIONAL(HAVE_GTK, [test "x$HAVE_GTK" = xyes])
AC_SUBST(GTK_CFLAGS)
diff --cc pixman/pixman-pict.c
index 9147af7,1479670..948c666
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@@ -1679,10 -1744,10 +1752,14 @@@ pixman_image_composite (pixman_op_
fbComposeSetupMMX();
#endif
+#ifdef USE_VMX
+ fbComposeSetupVMX();
+#endif
+
+ #ifdef USE_SSE2
+ fbComposeSetupSSE();
+ #endif
+
if (srcRepeat && srcTransform &&
pSrc->bits.width == 1 &&
pSrc->bits.height == 1)
diff --cc pixman/pixman-vmx.c
index ac050a4,0000000..87dc4d1
mode 100644,000000..100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@@ -1,1225 -1,0 +1,1227 @@@
+/*
+ * Copyright © 2007 Luca Barbato
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Luca Barbato not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. Luca Barbato makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Luca Barbato (lu_zero at gentoo.org)
+ *
+ * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
+ */
+
+#include <config.h>
+#include "pixman-vmx.h"
+#include <altivec.h>
+
+#ifdef __GNUC__
+# define inline __inline__ __attribute__ ((__always_inline__))
+#endif
+
++#define Alpha(x) ((x) >> 24)
++
+/*
+ x_c = (x_c * a) / 255
+*/
+#define FbByteMul(x, a) do { \
+ uint32_t t = ((x & 0xff00ff) * a) + 0x800080; \
+ t = (t + ((t >> 8) & 0xff00ff)) >> 8; \
+ t &= 0xff00ff; \
+ \
+ x = (((x >> 8) & 0xff00ff) * a) + 0x800080; \
+ x = (x + ((x >> 8) & 0xff00ff)); \
+ x &= 0xff00ff00; \
+ x += t; \
+ } while (0)
+
+/*
+ x_c = (x_c * a) / 255 + y
+*/
+#define FbByteMulAdd(x, a, y) do { \
+ uint32_t t = ((x & 0xff00ff) * a) + 0x800080; \
+ t = (t + ((t >> 8) & 0xff00ff)) >> 8; \
+ t &= 0xff00ff; \
+ t += y & 0xff00ff; \
+ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
+ t &= 0xff00ff; \
+ \
+ x = (((x >> 8) & 0xff00ff) * a) + 0x800080; \
+ x = (x + ((x >> 8) & 0xff00ff)) >> 8; \
+ x &= 0xff00ff; \
+ x += (y >> 8) & 0xff00ff; \
+ x |= 0x1000100 - ((x >> 8) & 0xff00ff); \
+ x &= 0xff00ff; \
+ x <<= 8; \
+ x += t; \
+ } while (0)
+
+/*
+ x_c = (x_c * a + y_c * b) / 255
+*/
+#define FbByteAddMul(x, a, y, b) do { \
+ uint32_t t; \
+ uint32_t r = (x >> 24) * a + (y >> 24) * b + 0x80; \
+ r += (r >> 8); \
+ r >>= 8; \
+ \
+ t = (x & 0xff00) * a + (y & 0xff00) * b; \
+ t += (t >> 8) + 0x8000; \
+ t >>= 16; \
+ \
+ t |= r << 16; \
+ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
+ t &= 0xff00ff; \
+ t <<= 8; \
+ \
+ r = ((x >> 16) & 0xff) * a + ((y >> 16) & 0xff) * b + 0x80; \
+ r += (r >> 8); \
+ r >>= 8; \
+ \
+ x = (x & 0xff) * a + (y & 0xff) * b + 0x80; \
+ x += (x >> 8); \
+ x >>= 8; \
+ x |= r << 16; \
+ x |= 0x1000100 - ((x >> 8) & 0xff00ff); \
+ x &= 0xff00ff; \
+ x |= t; \
+ } while (0)
+
+/*
+ x_c = (x_c * a_c) / 255
+*/
+#define FbByteMulC(x, a) do { \
+ uint32_t t; \
+ uint32_t r = (x & 0xff) * (a & 0xff); \
+ r |= (x & 0xff0000) * ((a >> 16) & 0xff); \
+ r += 0x800080; \
+ r = (r + ((r >> 8) & 0xff00ff)) >> 8; \
+ r &= 0xff00ff; \
+ \
+ x >>= 8; \
+ t = (x & 0xff) * ((a >> 8) & 0xff); \
+ t |= (x & 0xff0000) * (a >> 24); \
+ t += 0x800080; \
+ t = t + ((t >> 8) & 0xff00ff); \
+ x = r | (t & 0xff00ff00); \
+ \
+ } while (0)
+
+/*
+ x_c = (x_c * a) / 255 + y
+*/
+#define FbByteMulAddC(x, a, y) do { \
+ uint32_t t; \
+ uint32_t r = (x & 0xff) * (a & 0xff); \
+ r |= (x & 0xff0000) * ((a >> 16) & 0xff); \
+ r += 0x800080; \
+ r = (r + ((r >> 8) & 0xff00ff)) >> 8; \
+ r &= 0xff00ff; \
+ r += y & 0xff00ff; \
+ r |= 0x1000100 - ((r >> 8) & 0xff00ff); \
+ r &= 0xff00ff; \
+ \
+ x >>= 8; \
+ t = (x & 0xff) * ((a >> 8) & 0xff); \
+ t |= (x & 0xff0000) * (a >> 24); \
+ t += 0x800080; \
+ t = (t + ((t >> 8) & 0xff00ff)) >> 8; \
+ t &= 0xff00ff; \
+ t += (y >> 8) & 0xff00ff; \
+ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
+ t &= 0xff00ff; \
+ x = r | (t << 8); \
+ } while (0)
+
+/*
+ x_c = (x_c * a_c + y_c * b) / 255
+*/
+#define FbByteAddMulC(x, a, y, b) do { \
+ uint32_t t; \
+ uint32_t r = (x >> 24) * (a >> 24) + (y >> 24) * b; \
+ r += (r >> 8) + 0x80; \
+ r >>= 8; \
+ \
+ t = (x & 0xff00) * ((a >> 8) & 0xff) + (y & 0xff00) * b; \
+ t += (t >> 8) + 0x8000; \
+ t >>= 16; \
+ \
+ t |= r << 16; \
+ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
+ t &= 0xff00ff; \
+ t <<= 8; \
+ \
+ r = ((x >> 16) & 0xff) * ((a >> 16) & 0xff) + ((y >> 16) & 0xff) * b + 0x80; \
+ r += (r >> 8); \
+ r >>= 8; \
+ \
+ x = (x & 0xff) * (a & 0xff) + (y & 0xff) * b + 0x80; \
+ x += (x >> 8); \
+ x >>= 8; \
+ x |= r << 16; \
+ x |= 0x1000100 - ((x >> 8) & 0xff00ff); \
+ x &= 0xff00ff; \
+ x |= t; \
+ } while (0)
+
+/*
+ x_c = min(x_c + y_c, 255)
+*/
+#define FbByteAdd(x, y) do { \
+ uint32_t t; \
+ uint32_t r = (x & 0xff00ff) + (y & 0xff00ff); \
+ r |= 0x1000100 - ((r >> 8) & 0xff00ff); \
+ r &= 0xff00ff; \
+ \
+ t = ((x >> 8) & 0xff00ff) + ((y >> 8) & 0xff00ff); \
+ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
+ r |= (t & 0xff00ff) << 8; \
+ x = r; \
+ } while (0)
+
+static inline vector unsigned int
+splat_alpha (vector unsigned int pix) {
+ return vec_perm (pix, pix,
+ (vector unsigned char)AVV(0x00,0x00,0x00,0x00, 0x04,0x04,0x04,0x04,
+ 0x08,0x08,0x08,0x08, 0x0C,0x0C,0x0C,0x0C));
+}
+
+static inline vector unsigned int
+pix_multiply (vector unsigned int p, vector unsigned int a)
+{
+ vector unsigned short hi, lo, mod;
+ /* unpack to short */
+ hi = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)p);
+ mod = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)a);
+
+ hi = vec_mladd (hi, mod, (vector unsigned short)
+ AVV(0x0080,0x0080,0x0080,0x0080,
+ 0x0080,0x0080,0x0080,0x0080));
+
+ hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
+
+ hi = vec_sr (hi, vec_splat_u16 (8));
+
+ /* unpack to short */
+ lo = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)p);
+ mod = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)a);
+
+ lo = vec_mladd (lo, mod, (vector unsigned short)
+ AVV(0x0080,0x0080,0x0080,0x0080,
+ 0x0080,0x0080,0x0080,0x0080));
+
+ lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
+
+ lo = vec_sr (lo, vec_splat_u16 (8));
+
+ return (vector unsigned int)vec_packsu (hi, lo);
+}
+
+static inline vector unsigned int
+pix_add (vector unsigned int a, vector unsigned int b)
+{
+ return (vector unsigned int)vec_adds ((vector unsigned char)a,
+ (vector unsigned char)b);
+}
+
+static inline vector unsigned int
+pix_add_mul (vector unsigned int x, vector unsigned int a,
+ vector unsigned int y, vector unsigned int b)
+{
+ vector unsigned short hi, lo, mod, hiy, loy, mody;
+
+ hi = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)x);
+ mod = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)a);
+ hiy = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)y);
+ mody = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)b);
+
+ hi = vec_mladd (hi, mod, (vector unsigned short)
+ AVV(0x0080,0x0080,0x0080,0x0080,
+ 0x0080,0x0080,0x0080,0x0080));
+
+ hi = vec_mladd (hiy, mody, hi);
+
+ hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
+
+ hi = vec_sr (hi, vec_splat_u16 (8));
+
+ lo = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)x);
+ mod = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)a);
+
+ loy = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)y);
+ mody = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)b);
+
+ lo = vec_mladd (lo, mod, (vector unsigned short)
+ AVV(0x0080,0x0080,0x0080,0x0080,
+ 0x0080,0x0080,0x0080,0x0080));
+
+ lo = vec_mladd (loy, mody, lo);
+
+ lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
+
+ lo = vec_sr (lo, vec_splat_u16 (8));
+
+ return (vector unsigned int)vec_packsu (hi, lo);
+}
+
+static inline vector unsigned int
+negate (vector unsigned int src)
+{
+ return vec_nor (src, src);
+}
+/* dest*~srca + src */
+static inline vector unsigned int
+over (vector unsigned int src, vector unsigned int srca,
+ vector unsigned int dest)
+{
+ vector unsigned char tmp = (vector unsigned char)
+ pix_multiply (dest, negate (srca));
+ tmp = vec_adds ((vector unsigned char)src, tmp);
+ return (vector unsigned int)tmp;
+}
+
+/* in == pix_multiply */
+#define in_over(src, srca, mask, dest) over (pix_multiply (src, mask),\
+ pix_multiply (srca, mask), dest)
+
+
+#define COMPUTE_SHIFT_MASK(source) \
+ source ## _mask = vec_lvsl (0, source);
+
+#define COMPUTE_SHIFT_MASKS(dest, source) \
+ dest ## _mask = vec_lvsl (0, dest); \
+ source ## _mask = vec_lvsl (0, source); \
+ store_mask = vec_lvsr (0, dest);
+
+#define COMPUTE_SHIFT_MASKC(dest, source, mask) \
+ mask ## _mask = vec_lvsl (0, mask); \
+ dest ## _mask = vec_lvsl (0, dest); \
+ source ## _mask = vec_lvsl (0, source); \
+ store_mask = vec_lvsr (0, dest);
+
+/* notice you have to declare temp vars...
+ * Note: tmp3 and tmp4 must remain untouched!
+ */
+
+#define LOAD_VECTORS(dest, source) \
+ tmp1 = (typeof(tmp1))vec_ld(0, source); \
+ tmp2 = (typeof(tmp2))vec_ld(15, source); \
+ tmp3 = (typeof(tmp3))vec_ld(0, dest); \
+ v ## source = (typeof(v ## source)) \
+ vec_perm(tmp1, tmp2, source ## _mask); \
+ tmp4 = (typeof(tmp4))vec_ld(15, dest); \
+ v ## dest = (typeof(v ## dest)) \
+ vec_perm(tmp3, tmp4, dest ## _mask);
+
+#define LOAD_VECTORSC(dest, source, mask) \
+ tmp1 = (typeof(tmp1))vec_ld(0, source); \
+ tmp2 = (typeof(tmp2))vec_ld(15, source); \
+ tmp3 = (typeof(tmp3))vec_ld(0, dest); \
+ v ## source = (typeof(v ## source)) \
+ vec_perm(tmp1, tmp2, source ## _mask); \
+ tmp4 = (typeof(tmp4))vec_ld(15, dest); \
+ tmp1 = (typeof(tmp1))vec_ld(0, mask); \
+ v ## dest = (typeof(v ## dest)) \
+ vec_perm(tmp3, tmp4, dest ## _mask); \
+ tmp2 = (typeof(tmp2))vec_ld(15, mask); \
+ v ## mask = (typeof(v ## mask)) \
+ vec_perm(tmp1, tmp2, mask ## _mask);
+#define STORE_VECTOR(dest) \
+ edges = vec_perm (tmp4, tmp3, dest ## _mask); \
+ tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
+ tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
+ vec_st ((vector unsigned int) tmp3, 15, dest ); \
+ vec_st ((vector unsigned int) tmp1, 0, dest );
+
+static FASTCALL void
+vmxCombineMaskU (uint32_t *src, const uint32_t *msk, int width)
+{
+ int i;
+ vector unsigned int vsrc, vmsk;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ src_mask, msk_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(src, msk)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(src, msk)
+
+ vsrc = pix_multiply (vsrc, splat_alpha (vmsk));
+
+ STORE_VECTOR(src)
+
+ msk+=4;
+ src+=4;
+ }
+
+ for (i = width%4; --i >= 0;) {
+ uint32_t a = msk[i] >> 24;
+ uint32_t s = src[i];
+ FbByteMul (s, a);
+ src[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = over (vsrc, splat_alpha (vsrc), vdest);
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t ia = Alpha (~s);
+
+ FbByteMulAdd (d, ia, s);
+ dest[i] = d;
+ }
+}
+
+
+static FASTCALL void
+vmxCombineOverReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = over (vdest, splat_alpha (vdest) , vsrc);
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t ia = Alpha (~dest[i]);
+
+ FbByteMulAdd (s, ia, d);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineInU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_multiply (vsrc, splat_alpha (vdest));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+
+ uint32_t s = src[i];
+ uint32_t a = Alpha (dest[i]);
+ FbByteMul (s, a);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineInReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_multiply (vdest, splat_alpha (vsrc));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t d = dest[i];
+ uint32_t a = Alpha (src[i]);
+ FbByteMul (d, a);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineOutU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t a = Alpha (~dest[i]);
+ FbByteMul (s, a);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineOutReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t d = dest[i];
+ uint32_t a = Alpha (~src[i]);
+ FbByteMul (d, a);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineAtopU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+ vdest, splat_alpha (negate (vsrc)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t dest_a = Alpha (d);
+ uint32_t src_ia = Alpha (~s);
+
+ FbByteAddMul (s, dest_a, d, src_ia);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineAtopReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_add_mul (vdest, splat_alpha (vsrc),
+ vsrc, splat_alpha (negate (vdest)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t src_a = Alpha (s);
+ uint32_t dest_ia = Alpha (~d);
+
+ FbByteAddMul (s, dest_ia, d, src_a);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineXorU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS (dest, src)
+
+ vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+ vdest, splat_alpha (negate (vsrc)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t src_ia = Alpha (~s);
+ uint32_t dest_ia = Alpha (~d);
+
+ FbByteAddMul (s, dest_ia, d, src_ia);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_add (vsrc, vdest);
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ FbByteAdd (d, s);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineSrcC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask);
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_multiply (vsrc, vmask);
+
+ STORE_VECTOR(dest)
+
+ mask+=4;
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ FbByteMulC (s, a);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineOverC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask);
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
+
+ STORE_VECTOR(dest)
+
+ mask+=4;
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ FbByteMulC (s, a);
+ FbByteMulAddC (d, ~a, s);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineOverReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask);
+ /* printf("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC (dest, src, mask)
+
+ vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
+
+ STORE_VECTOR(dest)
+
+ mask+=4;
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t da = Alpha (d);
+ FbByteMulC (s, a);
+ FbByteMulAddC (s, ~da, d);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineInC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t da = Alpha (dest[i]);
+ FbByteMul (s, a);
+ FbByteMul (s, da);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineInReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t d = dest[i];
+ uint32_t sa = Alpha (src[i]);
+ FbByteMul (a, sa);
+ FbByteMulC (d, a);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineOutC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t da = Alpha (~d);
+ FbByteMulC (s, a);
+ FbByteMulC (s, da);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineOutReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_multiply (vdest,
+ negate (pix_multiply (vmask, splat_alpha (vsrc))));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t sa = Alpha (s);
+ FbByteMulC (a, sa);
+ FbByteMulC (d, ~a);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineAtopC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_add_mul (pix_multiply (vsrc, vmask), splat_alpha (vdest),
+ vdest,
+ negate (pix_multiply (vmask,
+ splat_alpha (vmask))));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t sa = Alpha (s);
+ uint32_t da = Alpha (d);
+
+ FbByteMulC (s, a);
+ FbByteMul (a, sa);
+ FbByteAddMulC (d, ~a, s, da);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineAtopReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_add_mul (vdest,
+ pix_multiply (vmask, splat_alpha (vsrc)),
+ pix_multiply (vsrc, vmask),
+ negate (splat_alpha (vdest)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t sa = Alpha (s);
+ uint32_t da = Alpha (d);
+
+ FbByteMulC (s, a);
+ FbByteMul (a, sa);
+ FbByteAddMulC (d, a, s, ~da);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineXorC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_add_mul (vdest,
+ negate (pix_multiply (vmask, splat_alpha (vsrc))),
+ pix_multiply (vsrc, vmask),
+ negate (splat_alpha (vdest)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t sa = Alpha (s);
+ uint32_t da = Alpha (d);
+
+ FbByteMulC (s, a);
+ FbByteMul (a, sa);
+ FbByteAddMulC (d, ~a, s, ~da);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineAddC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+
+ FbByteMulC (s, a);
+ FbByteAdd (s, d);
+ dest[i] = s;
+ }
+}
+
+
+#if 0
+void
+fbCompositeSolid_nx8888vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height)
+{
+ uint32_t src;
+ uint32_t *dstLine, *dst;
+ int dstStride;
+
+ fbComposeGetSolid (pSrc, pDst, src);
+
+ if (src >> 24 == 0)
+ return;
+
+ fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ /* XXX vmxCombineOverU (dst, src, width); */
+ }
+}
+
+void
+fbCompositeSolid_nx0565vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ int16_t xSrc,
+ int16_t ySrc,
+ int16_t xMask,
+ int16_t yMask,
+ int16_t xDst,
+ int16_t yDst,
+ uint16_t width,
+ uint16_t height)
+{
+ uint32_t src;
+ uint16_t *dstLine, *dst;
+ uint16_t w;
+ int dstStride;
+
+ fbComposeGetSolid (pSrc, pDst, src);
+
+ if (src >> 24 == 0)
+ return;
+
+ fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ vmxCombineOverU565(dst, src, width);
+ }
+}
+
+#endif
+
+void fbComposeSetupVMX (void)
+{
+ /* check if we have VMX support and initialize accordingly */
+ if (pixman_have_vmx ()) {
+ pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = vmxCombineOverU;
+ pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseU;
+ pixman_composeFunctions.combineU[PIXMAN_OP_IN] = vmxCombineInU;
+ pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseU;
+ pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = vmxCombineOutU;
+ pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseU;
+ pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = vmxCombineAtopU;
+ pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseU;
+ pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = vmxCombineXorU;
+ pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = vmxCombineAddU;
+
+ pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = vmxCombineSrcC;
+ pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = vmxCombineOverC;
+ pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseC;
+ pixman_composeFunctions.combineC[PIXMAN_OP_IN] = vmxCombineInC;
+ pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseC;
+ pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = vmxCombineOutC;
+ pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseC;
+ pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = vmxCombineAtopC;
+ pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseC;
+ pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = vmxCombineXorC;
+ pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = vmxCombineAddC;
+
+ pixman_composeFunctions.combineMaskU = vmxCombineMaskU;
+ }
+}
commit 27b753c9deabe5ac775021abfae98a6a1830cfc2
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Thu Apr 24 01:08:29 2008 +0200
Remove unused macro
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 8b17f66..ac050a4 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -338,11 +338,6 @@ over (vector unsigned int src, vector unsigned int srca,
/* notice you have to declare temp vars...
* Note: tmp3 and tmp4 must remain untouched!
*/
-#define LOAD_VECTOR (source) \
- tmp1 = (typeof(v ## source))vec_ld(0, source); \
- tmp2 = (typeof(v ## source))vec_ld(15, source); \
- v ## source = (typeof(v ## source)) \
- vec_perm(tmp1, tmp2, source ## _mask);
#define LOAD_VECTORS(dest, source) \
tmp1 = (typeof(tmp1))vec_ld(0, source); \
commit 584118fb6c15d695b6a203c2df51411958957880
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Thu Apr 24 01:06:38 2008 +0200
Remove VMX from CPUFeatures, ppc isn't using it at all
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index 9c6a375..9147af7 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -1878,7 +1878,6 @@ enum CPUFeatures {
NoFeatures = 0,
MMX = 0x1,
MMX_Extensions = 0x2,
- VMX = 0x4,
SSE = 0x6,
SSE2 = 0x8,
CMOV = 0x10
commit fc96121afd5d8451c9d8ba8a693e589d1999d131
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Thu Apr 24 01:03:08 2008 +0200
Simplify cpu feature check
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index 1b4c81f..9c6a375 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -1826,44 +1826,38 @@ pixman_image_composite (pixman_op_t op,
* "-maltivec -mabi=altivec", as gcc would try to save vector register
* across function calls causing SIGILL on cpus without Altivec/vmx.
*/
+static pixman_bool_t initialized = FALSE;
+static volatile pixman_bool_t have_vmx = TRUE;
+
#ifdef __APPLE__
#include <sys/sysctl.h>
pixman_bool_t pixman_have_vmx (void) {
- int hasVMX = 0;
- size_t length = sizeof ( hasVMX );
- int error = sysctlbyname ("hw.optional.altivec", &hasVMX, &length, NULL, 0);
- if ( 0 != error ) return 0;
- return hasVMX;
+ if(!initialized) {
+ size_t length = sizeof(have_vmx);
+ int error =
+ sysctlbyname("hw.optional.altivec", &have_vmx, &length, NULL, 0);
+ if(error) have_vmx = FALSE;
+ initialized = TRUE;
+ }
+ return have_vmx;
}
#else
#include <signal.h>
-#include <setjmp.h>
-
-static sigjmp_buf jmp;
-static volatile sig_atomic_t in_test = 0;
static void vmx_test (int sig) {
- if (! in_test) {
- signal (sig, SIG_DFL);
- raise (sig);
- }
- in_test = 0;
- siglongjmp (jmp, 1);
+ have_vmx = FALSE;
}
pixman_bool_t pixman_have_vmx (void) {
- signal (SIGILL, vmx_test);
- if (sigsetjmp (jmp, 1)) {
- signal (SIGILL, SIG_DFL);
- } else {
- in_test = 1;
+ if (!initialized) {
+ signal(SIGILL, vmx_test);
asm volatile ( "vor 0, 0, 0" );
- signal (SIGILL, SIG_DFL);
- return 1;
+ signal(SIGILL, SIG_DFL);
+ initialized = TRUE;
}
- return 0;
+ return have_vmx;
}
#endif /* __APPLE__ */
#endif /* USE_VMX */
commit 08b317a5f519978cfabebd75d5595b19fc1d1425
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Thu Apr 24 00:41:16 2008 +0200
Refactor path selection
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index c758823..1b4c81f 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -1742,23 +1742,19 @@ pixman_image_composite (pixman_op_t op,
#ifdef USE_SSE2
if (pixman_have_sse ())
info = get_fast_path (sse_fast_paths, op, pSrc, pMask, pDst, pixbuf);
- if (!info)
#endif
#ifdef USE_MMX
-
- if (pixman_have_mmx())
+ if (!info && pixman_have_mmx())
info = get_fast_path (mmx_fast_paths, op, pSrc, pMask, pDst, pixbuf);
- if (!info)
#endif
#ifdef USE_VMX
- if (pixman_have_vmx())
+ if (!info && pixman_have_vmx())
info = get_fast_path (vmx_fast_paths, op, pSrc, pMask, pDst, pixbuf);
- if (!info)
#endif
-
+ if (!info)
info = get_fast_path (c_fast_paths, op, pSrc, pMask, pDst, pixbuf);
if (info)
commit 083cadd4c7d1270b0ee9f0365327b872898d1561
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Thu Apr 24 00:36:51 2008 +0200
Force inlining
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 6d275ee..8b17f66 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -29,6 +29,10 @@
#include "pixman-vmx.h"
#include <altivec.h>
+#ifdef __GNUC__
+# define inline __inline__ __attribute__ ((__always_inline__))
+#endif
+
/*
x_c = (x_c * a) / 255
*/
commit 8e68544e0d8cc7af24fb8b298fd6afd47c620136
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Sat Apr 12 13:16:46 2008 +0200
Unbreak vmx pixman
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 2217e4e..b25fd41 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -50,7 +50,7 @@ endif
# vmx code
if USE_VMX
-noinst_LTLIBRARIES = libpixman-vmx.la
+noinst_LTLIBRARIES += libpixman-vmx.la
libpixman_vmx_la_SOURCES = \
pixman-vmx.c \
pixman-vmx.h
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 0008dc5..6d275ee 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -29,6 +29,164 @@
#include "pixman-vmx.h"
#include <altivec.h>
+/*
+ x_c = (x_c * a) / 255
+*/
+#define FbByteMul(x, a) do { \
+ uint32_t t = ((x & 0xff00ff) * a) + 0x800080; \
+ t = (t + ((t >> 8) & 0xff00ff)) >> 8; \
+ t &= 0xff00ff; \
+ \
+ x = (((x >> 8) & 0xff00ff) * a) + 0x800080; \
+ x = (x + ((x >> 8) & 0xff00ff)); \
+ x &= 0xff00ff00; \
+ x += t; \
+ } while (0)
+
+/*
+ x_c = (x_c * a) / 255 + y
+*/
+#define FbByteMulAdd(x, a, y) do { \
+ uint32_t t = ((x & 0xff00ff) * a) + 0x800080; \
+ t = (t + ((t >> 8) & 0xff00ff)) >> 8; \
+ t &= 0xff00ff; \
+ t += y & 0xff00ff; \
+ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
+ t &= 0xff00ff; \
+ \
+ x = (((x >> 8) & 0xff00ff) * a) + 0x800080; \
+ x = (x + ((x >> 8) & 0xff00ff)) >> 8; \
+ x &= 0xff00ff; \
+ x += (y >> 8) & 0xff00ff; \
+ x |= 0x1000100 - ((x >> 8) & 0xff00ff); \
+ x &= 0xff00ff; \
+ x <<= 8; \
+ x += t; \
+ } while (0)
+
+/*
+ x_c = (x_c * a + y_c * b) / 255
+*/
+#define FbByteAddMul(x, a, y, b) do { \
+ uint32_t t; \
+ uint32_t r = (x >> 24) * a + (y >> 24) * b + 0x80; \
+ r += (r >> 8); \
+ r >>= 8; \
+ \
+ t = (x & 0xff00) * a + (y & 0xff00) * b; \
+ t += (t >> 8) + 0x8000; \
+ t >>= 16; \
+ \
+ t |= r << 16; \
+ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
+ t &= 0xff00ff; \
+ t <<= 8; \
+ \
+ r = ((x >> 16) & 0xff) * a + ((y >> 16) & 0xff) * b + 0x80; \
+ r += (r >> 8); \
+ r >>= 8; \
+ \
+ x = (x & 0xff) * a + (y & 0xff) * b + 0x80; \
+ x += (x >> 8); \
+ x >>= 8; \
+ x |= r << 16; \
+ x |= 0x1000100 - ((x >> 8) & 0xff00ff); \
+ x &= 0xff00ff; \
+ x |= t; \
+ } while (0)
+
+/*
+ x_c = (x_c * a_c) / 255
+*/
+#define FbByteMulC(x, a) do { \
+ uint32_t t; \
+ uint32_t r = (x & 0xff) * (a & 0xff); \
+ r |= (x & 0xff0000) * ((a >> 16) & 0xff); \
+ r += 0x800080; \
+ r = (r + ((r >> 8) & 0xff00ff)) >> 8; \
+ r &= 0xff00ff; \
+ \
+ x >>= 8; \
+ t = (x & 0xff) * ((a >> 8) & 0xff); \
+ t |= (x & 0xff0000) * (a >> 24); \
+ t += 0x800080; \
+ t = t + ((t >> 8) & 0xff00ff); \
+ x = r | (t & 0xff00ff00); \
+ \
+ } while (0)
+
+/*
+ x_c = (x_c * a) / 255 + y
+*/
+#define FbByteMulAddC(x, a, y) do { \
+ uint32_t t; \
+ uint32_t r = (x & 0xff) * (a & 0xff); \
+ r |= (x & 0xff0000) * ((a >> 16) & 0xff); \
+ r += 0x800080; \
+ r = (r + ((r >> 8) & 0xff00ff)) >> 8; \
+ r &= 0xff00ff; \
+ r += y & 0xff00ff; \
+ r |= 0x1000100 - ((r >> 8) & 0xff00ff); \
+ r &= 0xff00ff; \
+ \
+ x >>= 8; \
+ t = (x & 0xff) * ((a >> 8) & 0xff); \
+ t |= (x & 0xff0000) * (a >> 24); \
+ t += 0x800080; \
+ t = (t + ((t >> 8) & 0xff00ff)) >> 8; \
+ t &= 0xff00ff; \
+ t += (y >> 8) & 0xff00ff; \
+ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
+ t &= 0xff00ff; \
+ x = r | (t << 8); \
+ } while (0)
+
+/*
+ x_c = (x_c * a_c + y_c * b) / 255
+*/
+#define FbByteAddMulC(x, a, y, b) do { \
+ uint32_t t; \
+ uint32_t r = (x >> 24) * (a >> 24) + (y >> 24) * b; \
+ r += (r >> 8) + 0x80; \
+ r >>= 8; \
+ \
+ t = (x & 0xff00) * ((a >> 8) & 0xff) + (y & 0xff00) * b; \
+ t += (t >> 8) + 0x8000; \
+ t >>= 16; \
+ \
+ t |= r << 16; \
+ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
+ t &= 0xff00ff; \
+ t <<= 8; \
+ \
+ r = ((x >> 16) & 0xff) * ((a >> 16) & 0xff) + ((y >> 16) & 0xff) * b + 0x80; \
+ r += (r >> 8); \
+ r >>= 8; \
+ \
+ x = (x & 0xff) * (a & 0xff) + (y & 0xff) * b + 0x80; \
+ x += (x >> 8); \
+ x >>= 8; \
+ x |= r << 16; \
+ x |= 0x1000100 - ((x >> 8) & 0xff00ff); \
+ x &= 0xff00ff; \
+ x |= t; \
+ } while (0)
+
+/*
+ x_c = min(x_c + y_c, 255)
+*/
+#define FbByteAdd(x, y) do { \
+ uint32_t t; \
+ uint32_t r = (x & 0xff00ff) + (y & 0xff00ff); \
+ r |= 0x1000100 - ((r >> 8) & 0xff00ff); \
+ r &= 0xff00ff; \
+ \
+ t = ((x >> 8) & 0xff00ff) + ((y >> 8) & 0xff00ff); \
+ t |= 0x1000100 - ((t >> 8) & 0xff00ff); \
+ r |= (t & 0xff00ff) << 8; \
+ x = r; \
+ } while (0)
+
static inline vector unsigned int
splat_alpha (vector unsigned int pix) {
return vec_perm (pix, pix,
commit 1ec7bd2cb2d02caca06742b0091f293d29d95a44
Merge: e63bf15... 5388222...
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Sat Apr 12 09:53:24 2008 +0200
Merge branch 'master' into vmx
Conflicts:
pixman/pixman-pict.c
diff --cc pixman/pixman-pict.c
index 6cc81d7,f01a643..c758823
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@@ -30,10 -30,9 +30,9 @@@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
- #include "pixman.h"
-
#include "pixman-private.h"
#include "pixman-mmx.h"
+#include "pixman-vmx.h"
#include "pixman-sse.h"
#define FbFullMask(n) ((n) == 32 ? (uint32_t)-1 : ((((uint32_t) 1) << n) - 1))
commit e63bf1554b4adf9e687ec86213a97caab2218a77
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Sun Mar 23 16:12:31 2008 +0100
Make configure message alike the mmx/sse/sse2 ones
diff --git a/configure.ac b/configure.ac
index 124d3e2..8a396ec 100644
--- a/configure.ac
+++ b/configure.ac
@@ -205,7 +205,7 @@ else
fi
have_vmx_intrinsics=no
-AC_MSG_CHECKING(For VMX/Altivec intrinsics in the compiler)
+AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
xserver_save_CFLAGS=$CFLAGS
CFLAGS="$CFLAGS $VMX_CFLAGS"
AC_COMPILE_IFELSE([
commit dcc530178050522705e70ff2f09b9da2b358ac01
Merge: 550e5f5... 29a8ae4...
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Sun Mar 23 16:04:26 2008 +0100
Update vmx
diff --cc configure.ac
index 394ce72,c416bc8..124d3e2
--- a/configure.ac
+++ b/configure.ac
@@@ -165,44 -165,38 +165,73 @@@ f
AM_CONDITIONAL(USE_SSE, test $have_sse_intrinsics = yes)
+
+ dnl ===========================================================================
+ dnl Check for SSE2
+
+ SSE_CFLAGS="-mmmx -msse2 -Winline --param inline-unit-growth=10000 --param large-function-growth=10000"
+
+ have_sse2_intrinsics=no
+ AC_MSG_CHECKING(whether to use SSE2 intrinsics)
+ xserver_save_CFLAGS=$CFLAGS
+ CFLAGS="$CFLAGS -msse2 $MMX_CFLAGS"
+
+ AC_COMPILE_IFELSE([
+ #include <mmintrin.h>
+ #include <xmmintrin.h>
+ int main () {
+ __m128i a, b, c;
+ c = _mm_xor_si128 (a, b);
+ return 0;
+ }], have_sse2_intrinsics=yes)
+ CFLAGS=$xserver_save_CFLAGS
+ AC_MSG_RESULT($have_sse2_intrinsics)
+
+ if test $have_sse2_intrinsics = yes ; then
+ AC_DEFINE(USE_SSE2, 1, [use SSE compiler intrinsics])
+ fi
+
+ AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
+
dnl ========================================================
AC_SUBST(MMX_CFLAGS)
+ AC_SUBST(SSE_CFLAGS)
+dnl Check for VMX/Altivec
+if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
+ VMX_CFLAGS="-faltivec"
+else
+ VMX_CFLAGS="-maltivec -mabi=altivec"
+fi
+
+have_vmx_intrinsics=no
+AC_MSG_CHECKING(For VMX/Altivec intrinsics in the compiler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$CFLAGS $VMX_CFLAGS"
+AC_COMPILE_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 3.4 for sane altivec support"
+#endif
+#include <altivec.h>
+int main () {
+ vector unsigned int v = vec_splat_u32 (1);
+ v = vec_sub (v, v);
+ return 0;
+}], have_vmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+AC_MSG_RESULT($have_vmx_intrinsics)
+
+if test $have_vmx_intrinsics = yes ; then
+ AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
+else
+ VMX_CFLAGS=
+fi
+AC_SUBST(VMX_CFLAGS)
+
+AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
+
+dnl ===========================================================================
+
PKG_CHECK_MODULES(GTK, [gtk+-2.0], [HAVE_GTK=yes], [HAVE_GTK=no])
AM_CONDITIONAL(HAVE_GTK, [test "x$HAVE_GTK" = xyes])
diff --cc pixman/Makefile.am
index 467ebdd,1f21f8c..20288b6
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@@ -31,14 -31,15 +31,25 @@@ libpixman_mmx_la_LIBADD = $(DEP_LIBS
libpixman_1_la_LIBADD += libpixman-mmx.la
endif
+# vmx code
+if USE_VMX
+noinst_LTLIBRARIES = libpixman-vmx.la
+libpixman_vmx_la_SOURCES = \
+ pixman-vmx.c \
+ pixman-vmx.h
+libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
+libpixman_vmx_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-vmx.la
+endif
+ # sse2 code
+ if USE_SSE2
+ noinst_LTLIBRARIES = libpixman-sse.la
+ libpixman_sse_la_SOURCES = \
+ pixman-sse.c \
+ pixman-sse.h
+ libpixman_sse_la_CFLAGS = $(DEP_CFLAGS) $(SSE_CFLAGS)
+ libpixman_sse_la_LIBADD = $(DEP_LIBS)
+ libpixman_1_la_LIBADD += libpixman-sse.la
+ endif
+
diff --cc pixman/pixman-pict.c
index f36ca0e,e4430d1..6cc81d7
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@@ -29,10 -30,10 +30,11 @@@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-
+#include "pixman.h"
#include "pixman-private.h"
#include "pixman-mmx.h"
+#include "pixman-vmx.h"
+ #include "pixman-sse.h"
#define FbFullMask(n) ((n) == 32 ? (uint32_t)-1 : ((((uint32_t) 1) << n) - 1))
@@@ -1495,6 -1484,13 +1485,21 @@@ static const FastPathInfo mmx_fast_path
};
#endif
+ #ifdef USE_SSE2
+ static const FastPathInfo sse_fast_paths[] =
+ {
+ { PIXMAN_OP_NONE },
+ };
+ #endif
+
++#ifdef USE_VMX
++static const FastPathInfo vmx_fast_paths[] =
++{
++ { PIXMAN_OP_NONE },
++};
++#endif
++
++
static const FastPathInfo c_fast_paths[] =
{
{ PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8x0565, 0 },
@@@ -1658,31 -1654,23 +1663,27 @@@ pixman_image_composite (pixman_op_
uint16_t width,
uint16_t height)
{
- pixman_bool_t srcRepeat = pSrc->type == BITS && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL;
- pixman_bool_t maskRepeat = FALSE;
- pixman_bool_t srcTransform = pSrc->common.transform != NULL;
- pixman_bool_t maskTransform = FALSE;
- pixman_bool_t srcAlphaMap = pSrc->common.alpha_map != NULL;
- pixman_bool_t maskAlphaMap = FALSE;
- pixman_bool_t dstAlphaMap = pDst->common.alpha_map != NULL;
- CompositeFunc func = NULL;
-
+ pixman_bool_t srcRepeat = pSrc->type == BITS && pSrc->common.repeat == PIXMAN_REPEAT_NORMAL;
+ pixman_bool_t maskRepeat = FALSE;
+ pixman_bool_t srcTransform = pSrc->common.transform != NULL;
+ pixman_bool_t maskTransform = FALSE;
+ pixman_bool_t srcAlphaMap = pSrc->common.alpha_map != NULL;
+ pixman_bool_t maskAlphaMap = FALSE;
+ pixman_bool_t dstAlphaMap = pDst->common.alpha_map != NULL;
+ CompositeFunc func = NULL;
+
+ #ifdef USE_SSE2
+ fbComposeSetupSSE();
+ #endif
+
#ifdef USE_MMX
- static pixman_bool_t mmx_setup = FALSE;
- if (!mmx_setup)
- {
- fbComposeSetupMMX();
- mmx_setup = TRUE;
- }
+ fbComposeSetupMMX();
#endif
+
+#ifdef USE_VMX
- static pixman_bool_t vmx_setup = FALSE;
- if (!vmx_setup) {
- fbComposeSetupVMX();
- vmx_setup = TRUE;
- }
++ fbComposeSetupVMX();
+#endif
+
if (srcRepeat && srcTransform &&
pSrc->bits.width == 1 &&
pSrc->bits.height == 1)
@@@ -1731,6 -1731,6 +1744,14 @@@
info = get_fast_path (mmx_fast_paths, op, pSrc, pMask, pDst, pixbuf);
if (!info)
#endif
++
++#ifdef USE_VMX
++
++ if (pixman_have_vmx())
++ info = get_fast_path (vmx_fast_paths, op, pSrc, pMask, pDst, pixbuf);
++ if (!info)
++#endif
++
info = get_fast_path (c_fast_paths, op, pSrc, pMask, pDst, pixbuf);
if (info)
@@@ -1860,6 -1813,6 +1881,7 @@@ enum CPUFeatures
NoFeatures = 0,
MMX = 0x1,
MMX_Extensions = 0x2,
++ VMX = 0x4,
SSE = 0x6,
SSE2 = 0x8,
CMOV = 0x10
commit 550e5f54abe4f3f0b6fcd278c3b4533036276e3f
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Sat Mar 22 11:28:48 2008 +0100
update patch
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index e530a66..467ebdd 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -33,12 +33,12 @@ endif
# vmx code
if USE_VMX
-noinst_LTLIBRARIES += libpixman-vmx.la
+noinst_LTLIBRARIES = libpixman-vmx.la
libpixman_vmx_la_SOURCES = \
pixman-vmx.c \
pixman-vmx.h
libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
libpixman_vmx_la_LIBADD = $(DEP_LIBS)
-libpixman_la_LIBADD += libpixman-vmx.la
+libpixman_1_la_LIBADD += libpixman-vmx.la
endif
commit 49240111dbb31c335856f9653544a039275bf033
Merge: 808e4f5... 72b46bc...
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Sun Dec 16 00:38:16 2007 +0100
Merge branch 'master' of git://anongit.freedesktop.org/pixman
diff --cc configure.ac
index b6a9732,b1c2015..394ce72
--- a/configure.ac
+++ b/configure.ac
@@@ -114,44 -130,44 +130,79 @@@ f
- AC_SUBST(MMX_CFLAGS)
AM_CONDITIONAL(USE_MMX, test $have_mmx_intrinsics = yes)
+ dnl =======================================================
+
+ dnl GCC 4.2 when compiling with -msse will generate SSE instructions
+ dnl on its own. This means anything compiled with -mss can only be
+ dnl run after a runtime check for SSE. Unfortunately, since we still
+ dnl need to support MMX-but-not-SSE (such as the OLPC), this means we
+ dnl can only use SSE when compiling for x86-64 (where SSE is always
+ dnl supported).
+
+ have_sse_intrinsics=no
+ AC_MSG_CHECKING(whether to use SSE intrinsics)
+ xserver_save_CFLAGS=$CFLAGS
+ CFLAGS="$CFLAGS -msse $MMX_CFLAGS"
+
+ AC_COMPILE_IFELSE([
+ #if !defined(__amd64__) && !defined(__x86_64__)
+ #error "Need x86-64 for SSE"
+ #endif
+ #include <mmintrin.h>
+ #include <xmmintrin.h>
+ int main () {
+ __m64 v = _mm_cvtsi32_si64 (1);
+ v = _mm_shuffle_pi16 (v, _MM_SHUFFLE(3, 3, 3, 3));
+ return _mm_cvtsi64_si32 (v);
+ }], have_sse_intrinsics=yes)
+ CFLAGS=$xserver_save_CFLAGS
+ AC_MSG_RESULT($have_sse_intrinsics)
+
+ if test $have_sse_intrinsics = yes ; then
+ AC_DEFINE(USE_SSE, 1, [use SSE compiler intrinsics])
+ MMX_CFLAGS="-msse $MMX_CFLAGS"
+ fi
+
+ AM_CONDITIONAL(USE_SSE, test $have_sse_intrinsics = yes)
+
dnl ========================================================
+ AC_SUBST(MMX_CFLAGS)
+dnl Check for VMX/Altivec
+if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
+ VMX_CFLAGS="-faltivec"
+else
+ VMX_CFLAGS="-maltivec -mabi=altivec"
+fi
+
+have_vmx_intrinsics=no
+AC_MSG_CHECKING(For VMX/Altivec intrinsics in the compiler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$CFLAGS $VMX_CFLAGS"
+AC_COMPILE_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 3.4 for sane altivec support"
+#endif
+#include <altivec.h>
+int main () {
+ vector unsigned int v = vec_splat_u32 (1);
+ v = vec_sub (v, v);
+ return 0;
+}], have_vmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+AC_MSG_RESULT($have_vmx_intrinsics)
+
+if test $have_vmx_intrinsics = yes ; then
+ AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
+else
+ VMX_CFLAGS=
+fi
+AC_SUBST(VMX_CFLAGS)
+
+AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
+
+dnl ===========================================================================
+
PKG_CHECK_MODULES(GTK, [gtk+-2.0], [HAVE_GTK=yes], [HAVE_GTK=no])
AM_CONDITIONAL(HAVE_GTK, [test "x$HAVE_GTK" = xyes])
commit 808e4f541b4cfde40c91e6c6cd942f9074d38e94
Merge: 33d4028... 39a67d3...
Author: Luca Barbato <lu_zero at gentoo.org>
Date: Mon Oct 1 22:13:05 2007 +0000
Merge branch 'master' of git://anongit.freedesktop.org/pixman
diff --cc pixman/Makefile.am
index e60c4eb,66283a2..e530a66
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@@ -34,17 -28,5 +28,17 @@@ libpixman_mmx_la_SOURCES =
pixman-mmx.h
libpixman_mmx_la_CFLAGS = $(DEP_CFLAGS) $(MMX_CFLAGS)
libpixman_mmx_la_LIBADD = $(DEP_LIBS)
- libpixman_la_LIBADD += libpixman-mmx.la
+ libpixman_1_la_LIBADD += libpixman-mmx.la
endif
+
+# vmx code
+if USE_VMX
+noinst_LTLIBRARIES += libpixman-vmx.la
+libpixman_vmx_la_SOURCES = \
+ pixman-vmx.c \
+ pixman-vmx.h
+libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
+libpixman_vmx_la_LIBADD = $(DEP_LIBS)
+libpixman_la_LIBADD += libpixman-vmx.la
+endif
+
commit 33d4028e3fffa231f40d66b5843de589ec2642fe
Author: root <root at echo.(none)>
Date: Sun Jul 1 11:42:49 2007 +0000
First import of vmx
diff --git a/configure.ac b/configure.ac
index b759c7f..81e2a26 100644
--- a/configure.ac
+++ b/configure.ac
@@ -76,6 +76,41 @@ AM_CONDITIONAL(USE_MMX, test $have_mmx_intrinsics = yes)
dnl ========================================================
+dnl Check for VMX/Altivec
+if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
+ VMX_CFLAGS="-faltivec"
+else
+ VMX_CFLAGS="-maltivec -mabi=altivec"
+fi
+
+have_vmx_intrinsics=no
+AC_MSG_CHECKING(For VMX/Altivec intrinsics in the compiler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$CFLAGS $VMX_CFLAGS"
+AC_COMPILE_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 3.4 for sane altivec support"
+#endif
+#include <altivec.h>
+int main () {
+ vector unsigned int v = vec_splat_u32 (1);
+ v = vec_sub (v, v);
+ return 0;
+}], have_vmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+AC_MSG_RESULT($have_vmx_intrinsics)
+
+if test $have_vmx_intrinsics = yes ; then
+ AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
+else
+ VMX_CFLAGS=
+fi
+AC_SUBST(VMX_CFLAGS)
+
+AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
+
+dnl ===========================================================================
+
PKG_CHECK_MODULES(GTK, [gtk+-2.0], [HAVE_GTK=yes], [HAVE_GTK=no])
AM_CONDITIONAL(HAVE_GTK, [test "x$HAVE_GTK" = xyes])
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 90c6436..e60c4eb 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -36,3 +36,15 @@ libpixman_mmx_la_CFLAGS = $(DEP_CFLAGS) $(MMX_CFLAGS)
libpixman_mmx_la_LIBADD = $(DEP_LIBS)
libpixman_la_LIBADD += libpixman-mmx.la
endif
+
+# vmx code
+if USE_VMX
+noinst_LTLIBRARIES += libpixman-vmx.la
+libpixman_vmx_la_SOURCES = \
+ pixman-vmx.c \
+ pixman-vmx.h
+libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
+libpixman_vmx_la_LIBADD = $(DEP_LIBS)
+libpixman_la_LIBADD += libpixman-vmx.la
+endif
+
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index cad11dd..a857de5 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -29,6 +29,7 @@
#include "pixman.h"
#include "pixman-private.h"
#include "pixman-mmx.h"
+#include "pixman-vmx.h"
#define FbFullMask(n) ((n) == 32 ? (uint32_t)-1 : ((((uint32_t) 1) << n) - 1))
@@ -1416,6 +1417,13 @@ pixman_image_composite (pixman_op_t op,
mmx_setup = TRUE;
}
#endif
+#ifdef USE_VMX
+ static pixman_bool_t vmx_setup = FALSE;
+ if (!vmx_setup) {
+ fbComposeSetupVMX();
+ vmx_setup = TRUE;
+ }
+#endif
if (srcRepeat && srcTransform &&
pSrc->bits.width == 1 &&
@@ -2062,6 +2070,53 @@ pixman_image_composite (pixman_op_t op,
}
+#ifdef USE_VMX
+/* The CPU detection code needs to be in a file not compiled with
+ * "-maltivec -mabi=altivec", as gcc would try to save vector register
+ * across function calls causing SIGILL on cpus without Altivec/vmx.
+ */
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+
+pixman_bool_t pixman_have_vmx (void) {
+ int hasVMX = 0;
+ size_t length = sizeof ( hasVMX );
+ int error = sysctlbyname ("hw.optional.altivec", &hasVMX, &length, NULL, 0);
+ if ( 0 != error ) return 0;
+ return hasVMX;
+}
+
+#else
+#include <signal.h>
+#include <setjmp.h>
+
+static sigjmp_buf jmp;
+static volatile sig_atomic_t in_test = 0;
+
+static void vmx_test (int sig) {
+ if (! in_test) {
+ signal (sig, SIG_DFL);
+ raise (sig);
+ }
+ in_test = 0;
+ siglongjmp (jmp, 1);
+}
+
+pixman_bool_t pixman_have_vmx (void) {
+ signal (SIGILL, vmx_test);
+ if (sigsetjmp (jmp, 1)) {
+ signal (SIGILL, SIG_DFL);
+ } else {
+ in_test = 1;
+ asm volatile ( "vor 0, 0, 0" );
+ signal (SIGILL, SIG_DFL);
+ return 1;
+ }
+ return 0;
+}
+#endif /* __APPLE__ */
+#endif /* USE_VMX */
+
#ifdef USE_MMX
/* The CPU detection code needs to be in a file not compiled with
* "-mmmx -msse", as gcc would generate CMOV instructions otherwise
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
new file mode 100644
index 0000000..0008dc5
--- /dev/null
+++ b/pixman/pixman-vmx.c
@@ -0,0 +1,1068 @@
+/*
+ * Copyright © 2007 Luca Barbato
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Luca Barbato not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. Luca Barbato makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Luca Barbato (lu_zero at gentoo.org)
+ *
+ * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
+ */
+
+#include <config.h>
+#include "pixman-vmx.h"
+#include <altivec.h>
+
+static inline vector unsigned int
+splat_alpha (vector unsigned int pix) {
+ return vec_perm (pix, pix,
+ (vector unsigned char)AVV(0x00,0x00,0x00,0x00, 0x04,0x04,0x04,0x04,
+ 0x08,0x08,0x08,0x08, 0x0C,0x0C,0x0C,0x0C));
+}
+
+static inline vector unsigned int
+pix_multiply (vector unsigned int p, vector unsigned int a)
+{
+ vector unsigned short hi, lo, mod;
+ /* unpack to short */
+ hi = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)p);
+ mod = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)a);
+
+ hi = vec_mladd (hi, mod, (vector unsigned short)
+ AVV(0x0080,0x0080,0x0080,0x0080,
+ 0x0080,0x0080,0x0080,0x0080));
+
+ hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
+
+ hi = vec_sr (hi, vec_splat_u16 (8));
+
+ /* unpack to short */
+ lo = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)p);
+ mod = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)a);
+
+ lo = vec_mladd (lo, mod, (vector unsigned short)
+ AVV(0x0080,0x0080,0x0080,0x0080,
+ 0x0080,0x0080,0x0080,0x0080));
+
+ lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
+
+ lo = vec_sr (lo, vec_splat_u16 (8));
+
+ return (vector unsigned int)vec_packsu (hi, lo);
+}
+
+static inline vector unsigned int
+pix_add (vector unsigned int a, vector unsigned int b)
+{
+ return (vector unsigned int)vec_adds ((vector unsigned char)a,
+ (vector unsigned char)b);
+}
+
+static inline vector unsigned int
+pix_add_mul (vector unsigned int x, vector unsigned int a,
+ vector unsigned int y, vector unsigned int b)
+{
+ vector unsigned short hi, lo, mod, hiy, loy, mody;
+
+ hi = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)x);
+ mod = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)a);
+ hiy = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)y);
+ mody = (vector unsigned short)
+ vec_mergeh ((vector unsigned char)AVV(0),
+ (vector unsigned char)b);
+
+ hi = vec_mladd (hi, mod, (vector unsigned short)
+ AVV(0x0080,0x0080,0x0080,0x0080,
+ 0x0080,0x0080,0x0080,0x0080));
+
+ hi = vec_mladd (hiy, mody, hi);
+
+ hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
+
+ hi = vec_sr (hi, vec_splat_u16 (8));
+
+ lo = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)x);
+ mod = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)a);
+
+ loy = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)y);
+ mody = (vector unsigned short)
+ vec_mergel ((vector unsigned char)AVV(0),
+ (vector unsigned char)b);
+
+ lo = vec_mladd (lo, mod, (vector unsigned short)
+ AVV(0x0080,0x0080,0x0080,0x0080,
+ 0x0080,0x0080,0x0080,0x0080));
+
+ lo = vec_mladd (loy, mody, lo);
+
+ lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
+
+ lo = vec_sr (lo, vec_splat_u16 (8));
+
+ return (vector unsigned int)vec_packsu (hi, lo);
+}
+
+static inline vector unsigned int
+negate (vector unsigned int src)
+{
+ return vec_nor (src, src);
+}
+/* dest*~srca + src */
+static inline vector unsigned int
+over (vector unsigned int src, vector unsigned int srca,
+ vector unsigned int dest)
+{
+ vector unsigned char tmp = (vector unsigned char)
+ pix_multiply (dest, negate (srca));
+ tmp = vec_adds ((vector unsigned char)src, tmp);
+ return (vector unsigned int)tmp;
+}
+
+/* in == pix_multiply */
+#define in_over(src, srca, mask, dest) over (pix_multiply (src, mask),\
+ pix_multiply (srca, mask), dest)
+
+
+#define COMPUTE_SHIFT_MASK(source) \
+ source ## _mask = vec_lvsl (0, source);
+
+#define COMPUTE_SHIFT_MASKS(dest, source) \
+ dest ## _mask = vec_lvsl (0, dest); \
+ source ## _mask = vec_lvsl (0, source); \
+ store_mask = vec_lvsr (0, dest);
+
+#define COMPUTE_SHIFT_MASKC(dest, source, mask) \
+ mask ## _mask = vec_lvsl (0, mask); \
+ dest ## _mask = vec_lvsl (0, dest); \
+ source ## _mask = vec_lvsl (0, source); \
+ store_mask = vec_lvsr (0, dest);
+
+/* notice you have to declare temp vars...
+ * Note: tmp3 and tmp4 must remain untouched!
+ */
+#define LOAD_VECTOR (source) \
+ tmp1 = (typeof(v ## source))vec_ld(0, source); \
+ tmp2 = (typeof(v ## source))vec_ld(15, source); \
+ v ## source = (typeof(v ## source)) \
+ vec_perm(tmp1, tmp2, source ## _mask);
+
+#define LOAD_VECTORS(dest, source) \
+ tmp1 = (typeof(tmp1))vec_ld(0, source); \
+ tmp2 = (typeof(tmp2))vec_ld(15, source); \
+ tmp3 = (typeof(tmp3))vec_ld(0, dest); \
+ v ## source = (typeof(v ## source)) \
+ vec_perm(tmp1, tmp2, source ## _mask); \
+ tmp4 = (typeof(tmp4))vec_ld(15, dest); \
+ v ## dest = (typeof(v ## dest)) \
+ vec_perm(tmp3, tmp4, dest ## _mask);
+
+#define LOAD_VECTORSC(dest, source, mask) \
+ tmp1 = (typeof(tmp1))vec_ld(0, source); \
+ tmp2 = (typeof(tmp2))vec_ld(15, source); \
+ tmp3 = (typeof(tmp3))vec_ld(0, dest); \
+ v ## source = (typeof(v ## source)) \
+ vec_perm(tmp1, tmp2, source ## _mask); \
+ tmp4 = (typeof(tmp4))vec_ld(15, dest); \
+ tmp1 = (typeof(tmp1))vec_ld(0, mask); \
+ v ## dest = (typeof(v ## dest)) \
+ vec_perm(tmp3, tmp4, dest ## _mask); \
+ tmp2 = (typeof(tmp2))vec_ld(15, mask); \
+ v ## mask = (typeof(v ## mask)) \
+ vec_perm(tmp1, tmp2, mask ## _mask);
+#define STORE_VECTOR(dest) \
+ edges = vec_perm (tmp4, tmp3, dest ## _mask); \
+ tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
+ tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
+ vec_st ((vector unsigned int) tmp3, 15, dest ); \
+ vec_st ((vector unsigned int) tmp1, 0, dest );
+
+static FASTCALL void
+vmxCombineMaskU (uint32_t *src, const uint32_t *msk, int width)
+{
+ int i;
+ vector unsigned int vsrc, vmsk;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ src_mask, msk_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(src, msk)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(src, msk)
+
+ vsrc = pix_multiply (vsrc, splat_alpha (vmsk));
+
+ STORE_VECTOR(src)
+
+ msk+=4;
+ src+=4;
+ }
+
+ for (i = width%4; --i >= 0;) {
+ uint32_t a = msk[i] >> 24;
+ uint32_t s = src[i];
+ FbByteMul (s, a);
+ src[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = over (vsrc, splat_alpha (vsrc), vdest);
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t ia = Alpha (~s);
+
+ FbByteMulAdd (d, ia, s);
+ dest[i] = d;
+ }
+}
+
+
+static FASTCALL void
+vmxCombineOverReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = over (vdest, splat_alpha (vdest) , vsrc);
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t ia = Alpha (~dest[i]);
+
+ FbByteMulAdd (s, ia, d);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineInU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_multiply (vsrc, splat_alpha (vdest));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+
+ uint32_t s = src[i];
+ uint32_t a = Alpha (dest[i]);
+ FbByteMul (s, a);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineInReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_multiply (vdest, splat_alpha (vsrc));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t d = dest[i];
+ uint32_t a = Alpha (src[i]);
+ FbByteMul (d, a);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineOutU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t a = Alpha (~dest[i]);
+ FbByteMul (s, a);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineOutReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t d = dest[i];
+ uint32_t a = Alpha (~src[i]);
+ FbByteMul (d, a);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineAtopU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+ vdest, splat_alpha (negate (vsrc)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t dest_a = Alpha (d);
+ uint32_t src_ia = Alpha (~s);
+
+ FbByteAddMul (s, dest_a, d, src_ia);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineAtopReverseU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_add_mul (vdest, splat_alpha (vsrc),
+ vsrc, splat_alpha (negate (vdest)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t src_a = Alpha (s);
+ uint32_t dest_ia = Alpha (~d);
+
+ FbByteAddMul (s, dest_ia, d, src_a);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineXorU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS (dest, src)
+
+ vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+ vdest, splat_alpha (negate (vsrc)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t src_ia = Alpha (~s);
+ uint32_t dest_ia = Alpha (~d);
+
+ FbByteAddMul (s, dest_ia, d, src_ia);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKS(dest, src)
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORS(dest, src)
+
+ vdest = pix_add (vsrc, vdest);
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ FbByteAdd (d, s);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineSrcC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask);
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_multiply (vsrc, vmask);
+
+ STORE_VECTOR(dest)
+
+ mask+=4;
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ FbByteMulC (s, a);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineOverC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask);
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
+
+ STORE_VECTOR(dest)
+
+ mask+=4;
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ FbByteMulC (s, a);
+ FbByteMulAddC (d, ~a, s);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineOverReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask);
+ /* printf("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC (dest, src, mask)
+
+ vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
+
+ STORE_VECTOR(dest)
+
+ mask+=4;
+ src+=4;
+ dest+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t da = Alpha (d);
+ FbByteMulC (s, a);
+ FbByteMulAddC (s, ~da, d);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineInC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t da = Alpha (dest[i]);
+ FbByteMul (s, a);
+ FbByteMul (s, da);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineInReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t d = dest[i];
+ uint32_t sa = Alpha (src[i]);
+ FbByteMul (a, sa);
+ FbByteMulC (d, a);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineOutC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t da = Alpha (~d);
+ FbByteMulC (s, a);
+ FbByteMulC (s, da);
+ dest[i] = s;
+ }
+}
+
+static FASTCALL void
+vmxCombineOutReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_multiply (vdest,
+ negate (pix_multiply (vmask, splat_alpha (vsrc))));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t sa = Alpha (s);
+ FbByteMulC (a, sa);
+ FbByteMulC (d, ~a);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineAtopC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_add_mul (pix_multiply (vsrc, vmask), splat_alpha (vdest),
+ vdest,
+ negate (pix_multiply (vmask,
+ splat_alpha (vmask))));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t sa = Alpha (s);
+ uint32_t da = Alpha (d);
+
+ FbByteMulC (s, a);
+ FbByteMul (a, sa);
+ FbByteAddMulC (d, ~a, s, da);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineAtopReverseC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_add_mul (vdest,
+ pix_multiply (vmask, splat_alpha (vsrc)),
+ pix_multiply (vsrc, vmask),
+ negate (splat_alpha (vdest)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t sa = Alpha (s);
+ uint32_t da = Alpha (d);
+
+ FbByteMulC (s, a);
+ FbByteMul (a, sa);
+ FbByteAddMulC (d, a, s, ~da);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineXorC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_add_mul (vdest,
+ negate (pix_multiply (vmask, splat_alpha (vsrc))),
+ pix_multiply (vsrc, vmask),
+ negate (splat_alpha (vdest)));
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+ uint32_t sa = Alpha (s);
+ uint32_t da = Alpha (d);
+
+ FbByteMulC (s, a);
+ FbByteMul (a, sa);
+ FbByteAddMulC (d, ~a, s, ~da);
+ dest[i] = d;
+ }
+}
+
+static FASTCALL void
+vmxCombineAddC (uint32_t *dest, uint32_t *src, uint32_t *mask, int width)
+{
+ int i;
+ vector unsigned int vdest, vsrc, vmask;
+ vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+ dest_mask, mask_mask, src_mask, store_mask;
+
+ COMPUTE_SHIFT_MASKC(dest, src, mask)
+
+ /* printf ("%s\n",__PRETTY_FUNCTION__); */
+ for (i = width/4; i > 0; i--) {
+
+ LOAD_VECTORSC(dest, src, mask)
+
+ vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
+
+ STORE_VECTOR(dest)
+
+ src+=4;
+ dest+=4;
+ mask+=4;
+ }
+
+ for (i = width%4; --i >=0;) {
+ uint32_t a = mask[i];
+ uint32_t s = src[i];
+ uint32_t d = dest[i];
+
+ FbByteMulC (s, a);
+ FbByteAdd (s, d);
+ dest[i] = s;
+ }
+}
+
+
#if 0
/*
 * Dead code: the two solid-fill composite fast paths below are compiled
 * out.  They sketch how a solid source would be read and walked per
 * scanline, but the inner combine calls are incomplete (see notes).
 */
void
fbCompositeSolid_nx8888vmx (pixman_operator_t op,
                            pixman_image_t * pSrc,
                            pixman_image_t * pMask,
                            pixman_image_t * pDst,
                            int16_t xSrc,
                            int16_t ySrc,
                            int16_t xMask,
                            int16_t yMask,
                            int16_t xDst,
                            int16_t yDst,
                            uint16_t width,
                            uint16_t height)
{
    uint32_t src;
    uint32_t *dstLine, *dst;
    int dstStride;

    /* Fetch the solid source pixel; skip the whole operation if it is
       fully transparent (alpha byte == 0). */
    fbComposeGetSolid (pSrc, pDst, src);

    if (src >> 24 == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

    /* NOTE(review): the per-scanline combine is commented out, so this
       loop currently does nothing but advance dstLine. */
    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        /* XXX vmxCombineOverU (dst, src, width); */
    }
}

void
fbCompositeSolid_nx0565vmx (pixman_operator_t op,
                            pixman_image_t * pSrc,
                            pixman_image_t * pMask,
                            pixman_image_t * pDst,
                            int16_t xSrc,
                            int16_t ySrc,
                            int16_t xMask,
                            int16_t yMask,
                            int16_t xDst,
                            int16_t yDst,
                            uint16_t width,
                            uint16_t height)
{
    uint32_t src;
    uint16_t *dstLine, *dst;
    uint16_t w;   /* unused */
    int dstStride;

    fbComposeGetSolid (pSrc, pDst, src);

    if (src >> 24 == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        /* NOTE(review): vmxCombineOverU565 is not defined anywhere in
           this file — this would fail to link if the block were ever
           enabled.  Confirm before removing the surrounding #if 0. */
        vmxCombineOverU565(dst, src, width);
    }
}

#endif
+
/*
 * Install the VMX (AltiVec) combiners into pixman's compose-function
 * tables.  If the runtime CPU check fails, the tables are left
 * untouched and the generic C implementations stay in effect.
 */
void fbComposeSetupVMX (void)
{
    /* check if we have VMX support and initialize accordingly */
    if (pixman_have_vmx ()) {
        /* unified-alpha combiners */
        pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = vmxCombineOverU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_IN] = vmxCombineInU;
        pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = vmxCombineOutU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = vmxCombineAtopU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = vmxCombineXorU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = vmxCombineAddU;

        /* component-alpha combiners */
        pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = vmxCombineSrcC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = vmxCombineOverC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_IN] = vmxCombineInC;
        pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = vmxCombineOutC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = vmxCombineAtopC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = vmxCombineXorC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = vmxCombineAddC;

        /* mask pre-multiply helper used by the generic paths */
        pixman_composeFunctions.combineMaskU = vmxCombineMaskU;
    }
}
diff --git a/pixman/pixman-vmx.h b/pixman/pixman-vmx.h
new file mode 100644
index 0000000..70cb53a
--- /dev/null
+++ b/pixman/pixman-vmx.h
@@ -0,0 +1,308 @@
+/*
+ * Copyright © 2007 Luca Barbato
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Luca Barbato not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. Luca Barbato makes no representations about the
+ * suitability of this software for any purpose. It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author: Luca Barbato (lu_zero at gentoo.org)
+ *
+ * Based on work by Owen Taylor, Søren Sandmann and Lars Knoll
+ */
+
+#include "pixman-private.h"
+
+#ifdef USE_VMX
+
+pixman_bool_t pixman_have_vmx(void);
+
+#else
+#define pixman_have_vmx() FALSE
+#endif
+
+#ifdef USE_VMX
+
+#define AVV(x...) {x}
+
+void fbComposeSetupVMX (void);
+
+#if 0
+void fbCompositeIn_nx8x8vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSolidMask_nx8888x0565Cvmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSrcAdd_8888x8888vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSolidMask_nx8888x8888Cvmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSolidMask_nx8x8888vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSolidMaskSrc_nx8x8888vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSrcAdd_8888x8x8vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeIn_8x8vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSrcAdd_8000x8000vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSrc_8888RevNPx8888vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSrc_8888x0565vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSrc_8888RevNPx0565vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSolid_nx8888vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSolid_nx0565vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSolidMask_nx8x0565vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSrc_x888x8x8888vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSrc_8888x8x8888vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+void fbCompositeSrc_8888x8888vmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+pixman_bool_t fbCopyAreavmx (FbPixels *pSrc,
+ FbPixels *pDst,
+ int src_x,
+ int src_y,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+void fbCompositeCopyAreavmx (pixman_operator_t op,
+ pixman_image_t * pSrc,
+ pixman_image_t * pMask,
+ pixman_image_t * pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height);
+
+pixman_bool_t fbSolidFillvmx (FbPixels *pDraw,
+ int x,
+ int y,
+ int width,
+ int height,
+ FbBits xor);
+#endif
+#endif /* USE_VMX */
More information about the xorg-commit
mailing list