pixman: Branch 'master' - 3 commits

Jeff Muizelaar jrmuizel at kemper.freedesktop.org
Tue Jun 16 09:11:45 PDT 2009


 pixman/pixman-arm-neon.c |  806 +++++++++++++++++++++++++++--------------------
 1 file changed, 469 insertions(+), 337 deletions(-)

New commits:
commit 94964c221fe8141e5177d98f5357dca33fa00544
Author: Jonathan Morton <jmorton at sd070.hel.movial.fi>
Date:   Tue Jun 16 12:08:29 2009 -0400

    [NEON] Add ARGB8-over-RGB565 compositing blitter.
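
The arithmetic performed by the new blitter's inner loop (ARGB8_Over565_8pix_neon, in the diff below) is roughly the following, written out for a single pixel in scalar C. This is an illustration only, not part of the patch, and the helper name is hypothetical:

    #include <stdint.h>

    /* Scalar sketch of one pixel of the blend done 8-at-a-time by
     * ARGB8_Over565_8pix_neon: expand the RGB565 destination to 8 bits per
     * channel by bit replication, blend as dest*(255-a) + src*a, then pack
     * the high bits of the 16-bit products back to RGB565 (the vsri.16 pair). */
    static inline uint16_t over_8888_0565_px (uint32_t src, uint16_t dst)
    {
        uint8_t a = src >> 24, r = src >> 16, g = src >> 8, b = src;

        uint8_t dr = (dst >> 11) << 3;          dr |= dr >> 5;
        uint8_t dg = ((dst >> 5) & 0x3F) << 2;  dg |= dg >> 6;
        uint8_t db = (dst & 0x1F) << 3;         db |= db >> 5;

        uint16_t r16 = dr * (255 - a) + r * a;  /* always <= 255*255, so it fits */
        uint16_t g16 = dg * (255 - a) + g * a;
        uint16_t b16 = db * (255 - a) + b * a;

        return (r16 & 0xF800) | ((g16 >> 5) & 0x07E0) | (b16 >> 11);
    }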

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 36a69aa..37ae9f2 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1941,6 +1941,155 @@ fbCompositeSolid_nx0565neon (
 	}
 }
 
+static inline void ARGB8_Over565_8pix_neon(
+	uint32_t *src,
+	uint16_t *dest,
+	uint32_t  srcStride,  // bytes, not elements
+	uint32_t  count        // 8-pixel groups
+)
+{
+	asm volatile (
+	"0:	@ loop\n"
+	"	pld   [%[src], %[srcStride]]         @ preload from next scanline	\n"
+	"	vld1.16   {d0,d1}, [%[dest]]         @ load pixels from framebuffer	\n"
+	"	vld4.8   {d20,d21,d22,d23},[%[src]]! @ load source image pixels		\n"
+	"	vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits		\n"
+	"	vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels	\n"
+	"	vshrn.u16 d4, q0, #3                 @ unpack green				\n"
+	"	vmvn      d18, d23                   @ we need the inverse alpha for the background	\n"
+	"	vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)	\n"
+	"	vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)	\n"
+	"	vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)	\n"
+	"	vmull.u8  q1, d2, d18                @ apply inverse alpha to background red...	\n"
+	"	vmull.u8  q2, d4, d18                @ ...green...				\n"
+	"	vmull.u8  q3, d6, d18                @ ...blue					\n"
+	"	subs      %[count], %[count], #1     @ decrement/test loop counter		\n"
+	"	vmlal.u8  q1, d23, d22               @ add blended foreground red...		\n"
+	"	vmlal.u8  q2, d23, d21               @ ...green...				\n"
+	"	vmlal.u8  q3, d23, d20               @ ...blue					\n"
+	"	vsri.16   q1, q2, #5                 @ pack green behind red			\n"
+	"	vsri.16   q1, q3, #11                @ pack blue into pixels			\n"
+	"	vst1.16   {d2,d3}, [%[dest]]!        @ store composited pixels			\n"
+	"	bne 0b                               @ next please				\n"
+
+	// Clobbered registers marked as input/outputs
+	: [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
+
+	// Inputs
+	: [srcStride] "r" (srcStride)
+
+	// Clobbers, including the inputs we modify, and potentially lots of memory
+	: "q0", "q1", "q2", "q3", "d17", "d18", "q10", "q11", "cc", "memory"
+	);
+}
+
+void
+fbCompositeOver_8888x0565neon (
+	pixman_implementation_t * impl,
+	pixman_op_t op,
+	pixman_image_t * pSrc,
+	pixman_image_t * pMask,
+	pixman_image_t * pDst,
+	int32_t      xSrc,
+	int32_t      ySrc,
+	int32_t      xMask,
+	int32_t      yMask,
+	int32_t      xDst,
+	int32_t      yDst,
+	int32_t      width,
+	int32_t      height)
+{
+	uint32_t    *srcLine;
+	uint16_t    *dstLine, *alignedLine;
+	uint32_t     dstStride, srcStride;
+	uint32_t     kernelCount, copyCount;
+	uint8_t      kernelOffset, copyOffset;
+
+	// we assume mask is opaque
+	// so the only alpha to deal with is embedded in src
+
+	if(width > NEON_SCANLINE_BUFFER_PIXELS) {
+		// split the blit, so we can use a fixed-size scanline buffer
+		int x;
+		for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
+			fbCompositeOver_8888x0565neon(impl, op, pSrc, pMask, pDst, xSrc+x, ySrc, xMask+x, yMask, xDst+x, yDst,
+										  (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
+		}
+		return;
+	}
+
+	fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+	fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
+
+	// cover the width with the minimum number of aligned quadwords,
+	// while also keeping the number of columns to process to a minimum
+	{
+		unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF;
+		unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
+		unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
+
+		// the fast copy must always be quadword aligned
+		copyOffset = dstLine - ((uint16_t*) alignedLeft);
+		alignedLine = dstLine - copyOffset;
+		copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
+
+		if(alignedRight - alignedLeft > ceilingLength) {
+			// unaligned routine is tightest, and will not overrun
+			kernelCount = (uint32_t) (ceilingLength >> 4);
+			kernelOffset = copyOffset;
+		} else {
+			// aligned routine is equally tight, so it is safer to align
+			kernelCount = copyCount;
+			kernelOffset = 0;
+		}
+	}
+
+	/* Preload the first input scanline */
+	{
+		uint8_t *srcPtr = (uint8_t*) srcLine;
+		uint32_t count = (width + 15) / 16;
+
+#ifdef USE_GCC_INLINE_ASM
+		asm volatile (
+		"0: @ loop						\n"
+		"	subs    %[count], %[count], #1			\n"
+		"	pld     [%[src]]				\n"
+		"	add     %[src], %[src], #64			\n"
+		"	bgt 0b						\n"
+
+		// Clobbered input registers marked as input/outputs
+		: [src] "+r" (srcPtr), [count] "+r" (count)
+		: // no unclobbered inputs
+		: "cc"
+		);
+#else
+		do {
+			__pld(srcPtr);
+			srcPtr += 64;
+		} while(--count);
+#endif
+	}
+
+	{
+		uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
+
+		// row-major order
+		// left edge, middle block, right edge
+		for( ; height--; srcLine += srcStride, dstLine += dstStride, alignedLine += dstStride) {
+			// Uncached framebuffer access is really, really slow if we do it piecemeal.
+			// It should be much faster if we grab it all at once.
+			// One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
+			QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0);
+
+			// Apply the actual filter
+			ARGB8_Over565_8pix_neon(srcLine, scanLine + kernelOffset, srcStride * sizeof(*srcLine), kernelCount);
+
+			// Copy the modified scanline back
+			QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2);
+		}
+	}
+}
+
 #endif  // USE_GCC_INLINE_ASM
 
 static const FastPathInfo arm_neon_fast_path_array[] = 
@@ -1958,6 +2107,8 @@ static const FastPathInfo arm_neon_fast_path_array[] =
     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_16x16neon,              0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSolid_nx0565neon,           0 },
     { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSolid_nx0565neon,           0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeOver_8888x0565neon,         0 },
+    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeOver_8888x0565neon,         0 },
 #endif
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888neon,          0 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888neon,          0 },
commit af660613eefbb474fd62f01b6f073fae389bd6f7
Author: Jonathan Morton <jmorton at sd070.hel.movial.fi>
Date:   Tue Jun 16 12:08:29 2009 -0400

    [NEON] Add transparent rect blitter.
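
The main saving in the new inner loop is that the solid colour is splatted and multiplied by its own alpha once, before the loop, so each pixel needs only one multiply-accumulate per channel. Roughly, in scalar C (illustration only; the function name is hypothetical and the byte-stride handling of the real kernel is omitted):

    #include <stdint.h>

    /* Scalar sketch of a row handled by PlainOver565_8pix_neon: the
     * foreground products and the inverse alpha are computed once per
     * rectangle, mirroring the vld4.8/vmull.u8/vmvn setup before the
     * "0:" loop label. */
    static void plain_over_565_row (uint32_t colour, uint16_t *dest, int n)
    {
        uint8_t  a  = colour >> 24;
        uint8_t  ia = 255 - a;
        uint16_t fr = (uint8_t)(colour >> 16) * a;   /* foreground red * alpha   */
        uint16_t fg = (uint8_t)(colour >>  8) * a;   /* foreground green * alpha */
        uint16_t fb = (uint8_t)(colour      ) * a;   /* foreground blue * alpha  */

        for (int i = 0; i < n; i++) {
            uint16_t d  = dest[i];
            uint8_t  dr = (d >> 11) << 3;          dr |= dr >> 5;
            uint8_t  dg = ((d >> 5) & 0x3F) << 2;  dg |= dg >> 6;
            uint8_t  db = (d & 0x1F) << 3;         db |= db >> 5;

            uint16_t r16 = fr + dr * ia;
            uint16_t g16 = fg + dg * ia;
            uint16_t b16 = fb + db * ia;

            dest[i] = (r16 & 0xF800) | ((g16 >> 5) & 0x07E0) | (b16 >> 11);
        }
    }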

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 1aefb5a..36a69aa 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -1805,6 +1805,144 @@ fbCompositeSolidMask_nx8x0565neon (
 	}
 }
 
+#ifdef USE_GCC_INLINE_ASM
+
+static inline void PlainOver565_8pix_neon(
+	uint32_t  colour,
+	uint16_t *dest,
+	uint32_t  destStride,  // bytes, not elements
+	uint32_t  count        // 8-pixel groups
+)
+{
+	// Inner loop for plain translucent rects (solid colour without alpha mask)
+	asm volatile (
+	"	vld4.8   {d20[],d21[],d22[],d23[]}, [%[colour]]  @ solid colour load/splat \n"
+	"	vmull.u8  q12, d23, d22              @ premultiply alpha red   \n"
+	"	vmull.u8  q13, d23, d21              @ premultiply alpha green \n"
+	"	vmull.u8  q14, d23, d20              @ premultiply alpha blue  \n"
+	"	vmvn      d18, d23                   @ inverse alpha for background \n"
+	"0:	@ loop\n"
+	"	vld1.16   {d0,d1}, [%[dest]]         @ load first pixels from framebuffer	\n"
+	"	vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels	\n"
+	"	vshrn.u16 d4, q0, #3                 @ unpack green				\n"
+	"	vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits		\n"
+	"	vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)	\n"
+	"	vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)	\n"
+	"	vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)	\n"
+	"	vmov      q0, q12                    @ retrieve foreground red   \n"
+	"	vmlal.u8  q0, d2, d18                @ blend red - my kingdom for a four-operand MLA \n"
+	"	vmov      q1, q13                    @ retrieve foreground green \n"
+	"	vmlal.u8  q1, d4, d18                @ blend green               \n"
+	"	vmov      q2, q14                    @ retrieve foreground blue  \n"
+	"	vmlal.u8  q2, d6, d18                @ blend blue                \n"
+	"	subs      %[count], %[count], #1     @ decrement/test loop counter		\n"
+	"	vsri.16   q0, q1, #5                 @ pack green behind red			\n"
+	"	vsri.16   q0, q2, #11                @ pack blue into pixels			\n"
+	"	vst1.16   {d0,d1}, [%[dest]]         @ store composited pixels			\n"
+	"	add %[dest], %[dest], %[destStride]  @ advance framebuffer pointer		\n"
+	"	bne 0b                               @ next please				\n"
+
+	// Clobbered registers marked as input/outputs
+	: [dest] "+r" (dest), [count] "+r" (count)
+
+	// Inputs
+	: [destStride] "r" (destStride), [colour] "r" (&colour)
+
+	// Clobbers, including the inputs we modify, and potentially lots of memory
+	: "q0", "q1", "q2", "q3", "q9", "q10", "q11", "q12", "q13", "q14", "cc", "memory"
+	);
+}
+
+void
+fbCompositeSolid_nx0565neon (
+	pixman_implementation_t * impl,
+	pixman_op_t op,
+	pixman_image_t * pSrc,
+	pixman_image_t * pMask,
+	pixman_image_t * pDst,
+	int32_t      xSrc,
+	int32_t      ySrc,
+	int32_t      xMask,
+	int32_t      yMask,
+	int32_t      xDst,
+	int32_t      yDst,
+	int32_t      width,
+	int32_t      height)
+{
+	uint32_t     src, srca;
+	uint16_t    *dstLine, *alignedLine;
+	uint32_t     dstStride;
+	uint32_t     kernelCount, copyCount;
+	uint8_t      kernelOffset, copyOffset;
+
+	fbComposeGetSolid(pSrc, src, pDst->bits.format);
+
+	// bail out if fully transparent
+	srca = src >> 24;
+	if(srca == 0)
+		return;
+	if(width == 0 || height == 0)
+		return;
+
+	if(width > NEON_SCANLINE_BUFFER_PIXELS) {
+		// split the blit, so we can use a fixed-size scanline buffer
+		// TODO: there must be a more elegant way of doing this.
+		int x;
+		for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
+			fbCompositeSolid_nx0565neon(impl, op, pSrc, pMask, pDst, xSrc+x, ySrc, xMask+x, yMask, xDst+x, yDst,
+										(x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
+		}
+		return;
+	}
+
+	fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+
+	// cover the width with the minimum number of aligned quadwords,
+	// while also keeping the number of columns to process to a minimum
+	{
+		unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF;
+		unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
+		unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
+
+		// the fast copy must always be quadword aligned
+		copyOffset = dstLine - ((uint16_t*) alignedLeft);
+		alignedLine = dstLine - copyOffset;
+		copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
+
+		if(alignedRight - alignedLeft > ceilingLength) {
+			// unaligned routine is tightest, and will not overrun
+			kernelCount = (uint32_t) (ceilingLength >> 4);
+			kernelOffset = copyOffset;
+		} else {
+			// aligned routine is equally tight, so it is safer to align
+			kernelCount = copyCount;
+			kernelOffset = 0;
+		}
+	}
+
+	{
+		uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
+
+		// row-major order
+		// left edge, middle block, right edge
+		for( ; height--; alignedLine += dstStride, dstLine += dstStride) {
+
+			// Uncached framebuffer access is really, really slow if we do it piecemeal.
+			// It should be much faster if we grab it all at once.
+			// One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
+			QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0);
+
+			// Apply the actual filter
+			PlainOver565_8pix_neon(src, scanLine + kernelOffset, 8 * sizeof(*dstLine), kernelCount);
+
+			// Copy the modified scanline back
+			QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2);
+		}
+	}
+}
+
+#endif  // USE_GCC_INLINE_ASM
+
 static const FastPathInfo arm_neon_fast_path_array[] = 
 {
     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       fbCompositeSrcAdd_8888x8x8neon,        0 },
@@ -1818,6 +1956,8 @@ static const FastPathInfo arm_neon_fast_path_array[] =
 #ifdef USE_GCC_INLINE_ASM
     { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSrc_16x16neon,              0 },
     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSrc_16x16neon,              0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   fbCompositeSolid_nx0565neon,           0 },
+    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_b5g6r5,   fbCompositeSolid_nx0565neon,           0 },
 #endif
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888neon,          0 },
     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888neon,          0 },
commit 8eeeca993252edc39da9c5c57545b81215fafc81
Author: Jonathan Morton <jmorton at sd070.hel.movial.fi>
Date:   Tue Jun 16 12:08:29 2009 -0400

    [NEON] Replace Ian's glyph-blitter with a better one.
    
    Each scanline of the destination is bulk-loaded into a cached buffer on
    the stack (using the QuadwordCopy_neon routine) before being processed.
    This is the primary benefit on uncached framebuffers, where it is
    necessary to minimise the number of accesses and to avoid write-to-read
    turnarounds.
    
    This also simplifies edge handling, since QuadwordCopy_neon() can do a
    precise writeback efficiently via the write-combiner, allowing the main
    routine to "over-read" the scanline edge safely when required.  This is
    why the glyph's mask data is also copied into a temporary buffer of
    known size.
    
    Each group of 8 pixels is then processed using fewer instructions,
    taking advantage of the lower precision requirements of the 6-bit
    destination (so a simpler pixel multiply can be used) and using a more
    efficient bit-repacking method.
    
    (As an aside, this patch removes nearly twice as much code as it
    introduces.  Most of the removal comes from duplication in Ian's inner
    loop, which had to handle narrow cases separately.  RVCT support is of
    course preserved.)
    
    We measured a doubling of performance by rendering 96-pixel-high glyph
    strings, which are fillrate-limited rather than latency/overhead-limited.
    Performance also improves, albeit by a smaller amount, on the more usual
    smaller text, demonstrating that internal overhead is not a problem.
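
The general shape of the scanline-buffered approach described above can be sketched as follows. This is an illustration only, with hypothetical names; the real code below uses QuadwordCopy_neon for the copies and also computes quadword-alignment offsets (copyOffset/kernelOffset) that the sketch omits:

    #include <stdint.h>
    #include <string.h>

    #define SCANLINE_PIXELS 1024   /* assumes width <= this; wider blits are split */

    /* Stage one destination scanline and its mask in padded stack buffers,
     * run an 8-pixels-at-a-time kernel entirely within the buffers (it may
     * safely over-read into the padding), then write back exactly `width`
     * pixels in one go. */
    static void blit_scanline_buffered (uint16_t *dst, const uint8_t *mask,
                                        int width, uint32_t colour,
                                        void (*kernel8)(uint32_t colour,
                                                        uint16_t *pixels,
                                                        const uint8_t *mask,
                                                        int groups))
    {
        uint16_t scan[SCANLINE_PIXELS + 8];    /* padding allows over-read */
        uint8_t  mbuf[SCANLINE_PIXELS + 8];

        memcpy (scan, dst,  width * sizeof *dst);   /* one bulk framebuffer read */
        memcpy (mbuf, mask, width);                 /* mask copy of known size   */

        kernel8 (colour, scan, mbuf, (width + 7) / 8);

        memcpy (dst, scan, width * sizeof *dst);    /* precise write-back */
    }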

diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 467a0dd..1aefb5a 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -632,343 +632,6 @@ fbCompositeSrc_8888x8x8888neon (
 }
 
 
-
-void
-fbCompositeSolidMask_nx8x0565neon (
-                               pixman_implementation_t * impl,
-                               pixman_op_t op,
-                               pixman_image_t * pSrc,
-                               pixman_image_t * pMask,
-                               pixman_image_t * pDst,
-                               int32_t      xSrc,
-                               int32_t      ySrc,
-                               int32_t      xMask,
-                               int32_t      yMask,
-                               int32_t      xDst,
-                               int32_t      yDst,
-                               int32_t      width,
-                               int32_t      height)
-{
-    uint32_t     src, srca;
-    uint16_t    *dstLine, *dst;
-    uint8_t     *maskLine, *mask;
-    int          dstStride, maskStride;
-    uint32_t     w;
-    uint8x8_t    sval2;
-    uint8x8x4_t  sval8;
-
-    fbComposeGetSolid(pSrc, src, pDst->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-        return;
-
-    sval2=vreinterpret_u8_u32(vdup_n_u32(src));
-    sval8.val[0]=vdup_lane_u8(sval2,0);
-    sval8.val[1]=vdup_lane_u8(sval2,1);
-    sval8.val[2]=vdup_lane_u8(sval2,2);
-    sval8.val[3]=vdup_lane_u8(sval2,3);
-
-    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
-    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
-
-    if (width>=8)
-    {
-        // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused
-        while (height--)
-        {
-            uint16_t *keep_dst=0;
-
-            dst = dstLine;
-            dstLine += dstStride;
-            mask = maskLine;
-            maskLine += maskStride;
-            w = width;
-
-#ifndef USE_GCC_INLINE_ASM
-            uint8x8_t alpha;
-            uint16x8_t dval, temp; 
-            uint8x8x4_t sval8temp;
-
-            alpha = vld1_u8((void*)mask);
-            dval = vld1q_u16((void*)dst);
-            keep_dst = dst;
-
-            sval8temp = neon8mul(sval8,alpha);
-            temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3]))));
-
-            mask += (w & 7);
-            dst += (w & 7);
-            w -= (w & 7);
-
-            while (w)
-            {
-                dval = vld1q_u16((void*)dst);
-	        alpha = vld1_u8((void*)mask);
-
-                vst1q_u16((void*)keep_dst,temp);
-                keep_dst = dst;
-
-                sval8temp = neon8mul(sval8,alpha);
-                temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3]))));
-
-                mask+=8;
-                dst+=8;
-                w-=8;
-            }
-            vst1q_u16((void*)keep_dst,temp);
-#else
-        asm volatile (
-                        "vdup.32      d0, %[src]\n\t"
-                        "vdup.8       d1, d0[1]\n\t"
-                        "vdup.8       d2, d0[2]\n\t"
-                        "vdup.8       d3, d0[3]\n\t"
-                        "vdup.8       d0, d0[0]\n\t"
-
-                        "vld1.8       {q12}, [%[dst]]\n\t"
-                        "vld1.8       {d31}, [%[mask]]\n\t"
-                        "mov  %[keep_dst], %[dst]\n\t"
-
-                        "and  ip, %[w], #7\n\t"
-                        "add  %[mask], %[mask], ip\n\t"
-                        "add  %[dst], %[dst], ip, LSL#1\n\t"
-                        "subs  %[w], %[w], ip\n\t"
-                        "b  9f\n\t"
-// LOOP
-                        "2:\n\t"
-
-                        "vld1.16      {q12}, [%[dst]]!\n\t"
-                        "vld1.8       {d31}, [%[mask]]!\n\t"
-                        "vst1.16      {q10}, [%[keep_dst]]\n\t"
-                        "sub  %[keep_dst], %[dst], #8*2\n\t"
-                        "subs  %[w], %[w], #8\n\t"
-                        "9:\n\t"
-// expand 0565 q12 to 8888 {d4-d7}
-                        "vmovn.u16    d4, q12\t\n"
-                        "vshr.u16     q11, q12, #5\t\n"
-                        "vshr.u16     q10, q12, #6+5\t\n"
-                        "vmovn.u16    d5, q11\t\n"
-                        "vmovn.u16    d6, q10\t\n"
-                        "vshl.u8      d4, d4, #3\t\n"
-                        "vshl.u8      d5, d5, #2\t\n"
-                        "vshl.u8      d6, d6, #3\t\n"
-                        "vsri.u8      d4, d4, #5\t\n"
-                        "vsri.u8      d5, d5, #6\t\n"
-                        "vsri.u8      d6, d6, #5\t\n"
-
-                        "vmull.u8     q10, d31, d0\n\t"
-                        "vmull.u8     q11, d31, d1\n\t"
-                        "vmull.u8     q12, d31, d2\n\t"
-                        "vmull.u8     q13, d31, d3\n\t"
-                        "vrshr.u16    q8, q10, #8\n\t"
-                        "vrshr.u16    q9, q11, #8\n\t"
-                        "vraddhn.u16  d20, q10, q8\n\t"
-                        "vraddhn.u16  d21, q11, q9\n\t"
-                        "vrshr.u16    q9, q13, #8\n\t"
-                        "vrshr.u16    q8, q12, #8\n\t"
-                        "vraddhn.u16  d23, q13, q9\n\t"
-                        "vraddhn.u16  d22, q12, q8\n\t"
-
-// duplicate in 4/2/1 & 8pix vsns
-                        "vmvn.8       d30, d23\n\t"
-                        "vmull.u8     q14, d30, d6\n\t"
-                        "vmull.u8     q13, d30, d5\n\t"
-                        "vmull.u8     q12, d30, d4\n\t"
-                        "vrshr.u16    q8, q14, #8\n\t"
-                        "vrshr.u16    q9, q13, #8\n\t"
-                        "vraddhn.u16  d6, q14, q8\n\t"
-                        "vrshr.u16    q8, q12, #8\n\t"
-                        "vraddhn.u16  d5, q13, q9\n\t"
-                        "vqadd.u8     d6, d6, d22\n\t"  // moved up
-                        "vraddhn.u16  d4, q12, q8\n\t"
-// intentionally don't calculate alpha
-// result in d4-d6
-
-//                      "vqadd.u8     d6, d6, d22\n\t"  ** moved up
-                        "vqadd.u8     d5, d5, d21\n\t"
-                        "vqadd.u8     d4, d4, d20\n\t"
-
-// pack 8888 {d20-d23} to 0565 q10
-                        "vshll.u8     q10, d6, #8\n\t"
-                        "vshll.u8     q3, d5, #8\n\t"
-                        "vshll.u8     q2, d4, #8\n\t"
-                        "vsri.u16     q10, q3, #5\t\n"
-                        "vsri.u16     q10, q2, #11\t\n"
-
-                        "bne 2b\n\t"
-
-                        "1:\n\t"
-                        "vst1.16      {q10}, [%[keep_dst]]\n\t"
-
-                        : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
-                        : [src] "r" (src)
-                        : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
-                          "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
-                          "d30","d31"
-                        );
-#endif
-        }
-    }
-    else
-    {
-        while (height--)
-        {
-            void *dst4=0, *dst2=0;
-
-            dst = dstLine;
-            dstLine += dstStride;
-            mask = maskLine;
-            maskLine += maskStride;
-            w = width;
-
-
-#ifndef USE_GCC_INLINE_ASM
-            uint8x8_t alpha;
-            uint16x8_t dval, temp;
-            uint8x8x4_t sval8temp;
-
-            if (w&4)
-            {
-                alpha = vreinterpret_u8_u32(vld1_lane_u32((void*)mask,vreinterpret_u32_u8(alpha),1));
-                dval = vreinterpretq_u16_u64(vld1q_lane_u64((void*)dst,vreinterpretq_u64_u16(dval),1));
-                dst4=dst;
-                mask+=4;
-                dst+=4;
-            }
-            if (w&2)
-            {
-                alpha = vreinterpret_u8_u16(vld1_lane_u16((void*)mask,vreinterpret_u16_u8(alpha),1));
-                dval = vreinterpretq_u16_u32(vld1q_lane_u32((void*)dst,vreinterpretq_u32_u16(dval),1));
-                dst2=dst;
-                mask+=2;
-                dst+=2;
-            }
-            if (w&1)
-            {
-                alpha = vld1_lane_u8((void*)mask,alpha,1);
-                dval = vld1q_lane_u16((void*)dst,dval,1);
-            }
-
-            sval8temp = neon8mul(sval8,alpha);
-            temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3]))));
-
-            if (w&1)
-                vst1q_lane_u16((void*)dst,temp,1);
-            if (w&2)
-                vst1q_lane_u32((void*)dst2,vreinterpretq_u32_u16(temp),1);
-            if (w&4)
-                vst1q_lane_u64((void*)dst4,vreinterpretq_u64_u16(temp),1);
-#else
-            asm volatile (
-                        "vdup.32      d0, %[src]\n\t"
-                        "vdup.8       d1, d0[1]\n\t"
-                        "vdup.8       d2, d0[2]\n\t"
-                        "vdup.8       d3, d0[3]\n\t"
-                        "vdup.8       d0, d0[0]\n\t"
-
-                        "tst  %[w], #4\t\n"
-                        "beq  skip_load4\t\n"
-
-                        "vld1.64      {d25}, [%[dst]]\n\t"
-                        "vld1.32      {d31[1]}, [%[mask]]\n\t"
-                        "mov  %[dst4], %[dst]\t\n"
-                        "add  %[mask], %[mask], #4\t\n"
-                        "add  %[dst], %[dst], #4*2\t\n"
-
-                        "skip_load4:\t\n"
-                        "tst  %[w], #2\t\n"
-                        "beq  skip_load2\t\n"
-                        "vld1.32      {d24[1]}, [%[dst]]\n\t"
-                        "vld1.16      {d31[1]}, [%[mask]]\n\t"
-                        "mov  %[dst2], %[dst]\t\n"
-                        "add  %[mask], %[mask], #2\t\n"
-                        "add  %[dst], %[dst], #2*2\t\n"
-
-                        "skip_load2:\t\n"
-                        "tst  %[w], #1\t\n"
-                        "beq  skip_load1\t\n"
-                        "vld1.16      {d24[1]}, [%[dst]]\n\t"
-                        "vld1.8       {d31[1]}, [%[mask]]\n\t"
-
-                        "skip_load1:\t\n"
-// expand 0565 q12 to 8888 {d4-d7}
-                        "vmovn.u16    d4, q12\t\n"
-                        "vshr.u16     q11, q12, #5\t\n"
-                        "vshr.u16     q10, q12, #6+5\t\n"
-                        "vmovn.u16    d5, q11\t\n"
-                        "vmovn.u16    d6, q10\t\n"
-                        "vshl.u8      d4, d4, #3\t\n"
-                        "vshl.u8      d5, d5, #2\t\n"
-                        "vshl.u8      d6, d6, #3\t\n"
-                        "vsri.u8      d4, d4, #5\t\n"
-                        "vsri.u8      d5, d5, #6\t\n"
-                        "vsri.u8      d6, d6, #5\t\n"
-
-                        "vmull.u8     q10, d31, d0\n\t"
-                        "vmull.u8     q11, d31, d1\n\t"
-                        "vmull.u8     q12, d31, d2\n\t"
-                        "vmull.u8     q13, d31, d3\n\t"
-                        "vrshr.u16    q8, q10, #8\n\t"
-                        "vrshr.u16    q9, q11, #8\n\t"
-                        "vraddhn.u16  d20, q10, q8\n\t"
-                        "vraddhn.u16  d21, q11, q9\n\t"
-                        "vrshr.u16    q9, q13, #8\n\t"
-                        "vrshr.u16    q8, q12, #8\n\t"
-                        "vraddhn.u16  d23, q13, q9\n\t"
-                        "vraddhn.u16  d22, q12, q8\n\t"
-
-// duplicate in 4/2/1 & 8pix vsns
-                        "vmvn.8       d30, d23\n\t"
-                        "vmull.u8     q14, d30, d6\n\t"
-                        "vmull.u8     q13, d30, d5\n\t"
-                        "vmull.u8     q12, d30, d4\n\t"
-                        "vrshr.u16    q8, q14, #8\n\t"
-                        "vrshr.u16    q9, q13, #8\n\t"
-                        "vraddhn.u16  d6, q14, q8\n\t"
-                        "vrshr.u16    q8, q12, #8\n\t"
-                        "vraddhn.u16  d5, q13, q9\n\t"
-                        "vqadd.u8     d6, d6, d22\n\t"  // moved up
-                        "vraddhn.u16  d4, q12, q8\n\t"
-// intentionally don't calculate alpha
-// result in d4-d6
-
-//                      "vqadd.u8     d6, d6, d22\n\t"  ** moved up
-                        "vqadd.u8     d5, d5, d21\n\t"
-                        "vqadd.u8     d4, d4, d20\n\t"
-
-// pack 8888 {d20-d23} to 0565 q10
-                        "vshll.u8     q10, d6, #8\n\t"
-                        "vshll.u8     q3, d5, #8\n\t"
-                        "vshll.u8     q2, d4, #8\n\t"
-                        "vsri.u16     q10, q3, #5\t\n"
-                        "vsri.u16     q10, q2, #11\t\n"
-
-                        "tst  %[w], #1\n\t"
-                        "beq skip_store1\t\n"
-                        "vst1.16      {d20[1]}, [%[dst]]\t\n"
-                        "skip_store1:\t\n"
-                        "tst  %[w], #2\n\t"
-                        "beq  skip_store2\t\n"
-                        "vst1.32      {d20[1]}, [%[dst2]]\t\n"
-                        "skip_store2:\t\n"
-                        "tst  %[w], #4\n\t"
-                        "beq skip_store4\t\n"
-                        "vst1.16      {d21}, [%[dst4]]\t\n"
-                        "skip_store4:\t\n"
-
-                        : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
-                        : [src] "r" (src)
-                        : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
-                          "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
-                          "d30","d31"
-                        );
-#endif
-        }
-    }
-}
-
-
-
 void
 fbCompositeSolidMask_nx8x8888neon (
                             pixman_implementation_t * impl,
@@ -1964,6 +1627,184 @@ static inline void QuadwordCopy_neon(
 	}
 }
 
+static inline void SolidOver565_8pix_neon(
+	uint32_t  glyphColour,
+	uint16_t *dest,
+	uint8_t  *inMask,
+	uint32_t  destStride,  // bytes, not elements
+	uint32_t  maskStride,
+	uint32_t  count        // 8-pixel groups
+)
+{
+	// Inner loop of glyph blitter (solid colour, alpha mask)
+
+#ifdef USE_GCC_INLINE_ASM
+
+	asm volatile (
+	"	vld4.8 {d20[],d21[],d22[],d23[]}, [%[glyphColour]]  @ splat solid colour components	\n"
+	"0:	@ loop																				\n"
+	"	vld1.16   {d0,d1}, [%[dest]]         @ load first pixels from framebuffer			\n"
+	"	vld1.8    {d17}, [%[inMask]]         @ load alpha mask of glyph						\n"
+	"	vmull.u8  q9, d17, d23               @ apply glyph colour alpha to mask				\n"
+	"	vshrn.u16 d17, q9, #8                @ reformat it to match original mask			\n"
+	"	vmvn      d18, d17                   @ we need the inverse mask for the background	\n"
+	"	vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits				\n"
+	"	vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels			\n"
+	"	vshrn.u16 d4, q0, #3                 @ unpack green									\n"
+	"	vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)			\n"
+	"	vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)		\n"
+	"	vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)			\n"
+	"	vmull.u8  q1, d2, d18                @ apply inverse mask to background red...		\n"
+	"	vmull.u8  q2, d4, d18                @ ...green...									\n"
+	"	vmull.u8  q3, d6, d18                @ ...blue										\n"
+	"	subs      %[count], %[count], #1     @ decrement/test loop counter					\n"
+	"	vmlal.u8  q1, d17, d22               @ add masked foreground red...					\n"
+	"	vmlal.u8  q2, d17, d21               @ ...green...									\n"
+	"	vmlal.u8  q3, d17, d20               @ ...blue										\n"
+	"	add %[inMask], %[inMask], %[maskStride] @ advance mask pointer, while we wait		\n"
+	"	vsri.16   q1, q2, #5                 @ pack green behind red						\n"
+	"	vsri.16   q1, q3, #11                @ pack blue into pixels						\n"
+	"	vst1.16   {d2,d3}, [%[dest]]         @ store composited pixels						\n"
+	"	add %[dest], %[dest], %[destStride]  @ advance framebuffer pointer					\n"
+	"	bne 0b                               @ next please									\n"
+
+	// Clobbered registers marked as input/outputs
+	: [dest] "+r" (dest), [inMask] "+r" (inMask), [count] "+r" (count)
+
+	// Inputs
+	: [destStride] "r" (destStride), [maskStride] "r" (maskStride), [glyphColour] "r" (&glyphColour)
+
+	// Clobbers, including the inputs we modify, and potentially lots of memory
+	: "q0", "q1", "q2", "q3", "d17", "q9", "q10", "q11", "q12", "cc", "memory"
+	);
+
+#else
+
+	uint8x8x4_t solidColour = vld4_dup_u8((uint8_t*) &glyphColour);
+
+	while(count--)
+	{
+		uint16x8_t  pixels = vld1q_u16(dest);
+		uint8x8_t   mask = vshrn_n_u16(vmull_u8(solidColour.val[3], vld1_u8(inMask)), 8);
+		uint8x8_t  iMask = vmvn_u8(mask);
+
+		uint8x8_t  tRed   = vshrn_n_u16(pixels, 8);
+		uint8x8_t  tGreen = vshrn_n_u16(pixels, 3);
+		uint8x8_t  tBlue  = vshrn_n_u16(vsliq_n_u16(pixels, pixels, 5), 2);
+
+		uint16x8_t sRed   = vmull_u8(vsri_n_u8(tRed  , tRed  , 5), iMask);
+		uint16x8_t sGreen = vmull_u8(vsri_n_u8(tGreen, tGreen, 6), iMask);
+		uint16x8_t sBlue  = vmull_u8(          tBlue             , iMask);
+
+		sRed   = vmlal_u8(sRed  , mask, solidColour.val[2]);
+		sGreen = vmlal_u8(sGreen, mask, solidColour.val[1]);
+		sBlue  = vmlal_u8(sBlue , mask, solidColour.val[0]);
+
+		pixels = vsriq_n_u16(sRed, sGreen, 5);
+		pixels = vsriq_n_u16(pixels, sBlue, 11);
+		vst1q_u16(dest, pixels);
+
+		dest += destStride / sizeof(*dest);  // destStride is in bytes, not elements
+		inMask += maskStride;
+	}
+
+#endif
+}
+
+void
+fbCompositeSolidMask_nx8x0565neon (
+	pixman_implementation_t * impl,
+	pixman_op_t op,
+	pixman_image_t * pSrc,
+	pixman_image_t * pMask,
+	pixman_image_t * pDst,
+	int32_t      xSrc,
+	int32_t      ySrc,
+	int32_t      xMask,
+	int32_t      yMask,
+	int32_t      xDst,
+	int32_t      yDst,
+	int32_t      width,
+	int32_t      height)
+{
+	uint32_t     src, srca;
+	uint16_t    *dstLine, *alignedLine;
+	uint8_t     *maskLine;
+	uint32_t     dstStride, maskStride;
+	uint32_t     kernelCount, copyCount;
+	uint8_t      kernelOffset, copyOffset;
+
+	fbComposeGetSolid(pSrc, src, pDst->bits.format);
+
+	// bail out if fully transparent or degenerate
+	srca = src >> 24;
+	if(srca == 0)
+		return;
+	if(width == 0 || height == 0)
+		return;
+
+	if(width > NEON_SCANLINE_BUFFER_PIXELS) {
+		// split the blit, so we can use a fixed-size scanline buffer
+		// TODO: there must be a more elegant way of doing this.
+		int x;
+		for(x=0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) {
+			fbCompositeSolidMask_nx8x0565neon(impl, op, pSrc, pMask, pDst, xSrc+x, ySrc, xMask+x, yMask, xDst+x, yDst,
+											  (x+NEON_SCANLINE_BUFFER_PIXELS > width) ? width-x : NEON_SCANLINE_BUFFER_PIXELS, height);
+		}
+		return;
+	}
+
+	fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
+	fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
+
+	// cover the width with the minimum number of aligned quadwords,
+	// while also keeping the number of columns to process to a minimum
+	{
+		unsigned long alignedLeft = (unsigned long)(dstLine) & ~0xF;
+		unsigned long alignedRight = (((unsigned long)(dstLine + width)) + 0xF) & ~0xF;
+		unsigned long ceilingLength = (((unsigned long) width) * sizeof(*dstLine) + 0xF) & ~0xF;
+
+		// the fast copy must always be quadword aligned
+		copyOffset = dstLine - ((uint16_t*) alignedLeft);
+		alignedLine = dstLine - copyOffset;
+		copyCount = (uint32_t) ((alignedRight - alignedLeft) >> 4);
+
+		if(alignedRight - alignedLeft > ceilingLength) {
+			// unaligned routine is tightest, and will not overrun
+			kernelCount = (uint32_t) (ceilingLength >> 4);
+			kernelOffset = copyOffset;
+		} else {
+			// aligned routine is equally tight, so it is safer to align
+			kernelCount = copyCount;
+			kernelOffset = 0;
+		}
+	}
+
+	{
+		uint16_t scanLine[NEON_SCANLINE_BUFFER_PIXELS + 8]; // deliberately not initialised
+		uint8_t glyphLine[NEON_SCANLINE_BUFFER_PIXELS + 8];
+		int y = height;
+
+		// row-major order
+		// left edge, middle block, right edge
+		for( ; y--; maskLine += maskStride, alignedLine += dstStride, dstLine += dstStride) {
+			// We don't want to overrun the edges of the glyph, so realign the edge data into known buffers
+			QuadwordCopy_neon(glyphLine + copyOffset, maskLine, width >> 4, width & 0xF);
+
+			// Uncached framebuffer access is really, really slow if we do it piecemeal.
+			// It should be much faster if we grab it all at once.
+			// One scanline should easily fit in L1 cache, so this should not waste RAM bandwidth.
+			QuadwordCopy_neon(scanLine, alignedLine, copyCount, 0);
+
+			// Apply the actual filter
+			SolidOver565_8pix_neon(src, scanLine + kernelOffset, glyphLine + kernelOffset, 8 * sizeof(*dstLine), 8, kernelCount);
+
+			// Copy the modified scanline back
+			QuadwordCopy_neon(dstLine, scanLine + copyOffset, width >> 3, (width & 7) * 2);
+		}
+	}
+}
+
 static const FastPathInfo arm_neon_fast_path_array[] = 
 {
     { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       fbCompositeSrcAdd_8888x8x8neon,        0 },

