xserver: Branch 'xgl-0-0-1' - 2 commits

David Reveman davidr at kemper.freedesktop.org
Mon Jun 12 11:51:07 EEST 2006


 fb/fbmmx.c     | 1393 +++++++++++++++++++++++++++++++++++++++++++++------------
 fb/fbmmx.h     |   14 
 fb/fbpict.c    |   61 ++
 hw/xgl/xglxv.c |   13 
 4 files changed, 1200 insertions(+), 281 deletions(-)

New commits:
diff-tree 450e02cce24cec47b7f08b87952d9f3430f2468b (from 75846b85b0c46d0de52b03763ab2f44ab85dc963)
Author: David Reveman <davidr at novell.com>
Date:   Mon Jun 12 10:41:02 2006 +0200

    Add fast scaling path to fbCompositeSrc_8888x8888mmx and
    fbCompositeSrc_8888x8x8888mmx.

diff --git a/fb/fbmmx.c b/fb/fbmmx.c
index c2c5598..2e5a068 100644
--- a/fb/fbmmx.c
+++ b/fb/fbmmx.c
@@ -855,6 +855,187 @@ void fbComposeSetupMMX(void)
     } 
 }
 
+static __inline__ CARD8
+interpolate_bilinear (int   distx,
+		      int   idistx,
+		      int   disty,
+		      int   idisty,
+		      CARD8 tl,
+		      CARD8 tr,
+		      CARD8 bl,
+		      CARD8 br)
+{
+    return ((tl * idistx + tr * distx) * idisty +
+	    (bl * idistx + br * distx) * disty) >> 16;
+}
+
+static __inline__ CARD32
+interpolate_bilinear_8888 (int   distx,
+			   int   idistx,
+			   int   disty,
+			   int   idisty,
+			   CARD8 *l00,
+			   CARD8 *l01,
+			   CARD8 *l10,
+			   CARD8 *l11,
+			   int   x00,
+			   int   x01,
+			   int   x10,
+			   int   x11)
+{
+    CARD8 buffer[4];
+
+    buffer[0] = interpolate_bilinear (distx, idistx, disty, idisty,
+				      l00[x00], l01[x01],
+				      l10[x10], l11[x11]);
+
+    buffer[1] = interpolate_bilinear (distx, idistx, disty, idisty,
+				      l00[x00 + 1], l01[x01 + 1],
+				      l10[x10 + 1], l11[x11 + 1]);
+
+    buffer[2] = interpolate_bilinear (distx, idistx, disty, idisty,
+				      l00[x00 + 2], l01[x01 + 2],
+				      l10[x10 + 2], l11[x11 + 2]);
+
+    buffer[3] = interpolate_bilinear (distx, idistx, disty, idisty,
+				      l00[x00 + 3], l01[x01 + 3],
+				      l10[x10 + 3], l11[x11 + 3]);
+
+    return *((CARD32 *) buffer);
+}
+
+static __inline__ CARD32
+fetch_bilinear2_8888 (int   distx,
+		      int   idistx,
+		      int   disty,
+		      int   idisty,
+		      CARD8 *l0,
+		      CARD8 *l1,
+		      int   x0,
+		      int   x1)
+{
+    return interpolate_bilinear_8888 (distx,
+				      idistx,
+				      disty,
+				      idisty,
+				      l0,
+				      l0,
+				      l1,
+				      l1,
+				      x0,
+				      x0 + 4,
+				      x1,
+				      x1 + 4);
+}
+
+static __inline__ CARD32
+fetch_bilinear_8888 (int   distx,
+		     int   idistx,
+		     int   disty,
+		     int   idisty,
+		     CARD8 *l0,
+		     CARD8 *l1,
+		     int   x)
+{
+    return fetch_bilinear2_8888 (distx, idistx, disty, idisty, l0, l1, x, x);
+}
+
+static CARD32 _zero32x2[2] = { 0x0, 0x0 };
+static CARD8  *_zero8x8 = (CARD8 *) _zero32x2;
+
+static __inline__ int
+set_scale_steps (FbBits   *src,
+		 FbStride srcStride,
+		 int	  xStart,
+		 int	  xStep,
+		 int	  width,
+		 int	  line,
+		 int	  lastLine,
+		 int	  repeatType,
+		 CARD8	  **s0,
+		 CARD8	  **s1,
+		 int	  *x0,
+		 int	  *x0Step,
+		 int	  *x1,
+		 int	  *x1Step)
+{
+    if (line < 0)
+    {
+	if (repeatType == RepeatPad)
+	{
+	    *s0 = (CARD8 *) src;
+	    *s1 = (CARD8 *) src;
+
+	    *x0     = xStart;
+	    *x0Step = xStep;
+	    *x1     = xStart;
+	    *x1Step = xStep;
+	}
+	else
+	{
+	    if (line == -1)
+	    {
+		*s0 = _zero8x8;
+
+		*x0     = 0;
+		*x0Step = 0;
+
+		*s1 = (CARD8 *) src;
+
+		*x1     = xStart;
+		*x1Step = xStep;
+	    }
+	    else
+	    {
+		return 0;
+	    }
+	}
+    }
+    else if (line >= lastLine)
+    {
+	if (repeatType == RepeatPad)
+	{
+	    *s0 = (CARD8 *) (src + srcStride * lastLine);
+	    *s1 = (CARD8 *) (src + srcStride * lastLine);
+
+	    *x0     = xStart;
+	    *x0Step = xStep;
+	    *x1     = xStart;
+	    *x1Step = xStep;
+	}
+	else
+	{
+	    if (line == lastLine)
+	    {
+		*s0 = (CARD8 *) (src + srcStride * line);
+
+		*x0     = xStart;
+		*x0Step = xStep;
+
+		*s1 = _zero8x8;
+
+		*x1     = 0;
+		*x1Step = 0;
+	    }
+	    else
+	    {
+		return 0;
+	    }
+	}
+    }
+    else
+    {
+	*s0 = (CARD8 *) (src + srcStride * line);
+	*s1 = (CARD8 *) (src + srcStride * (line + 1));
+
+	*x0     = xStart;
+	*x0Step = xStep;
+	*x1     = xStart;
+	*x1Step = xStep;
+    }
+
+    return width;
+}
 
 /* ------------------ MMX code paths called from fbpict.c ----------------------- */
 
@@ -1150,109 +1331,333 @@ fbCompositeSrc_8888x8x8888mmx (CARD8	op,
     CHECKPOINT();
     
     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
-    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
 
     mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
     vmask = load8888 (mask);
     srca = MC(4x00ff);
-    
-    while (height--)
+
+    if (pSrc->transform)
     {
-	dst = dstLine;
-	dstLine += dstStride;
-	src = srcLine;
-	srcLine += srcStride;
-	w = width;
+	CARD8	 *src0, *src1;
+	int	 xStart, x, x0, x1, y, line, xStep, x0Step, x1Step, yStep;
+	int	 distx, idistx, disty, idisty;
+	int	 srcEnd = pSrc->pDrawable->width << 16;
+	int	 srcEndIndex = (pSrc->pDrawable->width - 1) << 16;
+	int	 lastLine = pSrc->pDrawable->height - 1;
+	FbStride srcStride;
+	int	 xOff, yOff;
+	int	 bpp;
+	__m64	 d, s;
 
-	while (w && (unsigned long)dst & 7)
+	fbGetDrawable (pSrc->pDrawable, srcLine, srcStride, bpp, xOff, yOff);
+
+	xStep = pSrc->transform->matrix[0][0];
+	yStep = pSrc->transform->matrix[1][1];
+
+	xStart = pSrc->transform->matrix[0][2] + xStep * (xSrc + xOff);
+	y      = pSrc->transform->matrix[1][2] + yStep * (ySrc + yOff);
+
+	while (height--)
 	{
-	    __m64 s = load8888 (*src);
-	    __m64 d = load8888 (*dst);
-	    
-	    *dst = store8888 (in_over (s, srca, vmask, d));
-	    
-	    w--;
-	    dst++;
-	    src++;
+	    disty  = (y >> 8) & 0xff;
+	    idisty = 256 - disty;
+	    line   = y >> 16;
+
+	    dst = dstLine;
+	    dstLine += dstStride;
+
+	    x = xStart;
+	    w = set_scale_steps (srcLine, srcStride,
+				 xStart, xStep, width, line, lastLine,
+				 pSrc->repeatType,
+				 &src0, &src1, &x0, &x0Step, &x1, &x1Step);
+
+	    if (pSrc->filter == PictFilterBilinear)
+	    {
+		if (pSrc->repeatType == RepeatPad)
+		{
+		    if (w && x < 0)
+		    {
+			s = load8888 (fetch_bilinear_8888 (0, 255,
+							   disty, idisty,
+							   src0, src1,
+							   0));
+
+			while (w && x < 0)
+			{
+			    d = load8888 (*dst);
+
+			    *dst = store8888 (in_over (s, srca, vmask, d));
+
+			    x   += xStep;
+			    x0  += x0Step;
+			    x1  += x1Step;
+			    dst += 1;
+			    w   -= 1;
+			}
+		    }
+		}
+		else
+		{
+		    while (w && x < -xFixed1)
+		    {
+			x   += xStep;
+			x0  += x0Step;
+			x1  += x1Step;
+			dst += 1;
+			w   -= 1;
+		    }
+
+		    while (w && x < 0)
+		    {
+			distx  = (x >> 8) & 0xff;
+			idistx = 256 - distx;
+
+			d = load8888 (*dst);
+			s = load8888 (
+			    interpolate_bilinear_8888 (distx, idistx,
+						       disty, idisty,
+						       _zero8x8, src0,
+						       _zero8x8, src1,
+						       0, 0, 0, 0));
+
+			*dst = store8888 (in_over (s, srca, vmask, d));
+
+			x   += xStep;
+			x0  += x0Step;
+			x1  += x1Step;
+			dst += 1;
+			w   -= 1;
+		    }
+		}
+
+		while (w && x < srcEndIndex)
+		{
+		    distx  = (x >> 8) & 0xff;
+		    idistx = 256 - distx;
+
+		    d = load8888 (*dst);
+		    s = load8888 (fetch_bilinear2_8888 (distx, idistx,
+							disty, idisty,
+							src0, src1,
+							(x0 >> 14) & ~3,
+							(x1 >> 14) & ~3));
+
+		    *dst = store8888 (in_over (s, srca, vmask, d));
+
+		    x   += xStep;
+		    x0  += x0Step;
+		    x1  += x1Step;
+		    dst += 1;
+		    w   -= 1;
+		}
+
+		if (pSrc->repeatType == RepeatPad)
+		{
+		    if (w)
+		    {
+			s = load8888 (fetch_bilinear_8888 (0, 255,
+							   disty, idisty,
+							   src0, src1,
+							   srcEndIndex >> 16));
+
+			while (w)
+			{
+			    d = load8888 (*dst);
+
+			    *dst = store8888 (in_over (s, srca, vmask, d));
+
+			    dst += 1;
+			    w   -= 1;
+			}
+		    }
+		}
+		else
+		{
+		    while (w && x < srcEnd)
+		    {
+			distx  = (x >> 8) & 0xff;
+			idistx = 256 - distx;
+
+			d = load8888 (*dst);
+			s = load8888 (
+			    interpolate_bilinear_8888 (distx, idistx,
+						       disty, idisty,
+						       src0, _zero8x8,
+						       src1, _zero8x8,
+						       (x0 >> 14) & ~3, 0,
+						       (x1 >> 14) & ~3, 0));
+
+			*dst = store8888 (in_over (s, srca, vmask, d));
+
+			x   += xStep;
+			x0  += x0Step;
+			x1  += x1Step;
+			dst += 1;
+			w   -= 1;
+		    }
+		}
+	    }
+	    else
+	    {
+		if (pSrc->repeatType == RepeatPad)
+		{
+		    s  = load8888 (*((CARD32 *) src0));
+
+		    while (w && x < 0)
+		    {
+			d = load8888 (*dst);
+
+			*dst = store8888 (in_over (s, srca, vmask, d));
+
+			x   += xStep;
+			dst += 1;
+			w   -= 1;
+		    }
+		}
+		else
+		{
+		    while (w && x < 0)
+		    {
+			x   += xStep;
+			dst += 1;
+			w   -= 1;
+		    }
+		}
+
+		while (w && x < srcEnd)
+		{
+		    d = load8888 (*dst);
+		    s = load8888 (((CARD32 *) src0)[x >> 16]);
+
+		    *dst = store8888 (in_over (s, srca, vmask, d));
+
+		    x   += xStep;
+		    dst += 1;
+		    w   -= 1;
+		}
+
+		if (w && pSrc->repeatType == RepeatPad)
+		{
+		    s  = load8888 (((CARD32 *) src0)[srcEndIndex >> 16]);
+
+		    while (w)
+		    {
+			d = load8888 (*dst);
+
+			*dst = store8888 (in_over (s, srca, vmask, d));
+
+			dst += 1;
+			w   -= 1;
+		    }
+		}
+	    }
+
+	    y += yStep;
 	}
+    }
+    else
+    {
+	fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
 
-	while (w >= 16)
+	while (height--)
 	{
-	    __m64 vd0 = *(__m64 *)(dst + 0);
-	    __m64 vd1 = *(__m64 *)(dst + 2);
-	    __m64 vd2 = *(__m64 *)(dst + 4);
-	    __m64 vd3 = *(__m64 *)(dst + 6);
-	    __m64 vd4 = *(__m64 *)(dst + 8);
-	    __m64 vd5 = *(__m64 *)(dst + 10);
-	    __m64 vd6 = *(__m64 *)(dst + 12);
-	    __m64 vd7 = *(__m64 *)(dst + 14);
-
-	    __m64 vs0 = *(__m64 *)(src + 0);
-	    __m64 vs1 = *(__m64 *)(src + 2);
-	    __m64 vs2 = *(__m64 *)(src + 4);
-	    __m64 vs3 = *(__m64 *)(src + 6);
-	    __m64 vs4 = *(__m64 *)(src + 8);
-	    __m64 vs5 = *(__m64 *)(src + 10);
-	    __m64 vs6 = *(__m64 *)(src + 12);
-	    __m64 vs7 = *(__m64 *)(src + 14);
-
-	    vd0 = (__m64)pack8888 (
-		in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
-		in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
-	
-	    vd1 = (__m64)pack8888 (
-		in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
-		in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
-	
-	    vd2 = (__m64)pack8888 (
-		in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
-		in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
-	
-	    vd3 = (__m64)pack8888 (
-		in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
-		in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
-	
-	    vd4 = (__m64)pack8888 (
-		in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
-		in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
-	
-	    vd5 = (__m64)pack8888 (
-		in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
-		in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
-	
-	    vd6 = (__m64)pack8888 (
-		in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
-		in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
-	
-	    vd7 = (__m64)pack8888 (
-		in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
-		in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
-
-    	    *(__m64 *)(dst + 0) = vd0;
-	    *(__m64 *)(dst + 2) = vd1;
-	    *(__m64 *)(dst + 4) = vd2;
-	    *(__m64 *)(dst + 6) = vd3;
-	    *(__m64 *)(dst + 8) = vd4;
-	    *(__m64 *)(dst + 10) = vd5;
-	    *(__m64 *)(dst + 12) = vd6;
-	    *(__m64 *)(dst + 14) = vd7;
-	
-	    w -= 16;
-	    dst += 16;
-	    src += 16;
-	}
+	    dst = dstLine;
+	    dstLine += dstStride;
+	    src = srcLine;
+	    srcLine += srcStride;
+	    w = width;
+
+	    while (w && (unsigned long)dst & 7)
+	    {
+		__m64 s = load8888 (*src);
+		__m64 d = load8888 (*dst);
+	    
+		*dst = store8888 (in_over (s, srca, vmask, d));
+	    
+		w--;
+		dst++;
+		src++;
+	    }
+
+	    while (w >= 16)
+	    {
+		__m64 vd0 = *(__m64 *)(dst + 0);
+		__m64 vd1 = *(__m64 *)(dst + 2);
+		__m64 vd2 = *(__m64 *)(dst + 4);
+		__m64 vd3 = *(__m64 *)(dst + 6);
+		__m64 vd4 = *(__m64 *)(dst + 8);
+		__m64 vd5 = *(__m64 *)(dst + 10);
+		__m64 vd6 = *(__m64 *)(dst + 12);
+		__m64 vd7 = *(__m64 *)(dst + 14);
+
+		__m64 vs0 = *(__m64 *)(src + 0);
+		__m64 vs1 = *(__m64 *)(src + 2);
+		__m64 vs2 = *(__m64 *)(src + 4);
+		__m64 vs3 = *(__m64 *)(src + 6);
+		__m64 vs4 = *(__m64 *)(src + 8);
+		__m64 vs5 = *(__m64 *)(src + 10);
+		__m64 vs6 = *(__m64 *)(src + 12);
+		__m64 vs7 = *(__m64 *)(src + 14);
+
+		vd0 = (__m64)pack8888 (
+		    in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
+		    in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
+	
+		vd1 = (__m64)pack8888 (
+		    in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
+		    in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
+	
+		vd2 = (__m64)pack8888 (
+		    in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
+		    in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
+	
+		vd3 = (__m64)pack8888 (
+		    in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
+		    in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
+	
+		vd4 = (__m64)pack8888 (
+		    in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
+		    in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
+	
+		vd5 = (__m64)pack8888 (
+		    in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
+		    in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
+	
+		vd6 = (__m64)pack8888 (
+		    in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
+		    in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
+	
+		vd7 = (__m64)pack8888 (
+		    in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
+		    in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
+
+		*(__m64 *)(dst + 0) = vd0;
+		*(__m64 *)(dst + 2) = vd1;
+		*(__m64 *)(dst + 4) = vd2;
+		*(__m64 *)(dst + 6) = vd3;
+		*(__m64 *)(dst + 8) = vd4;
+		*(__m64 *)(dst + 10) = vd5;
+		*(__m64 *)(dst + 12) = vd6;
+		*(__m64 *)(dst + 14) = vd7;
+	
+		w -= 16;
+		dst += 16;
+		src += 16;
+	    }
 	
-	while (w)
-	{
-	    __m64 s = load8888 (*src);
-	    __m64 d = load8888 (*dst);
+	    while (w)
+	    {
+		__m64 s = load8888 (*src);
+		__m64 d = load8888 (*dst);
 	    
-	    *dst = store8888 (in_over (s, srca, vmask, d));
+		*dst = store8888 (in_over (s, srca, vmask, d));
 	    
-	    w--;
-	    dst++;
-	    src++;
+		w--;
+		dst++;
+		src++;
+	    }
 	}
     }
 
@@ -1277,61 +1682,286 @@ fbCompositeSrc_8888x8888mmx (CARD8	op,
     CARD32	*srcLine, *src;
     FbStride	dstStride, srcStride;
     CARD16	w;
-    __m64  srca;
     
     CHECKPOINT();
     
     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
-    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
 
-    srca = MC (4x00ff);
-    
-    while (height--)
+    if (pSrc->transform)
     {
-	dst = dstLine;
-	dstLine += dstStride;
-	src = srcLine;
-	srcLine += srcStride;
-	w = width;
+	CARD8	 *src0, *src1;
+	int	 xStart, x, x0, x1, y, line, xStep, x0Step, x1Step, yStep;
+	int	 distx, idistx, disty, idisty;
+	int	 srcEnd = pSrc->pDrawable->width << 16;
+	int	 srcEndIndex = (pSrc->pDrawable->width - 1) << 16;
+	int	 lastLine = pSrc->pDrawable->height - 1;
+	FbStride srcStride;
+	int	 xOff, yOff;
+	int	 bpp;
+	__m64	 d, s, sa;
 
-	while (w && (unsigned long)dst & 7)
+	fbGetDrawable (pSrc->pDrawable, srcLine, srcStride, bpp, xOff, yOff);
+
+	xStep = pSrc->transform->matrix[0][0];
+	yStep = pSrc->transform->matrix[1][1];
+
+	xStart = pSrc->transform->matrix[0][2] + xStep * (xSrc + xOff);
+	y      = pSrc->transform->matrix[1][2] + yStep * (ySrc + yOff);
+
+	while (height--)
 	{
-	    __m64 s = load8888 (*src);
-	    __m64 d = load8888 (*dst);
-	    
-	    *dst = store8888 (over (s, expand_alpha (s), d));
-	    
-	    w--;
-	    dst++;
-	    src++;
+	    disty  = (y >> 8) & 0xff;
+	    idisty = 256 - disty;
+	    line   = y >> 16;
+
+	    dst = dstLine;
+	    dstLine += dstStride;
+
+	    x = xStart;
+	    w = set_scale_steps (srcLine, srcStride,
+				 xStart, xStep, width, line, lastLine,
+				 pSrc->repeatType,
+				 &src0, &src1, &x0, &x0Step, &x1, &x1Step);
+
+	    if (pSrc->filter == PictFilterBilinear)
+	    {
+		if (pSrc->repeatType == RepeatPad)
+		{
+		    if (w && x < 0)
+		    {
+			s = load8888 (fetch_bilinear_8888 (0, 255,
+							   disty, idisty,
+							   src0, src1,
+							   0));
+			sa = expand_alpha (s);
+
+			while (w && x < 0)
+			{
+			    d = load8888 (*dst);
+
+			    *dst = store8888 (over (s, sa, d));
+
+			    x   += xStep;
+			    x0  += x0Step;
+			    x1  += x1Step;
+			    dst += 1;
+			    w   -= 1;
+			}
+		    }
+		}
+		else
+		{
+		    while (w && x < -xFixed1)
+		    {
+			x   += xStep;
+			x0  += x0Step;
+			x1  += x1Step;
+			dst += 1;
+			w   -= 1;
+		    }
+
+		    while (w && x < 0)
+		    {
+			distx  = (x >> 8) & 0xff;
+			idistx = 256 - distx;
+
+			d = load8888 (*dst);
+			s = load8888 (
+			    interpolate_bilinear_8888 (distx, idistx,
+						       disty, idisty,
+						       _zero8x8, src0,
+						       _zero8x8, src1,
+						       0, 0, 0, 0));
+
+			*dst = store8888 (over (s, expand_alpha (s), d));
+
+			x   += xStep;
+			x0  += x0Step;
+			x1  += x1Step;
+			dst += 1;
+			w   -= 1;
+		    }
+		}
+
+		while (w && x < srcEndIndex)
+		{
+		    distx  = (x >> 8) & 0xff;
+		    idistx = 256 - distx;
+
+		    d = load8888 (*dst);
+		    s = load8888 (fetch_bilinear2_8888 (distx, idistx,
+							disty, idisty,
+							src0, src1,
+							(x0 >> 14) & ~3,
+							(x1 >> 14) & ~3));
+
+		    *dst = store8888 (over (s, expand_alpha (s), d));
+
+		    x   += xStep;
+		    x0  += x0Step;
+		    x1  += x1Step;
+		    dst += 1;
+		    w   -= 1;
+		}
+
+		if (pSrc->repeatType == RepeatPad)
+		{
+		    if (w)
+		    {
+			s = load8888 (fetch_bilinear_8888 (0, 255,
+							   disty, idisty,
+							   src0, src1,
+							   srcEndIndex >> 16));
+			sa = expand_alpha (s);
+
+			while (w)
+			{
+			    d = load8888 (*dst);
+
+			    *dst = store8888 (over (s, sa, d));
+
+			    dst += 1;
+			    w   -= 1;
+			}
+		    }
+		}
+		else
+		{
+		    while (w && x < srcEnd + xFixed1)
+		    {
+			distx  = (x >> 8) & 0xff;
+			idistx = 256 - distx;
+
+			d = load8888 (*dst);
+			s = load8888 (
+			    interpolate_bilinear_8888 (distx, idistx,
+						       disty, idisty,
+						       src0, _zero8x8,
+						       src1, _zero8x8,
+						       (x0 >> 14) & ~3, 0,
+						       (x1 >> 14) & ~3, 0));
+
+			*dst = store8888 (over (s, expand_alpha (s), d));
+
+			x   += xStep;
+			x0  += x0Step;
+			x1  += x1Step;
+			dst += 1;
+			w   -= 1;
+		    }
+		}
+	    }
+	    else
+	    {
+		if (pSrc->repeatType == RepeatPad)
+		{
+		    s  = load8888 (*((CARD32 *) src0));
+		    sa = expand_alpha (s);
+
+		    while (w && x < 0)
+		    {
+			d = load8888 (*dst);
+
+			*dst = store8888 (over (s, sa, d));
+
+			x   += xStep;
+			dst += 1;
+			w   -= 1;
+		    }
+		}
+		else
+		{
+		    while (w && x < 0)
+		    {
+			x   += xStep;
+			dst += 1;
+			w   -= 1;
+		    }
+		}
+
+		while (w && x < srcEnd)
+		{
+		    d = load8888 (*dst);
+		    s = load8888 (((CARD32 *) src0)[x >> 16]);
+
+		    *dst = store8888 (over (s, expand_alpha (s), d));
+
+		    x   += xStep;
+		    dst += 1;
+		    w   -= 1;
+		}
+
+		if (w && pSrc->repeatType == RepeatPad)
+		{
+		    s  = load8888 (((CARD32 *) src0)[srcEndIndex >> 16]);
+		    sa = expand_alpha (s);
+
+		    while (w)
+		    {
+			d = load8888 (*dst);
+
+			*dst = store8888 (over (s, sa, d));
+
+			dst += 1;
+			w   -= 1;
+		    }
+		}
+	    }
+
+	    y += yStep;
 	}
+    }
+    else
+    {
+	fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
 
-	while (w >= 2)
+	while (height--)
 	{
-	    __m64 vd = *(__m64 *)(dst + 0);
-	    __m64 vs = *(__m64 *)(src + 0);
-	    __m64 vs0 = expand8888 (vs, 0);
-	    __m64 vs1 = expand8888 (vs, 1);
-
-	    *(__m64 *)dst = (__m64)pack8888 (
-		over (vs0, expand_alpha (vs0), expand8888 (vd, 0)),
-		over (vs1, expand_alpha (vs1), expand8888 (vd, 1)));
+	    dst = dstLine;
+	    dstLine += dstStride;
+	    src = srcLine;
+	    srcLine += srcStride;
+	    w = width;
+
+	    while (w && (unsigned long)dst & 7)
+	    {
+		__m64 s = load8888 (*src);
+		__m64 d = load8888 (*dst);
 	    
-	    w -= 2;
-	    dst += 2;
-	    src += 2;
-	}
+		*dst = store8888 (over (s, expand_alpha (s), d));
+	    
+		w--;
+		dst++;
+		src++;
+	    }
+
+	    while (w >= 2)
+	    {
+		__m64 vd = *(__m64 *)(dst + 0);
+		__m64 vs = *(__m64 *)(src + 0);
+		__m64 vs0 = expand8888 (vs, 0);
+		__m64 vs1 = expand8888 (vs, 1);
+
+		*(__m64 *)dst = (__m64)pack8888 (
+		    over (vs0, expand_alpha (vs0), expand8888 (vd, 0)),
+		    over (vs1, expand_alpha (vs1), expand8888 (vd, 1)));
+	    
+		w -= 2;
+		dst += 2;
+		src += 2;
+	    }
 	
-	while (w)
-	{
-	    __m64 s = load8888 (*src);
-	    __m64 d = load8888 (*dst);
+	    while (w)
+	    {
+		__m64 s = load8888 (*src);
+		__m64 d = load8888 (*dst);
 	    
-	    *dst = store8888 (over (s, expand_alpha (s), d));
+		*dst = store8888 (over (s, expand_alpha (s), d));
 	    
-	    w--;
-	    dst++;
-	    src++;
+		w--;
+		dst++;
+		src++;
+	    }
 	}
     }
 
@@ -2307,6 +2937,95 @@ fbCompositeCopyAreammx (CARD8		op,
 		   width, height);
 }
 
+typedef struct _ScanlineBuf {
+    Bool   lock[2];
+    int    y[2];
+    CARD8 *line[2];
+    int   height;
+    CARD8 *heap;
+} ScanlineBuf;
+
+static Bool
+init_scanline_buffer (ScanlineBuf *slb,
+		      CARD8	  *buffer,
+		      int	  size,
+		      int	  length,
+		      int	  height)
+{
+    int i, s;
+
+    s = length << 1;
+
+    if (size < s)
+    {
+	slb->heap = xalloc (s);
+	if (!slb->heap)
+	    return FALSE;
+
+	buffer = slb->heap;
+    }
+    else
+    {
+	slb->heap = NULL;
+    }
+
+    for (i = 0; i < 2; i++)
+    {
+	slb->lock[i] = FALSE;
+	slb->y[i]    = SHRT_MAX;
+	slb->line[i] = buffer;
+
+	buffer += length;
+    }
+
+    slb->height = height;
+
+    return TRUE;
+}
+
+static void
+fini_scanline_buffer (ScanlineBuf *slb)
+{
+    if (slb->heap)
+	xfree (slb->heap);
+}
+
+static __inline__ void
+release_scanlines (ScanlineBuf *slb)
+{
+    int i;
+
+    for (i = 0; i < 2; i++)
+	slb->lock[i] = FALSE;
+}
+
+static __inline__ int
+_y_to_scanline (ScanlineBuf *slb,
+		int	    y)
+{
+    return (y < 0) ? 0 : (y >= slb->height) ? slb->height - 1 : y;
+}
+
+static __inline__ CARD8 *
+get_scanline (ScanlineBuf *slb,
+	      int	  y)
+{
+    int i;
+
+    y = _y_to_scanline (slb, y);
+
+    for (i = 0; i < 2; i++)
+    {
+	if (slb->y[i] == y)
+	{
+	    slb->lock[i] = TRUE;
+	    return slb->line[i];
+	}
+    }
+
+    return NULL;
+}
+
 typedef struct {
     ullong subYw;
     ullong U_green;
@@ -2440,95 +3159,6 @@ loadyuv (CARD8 *py,
 	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
 }
 
-typedef struct _ScanlineBuf {
-    Bool   lock[2];
-    int    y[2];
-    CARD8 *line[2];
-    int   height;
-    CARD8 *heap;
-} ScanlineBuf;
-
-static Bool
-init_scanline_buffer (ScanlineBuf *slb,
-		      CARD8	  *buffer,
-		      int	  size,
-		      int	  length,
-		      int	  height)
-{
-    int i, s;
-
-    s = length << 1;
-
-    if (size < s)
-    {
-	slb->heap = xalloc (s);
-	if (!slb->heap)
-	    return FALSE;
-
-	buffer = slb->heap;
-    }
-    else
-    {
-	slb->heap = NULL;
-    }
-
-    for (i = 0; i < 2; i++)
-    {
-	slb->lock[i] = FALSE;
-	slb->y[i]    = SHRT_MAX;
-	slb->line[i] = buffer;
-
-	buffer += length;
-    }
-
-    slb->height = height;
-
-    return TRUE;
-}
-
-static void
-fini_scanline_buffer (ScanlineBuf *slb)
-{
-    if (slb->heap)
-	xfree (slb->heap);
-}
-
-static __inline__ void
-release_scanlines (ScanlineBuf *slb)
-{
-    int i;
-
-    for (i = 0; i < 2; i++)
-	slb->lock[i] = FALSE;
-}
-
-static __inline__ int
-_y_to_scanline (ScanlineBuf *slb,
-		int	    y)
-{
-    return (y < 0) ? 0 : (y >= slb->height) ? slb->height - 1 : y;
-}
-
-static __inline__ CARD8 *
-get_scanline (ScanlineBuf *slb,
-	      int	  y)
-{
-    int i;
-
-    y = _y_to_scanline (slb, y);
-
-    for (i = 0; i < 2; i++)
-    {
-	if (slb->y[i] == y)
-	{
-	    slb->lock[i] = TRUE;
-	    return slb->line[i];
-	}
-    }
-
-    return NULL;
-}
-
 static __inline__ CARD8 *
 loadyv12_scanline (ScanlineBuf *slb,
 		   int	       y,
@@ -2651,47 +3281,6 @@ loadyuy2_scanline (ScanlineBuf *slb,
     return slb->line[i];
 }
 
-static __inline__ CARD8
-interpolate_bilinear (int   distx,
-		      int   idistx,
-		      int   disty,
-		      int   idisty,
-		      CARD8 tl,
-		      CARD8 tr,
-		      CARD8 bl,
-		      CARD8 br)
-{
-    return ((tl * idistx + tr * distx) * idisty +
-	    (bl * idistx + br * distx) * disty) >> 16;
-}
-
-static __inline__ void
-interpolate_bilinear_8888 (int   distx,
-			   int   idistx,
-			   int   disty,
-			   int   idisty,
-			   CARD8 *l0,
-			   CARD8 *l1,
-			   int   x,
-			   CARD8 buffer[4])
-{
-    buffer[0] = interpolate_bilinear (distx, idistx, disty, idisty,
-				      l0[x], l0[x + 4],
-				      l1[x], l1[x + 4]);
-
-    buffer[1] = interpolate_bilinear (distx, idistx, disty, idisty,
-				      l0[x + 1], l0[x + 5],
-				      l1[x + 1], l1[x + 5]);
-
-    buffer[2] = interpolate_bilinear (distx, idistx, disty, idisty,
-				      l0[x + 2], l0[x + 6],
-				      l1[x + 2], l1[x + 6]);
-
-    buffer[3] = interpolate_bilinear (distx, idistx, disty, idisty,
-				      l0[x + 3], l0[x + 7],
-				      l1[x + 3], l1[x + 7]);
-}
-
 /* TODO: MMX code for bilinear interpolation */
 void
 fbCompositeSrc_yv12x8888mmx (CARD8      op,
@@ -2776,13 +3365,14 @@ fbCompositeSrc_yv12x8888mmx (CARD8      
 	int	    x, x0, y, line, xStep, yStep;
 	int         distx, idistx, disty, idisty;
 	int	    srcEnd = pSrc->pDrawable->width << 16;
-
-	x0 = pSrc->transform->matrix[0][2] + ((xSrc + srcXoff) << 16);
-	y  = pSrc->transform->matrix[1][2] + ((ySrc + srcYoff) << 16);
+	int	    srcEndIndex = (pSrc->pDrawable->width - 1) << 16;
 
 	xStep = pSrc->transform->matrix[0][0];
 	yStep = pSrc->transform->matrix[1][1];
 
+	x0 = pSrc->transform->matrix[0][2] + xStep * (xSrc + srcXoff);
+	y  = pSrc->transform->matrix[1][2] + yStep * (ySrc + srcYoff);
+
 	init_scanline_buffer (&slb,
 			      _scanline_buf, sizeof (_scanline_buf),
 			      pSrc->pDrawable->width << 2,
@@ -2816,21 +3406,23 @@ fbCompositeSrc_yv12x8888mmx (CARD8      
 	    {
 		while (w && x < 0)
 		{
-		    interpolate_bilinear_8888 (0, 256, disty, idisty,
-					       ps0, ps1, 0, pd);
+		    *(CARD32 *) pd = fetch_bilinear_8888 (0, 256, disty, idisty,
+							  ps0, ps1, 0);
 
 		    x  += xStep;
 		    pd += 4;
 		    w  -= 1;
 		}
 
-		while (w && x < srcEnd)
+		while (w && x < srcEndIndex)
 		{
 		    distx  = (x >> 8) & 0xff;
 		    idistx = 256 - distx;
 
-		    interpolate_bilinear_8888 (distx, idistx, disty, idisty,
-					       ps0, ps1, (x >> 14) & ~3, pd);
+		    *(CARD32 *) pd = fetch_bilinear_8888 (distx, idistx,
+							  disty, idisty,
+							  ps0, ps1,
+							  (x >> 14) & ~3);
 
 		    x  += xStep;
 		    pd += 4;
@@ -2839,8 +3431,10 @@ fbCompositeSrc_yv12x8888mmx (CARD8      
 
 		while (w)
 		{
-		    interpolate_bilinear_8888 (256, 0, disty, idisty,
-					       ps0, ps1, (x >> 14) & ~3, pd);
+		    *(CARD32 *) pd = fetch_bilinear_8888 (256, 0,
+							  disty, idisty,
+							  ps0, ps1,
+							  (x >> 14) & ~3);
 
 		    pd += 4;
 		    w  -= 1;
@@ -3025,13 +3619,14 @@ fbCompositeSrc_yuy2x8888mmx (CARD8      
 	int	    x, x0, y, line, xStep, yStep;
 	int         distx, idistx, disty, idisty;
 	int	    srcEnd = pSrc->pDrawable->width << 16;
-
-	x0 = pSrc->transform->matrix[0][2] + ((xSrc + srcXoff) << 16);
-	y  = pSrc->transform->matrix[1][2] + ((ySrc + srcYoff) << 16);
+	int	    srcEndIndex = (pSrc->pDrawable->width - 1) << 16;
 
 	xStep = pSrc->transform->matrix[0][0];
 	yStep = pSrc->transform->matrix[1][1];
 
+	x0 = pSrc->transform->matrix[0][2] + xStep * (xSrc + srcXoff);
+	y  = pSrc->transform->matrix[1][2] + yStep * (ySrc + srcYoff);
+
 	init_scanline_buffer (&slb,
 			      _scanline_buf, sizeof (_scanline_buf),
 			      pSrc->pDrawable->width << 2,
@@ -3065,21 +3660,23 @@ fbCompositeSrc_yuy2x8888mmx (CARD8      
 	    {
 		while (w && x < 0)
 		{
-		    interpolate_bilinear_8888 (0, 256, disty, idisty,
-					       ps0, ps1, 0, pd);
+		    *(CARD32 *) pd = fetch_bilinear_8888 (0, 256, disty, idisty,
+							  ps0, ps1, 0);
 
 		    x  += xStep;
 		    pd += 4;
 		    w  -= 1;
 		}
 
-		while (w && x < srcEnd)
+		while (w && x < srcEndIndex)
 		{
 		    distx  = (x >> 8) & 0xff;
 		    idistx = 256 - distx;
 
-		    interpolate_bilinear_8888 (distx, idistx, disty, idisty,
-					       ps0, ps1, (x >> 14) & ~3, pd);
+		    *(CARD32 *) pd = fetch_bilinear_8888 (distx, idistx,
+							  disty, idisty,
+							  ps0, ps1,
+							  (x >> 14) & ~3);
 
 		    x  += xStep;
 		    pd += 4;
@@ -3088,8 +3685,9 @@ fbCompositeSrc_yuy2x8888mmx (CARD8      
 
 		while (w)
 		{
-		    interpolate_bilinear_8888 (256, 0, disty, idisty,
-					       ps0, ps1, (x >> 14) & ~3, pd);
+		    *(CARD32 *) pd = fetch_bilinear_8888 (256, 0, disty, idisty,
+							  ps0, ps1,
+							  (x >> 14) & ~3);
 
 		    pd += 4;
 		    w  -= 1;
diff --git a/fb/fbpict.c b/fb/fbpict.c
index 262a301..638ffb9 100644
--- a/fb/fbpict.c
+++ b/fb/fbpict.c
@@ -902,12 +902,15 @@ fbComposite (CARD8      op,
 #endif
     }
     else if (pSrc->pDrawable && (!pMask || pMask->pDrawable)
-        && !pSrc->transform && !(pMask && pMask->transform)
+        && !(pMask && pMask->transform)
         && !maskAlphaMap && !srcAlphaMap && !dstAlphaMap
 	&& (!pSrc->repeatType || srcRepeat)
 	&& (!pMask || (!pMask->repeatType || maskRepeat))
         && (pSrc->filter != PictFilterConvolution)
         && (!pMask || pMask->filter != PictFilterConvolution))
+    {
+	if (!pSrc->transform)
+	{
     switch (op) {
     case PictOpSrc:
 #ifdef USE_MMX
@@ -1258,6 +1261,51 @@ fbComposite (CARD8      op,
 	}
 	break;
     }
+	}
+	else if (op == PictOpOver && (pSrc->repeatType == RepeatPad ||
+				      pSrc->repeatType == RepeatNone))
+	{
+	    /* non rotating transformation */
+	    if (pSrc->transform->matrix[0][1] == 0 &&
+		pSrc->transform->matrix[1][0] == 0 &&
+		pSrc->transform->matrix[2][0] == 0 &&
+		pSrc->transform->matrix[2][1] == 0 &&
+		pSrc->transform->matrix[2][2] == 1 << 16)
+	    {
+		if (pMask)
+		{
+		    if (maskRepeat &&
+			pMask->pDrawable->width == 1 &&
+			pMask->pDrawable->height == 1)
+		    {
+			if (pSrc->format == PICT_x8r8g8b8 &&
+			    pDst->format == PICT_x8r8g8b8 &&
+			    pMask->format == PICT_a8)
+			{
+#ifdef USE_MMX
+			    if (fbHaveMMX())
+				func = fbCompositeSrc_8888x8x8888mmx;
+#endif
+			}
+		    }
+		}
+		else
+		{
+		    switch (pSrc->format) {
+		    case PICT_a8r8g8b8:
+			switch (pDst->format) {
+			case PICT_a8r8g8b8:
+			case PICT_x8r8g8b8:
+#ifdef USE_MMX
+			    if (fbHaveMMX())
+				func = fbCompositeSrc_8888x8888mmx;
+#endif
+			}
+		    }
+		}
+	    }
+	}
+    }
 
     if (!func) {
          /* no fast path, use the general code */
diff-tree 75846b85b0c46d0de52b03763ab2f44ab85dc963 (from 6296f798861f4c0af4a1397ca2191e4311c1d217)
Author: David Reveman <davidr at novell.com>
Date:   Mon Jun 12 10:12:14 2006 +0200

    Compute stride for formats other than YV12 correctly in xglXvPutImage. Add
    some decent YUY2 conversion code and enable YUY2 image format again.

diff --git a/fb/fbmmx.c b/fb/fbmmx.c
index b55791f..c2c5598 100644
--- a/fb/fbmmx.c
+++ b/fb/fbmmx.c
@@ -2416,9 +2416,9 @@ mmx_pack8888 (CARD8 *image)
 }
 
 static __inline__ CARD32
-loadyv12 (CARD8 *py,
-	  CARD8 *pu,
-	  CARD8 *pv)
+loadyuv (CARD8 *py,
+	 CARD8 *pu,
+	 CARD8 *pv)
 {
     INT16 y, u, v;
     INT32 r, g, b;
@@ -2560,7 +2560,7 @@ loadyv12_scanline (ScanlineBuf *slb,
 
     while (w && (unsigned long) py & 7)
     {
-	*((CARD32 *) pd) = loadyv12 (py, pu, pv);
+	*((CARD32 *) pd) = loadyuv (py, pu, pv);
 
 	pd += 4;
 	py += 1;
@@ -2589,7 +2589,7 @@ loadyv12_scanline (ScanlineBuf *slb,
 
     while (w)
     {
-	*((CARD32 *) pd) = loadyv12 (py, pu, pv);
+	*((CARD32 *) pd) = loadyuv (py, pu, pv);
 
 	pd += 4;
 	py += 1;
@@ -2606,6 +2606,51 @@ loadyv12_scanline (ScanlineBuf *slb,
     return slb->line[i];
 }
 
+static __inline__ CARD8 *
+loadyuy2_scanline (ScanlineBuf *slb,
+		   int	       y,
+		   CARD8       *src,
+		   int	       stride,
+		   int	       x,
+		   int	       width)
+{
+    CARD8 *py, *pu, *pv, *pd;
+    int   i, w;
+
+    y = _y_to_scanline (slb, y);
+
+    for (i = 0; slb->lock[i]; i++);
+
+    slb->y[i]    = y;
+    slb->lock[i] = TRUE;
+
+    py = src + stride * (y >> 0);
+    pu = py + 1;
+    pv = py + 3;
+
+    pd = slb->line[i];
+
+    w = width;
+
+    while (w)
+    {
+	*((CARD32 *) pd) = loadyuv (py, pu, pv);
+
+	pd += 4;
+	py += 2;
+
+	if (w & 1)
+	{
+	    pu += 4;
+	    pv += 4;
+	}
+
+	w--;
+    }
+
+    return slb->line[i];
+}
+
 static __inline__ CARD8
 interpolate_bilinear (int   distx,
 		      int   idistx,
@@ -2857,7 +2902,7 @@ fbCompositeSrc_yv12x8888mmx (CARD8      
 
 	    while (w && (unsigned long) py & 7)
 	    {
-		*((CARD32 *) pd) = loadyv12 (py, pu, pv);
+		*((CARD32 *) pd) = loadyuv (py, pu, pv);
 
 		pd += 4;
 		py += 1;
@@ -2886,7 +2931,7 @@ fbCompositeSrc_yv12x8888mmx (CARD8      
 
 	    while (w)
 	    {
-		*((CARD32 *) pd) = loadyv12 (py, pu, pv);
+		*((CARD32 *) pd) = loadyuv (py, pu, pv);
 
 		pd += 4;
 		py += 1;
@@ -2916,5 +2961,215 @@ fbCompositeSrc_yv12x8888mmx (CARD8      
     _mm_empty ();
 }
 
+/* TODO: MMX code for yuy2 */
+void
+fbCompositeSrc_yuy2x8888mmx (CARD8      op,
+			     PicturePtr pSrc,
+			     PicturePtr pMask,
+			     PicturePtr pDst,
+			     INT16      xSrc,
+			     INT16      ySrc,
+			     INT16      xMask,
+			     INT16      yMask,
+			     INT16      xDst,
+			     INT16      yDst,
+			     CARD16     width,
+			     CARD16     height)
+{
+    PictTransform *transform = pSrc->transform;
+    CARD8	  *dst, *src;
+    FbBits	  *srcBits;
+    FbStride	  srcStride;
+    int		  srcXoff;
+    int		  srcYoff;
+    FbBits	  *dstBits;
+    FbStride	  dstStride;
+    int		  dstXoff;
+    int		  dstYoff;
+    int		  bpp, offset, w;
+    CARD8	  *pd;
+
+    fbGetDrawable (pSrc->pDrawable, srcBits, srcStride, bpp, srcXoff, srcYoff);
+    fbGetDrawable (pDst->pDrawable, dstBits, dstStride, bpp, dstXoff, dstYoff);
+
+    dst = (CARD8 *) dstBits;
+    dstStride *= sizeof (FbBits);
+
+    src = (CARD8 *) srcBits;
+    srcStride *= sizeof (FbBits);
+
+    if (transform)
+    {
+	/* transformation is a Y coordinate flip, this is achieved by
+	   moving the start offset to the last row and changing sign of stride */
+	if (pSrc->transform->matrix[0][0] == (1 << 16)  &&
+	    pSrc->transform->matrix[1][1] == -(1 << 16) &&
+	    pSrc->transform->matrix[0][2] == 0          &&
+	    pSrc->transform->matrix[1][2] == (pSrc->pDrawable->height << 16))
+	{
+	    src = src + (pSrc->pDrawable->height - 1) * srcStride;
+
+	    srcStride = -srcStride;
+
+	    transform = 0;
+	}
+    }
+
+    dst += dstStride * (yDst + dstYoff) + ((xDst + dstXoff) << 2);
+
+    if (transform)
+    {
+	ScanlineBuf slb;
+	CARD8	    _scanline_buf[8192];
+	CARD8	    *ps, *ps0, *ps1;
+	int	    x, x0, y, line, xStep, yStep;
+	int         distx, idistx, disty, idisty;
+	int	    srcEnd = pSrc->pDrawable->width << 16;
+
+	x0 = pSrc->transform->matrix[0][2] + ((xSrc + srcXoff) << 16);
+	y  = pSrc->transform->matrix[1][2] + ((ySrc + srcYoff) << 16);
+
+	xStep = pSrc->transform->matrix[0][0];
+	yStep = pSrc->transform->matrix[1][1];
+
+	init_scanline_buffer (&slb,
+			      _scanline_buf, sizeof (_scanline_buf),
+			      pSrc->pDrawable->width << 2,
+			      pSrc->pDrawable->height);
+
+	while (height--)
+	{
+	    disty  = (y >> 8) & 0xff;
+	    idisty = 256 - disty;
+	    line   = y >> 16;
+
+	    ps0 = get_scanline (&slb, line);
+	    ps1 = get_scanline (&slb, line + 1);
+
+	    if (!ps0)
+		ps0 = loadyuy2_scanline (&slb, line,
+					 src, srcStride,
+					 0, pSrc->pDrawable->width);
+
+	    if (!ps1)
+		ps1 = loadyuy2_scanline (&slb, line + 1,
+					 src, srcStride,
+					 0, pSrc->pDrawable->width);
+
+	    pd = dst;
+
+	    x = x0;
+	    w = width;
+
+	    if (pSrc->filter == PictFilterBilinear)
+	    {
+		while (w && x < 0)
+		{
+		    interpolate_bilinear_8888 (0, 256, disty, idisty,
+					       ps0, ps1, 0, pd);
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w && x < srcEnd)
+		{
+		    distx  = (x >> 8) & 0xff;
+		    idistx = 256 - distx;
+
+		    interpolate_bilinear_8888 (distx, idistx, disty, idisty,
+					       ps0, ps1, (x >> 14) & ~3, pd);
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w)
+		{
+		    interpolate_bilinear_8888 (256, 0, disty, idisty,
+					       ps0, ps1, (x >> 14) & ~3, pd);
+
+		    pd += 4;
+		    w  -= 1;
+		}
+	    }
+	    else
+	    {
+		while (w && x < 0)
+		{
+		    *(CARD32 *) pd = *(CARD32 *) ps0;
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w && x < srcEnd)
+		{
+		    *(CARD32 *) pd = ((CARD32 *) ps0)[x >> 16];
+
+		    x  += xStep;
+		    pd += 4;
+		    w  -= 1;
+		}
+
+		while (w)
+		{
+		    *(CARD32 *) pd = ((CARD32 *) ps0)[x >> 16];
+
+		    pd += 4;
+		    w  -= 1;
+		}
+	    }
+
+	    y   += yStep;
+	    dst += dstStride;
+
+	    release_scanlines (&slb);
+	}
+
+	fini_scanline_buffer (&slb);
+    }
+    else
+    {
+	CARD8 *py, *pu, *pv;
+
+	src += srcStride * (ySrc >> 0) + srcYoff + (xSrc + srcXoff);
+
+	while (height)
+	{
+	    py = src;
+	    pu = src + 1;
+	    pv = src + 3;
+	    pd = dst;
+
+	    w = width;
+
+	    while (w)
+	    {
+		*((CARD32 *) pd) = loadyuv (py, pu, pv);
+
+		pd += 4;
+		py += 2;
+
+		if (w & 1)
+		{
+		    pu += 4;
+		    pv += 4;
+		}
+
+		w--;
+	    }
+
+	    dst += dstStride;
+	    src += srcStride;
+
+	    height--;
+	}
+    }
+}
+
 #endif /* RENDER */
 #endif /* USE_MMX */
diff --git a/fb/fbmmx.h b/fb/fbmmx.h
index 1a6fd9b..1680efc 100644
--- a/fb/fbmmx.h
+++ b/fb/fbmmx.h
@@ -231,4 +231,18 @@ fbCompositeSrc_yv12x8888mmx (CARD8      
 			     CARD16     width,
 			     CARD16     height);
 
+void
+fbCompositeSrc_yuy2x8888mmx (CARD8      op,
+			     PicturePtr pSrc,
+			     PicturePtr pMask,
+			     PicturePtr pDst,
+			     INT16      xSrc,
+			     INT16      ySrc,
+			     INT16      xMask,
+			     INT16      yMask,
+			     INT16      xDst,
+			     INT16      yDst,
+			     CARD16     width,
+			     CARD16     height);
+
 #endif /* USE_MMX */
diff --git a/fb/fbpict.c b/fb/fbpict.c
index 0106bba..262a301 100644
--- a/fb/fbpict.c
+++ b/fb/fbpict.c
@@ -874,8 +874,8 @@ fbComposite (CARD8      op,
 	maskAlphaMap = pMask->alphaMap != 0;
     }
 
-    /* YV12 is only used internally for XVideo */
-    if (pSrc->format == PICT_yv12)
+    /* YUV is only used internally for XVideo */
+    if (pSrc->format == PICT_yv12 || pSrc->format == PICT_yuy2)
     {
 #ifdef USE_MMX
 	/* non rotating transformation */
@@ -890,7 +890,12 @@ fbComposite (CARD8      op,
 	    case PICT_a8r8g8b8:
 	    case PICT_x8r8g8b8:
 		if (fbHaveMMX())
-		    func = fbCompositeSrc_yv12x8888mmx;
+		{
+		    if (pSrc->format == PICT_yv12)
+			func = fbCompositeSrc_yv12x8888mmx;
+		    else
+			func = fbCompositeSrc_yuy2x8888mmx;
+		}
 		break;
 	    }
 	}
diff --git a/hw/xgl/xglxv.c b/hw/xgl/xglxv.c
index 4475a8a..5633b6d 100644
--- a/hw/xgl/xglxv.c
+++ b/hw/xgl/xglxv.c
@@ -298,7 +298,7 @@ xglXvPutImage (ClientPtr     client,
     ScreenPtr	  pScreen = pDrawable->pScreen;
     PicturePtr	  pSrc;
     PictTransform transform;
-    int		  depth, bpp, noVisual = FALSE;
+    int		  depth, bpp, stride, noVisual = FALSE;
     CARD32	  format;
 
     XGL_SCREEN_PRIV (pScreen);
@@ -306,11 +306,14 @@ xglXvPutImage (ClientPtr     client,
     XGL_DRAWABLE_PIXMAP (pDrawable);
     XGL_PIXMAP_PRIV (pPixmap);
 
+    stride = ((srcWidth + 7) & ~7);
+
     switch (pImage->id) {
     case GLITZ_FOURCC_YUY2:
 	bpp = depth = 16;
 	format = PICT_yuy2;
 	noVisual = !pScreenPriv->pXvVisual[XGL_XV_FORMAT_YUY2].format.surface;
+	stride *= 2;
 	break;
     case GLITZ_FOURCC_YV12:
 	depth = bpp = 12;
@@ -321,6 +324,7 @@ xglXvPutImage (ClientPtr     client,
 	depth = 24;
 	bpp = 32;
 	format = PICT_x8r8g8b8;
+	stride *= 4;
 	break;
     default:
 	return BadImplementation;
@@ -339,7 +343,7 @@ xglXvPutImage (ClientPtr     client,
 				    srcWidth, srcHeight,
 				    depth, bpp, -1, (pointer) data);
 
-    XGL_GET_PIXMAP_PRIV (pPortPriv->pPixmap)->stride = -((srcWidth + 7) & ~7);
+    XGL_GET_PIXMAP_PRIV (pPortPriv->pPixmap)->stride = -stride;
 
     pPortPriv->pPixmap->drawable.serialNumber = NEXT_SERIAL_NUMBER;
 
@@ -594,11 +598,6 @@ xglXvInitAdaptors (ScreenPtr pScreen)
     pAdaptor->nImages = sizeof (xvImages) / sizeof (XvImageRec);
     pAdaptor->pImages = xvImages;
 
-    /* XXX: Disable YUY2 format as it's not accelerated and the software
-       fallback got issues. */
-    pAdaptor->nImages = sizeof (xvImages) / sizeof (XvImageRec) - 1;
-    pAdaptor->pImages = &xvImages[1];
-
     /* TODO: Currently no attributes */
     pAdaptor->nAttributes = 0;
     pAdaptor->pAttributes = 0;



More information about the xorg-commit mailing list