xf86-video-ati: Branch 'master' - 4 commits

Roland Scheidegger sroland at kemper.freedesktop.org
Tue Mar 24 12:01:58 PDT 2009


 src/radeon_reg.h                 |   27 +
 src/radeon_textured_video.c      |  114 +++++--
 src/radeon_textured_videofuncs.c |  634 +++++++++++++++++++++++++++++++++++----
 src/radeon_video.c               |   21 -
 src/radeon_video.h               |    5 
 5 files changed, 708 insertions(+), 93 deletions(-)

New commits:
commit d2c3964fe04be42fe538f36439ed5ffca96e436a
Author: Roland Scheidegger <sroland at tungstengraphics.com>
Date:   Wed Mar 18 01:55:12 2009 +0100

    fix textured video allocation bug
    
    size needs to be calculated after the dstPitch adjustments. This was already
    fixed for pre-R600 chips by the planar textured yuv patches; clean this up.

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 63d5674..79671c0 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -376,11 +376,9 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 	    dstPitch = (dstPitch + 63) & ~63;
 	    dstPitch2 = ((dst_width >> 1) + 15) & ~15;
 	    dstPitch2 = (dstPitch2 + 63) & ~63;
-	    size = dstPitch * dst_height + 2 * dstPitch2 * ((dst_height + 1) >> 1);
 	} else {
 	    dstPitch = ((dst_width << 1) + 15) & ~15;
 	    dstPitch = (dstPitch + 63) & ~63;
-	    size = dstPitch * dst_height;
 	}
 	break;
     case FOURCC_UYVY:
@@ -390,13 +388,13 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 	dstPitch = (dstPitch + 63) & ~63;
 	srcPitch = (width << 1);
 	srcPitch2 = 0;
-	size = dstPitch * dst_height;
 	break;
     }
 
     if (info->ChipFamily >= CHIP_FAMILY_R600)
 	dstPitch = (dstPitch + 255) & ~255;
-    /* FIXME: size calc (adjust dstPitch earlier) */
+
+    size = dstPitch * dst_height + 2 * dstPitch2 * ((dst_height + 1) >> 1);
 
     if (pPriv->video_memory != NULL && size != pPriv->size) {
 	radeon_legacy_free_memory(pScrn, pPriv->video_memory);
commit 18e56eb179fde28477487c63e6f9ebf7579e2cd5
Author: Roland Scheidegger <sroland at tungstengraphics.com>
Date:   Thu Mar 5 02:07:46 2009 +0100

    don't convert planar yuv to packed for r200
    
    Uses 3 textures for planar yuv and does yuv->rgb conversion in the shader.
    Similar to the r300 code, but might have precision issues: the hardware ALU
    should have enough precision, but hardware constants are only 8 bits and
    we'd want at least 11.
    This also enables textured video on rv250 (and also supports packed yuv
    on that chip by using basically the same shader with packed data).

diff --git a/src/radeon_reg.h b/src/radeon_reg.h
index 247a0e7..98b6d0b 100644
--- a/src/radeon_reg.h
+++ b/src/radeon_reg.h
@@ -3027,6 +3027,18 @@
 #       define R200_TXA_REPL_ARG_B_MASK		(3 << 28)
 #       define R200_TXA_REPL_ARG_C_SHIFT	30
 #       define R200_TXA_REPL_ARG_C_MASK		(3 << 30)
+#define R200_PP_TXCBLEND_1			0x2f10
+#define R200_PP_TXCBLEND2_1			0x2f14
+#define R200_PP_TXABLEND_1			0x2f18
+#define R200_PP_TXABLEND2_1			0x2f1c
+#define R200_PP_TXCBLEND_2			0x2f20
+#define R200_PP_TXCBLEND2_2			0x2f24
+#define R200_PP_TXABLEND_2			0x2f28
+#define R200_PP_TXABLEND2_2			0x2f2c
+#define R200_PP_TXCBLEND_3			0x2f30
+#define R200_PP_TXCBLEND2_3			0x2f34
+#define R200_PP_TXABLEND_3			0x2f38
+#define R200_PP_TXABLEND2_3			0x2f3c
 
 #define R200_SE_VTX_FMT_0			0x2088
 #       define R200_VTX_XY			0 /* always have xy */
diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index ed4dd3e..63d5674 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -119,6 +119,15 @@ static __inline__ uint32_t F_TO_24(float val)
 	return float24;
 }
 
+static __inline__ uint32_t float4touint(float fr, float fg, float fb, float fa)
+{
+    unsigned ur = fr * 255.0 + 0.5;
+    unsigned ug = fg * 255.0 + 0.5;
+    unsigned ub = fb * 255.0 + 0.5;
+    unsigned ua = fa * 255.0 + 0.5;
+    return (ua << 24) | (ur << 16) | (ug << 8) | ub;
+}
+
 #define ACCEL_MMIO
 #define ACCEL_PREAMBLE()	unsigned char *RADEONMMIO = info->MMIO
 #define BEGIN_ACCEL(n)		RADEONWaitForFifo(pScrn, (n))
@@ -350,7 +359,11 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     }
 
     pPriv->planar_hw = pPriv->planar_state;
-    if (pPriv->bicubic_enabled || !( IS_R300_3D ))
+    if (pPriv->bicubic_enabled || !( IS_R300_3D ||
+	    (info->ChipFamily == CHIP_FAMILY_RV250) ||
+	    (info->ChipFamily == CHIP_FAMILY_RV280) ||
+	    (info->ChipFamily == CHIP_FAMILY_RS300) ||
+	    (info->ChipFamily == CHIP_FAMILY_R200) ))
         pPriv->planar_hw = 0;
 
     switch(id) {
@@ -625,11 +638,12 @@ static XF86VideoFormatRec Formats[NUM_FORMATS] =
     {15, TrueColor}, {16, TrueColor}, {24, TrueColor}
 };
 
-#define NUM_ATTRIBUTES 1
+#define NUM_ATTRIBUTES 2
 
 static XF86AttributeRec Attributes[NUM_ATTRIBUTES+1] =
 {
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
+    {XvSettable | XvGettable, 0, 1, "XV_HWPLANAR"},
     {0, 0, 0, NULL}
 };
 
diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index aa5d410..05acb93 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -758,7 +758,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	     * result = MAD(vco, yuv.vvvv, temp)
 	     */
 		float yco = 1.1643;
-		float uco[3] = {0.0, -0.39173, 2.017};
+		float uco[3] = {0.0, -0.39173, 2.018};
 		float vco[3] = {1.5958, -0.8129, 0.0};
 		float off[3] = {-0.0625 * yco + -0.5 * uco[0] + -0.5 * vco[0],
 				-0.0625 * yco + -0.5 * uco[1] + -0.5 * vco[1],
@@ -1567,10 +1567,18 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	    return;
 	}
 
-	if (pPriv->id == FOURCC_UYVY)
-	    txformat = RADEON_TXFORMAT_YVYU422;
-	else
-	    txformat = RADEON_TXFORMAT_VYUY422;
+	if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+	    isplanar = TRUE;
+	}
+
+	if (isplanar) {
+	    txformat = RADEON_TXFORMAT_I8;
+	} else {
+	    if (pPriv->id == FOURCC_UYVY)
+		txformat = RADEON_TXFORMAT_YVYU422;
+	    else
+		txformat = RADEON_TXFORMAT_VYUY422;
+	}
 
 	txformat |= RADEON_TXFORMAT_NON_POWER2;
 
@@ -1579,12 +1587,10 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	if (RADEONTilingEnabled(pScrn, pPixmap))
 	    colorpitch |= RADEON_COLOR_TILE_ENABLE;
 
-	BEGIN_ACCEL(5);
+	BEGIN_ACCEL(4);
 
-	OUT_ACCEL_REG(RADEON_PP_CNTL,
-		      RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
 	OUT_ACCEL_REG(RADEON_RB3D_CNTL,
-		      dst_format | RADEON_ALPHA_BLEND_ENABLE);
+		      dst_format /*| RADEON_ALPHA_BLEND_ENABLE*/);
 	OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, dst_offset);
 
 	OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, colorpitch);
@@ -1603,48 +1609,346 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	    info->accel_state->texW[0] = pPriv->w;
 	    info->accel_state->texH[0] = pPriv->h;
 
-	    BEGIN_ACCEL(12);
-
-	    OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
-	    OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
-			  (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
-
-	    OUT_ACCEL_REG(R200_PP_TXFILTER_0,
-			  R200_MAG_FILTER_LINEAR |
-			  R200_MIN_FILTER_LINEAR |
-			  R200_CLAMP_S_CLAMP_LAST |
-			  R200_CLAMP_T_CLAMP_LAST |
-			  R200_YUV_TO_RGB);
-	    OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
-	    OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
-	    OUT_ACCEL_REG(R200_PP_TXSIZE_0,
-			  (pPriv->w - 1) |
-			  ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
-	    OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
-
-	    OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
-
-	    OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
-			  R200_TXC_ARG_A_ZERO |
-			  R200_TXC_ARG_B_ZERO |
-			  R200_TXC_ARG_C_R0_COLOR |
-			  R200_TXC_OP_MADD);
-	    OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
-			  R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
-	    OUT_ACCEL_REG(R200_PP_TXABLEND_0,
-			  R200_TXA_ARG_A_ZERO |
-			  R200_TXA_ARG_B_ZERO |
-			  R200_TXA_ARG_C_R0_ALPHA |
-			  R200_TXA_OP_MADD);
-	    OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
-			  R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
-	    FINISH_ACCEL();
+	    if (isplanar) {
+		/* note: in contrast to r300, use input biasing on uv components */
+		float yco = 1.1643;
+		float yoff = -0.0625 * yco;
+		float uco[3] = {0.0, -0.39173, 2.018};
+		float vco[3] = {1.5958, -0.8129, 0.0};
+
+		/* need 2 texcoord sets (even though they are identical) due
+		   to denormalization! hw apparently can't premultiply
+		   same coord set by different texture size */
+		vtx_count = 6;
+
+		txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
+			    (((((pPriv->h + 1 ) >> 1) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
+		txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
+		txpitch -= 32;
+		txfilter =  R200_MAG_FILTER_LINEAR |
+			    R200_MIN_FILTER_LINEAR |
+			    R200_CLAMP_S_CLAMP_LAST |
+			    R200_CLAMP_T_CLAMP_LAST;
+
+		BEGIN_ACCEL(36);
+
+		OUT_ACCEL_REG(RADEON_PP_CNTL,
+			      RADEON_TEX_0_ENABLE | RADEON_TEX_1_ENABLE | RADEON_TEX_2_ENABLE |
+			      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
+			      RADEON_TEX_BLEND_2_ENABLE);
+
+		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
+		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
+			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT) |
+			      (2 << R200_VTX_TEX1_COMP_CNT_SHIFT));
+
+		OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
+		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
+		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
+		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
+			      (pPriv->w - 1) |
+			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
+		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
+		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
+
+		OUT_ACCEL_REG(R200_PP_TXFILTER_1, txfilter);
+		OUT_ACCEL_REG(R200_PP_TXFORMAT_1, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
+		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_1, 0);
+		OUT_ACCEL_REG(R200_PP_TXSIZE_1, txformat0);
+		OUT_ACCEL_REG(R200_PP_TXPITCH_1, txpitch);
+		OUT_ACCEL_REG(R200_PP_TXOFFSET_1, pPriv->src_offset + pPriv->planeu_offset);
+
+		OUT_ACCEL_REG(R200_PP_TXFILTER_2, txfilter);
+		OUT_ACCEL_REG(R200_PP_TXFORMAT_2, txformat | R200_TXFORMAT_ST_ROUTE_STQ1);
+		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_2, 0);
+		OUT_ACCEL_REG(R200_PP_TXSIZE_2, txformat0);
+		OUT_ACCEL_REG(R200_PP_TXPITCH_2, txpitch);
+		OUT_ACCEL_REG(R200_PP_TXOFFSET_2, pPriv->src_offset + pPriv->planev_offset);
+
+		/* similar to r300 code. Note the big problem is that hardware constants
+		 * are 8 bits only, representing 0.0-1.0. We can get that up (using bias
+		 * + scale) to -1.0-1.0 (but precision will suffer). AFAIK the hw actually
+		 * has 12 bits fractional precision (plus 1 sign bit, 3 range bits) but
+		 * the constants do not. To get a larger range we can use output scale, but for
+		 * that 2.018 value we need a total scale by 8, which means the constants
+		 * really have no accuracy whatsoever (5 fractional bits only).
+		 * The only direct way to get high  precision "constants" into the fragment
+		 * pipe I know of is to use the texcoord interpolator (not color, this one
+		 * is 8 bit only too), which seems a bit expensive. We're lucky though it
+		 * seems the values we need seem to fit better than worst case (get about
+		 * 6 fractional bits for this instead of 5, at least when not correcting for
+		 * hue/saturation/contrast/brightness, which is the same as for vco - yco and
+		 * yoff get 8 fractional bits).
+		 *
+		 * A higher precision (8 fractional bits) version might just put uco into
+		 * a texcoord, and calculate a new vcoconst in the shader, like so:
+		 * cohelper = {1.0, 0.0, 0.0} - shouldn't use 0.5 since not exactly representable
+		 * vco = {1.5958 - 1.0, -0.8129 + 1.0, 1.0}
+		 * vcocalc = ADD temp, bias/scale(cohelper), vco
+		 * would in total use 4 tex units, 4 instructions which seems fairly
+		 * balanced for this architecture (instead of 3 + 3 for the solution here)
+		 *
+		 * temp = MAD(yco, yuv.yyyy, yoff)
+		 * temp = MAD(uco, yuv.uuuu, temp)
+		 * result = MAD(vco, yuv.vvvv, temp)
+		 *
+		 * note the first mad actually produces a scalar, hence we transform
+		 * it into a dp2a to get 8 bit precision of yco instead of 7 -
+		 * That's assuming hw correctly expands consts to internal precision.
+		 * (y * 1 + y * (yco - 1) + yoff)
+		 * temp = DP2A / 2 (yco, yuv.yyyy, yoff)
+		 * temp = MAD (uco / 4, yuv.uuuu * 2, temp)
+		 * result = MAD x2 (vco / 2, yuv.vvvv, temp)
+		 *
+		 * vco, uco need bias (and hence scale too)
+		 *
+		 */
+
+		/* MAD temp0 / 2, const0.a * 2, temp0, -const0.rgb */
+		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
+			      R200_TXC_ARG_A_TFACTOR_COLOR |
+			      R200_TXC_ARG_B_R0_COLOR |
+			      R200_TXC_ARG_C_TFACTOR_COLOR |
+			      R200_TXC_NEG_ARG_C |
+			      R200_TXC_OP_DOT2_ADD);
+		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
+			      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
+			      R200_TXC_SCALE_INV2 |
+			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
+		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
+			      R200_TXA_ARG_A_ZERO |
+			      R200_TXA_ARG_B_ZERO |
+			      R200_TXA_ARG_C_ZERO |
+			      R200_TXA_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
+			      R200_TXA_OUTPUT_REG_NONE);
+
+		/* MAD temp0, (const1 - 0.5) * 2, (temp1 - 0.5) * 2, temp0 */
+		OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
+			      R200_TXC_ARG_A_TFACTOR_COLOR |
+			      R200_TXC_BIAS_ARG_A |
+			      R200_TXC_SCALE_ARG_A |
+			      R200_TXC_ARG_B_R1_COLOR |
+			      R200_TXC_BIAS_ARG_B |
+			      R200_TXC_SCALE_ARG_B |
+			      R200_TXC_ARG_C_R0_COLOR |
+			      R200_TXC_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
+			      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
+			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R0);
+		OUT_ACCEL_REG(R200_PP_TXABLEND_1,
+			      R200_TXA_ARG_A_ZERO |
+			      R200_TXA_ARG_B_ZERO |
+			      R200_TXA_ARG_C_ZERO |
+			      R200_TXA_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
+			      R200_TXA_OUTPUT_REG_NONE);
+
+		/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp2 - 0.5), temp0 */
+		OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
+			      R200_TXC_ARG_A_TFACTOR_COLOR |
+			      R200_TXC_BIAS_ARG_A |
+			      R200_TXC_SCALE_ARG_A |
+			      R200_TXC_ARG_B_R2_COLOR |
+			      R200_TXC_BIAS_ARG_B |
+			      R200_TXC_ARG_C_R0_COLOR |
+			      R200_TXC_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
+			      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
+			      R200_TXC_SCALE_2X |
+			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
+		OUT_ACCEL_REG(R200_PP_TXABLEND_2,
+			      R200_TXA_ARG_A_ZERO |
+			      R200_TXA_ARG_B_ZERO |
+			      R200_TXA_ARG_C_ZERO |
+			      R200_TXA_COMP_ARG_C |
+			      R200_TXA_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
+			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
+
+		/* shader constants */
+		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(1.0, /* src range [1, 2] */
+							      yco - 1.0,
+							      -yoff, /* range [-1, 0] */
+							      0.0));
+		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * 0.125 + 0.5, /* range [-4, 4] */
+							      uco[1] * 0.125 + 0.5,
+							      uco[2] * 0.125 + 0.5,
+							      0.0));
+		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * 0.25 + 0.5, /* range [-2, 2] */
+							      vco[1] * 0.25 + 0.5,
+							      vco[2] * 0.25 + 0.5,
+							      0.0));
+
+		FINISH_ACCEL();
+	    }
+	    else if (info->ChipFamily == CHIP_FAMILY_RV250) {
+		/* fix up broken packed yuv - shader same as above except
+		   yuv components are all in same reg */
+		float yco = 1.1643;
+		float yoff = -0.0625 * yco;
+		float uco[3] = {0.0, -0.39173, 2.018};
+		float vco[3] = {1.5958, -0.8129, 0.0};
+
+		txformat0 = (((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) |
+			    (((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << RADEON_TEX_VSIZE_SHIFT));
+		txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
+		txpitch -= 32;
+		txfilter =  R200_MAG_FILTER_LINEAR |
+			    R200_MIN_FILTER_LINEAR |
+			    R200_CLAMP_S_CLAMP_LAST |
+			    R200_CLAMP_T_CLAMP_LAST;
+
+		BEGIN_ACCEL(24);
+
+		OUT_ACCEL_REG(RADEON_PP_CNTL,
+			      RADEON_TEX_0_ENABLE |
+			      RADEON_TEX_BLEND_0_ENABLE | RADEON_TEX_BLEND_1_ENABLE |
+			      RADEON_TEX_BLEND_2_ENABLE);
+
+		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
+		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
+			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
+
+		OUT_ACCEL_REG(R200_PP_TXFILTER_0, txfilter);
+		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
+		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
+		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
+			      (pPriv->w - 1) |
+			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
+		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
+		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
+
+		/* MAD temp1 / 2, const0.a * 2, temp0.ggg, -const0.rgb */
+		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
+			      R200_TXC_ARG_A_TFACTOR_COLOR |
+			      R200_TXC_ARG_B_R0_COLOR |
+			      R200_TXC_ARG_C_TFACTOR_COLOR |
+			      R200_TXC_NEG_ARG_C |
+			      R200_TXC_OP_DOT2_ADD);
+		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
+			      (0 << R200_TXC_TFACTOR_SEL_SHIFT) |
+			      R200_TXC_SCALE_INV2 |
+			      (R200_TXC_REPL_GREEN << R200_TXC_REPL_ARG_B_SHIFT) |
+			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
+		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
+			      R200_TXA_ARG_A_ZERO |
+			      R200_TXA_ARG_B_ZERO |
+			      R200_TXA_ARG_C_ZERO |
+			      R200_TXA_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
+			      R200_TXA_OUTPUT_REG_NONE);
+
+		/* MAD temp1, (const1 - 0.5) * 2, (temp0.rrr - 0.5) * 2, temp1 */
+		OUT_ACCEL_REG(R200_PP_TXCBLEND_1,
+			      R200_TXC_ARG_A_TFACTOR_COLOR |
+			      R200_TXC_BIAS_ARG_A |
+			      R200_TXC_SCALE_ARG_A |
+			      R200_TXC_ARG_B_R0_COLOR |
+			      R200_TXC_BIAS_ARG_B |
+			      R200_TXC_SCALE_ARG_B |
+			      R200_TXC_ARG_C_R1_COLOR |
+			      R200_TXC_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXCBLEND2_1,
+			      (1 << R200_TXC_TFACTOR_SEL_SHIFT) |
+			      (R200_TXC_REPL_BLUE << R200_TXC_REPL_ARG_B_SHIFT) |
+			      R200_TXC_CLAMP_8_8 | R200_TXC_OUTPUT_REG_R1);
+		OUT_ACCEL_REG(R200_PP_TXABLEND_1,
+			      R200_TXA_ARG_A_ZERO |
+			      R200_TXA_ARG_B_ZERO |
+			      R200_TXA_ARG_C_ZERO |
+			      R200_TXA_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXABLEND2_1,
+			      R200_TXA_OUTPUT_REG_NONE);
+
+		/* MAD temp0 x 2, (const2 - 0.5) * 2, (temp0.bbb - 0.5), temp1 */
+		OUT_ACCEL_REG(R200_PP_TXCBLEND_2,
+			      R200_TXC_ARG_A_TFACTOR_COLOR |
+			      R200_TXC_BIAS_ARG_A |
+			      R200_TXC_SCALE_ARG_A |
+			      R200_TXC_ARG_B_R0_COLOR |
+			      R200_TXC_BIAS_ARG_B |
+			      R200_TXC_ARG_C_R1_COLOR |
+			      R200_TXC_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXCBLEND2_2,
+			      (2 << R200_TXC_TFACTOR_SEL_SHIFT) |
+			      R200_TXC_SCALE_2X |
+			      (R200_TXC_REPL_RED << R200_TXC_REPL_ARG_B_SHIFT) |
+			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
+		OUT_ACCEL_REG(R200_PP_TXABLEND_2,
+			      R200_TXA_ARG_A_ZERO |
+			      R200_TXA_ARG_B_ZERO |
+			      R200_TXA_ARG_C_ZERO |
+			      R200_TXA_COMP_ARG_C |
+			      R200_TXA_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXABLEND2_2,
+			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
+
+		/* shader constants */
+		OUT_ACCEL_REG(R200_PP_TFACTOR_0, float4touint(1.0, /* src range [1, 2] */
+							      yco - 1.0,
+							      -yoff, /* range [-1, 0] */
+							      0.0));
+		OUT_ACCEL_REG(R200_PP_TFACTOR_1, float4touint(uco[0] * 0.125 + 0.5, /* range [-4, 4] */
+							      uco[1] * 0.125 + 0.5,
+							      uco[2] * 0.125 + 0.5,
+							      0.0));
+		OUT_ACCEL_REG(R200_PP_TFACTOR_2, float4touint(vco[0] * 0.25 + 0.5, /* range [-2, 2] */
+							      vco[1] * 0.25 + 0.5,
+							      vco[2] * 0.25 + 0.5,
+							      0.0));
+
+		FINISH_ACCEL();
+	    }
+	    else {
+		BEGIN_ACCEL(13);
+		OUT_ACCEL_REG(RADEON_PP_CNTL,
+			      RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
+
+		OUT_ACCEL_REG(R200_SE_VTX_FMT_0, R200_VTX_XY);
+		OUT_ACCEL_REG(R200_SE_VTX_FMT_1,
+			      (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
+
+		OUT_ACCEL_REG(R200_PP_TXFILTER_0,
+			      R200_MAG_FILTER_LINEAR |
+			      R200_MIN_FILTER_LINEAR |
+			      R200_CLAMP_S_CLAMP_LAST |
+			      R200_CLAMP_T_CLAMP_LAST |
+			      R200_YUV_TO_RGB);
+		OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
+		OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
+		OUT_ACCEL_REG(R200_PP_TXSIZE_0,
+			      (pPriv->w - 1) |
+			      ((pPriv->h - 1) << RADEON_TEX_VSIZE_SHIFT));
+		OUT_ACCEL_REG(R200_PP_TXPITCH_0, pPriv->src_pitch - 32);
+
+		OUT_ACCEL_REG(R200_PP_TXOFFSET_0, pPriv->src_offset);
+
+		OUT_ACCEL_REG(R200_PP_TXCBLEND_0,
+			      R200_TXC_ARG_A_ZERO |
+			      R200_TXC_ARG_B_ZERO |
+			      R200_TXC_ARG_C_R0_COLOR |
+			      R200_TXC_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXCBLEND2_0,
+			      R200_TXC_CLAMP_0_1 | R200_TXC_OUTPUT_REG_R0);
+		OUT_ACCEL_REG(R200_PP_TXABLEND_0,
+			      R200_TXA_ARG_A_ZERO |
+			      R200_TXA_ARG_B_ZERO |
+			      R200_TXA_ARG_C_R0_ALPHA |
+			      R200_TXA_OP_MADD);
+		OUT_ACCEL_REG(R200_PP_TXABLEND2_0,
+			      R200_TXA_CLAMP_0_1 | R200_TXA_OUTPUT_REG_R0);
+		FINISH_ACCEL();
+	    }
 	} else {
 
 	    info->accel_state->texW[0] = 1;
 	    info->accel_state->texH[0] = 1;
 
-	    BEGIN_ACCEL(8);
+	    BEGIN_ACCEL(9);
+
+	    OUT_ACCEL_REG(RADEON_PP_CNTL,
+			  RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE);
 
 	    OUT_ACCEL_REG(RADEON_SE_VTX_FMT, (RADEON_SE_VTX_FMT_XY |
 					      RADEON_SE_VTX_FMT_ST0));
@@ -1876,6 +2180,20 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 			    ((float)srcX + (float)srcw * (((float)dsth / (float)dstw) + 1.0)) / info->accel_state->texW[0],
 			                                              (float)srcY / info->accel_state->texH[0]);
 		}
+	    } else if (isplanar) {
+		/*
+		 * Just render a rect (using three coords).
+		 * Filter is a bit of a misnomer, it's just texcoords...
+		 */
+		VTX_OUT_FILTER((float)dstX,                                (float)(dstY + dsth),
+			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0],
+			(float)srcX / info->accel_state->texW[0],          (float)(srcY + srch) / info->accel_state->texH[0]);
+		VTX_OUT_FILTER((float)(dstX + dstw),                       (float)(dstY + dsth),
+			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0],
+			(float)(srcX + srcw) / info->accel_state->texW[0], (float)(srcY + srch) / info->accel_state->texH[0]);
+		VTX_OUT_FILTER((float)(dstX + dstw),                       (float)dstY,
+			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0],
+			(float)(srcX + srcw) / info->accel_state->texW[0], (float)srcY / info->accel_state->texH[0]);
 	    } else {
 		/*
 		 * Just render a rect (using three coords).
diff --git a/src/radeon_video.c b/src/radeon_video.c
index 92d1a71..a2a4696 100644
--- a/src/radeon_video.c
+++ b/src/radeon_video.c
@@ -297,22 +297,19 @@ void RADEONInitVideo(ScreenPtr pScreen)
 	RADEONInitOffscreenImages(pScreen);
     }
 
-    if (info->ChipFamily != CHIP_FAMILY_RV250) {
-	if ((info->ChipFamily < CHIP_FAMILY_RS400)
+    if ((info->ChipFamily < CHIP_FAMILY_RS400)
 #ifdef XF86DRI
-	    || (info->directRenderingEnabled)
+	|| (info->directRenderingEnabled)
 #endif
-	    ) {
-	    texturedAdaptor = RADEONSetupImageTexturedVideo(pScreen);
-	    if (texturedAdaptor != NULL) {
-		adaptors[num_adaptors++] = texturedAdaptor;
-		xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Set up textured video\n");
-	    } else
-		xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Failed to set up textured video\n");
+	) {
+	texturedAdaptor = RADEONSetupImageTexturedVideo(pScreen);
+	if (texturedAdaptor != NULL) {
+	    adaptors[num_adaptors++] = texturedAdaptor;
+	    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Set up textured video\n");
 	} else
-	    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Textured video requires CP on R5xx/R6xx/R7xx/IGP\n");
+	    xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Failed to set up textured video\n");
     } else
-	xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Textured video disabled on RV250 due to HW bug\n");
+	xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Textured video requires CP on R5xx/R6xx/R7xx/IGP\n");
 
     if(num_adaptors)
 	xf86XVScreenInit(pScreen, adaptors, num_adaptors);
commit 58530bf4912800f9e09ebaea42a13cff8a80c19e
Author: Roland Scheidegger <sroland at tungstengraphics.com>
Date:   Sat Feb 21 04:46:31 2009 +0100

    don't convert planar yuv to packed for r300
    
    uses 3 textures for planar yuv and does yuv->rgb conversion in the shader.
    small performance advantage, but manual texture cache setting is necessary
    otherwise it may be measurably slower (but probably not relevant) in some
    cases.
    Unlike some other drivers, this uses MADs instead of DP3s, since that requires
    fewer instructions because no MOVs are needed; the end result is the same,
    though the constants need to be different.
    Use of this is user settable for now (XV_HWPLANAR attrib).

diff --git a/src/radeon_reg.h b/src/radeon_reg.h
index 0af8859..247a0e7 100644
--- a/src/radeon_reg.h
+++ b/src/radeon_reg.h
@@ -4406,6 +4406,7 @@
 #define R300_TX_INVALTAGS				0x4100
 #define R300_TX_FILTER0_0				0x4400
 #define R300_TX_FILTER0_1				0x4404
+#define R300_TX_FILTER0_2				0x4408
 #       define R300_TX_CLAMP_S(x)                       ((x) << 0)
 #       define R300_TX_CLAMP_T(x)                       ((x) << 3)
 #       define R300_TX_CLAMP_R(x)                       ((x) << 6)
@@ -4424,8 +4425,10 @@
 #       define R300_TX_ID_SHIFT                         28
 #define R300_TX_FILTER1_0				0x4440
 #define R300_TX_FILTER1_1				0x4444
+#define R300_TX_FILTER1_2				0x4448
 #define R300_TX_FORMAT0_0				0x4480
 #define R300_TX_FORMAT0_1				0x4484
+#define R300_TX_FORMAT0_2				0x4488
 #       define R300_TXWIDTH_SHIFT                       0
 #       define R300_TXHEIGHT_SHIFT                      11
 #       define R300_NUM_LEVELS_SHIFT                    26
@@ -4434,6 +4437,7 @@
 #       define R300_TXPITCH_EN                          (1 << 31)
 #define R300_TX_FORMAT1_0				0x44c0
 #define R300_TX_FORMAT1_1				0x44c4
+#define R300_TX_FORMAT1_2				0x44c8
 #	define R300_TX_FORMAT_X8		    0x0
 #	define R300_TX_FORMAT_X16		    0x1
 #	define R300_TX_FORMAT_Y4X4		    0x2
@@ -4506,13 +4510,23 @@
 #       define R300_TX_FORMAT_YUV_TO_RGB_NO_CLAMP      (2 << 22)
 #       define R300_TX_FORMAT_SWAP_YUV                 (1 << 24)
 
+#       define R300_TX_FORMAT_CACHE_WHOLE              (0 << 27)
+#       define R300_TX_FORMAT_CACHE_HALF_REGION_0      (2 << 27)
+#       define R300_TX_FORMAT_CACHE_HALF_REGION_1      (3 << 27)
+#       define R300_TX_FORMAT_CACHE_FOURTH_REGION_0    (4 << 27)
+#       define R300_TX_FORMAT_CACHE_FOURTH_REGION_1    (5 << 27)
+#       define R300_TX_FORMAT_CACHE_FOURTH_REGION_2    (6 << 27)
+#       define R300_TX_FORMAT_CACHE_FOURTH_REGION_3    (7 << 27)
+
 #define R300_TX_FORMAT2_0				0x4500
 #define R300_TX_FORMAT2_1				0x4504
+#define R300_TX_FORMAT2_2				0x4508
 #       define R500_TXWIDTH_11                          (1 << 15)
 #       define R500_TXHEIGHT_11                         (1 << 16)
 
 #define R300_TX_OFFSET_0				0x4540
 #define R300_TX_OFFSET_1				0x4544
+#define R300_TX_OFFSET_2				0x4548
 #       define R300_ENDIAN_SWAP_16_BIT                  (1 << 0)
 #       define R300_ENDIAN_SWAP_32_BIT                  (2 << 0)
 #       define R300_ENDIAN_SWAP_HALF_DWORD              (3 << 0)
@@ -4523,6 +4537,7 @@
 #define R300_TX_ENABLE				        0x4104
 #       define R300_TEX_0_ENABLE                        (1 << 0)
 #       define R300_TEX_1_ENABLE                        (1 << 1)
+#       define R300_TEX_2_ENABLE                        (1 << 2)
 
 #define R300_US_W_FMT				        0x46b4
 #define R300_US_OUT_FMT_1				0x46a8
diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index f72f2c5..ed4dd3e 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -304,8 +304,9 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     RADEONInfoPtr info = RADEONPTR(pScrn);
     RADEONPortPrivPtr pPriv = (RADEONPortPrivPtr)data;
     INT32 x1, x2, y1, y2;
-    int srcPitch, srcPitch2, dstPitch;
+    int srcPitch, srcPitch2, dstPitch, dstPitch2 = 0;
     int s2offset, s3offset, tmp;
+    int d2line, d3line;
     int top, left, npixels, nlines, size;
     BoxRec dstBox;
     int dst_width = width, dst_height = height;
@@ -335,18 +336,45 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     if ((x1 >= x2) || (y1 >= y2))
 	return Success;
 
+    /* Bicubic filter setup */
+    pPriv->bicubic_enabled = (pPriv->bicubic_state != BICUBIC_OFF);
+    if (!(IS_R300_3D || IS_R500_3D || IS_R600_3D))
+	pPriv->bicubic_enabled = FALSE;
+    if (pPriv->bicubic_enabled && (pPriv->bicubic_state == BICUBIC_AUTO)) {
+	/*
+	 * Applying the bicubic filter with a scale of less than 200%
+	 * results in a blurred picture, so disable the filter.
+	 */
+	if ((src_w > drw_w / 2) || (src_h > drw_h / 2))
+	    pPriv->bicubic_enabled = FALSE;
+    }
+
+    pPriv->planar_hw = pPriv->planar_state;
+    if (pPriv->bicubic_enabled || !( IS_R300_3D ))
+        pPriv->planar_hw = 0;
+
     switch(id) {
     case FOURCC_YV12:
     case FOURCC_I420:
-	dstPitch = ((dst_width << 1) + 15) & ~15;
 	srcPitch = (width + 3) & ~3;
 	srcPitch2 = ((width >> 1) + 3) & ~3;
-	size = dstPitch * dst_height;
+        if (pPriv->planar_hw) {
+	    dstPitch = (dst_width + 15) & ~15;
+	    dstPitch = (dstPitch + 63) & ~63;
+	    dstPitch2 = ((dst_width >> 1) + 15) & ~15;
+	    dstPitch2 = (dstPitch2 + 63) & ~63;
+	    size = dstPitch * dst_height + 2 * dstPitch2 * ((dst_height + 1) >> 1);
+	} else {
+	    dstPitch = ((dst_width << 1) + 15) & ~15;
+	    dstPitch = (dstPitch + 63) & ~63;
+	    size = dstPitch * dst_height;
+	}
 	break;
     case FOURCC_UYVY:
     case FOURCC_YUY2:
     default:
 	dstPitch = ((dst_width << 1) + 15) & ~15;
+	dstPitch = (dstPitch + 63) & ~63;
 	srcPitch = (width << 1);
 	srcPitch2 = 0;
 	size = dstPitch * dst_height;
@@ -355,8 +383,7 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 
     if (info->ChipFamily >= CHIP_FAMILY_R600)
 	dstPitch = (dstPitch + 255) & ~255;
-    else
-	dstPitch = (dstPitch + 63) & ~63;
+    /* FIXME: size calc (adjust dstPitch earlier) */
 
     if (pPriv->video_memory != NULL && size != pPriv->size) {
 	radeon_legacy_free_memory(pScrn, pPriv->video_memory);
@@ -376,19 +403,6 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 	    return BadAlloc;
     }
 
-    /* Bicubic filter setup */
-    pPriv->bicubic_enabled = (pPriv->bicubic_state != BICUBIC_OFF);
-    if (!(IS_R300_3D || IS_R500_3D || IS_R600_3D))
-	pPriv->bicubic_enabled = FALSE;
-    if (pPriv->bicubic_enabled && (pPriv->bicubic_state == BICUBIC_AUTO)) {
-	/*
-	 * Applying the bicubic filter with a scale of less than 200%
-	 * results in a blurred picture, so disable the filter.
-	 */
-	if ((src_w > drw_w / 2) || (src_h > drw_h / 2))
-	    pPriv->bicubic_enabled = FALSE;
-    }
-
     /* Bicubic filter loading */
     if (pPriv->bicubic_memory == NULL && pPriv->bicubic_enabled) {
 	pPriv->bicubic_offset = radeon_legacy_allocate_memory(pScrn,
@@ -432,10 +446,16 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     else
 	pPriv->src_addr = (uint8_t *)(info->FB + pPriv->video_offset + (top * dstPitch));
     pPriv->src_pitch = dstPitch;
+    pPriv->planeu_offset = dstPitch * dst_height;
+    pPriv->planev_offset = pPriv->planeu_offset + dstPitch2 * ((dst_height + 1) >> 1);
     pPriv->size = size;
     pPriv->pDraw = pDraw;
 
+
 #if 0
+    ErrorF("planeu_offset: 0x%x\n", pPriv->planeu_offset);
+    ErrorF("planev_offset: 0x%x\n", pPriv->planev_offset);
+    ErrorF("dstPitch2: 0x%x\n", dstPitch2);
     ErrorF("src_offset: 0x%x\n", pPriv->src_offset);
     ErrorF("src_addr: 0x%x\n", pPriv->src_addr);
     ErrorF("src_pitch: 0x%x\n", pPriv->src_pitch);
@@ -470,6 +490,29 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 				     srcPitch, srcPitch2, pPriv->src_pitch,
 				     width, height);
 	    }
+	}
+        else if (pPriv->planar_hw) {
+	    top &= ~1;
+	    s2offset = srcPitch * ((height + 1) & ~1);
+	    s3offset = s2offset + srcPitch2 * ((height + 1) >> 1);
+	    s2offset += (top >> 1) * srcPitch2 + (left >> 1);
+	    s3offset += (top >> 1) * srcPitch2 + (left >> 1);
+	    d2line = pPriv->planeu_offset;
+	    d3line = pPriv->planev_offset;
+	    d2line += (top >> 1) * dstPitch2 - (top * dstPitch);
+	    d3line += (top >> 1) * dstPitch2 - (top * dstPitch);
+	    nlines = ((y2 + 0xffff) >> 16) - top;
+	    if(id == FOURCC_YV12) {
+		tmp = s2offset;
+		s2offset = s3offset;
+		s3offset = tmp;
+	    }
+	    RADEONCopyData(pScrn, buf + (top * srcPitch) + left, pPriv->src_addr + left,
+		srcPitch, dstPitch, nlines, npixels, 1);
+	    RADEONCopyData(pScrn, buf + s2offset,  pPriv->src_addr + d2line + (left >> 1),
+		srcPitch2, dstPitch2, (nlines + 1) >> 1, npixels >> 1, 1);
+	    RADEONCopyData(pScrn, buf + s3offset, pPriv->src_addr + d3line + (left >> 1),
+		srcPitch2, dstPitch2, (nlines + 1) >> 1, npixels >> 1, 1);
 	} else {
 	    top &= ~1;
 	    nlines = ((((y2 + 0xffff) >> 16) + 1) & ~1) - top;
@@ -590,17 +633,19 @@ static XF86AttributeRec Attributes[NUM_ATTRIBUTES+1] =
     {0, 0, 0, NULL}
 };
 
-#define NUM_ATTRIBUTES_R300 2
+#define NUM_ATTRIBUTES_R300 3
 
 static XF86AttributeRec Attributes_r300[NUM_ATTRIBUTES_R300+1] =
 {
     {XvSettable | XvGettable, 0, 2, "XV_BICUBIC"},
     {XvSettable | XvGettable, 0, 1, "XV_VSYNC"},
+    {XvSettable | XvGettable, 0, 1, "XV_HWPLANAR"},
     {0, 0, 0, NULL}
 };
 
 static Atom xvBicubic;
 static Atom xvVSync;
+static Atom xvHWPlanar;
 
 #define NUM_IMAGES 4
 
@@ -627,6 +672,8 @@ RADEONGetTexPortAttribute(ScrnInfoPtr  pScrn,
 	*value = pPriv->bicubic_state;
     else if (attribute == xvVSync)
 	*value = pPriv->vsync;
+    else if (attribute == xvHWPlanar)
+	*value = pPriv->planar_state;
     else
 	return BadMatch;
 
@@ -648,6 +695,8 @@ RADEONSetTexPortAttribute(ScrnInfoPtr  pScrn,
 	pPriv->bicubic_state = ClipValue (value, 0, 2);
     else if (attribute == xvVSync)
 	pPriv->vsync = ClipValue (value, 0, 1);
+    else if (attribute == xvHWPlanar)
+	pPriv->planar_state = ClipValue (value, 0, 1);
     else
 	return BadMatch;
 
@@ -671,6 +720,7 @@ RADEONSetupImageTexturedVideo(ScreenPtr pScreen)
 
     xvBicubic         = MAKE_ATOM("XV_BICUBIC");
     xvVSync           = MAKE_ATOM("XV_VSYNC");
+    xvHWPlanar        = MAKE_ATOM("XV_HWPLANAR");
 
     adapt->type = XvWindowMask | XvInputMask | XvImageMask;
     adapt->flags = 0;
@@ -720,6 +770,7 @@ RADEONSetupImageTexturedVideo(ScreenPtr pScreen)
 	pPriv->doubleBuffer = 0;
 	pPriv->bicubic_state = BICUBIC_AUTO;
 	pPriv->vsync = TRUE;
+	pPriv->planar_state = 1;
 
 	/* gotta uninit this someplace, XXX: shouldn't be necessary for textured */
 	REGION_NULL(pScreen, &pPriv->clip);
diff --git a/src/radeon_textured_videofuncs.c b/src/radeon_textured_videofuncs.c
index f55ae12..aa5d410 100644
--- a/src/radeon_textured_videofuncs.c
+++ b/src/radeon_textured_videofuncs.c
@@ -97,6 +97,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
     uint32_t dst_offset, dst_pitch, dst_format;
     uint32_t txenable, colorpitch;
     uint32_t blendcntl;
+    Bool isplanar = FALSE;
     int dstxoff, dstyoff, pixel_shift, vtx_count;
     BoxPtr pBox = REGION_RECTS(&pPriv->clip);
     int nBox = REGION_NUM_RECTS(&pPriv->clip);
@@ -181,16 +182,29 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 	if (RADEONTilingEnabled(pScrn, pPixmap))
 	    colorpitch |= R300_COLORTILE;
 
-	if (pPriv->id == FOURCC_UYVY)
-	    txformat1 = R300_TX_FORMAT_YVYU422;
-	else
-	    txformat1 = R300_TX_FORMAT_VYUY422;
+	if (pPriv->planar_hw && (pPriv->id == FOURCC_I420 || pPriv->id == FOURCC_YV12)) {
+	    isplanar = TRUE;
+	}
 
-	txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
+	if (isplanar) {
+	    txformat1 = R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_HALF_REGION_0;
+	    txpitch = pPriv->src_pitch;
+	} else {
+	    if (pPriv->id == FOURCC_UYVY)
+		txformat1 = R300_TX_FORMAT_YVYU422;
+	    else
+		txformat1 = R300_TX_FORMAT_VYUY422;
+
+	    txformat1 |= R300_TX_FORMAT_YUV_TO_RGB_CLAMP;
+
+	    /* pitch is in pixels */
+	    txpitch = pPriv->src_pitch / 2;
+	}
+	txpitch -= 1;
 
 	txformat0 = ((((pPriv->w - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
-		     (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
-		     R300_TXPITCH_EN);
+		    (((pPriv->h - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
+		    R300_TXPITCH_EN);
 
 	info->accel_state->texW[0] = pPriv->w;
 	info->accel_state->texH[0] = pPriv->h;
@@ -201,9 +215,6 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 		    R300_TX_MIN_FILTER_LINEAR |
 		    (0 << R300_TX_ID_SHIFT));
 
-	/* pitch is in pixels */
-	txpitch = pPriv->src_pitch / 2;
-	txpitch -= 1;
 
 	if (IS_R500_3D && ((pPriv->w - 1) & 0x800))
 	    txpitch |= R500_TXWIDTH_11;
@@ -224,6 +235,34 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 
 	txenable = R300_TEX_0_ENABLE;
 
+	if (isplanar) {
+	    txformat0 = ((((((pPriv->w + 1 ) >> 1) - 1) & 0x7ff) << R300_TXWIDTH_SHIFT) |
+			(((((pPriv->h + 1 ) >> 1 ) - 1) & 0x7ff) << R300_TXHEIGHT_SHIFT) |
+			R300_TXPITCH_EN);
+	    txpitch = ((pPriv->src_pitch >> 1) + 63) & ~63;
+	    txpitch -= 1;
+	    txfilter = (R300_TX_CLAMP_S(R300_TX_CLAMP_CLAMP_LAST) |
+		        R300_TX_CLAMP_T(R300_TX_CLAMP_CLAMP_LAST) |
+			R300_TX_MIN_FILTER_LINEAR |
+			R300_TX_MAG_FILTER_LINEAR);
+
+		BEGIN_ACCEL(12);
+		OUT_ACCEL_REG(R300_TX_FILTER0_1, txfilter | (1 << R300_TX_ID_SHIFT));
+		OUT_ACCEL_REG(R300_TX_FILTER1_1, 0);
+		OUT_ACCEL_REG(R300_TX_FORMAT0_1, txformat0);
+		OUT_ACCEL_REG(R300_TX_FORMAT1_1, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_2);
+		OUT_ACCEL_REG(R300_TX_FORMAT2_1, txpitch);
+		OUT_ACCEL_REG(R300_TX_OFFSET_1, txoffset + pPriv->planeu_offset);
+		OUT_ACCEL_REG(R300_TX_FILTER0_2, txfilter | (2 << R300_TX_ID_SHIFT));
+		OUT_ACCEL_REG(R300_TX_FILTER1_2, 0);
+		OUT_ACCEL_REG(R300_TX_FORMAT0_2, txformat0);
+		OUT_ACCEL_REG(R300_TX_FORMAT1_2, R300_TX_FORMAT_X8 | R300_TX_FORMAT_CACHE_FOURTH_REGION_3);
+		OUT_ACCEL_REG(R300_TX_FORMAT2_2, txpitch);
+		OUT_ACCEL_REG(R300_TX_OFFSET_2, txoffset + pPriv->planev_offset);
+		FINISH_ACCEL();
+		txenable |= R300_TEX_1_ENABLE | R300_TEX_2_ENABLE;
+	}
+
 	if (pPriv->bicubic_enabled) {
 		/* Size is 128x1 */
 		txformat0 = ((0x7f << R300_TXWIDTH_SHIFT) |
@@ -691,6 +730,171 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 		OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), 0);
 
 		FINISH_ACCEL();
+	    } else if (isplanar) {
+	    /*
+	     * y' = y - .0625
+	     * u' = u - .5
+	     * v' = v - .5;
+	     *
+	     * r = 1.1643 * y' + 0.0     * u' + 1.5958  * v'
+	     * g = 1.1643 * y' - 0.39173 * u' - 0.81290 * v'
+	     * b = 1.1643 * y' + 2.017   * u' + 0.0     * v'
+	     *
+	     * DP3 might look like the straightforward solution
+	     * but we'd need to move the texture yuv values in
+	     * the same reg for this to work. Therefore use MADs.
+	     * Without changing the shader at all (only the constants)
+	     * could also provide hue/saturation/brightness/contrast control.
+	     *
+	     * yco = 1.1643
+	     * uco = 0, -0.39173, 2.017
+	     * vco = 1.5958, -0.8129, 0
+	     * off = -0.0625 * yco + -0.5 * uco[r] + -0.5 * vco[r],
+	     *       -0.0625 * yco + -0.5 * uco[g] + -0.5 * vco[g],
+	     *       -0.0625 * yco + -0.5 * uco[b] + -0.5 * vco[b],
+	     *
+	     * temp = MAD(yco, yuv.yyyy, off)
+	     * temp = MAD(uco, yuv.uuuu, temp)
+	     * result = MAD(vco, yuv.vvvv, temp)
+	     */
+		float yco = 1.1643;
+		float uco[3] = {0.0, -0.39173, 2.017};
+		float vco[3] = {1.5958, -0.8129, 0.0};
+		float off[3] = {-0.0625 * yco + -0.5 * uco[0] + -0.5 * vco[0],
+				-0.0625 * yco + -0.5 * uco[1] + -0.5 * vco[1],
+				-0.0625 * yco + -0.5 * uco[2] + -0.5 * vco[2]};
+
+		BEGIN_ACCEL(33);
+		/* 2 components: same 2 for tex0/1/2 */
+		OUT_ACCEL_REG(R300_RS_COUNT,
+			  ((2 << R300_RS_COUNT_IT_COUNT_SHIFT) |
+			   R300_RS_COUNT_HIRES_EN));
+		/* R300_INST_COUNT_RS - highest RS instruction used */
+		OUT_ACCEL_REG(R300_RS_INST_COUNT, R300_INST_COUNT_RS(0) | R300_TX_OFFSET_RS(6));
+
+		OUT_ACCEL_REG(R300_US_PIXSIZE, 2); /* highest temp used */
+
+		/* Indirection levels */
+		OUT_ACCEL_REG(R300_US_CONFIG, ((0 << R300_NLEVEL_SHIFT) |
+							R300_FIRST_TEX));
+
+		OUT_ACCEL_REG(R300_US_CODE_OFFSET, (R300_ALU_CODE_OFFSET(0) |
+						   R300_ALU_CODE_SIZE(3) |
+						   R300_TEX_CODE_OFFSET(0) |
+						   R300_TEX_CODE_SIZE(3)));
+
+		OUT_ACCEL_REG(R300_US_CODE_ADDR_3, (R300_ALU_START(0) |
+						   R300_ALU_SIZE(2) |
+						   R300_TEX_START(0) |
+						   R300_TEX_SIZE(2) |
+						   R300_RGBA_OUT));
+
+		/* tex inst */
+		OUT_ACCEL_REG(R300_US_TEX_INST_0, (R300_TEX_SRC_ADDR(0) |
+						  R300_TEX_DST_ADDR(0) |
+						  R300_TEX_ID(0) |
+						  R300_TEX_INST(R300_TEX_INST_LD)));
+		OUT_ACCEL_REG(R300_US_TEX_INST_1, (R300_TEX_SRC_ADDR(0) |
+						  R300_TEX_DST_ADDR(1) |
+						  R300_TEX_ID(1) |
+						  R300_TEX_INST(R300_TEX_INST_LD)));
+		OUT_ACCEL_REG(R300_US_TEX_INST_2, (R300_TEX_SRC_ADDR(0) |
+						  R300_TEX_DST_ADDR(2) |
+						  R300_TEX_ID(2) |
+						  R300_TEX_INST(R300_TEX_INST_LD)));
+
+		/* ALU inst */
+		/* MAD temp0, const0.a, temp0, const0.rgb */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(0), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(0)) |
+						   R300_ALU_RGB_ADDR1(0) |
+						   R300_ALU_RGB_ADDR2(0) |
+						   R300_ALU_RGB_ADDRD(0) |
+						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(0), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_AAA) |
+						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC0_RGB) |
+						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+		/* alpha nop, but need to set up alpha source for rgb usage */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(0), (R300_ALU_ALPHA_ADDR0(R300_ALU_ALPHA_CONST(0)) |
+						   R300_ALU_ALPHA_ADDR1(0) |
+						   R300_ALU_ALPHA_ADDR2(0) |
+						   R300_ALU_ALPHA_ADDRD(0) |
+						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(0), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* MAD const1, temp1, temp0 */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(1), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(1)) |
+						   R300_ALU_RGB_ADDR1(1) |
+						   R300_ALU_RGB_ADDR2(0) |
+						   R300_ALU_RGB_ADDRD(0) |
+						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(1), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
+						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE)));
+		/* alpha nop */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(1), (R300_ALU_ALPHA_ADDRD(0) |
+						   R300_ALU_ALPHA_WMASK(R300_ALU_ALPHA_MASK_NONE)));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(1), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_0_0)));
+
+		/* MAD result, const2, temp2, temp0 */
+		OUT_ACCEL_REG(R300_US_ALU_RGB_ADDR(2), (R300_ALU_RGB_ADDR0(R300_ALU_RGB_CONST(2)) |
+						   R300_ALU_RGB_ADDR1(2) |
+						   R300_ALU_RGB_ADDR2(0) |
+						   R300_ALU_RGB_ADDRD(0) |
+						   R300_ALU_RGB_WMASK(R300_ALU_RGB_MASK_RGB) |
+						   R300_ALU_RGB_OMASK(R300_ALU_RGB_MASK_RGB)));
+		OUT_ACCEL_REG(R300_US_ALU_RGB_INST(2), (R300_ALU_RGB_SEL_A(R300_ALU_RGB_SRC0_RGB) |
+						   R300_ALU_RGB_MOD_A(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_SEL_B(R300_ALU_RGB_SRC1_RGB) |
+						   R300_ALU_RGB_MOD_B(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_SEL_C(R300_ALU_RGB_SRC2_RGB) |
+						   R300_ALU_RGB_MOD_C(R300_ALU_RGB_MOD_NOP) |
+						   R300_ALU_RGB_OP(R300_ALU_RGB_OP_MAD) |
+						   R300_ALU_RGB_OMOD(R300_ALU_RGB_OMOD_NONE) |
+						   R300_ALU_RGB_CLAMP));
+		/* write alpha 1 */
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_ADDR(4), (R300_ALU_ALPHA_ADDRD(0) |
+						   R300_ALU_ALPHA_OMASK(R300_ALU_ALPHA_MASK_A) |
+						   R300_ALU_ALPHA_TARGET_A));
+		OUT_ACCEL_REG(R300_US_ALU_ALPHA_INST(4), (R300_ALU_ALPHA_OP(R300_ALU_ALPHA_OP_MAD) |
+						   R300_ALU_ALPHA_SEL_A(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_B(R300_ALU_ALPHA_0_0) |
+						   R300_ALU_ALPHA_SEL_C(R300_ALU_ALPHA_1_0)));
+
+		/* Shader constants. */
+		/* constant 0: off, yco */
+		OUT_ACCEL_REG(R300_US_ALU_CONST_R(0), F_TO_24(off[0]));
+		OUT_ACCEL_REG(R300_US_ALU_CONST_G(0), F_TO_24(off[1]));
+		OUT_ACCEL_REG(R300_US_ALU_CONST_B(0), F_TO_24(off[2]));
+		OUT_ACCEL_REG(R300_US_ALU_CONST_A(0), F_TO_24(yco));
+		/* constant 1: uco */
+		OUT_ACCEL_REG(R300_US_ALU_CONST_R(1), F_TO_24(uco[0]));
+		OUT_ACCEL_REG(R300_US_ALU_CONST_G(1), F_TO_24(uco[1]));
+		OUT_ACCEL_REG(R300_US_ALU_CONST_B(1), F_TO_24(uco[2]));
+		OUT_ACCEL_REG(R300_US_ALU_CONST_A(1), F_TO_24(0.0));
+		/* constant 2: vco */
+		OUT_ACCEL_REG(R300_US_ALU_CONST_R(2), F_TO_24(vco[0]));
+		OUT_ACCEL_REG(R300_US_ALU_CONST_G(2), F_TO_24(vco[1]));
+		OUT_ACCEL_REG(R300_US_ALU_CONST_B(2), F_TO_24(vco[2]));
+		OUT_ACCEL_REG(R300_US_ALU_CONST_A(2), F_TO_24(0.0));
+
+		FINISH_ACCEL();
+
 	    } else {
 		BEGIN_ACCEL(11);
 		/* 2 components: 2 for tex0 */
@@ -760,7 +964,7 @@ FUNC_NAME(RADEONDisplayTexturedVideo)(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv
 						   R300_ALU_ALPHA_OMOD(R300_ALU_ALPHA_OMOD_NONE) |
 						   R300_ALU_ALPHA_CLAMP));
 		FINISH_ACCEL();
-		}
+	    }
 	} else {
 	    if (pPriv->bicubic_enabled) {
 		BEGIN_ACCEL(7);
diff --git a/src/radeon_video.h b/src/radeon_video.h
index 7f1891e..34fb07f 100644
--- a/src/radeon_video.h
+++ b/src/radeon_video.h
@@ -90,6 +90,11 @@ typedef struct {
    void         *video_memory;
    int           video_offset;
 
+   Bool          planar_hw;
+   Bool          planar_state;
+   int           planeu_offset;
+   int           planev_offset;
+
    /* bicubic filtering */
    void         *bicubic_memory;
    int           bicubic_offset;
commit 97e19d96ba65a3df2fa3bbf73cfcc01b6dc3e796
Author: Roland Scheidegger <sroland at tungstengraphics.com>
Date:   Tue Dec 30 22:23:39 2008 +0100

    clip fixes
    
    This fixes some oddities observed when the video is only partly visible.
    Instead of recalculating the geometry of the video, always use the same
    geometry. Also fixes an assignment that was present twice, and another
    issue (bringing the code in line with what the overlay code does).

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 2df299f..f72f2c5 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -327,10 +327,10 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     if (!xf86XVClipVideoHelper(&dstBox, &x1, &x2, &y1, &y2, clipBoxes, width, height))
 	return Success;
 
-    src_w = (x2 - x1) >> 16;
+/*    src_w = (x2 - x1) >> 16;
     src_h = (y2 - y1) >> 16;
     drw_w = dstBox.x2 - dstBox.x1;
-    drw_h = dstBox.y2 - dstBox.y1;
+    drw_h = dstBox.y2 - dstBox.y1;*/
 
     if ((x1 >= x2) || (y1 >= y2))
 	return Success;
@@ -475,7 +475,6 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 	    nlines = ((((y2 + 0xffff) >> 16) + 1) & ~1) - top;
 	    s2offset = srcPitch * height;
 	    s3offset = (srcPitch2 * (height >> 1)) + s2offset;
-	    top &= ~1;
 	    pPriv->src_addr += left << 1;
 	    tmp = ((top >> 1) * srcPitch2) + (left >> 1);
 	    s2offset += tmp;
@@ -504,7 +503,9 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 				 width, height);
 	} else {
 	    nlines = ((y2 + 0xffff) >> 16) - top;
-	    RADEONCopyData(pScrn, buf, pPriv->src_addr, srcPitch, dstPitch, nlines, npixels, 2);
+	    pPriv->src_addr += left << 1;
+	    RADEONCopyData(pScrn, buf + (top * srcPitch) + (left << 1),
+			   pPriv->src_addr, srcPitch, dstPitch, nlines, npixels, 2);
 	}
 	break;
     }


More information about the xorg-commit mailing list