[PATCH] EXA: Move floating point math to the GPU as much as possible for R1-5xx.

Michel Dänzer michel at daenzer.net
Sat Oct 3 07:48:44 PDT 2009


From: Michel Dänzer <daenzer at vmware.com>

Also add fast paths for untransformed Composite operations.

This can significantly reduce the CPU overhead in RadeonCompositeTileCP, at
least for TCL-capable GPUs.
---

I think the basic idea is sound, but I'm not sure whether some parts go too
far, e.g. the float fw, fh locals in the fast path. Opinions?


 src/r600_exa.c           |    2 -
 src/radeon.h             |    6 +-
 src/radeon_commonfuncs.c |    4 +-
 src/radeon_exa_render.c  |  242 +++++++++++++++++++++++++---------------------
 src/radeon_render.c      |   12 +-
 5 files changed, 141 insertions(+), 125 deletions(-)

diff --git a/src/r600_exa.c b/src/r600_exa.c
index f6f2007..a794598 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -1114,8 +1114,6 @@ R600DoneCopy(PixmapPtr pDst)
 }
 
 
-#define xFixedToFloat(f) (((float) (f)) / 65536)
-
 struct blendinfo {
     Bool dst_alpha;
     Bool src_alpha;
diff --git a/src/radeon.h b/src/radeon.h
index 9d283bb..2fa4714 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -236,7 +236,7 @@ typedef enum {
 				   * for something else.
 				   */
 
-#define xFixedToFloat(f) (((float) (f)) / 65536)
+#define xFixedToFloat(f) (((float) (f)) * (1.0f / 65536.0f))
 
 #define RADEON_LOGLEVEL_DEBUG 4
 
@@ -657,8 +657,8 @@ struct radeon_accel_state {
     uint32_t          dst_pitch_offset;
 
     /* render accel */
-    unsigned short    texW[2];
-    unsigned short    texH[2];
+    float             texWrcp[2];
+    float             texHrcp[2];
     Bool              XInited3D; /* X itself has the 3D context */
     int               num_gb_pipes;
     Bool              has_tcl;
diff --git a/src/radeon_commonfuncs.c b/src/radeon_commonfuncs.c
index 8c46235..61a5b75 100644
--- a/src/radeon_commonfuncs.c
+++ b/src/radeon_commonfuncs.c
@@ -59,8 +59,8 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
     int size;
     ACCEL_PREAMBLE();
 
-    info->accel_state->texW[0] = info->accel_state->texH[0] =
-	info->accel_state->texW[1] = info->accel_state->texH[1] = 1;
+    info->accel_state->texWrcp[0] = info->accel_state->texHrcp[0] =
+	info->accel_state->texWrcp[1] = info->accel_state->texHrcp[1] = 1.0f;
 
     if (IS_R300_3D || IS_R500_3D) {
 
diff --git a/src/radeon_exa_render.c b/src/radeon_exa_render.c
index 7bc8ef0..116f00d 100644
--- a/src/radeon_exa_render.c
+++ b/src/radeon_exa_render.c
@@ -409,8 +409,8 @@ static Bool FUNC_NAME(R100TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
 	txformat |= RADEON_TXFORMAT_NON_POWER2;
     txformat |= unit << 24; /* RADEON_TXFORMAT_ST_ROUTE_STQX */
 
-    info->accel_state->texW[unit] = w;
-    info->accel_state->texH[unit] = h;
+    info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
+    info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
 
     switch (pPict->filter) {
     case PictFilterNearest:
@@ -794,8 +794,8 @@ static Bool FUNC_NAME(R200TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
 	txformat |= R200_TXFORMAT_NON_POWER2;
     txformat |= unit << R200_TXFORMAT_ST_ROUTE_SHIFT;
 
-    info->accel_state->texW[unit] = w;
-    info->accel_state->texH[unit] = h;
+    info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
+    info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
 
     switch (pPict->filter) {
     case PictFilterNearest:
@@ -1244,64 +1244,44 @@ static Bool FUNC_NAME(R300TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
 	OUT_ACCEL_REG(R300_TX_BORDER_COLOR_0 + (unit * 4), 0);
     FINISH_ACCEL();
 
-    if (pPict->transform != 0) {
-	info->accel_state->is_transform[unit] = TRUE;
-	info->accel_state->transform[unit] = pPict->transform;
+    if (info->accel_state->has_tcl) {
+	info->accel_state->is_transform[unit] = FALSE;
 
 	/* setup the PVS consts */
-	if (info->accel_state->has_tcl) {
-	    info->accel_state->texW[unit] = 1;
-	    info->accel_state->texH[unit] = 1;
-	    BEGIN_ACCEL(9);
-	    if (IS_R300_3D)
-		OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
-	    else
-		OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
+	BEGIN_ACCEL(9);
+	if (IS_R300_3D)
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
+	else
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
 
+	if (pPict->transform) {
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][0])));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][1])));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][2])));
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)w));
 
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][0])));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][1])));
 	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][2])));
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h));
-
-	    FINISH_ACCEL();
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)h));
 	} else {
-	    info->accel_state->texW[unit] = w;
-	    info->accel_state->texH[unit] = h;
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)w));
+
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)h));
 	}
-    } else {
-	info->accel_state->is_transform[unit] = FALSE;
-
-	/* setup the PVS consts */
-	if (info->accel_state->has_tcl) {
-	    info->accel_state->texW[unit] = 1;
-	    info->accel_state->texH[unit] = 1;
 
-	    BEGIN_ACCEL(9);
-	    if (IS_R300_3D)
-		OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
-	    else
-		OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
-
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0));
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w));
-
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0));
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
-	    OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h));
-
-	    FINISH_ACCEL();
-	} else {
-	    info->accel_state->texW[unit] = w;
-	    info->accel_state->texH[unit] = h;
-	}
+	FINISH_ACCEL();
+    } else {
+	info->accel_state->is_transform[unit] = !!pPict->transform;
+	info->accel_state->transform[unit] = pPict->transform;
+	info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
+	info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
     }
 
     return TRUE;
@@ -2147,8 +2127,6 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
 					   int w, int h)
 {
     int vtx_count;
-    xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight;
-    static xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
     ACCEL_PREAMBLE();
 
     ENTER_DRAW(0);
@@ -2172,45 +2150,9 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
     }
 #endif
 
-    srcTopLeft.x     = IntToxFixed(srcX);
-    srcTopLeft.y     = IntToxFixed(srcY);
-    srcTopRight.x    = IntToxFixed(srcX + w);
-    srcTopRight.y    = IntToxFixed(srcY);
-    srcBottomLeft.x  = IntToxFixed(srcX);
-    srcBottomLeft.y  = IntToxFixed(srcY + h);
-    srcBottomRight.x = IntToxFixed(srcX + w);
-    srcBottomRight.y = IntToxFixed(srcY + h);
-
-    if (info->accel_state->is_transform[0]) {
-	if ((info->ChipFamily < CHIP_FAMILY_R300) || !info->accel_state->has_tcl) {
-	    transformPoint(info->accel_state->transform[0], &srcTopLeft);
-	    transformPoint(info->accel_state->transform[0], &srcTopRight);
-	    transformPoint(info->accel_state->transform[0], &srcBottomLeft);
-	    transformPoint(info->accel_state->transform[0], &srcBottomRight);
-	}
-    }
-
-    if (info->accel_state->msk_pic) {
-	maskTopLeft.x     = IntToxFixed(maskX);
-	maskTopLeft.y     = IntToxFixed(maskY);
-	maskTopRight.x    = IntToxFixed(maskX + w);
-	maskTopRight.y    = IntToxFixed(maskY);
-	maskBottomLeft.x  = IntToxFixed(maskX);
-	maskBottomLeft.y  = IntToxFixed(maskY + h);
-	maskBottomRight.x = IntToxFixed(maskX + w);
-	maskBottomRight.y = IntToxFixed(maskY + h);
-
-	if (info->accel_state->is_transform[1]) {
-	    if ((info->ChipFamily < CHIP_FAMILY_R300) || !info->accel_state->has_tcl) {
-		transformPoint(info->accel_state->transform[1], &maskTopLeft);
-		transformPoint(info->accel_state->transform[1], &maskTopRight);
-		transformPoint(info->accel_state->transform[1], &maskBottomLeft);
-		transformPoint(info->accel_state->transform[1], &maskBottomRight);
-	    }
-	}
-
+    if (info->accel_state->msk_pic)
 	vtx_count = 6;
-    } else
+    else
 	vtx_count = 4;
 
     if (info->accel_state->vsync)
@@ -2285,32 +2227,108 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
 
 #endif
 
-    if (info->accel_state->msk_pic) {
-	if (IS_R300_3D || IS_R500_3D) {
-	    VTX_OUT_MASK((float)dstX,                                      (float)dstY,
-			 xFixedToFloat(srcTopLeft.x) / info->accel_state->texW[0],      xFixedToFloat(srcTopLeft.y) / info->accel_state->texH[0],
-			 xFixedToFloat(maskTopLeft.x) / info->accel_state->texW[1],     xFixedToFloat(maskTopLeft.y) / info->accel_state->texH[1]);
+    if ((info->ChipFamily >= CHIP_FAMILY_R300) && info->accel_state->has_tcl) {
+	float dstX1, dstY1, dstX2, dstY2;
+	float srcX1, srcY1, srcX2, srcY2;
+	float fw, fh;
+
+	fw = w;
+	fh = h;
+	dstX1 = dstX;
+	dstY1 = dstY;
+	dstX2 = dstX1 + fw;
+	dstY2 = dstY1 + fh;
+	srcX1 = srcX;
+	srcY1 = srcY;
+	srcX2 = srcX1 + fw;
+	srcY2 = srcY1 + fh;
+
+	if (info->accel_state->msk_pic) {
+	    float maskX1, maskY1, maskX2, maskY2;
+
+	    maskX1 = maskX;
+	    maskY1 = maskY;
+	    maskX2 = maskX1 + fw;
+	    maskY2 = maskY1 + fh;
+
+	    VTX_OUT_MASK(dstX1,  dstY1,  srcX1,  srcY1,	 maskX1,  maskY1);
+	    VTX_OUT_MASK(dstX1,  dstY2,  srcX1,  srcY2,  maskX1,  maskY2);
+	    VTX_OUT_MASK(dstX2,  dstY2,  srcX2,  srcY2,  maskX2,  maskY2);
+	    VTX_OUT_MASK(dstX2,  dstY1,  srcX2,  srcY1,  maskX2,  maskY1);
+	} else {
+	    VTX_OUT(dstX1,  dstY1,  srcX1,  srcY1);
+	    VTX_OUT(dstX1,  dstY2,  srcX1,  srcY2);
+	    VTX_OUT(dstX2,  dstY2,  srcX2,  srcY2);
+	    VTX_OUT(dstX2,  dstY1,  srcX2,  srcY1);
 	}
-	VTX_OUT_MASK((float)dstX,                                      (float)(dstY + h),
-		xFixedToFloat(srcBottomLeft.x) / info->accel_state->texW[0],   xFixedToFloat(srcBottomLeft.y) / info->accel_state->texH[0],
-		xFixedToFloat(maskBottomLeft.x) / info->accel_state->texW[1],  xFixedToFloat(maskBottomLeft.y) / info->accel_state->texH[1]);
-	VTX_OUT_MASK((float)(dstX + w),                                (float)(dstY + h),
-		xFixedToFloat(srcBottomRight.x) / info->accel_state->texW[0],  xFixedToFloat(srcBottomRight.y) / info->accel_state->texH[0],
-		xFixedToFloat(maskBottomRight.x) / info->accel_state->texW[1], xFixedToFloat(maskBottomRight.y) / info->accel_state->texH[1]);
-	VTX_OUT_MASK((float)(dstX + w),                                (float)dstY,
-		xFixedToFloat(srcTopRight.x) / info->accel_state->texW[0],     xFixedToFloat(srcTopRight.y) / info->accel_state->texH[0],
-		xFixedToFloat(maskTopRight.x) / info->accel_state->texW[1],    xFixedToFloat(maskTopRight.y) / info->accel_state->texH[1]);
     } else {
-	if (IS_R300_3D || IS_R500_3D) {
-	    VTX_OUT((float)dstX,                                      (float)dstY,
-		    xFixedToFloat(srcTopLeft.x) / info->accel_state->texW[0],      xFixedToFloat(srcTopLeft.y) / info->accel_state->texH[0]);
+	xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight;
+	float srcWrcp = info->accel_state->texWrcp[0];
+	float srcHrcp = info->accel_state->texHrcp[0];
+
+	srcTopLeft.x     = IntToxFixed(srcX);
+	srcTopLeft.y     = IntToxFixed(srcY);
+	srcTopRight.x    = IntToxFixed(srcX + w);
+	srcTopRight.y    = IntToxFixed(srcY);
+	srcBottomLeft.x  = IntToxFixed(srcX);
+	srcBottomLeft.y  = IntToxFixed(srcY + h);
+	srcBottomRight.x = IntToxFixed(srcX + w);
+	srcBottomRight.y = IntToxFixed(srcY + h);
+
+	if (info->accel_state->is_transform[0]) {
+	    transformPoint(info->accel_state->transform[0], &srcTopLeft);
+	    transformPoint(info->accel_state->transform[0], &srcTopRight);
+	    transformPoint(info->accel_state->transform[0], &srcBottomLeft);
+	    transformPoint(info->accel_state->transform[0], &srcBottomRight);
+	}
+
+	if (info->accel_state->msk_pic) {
+	    xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
+	    float maskWrcp = info->accel_state->texWrcp[1];
+	    float maskHrcp = info->accel_state->texHrcp[1];
+
+	    maskTopLeft.x     = IntToxFixed(maskX);
+	    maskTopLeft.y     = IntToxFixed(maskY);
+	    maskTopRight.x    = IntToxFixed(maskX + w);
+	    maskTopRight.y    = IntToxFixed(maskY);
+	    maskBottomLeft.x  = IntToxFixed(maskX);
+	    maskBottomLeft.y  = IntToxFixed(maskY + h);
+	    maskBottomRight.x = IntToxFixed(maskX + w);
+	    maskBottomRight.y = IntToxFixed(maskY + h);
+
+	    if (info->accel_state->is_transform[1]) {
+		transformPoint(info->accel_state->transform[1], &maskTopLeft);
+		transformPoint(info->accel_state->transform[1], &maskTopRight);
+		transformPoint(info->accel_state->transform[1], &maskBottomLeft);
+		transformPoint(info->accel_state->transform[1], &maskBottomRight);
+	    }
+
+	    if (IS_R300_3D || IS_R500_3D) {
+		VTX_OUT_MASK((float)dstX,              (float)dstY,
+			     srcTopLeft.x * srcWrcp,   srcTopLeft.y * srcHrcp,
+			     maskTopLeft.x * maskWrcp, maskTopLeft.y * maskHrcp);
+	    }
+	    VTX_OUT_MASK((float)dstX,                  (float)(dstY + h),
+			 srcBottomLeft.x * srcWrcp,    srcBottomLeft.y * srcHrcp,
+			 maskBottomLeft.x * maskWrcp,  maskBottomLeft.y * maskHrcp);
+	    VTX_OUT_MASK((float)(dstX + w),            (float)(dstY + h),
+			 srcBottomRight.x * srcWrcp,   srcBottomRight.y * srcHrcp,
+			 maskBottomRight.x * maskWrcp, maskBottomRight.y * maskHrcp);
+	    VTX_OUT_MASK((float)(dstX + w),            (float)dstY,
+			 srcTopRight.x * srcWrcp,      srcTopRight.y * srcHrcp,
+			 maskTopRight.x * maskWrcp,    maskTopRight.y * maskHrcp);
+	} else {
+	    if (IS_R300_3D || IS_R500_3D) {
+		VTX_OUT((float)dstX,            (float)dstY,
+			srcTopLeft.x * srcWrcp, srcTopLeft.y * srcHrcp);
+	    }
+	    VTX_OUT((float)dstX,                (float)(dstY + h),
+		    srcBottomLeft.x * srcWrcp,  srcBottomLeft.y * srcHrcp);
+	    VTX_OUT((float)(dstX + w),          (float)(dstY + h),
+		    srcBottomRight.x * srcWrcp, srcBottomRight.y * srcHrcp);
+	    VTX_OUT((float)(dstX + w),          (float)dstY,
+		    srcTopRight.x * srcWrcp,    srcTopRight.y * srcHrcp);
 	}
-	VTX_OUT((float)dstX,                                      (float)(dstY + h),
-		xFixedToFloat(srcBottomLeft.x) / info->accel_state->texW[0],   xFixedToFloat(srcBottomLeft.y) / info->accel_state->texH[0]);
-	VTX_OUT((float)(dstX + w),                                (float)(dstY + h),
-		xFixedToFloat(srcBottomRight.x) / info->accel_state->texW[0],  xFixedToFloat(srcBottomRight.y) / info->accel_state->texH[0]);
-	VTX_OUT((float)(dstX + w),                                (float)dstY,
-		xFixedToFloat(srcTopRight.x) / info->accel_state->texW[0],     xFixedToFloat(srcTopRight.y) / info->accel_state->texH[0]);
     }
 
 #ifdef ACCEL_CP
diff --git a/src/radeon_render.c b/src/radeon_render.c
index 6668fe0..68811b7 100644
--- a/src/radeon_render.c
+++ b/src/radeon_render.c
@@ -773,8 +773,8 @@ static Bool FUNC_NAME(R200SetupTexture)(
 	txformat |= RADEON_TXFORMAT_NON_POWER2;
     }
 
-    info->accel_state->texW[0] = width;
-    info->accel_state->texH[0] = height;
+    info->accel_state->texWrcp[0] = 1.0f / width;
+    info->accel_state->texHrcp[0] = 1.0f / height;
 
     offset = info->accel_state->RenderTex->offset * pScrn->bitsPerPixel / 8;
     dst = (uint8_t*)(info->FB + offset);
@@ -975,10 +975,10 @@ FUNC_NAME(R200SubsequentCPUToScreenTexture) (
     
     r = width + l;
     b = height + t;
-    fl = (float)srcx / info->accel_state->texW[0];
-    fr = (float)(srcx + width) / info->accel_state->texW[0];
-    ft = (float)srcy / info->accel_state->texH[0];
-    fb = (float)(srcy + height) / info->accel_state->texH[0];
+    fl = (float)srcx * info->accel_state->texWrcp[0];
+    fr = (float)(srcx + width) * info->accel_state->texWrcp[0];
+    ft = (float)srcy * info->accel_state->texHrcp[0];
+    fb = (float)(srcy + height) * info->accel_state->texHrcp[0];
 
 #ifdef ACCEL_CP
     BEGIN_RING(24);
-- 
1.6.4.3



More information about the xorg-driver-ati mailing list