[PATCH] EXA: Move floating point math to the GPU as much as possible for R1-5xx.
Michel Dänzer
michel at daenzer.net
Sat Oct 3 07:48:44 PDT 2009
From: Michel Dänzer <daenzer at vmware.com>
Also add fast paths for untransformed Composite operations.
This can significantly reduce the CPU overhead in RadeonCompositeTileCP, at
least for TCL-capable GPUs.
---
I think the basic idea is sound, but I'm not sure whether some parts go too
far, e.g. the float fw, fh locals in the fast path. Opinions?
src/r600_exa.c | 2 -
src/radeon.h | 6 +-
src/radeon_commonfuncs.c | 4 +-
src/radeon_exa_render.c | 242 +++++++++++++++++++++++++---------------------
src/radeon_render.c | 12 +-
5 files changed, 141 insertions(+), 125 deletions(-)
diff --git a/src/r600_exa.c b/src/r600_exa.c
index f6f2007..a794598 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -1114,8 +1114,6 @@ R600DoneCopy(PixmapPtr pDst)
}
-#define xFixedToFloat(f) (((float) (f)) / 65536)
-
struct blendinfo {
Bool dst_alpha;
Bool src_alpha;
diff --git a/src/radeon.h b/src/radeon.h
index 9d283bb..2fa4714 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -236,7 +236,7 @@ typedef enum {
* for something else.
*/
-#define xFixedToFloat(f) (((float) (f)) / 65536)
+#define xFixedToFloat(f) (((float) (f)) * (1.0f / 65536.0f))
#define RADEON_LOGLEVEL_DEBUG 4
@@ -657,8 +657,8 @@ struct radeon_accel_state {
uint32_t dst_pitch_offset;
/* render accel */
- unsigned short texW[2];
- unsigned short texH[2];
+ float texWrcp[2];
+ float texHrcp[2];
Bool XInited3D; /* X itself has the 3D context */
int num_gb_pipes;
Bool has_tcl;
diff --git a/src/radeon_commonfuncs.c b/src/radeon_commonfuncs.c
index 8c46235..61a5b75 100644
--- a/src/radeon_commonfuncs.c
+++ b/src/radeon_commonfuncs.c
@@ -59,8 +59,8 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
int size;
ACCEL_PREAMBLE();
- info->accel_state->texW[0] = info->accel_state->texH[0] =
- info->accel_state->texW[1] = info->accel_state->texH[1] = 1;
+ info->accel_state->texWrcp[0] = info->accel_state->texHrcp[0] =
+ info->accel_state->texWrcp[1] = info->accel_state->texHrcp[1] = 1.0f;
if (IS_R300_3D || IS_R500_3D) {
diff --git a/src/radeon_exa_render.c b/src/radeon_exa_render.c
index 7bc8ef0..116f00d 100644
--- a/src/radeon_exa_render.c
+++ b/src/radeon_exa_render.c
@@ -409,8 +409,8 @@ static Bool FUNC_NAME(R100TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
txformat |= RADEON_TXFORMAT_NON_POWER2;
txformat |= unit << 24; /* RADEON_TXFORMAT_ST_ROUTE_STQX */
- info->accel_state->texW[unit] = w;
- info->accel_state->texH[unit] = h;
+ info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
+ info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
switch (pPict->filter) {
case PictFilterNearest:
@@ -794,8 +794,8 @@ static Bool FUNC_NAME(R200TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
txformat |= R200_TXFORMAT_NON_POWER2;
txformat |= unit << R200_TXFORMAT_ST_ROUTE_SHIFT;
- info->accel_state->texW[unit] = w;
- info->accel_state->texH[unit] = h;
+ info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
+ info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
switch (pPict->filter) {
case PictFilterNearest:
@@ -1244,64 +1244,44 @@ static Bool FUNC_NAME(R300TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
OUT_ACCEL_REG(R300_TX_BORDER_COLOR_0 + (unit * 4), 0);
FINISH_ACCEL();
- if (pPict->transform != 0) {
- info->accel_state->is_transform[unit] = TRUE;
- info->accel_state->transform[unit] = pPict->transform;
+ if (info->accel_state->has_tcl) {
+ info->accel_state->is_transform[unit] = FALSE;
/* setup the PVS consts */
- if (info->accel_state->has_tcl) {
- info->accel_state->texW[unit] = 1;
- info->accel_state->texH[unit] = 1;
- BEGIN_ACCEL(9);
- if (IS_R300_3D)
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
- else
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
+ BEGIN_ACCEL(9);
+ if (IS_R300_3D)
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
+ else
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
+ if (pPict->transform) {
OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][0])));
OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][1])));
OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][2])));
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w));
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)w));
OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][0])));
OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][1])));
OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][2])));
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h));
-
- FINISH_ACCEL();
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)h));
} else {
- info->accel_state->texW[unit] = w;
- info->accel_state->texH[unit] = h;
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f));
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)w));
+
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f));
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+ OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)h));
}
- } else {
- info->accel_state->is_transform[unit] = FALSE;
-
- /* setup the PVS consts */
- if (info->accel_state->has_tcl) {
- info->accel_state->texW[unit] = 1;
- info->accel_state->texH[unit] = 1;
- BEGIN_ACCEL(9);
- if (IS_R300_3D)
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
- else
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
-
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0));
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w));
-
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0));
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
- OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h));
-
- FINISH_ACCEL();
- } else {
- info->accel_state->texW[unit] = w;
- info->accel_state->texH[unit] = h;
- }
+ FINISH_ACCEL();
+ } else {
+ info->accel_state->is_transform[unit] = !!pPict->transform;
+ info->accel_state->transform[unit] = pPict->transform;
+ info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
+ info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
}
return TRUE;
@@ -2147,8 +2127,6 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
int w, int h)
{
int vtx_count;
- xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight;
- static xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
ACCEL_PREAMBLE();
ENTER_DRAW(0);
@@ -2172,45 +2150,9 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
}
#endif
- srcTopLeft.x = IntToxFixed(srcX);
- srcTopLeft.y = IntToxFixed(srcY);
- srcTopRight.x = IntToxFixed(srcX + w);
- srcTopRight.y = IntToxFixed(srcY);
- srcBottomLeft.x = IntToxFixed(srcX);
- srcBottomLeft.y = IntToxFixed(srcY + h);
- srcBottomRight.x = IntToxFixed(srcX + w);
- srcBottomRight.y = IntToxFixed(srcY + h);
-
- if (info->accel_state->is_transform[0]) {
- if ((info->ChipFamily < CHIP_FAMILY_R300) || !info->accel_state->has_tcl) {
- transformPoint(info->accel_state->transform[0], &srcTopLeft);
- transformPoint(info->accel_state->transform[0], &srcTopRight);
- transformPoint(info->accel_state->transform[0], &srcBottomLeft);
- transformPoint(info->accel_state->transform[0], &srcBottomRight);
- }
- }
-
- if (info->accel_state->msk_pic) {
- maskTopLeft.x = IntToxFixed(maskX);
- maskTopLeft.y = IntToxFixed(maskY);
- maskTopRight.x = IntToxFixed(maskX + w);
- maskTopRight.y = IntToxFixed(maskY);
- maskBottomLeft.x = IntToxFixed(maskX);
- maskBottomLeft.y = IntToxFixed(maskY + h);
- maskBottomRight.x = IntToxFixed(maskX + w);
- maskBottomRight.y = IntToxFixed(maskY + h);
-
- if (info->accel_state->is_transform[1]) {
- if ((info->ChipFamily < CHIP_FAMILY_R300) || !info->accel_state->has_tcl) {
- transformPoint(info->accel_state->transform[1], &maskTopLeft);
- transformPoint(info->accel_state->transform[1], &maskTopRight);
- transformPoint(info->accel_state->transform[1], &maskBottomLeft);
- transformPoint(info->accel_state->transform[1], &maskBottomRight);
- }
- }
-
+ if (info->accel_state->msk_pic)
vtx_count = 6;
- } else
+ else
vtx_count = 4;
if (info->accel_state->vsync)
@@ -2285,32 +2227,108 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
#endif
- if (info->accel_state->msk_pic) {
- if (IS_R300_3D || IS_R500_3D) {
- VTX_OUT_MASK((float)dstX, (float)dstY,
- xFixedToFloat(srcTopLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcTopLeft.y) / info->accel_state->texH[0],
- xFixedToFloat(maskTopLeft.x) / info->accel_state->texW[1], xFixedToFloat(maskTopLeft.y) / info->accel_state->texH[1]);
+ if ((info->ChipFamily >= CHIP_FAMILY_R300) && info->accel_state->has_tcl) {
+ float dstX1, dstY1, dstX2, dstY2;
+ float srcX1, srcY1, srcX2, srcY2;
+ float fw, fh;
+
+ fw = w;
+ fh = h;
+ dstX1 = dstX;
+ dstY1 = dstY;
+ dstX2 = dstX1 + fw;
+ dstY2 = dstY1 + fh;
+ srcX1 = srcX;
+ srcY1 = srcY;
+ srcX2 = srcX1 + fw;
+ srcY2 = srcY1 + fh;
+
+ if (info->accel_state->msk_pic) {
+ float maskX1, maskY1, maskX2, maskY2;
+
+ maskX1 = maskX;
+ maskY1 = maskY;
+ maskX2 = maskX1 + fw;
+ maskY2 = maskY1 + fh;
+
+ VTX_OUT_MASK(dstX1, dstY1, srcX1, srcY1, maskX1, maskY1);
+ VTX_OUT_MASK(dstX1, dstY2, srcX1, srcY2, maskX1, maskY2);
+ VTX_OUT_MASK(dstX2, dstY2, srcX2, srcY2, maskX2, maskY2);
+ VTX_OUT_MASK(dstX2, dstY1, srcX2, srcY1, maskX2, maskY1);
+ } else {
+ VTX_OUT(dstX1, dstY1, srcX1, srcY1);
+ VTX_OUT(dstX1, dstY2, srcX1, srcY2);
+ VTX_OUT(dstX2, dstY2, srcX2, srcY2);
+ VTX_OUT(dstX2, dstY1, srcX2, srcY1);
}
- VTX_OUT_MASK((float)dstX, (float)(dstY + h),
- xFixedToFloat(srcBottomLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomLeft.y) / info->accel_state->texH[0],
- xFixedToFloat(maskBottomLeft.x) / info->accel_state->texW[1], xFixedToFloat(maskBottomLeft.y) / info->accel_state->texH[1]);
- VTX_OUT_MASK((float)(dstX + w), (float)(dstY + h),
- xFixedToFloat(srcBottomRight.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomRight.y) / info->accel_state->texH[0],
- xFixedToFloat(maskBottomRight.x) / info->accel_state->texW[1], xFixedToFloat(maskBottomRight.y) / info->accel_state->texH[1]);
- VTX_OUT_MASK((float)(dstX + w), (float)dstY,
- xFixedToFloat(srcTopRight.x) / info->accel_state->texW[0], xFixedToFloat(srcTopRight.y) / info->accel_state->texH[0],
- xFixedToFloat(maskTopRight.x) / info->accel_state->texW[1], xFixedToFloat(maskTopRight.y) / info->accel_state->texH[1]);
} else {
- if (IS_R300_3D || IS_R500_3D) {
- VTX_OUT((float)dstX, (float)dstY,
- xFixedToFloat(srcTopLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcTopLeft.y) / info->accel_state->texH[0]);
+ xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight;
+ float srcWrcp = info->accel_state->texWrcp[0];
+ float srcHrcp = info->accel_state->texHrcp[0];
+
+ srcTopLeft.x = IntToxFixed(srcX);
+ srcTopLeft.y = IntToxFixed(srcY);
+ srcTopRight.x = IntToxFixed(srcX + w);
+ srcTopRight.y = IntToxFixed(srcY);
+ srcBottomLeft.x = IntToxFixed(srcX);
+ srcBottomLeft.y = IntToxFixed(srcY + h);
+ srcBottomRight.x = IntToxFixed(srcX + w);
+ srcBottomRight.y = IntToxFixed(srcY + h);
+
+ if (info->accel_state->is_transform[0]) {
+ transformPoint(info->accel_state->transform[0], &srcTopLeft);
+ transformPoint(info->accel_state->transform[0], &srcTopRight);
+ transformPoint(info->accel_state->transform[0], &srcBottomLeft);
+ transformPoint(info->accel_state->transform[0], &srcBottomRight);
+ }
+
+ if (info->accel_state->msk_pic) {
+ xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
+ float maskWrcp = info->accel_state->texWrcp[1];
+ float maskHrcp = info->accel_state->texHrcp[1];
+
+ maskTopLeft.x = IntToxFixed(maskX);
+ maskTopLeft.y = IntToxFixed(maskY);
+ maskTopRight.x = IntToxFixed(maskX + w);
+ maskTopRight.y = IntToxFixed(maskY);
+ maskBottomLeft.x = IntToxFixed(maskX);
+ maskBottomLeft.y = IntToxFixed(maskY + h);
+ maskBottomRight.x = IntToxFixed(maskX + w);
+ maskBottomRight.y = IntToxFixed(maskY + h);
+
+ if (info->accel_state->is_transform[1]) {
+ transformPoint(info->accel_state->transform[1], &maskTopLeft);
+ transformPoint(info->accel_state->transform[1], &maskTopRight);
+ transformPoint(info->accel_state->transform[1], &maskBottomLeft);
+ transformPoint(info->accel_state->transform[1], &maskBottomRight);
+ }
+
+ if (IS_R300_3D || IS_R500_3D) {
+ VTX_OUT_MASK((float)dstX, (float)dstY,
+ srcTopLeft.x * srcWrcp, srcTopLeft.y * srcHrcp,
+ maskTopLeft.x * maskWrcp, maskTopLeft.y * maskHrcp);
+ }
+ VTX_OUT_MASK((float)dstX, (float)(dstY + h),
+ srcBottomLeft.x * srcWrcp, srcBottomLeft.y * srcHrcp,
+ maskBottomLeft.x * maskWrcp, maskBottomLeft.y * maskHrcp);
+ VTX_OUT_MASK((float)(dstX + w), (float)(dstY + h),
+ srcBottomRight.x * srcWrcp, srcBottomRight.y * srcHrcp,
+ maskBottomRight.x * maskWrcp, maskBottomRight.y * maskHrcp);
+ VTX_OUT_MASK((float)(dstX + w), (float)dstY,
+ srcTopRight.x * srcWrcp, srcTopRight.y * srcHrcp,
+ maskTopRight.x * maskWrcp, maskTopRight.y * maskHrcp);
+ } else {
+ if (IS_R300_3D || IS_R500_3D) {
+ VTX_OUT((float)dstX, (float)dstY,
+ srcTopLeft.x * srcWrcp, srcTopLeft.y * srcHrcp);
+ }
+ VTX_OUT((float)dstX, (float)(dstY + h),
+ srcBottomLeft.x * srcWrcp, srcBottomLeft.y * srcHrcp);
+ VTX_OUT((float)(dstX + w), (float)(dstY + h),
+ srcBottomRight.x * srcWrcp, srcBottomRight.y * srcHrcp);
+ VTX_OUT((float)(dstX + w), (float)dstY,
+ srcTopRight.x * srcWrcp, srcTopRight.y * srcHrcp);
}
- VTX_OUT((float)dstX, (float)(dstY + h),
- xFixedToFloat(srcBottomLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomLeft.y) / info->accel_state->texH[0]);
- VTX_OUT((float)(dstX + w), (float)(dstY + h),
- xFixedToFloat(srcBottomRight.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomRight.y) / info->accel_state->texH[0]);
- VTX_OUT((float)(dstX + w), (float)dstY,
- xFixedToFloat(srcTopRight.x) / info->accel_state->texW[0], xFixedToFloat(srcTopRight.y) / info->accel_state->texH[0]);
}
#ifdef ACCEL_CP
diff --git a/src/radeon_render.c b/src/radeon_render.c
index 6668fe0..68811b7 100644
--- a/src/radeon_render.c
+++ b/src/radeon_render.c
@@ -773,8 +773,8 @@ static Bool FUNC_NAME(R200SetupTexture)(
txformat |= RADEON_TXFORMAT_NON_POWER2;
}
- info->accel_state->texW[0] = width;
- info->accel_state->texH[0] = height;
+ info->accel_state->texWrcp[0] = 1.0f / width;
+ info->accel_state->texHrcp[0] = 1.0f / height;
offset = info->accel_state->RenderTex->offset * pScrn->bitsPerPixel / 8;
dst = (uint8_t*)(info->FB + offset);
@@ -975,10 +975,10 @@ FUNC_NAME(R200SubsequentCPUToScreenTexture) (
r = width + l;
b = height + t;
- fl = (float)srcx / info->accel_state->texW[0];
- fr = (float)(srcx + width) / info->accel_state->texW[0];
- ft = (float)srcy / info->accel_state->texH[0];
- fb = (float)(srcy + height) / info->accel_state->texH[0];
+ fl = (float)srcx * info->accel_state->texWrcp[0];
+ fr = (float)(srcx + width) * info->accel_state->texWrcp[0];
+ ft = (float)srcy * info->accel_state->texHrcp[0];
+ fb = (float)(srcy + height) * info->accel_state->texHrcp[0];
#ifdef ACCEL_CP
BEGIN_RING(24);
--
1.6.4.3
More information about the xorg-driver-ati mailing list