[PATCH] EXA: Move floating point math to the GPU as much as possible for R1-5xx.
Alex Deucher
alexdeucher at gmail.com
Sun Oct 4 22:38:33 PDT 2009
2009/10/3 Michel Dänzer <michel at daenzer.net>:
> From: Michel Dänzer <daenzer at vmware.com>
>
> Also add fast paths for untransformed Composite operations.
>
> This can significantly reduce the CPU overhead in RadeonCompositeTileCP, at
> least for TCL capable GPUs.
> ---
>
> I think the basic idea is sound, but I'm not sure if some parts are going too
> far, e.g. the float fw, fh locals in the fastpath. Opinions?
Looks pretty good. What sort of improvements are you seeing? Are
there any improvements to the non-tcl path? If you wanted to take
this a step further you could add some instructions to take the
reciprocal in the shader. Also, we don't yet take advantage of the
tcl hw on r1xx and r2xx chips.
Alex
>
>
> src/r600_exa.c | 2 -
> src/radeon.h | 6 +-
> src/radeon_commonfuncs.c | 4 +-
> src/radeon_exa_render.c | 242 +++++++++++++++++++++++++---------------------
> src/radeon_render.c | 12 +-
> 5 files changed, 141 insertions(+), 125 deletions(-)
>
> diff --git a/src/r600_exa.c b/src/r600_exa.c
> index f6f2007..a794598 100644
> --- a/src/r600_exa.c
> +++ b/src/r600_exa.c
> @@ -1114,8 +1114,6 @@ R600DoneCopy(PixmapPtr pDst)
> }
>
>
> -#define xFixedToFloat(f) (((float) (f)) / 65536)
> -
> struct blendinfo {
> Bool dst_alpha;
> Bool src_alpha;
> diff --git a/src/radeon.h b/src/radeon.h
> index 9d283bb..2fa4714 100644
> --- a/src/radeon.h
> +++ b/src/radeon.h
> @@ -236,7 +236,7 @@ typedef enum {
> * for something else.
> */
>
> -#define xFixedToFloat(f) (((float) (f)) / 65536)
> +#define xFixedToFloat(f) (((float) (f)) * (1.0f / 65536.0f))
>
> #define RADEON_LOGLEVEL_DEBUG 4
>
> @@ -657,8 +657,8 @@ struct radeon_accel_state {
> uint32_t dst_pitch_offset;
>
> /* render accel */
> - unsigned short texW[2];
> - unsigned short texH[2];
> + float texWrcp[2];
> + float texHrcp[2];
> Bool XInited3D; /* X itself has the 3D context */
> int num_gb_pipes;
> Bool has_tcl;
> diff --git a/src/radeon_commonfuncs.c b/src/radeon_commonfuncs.c
> index 8c46235..61a5b75 100644
> --- a/src/radeon_commonfuncs.c
> +++ b/src/radeon_commonfuncs.c
> @@ -59,8 +59,8 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
> int size;
> ACCEL_PREAMBLE();
>
> - info->accel_state->texW[0] = info->accel_state->texH[0] =
> - info->accel_state->texW[1] = info->accel_state->texH[1] = 1;
> + info->accel_state->texWrcp[0] = info->accel_state->texHrcp[0] =
> + info->accel_state->texWrcp[1] = info->accel_state->texHrcp[1] = 1.0f;
>
> if (IS_R300_3D || IS_R500_3D) {
>
> diff --git a/src/radeon_exa_render.c b/src/radeon_exa_render.c
> index 7bc8ef0..116f00d 100644
> --- a/src/radeon_exa_render.c
> +++ b/src/radeon_exa_render.c
> @@ -409,8 +409,8 @@ static Bool FUNC_NAME(R100TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
> txformat |= RADEON_TXFORMAT_NON_POWER2;
> txformat |= unit << 24; /* RADEON_TXFORMAT_ST_ROUTE_STQX */
>
> - info->accel_state->texW[unit] = w;
> - info->accel_state->texH[unit] = h;
> + info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
> + info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
>
> switch (pPict->filter) {
> case PictFilterNearest:
> @@ -794,8 +794,8 @@ static Bool FUNC_NAME(R200TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
> txformat |= R200_TXFORMAT_NON_POWER2;
> txformat |= unit << R200_TXFORMAT_ST_ROUTE_SHIFT;
>
> - info->accel_state->texW[unit] = w;
> - info->accel_state->texH[unit] = h;
> + info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
> + info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
>
> switch (pPict->filter) {
> case PictFilterNearest:
> @@ -1244,64 +1244,44 @@ static Bool FUNC_NAME(R300TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
> OUT_ACCEL_REG(R300_TX_BORDER_COLOR_0 + (unit * 4), 0);
> FINISH_ACCEL();
>
> - if (pPict->transform != 0) {
> - info->accel_state->is_transform[unit] = TRUE;
> - info->accel_state->transform[unit] = pPict->transform;
> + if (info->accel_state->has_tcl) {
> + info->accel_state->is_transform[unit] = FALSE;
>
> /* setup the PVS consts */
> - if (info->accel_state->has_tcl) {
> - info->accel_state->texW[unit] = 1;
> - info->accel_state->texH[unit] = 1;
> - BEGIN_ACCEL(9);
> - if (IS_R300_3D)
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
> - else
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
> + BEGIN_ACCEL(9);
> + if (IS_R300_3D)
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
> + else
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
>
> + if (pPict->transform) {
> OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][0])));
> OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][1])));
> OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][2])));
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w));
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)w));
>
> OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][0])));
> OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][1])));
> OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][2])));
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h));
> -
> - FINISH_ACCEL();
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)h));
> } else {
> - info->accel_state->texW[unit] = w;
> - info->accel_state->texH[unit] = h;
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f));
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)w));
> +
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f));
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
> + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)h));
> }
> - } else {
> - info->accel_state->is_transform[unit] = FALSE;
> -
> - /* setup the PVS consts */
> - if (info->accel_state->has_tcl) {
> - info->accel_state->texW[unit] = 1;
> - info->accel_state->texH[unit] = 1;
>
> - BEGIN_ACCEL(9);
> - if (IS_R300_3D)
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2));
> - else
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2));
> -
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0));
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w));
> -
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0));
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
> - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h));
> -
> - FINISH_ACCEL();
> - } else {
> - info->accel_state->texW[unit] = w;
> - info->accel_state->texH[unit] = h;
> - }
> + FINISH_ACCEL();
> + } else {
> + info->accel_state->is_transform[unit] = !!pPict->transform;
> + info->accel_state->transform[unit] = pPict->transform;
> + info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
> + info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
> }
>
> return TRUE;
> @@ -2147,8 +2127,6 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
> int w, int h)
> {
> int vtx_count;
> - xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight;
> - static xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
> ACCEL_PREAMBLE();
>
> ENTER_DRAW(0);
> @@ -2172,45 +2150,9 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
> }
> #endif
>
> - srcTopLeft.x = IntToxFixed(srcX);
> - srcTopLeft.y = IntToxFixed(srcY);
> - srcTopRight.x = IntToxFixed(srcX + w);
> - srcTopRight.y = IntToxFixed(srcY);
> - srcBottomLeft.x = IntToxFixed(srcX);
> - srcBottomLeft.y = IntToxFixed(srcY + h);
> - srcBottomRight.x = IntToxFixed(srcX + w);
> - srcBottomRight.y = IntToxFixed(srcY + h);
> -
> - if (info->accel_state->is_transform[0]) {
> - if ((info->ChipFamily < CHIP_FAMILY_R300) || !info->accel_state->has_tcl) {
> - transformPoint(info->accel_state->transform[0], &srcTopLeft);
> - transformPoint(info->accel_state->transform[0], &srcTopRight);
> - transformPoint(info->accel_state->transform[0], &srcBottomLeft);
> - transformPoint(info->accel_state->transform[0], &srcBottomRight);
> - }
> - }
> -
> - if (info->accel_state->msk_pic) {
> - maskTopLeft.x = IntToxFixed(maskX);
> - maskTopLeft.y = IntToxFixed(maskY);
> - maskTopRight.x = IntToxFixed(maskX + w);
> - maskTopRight.y = IntToxFixed(maskY);
> - maskBottomLeft.x = IntToxFixed(maskX);
> - maskBottomLeft.y = IntToxFixed(maskY + h);
> - maskBottomRight.x = IntToxFixed(maskX + w);
> - maskBottomRight.y = IntToxFixed(maskY + h);
> -
> - if (info->accel_state->is_transform[1]) {
> - if ((info->ChipFamily < CHIP_FAMILY_R300) || !info->accel_state->has_tcl) {
> - transformPoint(info->accel_state->transform[1], &maskTopLeft);
> - transformPoint(info->accel_state->transform[1], &maskTopRight);
> - transformPoint(info->accel_state->transform[1], &maskBottomLeft);
> - transformPoint(info->accel_state->transform[1], &maskBottomRight);
> - }
> - }
> -
> + if (info->accel_state->msk_pic)
> vtx_count = 6;
> - } else
> + else
> vtx_count = 4;
>
> if (info->accel_state->vsync)
> @@ -2285,32 +2227,108 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn,
>
> #endif
>
> - if (info->accel_state->msk_pic) {
> - if (IS_R300_3D || IS_R500_3D) {
> - VTX_OUT_MASK((float)dstX, (float)dstY,
> - xFixedToFloat(srcTopLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcTopLeft.y) / info->accel_state->texH[0],
> - xFixedToFloat(maskTopLeft.x) / info->accel_state->texW[1], xFixedToFloat(maskTopLeft.y) / info->accel_state->texH[1]);
> + if ((info->ChipFamily >= CHIP_FAMILY_R300) && info->accel_state->has_tcl) {
> + float dstX1, dstY1, dstX2, dstY2;
> + float srcX1, srcY1, srcX2, srcY2;
> + float fw, fh;
> +
> + fw = w;
> + fh = h;
> + dstX1 = dstX;
> + dstY1 = dstY;
> + dstX2 = dstX1 + fw;
> + dstY2 = dstY1 + fh;
> + srcX1 = srcX;
> + srcY1 = srcY;
> + srcX2 = srcX1 + fw;
> + srcY2 = srcY1 + fh;
> +
> + if (info->accel_state->msk_pic) {
> + float maskX1, maskY1, maskX2, maskY2;
> +
> + maskX1 = maskX;
> + maskY1 = maskY;
> + maskX2 = maskX1 + fw;
> + maskY2 = maskY1 + fh;
> +
> + VTX_OUT_MASK(dstX1, dstY1, srcX1, srcY1, maskX1, maskY1);
> + VTX_OUT_MASK(dstX1, dstY2, srcX1, srcY2, maskX1, maskY2);
> + VTX_OUT_MASK(dstX2, dstY2, srcX2, srcY2, maskX2, maskY2);
> + VTX_OUT_MASK(dstX2, dstY1, srcX2, srcY1, maskX2, maskY1);
> + } else {
> + VTX_OUT(dstX1, dstY1, srcX1, srcY1);
> + VTX_OUT(dstX1, dstY2, srcX1, srcY2);
> + VTX_OUT(dstX2, dstY2, srcX2, srcY2);
> + VTX_OUT(dstX2, dstY1, srcX2, srcY1);
> }
> - VTX_OUT_MASK((float)dstX, (float)(dstY + h),
> - xFixedToFloat(srcBottomLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomLeft.y) / info->accel_state->texH[0],
> - xFixedToFloat(maskBottomLeft.x) / info->accel_state->texW[1], xFixedToFloat(maskBottomLeft.y) / info->accel_state->texH[1]);
> - VTX_OUT_MASK((float)(dstX + w), (float)(dstY + h),
> - xFixedToFloat(srcBottomRight.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomRight.y) / info->accel_state->texH[0],
> - xFixedToFloat(maskBottomRight.x) / info->accel_state->texW[1], xFixedToFloat(maskBottomRight.y) / info->accel_state->texH[1]);
> - VTX_OUT_MASK((float)(dstX + w), (float)dstY,
> - xFixedToFloat(srcTopRight.x) / info->accel_state->texW[0], xFixedToFloat(srcTopRight.y) / info->accel_state->texH[0],
> - xFixedToFloat(maskTopRight.x) / info->accel_state->texW[1], xFixedToFloat(maskTopRight.y) / info->accel_state->texH[1]);
> } else {
> - if (IS_R300_3D || IS_R500_3D) {
> - VTX_OUT((float)dstX, (float)dstY,
> - xFixedToFloat(srcTopLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcTopLeft.y) / info->accel_state->texH[0]);
> + xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight;
> + float srcWrcp = info->accel_state->texWrcp[0];
> + float srcHrcp = info->accel_state->texHrcp[0];
> +
> + srcTopLeft.x = IntToxFixed(srcX);
> + srcTopLeft.y = IntToxFixed(srcY);
> + srcTopRight.x = IntToxFixed(srcX + w);
> + srcTopRight.y = IntToxFixed(srcY);
> + srcBottomLeft.x = IntToxFixed(srcX);
> + srcBottomLeft.y = IntToxFixed(srcY + h);
> + srcBottomRight.x = IntToxFixed(srcX + w);
> + srcBottomRight.y = IntToxFixed(srcY + h);
> +
> + if (info->accel_state->is_transform[0]) {
> + transformPoint(info->accel_state->transform[0], &srcTopLeft);
> + transformPoint(info->accel_state->transform[0], &srcTopRight);
> + transformPoint(info->accel_state->transform[0], &srcBottomLeft);
> + transformPoint(info->accel_state->transform[0], &srcBottomRight);
> + }
> +
> + if (info->accel_state->msk_pic) {
> + xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
> + float maskWrcp = info->accel_state->texWrcp[1];
> + float maskHrcp = info->accel_state->texHrcp[1];
> +
> + maskTopLeft.x = IntToxFixed(maskX);
> + maskTopLeft.y = IntToxFixed(maskY);
> + maskTopRight.x = IntToxFixed(maskX + w);
> + maskTopRight.y = IntToxFixed(maskY);
> + maskBottomLeft.x = IntToxFixed(maskX);
> + maskBottomLeft.y = IntToxFixed(maskY + h);
> + maskBottomRight.x = IntToxFixed(maskX + w);
> + maskBottomRight.y = IntToxFixed(maskY + h);
> +
> + if (info->accel_state->is_transform[1]) {
> + transformPoint(info->accel_state->transform[1], &maskTopLeft);
> + transformPoint(info->accel_state->transform[1], &maskTopRight);
> + transformPoint(info->accel_state->transform[1], &maskBottomLeft);
> + transformPoint(info->accel_state->transform[1], &maskBottomRight);
> + }
> +
> + if (IS_R300_3D || IS_R500_3D) {
> + VTX_OUT_MASK((float)dstX, (float)dstY,
> + srcTopLeft.x * srcWrcp, srcTopLeft.y * srcHrcp,
> + maskTopLeft.x * maskWrcp, maskTopLeft.y * maskHrcp);
> + }
> + VTX_OUT_MASK((float)dstX, (float)(dstY + h),
> + srcBottomLeft.x * srcWrcp, srcBottomLeft.y * srcHrcp,
> + maskBottomLeft.x * maskWrcp, maskBottomLeft.y * maskHrcp);
> + VTX_OUT_MASK((float)(dstX + w), (float)(dstY + h),
> + srcBottomRight.x * srcWrcp, srcBottomRight.y * srcHrcp,
> + maskBottomRight.x * maskWrcp, maskBottomRight.y * maskHrcp);
> + VTX_OUT_MASK((float)(dstX + w), (float)dstY,
> + srcTopRight.x * srcWrcp, srcTopRight.y * srcHrcp,
> + maskTopRight.x * maskWrcp, maskTopRight.y * maskHrcp);
> + } else {
> + if (IS_R300_3D || IS_R500_3D) {
> + VTX_OUT((float)dstX, (float)dstY,
> + srcTopLeft.x * srcWrcp, srcTopLeft.y * srcHrcp);
> + }
> + VTX_OUT((float)dstX, (float)(dstY + h),
> + srcBottomLeft.x * srcWrcp, srcBottomLeft.y * srcHrcp);
> + VTX_OUT((float)(dstX + w), (float)(dstY + h),
> + srcBottomRight.x * srcWrcp, srcBottomRight.y * srcHrcp);
> + VTX_OUT((float)(dstX + w), (float)dstY,
> + srcTopRight.x * srcWrcp, srcTopRight.y * srcHrcp);
> }
> - VTX_OUT((float)dstX, (float)(dstY + h),
> - xFixedToFloat(srcBottomLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomLeft.y) / info->accel_state->texH[0]);
> - VTX_OUT((float)(dstX + w), (float)(dstY + h),
> - xFixedToFloat(srcBottomRight.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomRight.y) / info->accel_state->texH[0]);
> - VTX_OUT((float)(dstX + w), (float)dstY,
> - xFixedToFloat(srcTopRight.x) / info->accel_state->texW[0], xFixedToFloat(srcTopRight.y) / info->accel_state->texH[0]);
> }
>
> #ifdef ACCEL_CP
> diff --git a/src/radeon_render.c b/src/radeon_render.c
> index 6668fe0..68811b7 100644
> --- a/src/radeon_render.c
> +++ b/src/radeon_render.c
> @@ -773,8 +773,8 @@ static Bool FUNC_NAME(R200SetupTexture)(
> txformat |= RADEON_TXFORMAT_NON_POWER2;
> }
>
> - info->accel_state->texW[0] = width;
> - info->accel_state->texH[0] = height;
> + info->accel_state->texWrcp[0] = 1.0f / width;
> + info->accel_state->texHrcp[0] = 1.0f / height;
>
> offset = info->accel_state->RenderTex->offset * pScrn->bitsPerPixel / 8;
> dst = (uint8_t*)(info->FB + offset);
> @@ -975,10 +975,10 @@ FUNC_NAME(R200SubsequentCPUToScreenTexture) (
>
> r = width + l;
> b = height + t;
> - fl = (float)srcx / info->accel_state->texW[0];
> - fr = (float)(srcx + width) / info->accel_state->texW[0];
> - ft = (float)srcy / info->accel_state->texH[0];
> - fb = (float)(srcy + height) / info->accel_state->texH[0];
> + fl = (float)srcx * info->accel_state->texWrcp[0];
> + fr = (float)(srcx + width) * info->accel_state->texWrcp[0];
> + ft = (float)srcy * info->accel_state->texHrcp[0];
> + fb = (float)(srcy + height) * info->accel_state->texHrcp[0];
>
> #ifdef ACCEL_CP
> BEGIN_RING(24);
> --
> 1.6.4.3
>
> _______________________________________________
> xorg-driver-ati mailing list
> xorg-driver-ati at lists.x.org
> http://lists.x.org/mailman/listinfo/xorg-driver-ati
>
More information about the xorg-driver-ati
mailing list