[Xorg-driver-geode] [PATCH 5/5] Improve the glyph rendering performance
Huang, FrankR
FrankR.Huang at amd.com
Sun Jul 25 20:35:54 PDT 2010
From: Frank Huang <frankr.huang at amd.com>
*Add a PICT_a8 entry to the table of formats we support
*Based on Mart's workaround patch, add three checks for PictOpAdd
in lx_check_composite
*Use a dedicated function to implement the glyph rendering (SW+HW)
*The performance improves noticeably (10 times) for "x11perf -aa10text"
and "x11perf -aa24text"
Signed-off-by: Frank Huang <frankr.huang at amd.com>
Mart Raudsepp <leio at gentoo.org>
---
src/lx_exa.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 139 insertions(+), 10 deletions(-)
This is the latest SW+HW approach. It performs better on the "x11perf -aa24text" test because the width is larger.
Statistics for reference:
x11perf -aa10text: 46400/s
x11perf -aa24text: 18300/s
diff --git a/src/lx_exa.c b/src/lx_exa.c
index 74327ef..0c18d4f 100644
--- a/src/lx_exa.c
+++ b/src/lx_exa.c
@@ -65,7 +65,8 @@ static const struct exa_format_t
PICT_b5g6r5, 16, CIMGP_SOURCE_FMT_16BPP_BGR, 0}, {
PICT_x1r5g5b5, 16, CIMGP_SOURCE_FMT_1_5_5_5, 0}, {
PICT_x1b5g5r5, 16, CIMGP_SOURCE_FMT_15BPP_BGR, 0}, {
- PICT_r3g3b2, 8, CIMGP_SOURCE_FMT_3_3_2, 0}
+ PICT_r3g3b2, 8, CIMGP_SOURCE_FMT_3_3_2, 0}, {
+ PICT_a8, 32, CIMGP_SOURCE_FMT_8_8_8_8, 8}
};
/* This is a chunk of memory we use for scratch space */
@@ -457,16 +458,9 @@ lx_get_format(PicturePtr p)
int i;
unsigned int format = p->format;
- for (i = 0; i < ARRAY_SIZE(lx_exa_formats); i++) {
-
- if (lx_exa_formats[i].bpp < PICT_FORMAT_BPP(format))
- break;
- else if (lx_exa_formats[i].bpp != PICT_FORMAT_BPP(format))
- continue;
-
+ for (i = 0; i < ARRAY_SIZE(lx_exa_formats); i++)
if (lx_exa_formats[i].exa == format)
return (&lx_exa_formats[i]);
- }
return NULL;
}
@@ -543,6 +537,26 @@ lx_check_composite(int op, PicturePtr pSrc, PicturePtr pMsk, PicturePtr pDst)
if (op > PictOpAdd)
return FALSE;
+ /* FIXME: Meet this conditions from the debug for PictOpAdd.
+ * Any Other possibilities? Add a judge for the future supplement */
+ if (op == PictOpAdd && pSrc->format == PICT_a8r8g8b8 &&
+ pDst->format == PICT_a8 && !pMsk)
+ return TRUE;
+
+ if (op == PictOpAdd && pSrc->format == PICT_x8r8g8b8 &&
+ pDst->format == PICT_a8 && !pMsk)
+ return TRUE;
+
+ if (op == PictOpAdd && pSrc->format == PICT_r5g6b5 &&
+ pDst->format == PICT_a8 && !pMsk)
+ return TRUE;
+
+ if ((op == PictOpAdd) && (pSrc->format != pDst->format)) {
+ ErrorF("PictOpAdd: the pSrc->format = %x, the pDst->format = %x\n",
+ pSrc->format, pDst->format);
+ return FALSE;
+ }
+
/* We need the off-screen buffer to do the multipass work */
if (usesPasses(op)) {
@@ -586,7 +600,8 @@ lx_check_composite(int op, PicturePtr pSrc, PicturePtr pMsk, PicturePtr pDst)
/* XXX - I don't understand PICT_a8 enough - so I'm punting */
- if (pSrc->format == PICT_a8 || pDst->format == PICT_a8)
+ if ((op != PictOpAdd) && (pSrc->format == PICT_a8 ||
+ pDst->format == PICT_a8))
return FALSE;
if (pMsk && op != PictOpClear) {
@@ -795,6 +810,116 @@ get_op_type(struct exa_format_t *src, struct exa_format_t *dst, int type)
((_x) * exaScratch.srcBpp))
static void
+lx_composite_onepass_pict_a8(PixmapPtr pxDst, unsigned long dstOffset,
+ unsigned long srcOffset, int width, int height, int opX, int opY,
+ int srcX, int srcY)
+{
+ struct blend_ops_t *opPtr;
+ int apply, type;
+ int optempX, optempY;
+ int i, j;
+ int calWidth;
+ unsigned long pixmapOffset, pixmapPitch, calBitsPixel;
+ int mod4, mod4flag;
+
+ /* Keep this GP idle judge here. Otherwise the SW method has chance to
+ * conflict with the HW rendering method */
+ gp_wait_until_idle();
+
+ /* These 5 local parameters are used for performance improvement */
+ pixmapOffset = exaGetPixmapOffset(pxDst);
+ pixmapPitch = exaGetPixmapPitch(pxDst);
+ calBitsPixel = (pxDst->drawable.bitsPerPixel + 7) / 8;
+ mod4 = 4 - opX % 4;
+ mod4flag = (opX % 4 == 0) ? 0 : 1;
+
+ /* If the width is lower than 4, do rendering in pure SW method */
+ if (width < 4) {
+ for (j = srcY; j < srcY + height; j++)
+ for (i = srcX; i < srcX + width; i++) {
+ srcOffset = GetSrcOffset(i, j);
+ optempX = opX + i - srcX;
+ optempY = opY + j - srcY;
+ dstOffset = pixmapOffset + pixmapPitch * optempY +
+ calBitsPixel * optempX;
+ *(cim_fb_ptr + dstOffset) = (*(cim_fb_ptr + srcOffset)
+ + *(cim_fb_ptr + dstOffset) <= 0xff) ? *(cim_fb_ptr
+ + srcOffset) + *(cim_fb_ptr + dstOffset) : 0xff;
+ }
+ return ;
+ }
+
+ /* Do SW rendering to the first pixels(4 - opX % 4) of each line in order
+ * the alignment of 4 bytes of the destination */
+ if (mod4flag) {
+ for (j = srcY; j < srcY + height; j++)
+ for (i = srcX; i < srcX + mod4; i++) {
+ srcOffset = GetSrcOffset(i, j);
+ optempX = opX + i - srcX;
+ optempY = opY + j - srcY;
+ dstOffset = pixmapOffset + pixmapPitch * optempY +
+ calBitsPixel * optempX;
+ *(cim_fb_ptr + dstOffset) = (*(cim_fb_ptr + srcOffset)
+ + *(cim_fb_ptr + dstOffset) <= 0xff) ? *(cim_fb_ptr
+ + srcOffset) + *(cim_fb_ptr + dstOffset) : 0xff;
+ }
+ }
+
+ /* Do HW rendering to the middle pixels except the last dword(4 pixels),
+ * because the HW will do the wrong rendering to that based on debug, still
+ * the alighment thing */
+ if (mod4flag) {
+ srcOffset = GetSrcOffset(srcX + mod4, srcY);
+ dstOffset = pixmapOffset + pixmapPitch * opY +
+ calBitsPixel * (opX + mod4);
+ calWidth = (width - mod4) / 4;
+ if (calWidth > 0)
+ calWidth = calWidth - 1;
+ else
+ calWidth = 0;
+ } else {
+ srcOffset = GetSrcOffset(srcX, srcY);
+ dstOffset = pixmapOffset + pixmapPitch * opY +
+ calBitsPixel * opX;
+ calWidth = width / 4;
+ }
+
+ if (calWidth > 0) {
+ opPtr = &lx_alpha_ops[exaScratch.op * 2];
+ apply = CIMGP_APPLY_BLEND_TO_ALL;
+ gp_declare_blt(0);
+ gp_set_bpp(32);
+ gp_set_strides(exaGetPixmapPitch(pxDst), exaScratch.srcPitch);
+ gp_set_source_format(8);
+ type = opPtr->type;
+ gp_set_alpha_operation(opPtr->operation, type, opPtr->channel,
+ apply, 0);
+ gp_screen_to_screen_convert(dstOffset, srcOffset, calWidth, height, 0);
+ }
+
+ /* Do SW rendering to the remaining pixels */
+ if (mod4flag)
+ if (calWidth > 0)
+ calWidth = calWidth * 4 + mod4;
+ else
+ calWidth = mod4;
+ else
+ calWidth = (width / 4) * 4;
+
+ for (j = srcY; j < srcY + height; j++)
+ for (i = srcX + calWidth; i < srcX + width; i++) {
+ srcOffset = GetSrcOffset(i, j);
+ optempX = opX + i - srcX;
+ optempY = opY + j - srcY;
+ dstOffset = pixmapOffset + pixmapPitch * optempY +
+ calBitsPixel * optempX;
+ *(cim_fb_ptr + dstOffset) = (*(cim_fb_ptr + srcOffset)
+ + *(cim_fb_ptr + dstOffset) <= 0xff) ?
+ *(cim_fb_ptr + srcOffset) + *(cim_fb_ptr + dstOffset) : 0xff;
+ }
+}
+
+static void
lx_composite_onepass(PixmapPtr pxDst, unsigned long dstOffset,
unsigned long srcOffset, int width, int height)
{
@@ -1268,6 +1393,10 @@ lx_do_composite(PixmapPtr pxDst, int srcX, int srcY, int maskX,
== 1))
lx_composite_onepass_special(pxDst, dstOffset, srcOffset,
opWidth, opHeight, opX, opY, srcX, srcY);
+ else if ((exaScratch.op == PictOpAdd) && (exaScratch.srcFormat->exa
+ == PICT_a8) && (exaScratch.dstFormat->exa == PICT_a8))
+ lx_composite_onepass_pict_a8(pxDst, dstOffset, srcOffset,
+ opWidth, opHeight, opX, opY, srcX, srcY);
else
lx_composite_onepass(pxDst, dstOffset, srcOffset, opWidth,
opHeight);
--
1.7.1
More information about the Xorg-driver-geode
mailing list