[Mesa-dev] [PATCH] ac: Use DPP for build_ddxy where possible.
Nicolai Hähnle
nhaehnle at gmail.com
Wed May 23 12:09:52 UTC 2018
On 23.05.2018 11:48, Bas Nieuwenhuizen wrote:
> WQM is pretty reliable now on LLVM 7, so let us just use
> DPP + WQM.
>
> This gives approximately a 1.5% performance increase on the
> vrcompositor built-in benchmark.
> ---
> src/amd/common/ac_llvm_build.c | 243 ++++++++++++++++++---------------
> 1 file changed, 130 insertions(+), 113 deletions(-)
>
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index 36c1d62637b..f849f6461ce 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -1137,119 +1137,6 @@ ac_get_thread_id(struct ac_llvm_context *ctx)
> return tid;
> }
>
> -/*
> - * SI implements derivatives using the local data store (LDS)
> - * All writes to the LDS happen in all executing threads at
> - * the same time. TID is the Thread ID for the current
> - * thread and is a value between 0 and 63, representing
> - * the thread's position in the wavefront.
> - *
> - * For the pixel shader threads are grouped into quads of four pixels.
> - * The TIDs of the pixels of a quad are:
> - *
> - * +------+------+
> - * |4n + 0|4n + 1|
> - * +------+------+
> - * |4n + 2|4n + 3|
> - * +------+------+
> - *
> - * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
> - * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
> - * the current pixel's column, and masking with 0xfffffffe yields the TID
> - * of the left pixel of the current pixel's row.
> - *
> - * Adding 1 yields the TID of the pixel to the right of the left pixel, and
> - * adding 2 yields the TID of the pixel below the top pixel.
> - */
> -LLVMValueRef
> -ac_build_ddxy(struct ac_llvm_context *ctx,
> - uint32_t mask,
> - int idx,
> - LLVMValueRef val)
> -{
> - LLVMValueRef tl, trbl, args[2];
> - LLVMValueRef result;
> -
> - if (ctx->chip_class >= VI) {
> - LLVMValueRef thread_id, tl_tid, trbl_tid;
> - thread_id = ac_get_thread_id(ctx);
> -
> - tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> - LLVMConstInt(ctx->i32, mask, false), "");
> -
> - trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> - LLVMConstInt(ctx->i32, idx, false), "");
> -
> - args[0] = LLVMBuildMul(ctx->builder, tl_tid,
> - LLVMConstInt(ctx->i32, 4, false), "");
> - args[1] = val;
> - tl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.bpermute", ctx->i32,
> - args, 2,
> - AC_FUNC_ATTR_READNONE |
> - AC_FUNC_ATTR_CONVERGENT);
> -
> - args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
> - LLVMConstInt(ctx->i32, 4, false), "");
> - trbl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.bpermute", ctx->i32,
> - args, 2,
> - AC_FUNC_ATTR_READNONE |
> - AC_FUNC_ATTR_CONVERGENT);
> - } else {
> - uint32_t masks[2] = {};
> -
> - switch (mask) {
> - case AC_TID_MASK_TOP_LEFT:
> - masks[0] = 0x8000;
> - if (idx == 1)
> - masks[1] = 0x8055;
> - else
> - masks[1] = 0x80aa;
> -
> - break;
> - case AC_TID_MASK_TOP:
> - masks[0] = 0x8044;
> - masks[1] = 0x80ee;
> - break;
> - case AC_TID_MASK_LEFT:
> - masks[0] = 0x80a0;
> - masks[1] = 0x80f5;
> - break;
> - default:
> - assert(0);
> - }
> -
> - args[0] = val;
> - args[1] = LLVMConstInt(ctx->i32, masks[0], false);
> -
> - tl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.swizzle", ctx->i32,
> - args, 2,
> - AC_FUNC_ATTR_READNONE |
> - AC_FUNC_ATTR_CONVERGENT);
> -
> - args[1] = LLVMConstInt(ctx->i32, masks[1], false);
> - trbl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.swizzle", ctx->i32,
> - args, 2,
> - AC_FUNC_ATTR_READNONE |
> - AC_FUNC_ATTR_CONVERGENT);
> - }
> -
> - tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
> - trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
> - result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
> -
> - if (HAVE_LLVM >= 0x0700) {
> - result = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.wqm.f32", ctx->f32,
> - &result, 1, 0);
> - }
> -
> - return result;
> -}
> -
> void
> ac_build_sendmsg(struct ac_llvm_context *ctx,
> uint32_t msg,
> @@ -2764,6 +2651,136 @@ ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
> return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
> }
>
> +/*
> + * SI implements derivatives using the local data store (LDS)
> + * All writes to the LDS happen in all executing threads at
> + * the same time. TID is the Thread ID for the current
> + * thread and is a value between 0 and 63, representing
> + * the thread's position in the wavefront.
> + *
> + * For the pixel shader threads are grouped into quads of four pixels.
> + * The TIDs of the pixels of a quad are:
> + *
> + * +------+------+
> + * |4n + 0|4n + 1|
> + * +------+------+
> + * |4n + 2|4n + 3|
> + * +------+------+
> + *
> + * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
> + * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
> + * the current pixel's column, and masking with 0xfffffffe yields the TID
> + * of the left pixel of the current pixel's row.
> + *
> + * Adding 1 yields the TID of the pixel to the right of the left pixel, and
> + * adding 2 yields the TID of the pixel below the top pixel.
> + */
> +LLVMValueRef
> +ac_build_ddxy(struct ac_llvm_context *ctx,
> + uint32_t mask,
> + int idx,
> + LLVMValueRef val)
> +{
> + LLVMValueRef tl, trbl, args[2];
> + LLVMValueRef result;
> +
> + if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0700) {
> + LLVMValueRef zero = ctx->i32_0;
> + unsigned tl_lanes[4], trbl_lanes[4];
> +
> + for (unsigned i = 0; i < 4; ++i) {
> + tl_lanes[i] = i & mask;
> + trbl_lanes[i] = (i & mask) + idx;
> + }
> +
> + tl = ac_build_dpp(ctx, zero, val,
The indentation of these continuation lines is inconsistent with the surrounding code.
Anyway, rather than open-coding the quad permute here, maybe this should just use ac_build_quad_swizzle?
Cheers,
Nicolai
> + dpp_quad_perm(tl_lanes[0], tl_lanes[1],
> + tl_lanes[2], tl_lanes[3]),
> + 0xf, 0xf, false);
> + trbl = ac_build_dpp(ctx, zero, val,
> + dpp_quad_perm(trbl_lanes[0], trbl_lanes[1],
> + trbl_lanes[2], trbl_lanes[3]),
> + 0xf, 0xf, false);
> + } else if (ctx->chip_class >= VI) {
> + LLVMValueRef thread_id, tl_tid, trbl_tid;
> + thread_id = ac_get_thread_id(ctx);
> +
> + tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> + LLVMConstInt(ctx->i32, mask, false), "");
> +
> + trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> + LLVMConstInt(ctx->i32, idx, false), "");
> +
> + args[0] = LLVMBuildMul(ctx->builder, tl_tid,
> + LLVMConstInt(ctx->i32, 4, false), "");
> + args[1] = val;
> + tl = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.ds.bpermute", ctx->i32,
> + args, 2,
> + AC_FUNC_ATTR_READNONE |
> + AC_FUNC_ATTR_CONVERGENT);
> +
> + args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
> + LLVMConstInt(ctx->i32, 4, false), "");
> + trbl = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.ds.bpermute", ctx->i32,
> + args, 2,
> + AC_FUNC_ATTR_READNONE |
> + AC_FUNC_ATTR_CONVERGENT);
> + } else {
> + uint32_t masks[2] = {};
> +
> + switch (mask) {
> + case AC_TID_MASK_TOP_LEFT:
> + masks[0] = 0x8000;
> + if (idx == 1)
> + masks[1] = 0x8055;
> + else
> + masks[1] = 0x80aa;
> +
> + break;
> + case AC_TID_MASK_TOP:
> + masks[0] = 0x8044;
> + masks[1] = 0x80ee;
> + break;
> + case AC_TID_MASK_LEFT:
> + masks[0] = 0x80a0;
> + masks[1] = 0x80f5;
> + break;
> + default:
> + assert(0);
> + }
> +
> + args[0] = val;
> + args[1] = LLVMConstInt(ctx->i32, masks[0], false);
> +
> + tl = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.ds.swizzle", ctx->i32,
> + args, 2,
> + AC_FUNC_ATTR_READNONE |
> + AC_FUNC_ATTR_CONVERGENT);
> +
> + args[1] = LLVMConstInt(ctx->i32, masks[1], false);
> + trbl = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.ds.swizzle", ctx->i32,
> + args, 2,
> + AC_FUNC_ATTR_READNONE |
> + AC_FUNC_ATTR_CONVERGENT);
> + }
> +
> + tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
> + trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
> + result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
> +
> + if (HAVE_LLVM >= 0x0700) {
> + result = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.wqm.f32", ctx->f32,
> + &result, 1, 0);
> + }
> +
> + return result;
> +}
> +
> static inline unsigned
> ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
> {
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list