[Mesa-dev] [PATCH] ac: Use DPP for build_ddxy where possible.
Nicolai Hähnle
nhaehnle at gmail.com
Wed May 23 12:09:52 UTC 2018
On 23.05.2018 11:48, Bas Nieuwenhuizen wrote:
> WQM is pretty reliable now on LLVM 7, so let us just use
> DPP + WQM.
>
> This gives approximately a 1.5% performance increase on the
> vrcompositor built-in benchmark.
> ---
> src/amd/common/ac_llvm_build.c | 243 ++++++++++++++++++---------------
> 1 file changed, 130 insertions(+), 113 deletions(-)
>
> diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
> index 36c1d62637b..f849f6461ce 100644
> --- a/src/amd/common/ac_llvm_build.c
> +++ b/src/amd/common/ac_llvm_build.c
> @@ -1137,119 +1137,6 @@ ac_get_thread_id(struct ac_llvm_context *ctx)
> return tid;
> }
>
> -/*
> - * SI implements derivatives using the local data store (LDS)
> - * All writes to the LDS happen in all executing threads at
> - * the same time. TID is the Thread ID for the current
> - * thread and is a value between 0 and 63, representing
> - * the thread's position in the wavefront.
> - *
> - * For the pixel shader threads are grouped into quads of four pixels.
> - * The TIDs of the pixels of a quad are:
> - *
> - * +------+------+
> - * |4n + 0|4n + 1|
> - * +------+------+
> - * |4n + 2|4n + 3|
> - * +------+------+
> - *
> - * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
> - * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
> - * the current pixel's column, and masking with 0xfffffffe yields the TID
> - * of the left pixel of the current pixel's row.
> - *
> - * Adding 1 yields the TID of the pixel to the right of the left pixel, and
> - * adding 2 yields the TID of the pixel below the top pixel.
> - */
> -LLVMValueRef
> -ac_build_ddxy(struct ac_llvm_context *ctx,
> - uint32_t mask,
> - int idx,
> - LLVMValueRef val)
> -{
> - LLVMValueRef tl, trbl, args[2];
> - LLVMValueRef result;
> -
> - if (ctx->chip_class >= VI) {
> - LLVMValueRef thread_id, tl_tid, trbl_tid;
> - thread_id = ac_get_thread_id(ctx);
> -
> - tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> - LLVMConstInt(ctx->i32, mask, false), "");
> -
> - trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> - LLVMConstInt(ctx->i32, idx, false), "");
> -
> - args[0] = LLVMBuildMul(ctx->builder, tl_tid,
> - LLVMConstInt(ctx->i32, 4, false), "");
> - args[1] = val;
> - tl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.bpermute", ctx->i32,
> - args, 2,
> - AC_FUNC_ATTR_READNONE |
> - AC_FUNC_ATTR_CONVERGENT);
> -
> - args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
> - LLVMConstInt(ctx->i32, 4, false), "");
> - trbl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.bpermute", ctx->i32,
> - args, 2,
> - AC_FUNC_ATTR_READNONE |
> - AC_FUNC_ATTR_CONVERGENT);
> - } else {
> - uint32_t masks[2] = {};
> -
> - switch (mask) {
> - case AC_TID_MASK_TOP_LEFT:
> - masks[0] = 0x8000;
> - if (idx == 1)
> - masks[1] = 0x8055;
> - else
> - masks[1] = 0x80aa;
> -
> - break;
> - case AC_TID_MASK_TOP:
> - masks[0] = 0x8044;
> - masks[1] = 0x80ee;
> - break;
> - case AC_TID_MASK_LEFT:
> - masks[0] = 0x80a0;
> - masks[1] = 0x80f5;
> - break;
> - default:
> - assert(0);
> - }
> -
> - args[0] = val;
> - args[1] = LLVMConstInt(ctx->i32, masks[0], false);
> -
> - tl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.swizzle", ctx->i32,
> - args, 2,
> - AC_FUNC_ATTR_READNONE |
> - AC_FUNC_ATTR_CONVERGENT);
> -
> - args[1] = LLVMConstInt(ctx->i32, masks[1], false);
> - trbl = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.ds.swizzle", ctx->i32,
> - args, 2,
> - AC_FUNC_ATTR_READNONE |
> - AC_FUNC_ATTR_CONVERGENT);
> - }
> -
> - tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
> - trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
> - result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
> -
> - if (HAVE_LLVM >= 0x0700) {
> - result = ac_build_intrinsic(ctx,
> - "llvm.amdgcn.wqm.f32", ctx->f32,
> - &result, 1, 0);
> - }
> -
> - return result;
> -}
> -
> void
> ac_build_sendmsg(struct ac_llvm_context *ctx,
> uint32_t msg,
> @@ -2764,6 +2651,136 @@ ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
> return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
> }
>
> +/*
> + * SI implements derivatives using the local data store (LDS)
> + * All writes to the LDS happen in all executing threads at
> + * the same time. TID is the Thread ID for the current
> + * thread and is a value between 0 and 63, representing
> + * the thread's position in the wavefront.
> + *
> + * For the pixel shader threads are grouped into quads of four pixels.
> + * The TIDs of the pixels of a quad are:
> + *
> + * +------+------+
> + * |4n + 0|4n + 1|
> + * +------+------+
> + * |4n + 2|4n + 3|
> + * +------+------+
> + *
> + * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
> + * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
> + * the current pixel's column, and masking with 0xfffffffe yields the TID
> + * of the left pixel of the current pixel's row.
> + *
> + * Adding 1 yields the TID of the pixel to the right of the left pixel, and
> + * adding 2 yields the TID of the pixel below the top pixel.
> + */
> +LLVMValueRef
> +ac_build_ddxy(struct ac_llvm_context *ctx,
> + uint32_t mask,
> + int idx,
> + LLVMValueRef val)
> +{
> + LLVMValueRef tl, trbl, args[2];
> + LLVMValueRef result;
> +
> + if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0700) {
> + LLVMValueRef zero = ctx->i32_0;
> + unsigned tl_lanes[4], trbl_lanes[4];
> +
> + for (unsigned i = 0; i < 4; ++i) {
> + tl_lanes[i] = i & mask;
> + trbl_lanes[i] = (i & mask) + idx;
> + }
> +
> + tl = ac_build_dpp(ctx, zero, val,
The indentation of these continuation lines is inconsistent with the surrounding code.
Anyway, rather than open-coding the quad permute here, maybe this should just use ac_build_quad_swizzle?
Cheers,
Nicolai
> + dpp_quad_perm(tl_lanes[0], tl_lanes[1],
> + tl_lanes[2], tl_lanes[3]),
> + 0xf, 0xf, false);
> + trbl = ac_build_dpp(ctx, zero, val,
> + dpp_quad_perm(trbl_lanes[0], trbl_lanes[1],
> + trbl_lanes[2], trbl_lanes[3]),
> + 0xf, 0xf, false);
> + } else if (ctx->chip_class >= VI) {
> + LLVMValueRef thread_id, tl_tid, trbl_tid;
> + thread_id = ac_get_thread_id(ctx);
> +
> + tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
> + LLVMConstInt(ctx->i32, mask, false), "");
> +
> + trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
> + LLVMConstInt(ctx->i32, idx, false), "");
> +
> + args[0] = LLVMBuildMul(ctx->builder, tl_tid,
> + LLVMConstInt(ctx->i32, 4, false), "");
> + args[1] = val;
> + tl = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.ds.bpermute", ctx->i32,
> + args, 2,
> + AC_FUNC_ATTR_READNONE |
> + AC_FUNC_ATTR_CONVERGENT);
> +
> + args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
> + LLVMConstInt(ctx->i32, 4, false), "");
> + trbl = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.ds.bpermute", ctx->i32,
> + args, 2,
> + AC_FUNC_ATTR_READNONE |
> + AC_FUNC_ATTR_CONVERGENT);
> + } else {
> + uint32_t masks[2] = {};
> +
> + switch (mask) {
> + case AC_TID_MASK_TOP_LEFT:
> + masks[0] = 0x8000;
> + if (idx == 1)
> + masks[1] = 0x8055;
> + else
> + masks[1] = 0x80aa;
> +
> + break;
> + case AC_TID_MASK_TOP:
> + masks[0] = 0x8044;
> + masks[1] = 0x80ee;
> + break;
> + case AC_TID_MASK_LEFT:
> + masks[0] = 0x80a0;
> + masks[1] = 0x80f5;
> + break;
> + default:
> + assert(0);
> + }
> +
> + args[0] = val;
> + args[1] = LLVMConstInt(ctx->i32, masks[0], false);
> +
> + tl = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.ds.swizzle", ctx->i32,
> + args, 2,
> + AC_FUNC_ATTR_READNONE |
> + AC_FUNC_ATTR_CONVERGENT);
> +
> + args[1] = LLVMConstInt(ctx->i32, masks[1], false);
> + trbl = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.ds.swizzle", ctx->i32,
> + args, 2,
> + AC_FUNC_ATTR_READNONE |
> + AC_FUNC_ATTR_CONVERGENT);
> + }
> +
> + tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
> + trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
> + result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
> +
> + if (HAVE_LLVM >= 0x0700) {
> + result = ac_build_intrinsic(ctx,
> + "llvm.amdgcn.wqm.f32", ctx->f32,
> + &result, 1, 0);
> + }
> +
> + return result;
> +}
> +
> static inline unsigned
> ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
> {
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list