[Mesa-dev] [PATCH 1/2] gallivm: don't use control flow when doing indirect constant buffer lookups
Roland Scheidegger
sroland at vmware.com
Wed Apr 8 14:11:33 PDT 2015
Am 08.04.2015 um 21:13 schrieb Jose Fonseca:
> Series looks good to me.
>
> Just a few suggestions inline.
>
>
> On 04/04/15 15:50, sroland at vmware.com wrote:
>> From: Roland Scheidegger <sroland at vmware.com>
>>
>> llvm goes crazy when doing that, using way more memory and time,
>> though there's
>> probably more to it - this points to a very much similar issue as
>> fixed in
>> 8a9f5ecdb116d0449d63f7b94efbfa8b205d826f. In any case I've seen a quite
>> plain looking vertex shader with just ~50 simple tgsi instructions
>> (but with a
>> dozen or so such indirect constant buffer lookups) go from a terribly
>> high
>> ~440ms compile time (consuming 25MB of memory in the process) down to
>> a still
>> awful ~230ms and 13MB with this fix (with llvm 3.3), so there's still
>> obvious
>> improvements possible (but I have no clue why it's so slow...).
>> The resulting shader is most likely also faster (certainly seemed so
>> though
>> I don't have any hard numbers as it may have been influenced by
>> compile times)
>> since generally fetching constants outside the buffer range is most
>> likely an
>> app error (that is we expect all indices to be valid).
>> It is possible this fixes some mysterious vertex shader slowdowns
>> we've seen
>> ever since we are conforming to newer apis at least partially (the
>> main draw
>> loop also has similar looking conditionals which we probably could do
>> without -
>> if not for the fetch at least for the additional elts condition.)
>> ---
>> src/gallium/auxiliary/draw/draw_llvm.h | 2 +
>> .../draw/draw_pt_fetch_shade_pipeline_llvm.c | 27 +++---
>> src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 95
>> +++++++++-------------
>> src/gallium/drivers/llvmpipe/lp_scene.h | 2 +
>> src/gallium/drivers/llvmpipe/lp_setup.c | 6 +-
>> 5 files changed, 63 insertions(+), 69 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/draw/draw_llvm.h
>> b/src/gallium/auxiliary/draw/draw_llvm.h
>> index 9565fc6..a1983e1 100644
>> --- a/src/gallium/auxiliary/draw/draw_llvm.h
>> +++ b/src/gallium/auxiliary/draw/draw_llvm.h
>> @@ -472,6 +472,8 @@ struct draw_llvm {
>>
>> struct draw_gs_llvm_variant_list_item gs_variants_list;
>> int nr_gs_variants;
>> +
>> + float fake_const_buf[4];
>
> Couldn't we make fake_const_buf a mere local static const array instead?
> It would save memory.
Ah right can just declare that in llvm_middle_end_prepare(). Putting it
in the scene data seemed convenient for the fs case (there's only one
scene per context anyway but I guess could do the same thing there too).
>
>> };
>>
>>
>> diff --git
>> a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
>> b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
>> index 0dfafdc..03257d8 100644
>> --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
>> +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
>> @@ -273,28 +273,35 @@ llvm_middle_end_bind_parameters(struct
>> draw_pt_middle_end *middle)
>> {
>> struct llvm_middle_end *fpme = llvm_middle_end(middle);
>> struct draw_context *draw = fpme->draw;
>> + struct draw_llvm *llvm = fpme->llvm;
>> unsigned i;
>>
>> - for (i = 0; i < Elements(fpme->llvm->jit_context.vs_constants);
>> ++i) {
>> + for (i = 0; i < Elements(llvm->jit_context.vs_constants); ++i) {
>> int num_consts =
>> draw->pt.user.vs_constants_size[i] / (sizeof(float) * 4);
>> - fpme->llvm->jit_context.vs_constants[i] =
>> draw->pt.user.vs_constants[i];
>> - fpme->llvm->jit_context.num_vs_constants[i] = num_consts;
>> + llvm->jit_context.vs_constants[i] = draw->pt.user.vs_constants[i];
>> + llvm->jit_context.num_vs_constants[i] = num_consts;
>> + if (num_consts == 0) {
>> + llvm->jit_context.vs_constants[i] = llvm->fake_const_buf;
>> + }
>> }
>> - for (i = 0; i < Elements(fpme->llvm->gs_jit_context.constants);
>> ++i) {
>> + for (i = 0; i < Elements(llvm->gs_jit_context.constants); ++i) {
>> int num_consts =
>> draw->pt.user.gs_constants_size[i] / (sizeof(float) * 4);
>> - fpme->llvm->gs_jit_context.constants[i] =
>> draw->pt.user.gs_constants[i];
>> - fpme->llvm->gs_jit_context.num_constants[i] = num_consts;
>> + llvm->gs_jit_context.constants[i] = draw->pt.user.gs_constants[i];
>> + llvm->gs_jit_context.num_constants[i] = num_consts;
>> + if (num_consts == 0) {
>> + llvm->gs_jit_context.constants[i] = llvm->fake_const_buf;
>> + }
>> }
>>
>> - fpme->llvm->jit_context.planes =
>> + llvm->jit_context.planes =
>> (float (*)[DRAW_TOTAL_CLIP_PLANES][4]) draw->pt.user.planes[0];
>> - fpme->llvm->gs_jit_context.planes =
>> + llvm->gs_jit_context.planes =
>> (float (*)[DRAW_TOTAL_CLIP_PLANES][4]) draw->pt.user.planes[0];
>>
>> - fpme->llvm->jit_context.viewports = draw->viewports;
>> - fpme->llvm->gs_jit_context.viewports = draw->viewports;
>> + llvm->jit_context.viewports = draw->viewports;
>> + llvm->gs_jit_context.viewports = draw->viewports;
>> }
>>
>>
>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
>> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
>> index 17b68ff..5aa2846 100644
>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
>> @@ -944,20 +944,39 @@ gather_outputs(struct lp_build_tgsi_soa_context
>> * bld)
>> * with a little work.
>> */
>> static LLVMValueRef
>> -build_gather(struct lp_build_context *bld,
>> +build_gather(struct lp_build_tgsi_context *bld_base,
>> LLVMValueRef base_ptr,
>> LLVMValueRef indexes,
>> - LLVMValueRef *overflow_mask)
>> + LLVMValueRef overflow_mask)
>> {
>> - LLVMBuilderRef builder = bld->gallivm->builder;
>> + struct gallivm_state *gallivm = bld_base->base.gallivm;
>> + LLVMBuilderRef builder = gallivm->builder;
>> + struct lp_build_context *uint_bld = &bld_base->uint_bld;
>> + struct lp_build_context *bld = &bld_base->base;
>> LLVMValueRef res = bld->undef;
>> unsigned i;
>> - LLVMValueRef temp_ptr = NULL;
>> +
>> + /*
>> + * overflow_mask is a vector telling us which channels
>> + * in the vector overflowed. We use the overflow behavior for
>> + * constant buffers which is defined as:
>> + * Out of bounds access to constant buffer returns 0 in all
>> + * components. Out of bounds behavior is always with respect
>> + * to the size of the buffer bound at that slot.
>> + */
>>
>> if (overflow_mask) {
>> - temp_ptr = lp_build_alloca(
>> - bld->gallivm,
>> - lp_build_vec_type(bld->gallivm, bld->type), "");
>> + /*
>> + * We avoid per-element control flow here (also due to llvm
>> going crazy,
>> + * though I suspect it's better anyway since overflow is likely
>> rare).
>> + * Note that since we still fetch from buffers even if
>> num_elements was
>> + * zero (in this case we'll fetch from index zero) the jit func
>> callers
>> + * MUST provide valid fake constant buffers of size 4x32 (the
>> values do
>> + * not matter), otherwise we'd still need (not per element though)
>> + * control flow.
>> + */
>> + LLVMValueRef zeros = lp_build_const_int_vec(gallivm,
>> uint_bld->type, 0);
>
> Instead of recreating zeros, you can just use uint_bld->zero , here and
> below.
Indeed.
FWIW this made me realize how great avx2 gather really is. This could
(of course) not only fetch all values in one go, but additionally you
also get the default values for the masked out elements for free without
compare/select, plus you don't have to bother with clamping the offsets
to a valid value. And, of course, the real kicker, you don't need a
valid base address neither as long as you guarantee that all elements
are masked out (providing such a valid base address turned out to be the
most complex part of this patch).
Roland
>
>
>
>> + indexes = lp_build_select(uint_bld, overflow_mask, zeros,
>> indexes);
>> }
>>
>> /*
>> @@ -968,53 +987,17 @@ build_gather(struct lp_build_context *bld,
>> LLVMValueRef index = LLVMBuildExtractElement(builder,
>> indexes, ii, "");
>> LLVMValueRef scalar_ptr, scalar;
>> - LLVMValueRef overflow;
>> - struct lp_build_if_state if_ctx;
>> -
>> - /*
>> - * overflow_mask is a boolean vector telling us which channels
>> - * in the vector overflowed. We use the overflow behavior for
>> - * constant buffers which is defined as:
>> - * Out of bounds access to constant buffer returns 0 in all
>> - * componenets. Out of bounds behavior is always with respect
>> - * to the size of the buffer bound at that slot.
>> - */
>> - if (overflow_mask) {
>> - overflow = LLVMBuildExtractElement(builder, *overflow_mask,
>> - ii, "");
>> - lp_build_if(&if_ctx, bld->gallivm, overflow);
>> - {
>> - LLVMValueRef val = LLVMBuildLoad(builder, temp_ptr, "");
>> - val = LLVMBuildInsertElement(
>> - builder, val,
>> -
>> LLVMConstNull(LLVMFloatTypeInContext(bld->gallivm->context)),
>> - ii, "");
>> - LLVMBuildStore(builder, val, temp_ptr);
>> - }
>> - lp_build_else(&if_ctx);
>> - {
>> - LLVMValueRef val = LLVMBuildLoad(builder, temp_ptr, "");
>> -
>> - scalar_ptr = LLVMBuildGEP(builder, base_ptr,
>> - &index, 1, "gather_ptr");
>> - scalar = LLVMBuildLoad(builder, scalar_ptr, "");
>> -
>> - val = LLVMBuildInsertElement(builder, val, scalar, ii, "");
>>
>> - LLVMBuildStore(builder, val, temp_ptr);
>> - }
>> - lp_build_endif(&if_ctx);
>> - } else {
>> - scalar_ptr = LLVMBuildGEP(builder, base_ptr,
>> - &index, 1, "gather_ptr");
>> - scalar = LLVMBuildLoad(builder, scalar_ptr, "");
>> + scalar_ptr = LLVMBuildGEP(builder, base_ptr,
>> + &index, 1, "gather_ptr");
>> + scalar = LLVMBuildLoad(builder, scalar_ptr, "");
>>
>> - res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
>> - }
>> + res = LLVMBuildInsertElement(builder, res, scalar, ii, "");
>> }
>>
>> if (overflow_mask) {
>> - res = LLVMBuildLoad(builder, temp_ptr, "gather_val");
>> + LLVMValueRef zeros = lp_build_const_vec(gallivm, bld->type, 0.0);
>
> bld->zero
>
>> + res = lp_build_select(bld, overflow_mask, zeros, res);
>> }
>>
>> return res;
>> @@ -1247,17 +1230,15 @@ emit_fetch_constant(
>> num_consts = lp_build_broadcast_scalar(uint_bld, num_consts);
>> /* Construct a boolean vector telling us which channels
>> * overflow the bound constant buffer */
>> - overflow_mask = LLVMBuildICmp(builder, LLVMIntUGE,
>> - indirect_index,
>> - num_consts, "");
>> + overflow_mask = lp_build_compare(gallivm, uint_bld->type,
>> PIPE_FUNC_GEQUAL,
>> + indirect_index, num_consts);
>>
>> /* index_vec = indirect_index * 4 + swizzle */
>> index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
>> index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
>>
>> /* Gather values from the constant buffer */
>> - res = build_gather(&bld_base->base, consts_ptr, index_vec,
>> - &overflow_mask);
>> + res = build_gather(bld_base, consts_ptr, index_vec,
>> overflow_mask);
>> }
>> else {
>> LLVMValueRef index; /* index into the const buffer */
>> @@ -1319,7 +1300,7 @@ emit_fetch_immediate(
>> FALSE);
>>
>> /* Gather values from the immediate register array */
>> - res = build_gather(&bld_base->base, imms_array, index_vec,
>> NULL);
>> + res = build_gather(bld_base, imms_array, index_vec, NULL);
>> } else {
>> LLVMValueRef lindex = lp_build_const_int32(gallivm,
>> reg->Register.Index * 4 +
>> swizzle);
>> @@ -1373,7 +1354,7 @@ emit_fetch_input(
>> inputs_array = LLVMBuildBitCast(builder, bld->inputs_array,
>> fptr_type, "");
>>
>> /* Gather values from the input register array */
>> - res = build_gather(&bld_base->base, inputs_array, index_vec,
>> NULL);
>> + res = build_gather(bld_base, inputs_array, index_vec, NULL);
>> } else {
>> if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
>> LLVMValueRef lindex = lp_build_const_int32(gallivm,
>> @@ -1495,7 +1476,7 @@ emit_fetch_temporary(
>> temps_array = LLVMBuildBitCast(builder, bld->temps_array,
>> fptr_type, "");
>>
>> /* Gather values from the temporary register array */
>> - res = build_gather(&bld_base->base, temps_array, index_vec, NULL);
>> + res = build_gather(bld_base, temps_array, index_vec, NULL);
>> }
>> else {
>> LLVMValueRef temp_ptr;
>> diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h
>> b/src/gallium/drivers/llvmpipe/lp_scene.h
>> index ad23c20..3ef0f32 100644
>> --- a/src/gallium/drivers/llvmpipe/lp_scene.h
>> +++ b/src/gallium/drivers/llvmpipe/lp_scene.h
>> @@ -178,6 +178,8 @@ struct lp_scene {
>>
>> struct cmd_bin tile[TILES_X][TILES_Y];
>> struct data_block_list data;
>> +
>> + float fake_const_buf[4];
>> };
>>
>>
>> diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c
>> b/src/gallium/drivers/llvmpipe/lp_setup.c
>> index 3b0056c..bfe0af4 100644
>> --- a/src/gallium/drivers/llvmpipe/lp_setup.c
>> +++ b/src/gallium/drivers/llvmpipe/lp_setup.c
>> @@ -1103,14 +1103,16 @@ try_update_scene_state( struct
>> lp_setup_context *setup )
>> setup->constants[i].stored_size = current_size;
>> setup->constants[i].stored_data = stored;
>> }
>> + setup->fs.current.jit_context.constants[i] =
>> + setup->constants[i].stored_data;
>> }
>> else {
>> setup->constants[i].stored_size = 0;
>> setup->constants[i].stored_data = NULL;
>> + setup->fs.current.jit_context.constants[i] =
>> + scene->fake_const_buf;
>> }
>>
>> - setup->fs.current.jit_context.constants[i] =
>> - setup->constants[i].stored_data;
>> num_constants =
>> setup->constants[i].stored_size / (sizeof(float) * 4);
>> setup->fs.current.jit_context.num_constants[i] =
>> num_constants;
>>
>
>
> Jose
More information about the mesa-dev
mailing list