[Mesa-dev] [PATCH v2 15/20] i965/cs: Emit compute shader code and upload programs

Mon Apr 27 16:55:53 PDT 2015

On Friday, April 24, 2015 04:33:07 PM Jordan Justen wrote:
> v2:
>  * Don't bother checking for 'gen > 5' (krh)
>  * Populate sampler data in key (krh)
> 
> Signed-off-by: Jordan Justen <jordan.l.justen at intel.com>
> ---
>  src/mesa/drivers/dri/i965/brw_context.h      |   1 +
>  src/mesa/drivers/dri/i965/brw_cs.cpp         | 224 +++++++++++++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_state_upload.c |   3 +
>  3 files changed, 228 insertions(+)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 56827d8..9e13c59 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -148,6 +148,7 @@ struct brw_vs_prog_key;
>  struct brw_vue_prog_key;
>  struct brw_wm_prog_key;
>  struct brw_wm_prog_data;
> +struct brw_cs_prog_key;
>  struct brw_cs_prog_data;
>  
>  enum brw_pipeline {
> diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
> index 8021147..648f0f0 100644
> --- a/src/mesa/drivers/dri/i965/brw_cs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
> @@ -22,8 +22,15 @@
>   */
>  
>  
> +#include "util/ralloc.h"
>  #include "brw_context.h"
>  #include "brw_cs.h"
> +#include "brw_fs.h"
> +#include "brw_eu.h"
> +#include "brw_wm.h"
> +#include "intel_mipmap_tree.h"
> +#include "brw_state.h"
> +#include "intel_batchbuffer.h"
>  
>  extern "C"
>  bool
> @@ -46,3 +53,220 @@ brw_cs_prog_data_compare(const void *in_a, const void *in_b)
>  
>     return true;
>  }
> +
> +
> +static const unsigned *
> +brw_cs_emit(struct brw_context *brw,
> +            void *mem_ctx,
> +            const struct brw_cs_prog_key *key,
> +            struct brw_cs_prog_data *prog_data,
> +            struct gl_compute_program *cp,
> +            struct gl_shader_program *prog,
> +            unsigned *final_assembly_size)
> +{
> +   bool start_busy = false;
> +   double start_time = 0;
> +
> +   if (unlikely(brw->perf_debug)) {
> +      start_busy = (brw->batch.last_bo &&
> +                    drm_intel_bo_busy(brw->batch.last_bo));
> +      start_time = get_time();
> +   }
> +
> +   struct brw_shader *shader = NULL;
> +   if (prog)
> +      shader = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE];
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_CS))
> +      brw_dump_ir("compute", prog, &shader->base, &cp->Base);
> +
> +   /* Now the main event: Visit the shader IR and generate our CS IR for it.
> +    */
> +   fs_visitor v(brw, mem_ctx, key, prog_data, prog, cp, 8);
> +   if (!v.run_cs()) {
> +      if (prog) {
> +         prog->LinkStatus = false;
> +         ralloc_strcat(&prog->InfoLog, v.fail_msg);
> +      }
> +
> +      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
> +                    v.fail_msg);
> +
> +      return NULL;
> +   }
> +
> +   cfg_t *simd16_cfg = NULL;
> +   fs_visitor v2(brw, mem_ctx, key, prog_data, prog, cp, 16);
> +   if (likely(!(INTEL_DEBUG & DEBUG_NO16))) {

I know some programs may require SIMD16 shaders to even function (on
some hardware), and that you're planning to make SIMD16 compute shader
compilation never fail (at some point).  So we may want to rework this
logic eventually.  But it's probably OK for now.

> +      if (!v.simd16_unsupported) {
> +         /* Try a SIMD16 compile */
> +         v2.import_uniforms(&v);
> +         if (!v2.run_cs()) {
> +            perf_debug("SIMD16 shader failed to compile, falling back to "
> +                       "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
> +         } else {
> +            simd16_cfg = v2.cfg;
> +         }
> +      } else {
> +         perf_debug("SIMD16 shader unsupported, falling back to "
> +                    "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
> +      }
> +   }

Let's eliminate the "at a 10-20% performance cost" part of the messages.
That was something Eric measured about fragment shading - the
performance characteristics of compute shaders will probably be entirely
different.

> +
> +   prog_data->local_size[0] = cp->LocalSize[0];
> +   prog_data->local_size[1] = cp->LocalSize[1];
> +   prog_data->local_size[2] = cp->LocalSize[2];
> +
> +   cfg_t *simd8_cfg;
> +   int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;

I don't really see any value in supporting INTEL_DEBUG=no8 for compute
shaders.  Let's just drop this.

Why don't we have a single "cfg_t *cfg" variable.  After compiling the
SIMD8 program, set cfg = v.cfg and prog_data->simd_size = 8.  If we
successfully compile a SIMD16 program, set cfg = v2.cfg and override
prog_data->simd_size = 16.  Then pass cfg to fs_generator.

I think that'll streamline this code a fair bit.

> +   if (no_simd8 && simd16_cfg) {
> +      simd8_cfg = NULL;
> +      prog_data->no_8 = true;
> +   } else {
> +      simd8_cfg = v.cfg;
> +      prog_data->no_8 = false;
> +   }
> +
> +   fs_generator g(brw, mem_ctx, (void*) key, &prog_data->base, &cp->Base,
> +                  v.promoted_constants, v.runtime_check_aads_emit, "CS");
> +   if (INTEL_DEBUG & DEBUG_CS) {
> +      char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d",
> +                                   prog->Label ? prog->Label : "unnamed",
> +                                   prog->Name);
> +      g.enable_debug(name);
> +   }
> +   if (simd16_cfg) {
> +      prog_data->simd_size = 16;
> +      g.generate_code(simd16_cfg, 16);
> +   } else if (simd8_cfg) {
> +      prog_data->simd_size = 8;
> +      g.generate_code(simd8_cfg, 8);
> +   }
> +
> +   if (unlikely(brw->perf_debug) && shader) {
> +      if (shader->compiled_once) {
> +         _mesa_problem(&brw->ctx, "CS programs shouldn't need recompiles");
> +      }
> +      shader->compiled_once = true;
> +
> +      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
> +         perf_debug("CS compile took %.03f ms and stalled the GPU\n",
> +                    (get_time() - start_time) * 1000);
> +      }
> +   }
> +
> +   return g.get_assembly(final_assembly_size);
> +}
> +
> +static bool
> +brw_codegen_cs_prog(struct brw_context *brw,
> +                    struct gl_shader_program *prog,
> +                    struct brw_compute_program *cp,
> +                    struct brw_cs_prog_key *key)
> +{
> +   struct gl_context *ctx = &brw->ctx;
> +   const GLuint *program;
> +   void *mem_ctx = ralloc_context(NULL);
> +   GLuint program_size;
> +   struct brw_cs_prog_data prog_data;
> +   struct gl_shader *cs = NULL;
> +
> +   if (prog)
> +      cs = prog->_LinkedShaders[MESA_SHADER_COMPUTE];

Isn't prog required to be != NULL?

> +
> +   memset(&prog_data, 0, sizeof(prog_data));
> +
> +   /* Allocate the references to the uniforms that will end up in the
> +    * prog_data associated with the compiled program, and which will be freed
> +    * by the state cache.
> +    */
> +   int param_count;
> +   if (cs) {
> +      param_count = cs->num_uniform_components;
> +   } else {
> +      param_count = cp->program.Base.Parameters->NumParameters * 4;

NumParameters * 4 is a thing for ARB programs, and there aren't ARB
programs.  I think the else clause is dead.

> +   }
> +
> +   /* The backend also sometimes adds params for texture size. */
> +   param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits;
> +   prog_data.base.param =
> +      rzalloc_array(NULL, const gl_constant_value *, param_count);
> +   prog_data.base.pull_param =
> +      rzalloc_array(NULL, const gl_constant_value *, param_count);
> +   prog_data.base.nr_params = param_count;
> +
> +   program = brw_cs_emit(brw, mem_ctx, key, &prog_data,
> +                         &cp->program, prog, &program_size);
> +   if (program == NULL) {
> +      ralloc_free(mem_ctx);
> +      return false;
> +   }
> +
> +   if (prog_data.base.total_scratch) {
> +      brw_get_scratch_bo(brw, &brw->cs.base.scratch_bo,
> +                         prog_data.base.total_scratch * brw->max_cs_threads);
> +   }
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_CS))
> +      fprintf(stderr, "\n");
> +
> +   brw_upload_cache(&brw->cache, BRW_CACHE_CS_PROG,
> +                    key, sizeof(*key),
> +                    program, program_size,
> +                    &prog_data, sizeof(prog_data),
> +                    &brw->cs.base.prog_offset, &brw->cs.prog_data);
> +   ralloc_free(mem_ctx);
> +
> +   return true;
> +}
> +
> +
> +static void
> +brw_cs_populate_key(struct brw_context *brw, struct brw_cs_prog_key *key)
> +{
> +   struct gl_context *ctx = &brw->ctx;
> +   /* BRW_NEW_COMPUTE_PROGRAM */
> +   const struct brw_compute_program *cp =
> +      (struct brw_compute_program *) brw->compute_program;
> +   const struct gl_program *prog = (struct gl_program *) cp;
> +
> +   memset(key, 0, sizeof(*key));
> +
> +   /* _NEW_TEXTURE */
> +   brw_populate_sampler_prog_key_data(ctx, prog, brw->cs.base.sampler_count,
> +                                      &key->tex);
> +
> +   /* The unique compute program ID */
> +   key->program_string_id = cp->id;
> +}
> +
> +
> +extern "C"
> +void
> +brw_upload_cs_prog(struct brw_context *brw)
> +{
> +   struct gl_context *ctx = &brw->ctx;
> +   struct brw_cs_prog_key key;
> +   struct brw_compute_program *cp = (struct brw_compute_program *)
> +      brw->compute_program;
> +
> +   if (!cp)
> +      return;
> +
> +   if (!brw_state_dirty(brw, 0, BRW_NEW_COMPUTE_PROGRAM))

This needs to be

    if (!brw_state_dirty(brw, _NEW_TEXTURE, BRW_NEW_COMPUTE_PROGRAM))

since brw_cs_populate_key depends on _NEW_TEXTURE for the sampler stuff.

Would you mind sending a v2?  I don't anticipate a lot of extra
feedback, but it'd be nice to see how it looks with the feedback
incorporated.

> +      return;
> +
> +   brw_cs_populate_key(brw, &key);
> +
> +   if (!brw_search_cache(&brw->cache, BRW_CACHE_CS_PROG,
> +                         &key, sizeof(key),
> +                         &brw->cs.base.prog_offset, &brw->cs.prog_data)) {
> +      bool success =
> +         brw_codegen_cs_prog(brw,
> +                             ctx->Shader.CurrentProgram[MESA_SHADER_COMPUTE],
> +                             cp, &key);
> +      (void) success;
> +      assert(success);
> +   }
> +   brw->cs.base.prog_data = &brw->cs.prog_data->base;
> +}
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index 5c5420d..d086f39 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -40,6 +40,7 @@
>  #include "brw_ff_gs.h"
>  #include "brw_gs.h"
>  #include "brw_wm.h"
> +#include "brw_cs.h"
>  
>  static const struct brw_tracked_state *gen4_atoms[] =
>  {
> @@ -618,6 +619,8 @@ brw_upload_programs(struct brw_context *brw,
>           brw_upload_gs_prog(brw);
>  
>        brw_upload_wm_prog(brw);
> +   } else if (pipeline == BRW_COMPUTE_PIPELINE) {
> +      brw_upload_cs_prog(brw);
>     }
>  }
>  
> 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 819 bytes
Desc: This is a digitally signed message part.
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20150427/e3d9ab7e/attachment-0001.sig>