[Mesa-dev] [RFC 2/3] freedreno/a5xx: update compute param handling
Rob Clark
robdclark at gmail.com
Tue Apr 24 12:29:03 UTC 2018
Move to a per-generation backend, since these are likely to be fairly
generation-specific, and that is nicer than having it split between
freedreno_screen (for the global case) and fd5_compute (for the
kernel-specific limits case).
Signed-off-by: Rob Clark <robdclark at gmail.com>
---
Not totally working yet, so there might still be another constraint
on the max # of threads in a WG that I don't understand. The
blob *mostly* seems to follow the formula that:
num_threads <= 1024
num_threads * ((2 * num_regs) + num_half_regs) <= 8192
Except in a few cases where it uses a lower value for some reason.
And in practice this formula mostly seems to work, except in a few
cases where the GPU still locks up.
But regardless, the first patch in the series is the right thing to
do.
src/gallium/drivers/freedreno/a5xx/fd5_compute.c | 126 ++++++++++++++++++++++-
src/gallium/drivers/freedreno/a5xx/fd5_compute.h | 2 +
src/gallium/drivers/freedreno/a5xx/fd5_program.c | 4 +
src/gallium/drivers/freedreno/a5xx/fd5_screen.c | 2 +
src/gallium/drivers/freedreno/freedreno_screen.c | 75 ++------------
src/gallium/drivers/freedreno/freedreno_screen.h | 9 ++
src/gallium/drivers/freedreno/ir3/ir3_shader.c | 2 +
7 files changed, 152 insertions(+), 68 deletions(-)
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c
index 9d3039c3805..52b60e0c5e2 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c
@@ -68,6 +68,124 @@ fd5_delete_compute_state(struct pipe_context *pctx, void *hwcso)
free(so);
}
+// TODO move this somewhere to be shared with fd5_program.c..
+static unsigned
+max_threads(struct ir3_info *info)
+{
+ /* blob seems to advertise 1024 as max threads for all a5xx. Either
+ * that is wrong, or when they scale up/down the number of shader core
+ * units it is always a multiple of a thing that can (in best case)
+ * run 1024 threads. (Ie. the bigger variants can run 4 or however
+ * many blocks at a time, while the smaller could only run 1 or 2).
+ */
+ unsigned threads = 1024;
+
+ if (info) {
+ unsigned hregs;
+
+ /* seems like we have 1024 threads and 4096 full registers (or
+ * 8192 half-regs), once a shader is using more than 4 full regs
+ * it starts to cut down on threads in flight:
+ *
+ * XXX maybe this is 3k full / 6k half registers..
+ */
+ hregs = (2 * (info->max_reg + 1)) + (info->max_half_reg + 1);
+ threads /= DIV_ROUND_UP(hregs, 8);
+ }
+
+ return threads;
+}
+
+#define RET(x) do { \
+ if (ret) \
+ memcpy(ret, x, sizeof(x)); \
+ return sizeof(x); \
+} while (0); break;
+
+int
+fd5_get_compute_param(struct fd_screen *screen, enum pipe_compute_cap param,
+ void *hwcso, void *ret)
+{
+ const char * const ir = "ir3";
+ /* blob seems to advertise 1024 as max threads for all a5xx. Either
+ * that is wrong, or when they scale up/down the number of shader core
+ * units it is always a multiple of a thing that can (in best case)
+ * run 1024 threads. (Ie. the bigger variants can run 4 or however
+ * many blocks at a time, while the smaller could only run 1 or 2).
+ */
+ unsigned threads;
+
+ // XXX blob appears to not care unless there is a barrier instruction
+ if (hwcso) {
+ struct fd5_compute_stateobj *so = hwcso;
+ struct ir3_shader_key key = {0};
+ struct ir3_shader_variant *v;
+
+ v = ir3_shader_variant(so->shader, key, NULL);
+
+ threads = max_threads(&v->info);
+ } else {
+ threads = max_threads(NULL);
+ }
+
+ switch (param) {
+ case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+// don't expose 64b pointer support yet, until ir3 supports 64b
+// math, otherwise spir64 target is used and we get 64b pointer
+// calculations that we can't do yet
+// if (is_a5xx(screen))
+// RET((uint32_t []){ 64 });
+ RET((uint32_t []){ 32 });
+
+ case PIPE_COMPUTE_CAP_IR_TARGET:
+ if (ret)
+ sprintf(ret, ir);
+ return strlen(ir) * sizeof(char);
+
+ case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+ RET((uint64_t []) { 3 });
+
+ case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+ RET(((uint64_t []) { 65535, 65535, 65535 }));
+
+ case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+ RET(((uint64_t []) { threads, threads, threads }));
+
+ case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+ RET((uint64_t []) { threads });
+
+ case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
+ RET((uint64_t []) { screen->ram_size });
+
+ case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+ RET((uint64_t []) { 32768 });
+
+ case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
+ case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
+ RET((uint64_t []) { 4096 });
+
+ case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+ RET((uint64_t []) { screen->ram_size });
+
+ case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+ RET((uint32_t []) { screen->max_freq / 1000000 });
+
+ case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+ RET((uint32_t []) { 9999 }); // TODO
+
+ case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+ RET((uint32_t []) { 0 });
+
+ case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+ RET((uint32_t []) { 32 }); // TODO
+
+ case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
+ RET((uint64_t []) { 1024 }); // TODO
+ }
+
+ return 0;
+}
+
/* maybe move to fd5_program? */
static void
cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v)
@@ -76,7 +194,7 @@ cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v)
enum a3xx_threadsize thrsz;
/* note: blob uses local_size_x/y/z threshold to choose threadsize: */
- thrsz = FOUR_QUADS;
+ thrsz = (max_threads(&v->info) < 1024) ? TWO_QUADS : FOUR_QUADS;
OUT_PKT4(ring, REG_A5XX_SP_SP_CNTL, 1);
OUT_RING(ring, 0x00000000); /* SP_SP_CNTL */
@@ -214,9 +332,9 @@ fd5_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info)
OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_6 */
OUT_PKT4(ring, REG_A5XX_HLSQ_CS_KERNEL_GROUP_X, 3);
- OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_X */
- OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */
- OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */
+ OUT_RING(ring, num_groups[0]); /* HLSQ_CS_KERNEL_GROUP_X */
+ OUT_RING(ring, num_groups[1]); /* HLSQ_CS_KERNEL_GROUP_Y */
+ OUT_RING(ring, num_groups[2]); /* HLSQ_CS_KERNEL_GROUP_Z */
if (info->indirect) {
struct fd_resource *rsc = fd_resource(info->indirect);
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_compute.h b/src/gallium/drivers/freedreno/a5xx/fd5_compute.h
index d5cc8b8a0ca..ae03c2bc374 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_compute.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_compute.h
@@ -29,6 +29,8 @@
#include "pipe/p_context.h"
+int fd5_get_compute_param(struct fd_screen *screen, enum pipe_compute_cap param,
+ void *hwcso, void *ret);
void fd5_compute_init(struct pipe_context *pctx);
#endif /* FD5_COMPUTE_H_ */
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.c b/src/gallium/drivers/freedreno/a5xx/fd5_program.c
index 81fe7d4b582..886589fdb9d 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c
@@ -337,6 +337,10 @@ fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
setup_stages(emit, s);
+ // should also consider half-regs.. but if # of registers used
+ // means that we only have 512 or fewer threads in flight, then
+ // use TWO_QUAD mode to reduce branch divergence penalty. See
+ // the calculation used for cs_program_emit()
fssz = (s[FS].i->max_reg >= 24) ? TWO_QUADS : FOUR_QUADS;
pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS);
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c
index 7d7e76e869c..3e21030333e 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c
@@ -29,6 +29,7 @@
#include "fd5_screen.h"
#include "fd5_blitter.h"
+#include "fd5_compute.h"
#include "fd5_context.h"
#include "fd5_format.h"
#include "fd5_resource.h"
@@ -109,6 +110,7 @@ fd5_screen_init(struct pipe_screen *pscreen)
struct fd_screen *screen = fd_screen(pscreen);
screen->max_rts = A5XX_MAX_RENDER_TARGETS;
screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id);
+ screen->get_compute_param = fd5_get_compute_param;
pscreen->context_create = fd5_context_create;
pscreen->is_format_supported = fd5_screen_is_format_supported;
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 6f35d5dafbd..f4fdcef9ee3 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -622,81 +622,27 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen,
return 0;
}
-/* TODO depending on how much the limits differ for a3xx/a4xx, maybe move this
- * into per-generation backend?
- */
static int
fd_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type,
enum pipe_compute_cap param, void *ret)
{
struct fd_screen *screen = fd_screen(pscreen);
- const char * const ir = "ir3";
- if (!has_compute(screen))
+ if (!screen->get_compute_param)
return 0;
-#define RET(x) do { \
- if (ret) \
- memcpy(ret, x, sizeof(x)); \
- return sizeof(x); \
-} while (0); break;
-
- switch (param) {
- case PIPE_COMPUTE_CAP_ADDRESS_BITS:
-// don't expose 64b pointer support yet, until ir3 supports 64b
-// math, otherwise spir64 target is used and we get 64b pointer
-// calculations that we can't do yet
-// if (is_a5xx(screen))
-// RET((uint32_t []){ 64 });
- RET((uint32_t []){ 32 });
-
- case PIPE_COMPUTE_CAP_IR_TARGET:
- if (ret)
- sprintf(ret, ir);
- return strlen(ir) * sizeof(char);
-
- case PIPE_COMPUTE_CAP_GRID_DIMENSION:
- RET((uint64_t []) { 3 });
-
- case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
- RET(((uint64_t []) { 65535, 65535, 65535 }));
-
- case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
- RET(((uint64_t []) { 256, 256, 256 }));
-
- case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
- RET((uint64_t []) { 256 });
-
- case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
- RET((uint64_t []) { screen->ram_size });
-
- case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
- RET((uint64_t []) { 32768 });
-
- case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
- case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
- RET((uint64_t []) { 4096 });
-
- case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
- RET((uint64_t []) { screen->ram_size });
-
- case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
- RET((uint32_t []) { screen->max_freq / 1000000 });
-
- case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
- RET((uint32_t []) { 9999 }); // TODO
-
- case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
- RET((uint32_t []) { 0 });
+ return screen->get_compute_param(screen, param, NULL, ret);
+}
- case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
- RET((uint32_t []) { 32 }); // TODO
+static int fd_get_kernel_param(struct pipe_screen *pscreen, void *hwcso,
+ enum pipe_compute_cap param, void *ret)
+{
+ struct fd_screen *screen = fd_screen(pscreen);
- case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
- RET((uint64_t []) { 1024 }); // TODO
- }
+ if (!screen->get_compute_param)
+ return 0;
- return 0;
+ return screen->get_compute_param(screen, param, hwcso, ret);
}
static const void *
@@ -906,6 +852,7 @@ fd_screen_create(struct fd_device *dev)
pscreen->get_paramf = fd_screen_get_paramf;
pscreen->get_shader_param = fd_screen_get_shader_param;
pscreen->get_compute_param = fd_get_compute_param;
+ pscreen->get_kernel_param = fd_get_kernel_param;
pscreen->get_compiler_options = fd_get_compiler_options;
fd_resource_screen_init(pscreen);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index 6be739ae287..e2c481074ff 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -84,6 +84,15 @@ struct fd_screen {
uint32_t (*setup_slices)(struct fd_resource *rsc);
unsigned (*tile_mode)(const struct pipe_resource *prsc);
+ /* for backends that support compute, access compute param. If hwcso
+ * is not NULL, then it is the compute_state cso, in which case the
+ * returned param value should take into account limits imposed by
+ * resources used by compute shader, such as # of registers used.
+ * Otherwise the best-case value is returned.
+ */
+ int (*get_compute_param)(struct fd_screen *screen, enum pipe_compute_cap param,
+ void *hwcso, void *ret);
+
int64_t cpu_gpu_time_delta;
struct fd_batch_cache batch_cache;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 3a2c06f5963..55f28e0eee4 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -353,6 +353,8 @@ ir3_shader_create_compute(struct ir3_compiler *compiler,
shader->compiler = compiler;
shader->id = ++shader->compiler->shader_count;
shader->type = SHADER_COMPUTE;
+ // TODO if we figure this out by scanning input params we could
+ // avoid a shader recompile by dropping PIPE_SHADER_DEP_INPUT_MEM
shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4; /* byte->dword */
// TODO we need a way to differentiate clover vs glsl compute!
--
2.14.3
More information about the mesa-dev
mailing list