[Mesa-dev] [PATCH 3/3] radeonsi: don't emit partial flushes for internal CS flushes only
Nicolai Hähnle
nhaehnle at gmail.com
Sun Apr 15 18:47:37 UTC 2018
How much testing have you done with the radeon drm? It may be safer to
just skip that part of the changes.
Apart from that, the series is
Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
On 07.04.2018 04:31, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> ---
> src/gallium/drivers/radeonsi/si_buffer.c | 6 +++---
> src/gallium/drivers/radeonsi/si_dma_cs.c | 2 +-
> src/gallium/drivers/radeonsi/si_fence.c | 5 ++++-
> src/gallium/drivers/radeonsi/si_gfx_cs.c | 4 ++--
> src/gallium/drivers/radeonsi/si_pipe.h | 2 +-
> src/gallium/drivers/radeonsi/si_state_shaders.c | 4 ++--
> src/gallium/drivers/radeonsi/si_texture.c | 2 +-
> src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 12 ++++++++----
> src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 12 ++++++++----
> src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 3 ++-
> 10 files changed, 32 insertions(+), 20 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
> index 1420702d8d4..d17b2c6a831 100644
> --- a/src/gallium/drivers/radeonsi/si_buffer.c
> +++ b/src/gallium/drivers/radeonsi/si_buffer.c
> @@ -57,24 +57,24 @@ void *si_buffer_map_sync_with_rings(struct si_context *sctx,
>
> if (!(usage & PIPE_TRANSFER_WRITE)) {
> /* have to wait for the last write */
> rusage = RADEON_USAGE_WRITE;
> }
>
> if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
> sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs,
> resource->buf, rusage)) {
> if (usage & PIPE_TRANSFER_DONTBLOCK) {
> - si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
> + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> return NULL;
> } else {
> - si_flush_gfx_cs(sctx, 0, NULL);
> + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> busy = true;
> }
> }
> if (radeon_emitted(sctx->dma_cs, 0) &&
> sctx->ws->cs_is_buffer_referenced(sctx->dma_cs,
> resource->buf, rusage)) {
> if (usage & PIPE_TRANSFER_DONTBLOCK) {
> si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
> return NULL;
> } else {
> @@ -718,21 +718,21 @@ static bool si_resource_commit(struct pipe_context *pctx,
> /*
> * Since buffer commitment changes cannot be pipelined, we need to
> * (a) flush any pending commands that refer to the buffer we're about
> * to change, and
> * (b) wait for threaded submit to finish, including those that were
> * triggered by some other, earlier operation.
> */
> if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
> ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs,
> res->buf, RADEON_USAGE_READWRITE)) {
> - si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
> + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> }
> if (radeon_emitted(ctx->dma_cs, 0) &&
> ctx->ws->cs_is_buffer_referenced(ctx->dma_cs,
> res->buf, RADEON_USAGE_READWRITE)) {
> si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
> }
>
> ctx->ws->cs_sync_flush(ctx->dma_cs);
> ctx->ws->cs_sync_flush(ctx->gfx_cs);
>
> diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
> index 7af7c5623b7..1eefaeb6ad5 100644
> --- a/src/gallium/drivers/radeonsi/si_dma_cs.c
> +++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
> @@ -51,21 +51,21 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
> }
>
> /* Flush the GFX IB if DMA depends on it. */
> if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
> ((dst &&
> ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
> RADEON_USAGE_READWRITE)) ||
> (src &&
> ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
> RADEON_USAGE_WRITE))))
> - si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
> + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
>
> /* Flush if there's not enough space, or if the memory usage per IB
> * is too large.
> *
> * IBs using too little memory are limited by the IB submission overhead.
> * IBs using too much memory are limited by the kernel/TTM overhead.
> * Too long IBs create CPU-GPU pipeline bubbles and add latency.
> *
> * This heuristic makes sure that DMA requests are executed
> * very soon after the call is made and lowers memory usage.
> diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
> index 26d6c43b34d..19fcb96041f 100644
> --- a/src/gallium/drivers/radeonsi/si_fence.c
> +++ b/src/gallium/drivers/radeonsi/si_fence.c
> @@ -367,21 +367,24 @@ static boolean si_fence_finish(struct pipe_screen *screen,
> * * and the calls to ClientWaitSync and FenceSync were
> * issued from the same context,
> *
> * then the GL will behave as if the equivalent of Flush
> * were inserted immediately after the creation of sync."
> *
> * This means we need to flush for such fences even when we're
> * not going to wait.
> */
> threaded_context_unwrap_sync(ctx);
> - si_flush_gfx_cs(sctx, timeout ? 0 : PIPE_FLUSH_ASYNC, NULL);
> + si_flush_gfx_cs(sctx,
> + (timeout ? 0 : PIPE_FLUSH_ASYNC) |
> + RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
> + NULL);
> rfence->gfx_unflushed.ctx = NULL;
>
> if (!timeout)
> return false;
>
> /* Recompute the timeout after all that. */
> if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
> int64_t time = os_time_get_nano();
> timeout = abs_timeout > time ? abs_timeout - time : 0;
> }
> diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
> index 63bff29e63a..0173c64631b 100644
> --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
> +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
> @@ -40,35 +40,35 @@ void si_need_gfx_cs_space(struct si_context *ctx)
> */
>
> /* There are two memory usage counters in the winsys for all buffers
> * that have been added (cs_add_buffer) and two counters in the pipe
> * driver for those that haven't been added yet.
> */
> if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs,
> ctx->vram, ctx->gtt))) {
> ctx->gtt = 0;
> ctx->vram = 0;
> - si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
> + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> return;
> }
> ctx->gtt = 0;
> ctx->vram = 0;
>
> /* If the IB is sufficiently large, don't count the space needed
> * and just flush if there is not enough space left.
> *
> * Also reserve space for stopping queries at the end of IB, because
> * the number of active queries is mostly unlimited.
> */
> unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
> if (!ctx->ws->cs_check_space(cs, need_dwords))
> - si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
> + si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> }
>
> void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
> struct pipe_fence_handle **fence)
> {
> struct radeon_winsys_cs *cs = ctx->gfx_cs;
> struct radeon_winsys *ws = ctx->ws;
> unsigned wait_flags = 0;
>
> if (ctx->gfx_flush_in_progress)
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index f0f323ff3a7..5e24d6cbb7e 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -1321,19 +1321,19 @@ static inline void
> radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
> struct r600_resource *rbo,
> enum radeon_bo_usage usage,
> enum radeon_bo_priority priority,
> bool check_mem)
> {
> if (check_mem &&
> !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs,
> sctx->vram + rbo->vram_usage,
> sctx->gtt + rbo->gart_usage))
> - si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
> + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
>
> radeon_add_to_buffer_list(sctx, sctx->gfx_cs, rbo, usage, priority);
> }
>
> #define PRINT_ERR(fmt, args...) \
> fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
>
> #endif
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 7e1660415f5..67ab75bbd2d 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -2767,21 +2767,21 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx)
> si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
> sctx->init_config_gs_rings = pm4;
>
> if (!sctx->init_config_has_vgt_flush) {
> si_init_config_add_vgt_flush(sctx);
> si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
> }
>
> /* Flush the context to re-emit both init_config states. */
> sctx->initial_gfx_cs_size = 0; /* force flush */
> - si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
> + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
>
> /* Set ring bindings. */
> if (sctx->esgs_ring) {
> assert(sctx->chip_class <= VI);
> si_set_ring_buffer(sctx, SI_ES_RING_ESGS,
> sctx->esgs_ring, 0, sctx->esgs_ring->width0,
> true, true, 4, 64, 0);
> si_set_ring_buffer(sctx, SI_GS_RING_ESGS,
> sctx->esgs_ring, 0, sctx->esgs_ring->width0,
> false, false, 0, 0, 0);
> @@ -3044,21 +3044,21 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
> factor_va >> 8);
> si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
> sctx->screen->vgt_hs_offchip_param);
> }
>
> /* Flush the context to re-emit the init_config state.
> * This is done only once in a lifetime of a context.
> */
> si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
> sctx->initial_gfx_cs_size = 0; /* force flush */
> - si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
> + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> }
>
> /**
> * This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
> * VS passes its outputs to TES directly, so the fixed-function shader only
> * has to write TESSOUTER and TESSINNER.
> */
> static void si_generate_fixed_func_tcs(struct si_context *sctx)
> {
> struct ureg_src outer, inner;
> diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
> index 1f0de5e71ec..8964c6b730c 100644
> --- a/src/gallium/drivers/radeonsi/si_texture.c
> +++ b/src/gallium/drivers/radeonsi/si_texture.c
> @@ -1862,21 +1862,21 @@ static void si_texture_transfer_unmap(struct pipe_context *ctx,
> * The idea is that we don't want to build IBs that use too much
> * memory and put pressure on the kernel memory manager and we also
> * want to make temporary and invalidated buffers go idle ASAP to
> * decrease the total memory usage or make them reusable. The memory
> * usage will be slightly higher than given here because of the buffer
> * cache in the winsys.
> *
> * The result is that the kernel memory manager is never a bottleneck.
> */
> if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
> - si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
> + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> sctx->num_alloc_tex_transfer_bytes = 0;
> }
>
> pipe_resource_reference(&transfer->resource, NULL);
> FREE(transfer);
> }
>
> static const struct u_resource_vtbl si_texture_vtbl =
> {
> NULL, /* get_handle */
> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> index 22b5a73143d..9b6d6e83032 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> @@ -232,31 +232,33 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
> if (!(usage & PIPE_TRANSFER_WRITE)) {
> /* Mapping for read.
> *
> * Since we are mapping for read, we don't need to wait
> * if the GPU is using the buffer for read too
> * (neither one is changing it).
> *
> * Only check whether the buffer is being used for write. */
> if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
> RADEON_USAGE_WRITE)) {
> - cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
> + cs->flush_cs(cs->flush_data,
> + RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> return NULL;
> }
>
> if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
> RADEON_USAGE_WRITE)) {
> return NULL;
> }
> } else {
> if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
> - cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
> + cs->flush_cs(cs->flush_data,
> + RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> return NULL;
> }
>
> if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
> RADEON_USAGE_READWRITE)) {
> return NULL;
> }
> }
> } else {
> uint64_t time = os_time_get_nano();
> @@ -265,35 +267,37 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
> /* Mapping for read.
> *
> * Since we are mapping for read, we don't need to wait
> * if the GPU is using the buffer for read too
> * (neither one is changing it).
> *
> * Only check whether the buffer is being used for write. */
> if (cs) {
> if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
> RADEON_USAGE_WRITE)) {
> - cs->flush_cs(cs->flush_data, 0, NULL);
> + cs->flush_cs(cs->flush_data,
> + RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
> } else {
> /* Try to avoid busy-waiting in amdgpu_bo_wait. */
> if (p_atomic_read(&bo->num_active_ioctls))
> amdgpu_cs_sync_flush(rcs);
> }
> }
>
> amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
> RADEON_USAGE_WRITE);
> } else {
> /* Mapping for write. */
> if (cs) {
> if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
> - cs->flush_cs(cs->flush_data, 0, NULL);
> + cs->flush_cs(cs->flush_data,
> + RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
> } else {
> /* Try to avoid busy-waiting in amdgpu_bo_wait. */
> if (p_atomic_read(&bo->num_active_ioctls))
> amdgpu_cs_sync_flush(rcs);
> }
> }
>
> amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
> RADEON_USAGE_READWRITE);
> }
> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
> index 1617a2fe32e..6652977e586 100644
> --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
> @@ -509,60 +509,64 @@ static void *radeon_bo_map(struct pb_buffer *buf,
> if (usage & PIPE_TRANSFER_DONTBLOCK) {
> if (!(usage & PIPE_TRANSFER_WRITE)) {
> /* Mapping for read.
> *
> * Since we are mapping for read, we don't need to wait
> * if the GPU is using the buffer for read too
> * (neither one is changing it).
> *
> * Only check whether the buffer is being used for write. */
> if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
> - cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
> + cs->flush_cs(cs->flush_data,
> + RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> return NULL;
> }
>
> if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
> RADEON_USAGE_WRITE)) {
> return NULL;
> }
> } else {
> if (cs && radeon_bo_is_referenced_by_cs(cs, bo)) {
> - cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
> + cs->flush_cs(cs->flush_data,
> + RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> return NULL;
> }
>
> if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
> RADEON_USAGE_READWRITE)) {
> return NULL;
> }
> }
> } else {
> uint64_t time = os_time_get_nano();
>
> if (!(usage & PIPE_TRANSFER_WRITE)) {
> /* Mapping for read.
> *
> * Since we are mapping for read, we don't need to wait
> * if the GPU is using the buffer for read too
> * (neither one is changing it).
> *
> * Only check whether the buffer is being used for write. */
> if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
> - cs->flush_cs(cs->flush_data, 0, NULL);
> + cs->flush_cs(cs->flush_data,
> + RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
> }
> radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
> RADEON_USAGE_WRITE);
> } else {
> /* Mapping for write. */
> if (cs) {
> if (radeon_bo_is_referenced_by_cs(cs, bo)) {
> - cs->flush_cs(cs->flush_data, 0, NULL);
> + cs->flush_cs(cs->flush_data,
> + RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
> } else {
> /* Try to avoid busy-waiting in radeon_bo_wait. */
> if (p_atomic_read(&bo->num_active_ioctls))
> radeon_drm_cs_sync_flush(rcs);
> }
> }
>
> radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
> RADEON_USAGE_READWRITE);
> }
> diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
> index a1975dff8df..9070464bec8 100644
> --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
> +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
> @@ -400,21 +400,22 @@ static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
> unsigned i;
>
> for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
> p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
> radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
> }
> cs->csc->num_relocs = cs->csc->num_validated_relocs;
>
> /* Flush if there are any relocs. Clean up otherwise. */
> if (cs->csc->num_relocs) {
> - cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
> + cs->flush_cs(cs->flush_data,
> + RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
> } else {
> radeon_cs_context_cleanup(cs->csc);
> cs->base.used_vram = 0;
> cs->base.used_gart = 0;
>
> assert(cs->base.current.cdw == 0);
> if (cs->base.current.cdw != 0) {
> fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
> }
> }
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the mesa-dev
mailing list