[Mesa-dev] [PATCH 3/3] radeonsi: don't emit partial flushes for internal CS flushes only
Marek Olšák
maraeo at gmail.com
Sat Apr 7 02:31:03 UTC 2018
From: Marek Olšák <marek.olsak at amd.com>
---
src/gallium/drivers/radeonsi/si_buffer.c | 6 +++---
src/gallium/drivers/radeonsi/si_dma_cs.c | 2 +-
src/gallium/drivers/radeonsi/si_fence.c | 5 ++++-
src/gallium/drivers/radeonsi/si_gfx_cs.c | 4 ++--
src/gallium/drivers/radeonsi/si_pipe.h | 2 +-
src/gallium/drivers/radeonsi/si_state_shaders.c | 4 ++--
src/gallium/drivers/radeonsi/si_texture.c | 2 +-
src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 12 ++++++++----
src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 12 ++++++++----
src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 3 ++-
10 files changed, 32 insertions(+), 20 deletions(-)
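
The change is mechanical: every flush the driver issues for its own bookkeeping
(buffer mapping, IB space checks, ring setup, and so on) now passes
RADEON_FLUSH_START_NEXT_GFX_IB_NOW, or its async variant, instead of a plain 0
or PIPE_FLUSH_ASYNC. The flag tells the flush path that the next gfx IB will be
started immediately, so the end-of-IB partial flushes can be skipped for these
internal flushes. Below is a minimal sketch of the assumed flag definitions and
of how a flush implementation might consume them; the concrete bit values and
the consumer function are illustrative, not code this series adds:

	/* Sketch only: assumed definitions, inferred from how the flags are
	 * combined in the si_fence.c hunk below. */
	#define PIPE_FLUSH_ASYNC                   (1 << 3)  /* placeholder; real
	                                                        value in p_defines.h */
	#define RADEON_FLUSH_START_NEXT_GFX_IB_NOW (1u << 31) /* assumed bit */
	#define RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW \
	        (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW)

	/* Illustrative consumer: an internal flush (flag set) does not need
	 * the PS/CS partial-flush waits at the end of the IB, because the
	 * next IB begins executing right away. */
	static unsigned example_wait_flags(unsigned flush_flags)
	{
	        if (flush_flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)
	                return 0; /* internal flush: no waits */
	        return 1;         /* stand-in for SI_CONTEXT_*_PARTIAL_FLUSH bits */
	}
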
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index 1420702d8d4..d17b2c6a831 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -57,24 +57,24 @@ void *si_buffer_map_sync_with_rings(struct si_context *sctx,
if (!(usage & PIPE_TRANSFER_WRITE)) {
/* have to wait for the last write */
rusage = RADEON_USAGE_WRITE;
}
if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs,
resource->buf, rusage)) {
if (usage & PIPE_TRANSFER_DONTBLOCK) {
- si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
return NULL;
} else {
- si_flush_gfx_cs(sctx, 0, NULL);
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
busy = true;
}
}
if (radeon_emitted(sctx->dma_cs, 0) &&
sctx->ws->cs_is_buffer_referenced(sctx->dma_cs,
resource->buf, rusage)) {
if (usage & PIPE_TRANSFER_DONTBLOCK) {
si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
return NULL;
} else {
@@ -718,21 +718,21 @@ static bool si_resource_commit(struct pipe_context *pctx,
/*
* Since buffer commitment changes cannot be pipelined, we need to
* (a) flush any pending commands that refer to the buffer we're about
* to change, and
* (b) wait for threaded submit to finish, including those that were
* triggered by some other, earlier operation.
*/
if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs,
res->buf, RADEON_USAGE_READWRITE)) {
- si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
if (radeon_emitted(ctx->dma_cs, 0) &&
ctx->ws->cs_is_buffer_referenced(ctx->dma_cs,
res->buf, RADEON_USAGE_READWRITE)) {
si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
}
ctx->ws->cs_sync_flush(ctx->dma_cs);
ctx->ws->cs_sync_flush(ctx->gfx_cs);
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 7af7c5623b7..1eefaeb6ad5 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -51,21 +51,21 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
}
/* Flush the GFX IB if DMA depends on it. */
if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
((dst &&
ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
RADEON_USAGE_READWRITE)) ||
(src &&
ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
RADEON_USAGE_WRITE))))
- si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
/* Flush if there's not enough space, or if the memory usage per IB
* is too large.
*
* IBs using too little memory are limited by the IB submission overhead.
* IBs using too much memory are limited by the kernel/TTM overhead.
* Too long IBs create CPU-GPU pipeline bubbles and add latency.
*
* This heuristic makes sure that DMA requests are executed
* very soon after the call is made and lowers memory usage.
diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
index 26d6c43b34d..19fcb96041f 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -367,21 +367,24 @@ static boolean si_fence_finish(struct pipe_screen *screen,
* * and the calls to ClientWaitSync and FenceSync were
* issued from the same context,
*
* then the GL will behave as if the equivalent of Flush
* were inserted immediately after the creation of sync."
*
* This means we need to flush for such fences even when we're
* not going to wait.
*/
threaded_context_unwrap_sync(ctx);
- si_flush_gfx_cs(sctx, timeout ? 0 : PIPE_FLUSH_ASYNC, NULL);
+ si_flush_gfx_cs(sctx,
+ (timeout ? 0 : PIPE_FLUSH_ASYNC) |
+ RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
+ NULL);
rfence->gfx_unflushed.ctx = NULL;
if (!timeout)
return false;
/* Recompute the timeout after all that. */
if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
int64_t time = os_time_get_nano();
timeout = abs_timeout > time ? abs_timeout - time : 0;
}
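
Note that si_fence_finish is the one caller that composes the flags by hand:
RADEON_FLUSH_START_NEXT_GFX_IB_NOW is always set, and PIPE_FLUSH_ASYNC is added
only when there is no timeout to wait on. Under the assumed definitions
sketched above, this is equivalent to:

	unsigned flags = timeout ? RADEON_FLUSH_START_NEXT_GFX_IB_NOW
	                         : RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW;
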
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 63bff29e63a..0173c64631b 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -40,35 +40,35 @@ void si_need_gfx_cs_space(struct si_context *ctx)
*/
/* There are two memory usage counters in the winsys for all buffers
* that have been added (cs_add_buffer) and two counters in the pipe
* driver for those that haven't been added yet.
*/
if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs,
ctx->vram, ctx->gtt))) {
ctx->gtt = 0;
ctx->vram = 0;
- si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
return;
}
ctx->gtt = 0;
ctx->vram = 0;
/* If the IB is sufficiently large, don't count the space needed
* and just flush if there is not enough space left.
*
* Also reserve space for stopping queries at the end of IB, because
* the number of active queries is mostly unlimited.
*/
unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
if (!ctx->ws->cs_check_space(cs, need_dwords))
- si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+ si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
struct pipe_fence_handle **fence)
{
struct radeon_winsys_cs *cs = ctx->gfx_cs;
struct radeon_winsys *ws = ctx->ws;
unsigned wait_flags = 0;
if (ctx->gfx_flush_in_progress)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f0f323ff3a7..5e24d6cbb7e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1321,19 +1321,19 @@ static inline void
radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
struct r600_resource *rbo,
enum radeon_bo_usage usage,
enum radeon_bo_priority priority,
bool check_mem)
{
if (check_mem &&
!radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs,
sctx->vram + rbo->vram_usage,
sctx->gtt + rbo->gart_usage))
- si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, rbo, usage, priority);
}
#define PRINT_ERR(fmt, args...) \
fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
#endif
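
For illustration, a hypothetical call site for the inline helper above (the
buffer, usage, and priority arguments are made up, not taken from this patch):

	/* Flush first if adding rbo would push VRAM/GTT usage over the
	 * winsys limit, then add it to the gfx IB's buffer list. */
	radeon_add_to_gfx_buffer_list_check_mem(sctx, rbo,
	                                        RADEON_USAGE_READ,
	                                        RADEON_PRIO_SHADER_RW_BUFFER,
	                                        true /* check_mem */);
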
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 7e1660415f5..67ab75bbd2d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2767,21 +2767,21 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx)
si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
sctx->init_config_gs_rings = pm4;
if (!sctx->init_config_has_vgt_flush) {
si_init_config_add_vgt_flush(sctx);
si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
}
/* Flush the context to re-emit both init_config states. */
sctx->initial_gfx_cs_size = 0; /* force flush */
- si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
/* Set ring bindings. */
if (sctx->esgs_ring) {
assert(sctx->chip_class <= VI);
si_set_ring_buffer(sctx, SI_ES_RING_ESGS,
sctx->esgs_ring, 0, sctx->esgs_ring->width0,
true, true, 4, 64, 0);
si_set_ring_buffer(sctx, SI_GS_RING_ESGS,
sctx->esgs_ring, 0, sctx->esgs_ring->width0,
false, false, 0, 0, 0);
@@ -3044,21 +3044,21 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
factor_va >> 8);
si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
sctx->screen->vgt_hs_offchip_param);
}
/* Flush the context to re-emit the init_config state.
* This is done only once in a lifetime of a context.
*/
si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
sctx->initial_gfx_cs_size = 0; /* force flush */
- si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
/**
* This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
* VS passes its outputs to TES directly, so the fixed-function shader only
* has to write TESSOUTER and TESSINNER.
*/
static void si_generate_fixed_func_tcs(struct si_context *sctx)
{
struct ureg_src outer, inner;
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
index 1f0de5e71ec..8964c6b730c 100644
--- a/src/gallium/drivers/radeonsi/si_texture.c
+++ b/src/gallium/drivers/radeonsi/si_texture.c
@@ -1862,21 +1862,21 @@ static void si_texture_transfer_unmap(struct pipe_context *ctx,
* The idea is that we don't want to build IBs that use too much
* memory and put pressure on the kernel memory manager and we also
* want to make temporary and invalidated buffers go idle ASAP to
* decrease the total memory usage or make them reusable. The memory
* usage will be slightly higher than given here because of the buffer
* cache in the winsys.
*
* The result is that the kernel memory manager is never a bottleneck.
*/
if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
- si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
sctx->num_alloc_tex_transfer_bytes = 0;
}
pipe_resource_reference(&transfer->resource, NULL);
FREE(transfer);
}
static const struct u_resource_vtbl si_texture_vtbl =
{
NULL, /* get_handle */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 22b5a73143d..9b6d6e83032 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -232,31 +232,33 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
if (!(usage & PIPE_TRANSFER_WRITE)) {
/* Mapping for read.
*
* Since we are mapping for read, we don't need to wait
* if the GPU is using the buffer for read too
* (neither one is changing it).
*
* Only check whether the buffer is being used for write. */
if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
RADEON_USAGE_WRITE)) {
- cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+ cs->flush_cs(cs->flush_data,
+ RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
return NULL;
}
if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
RADEON_USAGE_WRITE)) {
return NULL;
}
} else {
if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
- cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+ cs->flush_cs(cs->flush_data,
+ RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
return NULL;
}
if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
RADEON_USAGE_READWRITE)) {
return NULL;
}
}
} else {
uint64_t time = os_time_get_nano();
@@ -265,35 +267,37 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
/* Mapping for read.
*
* Since we are mapping for read, we don't need to wait
* if the GPU is using the buffer for read too
* (neither one is changing it).
*
* Only check whether the buffer is being used for write. */
if (cs) {
if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
RADEON_USAGE_WRITE)) {
- cs->flush_cs(cs->flush_data, 0, NULL);
+ cs->flush_cs(cs->flush_data,
+ RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
} else {
/* Try to avoid busy-waiting in amdgpu_bo_wait. */
if (p_atomic_read(&bo->num_active_ioctls))
amdgpu_cs_sync_flush(rcs);
}
}
amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
RADEON_USAGE_WRITE);
} else {
/* Mapping for write. */
if (cs) {
if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
- cs->flush_cs(cs->flush_data, 0, NULL);
+ cs->flush_cs(cs->flush_data,
+ RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
} else {
/* Try to avoid busy-waiting in amdgpu_bo_wait. */
if (p_atomic_read(&bo->num_active_ioctls))
amdgpu_cs_sync_flush(rcs);
}
}
amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
RADEON_USAGE_READWRITE);
}
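
Both winsyses now follow the same shape when a mapped buffer is still
referenced by the current CS; schematically (the helper names are illustrative,
this is not the real winsys interface):

	void *map_sync(struct cs *cs, struct bo *bo, unsigned usage)
	{
	        if (cs && bo_is_referenced(cs, bo, usage)) {
	                if (usage & PIPE_TRANSFER_DONTBLOCK) {
	                        /* Flush asynchronously and report "busy". */
	                        cs->flush_cs(cs->flush_data,
	                                     RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW,
	                                     NULL);
	                        return NULL;
	                }
	                /* Blocking path: flush, then wait for the GPU below. */
	                cs->flush_cs(cs->flush_data,
	                             RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
	        }
	        bo_wait(bo, PIPE_TIMEOUT_INFINITE, usage);
	        return bo_cpu_map(bo);
	}
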
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 1617a2fe32e..6652977e586 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -509,60 +509,64 @@ static void *radeon_bo_map(struct pb_buffer *buf,
if (usage & PIPE_TRANSFER_DONTBLOCK) {
if (!(usage & PIPE_TRANSFER_WRITE)) {
/* Mapping for read.
*
* Since we are mapping for read, we don't need to wait
* if the GPU is using the buffer for read too
* (neither one is changing it).
*
* Only check whether the buffer is being used for write. */
if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
- cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+ cs->flush_cs(cs->flush_data,
+ RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
return NULL;
}
if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
RADEON_USAGE_WRITE)) {
return NULL;
}
} else {
if (cs && radeon_bo_is_referenced_by_cs(cs, bo)) {
- cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+ cs->flush_cs(cs->flush_data,
+ RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
return NULL;
}
if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
RADEON_USAGE_READWRITE)) {
return NULL;
}
}
} else {
uint64_t time = os_time_get_nano();
if (!(usage & PIPE_TRANSFER_WRITE)) {
/* Mapping for read.
*
* Since we are mapping for read, we don't need to wait
* if the GPU is using the buffer for read too
* (neither one is changing it).
*
* Only check whether the buffer is being used for write. */
if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
- cs->flush_cs(cs->flush_data, 0, NULL);
+ cs->flush_cs(cs->flush_data,
+ RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
}
radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
RADEON_USAGE_WRITE);
} else {
/* Mapping for write. */
if (cs) {
if (radeon_bo_is_referenced_by_cs(cs, bo)) {
- cs->flush_cs(cs->flush_data, 0, NULL);
+ cs->flush_cs(cs->flush_data,
+ RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
} else {
/* Try to avoid busy-waiting in radeon_bo_wait. */
if (p_atomic_read(&bo->num_active_ioctls))
radeon_drm_cs_sync_flush(rcs);
}
}
radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
RADEON_USAGE_READWRITE);
}
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index a1975dff8df..9070464bec8 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -400,21 +400,22 @@ static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
unsigned i;
for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
}
cs->csc->num_relocs = cs->csc->num_validated_relocs;
/* Flush if there are any relocs. Clean up otherwise. */
if (cs->csc->num_relocs) {
- cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+ cs->flush_cs(cs->flush_data,
+ RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
} else {
radeon_cs_context_cleanup(cs->csc);
cs->base.used_vram = 0;
cs->base.used_gart = 0;
assert(cs->base.current.cdw == 0);
if (cs->base.current.cdw != 0) {
fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
}
}
--
2.15.1