[Mesa-dev] [PATCH v4 3/5] i965/miptree: Use cpu tiling/detiling when mapping
Chris Wilson
chris at chris-wilson.co.uk
Thu Apr 5 20:12:00 UTC 2018
Quoting Scott D Phillips (2018-04-03 21:05:43)
> Rename the (un)map_gtt functions to (un)map_map (map by
> returning a map) and add new functions (un)map_tiled_memcpy that
> return a shadow buffer populated with the intel_tiled_memcpy
> functions.
>
> Tiling/detiling with the cpu will be the only way to handle Yf/Ys
> tiling, when support is added for those formats.
>
> v2: Compute extents properly in the x|y-rounded-down case (Chris Wilson)
>
> v3: Add units to parameter names of tile_extents (Nanley Chery)
> Use _mesa_align_malloc for the shadow copy (Nanley)
> Continue using gtt maps on gen4 (Nanley)
>
> v4: Use streaming_load_memcpy when detiling
> ---
> src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 108 ++++++++++++++++++++++++--
> 1 file changed, 100 insertions(+), 8 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
> index 23cb40f3226..58ffe868d0d 100644
> --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
> +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
> @@ -31,6 +31,7 @@
> #include "intel_image.h"
> #include "intel_mipmap_tree.h"
> #include "intel_tex.h"
> +#include "intel_tiled_memcpy.h"
> #include "intel_blit.h"
> #include "intel_fbo.h"
>
> @@ -3046,10 +3047,10 @@ intel_miptree_unmap_raw(struct intel_mipmap_tree *mt)
> }
>
> static void
> -intel_miptree_map_gtt(struct brw_context *brw,
> - struct intel_mipmap_tree *mt,
> - struct intel_miptree_map *map,
> - unsigned int level, unsigned int slice)
> +intel_miptree_map_map(struct brw_context *brw,
> + struct intel_mipmap_tree *mt,
> + struct intel_miptree_map *map,
> + unsigned int level, unsigned int slice)
> {
> unsigned int bw, bh;
> void *base;
> @@ -3093,11 +3094,93 @@ intel_miptree_map_gtt(struct brw_context *brw,
> }
>
> static void
> -intel_miptree_unmap_gtt(struct intel_mipmap_tree *mt)
> +intel_miptree_unmap_map(struct intel_mipmap_tree *mt)
> {
> intel_miptree_unmap_raw(mt);
> }
>
> +/* Compute extent parameters for use with tiled_memcpy functions.
> + * xs are in units of bytes and ys are in units of strides. */
> +static inline void
> +tile_extents(struct intel_mipmap_tree *mt, struct intel_miptree_map *map,
> + unsigned int level, unsigned int slice, unsigned int *x1_B,
> + unsigned int *x2_B, unsigned int *y1_el, unsigned int *y2_el)
> +{
> + unsigned int block_width, block_height;
> + unsigned int x0_el, y0_el;
> +
> + _mesa_get_format_block_size(mt->format, &block_width, &block_height);
> +
> + assert(map->x % block_width == 0);
> + assert(map->y % block_height == 0);
> +
> + intel_miptree_get_image_offset(mt, level, slice, &x0_el, &y0_el);
> + *x1_B = (map->x / block_width + x0_el) * mt->cpp;
> + *y1_el = map->y / block_height + y0_el;
> + *x2_B = (DIV_ROUND_UP(map->x + map->w, block_width) + x0_el) * mt->cpp;
> + *y2_el = DIV_ROUND_UP(map->y + map->h, block_height) + y0_el;
> +}
> +
> +static void
> +intel_miptree_map_tiled_memcpy(struct brw_context *brw,
> + struct intel_mipmap_tree *mt,
> + struct intel_miptree_map *map,
> + unsigned int level, unsigned int slice)
> +{
> + unsigned int x1, x2, y1, y2;
> + tile_extents(mt, map, level, slice, &x1, &x2, &y1, &y2);
> + map->stride = ALIGN(_mesa_format_row_stride(mt->format, map->w), 16);
> +
> + /* The tiling and detiling functions require that the linear buffer
> + * has proper 16-byte alignment (that is, `x0` is 16-byte aligned).
Throw in an "its" here, i.e. (that is, its `x0`...). I just spent a few
moments wondering what x0 was before remembering it's the internal x0 of
tiled_to_linear().
We really want to move that knowledge back to intel_tiled_memcpy.c. A
single user isn't enough to justify a lot of effort though (or be sure
you get the interface right).
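For illustration only, the sort of helper I mean would keep the 16-byte
phase fixup inside intel_tiled_memcpy.c; the name and signature below are
made up for the sake of the example, not something this patch has to add:

/* Hypothetical helper living in intel_tiled_memcpy.c: allocate the
 * linear shadow buffer with the same sub-16-byte phase as the tiled
 * column, so callers never need to know that tiled_to_linear()'s
 * internal x0 must be 16-byte aligned. Uses _mesa_align_malloc just
 * like the patch does; the caller frees *out_buffer.
 */
static void *
intel_detile_shadow_alloc(unsigned int x1_B, unsigned int y1_el,
                          unsigned int y2_el, unsigned int stride_B,
                          void **out_buffer)
{
   /* Over-allocate by the low 4 bits of x1_B and offset the returned
    * pointer by the same amount, so the pointer's alignment phase
    * matches the tiled source column.
    */
   void *buffer = _mesa_align_malloc(stride_B * (y2_el - y1_el) +
                                     (x1_B & 0xf), 16);
   *out_buffer = buffer;
   return buffer ? (char *)buffer + (x1_B & 0xf) : NULL;
}

The map path would then just do
map->ptr = intel_detile_shadow_alloc(x1, y1, y2, map->stride, &map->buffer);
and the alignment requirement stops leaking out of the tiling code.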
> + * Here we over-allocate the linear buffer by enough bytes to get
> + * the proper alignment.
> + */
> + map->buffer = _mesa_align_malloc(map->stride * (y2 - y1) + (x1 & 0xf), 16);
> + map->ptr = (char *)map->buffer + (x1 & 0xf);
> + assert(map->buffer);
> +
> + if (!(map->mode & GL_MAP_INVALIDATE_RANGE_BIT)) {
> + char *src = intel_miptree_map_raw(brw, mt, map->mode | MAP_RAW);
> + src += mt->offset;
> +
> + const mem_copy_fn fn =
> +#if defined(USE_SSE41)
> + cpu_has_sse4_1 ? (mem_copy_fn)_mesa_streaming_load_memcpy :
> +#endif
> + memcpy;
So this always uses a streaming load and bypasses the cache, even when
reading from WB. Justifiable, I believe, since there is no reason to keep
the source in cache: the modification lands in map->buffer, not in the
tiled bo.
But do we want to use this path at all when !USE_SSE41 and the map is WC?
Let's see whether that case is excluded.
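To make the concern concrete, something along these lines in
intel_miptree_map() would rule that combination out; treat it as a sketch,
and whether mt->bo->cache_coherent is the right WB test here is an
assumption on my part:

   /* Only worth taking the tiled_memcpy path when reading back won't
    * hurt: either the SSE4.1 streaming load is available, or the bo is
    * cache-coherent (WB) so plain memcpy reads are fine. (A pure
    * GL_MAP_INVALIDATE_RANGE_BIT map never reads, so it could skip this
    * test entirely; omitted here for brevity.)
    */
   bool can_tiled_memcpy = mt->surf.tiling != ISL_TILING_LINEAR &&
                           brw->screen->devinfo.gen > 4;
#if defined(USE_SSE41)
   can_tiled_memcpy = can_tiled_memcpy &&
                      (cpu_has_sse4_1 || mt->bo->cache_coherent);
#else
   /* Without SSE4.1, every readback is a plain memcpy, so insist on WB. */
   can_tiled_memcpy = can_tiled_memcpy && mt->bo->cache_coherent;
#endif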
> static void
> intel_miptree_map_blit(struct brw_context *brw,
> struct intel_mipmap_tree *mt,
> @@ -3655,8 +3738,11 @@ intel_miptree_map(struct brw_context *brw,
> (mt->surf.row_pitch % 16 == 0)) {
> intel_miptree_map_movntdqa(brw, mt, map, level, slice);
> #endif
> + } else if (mt->surf.tiling != ISL_TILING_LINEAR &&
> + brw->screen->devinfo.gen > 4) {
> + intel_miptree_map_tiled_memcpy(brw, mt, map, level, slice);
> } else {
> - intel_miptree_map_gtt(brw, mt, map, level, slice);
> + intel_miptree_map_map(brw, mt, map, level, slice);
Ah, so the remaining choice is to go through a GTT map when tiled_memcpy
doesn't support the layout.
So memcpy through a GTT map is the only option there right now, which is
painful. Maybe add a perf_debug if we hit map_map.
I'm never sure how map_blit ties into this. We definitely want the direct
copy over the indirect blit in the majority of cases, if not all of them.
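In case it helps, the perf_debug I have in mind is just the usual i965 one
in the fallback branch (purely illustrative):

   } else {
      perf_debug("miptree map falling back to a GTT map (tiling=%d, gen=%d)\n",
                 mt->surf.tiling, brw->screen->devinfo.gen);
      intel_miptree_map_map(brw, mt, map, level, slice);
   }

so the slow maps show up under INTEL_DEBUG=perf.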
As it stands, as an improvement over map_gtt,
Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
-Chris