[Mesa-dev] [PATCH 10/10] i965/fs: Combine pixel center calculation into one inst.

Fri Apr 17 11:56:31 PDT 2015

On Tue, Apr 14, 2015 at 4:15 PM, Matt Turner <mattst88 at gmail.com> wrote:
> The X and Y values come interleaved in g1 (.4-.11 inclusive), so we can
> calculate them together with a single add(32) instruction on some
> platforms like Broadwell and newer or in SIMD8 elsewhere.
>
> Note that I also moved the PIXEL_X/PIXEL_Y virtual opcodes from before
> LINTERP to after it. That's because the writes_accumulator_implicitly()
> function in backend_instruction tests for <= LINTERP for determining
> whether the instruction indeed writes the accumulator implicitly. The
> old FS_OPCODE_PIXEL_X/Y emitted ADD instructions, which did, but the new
> opcodes just emit MOVs, which don't. It doesn't matter, since we don't
> use these opcodes on Gen4/5 anymore, but in the case that we do...
>
> On Broadwell:
> total instructions in shared programs: 7192355 -> 7186224 (-0.09%)
> instructions in affected programs:     1190700 -> 1184569 (-0.51%)
> helped:                                6131
>
> On Haswell:
> total instructions in shared programs: 6155979 -> 6152800 (-0.05%)
> instructions in affected programs:     652362 -> 649183 (-0.49%)
> helped:                                3179
> ---
>  src/mesa/drivers/dri/i965/brw_defines.h        |  2 +
>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 10 ++++
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp   | 71 ++++++++++++++++++--------
>  3 files changed, 63 insertions(+), 20 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 1afa34e..f99da12 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -925,6 +925,8 @@ enum opcode {
>     FS_OPCODE_DDY_FINE,
>     FS_OPCODE_CINTERP,
>     FS_OPCODE_LINTERP,
> +   FS_OPCODE_PIXEL_X,
> +   FS_OPCODE_PIXEL_Y,
>     FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
>     FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7,
>     FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> index dba3286..7bc97a6 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> @@ -1909,6 +1909,16 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
>        case FS_OPCODE_LINTERP:
>          generate_linterp(inst, dst, src);
>          break;
> +      case FS_OPCODE_PIXEL_X:
> +         assert(src[0].type == BRW_REGISTER_TYPE_UW);
> +         src[0].subnr = 0 * type_sz(src[0].type);
> +         brw_MOV(p, dst, stride(src[0], 8, 4, 1));

Why do we need the special opcode?  Can we not do that stride with
what we already have in the FS compiler?

> +         break;
> +      case FS_OPCODE_PIXEL_Y:
> +         assert(src[0].type == BRW_REGISTER_TYPE_UW);
> +         src[0].subnr = 4 * type_sz(src[0].type);
> +         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
> +         break;
>        case SHADER_OPCODE_TEX:
>        case FS_OPCODE_TXB:
>        case SHADER_OPCODE_TXD:
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 0c2511a..1479333 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -3469,27 +3469,58 @@ fs_visitor::emit_interpolation_setup_gen6()
>  {
>     struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
>
> -   /* If the pixel centers end up used, the setup is the same as for gen4. */
>     this->current_annotation = "compute pixel centers";
> -   fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
> -   fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
> -   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
> -   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
> -   emit(ADD(int_pixel_x,
> -            fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
> -            fs_reg(brw_imm_v(0x10101010))));
> -   emit(ADD(int_pixel_y,
> -            fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
> -            fs_reg(brw_imm_v(0x11001100))));
> -
> -   /* As of gen6, we can no longer mix float and int sources.  We have
> -    * to turn the integer pixel centers into floats for their actual
> -    * use.
> -    */
> -   this->pixel_x = vgrf(glsl_type::float_type);
> -   this->pixel_y = vgrf(glsl_type::float_type);
> -   emit(MOV(this->pixel_x, int_pixel_x));
> -   emit(MOV(this->pixel_y, int_pixel_y));
> +   if (brw->gen >= 8 || dispatch_width == 8) {
> +      /* The "Register Region Restrictions" page says for BDW (and newer,
> +       * presumably):
> +       *
> +       *     "When destination spans two registers, the source may be one or
> +       *      two registers. The destination elements must be evenly split
> +       *      between the two registers."
> +       *
> +       * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
> +       * compute our pixel centers.
> +       */
> +      fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
> +                          BRW_REGISTER_TYPE_UW, dispatch_width * 2);
> +      emit(ADD(int_pixel_xy,
> +               fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
> +               fs_reg(brw_imm_v(0x11001010))))
> +         ->force_writemask_all = true;
> +
> +      this->pixel_x = vgrf(glsl_type::float_type);
> +      this->pixel_y = vgrf(glsl_type::float_type);
> +      emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
> +      emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
> +   } else {
> +      /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
> +       *
> +       *     "When destination spans two registers, the source MUST span two
> +       *      registers."
> +       *
> +       * Since the GRF source of the ADD will only read a single register, we
> +       * must do two separate ADDs in SIMD16.
> +       */
> +      fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
> +      fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
> +      int_pixel_x.type = BRW_REGISTER_TYPE_UW;
> +      int_pixel_y.type = BRW_REGISTER_TYPE_UW;
> +      emit(ADD(int_pixel_x,
> +               fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
> +               fs_reg(brw_imm_v(0x10101010))));
> +      emit(ADD(int_pixel_y,
> +               fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
> +               fs_reg(brw_imm_v(0x11001100))));
> +
> +      /* As of gen6, we can no longer mix float and int sources.  We have
> +       * to turn the integer pixel centers into floats for their actual
> +       * use.
> +       */
> +      this->pixel_x = vgrf(glsl_type::float_type);
> +      this->pixel_y = vgrf(glsl_type::float_type);
> +      emit(MOV(this->pixel_x, int_pixel_x));
> +      emit(MOV(this->pixel_y, int_pixel_y));
> +   }
>
>     this->current_annotation = "compute pos.w";
>     this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
> --
> 2.0.5
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev