[Mesa-dev] [PATCH 09/10] i965/fs: Calculate delta_x and delta_y together.
Jason Ekstrand
jason at jlekstrand.net
Fri Apr 17 11:45:57 PDT 2015
On Tue, Apr 14, 2015 at 4:15 PM, Matt Turner <mattst88 at gmail.com> wrote:
> This lets SIMD16 programs on G45 and Gen5 use the PLN instruction.
This patch also changes the meaning of FS_OPCODE_LINTERP. Those
chainges should be described in the commit message as well.
> On Ironlake:
>
> total instructions in shared programs: 5634757 -> 5518055 (-2.07%)
> instructions in affected programs: 1745837 -> 1629135 (-6.68%)
> helped: 11439
> HURT: 4
> ---
> src/mesa/drivers/dri/i965/brw_fs.cpp | 46 +++++++-------------
> src/mesa/drivers/dri/i965/brw_fs.h | 3 +-
> src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 5 +--
> src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 13 +++---
> src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 8 ++--
> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 51 +++++++++++------------
> src/mesa/drivers/dri/i965/brw_reg.h | 7 ++++
> 7 files changed, 59 insertions(+), 74 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 5ab8701..6e55f67 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -1259,8 +1259,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
> emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
> } else {
> emit(FS_OPCODE_LINTERP, wpos,
> - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> + this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> interp_reg(VARYING_SLOT_POS, 2));
> }
> wpos = offset(wpos, 1);
> @@ -1302,8 +1301,7 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
> barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
> }
> return emit(FS_OPCODE_LINTERP, attr,
> - this->delta_x[barycoord_mode],
> - this->delta_y[barycoord_mode], interp);
> + this->delta_xy[barycoord_mode], interp);
> }
>
> void
> @@ -1851,8 +1849,8 @@ fs_visitor::assign_urb_setup()
> */
> foreach_block_and_inst(block, fs_inst, inst, cfg) {
> if (inst->opcode == FS_OPCODE_LINTERP) {
> - assert(inst->src[2].file == HW_REG);
> - inst->src[2].fixed_hw_reg.nr += urb_start;
> + assert(inst->src[1].file == HW_REG);
> + inst->src[1].fixed_hw_reg.nr += urb_start;
> }
>
> if (inst->opcode == FS_OPCODE_CINTERP) {
> @@ -2106,25 +2104,16 @@ fs_visitor::compact_virtual_grfs()
> }
> }
>
> - /* Patch all the references to delta_x/delta_y, since they're used in
> - * register allocation. If they're unused, switch them to BAD_FILE so
> - * we don't think some random VGRF is delta_x/delta_y.
> + /* Patch all the references to delta_xy, since they're used in register
> + * allocation. If they're unused, switch them to BAD_FILE so we don't
> + * think some random VGRF is delta_xy.
> */
> - for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
> - if (delta_x[i].file == GRF) {
> - if (remap_table[delta_x[i].reg] != -1) {
> - delta_x[i].reg = remap_table[delta_x[i].reg];
> + for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
> + if (delta_xy[i].file == GRF) {
> + if (remap_table[delta_xy[i].reg] != -1) {
> + delta_xy[i].reg = remap_table[delta_xy[i].reg];
> } else {
> - delta_x[i].file = BAD_FILE;
> - }
> - }
> - }
> - for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
> - if (delta_y[i].file == GRF) {
> - if (remap_table[delta_y[i].reg] != -1) {
> - delta_y[i].reg = remap_table[delta_y[i].reg];
> - } else {
> - delta_y[i].file = BAD_FILE;
> + delta_xy[i].file = BAD_FILE;
> }
> }
> }
> @@ -2589,14 +2578,9 @@ fs_visitor::opt_register_renaming()
> if (progress) {
> invalidate_live_intervals();
>
> - for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
> - if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
> - delta_x[i].reg = remap[delta_x[i].reg];
> - }
> - }
> - for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
> - if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
> - delta_y[i].reg = remap[delta_y[i].reg];
> + for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
> + if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
> + delta_xy[i].reg = remap[delta_xy[i].reg];
> }
> }
> }
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index b7c1c39..e04691c 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -512,8 +512,7 @@ public:
> fs_reg pixel_y;
> fs_reg wpos_w;
> fs_reg pixel_w;
> - fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
> - fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
> + fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
> fs_reg shader_start_time;
> fs_reg userplane[MAX_CLIP_PLANES];
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> index fc9597e..dba3286 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> @@ -392,11 +392,10 @@ fs_generator::generate_linterp(fs_inst *inst,
> struct brw_reg dst, struct brw_reg *src)
> {
> struct brw_reg delta_x = src[0];
> - struct brw_reg delta_y = src[1];
> - struct brw_reg interp = src[2];
> + struct brw_reg delta_y = offset(src[0], dispatch_width / 8);
This is an awkward use of offset(). It's supposed to take the width
into account for you. Why do you need to do it explicitly? If
there's something funny going on here, it deserves a comment.
> + struct brw_reg interp = src[1];
>
> if (brw->has_pln &&
> - delta_y.nr == delta_x.nr + 1 &&
> (brw->gen >= 7 || (delta_x.nr & 1) == 0)) {
> brw_PLN(p, dst, interp, delta_x);
> } else {
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index 3972581..e1687ed 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -1482,8 +1482,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
> */
> no16("interpolate_at_* not yet supported in SIMD16 mode.");
>
> - fs_reg dst_x = vgrf(2);
> - fs_reg dst_y = offset(dst_x, 1);
> + fs_reg dst_xy = vgrf(2);
>
> /* For most messages, we need one reg of ignored data; the hardware
> * requires mlen==1 even when there is no payload. in the per-slot
> @@ -1495,7 +1494,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
>
> switch (instr->intrinsic) {
> case nir_intrinsic_interp_var_at_centroid:
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
> break;
>
> case nir_intrinsic_interp_var_at_sample: {
> @@ -1503,7 +1502,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
> nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
> assert(const_sample);
> unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0;
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src,
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
> fs_reg(msg_data));
> break;
> }
> @@ -1515,7 +1514,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
> unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
> unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
>
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
> fs_reg(off_x | (off_y << 4)));
> } else {
> src = vgrf(glsl_type::ivec2_type);
> @@ -1548,7 +1547,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
> }
>
> mlen = 2;
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
> fs_reg(0u));
> }
> break;
> @@ -1567,7 +1566,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
> fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
> src.type = dest.type;
>
> - emit(FS_OPCODE_LINTERP, dest, dst_x, dst_y, src);
> + emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
> dest = offset(dest, 1);
> }
> break;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> index 2a4054a..47f5a42 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> @@ -244,7 +244,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
> }
> assert(reg == ra_reg_count);
>
> - /* Add a special class for aligned pairs, which we'll put delta_x/y
> + /* Add a special class for aligned pairs, which we'll put delta_xy
> * in on Gen <= 6 so that we can do PLN.
> */
> if (devinfo->has_pln && reg_width == 1 && devinfo->gen <= 6) {
> @@ -558,14 +558,14 @@ fs_visitor::assign_regs(bool allow_spilling)
> * second operand of a PLN instruction needs to be an
> * even-numbered register, so we have a special register class
> * wm_aligned_pairs_class to handle this case. pre-GEN6 always
> - * uses this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the
> + * uses this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the
> * second operand of a PLN instruction (since it doesn't support
> * any other interpolation modes). So all we need to do is find
> * that register and set it to the appropriate class.
> */
> if (screen->wm_reg_sets[rsi].aligned_pairs_class >= 0 &&
> - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF &&
> - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
> + this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF &&
> + this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
> c = screen->wm_reg_sets[rsi].aligned_pairs_class;
> }
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index e9252c5..0c2511a 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -593,8 +593,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
>
> /* 1. collect interpolation factors */
>
> - fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
> - fs_reg dst_y = offset(dst_x, 1);
> + fs_reg dst_xy = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
>
> /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
> * even when there is no payload. in the per-slot offset case, we'll replace this with
> @@ -606,7 +605,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
>
> switch (ir->operation) {
> case ir_unop_interpolate_at_centroid:
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
> break;
>
> case ir_binop_interpolate_at_sample: {
> @@ -614,7 +613,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
> assert(sample_num || !"nonconstant sample number should have been lowered.");
>
> unsigned msg_data = sample_num->value.i[0] << 4;
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data));
> break;
> }
>
> @@ -623,7 +622,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
> if (const_offset) {
> unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
> (pack_pixel_offset(const_offset->value.f[1]) << 4);
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
> fs_reg(msg_data));
> } else {
> /* pack the operands: hw wants offsets as 4 bit signed ints */
> @@ -656,7 +655,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
> }
>
> mlen = 2 * reg_width;
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
> fs_reg(0u));
> }
> break;
> @@ -678,8 +677,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
>
> for (int i = 0; i < ir->type->vector_elements; i++) {
> int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
> - emit(FS_OPCODE_LINTERP, res,
> - dst_x, dst_y,
> + emit(FS_OPCODE_LINTERP, res, dst_xy,
> fs_reg(interp_reg(var->data.location, ch)));
> res = offset(res, 1);
> }
> @@ -3434,31 +3432,31 @@ fs_visitor::emit_interpolation_setup_gen4()
> fs_reg(brw_imm_v(0x11001100))));
>
> this->current_annotation = "compute pixel deltas from v0";
> - if (brw->has_pln) {
> - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> - vgrf(glsl_type::vec2_type);
> - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> - offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1);
> +
> + this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> + vgrf(glsl_type::vec2_type);
> + const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
> + const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
> + const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
> +
> + if (brw->has_pln && dispatch_width == 16) {
> + emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart));
> + emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart));
> + emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart))
> + ->force_sechalf = true;
> + emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart))
> + ->force_sechalf = true;
> } else {
> - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> - vgrf(glsl_type::float_type);
> - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> - vgrf(glsl_type::float_type);
> + emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart));
> + emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart));
> }
> - emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> - this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
> - emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> - this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
>
> this->current_annotation = "compute pos.w and 1/pos.w";
> /* Compute wpos.w. It's always in our setup, since it's needed to
> * interpolate the other attributes.
> */
> this->wpos_w = vgrf(glsl_type::float_type);
> - emit(FS_OPCODE_LINTERP, wpos_w,
> - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> - interp_reg(VARYING_SLOT_POS, 3));
> + emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3));
> /* Compute the pixel 1/W value from wpos.w. */
> this->pixel_w = vgrf(glsl_type::float_type);
> emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
> @@ -3500,8 +3498,7 @@ fs_visitor::emit_interpolation_setup_gen6()
>
> for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
> uint8_t reg = payload.barycentric_coord_reg[i];
> - this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
> - this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
> + this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
> }
>
> this->current_annotation = NULL;
> diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
> index 3a50e86..1b2bb10 100644
> --- a/src/mesa/drivers/dri/i965/brw_reg.h
> +++ b/src/mesa/drivers/dri/i965/brw_reg.h
> @@ -704,6 +704,13 @@ brw_vec8_grf(unsigned nr, unsigned subnr)
> return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
> }
>
> +/** Construct float[16] general-purpose register */
> +static inline struct brw_reg
> +brw_vec16_grf(unsigned nr, unsigned subnr)
> +{
> + return brw_vec16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
> +}
> +
>
> static inline struct brw_reg
> brw_uw8_grf(unsigned nr, unsigned subnr)
> --
> 2.0.5
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list