[Mesa-dev] [PATCH 09/10] i965/fs: Calculate delta_x and delta_y together.

Matt Turner mattst88 at gmail.com
Tue Apr 14 16:15:46 PDT 2015


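Allocate delta_x and delta_y as a single vec2 virtual register instead of
two independent ones. In SIMD16 the two SIMD8 halves are written
interleaved (x0, y0, x1, y1), so each half's delta_y immediately follows
its delta_x, which is the layout the generator needs to emit PLN. This lets
SIMD16 programs on G45 and Gen5 use the PLN instruction.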

On Ironlake:

total instructions in shared programs: 5634757 -> 5518055 (-2.07%)
instructions in affected programs:     1745837 -> 1629135 (-6.68%)
helped:                                11439
HURT:                                  4
---
 src/mesa/drivers/dri/i965/brw_fs.cpp              | 46 +++++++-------------
 src/mesa/drivers/dri/i965/brw_fs.h                |  3 +-
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp    |  5 +--
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp          | 13 +++---
 src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp |  8 ++--
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp      | 51 +++++++++++------------
 src/mesa/drivers/dri/i965/brw_reg.h               |  7 ++++
 7 files changed, 59 insertions(+), 74 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5ab8701..6e55f67 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1259,8 +1259,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
       emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
    } else {
       emit(FS_OPCODE_LINTERP, wpos,
-           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
-           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
+           this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
            interp_reg(VARYING_SLOT_POS, 2));
    }
    wpos = offset(wpos, 1);
@@ -1302,8 +1301,7 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
    }
    return emit(FS_OPCODE_LINTERP, attr,
-               this->delta_x[barycoord_mode],
-               this->delta_y[barycoord_mode], interp);
+               this->delta_xy[barycoord_mode], interp);
 }
 
 void
@@ -1851,8 +1849,8 @@ fs_visitor::assign_urb_setup()
     */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       if (inst->opcode == FS_OPCODE_LINTERP) {
-	 assert(inst->src[2].file == HW_REG);
-	 inst->src[2].fixed_hw_reg.nr += urb_start;
+	 assert(inst->src[1].file == HW_REG);
+	 inst->src[1].fixed_hw_reg.nr += urb_start;
       }
 
       if (inst->opcode == FS_OPCODE_CINTERP) {
@@ -2106,25 +2104,16 @@ fs_visitor::compact_virtual_grfs()
       }
    }
 
-   /* Patch all the references to delta_x/delta_y, since they're used in
-    * register allocation.  If they're unused, switch them to BAD_FILE so
-    * we don't think some random VGRF is delta_x/delta_y.
+   /* Patch all the references to delta_xy, since they're used in register
+    * allocation.  If they're unused, switch them to BAD_FILE so we don't
+    * think some random VGRF is delta_xy.
     */
-   for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
-      if (delta_x[i].file == GRF) {
-         if (remap_table[delta_x[i].reg] != -1) {
-            delta_x[i].reg = remap_table[delta_x[i].reg];
+   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
+      if (delta_xy[i].file == GRF) {
+         if (remap_table[delta_xy[i].reg] != -1) {
+            delta_xy[i].reg = remap_table[delta_xy[i].reg];
          } else {
-            delta_x[i].file = BAD_FILE;
-         }
-      }
-   }
-   for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
-      if (delta_y[i].file == GRF) {
-         if (remap_table[delta_y[i].reg] != -1) {
-            delta_y[i].reg = remap_table[delta_y[i].reg];
-         } else {
-            delta_y[i].file = BAD_FILE;
+            delta_xy[i].file = BAD_FILE;
          }
       }
    }
@@ -2589,14 +2578,9 @@ fs_visitor::opt_register_renaming()
    if (progress) {
       invalidate_live_intervals();
 
-      for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
-         if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
-            delta_x[i].reg = remap[delta_x[i].reg];
-         }
-      }
-      for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
-         if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
-            delta_y[i].reg = remap[delta_y[i].reg];
+      for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
+         if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
+            delta_xy[i].reg = remap[delta_xy[i].reg];
          }
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index b7c1c39..e04691c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -512,8 +512,7 @@ public:
    fs_reg pixel_y;
    fs_reg wpos_w;
    fs_reg pixel_w;
-   fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
-   fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
+   fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
    fs_reg shader_start_time;
    fs_reg userplane[MAX_CLIP_PLANES];
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index fc9597e..dba3286 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -392,11 +392,10 @@ fs_generator::generate_linterp(fs_inst *inst,
 			     struct brw_reg dst, struct brw_reg *src)
 {
    struct brw_reg delta_x = src[0];
-   struct brw_reg delta_y = src[1];
-   struct brw_reg interp = src[2];
+   struct brw_reg delta_y = offset(src[0], dispatch_width / 8);
+   struct brw_reg interp = src[1];
 
    if (brw->has_pln &&
-       delta_y.nr == delta_x.nr + 1 &&
        (brw->gen >= 7 || (delta_x.nr & 1) == 0)) {
       brw_PLN(p, dst, interp, delta_x);
    } else {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 3972581..e1687ed 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1482,8 +1482,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
        */
       no16("interpolate_at_* not yet supported in SIMD16 mode.");
 
-      fs_reg dst_x = vgrf(2);
-      fs_reg dst_y = offset(dst_x, 1);
+      fs_reg dst_xy = vgrf(2);
 
       /* For most messages, we need one reg of ignored data; the hardware
        * requires mlen==1 even when there is no payload. in the per-slot
@@ -1495,7 +1494,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
       switch (instr->intrinsic) {
       case nir_intrinsic_interp_var_at_centroid:
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
+         inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
          break;
 
       case nir_intrinsic_interp_var_at_sample: {
@@ -1503,7 +1502,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
          assert(const_sample);
          unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0;
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src,
+         inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
                      fs_reg(msg_data));
          break;
       }
@@ -1515,7 +1514,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
             unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
             unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
 
-            inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
+            inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
                         fs_reg(off_x | (off_y << 4)));
          } else {
             src = vgrf(glsl_type::ivec2_type);
@@ -1548,7 +1547,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
             }
 
             mlen = 2;
-            inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
+            inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
                         fs_reg(0u));
          }
          break;
@@ -1567,7 +1566,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
          src.type = dest.type;
 
-         emit(FS_OPCODE_LINTERP, dest, dst_x, dst_y, src);
+         emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
          dest = offset(dest, 1);
       }
       break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 2a4054a..47f5a42 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -244,7 +244,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
    }
    assert(reg == ra_reg_count);
 
-   /* Add a special class for aligned pairs, which we'll put delta_x/y
+   /* Add a special class for aligned pairs, which we'll put delta_xy
     * in on Gen <= 6 so that we can do PLN.
     */
    if (devinfo->has_pln && reg_width == 1 && devinfo->gen <= 6) {
@@ -558,14 +558,14 @@ fs_visitor::assign_regs(bool allow_spilling)
        * second operand of a PLN instruction needs to be an
        * even-numbered register, so we have a special register class
        * wm_aligned_pairs_class to handle this case.  pre-GEN6 always
-       * uses this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the
+       * uses this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the
        * second operand of a PLN instruction (since it doesn't support
        * any other interpolation modes).  So all we need to do is find
        * that register and set it to the appropriate class.
        */
       if (screen->wm_reg_sets[rsi].aligned_pairs_class >= 0 &&
-          this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF &&
-          this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
+          this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF &&
+          this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
          c = screen->wm_reg_sets[rsi].aligned_pairs_class;
       }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index e9252c5..0c2511a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -593,8 +593,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
 
    /* 1. collect interpolation factors */
 
-   fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
-   fs_reg dst_y = offset(dst_x, 1);
+   fs_reg dst_xy = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
 
    /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
     * even when there is no payload. in the per-slot offset case, we'll replace this with
@@ -606,7 +605,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
 
    switch (ir->operation) {
    case ir_unop_interpolate_at_centroid:
-      inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
+      inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
       break;
 
    case ir_binop_interpolate_at_sample: {
@@ -614,7 +613,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
       assert(sample_num || !"nonconstant sample number should have been lowered.");
 
       unsigned msg_data = sample_num->value.i[0] << 4;
-      inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
+      inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data));
       break;
    }
 
@@ -623,7 +622,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
       if (const_offset) {
          unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
                             (pack_pixel_offset(const_offset->value.f[1]) << 4);
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
+         inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
                      fs_reg(msg_data));
       } else {
          /* pack the operands: hw wants offsets as 4 bit signed ints */
@@ -656,7 +655,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
          }
 
          mlen = 2 * reg_width;
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
+         inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
                      fs_reg(0u));
       }
       break;
@@ -678,8 +677,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir)
 
    for (int i = 0; i < ir->type->vector_elements; i++) {
       int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
-      emit(FS_OPCODE_LINTERP, res,
-           dst_x, dst_y,
+      emit(FS_OPCODE_LINTERP, res, dst_xy,
            fs_reg(interp_reg(var->data.location, ch)));
       res = offset(res, 1);
    }
@@ -3434,31 +3432,31 @@ fs_visitor::emit_interpolation_setup_gen4()
             fs_reg(brw_imm_v(0x11001100))));
 
    this->current_annotation = "compute pixel deltas from v0";
-   if (brw->has_pln) {
-      this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
-         vgrf(glsl_type::vec2_type);
-      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
-         offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1);
+
+   this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
+      vgrf(glsl_type::vec2_type);
+   const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
+   const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
+   const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
+
+   if (brw->has_pln && dispatch_width == 16) {
+      emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart));
+      emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart));
+      emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart))
+         ->force_sechalf = true;
+      emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart))
+         ->force_sechalf = true;
    } else {
-      this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
-         vgrf(glsl_type::float_type);
-      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
-         vgrf(glsl_type::float_type);
+      emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart));
+      emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart));
    }
-   emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
-            this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
-   emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
-            this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
 
    this->current_annotation = "compute pos.w and 1/pos.w";
    /* Compute wpos.w.  It's always in our setup, since it's needed to
     * interpolate the other attributes.
     */
    this->wpos_w = vgrf(glsl_type::float_type);
-   emit(FS_OPCODE_LINTERP, wpos_w,
-        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
-        this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
-	interp_reg(VARYING_SLOT_POS, 3));
+   emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3));
    /* Compute the pixel 1/W value from wpos.w. */
    this->pixel_w = vgrf(glsl_type::float_type);
    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
@@ -3500,8 +3498,7 @@ fs_visitor::emit_interpolation_setup_gen6()
 
    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
       uint8_t reg = payload.barycentric_coord_reg[i];
-      this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
-      this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
+      this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
    }
 
    this->current_annotation = NULL;
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index 3a50e86..1b2bb10 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -704,6 +704,13 @@ brw_vec8_grf(unsigned nr, unsigned subnr)
    return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
 }
 
+/** Construct a float[16] register in the general-purpose register file (GRF) from nr/subnr */
+static inline struct brw_reg
+brw_vec16_grf(unsigned nr, unsigned subnr)
+{
+   return brw_vec16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
 
 static inline struct brw_reg
 brw_uw8_grf(unsigned nr, unsigned subnr)
-- 
2.0.5



More information about the mesa-dev mailing list