[Mesa-dev] [PATCH] i965: Give the FS and VEC4 visitors more descriptive names.
Connor Abbott
cwabbott0 at gmail.com
Wed Apr 1 09:06:24 PDT 2015
Laughed-at-by: Connor Abbott <cwabbott0 at gmail.com>
On Wed, Apr 1, 2015 at 5:43 AM, Francisco Jerez <currojerez at riseup.net> wrote:
> It has always struck me as odd that these objects are both called
> visitors even though visiting makes up only a tiny fraction of their
> job. Other no less important tasks seem misrepresented, like
> optimizing, analyzing, emitting and pretty-printing the IR,
> translating NIR and ARB assembly programs, applying all sorts of
> hardware workarounds, calculating the binding table, URB, push and
> pull constant layout, etc.
>
> The new names should better depict the extraordinary power of these
> objects, and have the additional advantage of being up to 40% shorter
> than the old ones, reducing the number of keystrokes required to refer
> to these frequently used objects and hopefully increasing everyone's
> productivity.
> ---
> src/mesa/drivers/dri/i965/Makefile.sources | 14 +-
> src/mesa/drivers/dri/i965/brw_cfg.cpp | 4 +-
> src/mesa/drivers/dri/i965/brw_cfg.h | 4 +-
> .../drivers/dri/i965/brw_dead_control_flow.cpp | 2 +-
> src/mesa/drivers/dri/i965/brw_dead_control_flow.h | 2 +-
> src/mesa/drivers/dri/i965/brw_fs.cpp | 164 +-
> src/mesa/drivers/dri/i965/brw_fs.h | 10 +-
> .../drivers/dri/i965/brw_fs_cmod_propagation.cpp | 4 +-
> .../drivers/dri/i965/brw_fs_combine_constants.cpp | 2 +-
> .../drivers/dri/i965/brw_fs_copy_propagation.cpp | 8 +-
> src/mesa/drivers/dri/i965/brw_fs_cse.cpp | 4 +-
> .../dri/i965/brw_fs_dead_code_eliminate.cpp | 2 +-
> src/mesa/drivers/dri/i965/brw_fs_fp.cpp | 20 +-
> src/mesa/drivers/dri/i965/brw_fs_god.cpp | 4157 ++++++++++++++++++++
> .../drivers/dri/i965/brw_fs_live_variables.cpp | 8 +-
> src/mesa/drivers/dri/i965/brw_fs_live_variables.h | 4 +-
> src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 +-
> .../dri/i965/brw_fs_peephole_predicated_break.cpp | 2 +-
> src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 18 +-
> .../drivers/dri/i965/brw_fs_register_coalesce.cpp | 6 +-
> .../dri/i965/brw_fs_saturate_propagation.cpp | 4 +-
> src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp | 2 +-
> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 4157 --------------------
> src/mesa/drivers/dri/i965/brw_gs.c | 2 +-
> src/mesa/drivers/dri/i965/brw_ir_vec4.h | 6 +-
> .../drivers/dri/i965/brw_schedule_instructions.cpp | 20 +-
> src/mesa/drivers/dri/i965/brw_shader.cpp | 12 +-
> src/mesa/drivers/dri/i965/brw_shader.h | 4 +-
> src/mesa/drivers/dri/i965/brw_vec4.cpp | 54 +-
> src/mesa/drivers/dri/i965/brw_vec4.h | 6 +-
> .../drivers/dri/i965/brw_vec4_copy_propagation.cpp | 2 +-
> src/mesa/drivers/dri/i965/brw_vec4_cse.cpp | 4 +-
> .../dri/i965/brw_vec4_dead_code_eliminate.cpp | 2 +-
> src/mesa/drivers/dri/i965/brw_vec4_god.cpp | 3658 +++++++++++++++++
> src/mesa/drivers/dri/i965/brw_vec4_gs_god.cpp | 706 ++++
> src/mesa/drivers/dri/i965/brw_vec4_gs_god.h | 103 +
> src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 706 ----
> src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h | 103 -
> .../drivers/dri/i965/brw_vec4_live_variables.cpp | 10 +-
> .../drivers/dri/i965/brw_vec4_reg_allocate.cpp | 12 +-
> src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 3658 -----------------
> src/mesa/drivers/dri/i965/brw_vec4_vp.cpp | 10 +-
> src/mesa/drivers/dri/i965/brw_vec4_vs_god.cpp | 231 ++
> src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp | 231 --
> src/mesa/drivers/dri/i965/brw_vs.c | 2 +-
> src/mesa/drivers/dri/i965/brw_vs.h | 4 +-
> src/mesa/drivers/dri/i965/brw_wm_iz.cpp | 2 +-
> src/mesa/drivers/dri/i965/gen6_gs_god.cpp | 776 ++++
> src/mesa/drivers/dri/i965/gen6_gs_god.h | 82 +
> src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp | 776 ----
> src/mesa/drivers/dri/i965/gen6_gs_visitor.h | 82 -
> .../drivers/dri/i965/test_fs_cmod_propagation.cpp | 12 +-
> .../dri/i965/test_fs_saturate_propagation.cpp | 12 +-
> .../dri/i965/test_vec4_copy_propagation.cpp | 12 +-
> .../dri/i965/test_vec4_register_coalesce.cpp | 12 +-
> 55 files changed, 9978 insertions(+), 9978 deletions(-)
> create mode 100644 src/mesa/drivers/dri/i965/brw_fs_god.cpp
> delete mode 100644 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_god.cpp
> create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_gs_god.cpp
> create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_gs_god.h
> delete mode 100644 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> delete mode 100644 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
> delete mode 100644 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_vs_god.cpp
> delete mode 100644 src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
> create mode 100644 src/mesa/drivers/dri/i965/gen6_gs_god.cpp
> create mode 100644 src/mesa/drivers/dri/i965/gen6_gs_god.h
> delete mode 100644 src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
> delete mode 100644 src/mesa/drivers/dri/i965/gen6_gs_visitor.h
>
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
> index 498d5a7..82477c0 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -57,7 +57,7 @@ i965_FILES = \
> brw_fs_saturate_propagation.cpp \
> brw_fs_sel_peephole.cpp \
> brw_fs_vector_splitting.cpp \
> - brw_fs_visitor.cpp \
> + brw_fs_god.cpp \
> brw_gs.c \
> brw_gs.h \
> brw_gs_state.c \
> @@ -112,15 +112,15 @@ i965_FILES = \
> brw_vec4_cse.cpp \
> brw_vec4_dead_code_eliminate.cpp \
> brw_vec4_generator.cpp \
> - brw_vec4_gs_visitor.cpp \
> - brw_vec4_gs_visitor.h \
> + brw_vec4_gs_god.cpp \
> + brw_vec4_gs_god.h \
> brw_vec4.h \
> brw_vec4_live_variables.cpp \
> brw_vec4_live_variables.h \
> brw_vec4_reg_allocate.cpp \
> - brw_vec4_visitor.cpp \
> + brw_vec4_god.cpp \
> brw_vec4_vp.cpp \
> - brw_vec4_vs_visitor.cpp \
> + brw_vec4_vs_god.cpp \
> brw_vs.c \
> brw_vs.h \
> brw_vs_state.c \
> @@ -137,8 +137,8 @@ i965_FILES = \
> gen6_depth_state.c \
> gen6_depthstencil.c \
> gen6_gs_state.c \
> - gen6_gs_visitor.cpp \
> - gen6_gs_visitor.h \
> + gen6_gs_god.cpp \
> + gen6_gs_god.h \
> gen6_multisample_state.c \
> gen6_queryobj.c \
> gen6_sampler_state.c \
> diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp
> index 7e7770e..8eaf276 100644
> --- a/src/mesa/drivers/dri/i965/brw_cfg.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp
> @@ -141,7 +141,7 @@ bblock_t::combine_with(bblock_t *that)
> }
>
> void
> -bblock_t::dump(backend_visitor *v) const
> +bblock_t::dump(backend_god *v) const
> {
> int ip = this->start_ip;
> foreach_inst_in_block(backend_instruction, inst, this) {
> @@ -411,7 +411,7 @@ cfg_t::make_block_array()
> }
>
> void
> -cfg_t::dump(backend_visitor *v)
> +cfg_t::dump(backend_god *v)
> {
> if (idom_dirty)
> calculate_idom();
> diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h
> index 56d7d07..961876f 100644
> --- a/src/mesa/drivers/dri/i965/brw_cfg.h
> +++ b/src/mesa/drivers/dri/i965/brw_cfg.h
> @@ -60,7 +60,7 @@ struct bblock_t {
> bool is_successor_of(const bblock_t *block) const;
> bool can_combine_with(const bblock_t *that) const;
> void combine_with(bblock_t *that);
> - void dump(backend_visitor *v) const;
> + void dump(backend_god *v) const;
>
> backend_instruction *start();
> const backend_instruction *start() const;
> @@ -273,7 +273,7 @@ struct cfg_t {
> void calculate_idom();
> static bblock_t *intersect(bblock_t *b1, bblock_t *b2);
>
> - void dump(backend_visitor *v);
> + void dump(backend_god *v);
> void dump_cfg();
> void dump_domtree();
> #endif
> diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
> index 03f838d..256dee6 100644
> --- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
> @@ -36,7 +36,7 @@
> * - if/else/endif
> */
> bool
> -dead_control_flow_eliminate(backend_visitor *v)
> +dead_control_flow_eliminate(backend_god *v)
> {
> bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.h b/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
> index 57a4dab..754a870 100644
> --- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
> +++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
> @@ -23,4 +23,4 @@
>
> #include "brw_shader.h"
>
> -bool dead_control_flow_eliminate(backend_visitor *v);
> +bool dead_control_flow_eliminate(backend_god *v);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 9c2ccce..8be13af 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -239,14 +239,14 @@ fs_inst::resize_sources(uint8_t num_sources)
>
> #define ALU1(op) \
> fs_inst * \
> - fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
> + fs_god::op(const fs_reg &dst, const fs_reg &src0) \
> { \
> return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
> }
>
> #define ALU2(op) \
> fs_inst * \
> - fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
> + fs_god::op(const fs_reg &dst, const fs_reg &src0, \
> const fs_reg &src1) \
> { \
> return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
> @@ -254,7 +254,7 @@ fs_inst::resize_sources(uint8_t num_sources)
>
> #define ALU2_ACC(op) \
> fs_inst * \
> - fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
> + fs_god::op(const fs_reg &dst, const fs_reg &src0, \
> const fs_reg &src1) \
> { \
> fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
> @@ -264,7 +264,7 @@ fs_inst::resize_sources(uint8_t num_sources)
>
> #define ALU3(op) \
> fs_inst * \
> - fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
> + fs_god::op(const fs_reg &dst, const fs_reg &src0, \
> const fs_reg &src1, const fs_reg &src2) \
> { \
> return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
> @@ -301,7 +301,7 @@ ALU2(MAC)
>
> /** Gen4 predicated IF. */
> fs_inst *
> -fs_visitor::IF(enum brw_predicate predicate)
> +fs_god::IF(enum brw_predicate predicate)
> {
> fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
> inst->predicate = predicate;
> @@ -310,7 +310,7 @@ fs_visitor::IF(enum brw_predicate predicate)
>
> /** Gen6 IF with embedded comparison. */
> fs_inst *
> -fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
> +fs_god::IF(const fs_reg &src0, const fs_reg &src1,
> enum brw_conditional_mod condition)
> {
> assert(brw->gen == 6);
> @@ -326,7 +326,7 @@ fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
> * the flag register with the packed 16 bits of the result.
> */
> fs_inst *
> -fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
> +fs_god::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
> enum brw_conditional_mod condition)
> {
> fs_inst *inst;
> @@ -355,7 +355,7 @@ fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
> }
>
> fs_inst *
> -fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
> +fs_god::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
> {
> uint8_t exec_size = dst.width;
> for (int i = 0; i < sources; ++i) {
> @@ -381,7 +381,7 @@ fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
> }
>
> exec_list
> -fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
> +fs_god::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
> const fs_reg &surf_index,
> const fs_reg &varying_offset,
> uint32_t const_offset)
> @@ -448,7 +448,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
> * handling.
> */
> fs_inst *
> -fs_visitor::DEP_RESOLVE_MOV(int grf)
> +fs_god::DEP_RESOLVE_MOV(int grf)
> {
> fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
>
> @@ -638,7 +638,7 @@ fs_reg::is_contiguous() const
> }
>
> int
> -fs_visitor::type_size(const struct glsl_type *type)
> +fs_god::type_size(const struct glsl_type *type)
> {
> unsigned int size, i;
>
> @@ -681,7 +681,7 @@ fs_visitor::type_size(const struct glsl_type *type)
> * the destination of the MOV, with extra parameters set.
> */
> fs_reg
> -fs_visitor::get_timestamp(fs_inst **out_mov)
> +fs_god::get_timestamp(fs_inst **out_mov)
> {
> assert(brw->gen >= 7);
>
> @@ -715,7 +715,7 @@ fs_visitor::get_timestamp(fs_inst **out_mov)
> }
>
> void
> -fs_visitor::emit_shader_time_begin()
> +fs_god::emit_shader_time_begin()
> {
> current_annotation = "shader time start";
> fs_inst *mov;
> @@ -724,7 +724,7 @@ fs_visitor::emit_shader_time_begin()
> }
>
> void
> -fs_visitor::emit_shader_time_end()
> +fs_god::emit_shader_time_end()
> {
> current_annotation = "shader time end";
>
> @@ -753,7 +753,7 @@ fs_visitor::emit_shader_time_end()
> }
> break;
> default:
> - unreachable("fs_visitor::emit_shader_time_end missing code");
> + unreachable("fs_god::emit_shader_time_end missing code");
> }
>
> /* Insert our code just before the final SEND with EOT. */
> @@ -799,7 +799,7 @@ fs_visitor::emit_shader_time_end()
> }
>
> fs_inst *
> -fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
> +fs_god::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
> {
> int shader_time_index =
> brw_get_shader_time_index(brw, shader_prog, prog, type);
> @@ -816,7 +816,7 @@ fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
> }
>
> void
> -fs_visitor::vfail(const char *format, va_list va)
> +fs_god::vfail(const char *format, va_list va)
> {
> char *msg;
>
> @@ -836,7 +836,7 @@ fs_visitor::vfail(const char *format, va_list va)
> }
>
> void
> -fs_visitor::fail(const char *format, ...)
> +fs_god::fail(const char *format, ...)
> {
> va_list va;
>
> @@ -855,7 +855,7 @@ fs_visitor::fail(const char *format, ...)
> * During a SIMD16 compile (if one happens anyway), this just calls fail().
> */
> void
> -fs_visitor::no16(const char *format, ...)
> +fs_god::no16(const char *format, ...)
> {
> va_list va;
>
> @@ -878,39 +878,39 @@ fs_visitor::no16(const char *format, ...)
> }
>
> fs_inst *
> -fs_visitor::emit(enum opcode opcode)
> +fs_god::emit(enum opcode opcode)
> {
> return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
> }
>
> fs_inst *
> -fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
> +fs_god::emit(enum opcode opcode, const fs_reg &dst)
> {
> return emit(new(mem_ctx) fs_inst(opcode, dst));
> }
>
> fs_inst *
> -fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
> +fs_god::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
> {
> return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
> }
>
> fs_inst *
> -fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
> +fs_god::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
> const fs_reg &src1)
> {
> return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
> }
>
> fs_inst *
> -fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
> +fs_god::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
> const fs_reg &src1, const fs_reg &src2)
> {
> return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
> }
>
> fs_inst *
> -fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
> +fs_god::emit(enum opcode opcode, const fs_reg &dst,
> fs_reg src[], int sources)
> {
> return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
> @@ -991,7 +991,7 @@ fs_inst::writes_flag() const
> * instruction -- the FS opcodes often generate MOVs in addition.
> */
> int
> -fs_visitor::implied_mrf_writes(fs_inst *inst)
> +fs_god::implied_mrf_writes(fs_inst *inst)
> {
> if (inst->mlen == 0)
> return 0;
> @@ -1047,7 +1047,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
> }
>
> fs_reg
> -fs_visitor::vgrf(const glsl_type *const type)
> +fs_god::vgrf(const glsl_type *const type)
> {
> int reg_width = dispatch_width / 8;
> return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
> @@ -1055,7 +1055,7 @@ fs_visitor::vgrf(const glsl_type *const type)
> }
>
> fs_reg
> -fs_visitor::vgrf(int num_components)
> +fs_god::vgrf(int num_components)
> {
> int reg_width = dispatch_width / 8;
> return fs_reg(GRF, alloc.allocate(num_components * reg_width),
> @@ -1108,7 +1108,7 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
> }
>
> fs_reg *
> -fs_visitor::variable_storage(ir_variable *var)
> +fs_god::variable_storage(ir_variable *var)
> {
> return (fs_reg *)hash_table_find(this->variable_ht, var);
> }
> @@ -1131,7 +1131,7 @@ import_uniforms_callback(const void *key,
> * This brings in those uniform definitions
> */
> void
> -fs_visitor::import_uniforms(fs_visitor *v)
> +fs_god::import_uniforms(fs_god *v)
> {
> hash_table_call_foreach(v->variable_ht,
> import_uniforms_callback,
> @@ -1148,7 +1148,7 @@ fs_visitor::import_uniforms(fs_visitor *v)
> * store.
> */
> void
> -fs_visitor::setup_uniform_values(ir_variable *ir)
> +fs_god::setup_uniform_values(ir_variable *ir)
> {
> int namelen = strlen(ir->name);
>
> @@ -1189,7 +1189,7 @@ fs_visitor::setup_uniform_values(ir_variable *ir)
> * automatically updated from GL context state.
> */
> void
> -fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
> +fs_god::setup_builtin_uniform_values(ir_variable *ir)
> {
> const ir_state_slot *const slots = ir->get_state_slots();
> assert(slots != NULL);
> @@ -1219,7 +1219,7 @@ fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
> }
>
> fs_reg *
> -fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
> +fs_god::emit_fragcoord_interpolation(bool pixel_center_integer,
> bool origin_upper_left)
> {
> assert(stage == MESA_SHADER_FRAGMENT);
> @@ -1270,7 +1270,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
> }
>
> fs_inst *
> -fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
> +fs_god::emit_linterp(const fs_reg &attr, const fs_reg &interp,
> glsl_interp_qualifier interpolation_mode,
> bool is_centroid, bool is_sample)
> {
> @@ -1305,7 +1305,7 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
> }
>
> void
> -fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
> +fs_god::emit_general_interpolation(fs_reg attr, const char *name,
> const glsl_type *type,
> glsl_interp_qualifier interpolation_mode,
> int location, bool mod_centroid,
> @@ -1408,7 +1408,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
> }
>
> fs_reg *
> -fs_visitor::emit_frontfacing_interpolation()
> +fs_god::emit_frontfacing_interpolation()
> {
> fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
>
> @@ -1449,7 +1449,7 @@ fs_visitor::emit_frontfacing_interpolation()
> }
>
> void
> -fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
> +fs_god::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
> {
> assert(stage == MESA_SHADER_FRAGMENT);
> brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> @@ -1472,7 +1472,7 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
> }
>
> fs_reg *
> -fs_visitor::emit_samplepos_setup()
> +fs_god::emit_samplepos_setup()
> {
> assert(brw->gen >= 6);
>
> @@ -1521,7 +1521,7 @@ fs_visitor::emit_samplepos_setup()
> }
>
> fs_reg *
> -fs_visitor::emit_sampleid_setup()
> +fs_god::emit_sampleid_setup()
> {
> assert(stage == MESA_SHADER_FRAGMENT);
> brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> @@ -1580,7 +1580,7 @@ fs_visitor::emit_sampleid_setup()
> }
>
> void
> -fs_visitor::resolve_source_modifiers(fs_reg *src)
> +fs_god::resolve_source_modifiers(fs_reg *src)
> {
> if (!src->abs && !src->negate)
> return;
> @@ -1591,7 +1591,7 @@ fs_visitor::resolve_source_modifiers(fs_reg *src)
> }
>
> fs_reg
> -fs_visitor::fix_math_operand(fs_reg src)
> +fs_god::fix_math_operand(fs_reg src)
> {
> /* Can't do hstride == 0 args on gen6 math, so expand it out. We
> * might be able to do better by doing execsize = 1 math and then
> @@ -1618,7 +1618,7 @@ fs_visitor::fix_math_operand(fs_reg src)
> }
>
> fs_inst *
> -fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
> +fs_god::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
> {
> switch (opcode) {
> case SHADER_OPCODE_RCP:
> @@ -1655,7 +1655,7 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
> }
>
> fs_inst *
> -fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
> +fs_god::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
> {
> int base_mrf = 2;
> fs_inst *inst;
> @@ -1691,7 +1691,7 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
> }
>
> void
> -fs_visitor::emit_discard_jump()
> +fs_god::emit_discard_jump()
> {
> /* For performance, after a discard, jump to the end of the
> * shader if all relevant channels have been discarded.
> @@ -1706,7 +1706,7 @@ fs_visitor::emit_discard_jump()
> }
>
> void
> -fs_visitor::assign_curb_setup()
> +fs_god::assign_curb_setup()
> {
> if (dispatch_width == 8) {
> prog_data->dispatch_grf_start_reg = payload.num_regs;
> @@ -1749,7 +1749,7 @@ fs_visitor::assign_curb_setup()
> }
>
> void
> -fs_visitor::calculate_urb_setup()
> +fs_god::calculate_urb_setup()
> {
> assert(stage == MESA_SHADER_FRAGMENT);
> brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> @@ -1837,7 +1837,7 @@ fs_visitor::calculate_urb_setup()
> }
>
> void
> -fs_visitor::assign_urb_setup()
> +fs_god::assign_urb_setup()
> {
> assert(stage == MESA_SHADER_FRAGMENT);
> brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> @@ -1865,7 +1865,7 @@ fs_visitor::assign_urb_setup()
> }
>
> void
> -fs_visitor::assign_vs_urb_setup()
> +fs_god::assign_vs_urb_setup()
> {
> brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
> int grf, count, slot, channel, attr;
> @@ -1938,7 +1938,7 @@ fs_visitor::assign_vs_urb_setup()
> * live intervals and better dead code elimination and coalescing.
> */
> void
> -fs_visitor::split_virtual_grfs()
> +fs_god::split_virtual_grfs()
> {
> int num_vars = this->alloc.count;
>
> @@ -2069,7 +2069,7 @@ fs_visitor::split_virtual_grfs()
> * overhead.
> */
> bool
> -fs_visitor::compact_virtual_grfs()
> +fs_god::compact_virtual_grfs()
> {
> bool progress = false;
> int remap_table[this->alloc.count];
> @@ -2154,7 +2154,7 @@ fs_visitor::compact_virtual_grfs()
> * uniform array access out to a pull constant buffer.
> */
> void
> -fs_visitor::move_uniform_array_access_to_pull_constants()
> +fs_god::move_uniform_array_access_to_pull_constants()
> {
> if (dispatch_width != 8)
> return;
> @@ -2204,7 +2204,7 @@ fs_visitor::move_uniform_array_access_to_pull_constants()
> * update the program to load them.
> */
> void
> -fs_visitor::assign_constant_locations()
> +fs_god::assign_constant_locations()
> {
> /* Only the first compile (SIMD8 mode) gets to decide on locations. */
> if (dispatch_width != 8)
> @@ -2286,7 +2286,7 @@ fs_visitor::assign_constant_locations()
> * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
> */
> void
> -fs_visitor::demote_pull_constants()
> +fs_god::demote_pull_constants()
> {
> foreach_block_and_inst (block, fs_inst, inst, cfg) {
> for (int i = 0; i < inst->sources; i++) {
> @@ -2338,7 +2338,7 @@ fs_visitor::demote_pull_constants()
> }
>
> bool
> -fs_visitor::opt_algebraic()
> +fs_god::opt_algebraic()
> {
> bool progress = false;
>
> @@ -2548,7 +2548,7 @@ fs_visitor::opt_algebraic()
> }
>
> bool
> -fs_visitor::opt_register_renaming()
> +fs_god::opt_register_renaming()
> {
> bool progress = false;
> int depth = 0;
> @@ -2623,7 +2623,7 @@ fs_visitor::opt_register_renaming()
> * placeholder-halt
> */
> bool
> -fs_visitor::opt_redundant_discard_jumps()
> +fs_god::opt_redundant_discard_jumps()
> {
> bool progress = false;
>
> @@ -2655,7 +2655,7 @@ fs_visitor::opt_redundant_discard_jumps()
> }
>
> bool
> -fs_visitor::compute_to_mrf()
> +fs_god::compute_to_mrf()
> {
> bool progress = false;
> int next_ip = 0;
> @@ -2819,7 +2819,7 @@ fs_visitor::compute_to_mrf()
> * instructions to FS_OPCODE_REP_FB_WRITE.
> */
> void
> -fs_visitor::emit_repclear_shader()
> +fs_god::emit_repclear_shader()
> {
> brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> int base_mrf = 1;
> @@ -2865,7 +2865,7 @@ fs_visitor::emit_repclear_shader()
> * removing the later ones.
> */
> bool
> -fs_visitor::remove_duplicate_mrf_writes()
> +fs_god::remove_duplicate_mrf_writes()
> {
> fs_inst *last_mrf_move[16];
> bool progress = false;
> @@ -2970,7 +2970,7 @@ clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
> * same time that both consider ‘r3’ as the target of their final writes.
> */
> void
> -fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
> +fs_god::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
> fs_inst *inst)
> {
> int write_len = inst->regs_written;
> @@ -3042,7 +3042,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
> * instruction with a different destination register.
> */
> void
> -fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
> +fs_god::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
> {
> int write_len = inst->regs_written;
> int first_write_grf = inst->dst.reg;
> @@ -3091,7 +3091,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
> }
>
> void
> -fs_visitor::insert_gen4_send_dependency_workarounds()
> +fs_god::insert_gen4_send_dependency_workarounds()
> {
> if (brw->gen != 4 || brw->is_g4x)
> return;
> @@ -3131,7 +3131,7 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
> * source operand for all 8 or 16 of its channels.
> */
> void
> -fs_visitor::lower_uniform_pull_constant_loads()
> +fs_god::lower_uniform_pull_constant_loads()
> {
> foreach_block_and_inst (block, fs_inst, inst, cfg) {
> if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
> @@ -3189,7 +3189,7 @@ fs_visitor::lower_uniform_pull_constant_loads()
> }
>
> bool
> -fs_visitor::lower_load_payload()
> +fs_god::lower_load_payload()
> {
> bool progress = false;
>
> @@ -3295,13 +3295,13 @@ fs_visitor::lower_load_payload()
> }
>
> void
> -fs_visitor::dump_instructions()
> +fs_god::dump_instructions()
> {
> dump_instructions(NULL);
> }
>
> void
> -fs_visitor::dump_instructions(const char *name)
> +fs_god::dump_instructions(const char *name)
> {
> FILE *file = stderr;
> if (name && geteuid() != 0) {
> @@ -3334,13 +3334,13 @@ fs_visitor::dump_instructions(const char *name)
> }
>
> void
> -fs_visitor::dump_instruction(backend_instruction *be_inst)
> +fs_god::dump_instruction(backend_instruction *be_inst)
> {
> dump_instruction(be_inst, stderr);
> }
>
> void
> -fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
> +fs_god::dump_instruction(backend_instruction *be_inst, FILE *file)
> {
> fs_inst *inst = (fs_inst *)be_inst;
>
> @@ -3552,7 +3552,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
> * only reg -- it might be the size=4 destination of a texture instruction.
> */
> fs_inst *
> -fs_visitor::get_instruction_generating_reg(fs_inst *start,
> +fs_god::get_instruction_generating_reg(fs_inst *start,
> fs_inst *end,
> const fs_reg &reg)
> {
> @@ -3567,7 +3567,7 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
> }
>
> void
> -fs_visitor::setup_payload_gen6()
> +fs_god::setup_payload_gen6()
> {
> bool uses_depth =
> (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
> @@ -3648,14 +3648,14 @@ fs_visitor::setup_payload_gen6()
> }
>
> void
> -fs_visitor::setup_vs_payload()
> +fs_god::setup_vs_payload()
> {
> /* R0: thread header, R1: urb handles */
> payload.num_regs = 2;
> }
>
> void
> -fs_visitor::assign_binding_table_offsets()
> +fs_god::assign_binding_table_offsets()
> {
> assert(stage == MESA_SHADER_FRAGMENT);
> brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> @@ -3672,7 +3672,7 @@ fs_visitor::assign_binding_table_offsets()
> }
>
> void
> -fs_visitor::calculate_register_pressure()
> +fs_god::calculate_register_pressure()
> {
> invalidate_live_intervals();
> calculate_live_intervals();
> @@ -3690,7 +3690,7 @@ fs_visitor::calculate_register_pressure()
> }
>
> void
> -fs_visitor::optimize()
> +fs_god::optimize()
> {
> const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
>
> @@ -3709,7 +3709,7 @@ fs_visitor::optimize()
> snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
> stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
> \
> - backend_visitor::dump_instructions(filename); \
> + backend_god::dump_instructions(filename); \
> } \
> \
> progress = progress || this_progress; \
> @@ -3721,7 +3721,7 @@ fs_visitor::optimize()
> snprintf(filename, 64, "%s%d-%04d-00-start",
> stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
>
> - backend_visitor::dump_instructions(filename);
> + backend_god::dump_instructions(filename);
> }
>
> bool progress;
> @@ -3770,7 +3770,7 @@ fs_visitor::optimize()
> * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
> */
> void
> -fs_visitor::fixup_3src_null_dest()
> +fs_god::fixup_3src_null_dest()
> {
> foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
> if (inst->is_3src() && inst->dst.is_null()) {
> @@ -3781,7 +3781,7 @@ fs_visitor::fixup_3src_null_dest()
> }
>
> void
> -fs_visitor::allocate_registers()
> +fs_god::allocate_registers()
> {
> bool allocated_without_spills;
>
> @@ -3851,7 +3851,7 @@ fs_visitor::allocate_registers()
> }
>
> bool
> -fs_visitor::run_vs()
> +fs_god::run_vs()
> {
> assert(stage == MESA_SHADER_VERTEX);
>
> @@ -3891,7 +3891,7 @@ fs_visitor::run_vs()
> }
>
> bool
> -fs_visitor::run_fs()
> +fs_god::run_fs()
> {
> brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
> brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
> @@ -4015,7 +4015,7 @@ brw_wm_fs_emit(struct brw_context *brw,
>
> /* Now the main event: Visit the shader IR and generate our FS IR for it.
> */
> - fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
> + fs_god v(brw, mem_ctx, key, prog_data, prog, fp, 8);
> if (!v.run_fs()) {
> if (prog) {
> prog->LinkStatus = false;
> @@ -4029,7 +4029,7 @@ brw_wm_fs_emit(struct brw_context *brw,
> }
>
> cfg_t *simd16_cfg = NULL;
> - fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
> + fs_god v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
> if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
> brw->use_rep_send)) {
> if (!v.simd16_unsupported) {
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index 278a8ee..ff1a8b8 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -66,14 +66,14 @@ namespace brw {
> *
> * Translates either GLSL IR or Mesa IR (for ARB_fragment_program) into FS IR.
> */
> -class fs_visitor : public backend_visitor
> +class fs_god : public backend_god
> {
> public:
> const fs_reg reg_null_f;
> const fs_reg reg_null_d;
> const fs_reg reg_null_ud;
>
> - fs_visitor(struct brw_context *brw,
> + fs_god(struct brw_context *brw,
> void *mem_ctx,
> const struct brw_wm_prog_key *key,
> struct brw_wm_prog_data *prog_data,
> @@ -81,7 +81,7 @@ public:
> struct gl_fragment_program *fp,
> unsigned dispatch_width);
>
> - fs_visitor(struct brw_context *brw,
> + fs_god(struct brw_context *brw,
> void *mem_ctx,
> const struct brw_vs_prog_key *key,
> struct brw_vs_prog_data *prog_data,
> @@ -89,13 +89,13 @@ public:
> struct gl_vertex_program *cp,
> unsigned dispatch_width);
>
> - ~fs_visitor();
> + ~fs_god();
> void init();
>
> fs_reg *variable_storage(ir_variable *var);
> fs_reg vgrf(const glsl_type *const type);
> fs_reg vgrf(int num_components);
> - void import_uniforms(fs_visitor *v);
> + void import_uniforms(fs_god *v);
> void setup_uniform_clipplane_values();
> void compute_clip_distance();
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
> index 798fef3..db62f51 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
> @@ -67,7 +67,7 @@ opt_cmod_propagation_local(bblock_t *block)
> continue;
>
> /* Only an AND.NZ can be propagated. Many AND.Z instructions are
> - * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
> + * generated (for ir_unop_not in fs_god::emit_bool_to_cond_code).
> * Propagating those would require inverting the condition on the CMP.
> * This changes both the flag value and the register destination of the
> * CMP. That result may be used elsewhere, so we can't change its value
> @@ -153,7 +153,7 @@ opt_cmod_propagation_local(bblock_t *block)
> }
>
> bool
> -fs_visitor::opt_cmod_propagation()
> +fs_god::opt_cmod_propagation()
> {
> bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
> index ebde8df..5b84b34 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
> @@ -188,7 +188,7 @@ compare(const void *_a, const void *_b)
> }
>
> bool
> -fs_visitor::opt_combine_constants()
> +fs_god::opt_combine_constants()
> {
> void *const_ctx = ralloc_context(NULL);
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
> index 764741d..b059849 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
> @@ -276,7 +276,7 @@ is_logic_op(enum opcode opcode)
> }
>
> bool
> -fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
> +fs_god::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
> {
> if (inst->src[arg].file != GRF)
> return false;
> @@ -422,7 +422,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
>
>
> bool
> -fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
> +fs_god::try_constant_propagate(fs_inst *inst, acp_entry *entry)
> {
> bool progress = false;
>
> @@ -608,7 +608,7 @@ can_propagate_from(fs_inst *inst)
> * list.
> */
> bool
> -fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
> +fs_god::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
> exec_list *acp)
> {
> bool progress = false;
> @@ -687,7 +687,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
> }
>
> bool
> -fs_visitor::opt_copy_propagate()
> +fs_god::opt_copy_propagate()
> {
> bool progress = false;
> void *copy_prop_ctx = ralloc_context(NULL);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> index ca5b32f..ba4dbde 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> @@ -155,7 +155,7 @@ instructions_match(fs_inst *a, fs_inst *b)
> }
>
> bool
> -fs_visitor::opt_cse_local(bblock_t *block)
> +fs_god::opt_cse_local(bblock_t *block)
> {
> bool progress = false;
> exec_list aeb;
> @@ -300,7 +300,7 @@ fs_visitor::opt_cse_local(bblock_t *block)
> }
>
> bool
> -fs_visitor::opt_cse()
> +fs_god::opt_cse()
> {
> bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
> index 4b5548a..669257b 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
> @@ -35,7 +35,7 @@
> */
>
> bool
> -fs_visitor::dead_code_eliminate()
> +fs_god::dead_code_eliminate()
> {
> bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> index c4064da..0a10e74 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> @@ -31,7 +31,7 @@
> #include "brw_fs.h"
>
> void
> -fs_visitor::emit_fp_alu1(enum opcode opcode,
> +fs_god::emit_fp_alu1(enum opcode opcode,
> const struct prog_instruction *fpi,
> fs_reg dst, fs_reg src)
> {
> @@ -42,7 +42,7 @@ fs_visitor::emit_fp_alu1(enum opcode opcode,
> }
>
> void
> -fs_visitor::emit_fp_alu2(enum opcode opcode,
> +fs_god::emit_fp_alu2(enum opcode opcode,
> const struct prog_instruction *fpi,
> fs_reg dst, fs_reg src0, fs_reg src1)
> {
> @@ -54,7 +54,7 @@ fs_visitor::emit_fp_alu2(enum opcode opcode,
> }
>
> void
> -fs_visitor::emit_fp_minmax(const prog_instruction *fpi,
> +fs_god::emit_fp_minmax(const prog_instruction *fpi,
> fs_reg dst, fs_reg src0, fs_reg src1)
> {
> enum brw_conditional_mod conditionalmod;
> @@ -72,7 +72,7 @@ fs_visitor::emit_fp_minmax(const prog_instruction *fpi,
> }
>
> void
> -fs_visitor::emit_fp_sop(enum brw_conditional_mod conditional_mod,
> +fs_god::emit_fp_sop(enum brw_conditional_mod conditional_mod,
> const struct prog_instruction *fpi,
> fs_reg dst, fs_reg src0, fs_reg src1,
> fs_reg one)
> @@ -91,7 +91,7 @@ fs_visitor::emit_fp_sop(enum brw_conditional_mod conditional_mod,
> }
>
> void
> -fs_visitor::emit_fp_scalar_write(const struct prog_instruction *fpi,
> +fs_god::emit_fp_scalar_write(const struct prog_instruction *fpi,
> fs_reg dst, fs_reg src)
> {
> for (int i = 0; i < 4; i++) {
> @@ -101,7 +101,7 @@ fs_visitor::emit_fp_scalar_write(const struct prog_instruction *fpi,
> }
>
> void
> -fs_visitor::emit_fp_scalar_math(enum opcode opcode,
> +fs_god::emit_fp_scalar_math(enum opcode opcode,
> const struct prog_instruction *fpi,
> fs_reg dst, fs_reg src)
> {
> @@ -111,7 +111,7 @@ fs_visitor::emit_fp_scalar_math(enum opcode opcode,
> }
>
> void
> -fs_visitor::emit_fragment_program_code()
> +fs_god::emit_fragment_program_code()
> {
> setup_fp_regs();
>
> @@ -552,7 +552,7 @@ fs_visitor::emit_fragment_program_code()
> }
>
> void
> -fs_visitor::setup_fp_regs()
> +fs_god::setup_fp_regs()
> {
> /* PROGRAM_TEMPORARY */
> int num_temp = prog->NumTemporaries;
> @@ -612,7 +612,7 @@ fs_visitor::setup_fp_regs()
> }
>
> fs_reg
> -fs_visitor::get_fp_dst_reg(const prog_dst_register *dst)
> +fs_god::get_fp_dst_reg(const prog_dst_register *dst)
> {
> assert(stage == MESA_SHADER_FRAGMENT);
> brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> @@ -660,7 +660,7 @@ fs_visitor::get_fp_dst_reg(const prog_dst_register *dst)
> }
>
> fs_reg
> -fs_visitor::get_fp_src_reg(const prog_src_register *src)
> +fs_god::get_fp_src_reg(const prog_src_register *src)
> {
> struct gl_program_parameter_list *plist = prog->Parameters;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_god.cpp b/src/mesa/drivers/dri/i965/brw_fs_god.cpp
> new file mode 100644
> index 0000000..e3d8b3a
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_fs_god.cpp
> @@ -0,0 +1,4157 @@
> +/*
> + * Copyright © 2010 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +/** @file brw_fs_god.cpp
> + *
> + * This file supports generating the FS LIR from the GLSL IR. The LIR
> + * makes it easier to do backend-specific optimizations than doing so
> + * in the GLSL IR or in the native code.
> + */
> +#include <sys/types.h>
> +
> +#include "main/macros.h"
> +#include "main/shaderobj.h"
> +#include "program/prog_parameter.h"
> +#include "program/prog_print.h"
> +#include "program/prog_optimize.h"
> +#include "util/register_allocate.h"
> +#include "program/hash_table.h"
> +#include "brw_context.h"
> +#include "brw_eu.h"
> +#include "brw_wm.h"
> +#include "brw_vec4.h"
> +#include "brw_fs.h"
> +#include "main/uniforms.h"
> +#include "glsl/glsl_types.h"
> +#include "glsl/ir_optimization.h"
> +#include "program/sampler.h"
> +
> +
> +fs_reg *
> +fs_god::emit_vs_system_value(int location)
> +{
> + fs_reg *reg = new(this->mem_ctx)
> + fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
> + brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
> +
> + switch (location) {
> + case SYSTEM_VALUE_BASE_VERTEX:
> + reg->reg_offset = 0;
> + vs_prog_data->uses_vertexid = true;
> + break;
> + case SYSTEM_VALUE_VERTEX_ID:
> + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> + reg->reg_offset = 2;
> + vs_prog_data->uses_vertexid = true;
> + break;
> + case SYSTEM_VALUE_INSTANCE_ID:
> + reg->reg_offset = 3;
> + vs_prog_data->uses_instanceid = true;
> + break;
> + default:
> + unreachable("not reached");
> + }
> +
> + return reg;
> +}
> +
> +void
> +fs_god::visit(ir_variable *ir)
> +{
> + fs_reg *reg = NULL;
> +
> + if (variable_storage(ir))
> + return;
> +
> + if (ir->data.mode == ir_var_shader_in) {
> + assert(ir->data.location != -1);
> + if (stage == MESA_SHADER_VERTEX) {
> + reg = new(this->mem_ctx)
> + fs_reg(ATTR, ir->data.location,
> + brw_type_for_base_type(ir->type->get_scalar_type()));
> + } else if (ir->data.location == VARYING_SLOT_POS) {
> + reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
> + ir->data.origin_upper_left);
> + } else if (ir->data.location == VARYING_SLOT_FACE) {
> + reg = emit_frontfacing_interpolation();
> + } else {
> + reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> + emit_general_interpolation(*reg, ir->name, ir->type,
> + (glsl_interp_qualifier) ir->data.interpolation,
> + ir->data.location, ir->data.centroid,
> + ir->data.sample);
> + }
> + assert(reg);
> + hash_table_insert(this->variable_ht, reg, ir);
> + return;
> + } else if (ir->data.mode == ir_var_shader_out) {
> + reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> +
> + if (stage == MESA_SHADER_VERTEX) {
> + int vector_elements =
> + ir->type->is_array() ? ir->type->fields.array->vector_elements
> + : ir->type->vector_elements;
> +
> + for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
> + int output = ir->data.location + i;
> + this->outputs[output] = *reg;
> + this->outputs[output].reg_offset = i * 4;
> + this->output_components[output] = vector_elements;
> + }
> +
> + } else if (ir->data.index > 0) {
> + assert(ir->data.location == FRAG_RESULT_DATA0);
> + assert(ir->data.index == 1);
> + this->dual_src_output = *reg;
> + this->do_dual_src = true;
> + } else if (ir->data.location == FRAG_RESULT_COLOR) {
> + /* Writing gl_FragColor outputs to all color regions. */
> + assert(stage == MESA_SHADER_FRAGMENT);
> + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> + for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
> + this->outputs[i] = *reg;
> + this->output_components[i] = 4;
> + }
> + } else if (ir->data.location == FRAG_RESULT_DEPTH) {
> + this->frag_depth = *reg;
> + } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
> + this->sample_mask = *reg;
> + } else {
> + /* gl_FragData or a user-defined FS output */
> + assert(ir->data.location >= FRAG_RESULT_DATA0 &&
> + ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
> +
> + int vector_elements =
> + ir->type->is_array() ? ir->type->fields.array->vector_elements
> + : ir->type->vector_elements;
> +
> + /* General color output. */
> + for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
> + int output = ir->data.location - FRAG_RESULT_DATA0 + i;
> + this->outputs[output] = offset(*reg, vector_elements * i);
> + this->output_components[output] = vector_elements;
> + }
> + }
> + } else if (ir->data.mode == ir_var_uniform) {
> + int param_index = uniforms;
> +
> + /* Thanks to the lower_ubo_reference pass, we will see only
> + * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
> + * variables, so no need for them to be in variable_ht.
> + *
> + * Some uniforms, such as samplers and atomic counters, have no actual
> + * storage, so we should ignore them.
> + */
> + if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
> + return;
> +
> + if (dispatch_width == 16) {
> + if (!variable_storage(ir)) {
> + fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
> + }
> + return;
> + }
> +
> + param_size[param_index] = type_size(ir->type);
> + if (!strncmp(ir->name, "gl_", 3)) {
> + setup_builtin_uniform_values(ir);
> + } else {
> + setup_uniform_values(ir);
> + }
> +
> + reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
> + reg->type = brw_type_for_base_type(ir->type);
> +
> + } else if (ir->data.mode == ir_var_system_value) {
> + switch (ir->data.location) {
> + case SYSTEM_VALUE_BASE_VERTEX:
> + case SYSTEM_VALUE_VERTEX_ID:
> + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> + case SYSTEM_VALUE_INSTANCE_ID:
> + reg = emit_vs_system_value(ir->data.location);
> + break;
> + case SYSTEM_VALUE_SAMPLE_POS:
> + reg = emit_samplepos_setup();
> + break;
> + case SYSTEM_VALUE_SAMPLE_ID:
> + reg = emit_sampleid_setup();
> + break;
> + case SYSTEM_VALUE_SAMPLE_MASK_IN:
> + assert(brw->gen >= 7);
> + reg = new(mem_ctx)
> + fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
> + BRW_REGISTER_TYPE_D));
> + break;
> + }
> + }
> +
> + if (!reg)
> + reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> +
> + hash_table_insert(this->variable_ht, reg, ir);
> +}
> +
> +void
> +fs_god::visit(ir_dereference_variable *ir)
> +{
> + fs_reg *reg = variable_storage(ir->var);
> +
> + if (!reg) {
> + fail("Failed to find variable storage for %s\n", ir->var->name);
> + this->result = fs_reg(reg_null_d);
> + return;
> + }
> + this->result = *reg;
> +}
> +
> +void
> +fs_god::visit(ir_dereference_record *ir)
> +{
> + const glsl_type *struct_type = ir->record->type;
> +
> + ir->record->accept(this);
> +
> + unsigned int off = 0;
> + for (unsigned int i = 0; i < struct_type->length; i++) {
> + if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
> + break;
> + off += type_size(struct_type->fields.structure[i].type);
> + }
> + this->result = offset(this->result, off);
> + this->result.type = brw_type_for_base_type(ir->type);
> +}
> +
> +void
> +fs_god::visit(ir_dereference_array *ir)
> +{
> + ir_constant *constant_index;
> + fs_reg src;
> + int element_size = type_size(ir->type);
> +
> + constant_index = ir->array_index->as_constant();
> +
> + ir->array->accept(this);
> + src = this->result;
> + src.type = brw_type_for_base_type(ir->type);
> +
> + if (constant_index) {
> + if (src.file == ATTR) {
> + /* Attribute arrays get loaded as one vec4 per element. In that case
> + * offset the source register.
> + */
> + src.reg += constant_index->value.i[0];
> + } else {
> + assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
> + src = offset(src, constant_index->value.i[0] * element_size);
> + }
> + } else {
> + /* Variable index array dereference. We attach the variable index
> + * component to the reg as a pointer to a register containing the
> + * offset. Currently only uniform arrays are supported in this patch,
> + * and that reladdr pointer is resolved by
> + * move_uniform_array_access_to_pull_constants(). All other array types
> + * are lowered by lower_variable_index_to_cond_assign().
> + */
> + ir->array_index->accept(this);
> +
> + fs_reg index_reg;
> + index_reg = vgrf(glsl_type::int_type);
> + emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
> +
> + if (src.reladdr) {
> + emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
> + }
> +
> + src.reladdr = ralloc(mem_ctx, fs_reg);
> + memcpy(src.reladdr, &index_reg, sizeof(index_reg));
> + }
> + this->result = src;
> +}
> +
> +fs_inst *
> +fs_god::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
> + const fs_reg &a)
> +{
> + if (brw->gen < 6) {
> + /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
> + fs_reg y_times_a = vgrf(glsl_type::float_type);
> + fs_reg one_minus_a = vgrf(glsl_type::float_type);
> + fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
> +
> + emit(MUL(y_times_a, y, a));
> +
> + fs_reg negative_a = a;
> + negative_a.negate = !a.negate;
> + emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
> + emit(MUL(x_times_one_minus_a, x, one_minus_a));
> +
> + return emit(ADD(dst, x_times_one_minus_a, y_times_a));
> + } else {
> + /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
> + * we need to reorder the operands.
> + */
> + return emit(LRP(dst, a, y, x));
> + }
> +}
> +
> +void
> +fs_god::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
> + const fs_reg &src0, const fs_reg &src1)
> +{
> + assert(conditionalmod == BRW_CONDITIONAL_GE ||
> + conditionalmod == BRW_CONDITIONAL_L);
> +
> + fs_inst *inst;
> +
> + if (brw->gen >= 6) {
> + inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> + inst->conditional_mod = conditionalmod;
> + } else {
> + emit(CMP(reg_null_d, src0, src1, conditionalmod));
> +
> + inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + }
> +}
> +
> +bool
> +fs_god::try_emit_saturate(ir_expression *ir)
> +{
> + if (ir->operation != ir_unop_saturate)
> + return false;
> +
> + ir_rvalue *sat_val = ir->operands[0];
> +
> + fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
> +
> + sat_val->accept(this);
> + fs_reg src = this->result;
> +
> + fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
> +
> + /* If the last instruction from our accept() generated our
> + * src, just set the saturate flag instead of emitting a separate mov.
> + */
> + fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
> + if (modify && modify->regs_written == modify->dst.width / 8 &&
> + modify->can_do_saturate()) {
> + modify->saturate = true;
> + this->result = src;
> + return true;
> + }
> +
> + return false;
> +}
> +
> +bool
> +fs_god::try_emit_line(ir_expression *ir)
> +{
> + /* LINE's src0 must be of type float. */
> + if (ir->type != glsl_type::float_type)
> + return false;
> +
> + ir_rvalue *nonmul = ir->operands[1];
> + ir_expression *mul = ir->operands[0]->as_expression();
> +
> + if (!mul || mul->operation != ir_binop_mul) {
> + nonmul = ir->operands[0];
> + mul = ir->operands[1]->as_expression();
> +
> + if (!mul || mul->operation != ir_binop_mul)
> + return false;
> + }
> +
> + ir_constant *const_add = nonmul->as_constant();
> + if (!const_add)
> + return false;
> +
> + int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
> + if (add_operand_vf == -1)
> + return false;
> +
> + ir_rvalue *non_const_mul = mul->operands[1];
> + ir_constant *const_mul = mul->operands[0]->as_constant();
> + if (!const_mul) {
> + const_mul = mul->operands[1]->as_constant();
> +
> + if (!const_mul)
> + return false;
> +
> + non_const_mul = mul->operands[0];
> + }
> +
> + int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
> + if (mul_operand_vf == -1)
> + return false;
> +
> + non_const_mul->accept(this);
> + fs_reg src1 = this->result;
> +
> + fs_reg src0 = vgrf(ir->type);
> + emit(BRW_OPCODE_MOV, src0,
> + fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));
> +
> + this->result = vgrf(ir->type);
> + emit(BRW_OPCODE_LINE, this->result, src0, src1);
> + return true;
> +}
> +
> +bool
> +fs_god::try_emit_mad(ir_expression *ir)
> +{
> + /* 3-src instructions were introduced in gen6. */
> + if (brw->gen < 6)
> + return false;
> +
> + /* MAD can only handle floating-point data. */
> + if (ir->type != glsl_type::float_type)
> + return false;
> +
> + ir_rvalue *nonmul;
> + ir_expression *mul;
> + bool mul_negate, mul_abs;
> +
> + for (int i = 0; i < 2; i++) {
> + mul_negate = false;
> + mul_abs = false;
> +
> + mul = ir->operands[i]->as_expression();
> + nonmul = ir->operands[1 - i];
> +
> + if (mul && mul->operation == ir_unop_abs) {
> + mul = mul->operands[0]->as_expression();
> + mul_abs = true;
> + } else if (mul && mul->operation == ir_unop_neg) {
> + mul = mul->operands[0]->as_expression();
> + mul_negate = true;
> + }
> +
> + if (mul && mul->operation == ir_binop_mul)
> + break;
> + }
> +
> + if (!mul || mul->operation != ir_binop_mul)
> + return false;
> +
> + nonmul->accept(this);
> + fs_reg src0 = this->result;
> +
> + mul->operands[0]->accept(this);
> + fs_reg src1 = this->result;
> + src1.negate ^= mul_negate;
> + src1.abs = mul_abs;
> + if (mul_abs)
> + src1.negate = false;
> +
> + mul->operands[1]->accept(this);
> + fs_reg src2 = this->result;
> + src2.abs = mul_abs;
> + if (mul_abs)
> + src2.negate = false;
> +
> + this->result = vgrf(ir->type);
> + emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
> +
> + return true;
> +}
> +
> +bool
> +fs_god::try_emit_b2f_of_comparison(ir_expression *ir)
> +{
> + /* On platforms that do not natively generate 0u and ~0u for Boolean
> + * results, b2f expressions that look like
> + *
> + * f = b2f(expr cmp 0)
> + *
> + * will generate better code by pretending the expression is
> + *
> + * f = ir_triop_csel(0.0, 1.0, expr cmp 0)
> + *
> + * This is because the last instruction of "expr" can generate the
> + * condition code for the "cmp 0". This avoids having to do the "-(b & 1)"
> + * trick to generate 0u or ~0u for the Boolean result. This means code like
> + *
> + * mov(16) g16<1>F 1F
> + * mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F
> + * (+f0) sel(16) m6<1>F g16<8,8,1>F 0F
> + *
> + * will be generated instead of
> + *
> + * mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F
> + * cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F
> + * and(16) g4<1>D g2<8,8,1>D 1D
> + * and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD
> + *
> + * When the comparison is either == 0.0 or != 0.0 using the knowledge that
> + * the true (or false) case already results in zero would allow better code
> + * generation by possibly avoiding a load-immediate instruction.
> + */
> + ir_expression *cmp = ir->operands[0]->as_expression();
> + if (cmp == NULL)
> + return false;
> +
> + if (cmp->operation == ir_binop_equal || cmp->operation == ir_binop_nequal) {
> + for (unsigned i = 0; i < 2; i++) {
> + ir_constant *c = cmp->operands[i]->as_constant();
> + if (c == NULL || !c->is_zero())
> + continue;
> +
> + ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
> + if (expr != NULL) {
> + fs_reg op[2];
> +
> + for (unsigned j = 0; j < 2; j++) {
> + cmp->operands[j]->accept(this);
> + op[j] = this->result;
> +
> + resolve_ud_negate(&op[j]);
> + }
> +
> + emit_bool_to_cond_code_of_reg(cmp, op);
> +
> + /* In this case we know when the condition is true, op[i ^ 1]
> + * contains zero. Invert the predicate, use op[i ^ 1] as src0,
> + * and immediate 1.0f as src1.
> + */
> + this->result = vgrf(ir->type);
> + op[i ^ 1].type = BRW_REGISTER_TYPE_F;
> +
> + fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + inst->predicate_inverse = cmp->operation == ir_binop_equal;
> + return true;
> + }
> + }
> + }
> +
> + emit_bool_to_cond_code(cmp);
> +
> + fs_reg temp = vgrf(ir->type);
> + emit(MOV(temp, fs_reg(1.0f)));
> +
> + this->result = vgrf(ir->type);
> + fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> +
> + return true;
> +}
> +
> +static int
> +pack_pixel_offset(float x)
> +{
> + /* Clamp upper end of the range to +7/16. See explanation in non-constant
> + * offset case below. */
> + int n = MIN2((int)(x * 16), 7);
> + return n & 0xf;
> +}
> +
> +void
> +fs_god::emit_interpolate_expression(ir_expression *ir)
> +{
> + /* in SIMD16 mode, the pixel interpolator returns coords interleaved
> + * 8 channels at a time, same as the barycentric coords presented in
> + * the FS payload. this requires a bit of extra work to support.
> + */
> + no16("interpolate_at_* not yet supported in SIMD16 mode.");
> +
> + assert(stage == MESA_SHADER_FRAGMENT);
> + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> +
> + ir_dereference * deref = ir->operands[0]->as_dereference();
> + ir_swizzle * swiz = NULL;
> + if (!deref) {
> + /* the api does not allow a swizzle here, but the varying packing code
> + * may have pushed one into here.
> + */
> + swiz = ir->operands[0]->as_swizzle();
> + assert(swiz);
> + deref = swiz->val->as_dereference();
> + }
> + assert(deref);
> + ir_variable * var = deref->variable_referenced();
> + assert(var);
> +
> + /* 1. collect interpolation factors */
> +
> + fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
> + fs_reg dst_y = offset(dst_x, 1);
> +
> + /* For most messages, we need one reg of ignored data; the hardware
> + * requires mlen==1 even when there is no payload. In the per-slot offset
> + * case, we'll replace this with the proper source data. */
> + fs_reg src = vgrf(glsl_type::float_type);
> + int mlen = 1; /* one reg unless overridden */
> + int reg_width = dispatch_width / 8;
> + fs_inst *inst;
> +
> + switch (ir->operation) {
> + case ir_unop_interpolate_at_centroid:
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
> + break;
> +
> + case ir_binop_interpolate_at_sample: {
> + ir_constant *sample_num = ir->operands[1]->as_constant();
> + assert(sample_num || !"nonconstant sample number should have been lowered.");
> +
> + unsigned msg_data = sample_num->value.i[0] << 4;
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
> + break;
> + }
> +
> + case ir_binop_interpolate_at_offset: {
> + ir_constant *const_offset = ir->operands[1]->as_constant();
> + if (const_offset) {
> + unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
> + (pack_pixel_offset(const_offset->value.f[1]) << 4);
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
> + fs_reg(msg_data));
> + } else {
> + /* Pack the operands: the HW wants offsets as 4-bit signed ints. */
> + ir->operands[1]->accept(this);
> + src = vgrf(glsl_type::ivec2_type);
> + fs_reg src2 = src;
> + for (int i = 0; i < 2; i++) {
> + fs_reg temp = vgrf(glsl_type::float_type);
> + emit(MUL(temp, this->result, fs_reg(16.0f)));
> + emit(MOV(src2, temp)); /* float to int */
> +
> + /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
> + * that we support a maximum offset of +0.5, which isn't representable
> + * in an S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
> + * which is the opposite of what the shader author wanted.
> + *
> + * This is legal due to ARB_gpu_shader5's quantization rules:
> + *
> + * "Not all values of <offset> may be supported; x and y offsets may
> + * be rounded to fixed-point values with the number of fraction bits
> + * given by the implementation-dependent constant
> + * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> + */
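> + /* E.g. an offset of +0.5 scales to 8, which a 4-bit signed field would
> + * read back as -8/16; the SEL below clamps it to 7 (+7/16) instead.
> + */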
> +
> + fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
> + inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
> +
> + src2 = offset(src2, 1);
> + this->result = offset(this->result, 1);
> + }
> +
> + mlen = 2 * reg_width;
> + inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
> + fs_reg(0u));
> + }
> + break;
> + }
> +
> + default:
> + unreachable("not reached");
> + }
> +
> + inst->mlen = mlen;
> + inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
> + inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
> + INTERP_QUALIFIER_NOPERSPECTIVE;
> +
> + /* 2. emit linterp */
> +
> + fs_reg res = vgrf(ir->type);
> + this->result = res;
> +
> + for (int i = 0; i < ir->type->vector_elements; i++) {
> + int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
> + emit(FS_OPCODE_LINTERP, res,
> + dst_x, dst_y,
> + fs_reg(interp_reg(var->data.location, ch)));
> + res = offset(res, 1);
> + }
> +}
> +
> +void
> +fs_god::visit(ir_expression *ir)
> +{
> + unsigned int operand;
> + fs_reg op[3], temp;
> + fs_inst *inst;
> + struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
> +
> + assert(ir->get_num_operands() <= 3);
> +
> + if (try_emit_saturate(ir))
> + return;
> +
> + /* Deal with the real oddball stuff first */
> + switch (ir->operation) {
> + case ir_binop_add:
> + if (brw->gen <= 5 && try_emit_line(ir))
> + return;
> + if (try_emit_mad(ir))
> + return;
> + break;
> +
> + case ir_triop_csel:
> + ir->operands[1]->accept(this);
> + op[1] = this->result;
> + ir->operands[2]->accept(this);
> + op[2] = this->result;
> +
> + emit_bool_to_cond_code(ir->operands[0]);
> +
> + this->result = vgrf(ir->type);
> + inst = emit(SEL(this->result, op[1], op[2]));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + return;
> +
> + case ir_unop_b2f:
> + if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir))
> + return;
> + break;
> +
> + case ir_unop_interpolate_at_centroid:
> + case ir_binop_interpolate_at_offset:
> + case ir_binop_interpolate_at_sample:
> + emit_interpolate_expression(ir);
> + return;
> +
> + default:
> + break;
> + }
> +
> + for (operand = 0; operand < ir->get_num_operands(); operand++) {
> + ir->operands[operand]->accept(this);
> + if (this->result.file == BAD_FILE) {
> + fail("Failed to get tree for expression operand:\n");
> + ir->operands[operand]->fprint(stderr);
> + fprintf(stderr, "\n");
> + }
> + assert(this->result.file == GRF ||
> + this->result.file == UNIFORM || this->result.file == ATTR);
> + op[operand] = this->result;
> +
> + /* Matrix expression operands should have been broken down to vector
> + * operations already.
> + */
> + assert(!ir->operands[operand]->type->is_matrix());
> + /* And then those vector operands should have been broken down to scalar.
> + */
> + assert(!ir->operands[operand]->type->is_vector());
> + }
> +
> + /* Storage for our result. If our result goes into an assignment, it will
> + * just get copy-propagated out, so no worries.
> + */
> + this->result = vgrf(ir->type);
> +
> + switch (ir->operation) {
> + case ir_unop_logic_not:
> + emit(NOT(this->result, op[0]));
> + break;
> + case ir_unop_neg:
> + op[0].negate = !op[0].negate;
> + emit(MOV(this->result, op[0]));
> + break;
> + case ir_unop_abs:
> + op[0].abs = true;
> + op[0].negate = false;
> + emit(MOV(this->result, op[0]));
> + break;
> + case ir_unop_sign:
> + if (ir->type->is_float()) {
> + /* AND(val, 0x80000000) gives the sign bit.
> + *
> + * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
> + * zero.
> + */
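> + /* E.g. val = -2.5f (0xC0200000): the AND leaves 0x80000000 and the
> + * predicated OR produces 0xBF800000 = -1.0f; for val = 0.0f the OR is
> + * skipped and the result stays 0.0f.
> + */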
> + emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> +
> + op[0].type = BRW_REGISTER_TYPE_UD;
> + this->result.type = BRW_REGISTER_TYPE_UD;
> + emit(AND(this->result, op[0], fs_reg(0x80000000u)));
> +
> + inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> +
> + this->result.type = BRW_REGISTER_TYPE_F;
> + } else {
> + /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
> + * -> non-negative val generates 0x00000000.
> + * Predicated OR sets 1 if val is positive.
> + */
> + emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
> +
> + emit(ASR(this->result, op[0], fs_reg(31)));
> +
> + inst = emit(OR(this->result, this->result, fs_reg(1)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + }
> + break;
> + case ir_unop_rcp:
> + emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
> + break;
> +
> + case ir_unop_exp2:
> + emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
> + break;
> + case ir_unop_log2:
> + emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
> + break;
> + case ir_unop_exp:
> + case ir_unop_log:
> + unreachable("not reached: should be handled by ir_explog_to_explog2");
> + case ir_unop_sin:
> + case ir_unop_sin_reduced:
> + emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
> + break;
> + case ir_unop_cos:
> + case ir_unop_cos_reduced:
> + emit_math(SHADER_OPCODE_COS, this->result, op[0]);
> + break;
> +
> + case ir_unop_dFdx:
> + /* Select one of the two opcodes based on the glHint value. */
> + if (fs_key->high_quality_derivatives)
> + emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
> + else
> + emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
> + break;
> +
> + case ir_unop_dFdx_coarse:
> + emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
> + break;
> +
> + case ir_unop_dFdx_fine:
> + emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
> + break;
> +
> + case ir_unop_dFdy:
> + /* Select one of the two opcodes based on the glHint value. */
> + if (fs_key->high_quality_derivatives)
> + emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
> + else
> + emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
> + break;
> +
> + case ir_unop_dFdy_coarse:
> + emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
> + break;
> +
> + case ir_unop_dFdy_fine:
> + emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
> + break;
> +
> + case ir_binop_add:
> + emit(ADD(this->result, op[0], op[1]));
> + break;
> + case ir_binop_sub:
> + unreachable("not reached: should be handled by ir_sub_to_add_neg");
> +
> + case ir_binop_mul:
> + if (brw->gen < 8 && ir->type->is_integer()) {
> + /* For integer multiplication, the MUL uses the low 16 bits
> + * of one of the operands (src0 on gen6, src1 on gen7). The
> + * MACH accumulates the contribution of the upper 16 bits
> + * of that operand.
> + */
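> + /* Roughly, on gen7:
> + * op0 * op1 == op0 * (op1 & 0xffff) + ((op0 * (op1 >> 16)) << 16).
> + * When neither operand fits in 16 bits, the MUL/MACH pair below
> + * computes the two terms through the accumulator.
> + */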
> + if (ir->operands[0]->is_uint16_constant()) {
> + if (brw->gen < 7)
> + emit(MUL(this->result, op[0], op[1]));
> + else
> + emit(MUL(this->result, op[1], op[0]));
> + } else if (ir->operands[1]->is_uint16_constant()) {
> + if (brw->gen < 7)
> + emit(MUL(this->result, op[1], op[0]));
> + else
> + emit(MUL(this->result, op[0], op[1]));
> + } else {
> + if (brw->gen >= 7)
> + no16("SIMD16 explicit accumulator operands unsupported\n");
> +
> + struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> + this->result.type);
> +
> + emit(MUL(acc, op[0], op[1]));
> + emit(MACH(reg_null_d, op[0], op[1]));
> + emit(MOV(this->result, fs_reg(acc)));
> + }
> + } else {
> + emit(MUL(this->result, op[0], op[1]));
> + }
> + break;
> + case ir_binop_imul_high: {
> + if (brw->gen == 7)
> + no16("SIMD16 explicit accumulator operands unsupported\n");
> +
> + struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> + this->result.type);
> +
> + fs_inst *mul = emit(MUL(acc, op[0], op[1]));
> + emit(MACH(this->result, op[0], op[1]));
> +
> + /* Until Gen8, integer multiplies read 32 bits from one source and
> + * 16 bits from the other, relying on the MACH instruction to
> + * generate the high bits of the result.
> + *
> + * On Gen8, the multiply instruction does a full 32x32-bit multiply,
> + * but in order to do a 64x64-bit multiply we have to simulate the
> + * previous behavior and then use a MACH instruction.
> + *
> + * FINISHME: Don't use source modifiers on src1.
> + */
> + if (brw->gen >= 8) {
> + assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
> + mul->src[1].type == BRW_REGISTER_TYPE_UD);
> + if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
> + mul->src[1].type = BRW_REGISTER_TYPE_W;
> + } else {
> + mul->src[1].type = BRW_REGISTER_TYPE_UW;
> + }
> + }
> +
> + break;
> + }
> + case ir_binop_div:
> + /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
> + assert(ir->type->is_integer());
> + emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
> + break;
> + case ir_binop_carry: {
> + if (brw->gen == 7)
> + no16("SIMD16 explicit accumulator operands unsupported\n");
> +
> + struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> + BRW_REGISTER_TYPE_UD);
> +
> + emit(ADDC(reg_null_ud, op[0], op[1]));
> + emit(MOV(this->result, fs_reg(acc)));
> + break;
> + }
> + case ir_binop_borrow: {
> + if (brw->gen == 7)
> + no16("SIMD16 explicit accumulator operands unsupported\n");
> +
> + struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> + BRW_REGISTER_TYPE_UD);
> +
> + emit(SUBB(reg_null_ud, op[0], op[1]));
> + emit(MOV(this->result, fs_reg(acc)));
> + break;
> + }
> + case ir_binop_mod:
> + /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
> + assert(ir->type->is_integer());
> + emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
> + break;
> +
> + case ir_binop_less:
> + case ir_binop_greater:
> + case ir_binop_lequal:
> + case ir_binop_gequal:
> + case ir_binop_equal:
> + case ir_binop_all_equal:
> + case ir_binop_nequal:
> + case ir_binop_any_nequal:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(ir->operands[0], &op[0]);
> + resolve_bool_comparison(ir->operands[1], &op[1]);
> + }
> +
> + emit(CMP(this->result, op[0], op[1],
> + brw_conditional_for_comparison(ir->operation)));
> + break;
> +
> + case ir_binop_logic_xor:
> + emit(XOR(this->result, op[0], op[1]));
> + break;
> +
> + case ir_binop_logic_or:
> + emit(OR(this->result, op[0], op[1]));
> + break;
> +
> + case ir_binop_logic_and:
> + emit(AND(this->result, op[0], op[1]));
> + break;
> +
> + case ir_binop_dot:
> + case ir_unop_any:
> + unreachable("not reached: should be handled by brw_fs_channel_expressions");
> +
> + case ir_unop_noise:
> + unreachable("not reached: should be handled by lower_noise");
> +
> + case ir_quadop_vector:
> + unreachable("not reached: should be handled by lower_quadop_vector");
> +
> + case ir_binop_vector_extract:
> + unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
> +
> + case ir_triop_vector_insert:
> + unreachable("not reached: should be handled by lower_vector_insert()");
> +
> + case ir_binop_ldexp:
> + unreachable("not reached: should be handled by ldexp_to_arith()");
> +
> + case ir_unop_sqrt:
> + emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
> + break;
> +
> + case ir_unop_rsq:
> + emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
> + break;
> +
> + case ir_unop_bitcast_i2f:
> + case ir_unop_bitcast_u2f:
> + op[0].type = BRW_REGISTER_TYPE_F;
> + this->result = op[0];
> + break;
> + case ir_unop_i2u:
> + case ir_unop_bitcast_f2u:
> + op[0].type = BRW_REGISTER_TYPE_UD;
> + this->result = op[0];
> + break;
> + case ir_unop_u2i:
> + case ir_unop_bitcast_f2i:
> + op[0].type = BRW_REGISTER_TYPE_D;
> + this->result = op[0];
> + break;
> + case ir_unop_i2f:
> + case ir_unop_u2f:
> + case ir_unop_f2i:
> + case ir_unop_f2u:
> + emit(MOV(this->result, op[0]));
> + break;
> +
> + case ir_unop_b2i:
> + emit(AND(this->result, op[0], fs_reg(1)));
> + break;
> + case ir_unop_b2f:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(ir->operands[0], &op[0]);
> + }
> + op[0].type = BRW_REGISTER_TYPE_D;
> + this->result.type = BRW_REGISTER_TYPE_D;
> + emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
> + this->result.type = BRW_REGISTER_TYPE_F;
> + break;
> +
> + case ir_unop_f2b:
> + emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> + break;
> + case ir_unop_i2b:
> + emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> + break;
> +
> + case ir_unop_trunc:
> + emit(RNDZ(this->result, op[0]));
> + break;
> + case ir_unop_ceil: {
> + fs_reg tmp = vgrf(ir->type);
> + op[0].negate = !op[0].negate;
> + emit(RNDD(tmp, op[0]));
> + tmp.negate = true;
> + emit(MOV(this->result, tmp));
> + }
> + break;
> + case ir_unop_floor:
> + emit(RNDD(this->result, op[0]));
> + break;
> + case ir_unop_fract:
> + emit(FRC(this->result, op[0]));
> + break;
> + case ir_unop_round_even:
> + emit(RNDE(this->result, op[0]));
> + break;
> +
> + case ir_binop_min:
> + case ir_binop_max:
> + resolve_ud_negate(&op[0]);
> + resolve_ud_negate(&op[1]);
> + emit_minmax(ir->operation == ir_binop_min ?
> + BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
> + this->result, op[0], op[1]);
> + break;
> + case ir_unop_pack_snorm_2x16:
> + case ir_unop_pack_snorm_4x8:
> + case ir_unop_pack_unorm_2x16:
> + case ir_unop_pack_unorm_4x8:
> + case ir_unop_unpack_snorm_2x16:
> + case ir_unop_unpack_snorm_4x8:
> + case ir_unop_unpack_unorm_2x16:
> + case ir_unop_unpack_unorm_4x8:
> + case ir_unop_unpack_half_2x16:
> + case ir_unop_pack_half_2x16:
> + unreachable("not reached: should be handled by lower_packing_builtins");
> + case ir_unop_unpack_half_2x16_split_x:
> + emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
> + break;
> + case ir_unop_unpack_half_2x16_split_y:
> + emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
> + break;
> + case ir_binop_pow:
> + emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
> + break;
> +
> + case ir_unop_bitfield_reverse:
> + emit(BFREV(this->result, op[0]));
> + break;
> + case ir_unop_bit_count:
> + emit(CBIT(this->result, op[0]));
> + break;
> + case ir_unop_find_msb:
> + temp = vgrf(glsl_type::uint_type);
> + emit(FBH(temp, op[0]));
> +
> + /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
> + * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
> + * subtract the result from 31 to convert the MSB count into an LSB count.
> + */
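> + /* E.g. findMSB(0x00800000): FBH returns 8 (counting from the MSB), and
> + * 31 - 8 = 23, the LSB-side index of the highest set bit.
> + */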
> +
> + /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
> + emit(MOV(this->result, temp));
> + emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
> +
> + temp.negate = true;
> + inst = emit(ADD(this->result, temp, fs_reg(31)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + break;
> + case ir_unop_find_lsb:
> + emit(FBL(this->result, op[0]));
> + break;
> + case ir_unop_saturate:
> + inst = emit(MOV(this->result, op[0]));
> + inst->saturate = true;
> + break;
> + case ir_triop_bitfield_extract:
> + /* Note that the instruction's argument order is reversed from GLSL
> + * and the IR.
> + */
> + emit(BFE(this->result, op[2], op[1], op[0]));
> + break;
> + case ir_binop_bfm:
> + emit(BFI1(this->result, op[0], op[1]));
> + break;
> + case ir_triop_bfi:
> + emit(BFI2(this->result, op[0], op[1], op[2]));
> + break;
> + case ir_quadop_bitfield_insert:
> + unreachable("not reached: should be handled by "
> + "lower_instructions::bitfield_insert_to_bfm_bfi");
> +
> + case ir_unop_bit_not:
> + emit(NOT(this->result, op[0]));
> + break;
> + case ir_binop_bit_and:
> + emit(AND(this->result, op[0], op[1]));
> + break;
> + case ir_binop_bit_xor:
> + emit(XOR(this->result, op[0], op[1]));
> + break;
> + case ir_binop_bit_or:
> + emit(OR(this->result, op[0], op[1]));
> + break;
> +
> + case ir_binop_lshift:
> + emit(SHL(this->result, op[0], op[1]));
> + break;
> +
> + case ir_binop_rshift:
> + if (ir->type->base_type == GLSL_TYPE_INT)
> + emit(ASR(this->result, op[0], op[1]));
> + else
> + emit(SHR(this->result, op[0], op[1]));
> + break;
> + case ir_binop_pack_half_2x16_split:
> + emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
> + break;
> + case ir_binop_ubo_load: {
> + /* This IR node takes a constant uniform block and a constant or
> + * variable byte offset within the block and loads a vector from that.
> + */
> + ir_constant *const_uniform_block = ir->operands[0]->as_constant();
> + ir_constant *const_offset = ir->operands[1]->as_constant();
> + fs_reg surf_index;
> +
> + if (const_uniform_block) {
> + /* The block index is a constant, so just emit the binding table entry
> + * as an immediate.
> + */
> + surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
> + const_uniform_block->value.u[0]);
> + } else {
> + /* The block index is not a constant. Evaluate the index expression
> + * per-channel and add the base UBO index; the generator will select
> + * a value from any live channel.
> + */
> + surf_index = vgrf(glsl_type::uint_type);
> + emit(ADD(surf_index, op[0],
> + fs_reg(stage_prog_data->binding_table.ubo_start)))
> + ->force_writemask_all = true;
> +
> + /* Assume this may touch any UBO. It would be nice to provide
> + * a tighter bound, but the array information is already lowered away.
> + */
> + brw_mark_surface_used(prog_data,
> + stage_prog_data->binding_table.ubo_start +
> + shader_prog->NumUniformBlocks - 1);
> + }
> +
> + if (const_offset) {
> + fs_reg packed_consts = vgrf(glsl_type::float_type);
> + packed_consts.type = result.type;
> +
> + fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
> + emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
> + packed_consts, surf_index, const_offset_reg));
> +
> + for (int i = 0; i < ir->type->vector_elements; i++) {
> + packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
> +
> + /* The std140 packing rules don't allow vectors to cross 16-byte
> + * boundaries, and a reg is 32 bytes.
> + */
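> + /* E.g. a vec2 at byte offset 24 fetches the 16-byte block at offset 16
> + * (24 & ~15) and reads dwords 2 and 3 of it (24 % 16 / 4 + i).
> + */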
> + assert(packed_consts.subreg_offset < 32);
> +
> + /* UBO bools are any nonzero value. We consider bools to be
> + * values with the low bit set to 1. Convert them using CMP.
> + */
> + if (ir->type->base_type == GLSL_TYPE_BOOL) {
> + emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
> + } else {
> + emit(MOV(result, packed_consts));
> + }
> +
> + result = offset(result, 1);
> + }
> + } else {
> + /* Turn the byte offset into a dword offset. */
> + fs_reg base_offset = vgrf(glsl_type::int_type);
> + emit(SHR(base_offset, op[1], fs_reg(2)));
> +
> + for (int i = 0; i < ir->type->vector_elements; i++) {
> + emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
> + base_offset, i));
> +
> + if (ir->type->base_type == GLSL_TYPE_BOOL)
> + emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
> +
> + result = offset(result, 1);
> + }
> + }
> +
> + result.reg_offset = 0;
> + break;
> + }
> +
> + case ir_triop_fma:
> + /* Note that the instruction's argument order is reversed from GLSL
> + * and the IR.
> + */
> + emit(MAD(this->result, op[2], op[1], op[0]));
> + break;
> +
> + case ir_triop_lrp:
> + emit_lrp(this->result, op[0], op[1], op[2]);
> + break;
> +
> + case ir_triop_csel:
> + case ir_unop_interpolate_at_centroid:
> + case ir_binop_interpolate_at_offset:
> + case ir_binop_interpolate_at_sample:
> + unreachable("already handled above");
> + break;
> +
> + case ir_unop_d2f:
> + case ir_unop_f2d:
> + case ir_unop_d2i:
> + case ir_unop_i2d:
> + case ir_unop_d2u:
> + case ir_unop_u2d:
> + case ir_unop_d2b:
> + case ir_unop_pack_double_2x32:
> + case ir_unop_unpack_double_2x32:
> + case ir_unop_frexp_sig:
> + case ir_unop_frexp_exp:
> + unreachable("fp64 todo");
> + break;
> + }
> +}
> +
> +void
> +fs_god::emit_assignment_writes(fs_reg &l, fs_reg &r,
> + const glsl_type *type, bool predicated)
> +{
> + switch (type->base_type) {
> + case GLSL_TYPE_FLOAT:
> + case GLSL_TYPE_UINT:
> + case GLSL_TYPE_INT:
> + case GLSL_TYPE_BOOL:
> + for (unsigned int i = 0; i < type->components(); i++) {
> + l.type = brw_type_for_base_type(type);
> + r.type = brw_type_for_base_type(type);
> +
> + if (predicated || !l.equals(r)) {
> + fs_inst *inst = emit(MOV(l, r));
> + inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
> + }
> +
> + l = offset(l, 1);
> + r = offset(r, 1);
> + }
> + break;
> + case GLSL_TYPE_ARRAY:
> + for (unsigned int i = 0; i < type->length; i++) {
> + emit_assignment_writes(l, r, type->fields.array, predicated);
> + }
> + break;
> +
> + case GLSL_TYPE_STRUCT:
> + for (unsigned int i = 0; i < type->length; i++) {
> + emit_assignment_writes(l, r, type->fields.structure[i].type,
> + predicated);
> + }
> + break;
> +
> + case GLSL_TYPE_SAMPLER:
> + case GLSL_TYPE_IMAGE:
> + case GLSL_TYPE_ATOMIC_UINT:
> + break;
> +
> + case GLSL_TYPE_DOUBLE:
> + case GLSL_TYPE_VOID:
> + case GLSL_TYPE_ERROR:
> + case GLSL_TYPE_INTERFACE:
> + unreachable("not reached");
> + }
> +}
> +
> +/* If the RHS processing resulted in an instruction generating a
> + * temporary value, and it would be easy to rewrite the instruction to
> + * generate its result right into the LHS instead, do so. This ends
> + * up reliably removing instructions where it can be tricky to do so
> + * later without real UD chain information.
> + */
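> + /* For example, for "v = a + b;" the ADD emitted for the RHS writes a
> + * fresh temporary; pointing its dst at the LHS register instead drops
> + * the MOV that would otherwise copy the temporary into v.
> + */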
> +bool
> +fs_god::try_rewrite_rhs_to_dst(ir_assignment *ir,
> + fs_reg dst,
> + fs_reg src,
> + fs_inst *pre_rhs_inst,
> + fs_inst *last_rhs_inst)
> +{
> + /* Only attempt if we're doing a direct assignment. */
> + if (ir->condition ||
> + !(ir->lhs->type->is_scalar() ||
> + (ir->lhs->type->is_vector() &&
> + ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
> + return false;
> +
> + /* Make sure the last instruction generated our source reg. */
> + fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
> + last_rhs_inst,
> + src);
> + if (!modify)
> + return false;
> +
> + /* If last_rhs_inst wrote a different number of components than our LHS,
> + * we can't safely rewrite it.
> + */
> + if (alloc.sizes[dst.reg] != modify->regs_written)
> + return false;
> +
> + /* Success! Rewrite the instruction. */
> + modify->dst = dst;
> +
> + return true;
> +}
> +
> +void
> +fs_god::visit(ir_assignment *ir)
> +{
> + fs_reg l, r;
> + fs_inst *inst;
> +
> + /* FINISHME: arrays on the lhs */
> + ir->lhs->accept(this);
> + l = this->result;
> +
> + fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
> +
> + ir->rhs->accept(this);
> + r = this->result;
> +
> + fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
> +
> + assert(l.file != BAD_FILE);
> + assert(r.file != BAD_FILE);
> +
> + if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
> + return;
> +
> + if (ir->condition) {
> + emit_bool_to_cond_code(ir->condition);
> + }
> +
> + if (ir->lhs->type->is_scalar() ||
> + ir->lhs->type->is_vector()) {
> + for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
> + if (ir->write_mask & (1 << i)) {
> + inst = emit(MOV(l, r));
> + if (ir->condition)
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + r = offset(r, 1);
> + }
> + l = offset(l, 1);
> + }
> + } else {
> + emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
> + }
> +}
> +
> +fs_inst *
> +fs_god::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
> + fs_reg coordinate, int coord_components,
> + fs_reg shadow_c,
> + fs_reg lod, fs_reg dPdy, int grad_components,
> + uint32_t sampler)
> +{
> + int mlen;
> + int base_mrf = 1;
> + bool simd16 = false;
> + fs_reg orig_dst;
> +
> + /* g0 header. */
> + mlen = 1;
> +
> + if (shadow_c.file != BAD_FILE) {
> + for (int i = 0; i < coord_components; i++) {
> + emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> + coordinate = offset(coordinate, 1);
> + }
> +
> + /* gen4's SIMD8 sampler always has the slots for u,v,r present.
> + * The unused slots must be zeroed.
> + */
> + for (int i = coord_components; i < 3; i++) {
> + emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
> + }
> + mlen += 3;
> +
> + if (op == ir_tex) {
> + /* There's no plain shadow compare message, so we use shadow
> + * compare with a bias of 0.0.
> + */
> + emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
> + mlen++;
> + } else if (op == ir_txb || op == ir_txl) {
> + emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
> + mlen++;
> + } else {
> + unreachable("Should not get here.");
> + }
> +
> + emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
> + mlen++;
> + } else if (op == ir_tex) {
> + for (int i = 0; i < coord_components; i++) {
> + emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> + coordinate = offset(coordinate, 1);
> + }
> + /* Zero the others. */
> + for (int i = coord_components; i < 3; i++) {
> + emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
> + }
> + /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
> + mlen += 3;
> + } else if (op == ir_txd) {
> + fs_reg &dPdx = lod;
> +
> + for (int i = 0; i < coord_components; i++) {
> + emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> + coordinate = offset(coordinate, 1);
> + }
> + /* the slots for u and v are always present, but r is optional */
> + mlen += MAX2(coord_components, 2);
> +
> + /* P = u, v, r
> + * dPdx = dudx, dvdx, drdx
> + * dPdy = dudy, dvdy, drdy
> + *
> + * 1-arg: Does not exist.
> + *
> + * 2-arg: dudx dvdx dudy dvdy
> + * dPdx.x dPdx.y dPdy.x dPdy.y
> + * m4 m5 m6 m7
> + *
> + * 3-arg: dudx dvdx drdx dudy dvdy drdy
> + * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
> + * m5 m6 m7 m8 m9 m10
> + */
> + for (int i = 0; i < grad_components; i++) {
> + emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
> + dPdx = offset(dPdx, 1);
> + }
> + mlen += MAX2(grad_components, 2);
> +
> + for (int i = 0; i < grad_components; i++) {
> + emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
> + dPdy = offset(dPdy, 1);
> + }
> + mlen += MAX2(grad_components, 2);
> + } else if (op == ir_txs) {
> + /* There's no SIMD8 resinfo message on Gen4. Use SIMD16 instead. */
> + simd16 = true;
> + emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
> + mlen += 2;
> + } else {
> + /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod
> + * instructions. We'll need to do SIMD16 here.
> + */
> + simd16 = true;
> + assert(op == ir_txb || op == ir_txl || op == ir_txf);
> +
> + for (int i = 0; i < coord_components; i++) {
> + emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
> + coordinate));
> + coordinate = offset(coordinate, 1);
> + }
> +
> + /* Initialize the rest of u/v/r with 0.0. Empirically, this seems to
> + * be necessary for TXF (ld), but seems wise to do for all messages.
> + */
> + for (int i = coord_components; i < 3; i++) {
> + emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
> + }
> +
> + /* lod/bias appears after u/v/r. */
> + mlen += 6;
> +
> + emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
> + mlen++;
> +
> + /* The unused upper half. */
> + mlen++;
> + }
> +
> + if (simd16) {
> + /* Now, since we're doing simd16, the return is 2 interleaved
> + * vec4s where the odd-indexed ones are junk. We'll need to move
> + * this weirdness around to the expected layout.
> + */
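> + /* The fix-up loop after the send copies registers 0, 2, 4 and 6 of the
> + * SIMD16 return into the four consecutive registers of the original
> + * destination.
> + */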
> + orig_dst = dst;
> + dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
> + }
> +
> + enum opcode opcode;
> + switch (op) {
> + case ir_tex: opcode = SHADER_OPCODE_TEX; break;
> + case ir_txb: opcode = FS_OPCODE_TXB; break;
> + case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> + case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> + case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> + case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> + default:
> + unreachable("not reached");
> + }
> +
> + fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
> + inst->base_mrf = base_mrf;
> + inst->mlen = mlen;
> + inst->header_present = true;
> + inst->regs_written = simd16 ? 8 : 4;
> +
> + if (simd16) {
> + for (int i = 0; i < 4; i++) {
> + emit(MOV(orig_dst, dst));
> + orig_dst = offset(orig_dst, 1);
> + dst = offset(dst, 2);
> + }
> + }
> +
> + return inst;
> +}
> +
> +/* gen5's sampler has slots for u, v, r, array index, then optional
> + * parameters like shadow comparator or LOD bias. If optional
> + * parameters aren't present, those base slots are optional and don't
> + * need to be included in the message.
> + *
> + * We don't fill in the unnecessary slots regardless, which may look
> + * surprising in the disassembly.
> + */
> +fs_inst *
> +fs_god::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
> + fs_reg coordinate, int vector_elements,
> + fs_reg shadow_c,
> + fs_reg lod, fs_reg lod2, int grad_components,
> + fs_reg sample_index, uint32_t sampler,
> + bool has_offset)
> +{
> + int reg_width = dispatch_width / 8;
> + bool header_present = false;
> +
> + fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
> + fs_reg msg_coords = message;
> +
> + if (has_offset) {
> + /* The offsets set up by the ir_texture visitor are in the
> + * m1 header, so we can't go headerless.
> + */
> + header_present = true;
> + message.reg--;
> + }
> +
> + for (int i = 0; i < vector_elements; i++) {
> + emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
> + coordinate = offset(coordinate, 1);
> + }
> + fs_reg msg_end = offset(msg_coords, vector_elements);
> + fs_reg msg_lod = offset(msg_coords, 4);
> +
> + if (shadow_c.file != BAD_FILE) {
> + fs_reg msg_shadow = msg_lod;
> + emit(MOV(msg_shadow, shadow_c));
> + msg_lod = offset(msg_shadow, 1);
> + msg_end = msg_lod;
> + }
> +
> + enum opcode opcode;
> + switch (op) {
> + case ir_tex:
> + opcode = SHADER_OPCODE_TEX;
> + break;
> + case ir_txb:
> + emit(MOV(msg_lod, lod));
> + msg_end = offset(msg_lod, 1);
> +
> + opcode = FS_OPCODE_TXB;
> + break;
> + case ir_txl:
> + emit(MOV(msg_lod, lod));
> + msg_end = offset(msg_lod, 1);
> +
> + opcode = SHADER_OPCODE_TXL;
> + break;
> + case ir_txd: {
> + /**
> + * P = u, v, r
> + * dPdx = dudx, dvdx, drdx
> + * dPdy = dudy, dvdy, drdy
> + *
> + * Load up these values:
> + * - dudx dudy dvdx dvdy drdx drdy
> + * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
> + */
> + msg_end = msg_lod;
> + for (int i = 0; i < grad_components; i++) {
> + emit(MOV(msg_end, lod));
> + lod = offset(lod, 1);
> + msg_end = offset(msg_end, 1);
> +
> + emit(MOV(msg_end, lod2));
> + lod2 = offset(lod2, 1);
> + msg_end = offset(msg_end, 1);
> + }
> +
> + opcode = SHADER_OPCODE_TXD;
> + break;
> + }
> + case ir_txs:
> + msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
> + emit(MOV(msg_lod, lod));
> + msg_end = offset(msg_lod, 1);
> +
> + opcode = SHADER_OPCODE_TXS;
> + break;
> + case ir_query_levels:
> + msg_lod = msg_end;
> + emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> + msg_end = offset(msg_lod, 1);
> +
> + opcode = SHADER_OPCODE_TXS;
> + break;
> + case ir_txf:
> + msg_lod = offset(msg_coords, 3);
> + emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
> + msg_end = offset(msg_lod, 1);
> +
> + opcode = SHADER_OPCODE_TXF;
> + break;
> + case ir_txf_ms:
> + msg_lod = offset(msg_coords, 3);
> + /* lod */
> + emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> + /* sample index */
> + emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
> + msg_end = offset(msg_lod, 2);
> +
> + opcode = SHADER_OPCODE_TXF_CMS;
> + break;
> + case ir_lod:
> + opcode = SHADER_OPCODE_LOD;
> + break;
> + case ir_tg4:
> + opcode = SHADER_OPCODE_TG4;
> + break;
> + default:
> + unreachable("not reached");
> + }
> +
> + fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
> + inst->base_mrf = message.reg;
> + inst->mlen = msg_end.reg - message.reg;
> + inst->header_present = header_present;
> + inst->regs_written = 4 * reg_width;
> +
> + if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
> + fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
> + " disallowed by hardware\n");
> + }
> +
> + return inst;
> +}
> +
> +static bool
> +is_high_sampler(struct brw_context *brw, fs_reg sampler)
> +{
> + if (brw->gen < 8 && !brw->is_haswell)
> + return false;
> +
> + return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
> +}
> +
> +fs_inst *
> +fs_god::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
> + fs_reg coordinate, int coord_components,
> + fs_reg shadow_c,
> + fs_reg lod, fs_reg lod2, int grad_components,
> + fs_reg sample_index, fs_reg mcs, fs_reg sampler,
> + fs_reg offset_value)
> +{
> + int reg_width = dispatch_width / 8;
> + bool header_present = false;
> +
> + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
> + for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
> + sources[i] = vgrf(glsl_type::float_type);
> + }
> + int length = 0;
> +
> + if (op == ir_tg4 || offset_value.file != BAD_FILE ||
> + is_high_sampler(brw, sampler)) {
> + /* For general texture offsets (no txf workaround), we need a header to
> + * put them in. Note that for SIMD16 we're making space for two actual
> + * hardware registers here, so the emit will have to fix up for this.
> + *
> + * * ir_tg4 needs to place its channel select in the header,
> + * for interaction with ARB_texture_swizzle
> + *
> + * The sampler index is only 4 bits, so for larger sampler numbers we
> + * need to offset the Sampler State Pointer in the header.
> + */
> + header_present = true;
> + sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> + length++;
> + }
> +
> + if (shadow_c.file != BAD_FILE) {
> + emit(MOV(sources[length], shadow_c));
> + length++;
> + }
> +
> + bool has_nonconstant_offset =
> + offset_value.file != BAD_FILE && offset_value.file != IMM;
> + bool coordinate_done = false;
> +
> + /* Set up the LOD info */
> + switch (op) {
> + case ir_tex:
> + case ir_lod:
> + break;
> + case ir_txb:
> + emit(MOV(sources[length], lod));
> + length++;
> + break;
> + case ir_txl:
> + emit(MOV(sources[length], lod));
> + length++;
> + break;
> + case ir_txd: {
> + no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
> +
> + /* Load dPdx and the coordinate together:
> + * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
> + */
> + for (int i = 0; i < coord_components; i++) {
> + emit(MOV(sources[length], coordinate));
> + coordinate = offset(coordinate, 1);
> + length++;
> +
> + /* For cube map array, the coordinate is (u,v,r,ai) but there are
> + * only derivatives for (u, v, r).
> + */
> + if (i < grad_components) {
> + emit(MOV(sources[length], lod));
> + lod = offset(lod, 1);
> + length++;
> +
> + emit(MOV(sources[length], lod2));
> + lod2 = offset(lod2, 1);
> + length++;
> + }
> + }
> +
> + coordinate_done = true;
> + break;
> + }
> + case ir_txs:
> + emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
> + length++;
> + break;
> + case ir_query_levels:
> + emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> + length++;
> + break;
> + case ir_txf:
> + /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
> + emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> + coordinate = offset(coordinate, 1);
> + length++;
> +
> + emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
> + length++;
> +
> + for (int i = 1; i < coord_components; i++) {
> + emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> + coordinate = offset(coordinate, 1);
> + length++;
> + }
> +
> + coordinate_done = true;
> + break;
> + case ir_txf_ms:
> + emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
> + length++;
> +
> + /* data from the multisample control surface */
> + emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
> + length++;
> +
> + /* there is no offsetting for this message; just copy in the integer
> + * texture coordinates
> + */
> + for (int i = 0; i < coord_components; i++) {
> + emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> + coordinate = offset(coordinate, 1);
> + length++;
> + }
> +
> + coordinate_done = true;
> + break;
> + case ir_tg4:
> + if (has_nonconstant_offset) {
> + if (shadow_c.file != BAD_FILE)
> + no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
> +
> + /* More crazy intermixing */
> + for (int i = 0; i < 2; i++) { /* u, v */
> + emit(MOV(sources[length], coordinate));
> + coordinate = offset(coordinate, 1);
> + length++;
> + }
> +
> + for (int i = 0; i < 2; i++) { /* offu, offv */
> + emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
> + offset_value = offset(offset_value, 1);
> + length++;
> + }
> +
> + if (coord_components == 3) { /* r if present */
> + emit(MOV(sources[length], coordinate));
> + coordinate = offset(coordinate, 1);
> + length++;
> + }
> +
> + coordinate_done = true;
> + }
> + break;
> + }
> +
> + /* Set up the coordinate (except for cases where it was done above) */
> + if (!coordinate_done) {
> + for (int i = 0; i < coord_components; i++) {
> + emit(MOV(sources[length], coordinate));
> + coordinate = offset(coordinate, 1);
> + length++;
> + }
> + }
> +
> + int mlen;
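> + /* Each payload source spans reg_width registers, but the header (when
> + * present) only occupies one register in SIMD16, so subtract it back out.
> + */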
> + if (reg_width == 2)
> + mlen = length * reg_width - header_present;
> + else
> + mlen = length * reg_width;
> +
> + fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> + BRW_REGISTER_TYPE_F);
> + emit(LOAD_PAYLOAD(src_payload, sources, length));
> +
> + /* Generate the SEND */
> + enum opcode opcode;
> + switch (op) {
> + case ir_tex: opcode = SHADER_OPCODE_TEX; break;
> + case ir_txb: opcode = FS_OPCODE_TXB; break;
> + case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> + case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> + case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> + case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
> + case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> + case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
> + case ir_lod: opcode = SHADER_OPCODE_LOD; break;
> + case ir_tg4:
> + if (has_nonconstant_offset)
> + opcode = SHADER_OPCODE_TG4_OFFSET;
> + else
> + opcode = SHADER_OPCODE_TG4;
> + break;
> + default:
> + unreachable("not reached");
> + }
> + fs_inst *inst = emit(opcode, dst, src_payload, sampler);
> + inst->base_mrf = -1;
> + inst->mlen = mlen;
> + inst->header_present = header_present;
> + inst->regs_written = 4 * reg_width;
> +
> + if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
> + fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
> + " disallowed by hardware\n");
> + }
> +
> + return inst;
> +}
> +
> +fs_reg
> +fs_god::rescale_texcoord(fs_reg coordinate, int coord_components,
> + bool is_rect, uint32_t sampler, int texunit)
> +{
> + fs_inst *inst = NULL;
> + bool needs_gl_clamp = true;
> + fs_reg scale_x, scale_y;
> +
> + /* The 965 requires the EU to do the normalization of GL rectangle
> + * texture coordinates. We use the program parameter state
> + * tracking to get the scaling factor.
> + */
> + if (is_rect &&
> + (brw->gen < 6 ||
> + (brw->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
> + key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
> + struct gl_program_parameter_list *params = prog->Parameters;
> + int tokens[STATE_LENGTH] = {
> + STATE_INTERNAL,
> + STATE_TEXRECT_SCALE,
> + texunit,
> + 0,
> + 0
> + };
> +
> + no16("rectangle scale uniform setup not supported on SIMD16\n");
> + if (dispatch_width == 16) {
> + return coordinate;
> + }
> +
> + GLuint index = _mesa_add_state_reference(params,
> + (gl_state_index *)tokens);
> + /* Try to find existing copies of the texrect scale uniforms. */
> + for (unsigned i = 0; i < uniforms; i++) {
> + if (stage_prog_data->param[i] ==
> + &prog->Parameters->ParameterValues[index][0]) {
> + scale_x = fs_reg(UNIFORM, i);
> + scale_y = fs_reg(UNIFORM, i + 1);
> + break;
> + }
> + }
> +
> + /* If we didn't already set them up, do so now. */
> + if (scale_x.file == BAD_FILE) {
> + scale_x = fs_reg(UNIFORM, uniforms);
> + scale_y = fs_reg(UNIFORM, uniforms + 1);
> +
> + stage_prog_data->param[uniforms++] =
> + &prog->Parameters->ParameterValues[index][0];
> + stage_prog_data->param[uniforms++] =
> + &prog->Parameters->ParameterValues[index][1];
> + }
> + }
> +
> + /* The 965 requires the EU to do the normalization of GL rectangle
> + * texture coordinates. We use the program parameter state
> + * tracking to get the scaling factor.
> + */
> + if (brw->gen < 6 && is_rect) {
> + fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
> + fs_reg src = coordinate;
> + coordinate = dst;
> +
> + emit(MUL(dst, src, scale_x));
> + dst = offset(dst, 1);
> + src = offset(src, 1);
> + emit(MUL(dst, src, scale_y));
> + } else if (is_rect) {
> + /* On gen6+, the sampler handles the rectangle coordinates
> + * natively, without needing rescaling. But that means we have
> + * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
> + * not [0, 1] like the default case below.
> + */
> + needs_gl_clamp = false;
> +
> + for (int i = 0; i < 2; i++) {
> + if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
> + fs_reg chan = coordinate;
> + chan = offset(chan, i);
> +
> + inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
> + inst->conditional_mod = BRW_CONDITIONAL_GE;
> +
> + /* Our parameter comes in as 1.0/width or 1.0/height,
> + * because that's what people normally want for doing
> + * texture rectangle handling. We need width or height
> + * for clamping, but we don't care enough to make a new
> + * parameter type, so just invert back.
> + */
> + fs_reg limit = vgrf(glsl_type::float_type);
> + emit(MOV(limit, i == 0 ? scale_x : scale_y));
> + emit(SHADER_OPCODE_RCP, limit, limit);
> +
> + inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
> + inst->conditional_mod = BRW_CONDITIONAL_L;
> + }
> + }
> + }
> +
> + if (coord_components > 0 && needs_gl_clamp) {
> + for (int i = 0; i < MIN2(coord_components, 3); i++) {
> + if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
> + fs_reg chan = coordinate;
> + chan = offset(chan, i);
> +
> + fs_inst *inst = emit(MOV(chan, chan));
> + inst->saturate = true;
> + }
> + }
> + }
> + return coordinate;
> +}
> +
> +/* Sample from the MCS surface attached to this multisample texture. */
> +fs_reg
> +fs_god::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
> +{
> + int reg_width = dispatch_width / 8;
> + fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
> + BRW_REGISTER_TYPE_F);
> + fs_reg dest = vgrf(glsl_type::uvec4_type);
> + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
> +
> + /* parameters are: u, v, r; missing parameters are treated as zero */
> + for (int i = 0; i < components; i++) {
> + sources[i] = vgrf(glsl_type::float_type);
> + emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
> + coordinate = offset(coordinate, 1);
> + }
> +
> + emit(LOAD_PAYLOAD(payload, sources, components));
> +
> + fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
> + inst->base_mrf = -1;
> + inst->mlen = components * reg_width;
> + inst->header_present = false;
> + inst->regs_written = 4 * reg_width; /* we only care about one reg of
> + * response, but the sampler always
> + * writes 4/8
> + */
> +
> + return dest;
> +}
> +
> +void
> +fs_god::emit_texture(ir_texture_opcode op,
> + const glsl_type *dest_type,
> + fs_reg coordinate, int coord_components,
> + fs_reg shadow_c,
> + fs_reg lod, fs_reg lod2, int grad_components,
> + fs_reg sample_index,
> + fs_reg offset_value,
> + fs_reg mcs,
> + int gather_component,
> + bool is_cube_array,
> + bool is_rect,
> + uint32_t sampler,
> + fs_reg sampler_reg, int texunit)
> +{
> + fs_inst *inst = NULL;
> +
> + if (op == ir_tg4) {
> + /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
> + * emitting anything other than setting up the constant result.
> + */
> + int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
> + if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
> +
> + fs_reg res = vgrf(glsl_type::vec4_type);
> + this->result = res;
> +
> + for (int i = 0; i < 4; i++) {
> + emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
> + res = offset(res, 1);
> + }
> + return;
> + }
> + }
> +
> + if (coordinate.file != BAD_FILE) {
> + /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
> + * samplers. This should only be a problem with GL_CLAMP on Gen7.
> + */
> + coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
> + sampler, texunit);
> + }
> +
> + /* Writemasking doesn't eliminate channels on SIMD8 texture
> + * samples, so don't worry about them.
> + */
> + fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
> +
> + if (brw->gen >= 7) {
> + inst = emit_texture_gen7(op, dst, coordinate, coord_components,
> + shadow_c, lod, lod2, grad_components,
> + sample_index, mcs, sampler_reg,
> + offset_value);
> + } else if (brw->gen >= 5) {
> + inst = emit_texture_gen5(op, dst, coordinate, coord_components,
> + shadow_c, lod, lod2, grad_components,
> + sample_index, sampler,
> + offset_value.file != BAD_FILE);
> + } else {
> + inst = emit_texture_gen4(op, dst, coordinate, coord_components,
> + shadow_c, lod, lod2, grad_components,
> + sampler);
> + }
> +
> + if (shadow_c.file != BAD_FILE)
> + inst->shadow_compare = true;
> +
> + if (offset_value.file == IMM)
> + inst->offset = offset_value.fixed_hw_reg.dw1.ud;
> +
> + if (op == ir_tg4) {
> + inst->offset |=
> + gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
> +
> + if (brw->gen == 6)
> + emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
> + }
> +
> + /* fixup #layers for cube map arrays */
> + if (op == ir_txs && is_cube_array) {
> + fs_reg depth = offset(dst, 2);
> + fs_reg fixed_depth = vgrf(glsl_type::int_type);
> + emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
> +
> + fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
> + int components = inst->regs_written / (dst.width / 8);
> + for (int i = 0; i < components; i++) {
> + if (i == 2) {
> + fixed_payload[i] = fixed_depth;
> + } else {
> + fixed_payload[i] = offset(dst, i);
> + }
> + }
> + emit(LOAD_PAYLOAD(dst, fixed_payload, components));
> + }
> +
> + swizzle_result(op, dest_type->vector_elements, dst, sampler);
> +}
> +
> +void
> +fs_god::visit(ir_texture *ir)
> +{
> + uint32_t sampler =
> + _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
> +
> + ir_rvalue *nonconst_sampler_index =
> + _mesa_get_sampler_array_nonconst_index(ir->sampler);
> +
> + /* Handle non-constant sampler array indexing */
> + fs_reg sampler_reg;
> + if (nonconst_sampler_index) {
> + /* The highest sampler which may be used by this operation is
> + * the last element of the array. Mark it here, because the generator
> + * doesn't have enough information to determine the bound.
> + */
> + uint32_t array_size = ir->sampler->as_dereference_array()
> + ->array->type->array_size();
> +
> + uint32_t max_used = sampler + array_size - 1;
> + if (ir->op == ir_tg4 && brw->gen < 8) {
> + max_used += stage_prog_data->binding_table.gather_texture_start;
> + } else {
> + max_used += stage_prog_data->binding_table.texture_start;
> + }
> +
> + brw_mark_surface_used(prog_data, max_used);
> +
> + /* Emit code to evaluate the actual indexing expression */
> + nonconst_sampler_index->accept(this);
> + fs_reg temp = vgrf(glsl_type::uint_type);
> + emit(ADD(temp, this->result, fs_reg(sampler)))
> + ->force_writemask_all = true;
> + sampler_reg = temp;
> + } else {
> + /* Single sampler, or constant array index; the indexing expression
> + * is just an immediate.
> + */
> + sampler_reg = fs_reg(sampler);
> + }
> +
> + /* FINISHME: We're failing to recompile our programs when the sampler is
> + * updated. This only matters for the texture rectangle scale parameters
> + * (pre-gen6, or gen6+ with GL_CLAMP).
> + */
> + int texunit = prog->SamplerUnits[sampler];
> +
> + /* Should be lowered by do_lower_texture_projection */
> + assert(!ir->projector);
> +
> + /* Should be lowered */
> + assert(!ir->offset || !ir->offset->type->is_array());
> +
> + /* Generate code to compute all the subexpression trees. This has to be
> + * done before loading any values into MRFs for the sampler message since
> + * generating these values may involve SEND messages that need the MRFs.
> + */
> + fs_reg coordinate;
> + int coord_components = 0;
> + if (ir->coordinate) {
> + coord_components = ir->coordinate->type->vector_elements;
> + ir->coordinate->accept(this);
> + coordinate = this->result;
> + }
> +
> + fs_reg shadow_comparitor;
> + if (ir->shadow_comparitor) {
> + ir->shadow_comparitor->accept(this);
> + shadow_comparitor = this->result;
> + }
> +
> + fs_reg offset_value;
> + if (ir->offset) {
> + ir_constant *const_offset = ir->offset->as_constant();
> + if (const_offset) {
> + /* Store the header bitfield in an IMM register. This allows us to
> + * use offset_value.file to distinguish between no offset, a constant
> + * offset, and a non-constant offset.
> + */
> + offset_value =
> + fs_reg(brw_texture_offset(ctx, const_offset->value.i,
> + const_offset->type->vector_elements));
> + } else {
> + ir->offset->accept(this);
> + offset_value = this->result;
> + }
> + }
> +
> + fs_reg lod, lod2, sample_index, mcs;
> + int grad_components = 0;
> + switch (ir->op) {
> + case ir_tex:
> + case ir_lod:
> + case ir_tg4:
> + case ir_query_levels:
> + break;
> + case ir_txb:
> + ir->lod_info.bias->accept(this);
> + lod = this->result;
> + break;
> + case ir_txd:
> + ir->lod_info.grad.dPdx->accept(this);
> + lod = this->result;
> +
> + ir->lod_info.grad.dPdy->accept(this);
> + lod2 = this->result;
> +
> + grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
> + break;
> + case ir_txf:
> + case ir_txl:
> + case ir_txs:
> + ir->lod_info.lod->accept(this);
> + lod = this->result;
> + break;
> + case ir_txf_ms:
> + ir->lod_info.sample_index->accept(this);
> + sample_index = this->result;
> +
> + if (brw->gen >= 7 &&
> + key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
> + mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
> + sampler_reg);
> + } else {
> + mcs = fs_reg(0u);
> + }
> + break;
> + default:
> + unreachable("Unrecognized texture opcode");
> + }
> +
> + int gather_component = 0;
> + if (ir->op == ir_tg4)
> + gather_component = ir->lod_info.component->as_constant()->value.i[0];
> +
> + bool is_rect =
> + ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
> +
> + bool is_cube_array =
> + ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
> + ir->sampler->type->sampler_array;
> +
> + emit_texture(ir->op, ir->type, coordinate, coord_components,
> + shadow_comparitor, lod, lod2, grad_components,
> + sample_index, offset_value, mcs,
> + gather_component, is_cube_array, is_rect, sampler,
> + sampler_reg, texunit);
> +}
> +
> +/**
> + * Apply workarounds for Gen6 gather with UINT/SINT
> + */
> +void
> +fs_god::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
> +{
> + if (!wa)
> + return;
> +
> + int width = (wa & WA_8BIT) ? 8 : 16;
> +
> + for (int i = 0; i < 4; i++) {
> + fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
> + /* Convert from UNORM to UINT */
> + emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
> + emit(MOV(dst, dst_f));
> +
> + if (wa & WA_SIGN) {
> + /* Reinterpret the UINT value as a signed INT value by
> + * shifting the sign bit into place, then shifting back
> + * preserving sign.
> + */
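> + /* E.g. for an 8-bit signed format (width == 8), a raw value of 255
> + * becomes 0xFF000000 after the SHL and -1 after the ASR.
> + */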
> + emit(SHL(dst, dst, fs_reg(32 - width)));
> + emit(ASR(dst, dst, fs_reg(32 - width)));
> + }
> +
> + dst = offset(dst, 1);
> + }
> +}
> +
> +/**
> + * Set up the gather channel based on the swizzle, for gather4.
> + */
> +uint32_t
> +fs_god::gather_channel(int orig_chan, uint32_t sampler)
> +{
> + int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
> + switch (swiz) {
> + case SWIZZLE_X: return 0;
> + case SWIZZLE_Y:
> + /* gather4 sampler is broken for green channel on RG32F --
> + * we must ask for blue instead.
> + */
> + if (key_tex->gather_channel_quirk_mask & (1 << sampler))
> + return 2;
> + return 1;
> + case SWIZZLE_Z: return 2;
> + case SWIZZLE_W: return 3;
> + default:
> + unreachable("Not reached"); /* zero, one swizzles handled already */
> + }
> +}
> +
> +/**
> + * Swizzle the result of a texture operation. This is necessary for
> + * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
> + */
> +void
> +fs_god::swizzle_result(ir_texture_opcode op, int dest_components,
> + fs_reg orig_val, uint32_t sampler)
> +{
> + if (op == ir_query_levels) {
> + /* # levels is in .w */
> + this->result = offset(orig_val, 3);
> + return;
> + }
> +
> + this->result = orig_val;
> +
> + /* txs and lod don't actually sample the texture, so swizzling the result
> + * makes no sense.
> + */
> + if (op == ir_txs || op == ir_lod || op == ir_tg4)
> + return;
> +
> + if (dest_components == 1) {
> + /* Ignore DEPTH_TEXTURE_MODE swizzling. */
> + } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
> + fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
> + swizzled_result.type = orig_val.type;
> +
> + for (int i = 0; i < 4; i++) {
> + int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
> + fs_reg l = swizzled_result;
> + l = offset(l, i);
> +
> + if (swiz == SWIZZLE_ZERO) {
> + emit(MOV(l, fs_reg(0.0f)));
> + } else if (swiz == SWIZZLE_ONE) {
> + emit(MOV(l, fs_reg(1.0f)));
> + } else {
> + emit(MOV(l, offset(orig_val,
> + GET_SWZ(key_tex->swizzles[sampler], i))));
> + }
> + }
> + this->result = swizzled_result;
> + }
> +}
> +
> +void
> +fs_god::visit(ir_swizzle *ir)
> +{
> + ir->val->accept(this);
> + fs_reg val = this->result;
> +
> + if (ir->type->vector_elements == 1) {
> + this->result = offset(this->result, ir->mask.x);
> + return;
> + }
> +
> + fs_reg result = vgrf(ir->type);
> + this->result = result;
> +
> + for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
> + fs_reg channel = val;
> + int swiz = 0;
> +
> + switch (i) {
> + case 0:
> + swiz = ir->mask.x;
> + break;
> + case 1:
> + swiz = ir->mask.y;
> + break;
> + case 2:
> + swiz = ir->mask.z;
> + break;
> + case 3:
> + swiz = ir->mask.w;
> + break;
> + }
> +
> + emit(MOV(result, offset(channel, swiz)));
> + result = offset(result, 1);
> + }
> +}
> +
> +void
> +fs_god::visit(ir_discard *ir)
> +{
> + /* We track our discarded pixels in f0.1. By predicating on it, we can
> + * update just the flag bits that aren't yet discarded. If there's no
> + * condition, we emit a CMP of g0 != g0, so all currently executing
> + * channels will get turned off.
> + */
> + fs_inst *cmp;
> + if (ir->condition) {
> + emit_bool_to_cond_code(ir->condition);
> + cmp = (fs_inst *) this->instructions.get_tail();
> + cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
> + } else {
> + fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
> + BRW_REGISTER_TYPE_UW));
> + cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
> + }
> + cmp->predicate = BRW_PREDICATE_NORMAL;
> + cmp->flag_subreg = 1;
> +
> + if (brw->gen >= 6) {
> + emit_discard_jump();
> + }
> +}
> +
> +void
> +fs_god::visit(ir_constant *ir)
> +{
> + /* Set this->result to reg at the bottom of the function because some code
> + * paths will cause this visitor to be applied to other fields. This will
> + * cause the value stored in this->result to be modified.
> + *
> + * Make reg constant so that it doesn't get accidentally modified along the
> + * way. Yes, I actually had this problem. :(
> + */
> + const fs_reg reg = vgrf(ir->type);
> + fs_reg dst_reg = reg;
> +
> + if (ir->type->is_array()) {
> + const unsigned size = type_size(ir->type->fields.array);
> +
> + for (unsigned i = 0; i < ir->type->length; i++) {
> + ir->array_elements[i]->accept(this);
> + fs_reg src_reg = this->result;
> +
> + dst_reg.type = src_reg.type;
> + for (unsigned j = 0; j < size; j++) {
> + emit(MOV(dst_reg, src_reg));
> + src_reg = offset(src_reg, 1);
> + dst_reg = offset(dst_reg, 1);
> + }
> + }
> + } else if (ir->type->is_record()) {
> + foreach_in_list(ir_constant, field, &ir->components) {
> + const unsigned size = type_size(field->type);
> +
> + field->accept(this);
> + fs_reg src_reg = this->result;
> +
> + dst_reg.type = src_reg.type;
> + for (unsigned j = 0; j < size; j++) {
> + emit(MOV(dst_reg, src_reg));
> + src_reg = offset(src_reg, 1);
> + dst_reg = offset(dst_reg, 1);
> + }
> + }
> + } else {
> + const unsigned size = type_size(ir->type);
> +
> + for (unsigned i = 0; i < size; i++) {
> + switch (ir->type->base_type) {
> + case GLSL_TYPE_FLOAT:
> + emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
> + break;
> + case GLSL_TYPE_UINT:
> + emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
> + break;
> + case GLSL_TYPE_INT:
> + emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
> + break;
> + case GLSL_TYPE_BOOL:
> + emit(MOV(dst_reg,
> + fs_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
> + : 0)));
> + break;
> + default:
> + unreachable("Non-float/uint/int/bool constant");
> + }
> + dst_reg = offset(dst_reg, 1);
> + }
> + }
> +
> + this->result = reg;
> +}
> +
> +void
> +fs_god::emit_bool_to_cond_code(ir_rvalue *ir)
> +{
> + ir_expression *expr = ir->as_expression();
> +
> + if (!expr || expr->operation == ir_binop_ubo_load) {
> + ir->accept(this);
> +
> + fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + return;
> + }
> +
> + fs_reg op[3];
> +
> + assert(expr->get_num_operands() <= 3);
> + for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> + assert(expr->operands[i]->type->is_scalar());
> +
> + expr->operands[i]->accept(this);
> + op[i] = this->result;
> +
> + resolve_ud_negate(&op[i]);
> + }
> +
> + emit_bool_to_cond_code_of_reg(expr, op);
> +}
> +
> +void
> +fs_god::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
> +{
> + fs_inst *inst;
> +
> + switch (expr->operation) {
> + case ir_unop_logic_not:
> + inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
> + inst->conditional_mod = BRW_CONDITIONAL_Z;
> + break;
> +
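> + /* For the logic ops below on Gen4-5, only bit 0 of a boolean operand is
> + * reliable (comparison results only define the LSB there; see
> + * resolve_bool_comparison()), so compute the result into a temporary and
> + * test just that bit. On Gen6+ the logic op can set the condition code
> + * directly.
> + */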
> + case ir_binop_logic_xor:
> + if (brw->gen <= 5) {
> + fs_reg temp = vgrf(expr->type);
> + emit(XOR(temp, op[0], op[1]));
> + inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> + } else {
> + inst = emit(XOR(reg_null_d, op[0], op[1]));
> + }
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + break;
> +
> + case ir_binop_logic_or:
> + if (brw->gen <= 5) {
> + fs_reg temp = vgrf(expr->type);
> + emit(OR(temp, op[0], op[1]));
> + inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> + } else {
> + inst = emit(OR(reg_null_d, op[0], op[1]));
> + }
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + break;
> +
> + case ir_binop_logic_and:
> + if (brw->gen <= 5) {
> + fs_reg temp = vgrf(expr->type);
> + emit(AND(temp, op[0], op[1]));
> + inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> + } else {
> + inst = emit(AND(reg_null_d, op[0], op[1]));
> + }
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + break;
> +
> + case ir_unop_f2b:
> + if (brw->gen >= 6) {
> + emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> + } else {
> + inst = emit(MOV(reg_null_f, op[0]));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + }
> + break;
> +
> + case ir_unop_i2b:
> + if (brw->gen >= 6) {
> + emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> + } else {
> + inst = emit(MOV(reg_null_d, op[0]));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + }
> + break;
> +
> + case ir_binop_greater:
> + case ir_binop_gequal:
> + case ir_binop_less:
> + case ir_binop_lequal:
> + case ir_binop_equal:
> + case ir_binop_all_equal:
> + case ir_binop_nequal:
> + case ir_binop_any_nequal:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(expr->operands[0], &op[0]);
> + resolve_bool_comparison(expr->operands[1], &op[1]);
> + }
> +
> + emit(CMP(reg_null_d, op[0], op[1],
> + brw_conditional_for_comparison(expr->operation)));
> + break;
> +
> + case ir_triop_csel: {
> + /* Expand the boolean condition into the flag register. */
> + inst = emit(MOV(reg_null_d, op[0]));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +
> + /* Select which boolean to return. */
> + fs_reg temp = vgrf(expr->operands[1]->type);
> + inst = emit(SEL(temp, op[1], op[2]));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> +
> + /* Expand the result to a condition code. */
> + inst = emit(MOV(reg_null_d, temp));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + break;
> + }
> +
> + default:
> + unreachable("not reached");
> + }
> +}
> +
> +/**
> + * Emit a gen6 IF statement with the comparison folded into the IF
> + * instruction.
> + */
> +void
> +fs_god::emit_if_gen6(ir_if *ir)
> +{
> + ir_expression *expr = ir->condition->as_expression();
> +
> + if (expr && expr->operation != ir_binop_ubo_load) {
> + fs_reg op[3];
> + fs_inst *inst;
> + fs_reg temp;
> +
> + assert(expr->get_num_operands() <= 3);
> + for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> + assert(expr->operands[i]->type->is_scalar());
> +
> + expr->operands[i]->accept(this);
> + op[i] = this->result;
> + }
> +
> + switch (expr->operation) {
> + case ir_unop_logic_not:
> + emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
> + return;
> +
> + case ir_binop_logic_xor:
> + emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
> + return;
> +
> + case ir_binop_logic_or:
> + temp = vgrf(glsl_type::bool_type);
> + emit(OR(temp, op[0], op[1]));
> + emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> + return;
> +
> + case ir_binop_logic_and:
> + temp = vgrf(glsl_type::bool_type);
> + emit(AND(temp, op[0], op[1]));
> + emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> + return;
> +
> + case ir_unop_f2b:
> + inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + return;
> +
> + case ir_unop_i2b:
> + emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> + return;
> +
> + case ir_binop_greater:
> + case ir_binop_gequal:
> + case ir_binop_less:
> + case ir_binop_lequal:
> + case ir_binop_equal:
> + case ir_binop_all_equal:
> + case ir_binop_nequal:
> + case ir_binop_any_nequal:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(expr->operands[0], &op[0]);
> + resolve_bool_comparison(expr->operands[1], &op[1]);
> + }
> +
> + emit(IF(op[0], op[1],
> + brw_conditional_for_comparison(expr->operation)));
> + return;
> +
> + case ir_triop_csel: {
> + /* Expand the boolean condition into the flag register. */
> + fs_inst *inst = emit(MOV(reg_null_d, op[0]));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +
> + /* Select which boolean to use as the result. */
> + fs_reg temp = vgrf(expr->operands[1]->type);
> + inst = emit(SEL(temp, op[1], op[2]));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> +
> + emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> + return;
> + }
> +
> + default:
> + unreachable("not reached");
> + }
> + }
> +
> + ir->condition->accept(this);
> + emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
> +}
> +
> +bool
> +fs_god::try_opt_frontfacing_ternary(ir_if *ir)
> +{
> + ir_dereference_variable *deref = ir->condition->as_dereference_variable();
> + if (!deref || strcmp(deref->var->name, "gl_FrontFacing") != 0)
> + return false;
> +
> + if (ir->then_instructions.length() != 1 ||
> + ir->else_instructions.length() != 1)
> + return false;
> +
> + ir_assignment *then_assign =
> + ((ir_instruction *)ir->then_instructions.head)->as_assignment();
> + ir_assignment *else_assign =
> + ((ir_instruction *)ir->else_instructions.head)->as_assignment();
> +
> + if (!then_assign || then_assign->condition ||
> + !else_assign || else_assign->condition ||
> + then_assign->write_mask != else_assign->write_mask ||
> + !then_assign->lhs->equals(else_assign->lhs))
> + return false;
> +
> + ir_constant *then_rhs = then_assign->rhs->as_constant();
> + ir_constant *else_rhs = else_assign->rhs->as_constant();
> +
> + if (!then_rhs || !else_rhs)
> + return false;
> +
> + if (then_rhs->type->base_type != GLSL_TYPE_FLOAT)
> + return false;
> +
> + if ((then_rhs->is_one() && else_rhs->is_negative_one()) ||
> + (else_rhs->is_one() && then_rhs->is_negative_one())) {
> + then_assign->lhs->accept(this);
> + fs_reg dst = this->result;
> + dst.type = BRW_REGISTER_TYPE_D;
> + fs_reg tmp = vgrf(glsl_type::int_type);
> +
> + if (brw->gen >= 6) {
> + /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
> + fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
> +
> + /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
> + *
> + * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
> + * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
> + *
> + * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
> + */
> +
> + if (then_rhs->is_negative_one()) {
> + assert(else_rhs->is_one());
> + g0.negate = true;
> + }
> +
> + tmp.type = BRW_REGISTER_TYPE_W;
> + tmp.subreg_offset = 2;
> + tmp.stride = 2;
> +
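> + /* The OR below writes 0x3f80 into the high word of each dword of tmp,
> + * picking up the sign from bit 15 of g0.0; the final AND with 0xbf800000
> + * then masks the result down to exactly +1.0f or -1.0f.
> + */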
> + fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
> + or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
> +
> + tmp.type = BRW_REGISTER_TYPE_D;
> + tmp.subreg_offset = 0;
> + tmp.stride = 1;
> + } else {
> + /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
> + fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
> +
> + /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
> + *
> + * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D
> + * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
> + *
> + * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
> + */
> +
> + if (then_rhs->is_negative_one()) {
> + assert(else_rhs->is_one());
> + g1_6.negate = true;
> + }
> +
> + emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
> + }
> + emit(AND(dst, tmp, fs_reg(0xbf800000)));
> + return true;
> + }
> +
> + return false;
> +}
> +
> +/**
> + * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
> + *
> + * Many GLSL shaders contain the following pattern:
> + *
> + * x = condition ? foo : bar
> + *
> + * The compiler emits an ir_if tree for this, since each subexpression might be
> + * a complex tree that could have side-effects or short-circuit logic.
> + *
> + * However, the common case is to simply select one of two constants or
> + * variable values---which is exactly what SEL is for. In this case, the
> + * assembly looks like:
> + *
> + * (+f0) IF
> + * MOV dst src0
> + * ELSE
> + * MOV dst src1
> + * ENDIF
> + *
> + * which can be easily translated into:
> + *
> + * (+f0) SEL dst src0 src1
> + *
> + * If src0 is an immediate value, we promote it to a temporary GRF.
> + */
> +bool
> +fs_god::try_replace_with_sel()
> +{
> + fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
> + assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
> +
> + /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
> + int opcodes[] = {
> + BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
> + };
> +
> + fs_inst *match = (fs_inst *) endif_inst->prev;
> + for (int i = 0; i < 4; i++) {
> + if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
> + return false;
> + match = (fs_inst *) match->prev;
> + }
> +
> + /* The opcodes match; it looks like the right sequence of instructions. */
> + fs_inst *else_mov = (fs_inst *) endif_inst->prev;
> + fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
> + fs_inst *if_inst = (fs_inst *) then_mov->prev;
> +
> + /* Check that the MOVs are the right form. */
> + if (then_mov->dst.equals(else_mov->dst) &&
> + !then_mov->is_partial_write() &&
> + !else_mov->is_partial_write()) {
> +
> + /* Remove the matched instructions; we'll emit a SEL to replace them. */
> + while (!if_inst->next->is_tail_sentinel())
> + if_inst->next->exec_node::remove();
> + if_inst->exec_node::remove();
> +
> + /* Only the last source register can be a constant, so if the MOV in
> + * the "then" clause uses a constant, we need to put it in a temporary.
> + */
> + fs_reg src0(then_mov->src[0]);
> + if (src0.file == IMM) {
> + src0 = vgrf(glsl_type::float_type);
> + src0.type = then_mov->src[0].type;
> + emit(MOV(src0, then_mov->src[0]));
> + }
> +
> + fs_inst *sel;
> + if (if_inst->conditional_mod) {
> + /* Sandybridge-specific IF with embedded comparison */
> + emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
> + if_inst->conditional_mod));
> + sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
> + sel->predicate = BRW_PREDICATE_NORMAL;
> + } else {
> + /* Separate CMP and IF instructions */
> + sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
> + sel->predicate = if_inst->predicate;
> + sel->predicate_inverse = if_inst->predicate_inverse;
> + }
> +
> + return true;
> + }
> +
> + return false;
> +}
> +
> +void
> +fs_god::visit(ir_if *ir)
> +{
> + if (try_opt_frontfacing_ternary(ir))
> + return;
> +
> + /* Don't point the annotation at the if statement, because then it plus
> + * the then and else blocks get printed.
> + */
> + this->base_ir = ir->condition;
> +
> + if (brw->gen == 6) {
> + emit_if_gen6(ir);
> + } else {
> + emit_bool_to_cond_code(ir->condition);
> +
> + emit(IF(BRW_PREDICATE_NORMAL));
> + }
> +
> + foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
> + this->base_ir = ir_;
> + ir_->accept(this);
> + }
> +
> + if (!ir->else_instructions.is_empty()) {
> + emit(BRW_OPCODE_ELSE);
> +
> + foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
> + this->base_ir = ir_;
> + ir_->accept(this);
> + }
> + }
> +
> + emit(BRW_OPCODE_ENDIF);
> +
> + if (!try_replace_with_sel() && brw->gen < 6) {
> + no16("Can't support (non-uniform) control flow on SIMD16\n");
> + }
> +}
> +
> +void
> +fs_god::visit(ir_loop *ir)
> +{
> + if (brw->gen < 6) {
> + no16("Can't support (non-uniform) control flow on SIMD16\n");
> + }
> +
> + this->base_ir = NULL;
> + emit(BRW_OPCODE_DO);
> +
> + foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
> + this->base_ir = ir_;
> + ir_->accept(this);
> + }
> +
> + this->base_ir = NULL;
> + emit(BRW_OPCODE_WHILE);
> +}
> +
> +void
> +fs_god::visit(ir_loop_jump *ir)
> +{
> + switch (ir->mode) {
> + case ir_loop_jump::jump_break:
> + emit(BRW_OPCODE_BREAK);
> + break;
> + case ir_loop_jump::jump_continue:
> + emit(BRW_OPCODE_CONTINUE);
> + break;
> + }
> +}
> +
> +void
> +fs_god::visit_atomic_counter_intrinsic(ir_call *ir)
> +{
> + ir_dereference *deref = static_cast<ir_dereference *>(
> + ir->actual_parameters.get_head());
> + ir_variable *location = deref->variable_referenced();
> + unsigned surf_index = (stage_prog_data->binding_table.abo_start +
> + location->data.binding);
> +
> + /* Calculate the surface offset */
> + fs_reg offset = vgrf(glsl_type::uint_type);
> + ir_dereference_array *deref_array = deref->as_dereference_array();
> +
> + if (deref_array) {
> + deref_array->array_index->accept(this);
> +
> + fs_reg tmp = vgrf(glsl_type::uint_type);
> + emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
> + emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
> + } else {
> + offset = fs_reg(location->data.atomic.offset);
> + }
> +
> + /* Emit the appropriate machine instruction */
> + const char *callee = ir->callee->function_name();
> + ir->return_deref->accept(this);
> + fs_reg dst = this->result;
> +
> + if (!strcmp("__intrinsic_atomic_read", callee)) {
> + emit_untyped_surface_read(surf_index, dst, offset);
> +
> + } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
> + emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
> + fs_reg(), fs_reg());
> +
> + } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
> + emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
> + fs_reg(), fs_reg());
> + }
> +}
> +
> +void
> +fs_god::visit(ir_call *ir)
> +{
> + const char *callee = ir->callee->function_name();
> +
> + if (!strcmp("__intrinsic_atomic_read", callee) ||
> + !strcmp("__intrinsic_atomic_increment", callee) ||
> + !strcmp("__intrinsic_atomic_predecrement", callee)) {
> + visit_atomic_counter_intrinsic(ir);
> + } else {
> + unreachable("Unsupported intrinsic.");
> + }
> +}
> +
> +void
> +fs_god::visit(ir_return *)
> +{
> + unreachable("FINISHME");
> +}
> +
> +void
> +fs_god::visit(ir_function *ir)
> +{
> + /* Ignore function bodies other than main() -- we shouldn't see calls to
> + * them since they should all be inlined before we get to ir_to_mesa.
> + */
> + if (strcmp(ir->name, "main") == 0) {
> + const ir_function_signature *sig;
> + exec_list empty;
> +
> + sig = ir->matching_signature(NULL, &empty, false);
> +
> + assert(sig);
> +
> + foreach_in_list(ir_instruction, ir_, &sig->body) {
> + this->base_ir = ir_;
> + ir_->accept(this);
> + }
> + }
> +}
> +
> +void
> +fs_god::visit(ir_function_signature *)
> +{
> + unreachable("not reached");
> +}
> +
> +void
> +fs_god::visit(ir_emit_vertex *)
> +{
> + unreachable("not reached");
> +}
> +
> +void
> +fs_god::visit(ir_end_primitive *)
> +{
> + unreachable("not reached");
> +}
> +
> +void
> +fs_god::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
> + fs_reg dst, fs_reg offset, fs_reg src0,
> + fs_reg src1)
> +{
> + int reg_width = dispatch_width / 8;
> + int length = 0;
> +
> + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
> +
> + sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> + /* Initialize the sample mask in the message header. */
> + emit(MOV(sources[0], fs_reg(0u)))
> + ->force_writemask_all = true;
> +
> + if (stage == MESA_SHADER_FRAGMENT) {
> + if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
> + emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
> + ->force_writemask_all = true;
> + } else {
> + emit(MOV(component(sources[0], 7),
> + retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
> + ->force_writemask_all = true;
> + }
> + } else {
> + /* The execution mask is part of the side-band information sent together with
> + * the message payload to the data port. It's implicitly ANDed with the sample
> + * mask sent in the header to compute the actual set of channels that execute
> + * the atomic operation.
> + */
> + assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
> + emit(MOV(component(sources[0], 7),
> + fs_reg(0xffffu)))->force_writemask_all = true;
> + }
> + length++;
> +
> + /* Set the atomic operation offset. */
> + sources[1] = vgrf(glsl_type::uint_type);
> + emit(MOV(sources[1], offset));
> + length++;
> +
> + /* Set the atomic operation arguments. */
> + if (src0.file != BAD_FILE) {
> + sources[length] = vgrf(glsl_type::uint_type);
> + emit(MOV(sources[length], src0));
> + length++;
> + }
> +
> + if (src1.file != BAD_FILE) {
> + sources[length] = vgrf(glsl_type::uint_type);
> + emit(MOV(sources[length], src1));
> + length++;
> + }
> +
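> + /* The header always occupies a single register; every other payload
> + * element takes reg_width registers.
> + */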
> + int mlen = 1 + (length - 1) * reg_width;
> + fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> + BRW_REGISTER_TYPE_UD);
> + emit(LOAD_PAYLOAD(src_payload, sources, length));
> +
> + /* Emit the instruction. */
> + fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
> + fs_reg(atomic_op), fs_reg(surf_index));
> + inst->mlen = mlen;
> +}
> +
> +void
> +fs_god::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
> + fs_reg offset)
> +{
> + int reg_width = dispatch_width / 8;
> +
> + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
> +
> + sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> + /* Initialize the sample mask in the message header. */
> + emit(MOV(sources[0], fs_reg(0u)))
> + ->force_writemask_all = true;
> +
> + if (stage == MESA_SHADER_FRAGMENT) {
> + if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
> + emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
> + ->force_writemask_all = true;
> + } else {
> + emit(MOV(component(sources[0], 7),
> + retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
> + ->force_writemask_all = true;
> + }
> + } else {
> + /* The execution mask is part of the side-band information sent together with
> + * the message payload to the data port. It's implicitly ANDed with the sample
> + * mask sent in the header to compute the actual set of channels that execute
> + * the surface read.
> + */
> + assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
> + emit(MOV(component(sources[0], 7),
> + fs_reg(0xffffu)))->force_writemask_all = true;
> + }
> +
> + /* Set the surface read offset. */
> + sources[1] = vgrf(glsl_type::uint_type);
> + emit(MOV(sources[1], offset));
> +
> + int mlen = 1 + reg_width;
> + fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> + BRW_REGISTER_TYPE_UD);
> + fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2));
> +
> + /* Emit the instruction. */
> + inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
> + fs_reg(surf_index));
> + inst->mlen = mlen;
> +}
> +
> +fs_inst *
> +fs_god::emit(fs_inst *inst)
> +{
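> + /* An 8-wide instruction emitted while building the SIMD16 program must
> + * stay uncompressed. Tag the instruction with the current annotation and
> + * source IR for debug output, then append it to the instruction stream.
> + */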
> + if (dispatch_width == 16 && inst->exec_size == 8)
> + inst->force_uncompressed = true;
> +
> + inst->annotation = this->current_annotation;
> + inst->ir = this->base_ir;
> +
> + this->instructions.push_tail(inst);
> +
> + return inst;
> +}
> +
> +void
> +fs_god::emit(exec_list list)
> +{
> + foreach_in_list_safe(fs_inst, inst, &list) {
> + inst->exec_node::remove();
> + emit(inst);
> + }
> +}
> +
> +/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
> +void
> +fs_god::emit_dummy_fs()
> +{
> + int reg_width = dispatch_width / 8;
> +
> + /* Everyone's favorite color. */
> + const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
> + for (int i = 0; i < 4; i++) {
> + emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
> + dispatch_width), fs_reg(color[i])));
> + }
> +
> + fs_inst *write;
> + write = emit(FS_OPCODE_FB_WRITE);
> + write->eot = true;
> + if (brw->gen >= 6) {
> + write->base_mrf = 2;
> + write->mlen = 4 * reg_width;
> + } else {
> + write->header_present = true;
> + write->base_mrf = 0;
> + write->mlen = 2 + 4 * reg_width;
> + }
> +
> + /* Tell the SF we don't have any inputs. Gen4-5 require at least one
> + * varying to avoid GPU hangs, so set that.
> + */
> + brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
> + wm_prog_data->num_varying_inputs = brw->gen < 6 ? 1 : 0;
> + memset(wm_prog_data->urb_setup, -1,
> + sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
> +
> + /* We don't have any uniforms. */
> + stage_prog_data->nr_params = 0;
> + stage_prog_data->nr_pull_params = 0;
> + stage_prog_data->curb_read_length = 0;
> + stage_prog_data->dispatch_grf_start_reg = 2;
> + wm_prog_data->dispatch_grf_start_reg_16 = 2;
> + grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
> +
> + calculate_cfg();
> +}
> +
> +/* The register location here is relative to the start of the URB
> + * data. It will get adjusted to be a real location before
> + * generate_code() time.
> + */
> +struct brw_reg
> +fs_god::interp_reg(int location, int channel)
> +{
> + assert(stage == MESA_SHADER_FRAGMENT);
> + brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> + int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
> + int stride = (channel & 1) * 4;
> +
> + assert(prog_data->urb_setup[location] != -1);
> +
> + return brw_vec1_grf(regnr, stride);
> +}
> +
> +/** Emits the interpolation for the varying inputs. */
> +void
> +fs_god::emit_interpolation_setup_gen4()
> +{
> + this->current_annotation = "compute pixel centers";
> + this->pixel_x = vgrf(glsl_type::uint_type);
> + this->pixel_y = vgrf(glsl_type::uint_type);
> + this->pixel_x.type = BRW_REGISTER_TYPE_UW;
> + this->pixel_y.type = BRW_REGISTER_TYPE_UW;
> +
> + emit(FS_OPCODE_PIXEL_X, this->pixel_x);
> + emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
> +
> + this->current_annotation = "compute pixel deltas from v0";
> + if (brw->has_pln) {
> + this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> + vgrf(glsl_type::vec2_type);
> + this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> + offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1);
> + } else {
> + this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> + vgrf(glsl_type::float_type);
> + this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> + vgrf(glsl_type::float_type);
> + }
> + emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> + this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
> + emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> + this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
> +
> + this->current_annotation = "compute pos.w and 1/pos.w";
> + /* Compute wpos.w. It's always in our setup, since it's needed to
> + * interpolate the other attributes.
> + */
> + this->wpos_w = vgrf(glsl_type::float_type);
> + emit(FS_OPCODE_LINTERP, wpos_w,
> + this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> + this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> + interp_reg(VARYING_SLOT_POS, 3));
> + /* Compute the pixel 1/W value from wpos.w. */
> + this->pixel_w = vgrf(glsl_type::float_type);
> + emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
> + this->current_annotation = NULL;
> +}
> +
> +/** Emits the interpolation for the varying inputs. */
> +void
> +fs_god::emit_interpolation_setup_gen6()
> +{
> + struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
> +
> + /* If the pixel centers end up used, the setup is the same as for gen4. */
> + this->current_annotation = "compute pixel centers";
> + fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
> + fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
> + int_pixel_x.type = BRW_REGISTER_TYPE_UW;
> + int_pixel_y.type = BRW_REGISTER_TYPE_UW;
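> + /* g1 holds the X/Y coordinates of each 2x2 subspan; the vector
> + * immediates add the per-pixel offsets (0,1,0,1,... in X and
> + * 0,0,1,1,... in Y) to produce per-pixel integer centers.
> + */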
> + emit(ADD(int_pixel_x,
> + fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
> + fs_reg(brw_imm_v(0x10101010))));
> + emit(ADD(int_pixel_y,
> + fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
> + fs_reg(brw_imm_v(0x11001100))));
> +
> + /* As of gen6, we can no longer mix float and int sources. We have
> + * to turn the integer pixel centers into floats for their actual
> + * use.
> + */
> + this->pixel_x = vgrf(glsl_type::float_type);
> + this->pixel_y = vgrf(glsl_type::float_type);
> + emit(MOV(this->pixel_x, int_pixel_x));
> + emit(MOV(this->pixel_y, int_pixel_y));
> +
> + this->current_annotation = "compute pos.w";
> + this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
> + this->wpos_w = vgrf(glsl_type::float_type);
> + emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
> +
> + for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
> + uint8_t reg = payload.barycentric_coord_reg[i];
> + this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
> + this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
> + }
> +
> + this->current_annotation = NULL;
> +}
> +
> +int
> +fs_god::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
> + bool use_2nd_half)
> +{
> + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> + fs_inst *inst;
> +
> + if (color.file == BAD_FILE) {
> + return 4 * (dispatch_width / 8);
> + }
> +
> + uint8_t colors_enabled;
> + if (components == 0) {
> + /* We want to write one component to the alpha channel */
> + colors_enabled = 0x8;
> + } else {
> + /* Enable the first components-many channels */
> + colors_enabled = (1 << components) - 1;
> + }
> +
> + if (dispatch_width == 8 || (brw->gen >= 6 && !do_dual_src)) {
> + /* SIMD8 write looks like:
> + * m + 0: r0
> + * m + 1: r1
> + * m + 2: g0
> + * m + 3: g1
> + *
> + * gen6 SIMD16 DP write looks like:
> + * m + 0: r0
> + * m + 1: r1
> + * m + 2: g0
> + * m + 3: g1
> + * m + 4: b0
> + * m + 5: b1
> + * m + 6: a0
> + * m + 7: a1
> + */
> + int len = 0;
> + for (unsigned i = 0; i < 4; ++i) {
> + if (colors_enabled & (1 << i)) {
> + dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8),
> + color.type, color.width);
> + inst = emit(MOV(dst[len], offset(color, i)));
> + inst->saturate = key->clamp_fragment_color;
> + } else if (color.width == 16) {
> + /* We need two BAD_FILE slots for a 16-wide color */
> + len++;
> + }
> + len++;
> + }
> + return len;
> + } else if (brw->gen >= 6 && do_dual_src) {
> + /* SIMD16 dual source blending for gen6+.
> + *
> + * From the SNB PRM, volume 4, part 1, page 193:
> + *
> + * "The dual source render target messages only have SIMD8 forms due to
> + * maximum message length limitations. SIMD16 pixel shaders must send two
> + * of these messages to cover all of the pixels. Each message contains
> + * two colors (4 channels each) for each pixel in the message payload."
> + *
> + * So in SIMD16 dual source blending we send two SIMD8 messages;
> + * each one calls this function twice (once for each color involved),
> + * so in each pass we only write 4 registers. Notice that the second
> + * SIMD8 message needs to read color data from the 2nd half of the color
> + * registers, so it needs to call this with use_2nd_half = true.
> + */
> + for (unsigned i = 0; i < 4; ++i) {
> + if (colors_enabled & (1 << i)) {
> + dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
> + inst = emit(MOV(dst[i], half(offset(color, i),
> + use_2nd_half ? 1 : 0)));
> + inst->saturate = key->clamp_fragment_color;
> + if (use_2nd_half)
> + inst->force_sechalf = true;
> + }
> + }
> + return 4;
> + } else {
> + /* pre-gen6 SIMD16 single source DP write looks like:
> + * m + 0: r0
> + * m + 1: g0
> + * m + 2: b0
> + * m + 3: a0
> + * m + 4: r1
> + * m + 5: g1
> + * m + 6: b1
> + * m + 7: a1
> + */
> + for (unsigned i = 0; i < 4; ++i) {
> + if (colors_enabled & (1 << i)) {
> + dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
> + inst = emit(MOV(dst[i], half(offset(color, i), 0)));
> + inst->saturate = key->clamp_fragment_color;
> +
> + dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type);
> + inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
> + inst->saturate = key->clamp_fragment_color;
> + inst->force_sechalf = true;
> + }
> + }
> + return 8;
> + }
> +}
> +
> +static enum brw_conditional_mod
> +cond_for_alpha_func(GLenum func)
> +{
> + switch(func) {
> + case GL_GREATER:
> + return BRW_CONDITIONAL_G;
> + case GL_GEQUAL:
> + return BRW_CONDITIONAL_GE;
> + case GL_LESS:
> + return BRW_CONDITIONAL_L;
> + case GL_LEQUAL:
> + return BRW_CONDITIONAL_LE;
> + case GL_EQUAL:
> + return BRW_CONDITIONAL_EQ;
> + case GL_NOTEQUAL:
> + return BRW_CONDITIONAL_NEQ;
> + default:
> + unreachable("Not reached");
> + }
> +}
> +
> +/**
> + * Alpha test support for when we compile it into the shader instead
> + * of using the normal fixed-function alpha test.
> + */
> +void
> +fs_god::emit_alpha_test()
> +{
> + assert(stage == MESA_SHADER_FRAGMENT);
> + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> + this->current_annotation = "Alpha test";
> +
> + fs_inst *cmp;
> + if (key->alpha_test_func == GL_ALWAYS)
> + return;
> +
> + if (key->alpha_test_func == GL_NEVER) {
> + /* f0.1 = 0 */
> + fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
> + BRW_REGISTER_TYPE_UW));
> + cmp = emit(CMP(reg_null_f, some_reg, some_reg,
> + BRW_CONDITIONAL_NEQ));
> + } else {
> + /* RT0 alpha */
> + fs_reg color = offset(outputs[0], 3);
> +
> + /* f0.1 &= func(color, ref) */
> + cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
> + cond_for_alpha_func(key->alpha_test_func)));
> + }
> + cmp->predicate = BRW_PREDICATE_NORMAL;
> + cmp->flag_subreg = 1;
> +}
> +
> +fs_inst *
> +fs_god::emit_single_fb_write(fs_reg color0, fs_reg color1,
> + fs_reg src0_alpha, unsigned components,
> + bool use_2nd_half)
> +{
> + assert(stage == MESA_SHADER_FRAGMENT);
> + brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> +
> + this->current_annotation = "FB write header";
> + bool header_present = true;
> + int reg_size = dispatch_width / 8;
> +
> + /* We can potentially have a message length of up to 15, so we have to set
> + * base_mrf to either 0 or 1 in order to fit in m0..m15.
> + */
> + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
> + int length = 0;
> +
> + /* From the Sandy Bridge PRM, volume 4, page 198:
> + *
> + * "Dispatched Pixel Enables. One bit per pixel indicating
> + * which pixels were originally enabled when the thread was
> + * dispatched. This field is only required for the end-of-
> + * thread message and on all dual-source messages."
> + */
> + if (brw->gen >= 6 &&
> + (brw->is_haswell || brw->gen >= 8 || !prog_data->uses_kill) &&
> + color1.file == BAD_FILE &&
> + key->nr_color_regions == 1) {
> + header_present = false;
> + }
> +
> + if (header_present)
> + /* Allocate 2 registers for a header */
> + length += 2;
> +
> + if (payload.aa_dest_stencil_reg) {
> + sources[length] = fs_reg(GRF, alloc.allocate(1));
> + emit(MOV(sources[length],
> + fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
> + length++;
> + }
> +
> + prog_data->uses_omask =
> + prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
> + if (prog_data->uses_omask) {
> + this->current_annotation = "FB write oMask";
> + assert(this->sample_mask.file != BAD_FILE);
> + /* Hand over gl_SampleMask. Only lower 16 bits are relevant. Since
> + * it's unsigned single words, one vgrf is always 16-wide.
> + */
> + sources[length] = fs_reg(GRF, alloc.allocate(1),
> + BRW_REGISTER_TYPE_UW, 16);
> + emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
> + length++;
> + }
> +
> + if (color0.file == BAD_FILE) {
> + /* Even if there are no color buffers enabled, we still need to send
> + * alpha out the pipeline to our null renderbuffer to support
> + * alpha-testing, alpha-to-coverage, and so on.
> + */
> + length += setup_color_payload(sources + length, this->outputs[0], 0,
> + false);
> + } else if (color1.file == BAD_FILE) {
> + if (src0_alpha.file != BAD_FILE) {
> + sources[length] = fs_reg(GRF, alloc.allocate(reg_size),
> + src0_alpha.type, src0_alpha.width);
> + fs_inst *inst = emit(MOV(sources[length], src0_alpha));
> + inst->saturate = key->clamp_fragment_color;
> + length++;
> + }
> +
> + length += setup_color_payload(sources + length, color0, components,
> + false);
> + } else {
> + length += setup_color_payload(sources + length, color0, components,
> + use_2nd_half);
> + length += setup_color_payload(sources + length, color1, components,
> + use_2nd_half);
> + }
> +
> + if (source_depth_to_render_target) {
> + if (brw->gen == 6) {
> + /* For outputting oDepth on gen6, SIMD8 writes have to be
> + * used. This would require SIMD8 moves of each half to
> + * message regs, kind of like pre-gen5 SIMD16 FB writes.
> + * Just bail on doing so for now.
> + */
> + no16("Missing support for simd16 depth writes on gen6\n");
> + }
> +
> + sources[length] = vgrf(glsl_type::float_type);
> + if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
> + /* Hand over gl_FragDepth. */
> + assert(this->frag_depth.file != BAD_FILE);
> + emit(MOV(sources[length], this->frag_depth));
> + } else {
> + /* Pass through the payload depth. */
> + emit(MOV(sources[length],
> + fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
> + }
> + length++;
> + }
> +
> + if (payload.dest_depth_reg) {
> + sources[length] = vgrf(glsl_type::float_type);
> + emit(MOV(sources[length],
> + fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
> + length++;
> + }
> +
> + fs_inst *load;
> + fs_inst *write;
> + if (brw->gen >= 7) {
> + /* Send from the GRF */
> + fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
> + load = emit(LOAD_PAYLOAD(payload, sources, length));
> + payload.reg = alloc.allocate(load->regs_written);
> + payload.width = dispatch_width;
> + load->dst = payload;
> + write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
> + write->base_mrf = -1;
> + } else {
> + /* Send from the MRF */
> + load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
> + sources, length));
> + write = emit(FS_OPCODE_FB_WRITE);
> + write->exec_size = dispatch_width;
> + write->base_mrf = 1;
> + }
> +
> + write->mlen = load->regs_written;
> + write->header_present = header_present;
> + if (prog_data->uses_kill) {
> + write->predicate = BRW_PREDICATE_NORMAL;
> + write->flag_subreg = 1;
> + }
> + return write;
> +}
> +
> +void
> +fs_god::emit_fb_writes()
> +{
> + assert(stage == MESA_SHADER_FRAGMENT);
> + brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> +
> + fs_inst *inst = NULL;
> + if (do_dual_src) {
> + this->current_annotation = ralloc_asprintf(this->mem_ctx,
> + "FB dual-source write");
> + inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
> + reg_undef, 4);
> + inst->target = 0;
> +
> + /* SIMD16 dual source blending requires sending two SIMD8 dual source
> + * messages, where each message contains color data for 8 pixels. Color
> + * data for the first group of pixels is stored in the "lower" half of
> + * the color registers, so in SIMD16, the previous message did:
> + * m + 0: r0
> + * m + 1: g0
> + * m + 2: b0
> + * m + 3: a0
> + *
> + * Here goes the second message, which packs color data for the
> + * remaining 8 pixels. Color data for these pixels is stored in the
> + * "upper" half of the color registers, so we need to do:
> + * m + 0: r1
> + * m + 1: g1
> + * m + 2: b1
> + * m + 3: a1
> + */
> + if (dispatch_width == 16) {
> + inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
> + reg_undef, 4, true);
> + inst->target = 0;
> + }
> +
> + prog_data->dual_src_blend = true;
> + } else {
> + for (int target = 0; target < key->nr_color_regions; target++) {
> + /* Skip over outputs that weren't written. */
> + if (this->outputs[target].file == BAD_FILE)
> + continue;
> +
> + this->current_annotation = ralloc_asprintf(this->mem_ctx,
> + "FB write target %d",
> + target);
> + fs_reg src0_alpha;
> + if (brw->gen >= 6 && key->replicate_alpha && target != 0)
> + src0_alpha = offset(outputs[0], 3);
> +
> + inst = emit_single_fb_write(this->outputs[target], reg_undef,
> + src0_alpha,
> + this->output_components[target]);
> + inst->target = target;
> + }
> + }
> +
> + if (inst == NULL) {
> + /* Even if there are no color buffers enabled, we still need to send
> + * alpha out the pipeline to our null renderbuffer to support
> + * alpha-testing, alpha-to-coverage, and so on.
> + */
> + inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0);
> + inst->target = 0;
> + }
> +
> + inst->eot = true;
> + this->current_annotation = NULL;
> +}
> +
> +void
> +fs_god::setup_uniform_clipplane_values()
> +{
> + gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
> + const struct brw_vue_prog_key *key =
> + (const struct brw_vue_prog_key *) this->key;
> +
> + for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
> + this->userplane[i] = fs_reg(UNIFORM, uniforms);
> + for (int j = 0; j < 4; ++j) {
> + stage_prog_data->param[uniforms + j] =
> + (gl_constant_value *) &clip_planes[i][j];
> + }
> + uniforms += 4;
> + }
> +}
> +
> +void fs_god::compute_clip_distance()
> +{
> + struct brw_vue_prog_data *vue_prog_data =
> + (struct brw_vue_prog_data *) prog_data;
> + const struct brw_vue_prog_key *key =
> + (const struct brw_vue_prog_key *) this->key;
> +
> + /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
> + *
> + * "If a linked set of shaders forming the vertex stage contains no
> + * static write to gl_ClipVertex or gl_ClipDistance, but the
> + * application has requested clipping against user clip planes through
> + * the API, then the coordinate written to gl_Position is used for
> + * comparison against the user clip planes."
> + *
> + * This function is only called if the shader didn't write to
> + * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
> + * if the user wrote to it; otherwise we use gl_Position.
> + */
> +
> + gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
> + if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
> + clip_vertex = VARYING_SLOT_POS;
> +
> + /* If the clip vertex isn't written, skip this. Typically this means
> + * the GS will set up clipping. */
> + if (outputs[clip_vertex].file == BAD_FILE)
> + return;
> +
> + setup_uniform_clipplane_values();
> +
> + current_annotation = "user clip distances";
> +
> + this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
> + this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
> +
> + for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
> + fs_reg u = userplane[i];
> + fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
> + output.reg_offset = i & 3;
> +
> + emit(MUL(output, outputs[clip_vertex], u));
> + for (int j = 1; j < 4; j++) {
> + u.reg = userplane[i].reg + j;
> + emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
> + }
> + }
> +}
> +
> +void
> +fs_god::emit_urb_writes()
> +{
> + int slot, urb_offset, length;
> + struct brw_vs_prog_data *vs_prog_data =
> + (struct brw_vs_prog_data *) prog_data;
> + const struct brw_vs_prog_key *key =
> + (const struct brw_vs_prog_key *) this->key;
> + const GLbitfield64 psiz_mask =
> + VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
> + const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
> + bool flush;
> + fs_reg sources[8];
> +
> + /* Lower legacy ff and ClipVertex clipping to clip distances */
> + if (key->base.userclip_active && !prog->UsesClipDistanceOut)
> + compute_clip_distance();
> +
> + /* If we don't have any valid slots to write, just do a minimal urb write
> + * send to terminate the shader. */
> + if (vue_map->slots_valid == 0) {
> +
> + fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> + fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
> + BRW_REGISTER_TYPE_UD))));
> + inst->force_writemask_all = true;
> +
> + inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> + inst->eot = true;
> + inst->mlen = 1;
> + inst->offset = 1;
> + return;
> + }
> +
> + length = 0;
> + urb_offset = 0;
> + flush = false;
> + for (slot = 0; slot < vue_map->num_slots; slot++) {
> + fs_reg reg, src, zero;
> +
> + int varying = vue_map->slot_to_varying[slot];
> + switch (varying) {
> + case VARYING_SLOT_PSIZ:
> +
> + /* The point size varying slot is the vue header and is always in the
> + * vue map. But often none of the special varyings that live there
> + * are written and in that case we can skip writing to the vue
> + * header, provided the corresponding state properly clamps the
> + * values further down the pipeline. */
> + if ((vue_map->slots_valid & psiz_mask) == 0) {
> + assert(length == 0);
> + urb_offset++;
> + break;
> + }
> +
> + zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> + emit(MOV(zero, fs_reg(0u)));
> +
> + sources[length++] = zero;
> + if (vue_map->slots_valid & VARYING_BIT_LAYER)
> + sources[length++] = this->outputs[VARYING_SLOT_LAYER];
> + else
> + sources[length++] = zero;
> +
> + if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
> + sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
> + else
> + sources[length++] = zero;
> +
> + if (vue_map->slots_valid & VARYING_BIT_PSIZ)
> + sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
> + else
> + sources[length++] = zero;
> + break;
> +
> + case BRW_VARYING_SLOT_NDC:
> + case VARYING_SLOT_EDGE:
> + unreachable("unexpected scalar vs output");
> + break;
> +
> + case BRW_VARYING_SLOT_PAD:
> + break;
> +
> + default:
> + /* gl_Position is always in the vue map, but isn't always written by
> + * the shader. Other varyings (clip distances) get added to the vue
> + * map but don't always get written. In those cases, the
> + * corresponding this->outputs[] slot will be invalid and we can skip
> + * the urb write for the varying. If we've already queued up a vue
> + * slot for writing we flush a mlen 5 urb write, otherwise we just
> + * advance the urb_offset.
> + */
> + if (this->outputs[varying].file == BAD_FILE) {
> + if (length > 0)
> + flush = true;
> + else
> + urb_offset++;
> + break;
> + }
> +
> + if ((varying == VARYING_SLOT_COL0 ||
> + varying == VARYING_SLOT_COL1 ||
> + varying == VARYING_SLOT_BFC0 ||
> + varying == VARYING_SLOT_BFC1) &&
> + key->clamp_vertex_color) {
> + /* We need to clamp these guys, so do a saturating MOV into a
> + * temp register and use that for the payload.
> + */
> + for (int i = 0; i < 4; i++) {
> + reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
> + src = offset(this->outputs[varying], i);
> + fs_inst *inst = emit(MOV(reg, src));
> + inst->saturate = true;
> + sources[length++] = reg;
> + }
> + } else {
> + for (int i = 0; i < 4; i++)
> + sources[length++] = offset(this->outputs[varying], i);
> + }
> + break;
> + }
> +
> + current_annotation = "URB write";
> +
> + /* If we've queued up 8 registers of payload (2 VUE slots), if this is
> + * the last slot or if we need to flush (see BAD_FILE varying case
> + * above), emit a URB write send now to flush out the data.
> + */
> + int last = slot == vue_map->num_slots - 1;
> + if (length == 8 || last)
> + flush = true;
> + if (flush) {
> + fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
> + fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
> + BRW_REGISTER_TYPE_F);
> +
> + /* We need WE_all on the MOV for the message header (the URB handles)
> + * so do a MOV to a dummy register and set force_writemask_all on the
> + * MOV. LOAD_PAYLOAD will preserve that.
> + */
> + fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
> + BRW_REGISTER_TYPE_UD);
> + fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
> + BRW_REGISTER_TYPE_UD))));
> + inst->force_writemask_all = true;
> + payload_sources[0] = dummy;
> +
> + memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
> + emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
> +
> + inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> + inst->eot = last;
> + inst->mlen = length + 1;
> + inst->offset = urb_offset;
> + urb_offset = slot + 1;
> + length = 0;
> + flush = false;
> + }
> + }
> +}
> +
> +void
> +fs_god::resolve_ud_negate(fs_reg *reg)
> +{
> + if (reg->type != BRW_REGISTER_TYPE_UD ||
> + !reg->negate)
> + return;
> +
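> + /* Fold the negation into a temporary with an explicit MOV so that later
> + * uses of the register don't carry a negate modifier on a UD operand.
> + */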
> + fs_reg temp = vgrf(glsl_type::uint_type);
> + emit(MOV(temp, *reg));
> + *reg = temp;
> +}
> +
> +/**
> + * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
> + *
> + * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
> + * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
> + */
> +void
> +fs_god::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
> +{
> + assert(brw->gen <= 5);
> +
> + if (rvalue->type != glsl_type::bool_type)
> + return;
> +
> + fs_reg and_result = vgrf(glsl_type::bool_type);
> + fs_reg neg_result = vgrf(glsl_type::bool_type);
> + emit(AND(and_result, *reg, fs_reg(1)));
> + emit(MOV(neg_result, negate(and_result)));
> + *reg = neg_result;
> +}
> +
> +fs_god::fs_god(struct brw_context *brw,
> + void *mem_ctx,
> + const struct brw_wm_prog_key *key,
> + struct brw_wm_prog_data *prog_data,
> + struct gl_shader_program *shader_prog,
> + struct gl_fragment_program *fp,
> + unsigned dispatch_width)
> + : backend_god(brw, shader_prog, &fp->Base, &prog_data->base,
> + MESA_SHADER_FRAGMENT),
> + reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
> + reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
> + reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
> + key(key), prog_data(&prog_data->base),
> + dispatch_width(dispatch_width), promoted_constants(0)
> +{
> + this->mem_ctx = mem_ctx;
> + init();
> +}
> +
> +fs_god::fs_god(struct brw_context *brw,
> + void *mem_ctx,
> + const struct brw_vs_prog_key *key,
> + struct brw_vs_prog_data *prog_data,
> + struct gl_shader_program *shader_prog,
> + struct gl_vertex_program *cp,
> + unsigned dispatch_width)
> + : backend_god(brw, shader_prog, &cp->Base, &prog_data->base.base,
> + MESA_SHADER_VERTEX),
> + reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
> + reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
> + reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
> + key(key), prog_data(&prog_data->base.base),
> + dispatch_width(dispatch_width), promoted_constants(0)
> +{
> + this->mem_ctx = mem_ctx;
> + init();
> +}
> +
> +void
> +fs_god::init()
> +{
> + switch (stage) {
> + case MESA_SHADER_FRAGMENT:
> + key_tex = &((const brw_wm_prog_key *) key)->tex;
> + break;
> + case MESA_SHADER_VERTEX:
> + case MESA_SHADER_GEOMETRY:
> + key_tex = &((const brw_vue_prog_key *) key)->tex;
> + break;
> + default:
> + unreachable("unhandled shader stage");
> + }
> +
> + this->failed = false;
> + this->simd16_unsupported = false;
> + this->no16_msg = NULL;
> + this->variable_ht = hash_table_ctor(0,
> + hash_table_pointer_hash,
> + hash_table_pointer_compare);
> +
> + this->nir_locals = NULL;
> + this->nir_globals = NULL;
> +
> + memset(&this->payload, 0, sizeof(this->payload));
> + memset(this->outputs, 0, sizeof(this->outputs));
> + memset(this->output_components, 0, sizeof(this->output_components));
> + this->source_depth_to_render_target = false;
> + this->runtime_check_aads_emit = false;
> + this->first_non_payload_grf = 0;
> + this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
> +
> + this->current_annotation = NULL;
> + this->base_ir = NULL;
> +
> + this->virtual_grf_start = NULL;
> + this->virtual_grf_end = NULL;
> + this->live_intervals = NULL;
> + this->regs_live_at_ip = NULL;
> +
> + this->uniforms = 0;
> + this->last_scratch = 0;
> + this->pull_constant_loc = NULL;
> + this->push_constant_loc = NULL;
> +
> + this->spilled_any_registers = false;
> + this->do_dual_src = false;
> +
> + if (dispatch_width == 8)
> + this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
> +}
> +
> +fs_god::~fs_god()
> +{
> + hash_table_dtor(this->variable_ht);
> +}
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
> index 502161d..dca6f56 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
> @@ -273,7 +273,7 @@ fs_live_variables::compute_start_end()
> }
> }
>
> -fs_live_variables::fs_live_variables(fs_visitor *v, const cfg_t *cfg)
> +fs_live_variables::fs_live_variables(fs_god *v, const cfg_t *cfg)
> : v(v), cfg(cfg)
> {
> mem_ctx = ralloc_context(NULL);
> @@ -326,7 +326,7 @@ fs_live_variables::~fs_live_variables()
> }
>
> void
> -fs_visitor::invalidate_live_intervals()
> +fs_god::invalidate_live_intervals()
> {
> ralloc_free(live_intervals);
> live_intervals = NULL;
> @@ -339,7 +339,7 @@ fs_visitor::invalidate_live_intervals()
> * information about whole VGRFs.
> */
> void
> -fs_visitor::calculate_live_intervals()
> +fs_god::calculate_live_intervals()
> {
> if (this->live_intervals)
> return;
> @@ -375,7 +375,7 @@ fs_live_variables::vars_interfere(int a, int b)
> }
>
> bool
> -fs_visitor::virtual_grf_interferes(int a, int b)
> +fs_god::virtual_grf_interferes(int a, int b)
> {
> return !(virtual_grf_end[a] <= virtual_grf_start[b] ||
> virtual_grf_end[b] <= virtual_grf_start[a]);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.h b/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
> index c745706..27512de 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
> @@ -62,7 +62,7 @@ class fs_live_variables {
> public:
> DECLARE_RALLOC_CXX_OPERATORS(fs_live_variables)
>
> - fs_live_variables(fs_visitor *v, const cfg_t *cfg);
> + fs_live_variables(fs_god *v, const cfg_t *cfg);
> ~fs_live_variables();
>
> bool vars_interfere(int a, int b);
> @@ -106,7 +106,7 @@ protected:
> void compute_live_variables();
> void compute_start_end();
>
> - fs_visitor *v;
> + fs_god *v;
> const cfg_t *cfg;
> void *mem_ctx;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index 21e52fe..a720f55 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -82,7 +82,7 @@ count_nir_instrs(nir_shader *nir)
> }
>
> void
> -fs_visitor::emit_nir_code()
> +fs_god::emit_nir_code()
> {
> const nir_shader_compiler_options *options =
> ctx->Const.ShaderCompilerOptions[stage].NirOptions;
> @@ -226,7 +226,7 @@ fs_visitor::emit_nir_code()
> }
>
> void
> -fs_visitor::nir_setup_inputs(nir_shader *shader)
> +fs_god::nir_setup_inputs(nir_shader *shader)
> {
> foreach_list_typed(nir_variable, var, node, &shader->inputs) {
> enum brw_reg_type type = brw_type_for_base_type(var->type);
> @@ -257,7 +257,7 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
> }
> case MESA_SHADER_GEOMETRY:
> case MESA_SHADER_COMPUTE:
> - unreachable("fs_visitor not used for these stages yet.");
> + unreachable("fs_god not used for these stages yet.");
> break;
> case MESA_SHADER_FRAGMENT:
> if (var->data.location == VARYING_SLOT_POS) {
> @@ -276,7 +276,7 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
> }
>
> void
> -fs_visitor::nir_setup_outputs(nir_shader *shader)
> +fs_god::nir_setup_outputs(nir_shader *shader)
> {
> brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
>
> @@ -324,7 +324,7 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
> }
>
> void
> -fs_visitor::nir_setup_uniforms(nir_shader *shader)
> +fs_god::nir_setup_uniforms(nir_shader *shader)
> {
> uniforms = shader->num_uniforms;
>
> @@ -361,7 +361,7 @@ fs_visitor::nir_setup_uniforms(nir_shader *shader)
> }
>
> void
> -fs_visitor::nir_setup_uniform(nir_variable *var)
> +fs_god::nir_setup_uniform(nir_variable *var)
> {
> int namelen = strlen(var->name);
>
> @@ -397,7 +397,7 @@ fs_visitor::nir_setup_uniform(nir_variable *var)
> }
>
> void
> -fs_visitor::nir_setup_builtin_uniform(nir_variable *var)
> +fs_god::nir_setup_builtin_uniform(nir_variable *var)
> {
> const nir_state_slot *const slots = var->state_slots;
> assert(var->state_slots != NULL);
> @@ -430,7 +430,7 @@ fs_visitor::nir_setup_builtin_uniform(nir_variable *var)
> static bool
> emit_system_values_block(nir_block *block, void *void_visitor)
> {
> - fs_visitor *v = (fs_visitor *)void_visitor;
> + fs_god *v = (fs_god *)void_visitor;
> fs_reg *reg;
>
> nir_foreach_instr(block, instr) {
> @@ -495,7 +495,7 @@ emit_system_values_block(nir_block *block, void *void_visitor)
> }
>
> void
> -fs_visitor::nir_emit_system_values(nir_shader *shader)
> +fs_god::nir_emit_system_values(nir_shader *shader)
> {
> nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
> nir_foreach_overload(shader, overload) {
> @@ -506,7 +506,7 @@ fs_visitor::nir_emit_system_values(nir_shader *shader)
> }
>
> void
> -fs_visitor::nir_emit_impl(nir_function_impl *impl)
> +fs_god::nir_emit_impl(nir_function_impl *impl)
> {
> nir_locals = reralloc(mem_ctx, nir_locals, fs_reg, impl->reg_alloc);
> foreach_list_typed(nir_register, reg, node, &impl->registers) {
> @@ -520,7 +520,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
> }
>
> void
> -fs_visitor::nir_emit_cf_list(exec_list *list)
> +fs_god::nir_emit_cf_list(exec_list *list)
> {
> exec_list_validate(list);
> foreach_list_typed(nir_cf_node, node, node, list) {
> @@ -544,7 +544,7 @@ fs_visitor::nir_emit_cf_list(exec_list *list)
> }
>
> void
> -fs_visitor::nir_emit_if(nir_if *if_stmt)
> +fs_god::nir_emit_if(nir_if *if_stmt)
> {
> /* first, put the condition into f0 */
> fs_inst *inst = emit(MOV(reg_null_d,
> @@ -569,7 +569,7 @@ fs_visitor::nir_emit_if(nir_if *if_stmt)
> }
>
> void
> -fs_visitor::nir_emit_loop(nir_loop *loop)
> +fs_god::nir_emit_loop(nir_loop *loop)
> {
> if (brw->gen < 6) {
> no16("Can't support (non-uniform) control flow on SIMD16\n");
> @@ -583,7 +583,7 @@ fs_visitor::nir_emit_loop(nir_loop *loop)
> }
>
> void
> -fs_visitor::nir_emit_block(nir_block *block)
> +fs_god::nir_emit_block(nir_block *block)
> {
> nir_foreach_instr(block, instr) {
> nir_emit_instr(instr);
> @@ -591,7 +591,7 @@ fs_visitor::nir_emit_block(nir_block *block)
> }
>
> void
> -fs_visitor::nir_emit_instr(nir_instr *instr)
> +fs_god::nir_emit_instr(nir_instr *instr)
> {
> switch (instr->type) {
> case nir_instr_type_alu:
> @@ -640,7 +640,7 @@ brw_type_for_nir_type(nir_alu_type type)
> }
>
> bool
> -fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
> +fs_god::optimize_frontfacing_ternary(nir_alu_instr *instr,
> const fs_reg &result)
> {
> if (instr->src[0].src.is_ssa ||
> @@ -724,7 +724,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
> }
>
> void
> -fs_visitor::nir_emit_alu(nir_alu_instr *instr)
> +fs_god::nir_emit_alu(nir_alu_instr *instr)
> {
> struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
> fs_inst *inst;
> @@ -1311,7 +1311,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
> }
>
> fs_reg
> -fs_visitor::get_nir_src(nir_src src)
> +fs_god::get_nir_src(nir_src src)
> {
> if (src.is_ssa) {
> assert(src.ssa->parent_instr->type == nir_instr_type_load_const);
> @@ -1346,7 +1346,7 @@ fs_visitor::get_nir_src(nir_src src)
> }
>
> fs_reg
> -fs_visitor::get_nir_dest(nir_dest dest)
> +fs_god::get_nir_dest(nir_dest dest)
> {
> fs_reg reg;
> if (dest.reg.reg->is_global)
> @@ -1365,7 +1365,7 @@ fs_visitor::get_nir_dest(nir_dest dest)
> }
>
> void
> -fs_visitor::emit_percomp(fs_inst *inst, unsigned wr_mask)
> +fs_god::emit_percomp(fs_inst *inst, unsigned wr_mask)
> {
> for (unsigned i = 0; i < 4; i++) {
> if (!((wr_mask >> i) & 1))
> @@ -1382,7 +1382,7 @@ fs_visitor::emit_percomp(fs_inst *inst, unsigned wr_mask)
> }
>
> void
> -fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
> +fs_god::nir_emit_intrinsic(nir_intrinsic_instr *instr)
> {
> fs_reg dest;
> if (nir_intrinsic_infos[instr->intrinsic].has_dest)
> @@ -1750,7 +1750,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
> }
>
> void
> -fs_visitor::nir_emit_texture(nir_tex_instr *instr)
> +fs_god::nir_emit_texture(nir_tex_instr *instr)
> {
> unsigned sampler = instr->sampler_index;
> fs_reg sampler_reg(sampler);
> @@ -1920,7 +1920,7 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
> }
>
> void
> -fs_visitor::nir_emit_jump(nir_jump_instr *instr)
> +fs_god::nir_emit_jump(nir_jump_instr *instr)
> {
> switch (instr->type) {
> case nir_jump_break:
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
> index 047c2c0..adadb51 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
> @@ -53,7 +53,7 @@
> */
>
> bool
> -fs_visitor::opt_peephole_predicated_break()
> +fs_god::opt_peephole_predicated_break()
> {
> bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> index 72c490b..578951c 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> @@ -41,7 +41,7 @@ assign_reg(unsigned *reg_hw_locations, fs_reg *reg)
> }
>
> void
> -fs_visitor::assign_regs_trivial()
> +fs_god::assign_regs_trivial()
> {
> unsigned hw_reg_mapping[this->alloc.count + 1];
> unsigned i;
> @@ -332,7 +332,7 @@ count_to_loop_end(const bblock_t *block)
> * (note that in SIMD16, a node is two registers).
> */
> void
> -fs_visitor::setup_payload_interference(struct ra_graph *g,
> +fs_god::setup_payload_interference(struct ra_graph *g,
> int payload_node_count,
> int first_payload_node)
> {
> @@ -466,7 +466,7 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
> * contents.
> */
> void
> -fs_visitor::get_used_mrfs(bool *mrf_used)
> +fs_god::get_used_mrfs(bool *mrf_used)
> {
> int reg_width = dispatch_width / 8;
>
> @@ -498,7 +498,7 @@ fs_visitor::get_used_mrfs(bool *mrf_used)
> * messages (treated as MRFs in code generation).
> */
> void
> -fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
> +fs_god::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
> {
> bool mrf_used[BRW_MAX_MRF];
> get_used_mrfs(mrf_used);
> @@ -523,7 +523,7 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
> }
>
> bool
> -fs_visitor::assign_regs(bool allow_spilling)
> +fs_god::assign_regs(bool allow_spilling)
> {
> struct intel_screen *screen = brw->intelScreen;
> /* Most of this allocation was written for a reg_width of 1
> @@ -684,7 +684,7 @@ fs_visitor::assign_regs(bool allow_spilling)
> }
>
> void
> -fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
> +fs_god::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
> uint32_t spill_offset, int count)
> {
> int reg_size = 1;
> @@ -719,7 +719,7 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
> }
>
> void
> -fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
> +fs_god::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
> uint32_t spill_offset, int count)
> {
> int reg_size = 1;
> @@ -744,7 +744,7 @@ fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
> }
>
> int
> -fs_visitor::choose_spill_reg(struct ra_graph *g)
> +fs_god::choose_spill_reg(struct ra_graph *g)
> {
> float loop_scale = 1.0;
> float spill_costs[this->alloc.count];
> @@ -820,7 +820,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
> }
>
> void
> -fs_visitor::spill_reg(int spill_reg)
> +fs_god::spill_reg(int spill_reg)
> {
> int size = alloc.sizes[spill_reg];
> unsigned int spill_offset = last_scratch;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
> index 09f0fad..44cbf76 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
> @@ -64,7 +64,7 @@ is_nop_mov(const fs_inst *inst)
> }
>
> static bool
> -is_copy_payload(const fs_visitor *v, const fs_inst *inst)
> +is_copy_payload(const fs_god *v, const fs_inst *inst)
> {
> if (v->alloc.sizes[inst->src[0].reg] != inst->regs_written)
> return false;
> @@ -79,7 +79,7 @@ is_copy_payload(const fs_visitor *v, const fs_inst *inst)
> }
>
> static bool
> -is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
> +is_coalesce_candidate(const fs_god *v, const fs_inst *inst)
> {
> if ((inst->opcode != BRW_OPCODE_MOV &&
> inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) ||
> @@ -152,7 +152,7 @@ can_coalesce_vars(brw::fs_live_variables *live_intervals,
> }
>
> bool
> -fs_visitor::register_coalesce()
> +fs_god::register_coalesce()
> {
> bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
> index e406c28..0bda2d3 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
> @@ -43,7 +43,7 @@
> */
>
> static bool
> -opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
> +opt_saturate_propagation_local(fs_god *v, bblock_t *block)
> {
> bool progress = false;
> int ip = block->end_ip + 1;
> @@ -103,7 +103,7 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
> }
>
> bool
> -fs_visitor::opt_saturate_propagation()
> +fs_god::opt_saturate_propagation()
> {
> bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
> index 740ba67..0a3e32d 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
> @@ -120,7 +120,7 @@ count_movs_from_if(fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS],
> * If src0 is an immediate value, we promote it to a temporary GRF.
> */
> bool
> -fs_visitor::opt_peephole_sel()
> +fs_god::opt_peephole_sel()
> {
> bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> deleted file mode 100644
> index e6fb0cb..0000000
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ /dev/null
> @@ -1,4157 +0,0 @@
> -/*
> - * Copyright © 2010 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> - * IN THE SOFTWARE.
> - */
> -
> -/** @file brw_fs_visitor.cpp
> - *
> - * This file supports generating the FS LIR from the GLSL IR. The LIR
> - * makes it easier to do backend-specific optimizations than doing so
> - * in the GLSL IR or in the native code.
> - */
> -#include <sys/types.h>
> -
> -#include "main/macros.h"
> -#include "main/shaderobj.h"
> -#include "program/prog_parameter.h"
> -#include "program/prog_print.h"
> -#include "program/prog_optimize.h"
> -#include "util/register_allocate.h"
> -#include "program/hash_table.h"
> -#include "brw_context.h"
> -#include "brw_eu.h"
> -#include "brw_wm.h"
> -#include "brw_vec4.h"
> -#include "brw_fs.h"
> -#include "main/uniforms.h"
> -#include "glsl/glsl_types.h"
> -#include "glsl/ir_optimization.h"
> -#include "program/sampler.h"
> -
> -
> -fs_reg *
> -fs_visitor::emit_vs_system_value(int location)
> -{
> - fs_reg *reg = new(this->mem_ctx)
> - fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
> - brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
> -
> - switch (location) {
> - case SYSTEM_VALUE_BASE_VERTEX:
> - reg->reg_offset = 0;
> - vs_prog_data->uses_vertexid = true;
> - break;
> - case SYSTEM_VALUE_VERTEX_ID:
> - case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> - reg->reg_offset = 2;
> - vs_prog_data->uses_vertexid = true;
> - break;
> - case SYSTEM_VALUE_INSTANCE_ID:
> - reg->reg_offset = 3;
> - vs_prog_data->uses_instanceid = true;
> - break;
> - default:
> - unreachable("not reached");
> - }
> -
> - return reg;
> -}
> -
> -void
> -fs_visitor::visit(ir_variable *ir)
> -{
> - fs_reg *reg = NULL;
> -
> - if (variable_storage(ir))
> - return;
> -
> - if (ir->data.mode == ir_var_shader_in) {
> - assert(ir->data.location != -1);
> - if (stage == MESA_SHADER_VERTEX) {
> - reg = new(this->mem_ctx)
> - fs_reg(ATTR, ir->data.location,
> - brw_type_for_base_type(ir->type->get_scalar_type()));
> - } else if (ir->data.location == VARYING_SLOT_POS) {
> - reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
> - ir->data.origin_upper_left);
> - } else if (ir->data.location == VARYING_SLOT_FACE) {
> - reg = emit_frontfacing_interpolation();
> - } else {
> - reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> - emit_general_interpolation(*reg, ir->name, ir->type,
> - (glsl_interp_qualifier) ir->data.interpolation,
> - ir->data.location, ir->data.centroid,
> - ir->data.sample);
> - }
> - assert(reg);
> - hash_table_insert(this->variable_ht, reg, ir);
> - return;
> - } else if (ir->data.mode == ir_var_shader_out) {
> - reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> -
> - if (stage == MESA_SHADER_VERTEX) {
> - int vector_elements =
> - ir->type->is_array() ? ir->type->fields.array->vector_elements
> - : ir->type->vector_elements;
> -
> - for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
> - int output = ir->data.location + i;
> - this->outputs[output] = *reg;
> - this->outputs[output].reg_offset = i * 4;
> - this->output_components[output] = vector_elements;
> - }
> -
> - } else if (ir->data.index > 0) {
> - assert(ir->data.location == FRAG_RESULT_DATA0);
> - assert(ir->data.index == 1);
> - this->dual_src_output = *reg;
> - this->do_dual_src = true;
> - } else if (ir->data.location == FRAG_RESULT_COLOR) {
> - /* Writing gl_FragColor outputs to all color regions. */
> - assert(stage == MESA_SHADER_FRAGMENT);
> - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> - for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
> - this->outputs[i] = *reg;
> - this->output_components[i] = 4;
> - }
> - } else if (ir->data.location == FRAG_RESULT_DEPTH) {
> - this->frag_depth = *reg;
> - } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
> - this->sample_mask = *reg;
> - } else {
> - /* gl_FragData or a user-defined FS output */
> - assert(ir->data.location >= FRAG_RESULT_DATA0 &&
> - ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
> -
> - int vector_elements =
> - ir->type->is_array() ? ir->type->fields.array->vector_elements
> - : ir->type->vector_elements;
> -
> - /* General color output. */
> - for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
> - int output = ir->data.location - FRAG_RESULT_DATA0 + i;
> - this->outputs[output] = offset(*reg, vector_elements * i);
> - this->output_components[output] = vector_elements;
> - }
> - }
> - } else if (ir->data.mode == ir_var_uniform) {
> - int param_index = uniforms;
> -
> - /* Thanks to the lower_ubo_reference pass, we will see only
> - * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
> - * variables, so no need for them to be in variable_ht.
> - *
> - * Some uniforms, such as samplers and atomic counters, have no actual
> - * storage, so we should ignore them.
> - */
> - if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
> - return;
> -
> - if (dispatch_width == 16) {
> - if (!variable_storage(ir)) {
> - fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
> - }
> - return;
> - }
> -
> - param_size[param_index] = type_size(ir->type);
> - if (!strncmp(ir->name, "gl_", 3)) {
> - setup_builtin_uniform_values(ir);
> - } else {
> - setup_uniform_values(ir);
> - }
> -
> - reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
> - reg->type = brw_type_for_base_type(ir->type);
> -
> - } else if (ir->data.mode == ir_var_system_value) {
> - switch (ir->data.location) {
> - case SYSTEM_VALUE_BASE_VERTEX:
> - case SYSTEM_VALUE_VERTEX_ID:
> - case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> - case SYSTEM_VALUE_INSTANCE_ID:
> - reg = emit_vs_system_value(ir->data.location);
> - break;
> - case SYSTEM_VALUE_SAMPLE_POS:
> - reg = emit_samplepos_setup();
> - break;
> - case SYSTEM_VALUE_SAMPLE_ID:
> - reg = emit_sampleid_setup();
> - break;
> - case SYSTEM_VALUE_SAMPLE_MASK_IN:
> - assert(brw->gen >= 7);
> - reg = new(mem_ctx)
> - fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
> - BRW_REGISTER_TYPE_D));
> - break;
> - }
> - }
> -
> - if (!reg)
> - reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> -
> - hash_table_insert(this->variable_ht, reg, ir);
> -}
> -
> -void
> -fs_visitor::visit(ir_dereference_variable *ir)
> -{
> - fs_reg *reg = variable_storage(ir->var);
> -
> - if (!reg) {
> - fail("Failed to find variable storage for %s\n", ir->var->name);
> - this->result = fs_reg(reg_null_d);
> - return;
> - }
> - this->result = *reg;
> -}
> -
> -void
> -fs_visitor::visit(ir_dereference_record *ir)
> -{
> - const glsl_type *struct_type = ir->record->type;
> -
> - ir->record->accept(this);
> -
> - unsigned int off = 0;
> - for (unsigned int i = 0; i < struct_type->length; i++) {
> - if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
> - break;
> - off += type_size(struct_type->fields.structure[i].type);
> - }
> - this->result = offset(this->result, off);
> - this->result.type = brw_type_for_base_type(ir->type);
> -}
> -
> -void
> -fs_visitor::visit(ir_dereference_array *ir)
> -{
> - ir_constant *constant_index;
> - fs_reg src;
> - int element_size = type_size(ir->type);
> -
> - constant_index = ir->array_index->as_constant();
> -
> - ir->array->accept(this);
> - src = this->result;
> - src.type = brw_type_for_base_type(ir->type);
> -
> - if (constant_index) {
> - if (src.file == ATTR) {
> - /* Attribute arrays get loaded as one vec4 per element. In that case
> - * offset the source register.
> - */
> - src.reg += constant_index->value.i[0];
> - } else {
> - assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
> - src = offset(src, constant_index->value.i[0] * element_size);
> - }
> - } else {
> - /* Variable index array dereference. We attach the variable index
> - * component to the reg as a pointer to a register containing the
> - * offset. Currently only uniform arrays are supported in this patch,
> - * and that reladdr pointer is resolved by
> - * move_uniform_array_access_to_pull_constants(). All other array types
> - * are lowered by lower_variable_index_to_cond_assign().
> - */
> - ir->array_index->accept(this);
> -
> - fs_reg index_reg;
> - index_reg = vgrf(glsl_type::int_type);
> - emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
> -
> - if (src.reladdr) {
> - emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
> - }
> -
> - src.reladdr = ralloc(mem_ctx, fs_reg);
> - memcpy(src.reladdr, &index_reg, sizeof(index_reg));
> - }
> - this->result = src;
> -}
> -
> -fs_inst *
> -fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
> - const fs_reg &a)
> -{
> - if (brw->gen < 6) {
> - /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
> - fs_reg y_times_a = vgrf(glsl_type::float_type);
> - fs_reg one_minus_a = vgrf(glsl_type::float_type);
> - fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
> -
> - emit(MUL(y_times_a, y, a));
> -
> - fs_reg negative_a = a;
> - negative_a.negate = !a.negate;
> - emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
> - emit(MUL(x_times_one_minus_a, x, one_minus_a));
> -
> - return emit(ADD(dst, x_times_one_minus_a, y_times_a));
> - } else {
> - /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
> - * we need to reorder the operands.
> - */
> - return emit(LRP(dst, a, y, x));
> - }
> -}
> -
> -void
> -fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
> - const fs_reg &src0, const fs_reg &src1)
> -{
> - assert(conditionalmod == BRW_CONDITIONAL_GE ||
> - conditionalmod == BRW_CONDITIONAL_L);
> -
> - fs_inst *inst;
> -
> - if (brw->gen >= 6) {
> - inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> - inst->conditional_mod = conditionalmod;
> - } else {
> - emit(CMP(reg_null_d, src0, src1, conditionalmod));
> -
> - inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - }
> -}
> -
> -bool
> -fs_visitor::try_emit_saturate(ir_expression *ir)
> -{
> - if (ir->operation != ir_unop_saturate)
> - return false;
> -
> - ir_rvalue *sat_val = ir->operands[0];
> -
> - fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
> -
> - sat_val->accept(this);
> - fs_reg src = this->result;
> -
> - fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
> -
> - /* If the last instruction from our accept() generated our
> - * src, just set the saturate flag instead of emitting a separate mov.
> - */
> - fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
> - if (modify && modify->regs_written == modify->dst.width / 8 &&
> - modify->can_do_saturate()) {
> - modify->saturate = true;
> - this->result = src;
> - return true;
> - }
> -
> - return false;
> -}
> -
> -bool
> -fs_visitor::try_emit_line(ir_expression *ir)
> -{
> - /* LINE's src0 must be of type float. */
> - if (ir->type != glsl_type::float_type)
> - return false;
> -
> - ir_rvalue *nonmul = ir->operands[1];
> - ir_expression *mul = ir->operands[0]->as_expression();
> -
> - if (!mul || mul->operation != ir_binop_mul) {
> - nonmul = ir->operands[0];
> - mul = ir->operands[1]->as_expression();
> -
> - if (!mul || mul->operation != ir_binop_mul)
> - return false;
> - }
> -
> - ir_constant *const_add = nonmul->as_constant();
> - if (!const_add)
> - return false;
> -
> - int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
> - if (add_operand_vf == -1)
> - return false;
> -
> - ir_rvalue *non_const_mul = mul->operands[1];
> - ir_constant *const_mul = mul->operands[0]->as_constant();
> - if (!const_mul) {
> - const_mul = mul->operands[1]->as_constant();
> -
> - if (!const_mul)
> - return false;
> -
> - non_const_mul = mul->operands[0];
> - }
> -
> - int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
> - if (mul_operand_vf == -1)
> - return false;
> -
> - non_const_mul->accept(this);
> - fs_reg src1 = this->result;
> -
> - fs_reg src0 = vgrf(ir->type);
> - emit(BRW_OPCODE_MOV, src0,
> - fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));
> -
> - this->result = vgrf(ir->type);
> - emit(BRW_OPCODE_LINE, this->result, src0, src1);
> - return true;
> -}
> -
> -bool
> -fs_visitor::try_emit_mad(ir_expression *ir)
> -{
> - /* 3-src instructions were introduced in gen6. */
> - if (brw->gen < 6)
> - return false;
> -
> - /* MAD can only handle floating-point data. */
> - if (ir->type != glsl_type::float_type)
> - return false;
> -
> - ir_rvalue *nonmul;
> - ir_expression *mul;
> - bool mul_negate, mul_abs;
> -
> - for (int i = 0; i < 2; i++) {
> - mul_negate = false;
> - mul_abs = false;
> -
> - mul = ir->operands[i]->as_expression();
> - nonmul = ir->operands[1 - i];
> -
> - if (mul && mul->operation == ir_unop_abs) {
> - mul = mul->operands[0]->as_expression();
> - mul_abs = true;
> - } else if (mul && mul->operation == ir_unop_neg) {
> - mul = mul->operands[0]->as_expression();
> - mul_negate = true;
> - }
> -
> - if (mul && mul->operation == ir_binop_mul)
> - break;
> - }
> -
> - if (!mul || mul->operation != ir_binop_mul)
> - return false;
> -
> - nonmul->accept(this);
> - fs_reg src0 = this->result;
> -
> - mul->operands[0]->accept(this);
> - fs_reg src1 = this->result;
> - src1.negate ^= mul_negate;
> - src1.abs = mul_abs;
> - if (mul_abs)
> - src1.negate = false;
> -
> - mul->operands[1]->accept(this);
> - fs_reg src2 = this->result;
> - src2.abs = mul_abs;
> - if (mul_abs)
> - src2.negate = false;
> -
> - this->result = vgrf(ir->type);
> - emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
> -
> - return true;
> -}
> -
> -bool
> -fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
> -{
> - /* On platforms that do not natively generate 0u and ~0u for Boolean
> - * results, b2f expressions that look like
> - *
> - * f = b2f(expr cmp 0)
> - *
> - * will generate better code by pretending the expression is
> - *
> - * f = ir_triop_csel(0.0, 1.0, expr cmp 0)
> - *
> - * This is because the last instruction of "expr" can generate the
> - * condition code for the "cmp 0". This avoids having to do the "-(b & 1)"
> - * trick to generate 0u or ~0u for the Boolean result. This means code like
> - *
> - * mov(16) g16<1>F 1F
> - * mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F
> - * (+f0) sel(16) m6<1>F g16<8,8,1>F 0F
> - *
> - * will be generated instead of
> - *
> - * mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F
> - * cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F
> - * and(16) g4<1>D g2<8,8,1>D 1D
> - * and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD
> - *
> - * When the comparison is either == 0.0 or != 0.0 using the knowledge that
> - * the true (or false) case already results in zero would allow better code
> - * generation by possibly avoiding a load-immediate instruction.
> - */
> - ir_expression *cmp = ir->operands[0]->as_expression();
> - if (cmp == NULL)
> - return false;
> -
> - if (cmp->operation == ir_binop_equal || cmp->operation == ir_binop_nequal) {
> - for (unsigned i = 0; i < 2; i++) {
> - ir_constant *c = cmp->operands[i]->as_constant();
> - if (c == NULL || !c->is_zero())
> - continue;
> -
> - ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
> - if (expr != NULL) {
> - fs_reg op[2];
> -
> - for (unsigned j = 0; j < 2; j++) {
> - cmp->operands[j]->accept(this);
> - op[j] = this->result;
> -
> - resolve_ud_negate(&op[j]);
> - }
> -
> - emit_bool_to_cond_code_of_reg(cmp, op);
> -
> - /* In this case we know when the condition is true, op[i ^ 1]
> - * contains zero. Invert the predicate, use op[i ^ 1] as src0,
> - * and immediate 1.0f as src1.
> - */
> - this->result = vgrf(ir->type);
> - op[i ^ 1].type = BRW_REGISTER_TYPE_F;
> -
> - fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - inst->predicate_inverse = cmp->operation == ir_binop_equal;
> - return true;
> - }
> - }
> - }
> -
> - emit_bool_to_cond_code(cmp);
> -
> - fs_reg temp = vgrf(ir->type);
> - emit(MOV(temp, fs_reg(1.0f)));
> -
> - this->result = vgrf(ir->type);
> - fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> -
> - return true;
> -}
> -
> -static int
> -pack_pixel_offset(float x)
> -{
> - /* Clamp upper end of the range to +7/16. See explanation in non-constant
> - * offset case below. */
> - int n = MIN2((int)(x * 16), 7);
> - return n & 0xf;
> -}
> -
> -void
> -fs_visitor::emit_interpolate_expression(ir_expression *ir)
> -{
> - /* in SIMD16 mode, the pixel interpolator returns coords interleaved
> - * 8 channels at a time, same as the barycentric coords presented in
> - * the FS payload. this requires a bit of extra work to support.
> - */
> - no16("interpolate_at_* not yet supported in SIMD16 mode.");
> -
> - assert(stage == MESA_SHADER_FRAGMENT);
> - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> -
> - ir_dereference * deref = ir->operands[0]->as_dereference();
> - ir_swizzle * swiz = NULL;
> - if (!deref) {
> - /* the api does not allow a swizzle here, but the varying packing code
> - * may have pushed one into here.
> - */
> - swiz = ir->operands[0]->as_swizzle();
> - assert(swiz);
> - deref = swiz->val->as_dereference();
> - }
> - assert(deref);
> - ir_variable * var = deref->variable_referenced();
> - assert(var);
> -
> - /* 1. collect interpolation factors */
> -
> - fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
> - fs_reg dst_y = offset(dst_x, 1);
> -
> - /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
> - * even when there is no payload. in the per-slot offset case, we'll replace this with
> - * the proper source data. */
> - fs_reg src = vgrf(glsl_type::float_type);
> - int mlen = 1; /* one reg unless overridden */
> - int reg_width = dispatch_width / 8;
> - fs_inst *inst;
> -
> - switch (ir->operation) {
> - case ir_unop_interpolate_at_centroid:
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
> - break;
> -
> - case ir_binop_interpolate_at_sample: {
> - ir_constant *sample_num = ir->operands[1]->as_constant();
> - assert(sample_num || !"nonconstant sample number should have been lowered.");
> -
> - unsigned msg_data = sample_num->value.i[0] << 4;
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
> - break;
> - }
> -
> - case ir_binop_interpolate_at_offset: {
> - ir_constant *const_offset = ir->operands[1]->as_constant();
> - if (const_offset) {
> - unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
> - (pack_pixel_offset(const_offset->value.f[1]) << 4);
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
> - fs_reg(msg_data));
> - } else {
> - /* pack the operands: hw wants offsets as 4 bit signed ints */
> - ir->operands[1]->accept(this);
> - src = vgrf(glsl_type::ivec2_type);
> - fs_reg src2 = src;
> - for (int i = 0; i < 2; i++) {
> - fs_reg temp = vgrf(glsl_type::float_type);
> - emit(MUL(temp, this->result, fs_reg(16.0f)));
> - emit(MOV(src2, temp)); /* float to int */
> -
> - /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
> - * that we support a maximum offset of +0.5, which isn't representable
> - * in a S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
> - * which is the opposite of what the shader author wanted.
> - *
> - * This is legal due to ARB_gpu_shader5's quantization rules:
> - *
> - * "Not all values of <offset> may be supported; x and y offsets may
> - * be rounded to fixed-point values with the number of fraction bits
> - * given by the implementation-dependent constant
> - * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> - */
> -
> - fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
> - inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
> -
> - src2 = offset(src2, 1);
> - this->result = offset(this->result, 1);
> - }
> -
> - mlen = 2 * reg_width;
> - inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
> - fs_reg(0u));
> - }
> - break;
> - }
> -
> - default:
> - unreachable("not reached");
> - }
> -
> - inst->mlen = mlen;
> - inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
> - inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
> - INTERP_QUALIFIER_NOPERSPECTIVE;
> -
> - /* 2. emit linterp */
> -
> - fs_reg res = vgrf(ir->type);
> - this->result = res;
> -
> - for (int i = 0; i < ir->type->vector_elements; i++) {
> - int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
> - emit(FS_OPCODE_LINTERP, res,
> - dst_x, dst_y,
> - fs_reg(interp_reg(var->data.location, ch)));
> - res = offset(res, 1);
> - }
> -}
> -
> -void
> -fs_visitor::visit(ir_expression *ir)
> -{
> - unsigned int operand;
> - fs_reg op[3], temp;
> - fs_inst *inst;
> - struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
> -
> - assert(ir->get_num_operands() <= 3);
> -
> - if (try_emit_saturate(ir))
> - return;
> -
> - /* Deal with the real oddball stuff first */
> - switch (ir->operation) {
> - case ir_binop_add:
> - if (brw->gen <= 5 && try_emit_line(ir))
> - return;
> - if (try_emit_mad(ir))
> - return;
> - break;
> -
> - case ir_triop_csel:
> - ir->operands[1]->accept(this);
> - op[1] = this->result;
> - ir->operands[2]->accept(this);
> - op[2] = this->result;
> -
> - emit_bool_to_cond_code(ir->operands[0]);
> -
> - this->result = vgrf(ir->type);
> - inst = emit(SEL(this->result, op[1], op[2]));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - return;
> -
> - case ir_unop_b2f:
> - if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir))
> - return;
> - break;
> -
> - case ir_unop_interpolate_at_centroid:
> - case ir_binop_interpolate_at_offset:
> - case ir_binop_interpolate_at_sample:
> - emit_interpolate_expression(ir);
> - return;
> -
> - default:
> - break;
> - }
> -
> - for (operand = 0; operand < ir->get_num_operands(); operand++) {
> - ir->operands[operand]->accept(this);
> - if (this->result.file == BAD_FILE) {
> - fail("Failed to get tree for expression operand:\n");
> - ir->operands[operand]->fprint(stderr);
> - fprintf(stderr, "\n");
> - }
> - assert(this->result.file == GRF ||
> - this->result.file == UNIFORM || this->result.file == ATTR);
> - op[operand] = this->result;
> -
> - /* Matrix expression operands should have been broken down to vector
> - * operations already.
> - */
> - assert(!ir->operands[operand]->type->is_matrix());
> - /* And then those vector operands should have been broken down to scalar.
> - */
> - assert(!ir->operands[operand]->type->is_vector());
> - }
> -
> - /* Storage for our result. If our result goes into an assignment, it will
> - * just get copy-propagated out, so no worries.
> - */
> - this->result = vgrf(ir->type);
> -
> - switch (ir->operation) {
> - case ir_unop_logic_not:
> - emit(NOT(this->result, op[0]));
> - break;
> - case ir_unop_neg:
> - op[0].negate = !op[0].negate;
> - emit(MOV(this->result, op[0]));
> - break;
> - case ir_unop_abs:
> - op[0].abs = true;
> - op[0].negate = false;
> - emit(MOV(this->result, op[0]));
> - break;
> - case ir_unop_sign:
> - if (ir->type->is_float()) {
> - /* AND(val, 0x80000000) gives the sign bit.
> - *
> - * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
> - * zero.
> - */
> - emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> -
> - op[0].type = BRW_REGISTER_TYPE_UD;
> - this->result.type = BRW_REGISTER_TYPE_UD;
> - emit(AND(this->result, op[0], fs_reg(0x80000000u)));
> -
> - inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> -
> - this->result.type = BRW_REGISTER_TYPE_F;
> - } else {
> - /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
> - * -> non-negative val generates 0x00000000.
> - * Predicated OR sets 1 if val is positive.
> - */
> - emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
> -
> - emit(ASR(this->result, op[0], fs_reg(31)));
> -
> - inst = emit(OR(this->result, this->result, fs_reg(1)));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - }
> - break;
> - case ir_unop_rcp:
> - emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
> - break;
> -
> - case ir_unop_exp2:
> - emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
> - break;
> - case ir_unop_log2:
> - emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
> - break;
> - case ir_unop_exp:
> - case ir_unop_log:
> - unreachable("not reached: should be handled by ir_explog_to_explog2");
> - case ir_unop_sin:
> - case ir_unop_sin_reduced:
> - emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
> - break;
> - case ir_unop_cos:
> - case ir_unop_cos_reduced:
> - emit_math(SHADER_OPCODE_COS, this->result, op[0]);
> - break;
> -
> - case ir_unop_dFdx:
> - /* Select one of the two opcodes based on the glHint value. */
> - if (fs_key->high_quality_derivatives)
> - emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
> - else
> - emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
> - break;
> -
> - case ir_unop_dFdx_coarse:
> - emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
> - break;
> -
> - case ir_unop_dFdx_fine:
> - emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
> - break;
> -
> - case ir_unop_dFdy:
> - /* Select one of the two opcodes based on the glHint value. */
> - if (fs_key->high_quality_derivatives)
> - emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
> - else
> - emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
> - break;
> -
> - case ir_unop_dFdy_coarse:
> - emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
> - break;
> -
> - case ir_unop_dFdy_fine:
> - emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
> - break;
> -
> - case ir_binop_add:
> - emit(ADD(this->result, op[0], op[1]));
> - break;
> - case ir_binop_sub:
> - unreachable("not reached: should be handled by ir_sub_to_add_neg");
> -
> - case ir_binop_mul:
> - if (brw->gen < 8 && ir->type->is_integer()) {
> - /* For integer multiplication, the MUL uses the low 16 bits
> - * of one of the operands (src0 on gen6, src1 on gen7). The
> - * MACH accumulates in the contribution of the upper 16 bits
> - * of that operand.
> - */
> - if (ir->operands[0]->is_uint16_constant()) {
> - if (brw->gen < 7)
> - emit(MUL(this->result, op[0], op[1]));
> - else
> - emit(MUL(this->result, op[1], op[0]));
> - } else if (ir->operands[1]->is_uint16_constant()) {
> - if (brw->gen < 7)
> - emit(MUL(this->result, op[1], op[0]));
> - else
> - emit(MUL(this->result, op[0], op[1]));
> - } else {
> - if (brw->gen >= 7)
> - no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> - struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> - this->result.type);
> -
> - emit(MUL(acc, op[0], op[1]));
> - emit(MACH(reg_null_d, op[0], op[1]));
> - emit(MOV(this->result, fs_reg(acc)));
> - }
> - } else {
> - emit(MUL(this->result, op[0], op[1]));
> - }
> - break;
> - case ir_binop_imul_high: {
> - if (brw->gen == 7)
> - no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> - struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> - this->result.type);
> -
> - fs_inst *mul = emit(MUL(acc, op[0], op[1]));
> - emit(MACH(this->result, op[0], op[1]));
> -
> - /* Until Gen8, integer multiplies read 32-bits from one source, and
> - * 16-bits from the other, and rely on the MACH instruction to
> - * generate the high bits of the result.
> - *
> - * On Gen8, the multiply instruction does a full 32x32-bit multiply,
> - * but in order to do a 64x64-bit multiply we have to simulate the
> - * previous behavior and then use a MACH instruction.
> - *
> - * FINISHME: Don't use source modifiers on src1.
> - */
> - if (brw->gen >= 8) {
> - assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
> - mul->src[1].type == BRW_REGISTER_TYPE_UD);
> - if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
> - mul->src[1].type = BRW_REGISTER_TYPE_W;
> - } else {
> - mul->src[1].type = BRW_REGISTER_TYPE_UW;
> - }
> - }
> -
> - break;
> - }
> - case ir_binop_div:
> - /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
> - assert(ir->type->is_integer());
> - emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
> - break;
> - case ir_binop_carry: {
> - if (brw->gen == 7)
> - no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> - struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> - BRW_REGISTER_TYPE_UD);
> -
> - emit(ADDC(reg_null_ud, op[0], op[1]));
> - emit(MOV(this->result, fs_reg(acc)));
> - break;
> - }
> - case ir_binop_borrow: {
> - if (brw->gen == 7)
> - no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> - struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> - BRW_REGISTER_TYPE_UD);
> -
> - emit(SUBB(reg_null_ud, op[0], op[1]));
> - emit(MOV(this->result, fs_reg(acc)));
> - break;
> - }
> - case ir_binop_mod:
> - /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
> - assert(ir->type->is_integer());
> - emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
> - break;
> -
> - case ir_binop_less:
> - case ir_binop_greater:
> - case ir_binop_lequal:
> - case ir_binop_gequal:
> - case ir_binop_equal:
> - case ir_binop_all_equal:
> - case ir_binop_nequal:
> - case ir_binop_any_nequal:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(ir->operands[0], &op[0]);
> - resolve_bool_comparison(ir->operands[1], &op[1]);
> - }
> -
> - emit(CMP(this->result, op[0], op[1],
> - brw_conditional_for_comparison(ir->operation)));
> - break;
> -
> - case ir_binop_logic_xor:
> - emit(XOR(this->result, op[0], op[1]));
> - break;
> -
> - case ir_binop_logic_or:
> - emit(OR(this->result, op[0], op[1]));
> - break;
> -
> - case ir_binop_logic_and:
> - emit(AND(this->result, op[0], op[1]));
> - break;
> -
> - case ir_binop_dot:
> - case ir_unop_any:
> - unreachable("not reached: should be handled by brw_fs_channel_expressions");
> -
> - case ir_unop_noise:
> - unreachable("not reached: should be handled by lower_noise");
> -
> - case ir_quadop_vector:
> - unreachable("not reached: should be handled by lower_quadop_vector");
> -
> - case ir_binop_vector_extract:
> - unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
> -
> - case ir_triop_vector_insert:
> - unreachable("not reached: should be handled by lower_vector_insert()");
> -
> - case ir_binop_ldexp:
> - unreachable("not reached: should be handled by ldexp_to_arith()");
> -
> - case ir_unop_sqrt:
> - emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
> - break;
> -
> - case ir_unop_rsq:
> - emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
> - break;
> -
> - case ir_unop_bitcast_i2f:
> - case ir_unop_bitcast_u2f:
> - op[0].type = BRW_REGISTER_TYPE_F;
> - this->result = op[0];
> - break;
> - case ir_unop_i2u:
> - case ir_unop_bitcast_f2u:
> - op[0].type = BRW_REGISTER_TYPE_UD;
> - this->result = op[0];
> - break;
> - case ir_unop_u2i:
> - case ir_unop_bitcast_f2i:
> - op[0].type = BRW_REGISTER_TYPE_D;
> - this->result = op[0];
> - break;
> - case ir_unop_i2f:
> - case ir_unop_u2f:
> - case ir_unop_f2i:
> - case ir_unop_f2u:
> - emit(MOV(this->result, op[0]));
> - break;
> -
> - case ir_unop_b2i:
> - emit(AND(this->result, op[0], fs_reg(1)));
> - break;
> - case ir_unop_b2f:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(ir->operands[0], &op[0]);
> - }
> - op[0].type = BRW_REGISTER_TYPE_D;
> - this->result.type = BRW_REGISTER_TYPE_D;
> - emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
> - this->result.type = BRW_REGISTER_TYPE_F;
> - break;
> -
> - case ir_unop_f2b:
> - emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> - break;
> - case ir_unop_i2b:
> - emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> - break;
> -
> - case ir_unop_trunc:
> - emit(RNDZ(this->result, op[0]));
> - break;
> - case ir_unop_ceil: {
> - fs_reg tmp = vgrf(ir->type);
> - op[0].negate = !op[0].negate;
> - emit(RNDD(tmp, op[0]));
> - tmp.negate = true;
> - emit(MOV(this->result, tmp));
> - }
> - break;
> - case ir_unop_floor:
> - emit(RNDD(this->result, op[0]));
> - break;
> - case ir_unop_fract:
> - emit(FRC(this->result, op[0]));
> - break;
> - case ir_unop_round_even:
> - emit(RNDE(this->result, op[0]));
> - break;
> -
> - case ir_binop_min:
> - case ir_binop_max:
> - resolve_ud_negate(&op[0]);
> - resolve_ud_negate(&op[1]);
> - emit_minmax(ir->operation == ir_binop_min ?
> - BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
> - this->result, op[0], op[1]);
> - break;
> - case ir_unop_pack_snorm_2x16:
> - case ir_unop_pack_snorm_4x8:
> - case ir_unop_pack_unorm_2x16:
> - case ir_unop_pack_unorm_4x8:
> - case ir_unop_unpack_snorm_2x16:
> - case ir_unop_unpack_snorm_4x8:
> - case ir_unop_unpack_unorm_2x16:
> - case ir_unop_unpack_unorm_4x8:
> - case ir_unop_unpack_half_2x16:
> - case ir_unop_pack_half_2x16:
> - unreachable("not reached: should be handled by lower_packing_builtins");
> - case ir_unop_unpack_half_2x16_split_x:
> - emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
> - break;
> - case ir_unop_unpack_half_2x16_split_y:
> - emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
> - break;
> - case ir_binop_pow:
> - emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
> - break;
> -
> - case ir_unop_bitfield_reverse:
> - emit(BFREV(this->result, op[0]));
> - break;
> - case ir_unop_bit_count:
> - emit(CBIT(this->result, op[0]));
> - break;
> - case ir_unop_find_msb:
> - temp = vgrf(glsl_type::uint_type);
> - emit(FBH(temp, op[0]));
> -
> - /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
> - * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
> - * subtract the result from 31 to convert the MSB count into an LSB count.
> - */
> -
> - /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
> - emit(MOV(this->result, temp));
> - emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
> -
> - temp.negate = true;
> - inst = emit(ADD(this->result, temp, fs_reg(31)));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - break;
> - case ir_unop_find_lsb:
> - emit(FBL(this->result, op[0]));
> - break;
> - case ir_unop_saturate:
> - inst = emit(MOV(this->result, op[0]));
> - inst->saturate = true;
> - break;
> - case ir_triop_bitfield_extract:
> - /* Note that the instruction's argument order is reversed from GLSL
> - * and the IR.
> - */
> - emit(BFE(this->result, op[2], op[1], op[0]));
> - break;
> - case ir_binop_bfm:
> - emit(BFI1(this->result, op[0], op[1]));
> - break;
> - case ir_triop_bfi:
> - emit(BFI2(this->result, op[0], op[1], op[2]));
> - break;
> - case ir_quadop_bitfield_insert:
> - unreachable("not reached: should be handled by "
> - "lower_instructions::bitfield_insert_to_bfm_bfi");
> -
> - case ir_unop_bit_not:
> - emit(NOT(this->result, op[0]));
> - break;
> - case ir_binop_bit_and:
> - emit(AND(this->result, op[0], op[1]));
> - break;
> - case ir_binop_bit_xor:
> - emit(XOR(this->result, op[0], op[1]));
> - break;
> - case ir_binop_bit_or:
> - emit(OR(this->result, op[0], op[1]));
> - break;
> -
> - case ir_binop_lshift:
> - emit(SHL(this->result, op[0], op[1]));
> - break;
> -
> - case ir_binop_rshift:
> - if (ir->type->base_type == GLSL_TYPE_INT)
> - emit(ASR(this->result, op[0], op[1]));
> - else
> - emit(SHR(this->result, op[0], op[1]));
> - break;
> - case ir_binop_pack_half_2x16_split:
> - emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
> - break;
> - case ir_binop_ubo_load: {
> - /* This IR node takes a constant uniform block and a constant or
> - * variable byte offset within the block and loads a vector from that.
> - */
> - ir_constant *const_uniform_block = ir->operands[0]->as_constant();
> - ir_constant *const_offset = ir->operands[1]->as_constant();
> - fs_reg surf_index;
> -
> - if (const_uniform_block) {
> - /* The block index is a constant, so just emit the binding table entry
> - * as an immediate.
> - */
> - surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
> - const_uniform_block->value.u[0]);
> - } else {
> - /* The block index is not a constant. Evaluate the index expression
> - * per-channel and add the base UBO index; the generator will select
> - * a value from any live channel.
> - */
> - surf_index = vgrf(glsl_type::uint_type);
> - emit(ADD(surf_index, op[0],
> - fs_reg(stage_prog_data->binding_table.ubo_start)))
> - ->force_writemask_all = true;
> -
> - /* Assume this may touch any UBO. It would be nice to provide
> - * a tighter bound, but the array information is already lowered away.
> - */
> - brw_mark_surface_used(prog_data,
> - stage_prog_data->binding_table.ubo_start +
> - shader_prog->NumUniformBlocks - 1);
> - }
> -
> - if (const_offset) {
> - fs_reg packed_consts = vgrf(glsl_type::float_type);
> - packed_consts.type = result.type;
> -
> - fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
> - emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
> - packed_consts, surf_index, const_offset_reg));
> -
> - for (int i = 0; i < ir->type->vector_elements; i++) {
> - packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
> -
> - /* The std140 packing rules don't allow vectors to cross 16-byte
> - * boundaries, and a reg is 32 bytes.
> - */
> - assert(packed_consts.subreg_offset < 32);
> -
> - /* UBO bools are any nonzero value. We consider bools to be
> - * values with the low bit set to 1. Convert them using CMP.
> - */
> - if (ir->type->base_type == GLSL_TYPE_BOOL) {
> - emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
> - } else {
> - emit(MOV(result, packed_consts));
> - }
> -
> - result = offset(result, 1);
> - }
> - } else {
> - /* Turn the byte offset into a dword offset. */
> - fs_reg base_offset = vgrf(glsl_type::int_type);
> - emit(SHR(base_offset, op[1], fs_reg(2)));
> -
> - for (int i = 0; i < ir->type->vector_elements; i++) {
> - emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
> - base_offset, i));
> -
> - if (ir->type->base_type == GLSL_TYPE_BOOL)
> - emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
> -
> - result = offset(result, 1);
> - }
> - }
> -
> - result.reg_offset = 0;
> - break;
> - }
> -
> - case ir_triop_fma:
> - /* Note that the instruction's argument order is reversed from GLSL
> - * and the IR.
> - */
> - emit(MAD(this->result, op[2], op[1], op[0]));
> - break;
> -
> - case ir_triop_lrp:
> - emit_lrp(this->result, op[0], op[1], op[2]);
> - break;
> -
> - case ir_triop_csel:
> - case ir_unop_interpolate_at_centroid:
> - case ir_binop_interpolate_at_offset:
> - case ir_binop_interpolate_at_sample:
> - unreachable("already handled above");
> - break;
> -
> - case ir_unop_d2f:
> - case ir_unop_f2d:
> - case ir_unop_d2i:
> - case ir_unop_i2d:
> - case ir_unop_d2u:
> - case ir_unop_u2d:
> - case ir_unop_d2b:
> - case ir_unop_pack_double_2x32:
> - case ir_unop_unpack_double_2x32:
> - case ir_unop_frexp_sig:
> - case ir_unop_frexp_exp:
> - unreachable("fp64 todo");
> - break;
> - }
> -}
> -
> -void
> -fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
> - const glsl_type *type, bool predicated)
> -{
> - switch (type->base_type) {
> - case GLSL_TYPE_FLOAT:
> - case GLSL_TYPE_UINT:
> - case GLSL_TYPE_INT:
> - case GLSL_TYPE_BOOL:
> - for (unsigned int i = 0; i < type->components(); i++) {
> - l.type = brw_type_for_base_type(type);
> - r.type = brw_type_for_base_type(type);
> -
> - if (predicated || !l.equals(r)) {
> - fs_inst *inst = emit(MOV(l, r));
> - inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
> - }
> -
> - l = offset(l, 1);
> - r = offset(r, 1);
> - }
> - break;
> - case GLSL_TYPE_ARRAY:
> - for (unsigned int i = 0; i < type->length; i++) {
> - emit_assignment_writes(l, r, type->fields.array, predicated);
> - }
> - break;
> -
> - case GLSL_TYPE_STRUCT:
> - for (unsigned int i = 0; i < type->length; i++) {
> - emit_assignment_writes(l, r, type->fields.structure[i].type,
> - predicated);
> - }
> - break;
> -
> - case GLSL_TYPE_SAMPLER:
> - case GLSL_TYPE_IMAGE:
> - case GLSL_TYPE_ATOMIC_UINT:
> - break;
> -
> - case GLSL_TYPE_DOUBLE:
> - case GLSL_TYPE_VOID:
> - case GLSL_TYPE_ERROR:
> - case GLSL_TYPE_INTERFACE:
> - unreachable("not reached");
> - }
> -}
> -
> -/* If the RHS processing resulted in an instruction generating a
> - * temporary value, and it would be easy to rewrite the instruction to
> - * generate its result right into the LHS instead, do so. This ends
> - * up reliably removing instructions where it can be tricky to do so
> - * later without real UD chain information.
> - */
> -bool
> -fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
> - fs_reg dst,
> - fs_reg src,
> - fs_inst *pre_rhs_inst,
> - fs_inst *last_rhs_inst)
> -{
> - /* Only attempt if we're doing a direct assignment. */
> - if (ir->condition ||
> - !(ir->lhs->type->is_scalar() ||
> - (ir->lhs->type->is_vector() &&
> - ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
> - return false;
> -
> - /* Make sure the last instruction generated our source reg. */
> - fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
> - last_rhs_inst,
> - src);
> - if (!modify)
> - return false;
> -
> - /* If last_rhs_inst wrote a different number of components than our LHS,
> - * we can't safely rewrite it.
> - */
> - if (alloc.sizes[dst.reg] != modify->regs_written)
> - return false;
> -
> - /* Success! Rewrite the instruction. */
> - modify->dst = dst;
> -
> - return true;
> -}
> -
> -void
> -fs_visitor::visit(ir_assignment *ir)
> -{
> - fs_reg l, r;
> - fs_inst *inst;
> -
> - /* FINISHME: arrays on the lhs */
> - ir->lhs->accept(this);
> - l = this->result;
> -
> - fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
> -
> - ir->rhs->accept(this);
> - r = this->result;
> -
> - fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
> -
> - assert(l.file != BAD_FILE);
> - assert(r.file != BAD_FILE);
> -
> - if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
> - return;
> -
> - if (ir->condition) {
> - emit_bool_to_cond_code(ir->condition);
> - }
> -
> - if (ir->lhs->type->is_scalar() ||
> - ir->lhs->type->is_vector()) {
> - for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
> - if (ir->write_mask & (1 << i)) {
> - inst = emit(MOV(l, r));
> - if (ir->condition)
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - r = offset(r, 1);
> - }
> - l = offset(l, 1);
> - }
> - } else {
> - emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
> - }
> -}
> -
> -fs_inst *
> -fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
> - fs_reg coordinate, int coord_components,
> - fs_reg shadow_c,
> - fs_reg lod, fs_reg dPdy, int grad_components,
> - uint32_t sampler)
> -{
> - int mlen;
> - int base_mrf = 1;
> - bool simd16 = false;
> - fs_reg orig_dst;
> -
> - /* g0 header. */
> - mlen = 1;
> -
> - if (shadow_c.file != BAD_FILE) {
> - for (int i = 0; i < coord_components; i++) {
> - emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> - coordinate = offset(coordinate, 1);
> - }
> -
> - /* gen4's SIMD8 sampler always has the slots for u,v,r present.
> - * the unused slots must be zeroed.
> - */
> - for (int i = coord_components; i < 3; i++) {
> - emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
> - }
> - mlen += 3;
> -
> - if (op == ir_tex) {
> - /* There's no plain shadow compare message, so we use shadow
> - * compare with a bias of 0.0.
> - */
> - emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
> - mlen++;
> - } else if (op == ir_txb || op == ir_txl) {
> - emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
> - mlen++;
> - } else {
> - unreachable("Should not get here.");
> - }
> -
> - emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
> - mlen++;
> - } else if (op == ir_tex) {
> - for (int i = 0; i < coord_components; i++) {
> - emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> - coordinate = offset(coordinate, 1);
> - }
> - /* zero the others. */
> - for (int i = coord_components; i<3; i++) {
> - emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
> - }
> - /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
> - mlen += 3;
> - } else if (op == ir_txd) {
> - fs_reg &dPdx = lod;
> -
> - for (int i = 0; i < coord_components; i++) {
> - emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> - coordinate = offset(coordinate, 1);
> - }
> - /* the slots for u and v are always present, but r is optional */
> - mlen += MAX2(coord_components, 2);
> -
> - /* P = u, v, r
> - * dPdx = dudx, dvdx, drdx
> - * dPdy = dudy, dvdy, drdy
> - *
> - * 1-arg: Does not exist.
> - *
> - * 2-arg: dudx dvdx dudy dvdy
> - * dPdx.x dPdx.y dPdy.x dPdy.y
> - * m4 m5 m6 m7
> - *
> - * 3-arg: dudx dvdx drdx dudy dvdy drdy
> - * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
> - * m5 m6 m7 m8 m9 m10
> - */
> - for (int i = 0; i < grad_components; i++) {
> - emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
> - dPdx = offset(dPdx, 1);
> - }
> - mlen += MAX2(grad_components, 2);
> -
> - for (int i = 0; i < grad_components; i++) {
> - emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
> - dPdy = offset(dPdy, 1);
> - }
> - mlen += MAX2(grad_components, 2);
> - } else if (op == ir_txs) {
> - /* There's no SIMD8 resinfo message on Gen4. Use SIMD16 instead. */
> - simd16 = true;
> - emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
> - mlen += 2;
> - } else {
> - /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod
> - * instructions. We'll need to do SIMD16 here.
> - */
> - simd16 = true;
> - assert(op == ir_txb || op == ir_txl || op == ir_txf);
> -
> - for (int i = 0; i < coord_components; i++) {
> - emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
> - coordinate));
> - coordinate = offset(coordinate, 1);
> - }
> -
> - /* Initialize the rest of u/v/r with 0.0. Empirically, this seems to
> - * be necessary for TXF (ld), but seems wise to do for all messages.
> - */
> - for (int i = coord_components; i < 3; i++) {
> - emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
> - }
> -
> - /* lod/bias appears after u/v/r. */
> - mlen += 6;
> -
> - emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
> - mlen++;
> -
> - /* The unused upper half. */
> - mlen++;
> - }
> -
> - if (simd16) {
> - /* Now, since we're doing simd16, the return is 2 interleaved
> - * vec4s where the odd-indexed ones are junk. We'll need to move
> - * this weirdness around to the expected layout.
> - */
> - orig_dst = dst;
> - dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
> - }
> -
> - enum opcode opcode;
> - switch (op) {
> - case ir_tex: opcode = SHADER_OPCODE_TEX; break;
> - case ir_txb: opcode = FS_OPCODE_TXB; break;
> - case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> - case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> - case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> - case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> - default:
> - unreachable("not reached");
> - }
> -
> - fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
> - inst->base_mrf = base_mrf;
> - inst->mlen = mlen;
> - inst->header_present = true;
> - inst->regs_written = simd16 ? 8 : 4;
> -
> - if (simd16) {
> - for (int i = 0; i < 4; i++) {
> - emit(MOV(orig_dst, dst));
> - orig_dst = offset(orig_dst, 1);
> - dst = offset(dst, 2);
> - }
> - }
> -
> - return inst;
> -}
> -
> -/* gen5's sampler has slots for u, v, r, array index, then optional
> - * parameters like shadow comparator or LOD bias. If optional
> - * parameters aren't present, those base slots are optional and don't
> - * need to be included in the message.
> - *
> - * We don't fill in the unnecessary slots regardless, which may look
> - * surprising in the disassembly.
> - */
> -fs_inst *
> -fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
> - fs_reg coordinate, int vector_elements,
> - fs_reg shadow_c,
> - fs_reg lod, fs_reg lod2, int grad_components,
> - fs_reg sample_index, uint32_t sampler,
> - bool has_offset)
> -{
> - int reg_width = dispatch_width / 8;
> - bool header_present = false;
> -
> - fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
> - fs_reg msg_coords = message;
> -
> - if (has_offset) {
> - /* The offsets set up by the ir_texture visitor are in the
> - * m1 header, so we can't go headerless.
> - */
> - header_present = true;
> - message.reg--;
> - }
> -
> - for (int i = 0; i < vector_elements; i++) {
> - emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
> - coordinate = offset(coordinate, 1);
> - }
> - fs_reg msg_end = offset(msg_coords, vector_elements);
> - fs_reg msg_lod = offset(msg_coords, 4);
> -
> - if (shadow_c.file != BAD_FILE) {
> - fs_reg msg_shadow = msg_lod;
> - emit(MOV(msg_shadow, shadow_c));
> - msg_lod = offset(msg_shadow, 1);
> - msg_end = msg_lod;
> - }
> -
> - enum opcode opcode;
> - switch (op) {
> - case ir_tex:
> - opcode = SHADER_OPCODE_TEX;
> - break;
> - case ir_txb:
> - emit(MOV(msg_lod, lod));
> - msg_end = offset(msg_lod, 1);
> -
> - opcode = FS_OPCODE_TXB;
> - break;
> - case ir_txl:
> - emit(MOV(msg_lod, lod));
> - msg_end = offset(msg_lod, 1);
> -
> - opcode = SHADER_OPCODE_TXL;
> - break;
> - case ir_txd: {
> - /**
> - * P = u, v, r
> - * dPdx = dudx, dvdx, drdx
> - * dPdy = dudy, dvdy, drdy
> - *
> - * Load up these values:
> - * - dudx dudy dvdx dvdy drdx drdy
> - * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
> - */
> - msg_end = msg_lod;
> - for (int i = 0; i < grad_components; i++) {
> - emit(MOV(msg_end, lod));
> - lod = offset(lod, 1);
> - msg_end = offset(msg_end, 1);
> -
> - emit(MOV(msg_end, lod2));
> - lod2 = offset(lod2, 1);
> - msg_end = offset(msg_end, 1);
> - }
> -
> - opcode = SHADER_OPCODE_TXD;
> - break;
> - }
> - case ir_txs:
> - msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
> - emit(MOV(msg_lod, lod));
> - msg_end = offset(msg_lod, 1);
> -
> - opcode = SHADER_OPCODE_TXS;
> - break;
> - case ir_query_levels:
> - msg_lod = msg_end;
> - emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> - msg_end = offset(msg_lod, 1);
> -
> - opcode = SHADER_OPCODE_TXS;
> - break;
> - case ir_txf:
> - msg_lod = offset(msg_coords, 3);
> - emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
> - msg_end = offset(msg_lod, 1);
> -
> - opcode = SHADER_OPCODE_TXF;
> - break;
> - case ir_txf_ms:
> - msg_lod = offset(msg_coords, 3);
> - /* lod */
> - emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> - /* sample index */
> - emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
> - msg_end = offset(msg_lod, 2);
> -
> - opcode = SHADER_OPCODE_TXF_CMS;
> - break;
> - case ir_lod:
> - opcode = SHADER_OPCODE_LOD;
> - break;
> - case ir_tg4:
> - opcode = SHADER_OPCODE_TG4;
> - break;
> - default:
> - unreachable("not reached");
> - }
> -
> - fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
> - inst->base_mrf = message.reg;
> - inst->mlen = msg_end.reg - message.reg;
> - inst->header_present = header_present;
> - inst->regs_written = 4 * reg_width;
> -
> - if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
> - fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
> - " disallowed by hardware\n");
> - }
> -
> - return inst;
> -}
> -
> -static bool
> -is_high_sampler(struct brw_context *brw, fs_reg sampler)
> -{
> - if (brw->gen < 8 && !brw->is_haswell)
> - return false;
> -
> - return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
> -}
> -
> -fs_inst *
> -fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
> - fs_reg coordinate, int coord_components,
> - fs_reg shadow_c,
> - fs_reg lod, fs_reg lod2, int grad_components,
> - fs_reg sample_index, fs_reg mcs, fs_reg sampler,
> - fs_reg offset_value)
> -{
> - int reg_width = dispatch_width / 8;
> - bool header_present = false;
> -
> - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
> - for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
> - sources[i] = vgrf(glsl_type::float_type);
> - }
> - int length = 0;
> -
> - if (op == ir_tg4 || offset_value.file != BAD_FILE ||
> - is_high_sampler(brw, sampler)) {
> - /* For general texture offsets (no txf workaround), we need a header to
> - * put them in. Note that for SIMD16 we're making space for two actual
> - * hardware registers here, so the emit will have to fix up for this.
> - *
> - * * ir_tg4 needs to place its channel select in the header,
> - * for interaction with ARB_texture_swizzle
> - *
> - * The sampler index is only 4-bits, so for larger sampler numbers we
> - * need to offset the Sampler State Pointer in the header.
> - */
> - header_present = true;
> - sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> - length++;
> - }
> -
> - if (shadow_c.file != BAD_FILE) {
> - emit(MOV(sources[length], shadow_c));
> - length++;
> - }
> -
> - bool has_nonconstant_offset =
> - offset_value.file != BAD_FILE && offset_value.file != IMM;
> - bool coordinate_done = false;
> -
> - /* Set up the LOD info */
> - switch (op) {
> - case ir_tex:
> - case ir_lod:
> - break;
> - case ir_txb:
> - emit(MOV(sources[length], lod));
> - length++;
> - break;
> - case ir_txl:
> - emit(MOV(sources[length], lod));
> - length++;
> - break;
> - case ir_txd: {
> - no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
> -
> - /* Load dPdx and the coordinate together:
> - * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
> - */
> - for (int i = 0; i < coord_components; i++) {
> - emit(MOV(sources[length], coordinate));
> - coordinate = offset(coordinate, 1);
> - length++;
> -
> - /* For cube map array, the coordinate is (u,v,r,ai) but there are
> - * only derivatives for (u, v, r).
> - */
> - if (i < grad_components) {
> - emit(MOV(sources[length], lod));
> - lod = offset(lod, 1);
> - length++;
> -
> - emit(MOV(sources[length], lod2));
> - lod2 = offset(lod2, 1);
> - length++;
> - }
> - }
> -
> - coordinate_done = true;
> - break;
> - }
> - case ir_txs:
> - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
> - length++;
> - break;
> - case ir_query_levels:
> - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> - length++;
> - break;
> - case ir_txf:
> - /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
> - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> - coordinate = offset(coordinate, 1);
> - length++;
> -
> - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
> - length++;
> -
> - for (int i = 1; i < coord_components; i++) {
> - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> - coordinate = offset(coordinate, 1);
> - length++;
> - }
> -
> - coordinate_done = true;
> - break;
> - case ir_txf_ms:
> - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
> - length++;
> -
> - /* data from the multisample control surface */
> - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
> - length++;
> -
> - /* there is no offsetting for this message; just copy in the integer
> - * texture coordinates
> - */
> - for (int i = 0; i < coord_components; i++) {
> - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> - coordinate = offset(coordinate, 1);
> - length++;
> - }
> -
> - coordinate_done = true;
> - break;
> - case ir_tg4:
> - if (has_nonconstant_offset) {
> - if (shadow_c.file != BAD_FILE)
> - no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
> -
> - /* More crazy intermixing */
> - for (int i = 0; i < 2; i++) { /* u, v */
> - emit(MOV(sources[length], coordinate));
> - coordinate = offset(coordinate, 1);
> - length++;
> - }
> -
> - for (int i = 0; i < 2; i++) { /* offu, offv */
> - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
> - offset_value = offset(offset_value, 1);
> - length++;
> - }
> -
> - if (coord_components == 3) { /* r if present */
> - emit(MOV(sources[length], coordinate));
> - coordinate = offset(coordinate, 1);
> - length++;
> - }
> -
> - coordinate_done = true;
> - }
> - break;
> - }
> -
> - /* Set up the coordinate (except for cases where it was done above) */
> - if (!coordinate_done) {
> - for (int i = 0; i < coord_components; i++) {
> - emit(MOV(sources[length], coordinate));
> - coordinate = offset(coordinate, 1);
> - length++;
> - }
> - }
> -
> - int mlen;
> - if (reg_width == 2)
> - mlen = length * reg_width - header_present;
> - else
> - mlen = length * reg_width;
> -
> - fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> - BRW_REGISTER_TYPE_F);
> - emit(LOAD_PAYLOAD(src_payload, sources, length));
> -
> - /* Generate the SEND */
> - enum opcode opcode;
> - switch (op) {
> - case ir_tex: opcode = SHADER_OPCODE_TEX; break;
> - case ir_txb: opcode = FS_OPCODE_TXB; break;
> - case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> - case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> - case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> - case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
> - case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> - case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
> - case ir_lod: opcode = SHADER_OPCODE_LOD; break;
> - case ir_tg4:
> - if (has_nonconstant_offset)
> - opcode = SHADER_OPCODE_TG4_OFFSET;
> - else
> - opcode = SHADER_OPCODE_TG4;
> - break;
> - default:
> - unreachable("not reached");
> - }
> - fs_inst *inst = emit(opcode, dst, src_payload, sampler);
> - inst->base_mrf = -1;
> - inst->mlen = mlen;
> - inst->header_present = header_present;
> - inst->regs_written = 4 * reg_width;
> -
> - if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
> - fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
> - " disallowed by hardware\n");
> - }
> -
> - return inst;
> -}
> -
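
An aside on the mlen arithmetic just above, since it took me a second read: every payload slot except the header is a full SIMD-width register, while the header is always exactly one register, which is where the "- header_present" comes from in the SIMD16 case. A scalar sketch of that arithmetic (my own illustration, not driver code):

    /* e.g. SIMD16 with header + shadow ref + two coordinates:
     * length = 4, reg_width = 2  ->  mlen = 4 * 2 - 1 = 7 registers */
    static int example_sampler_mlen(int length, int reg_width, bool header_present)
    {
       return (reg_width == 2) ? length * reg_width - header_present
                               : length * reg_width;
    }
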
> -fs_reg
> -fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
> - bool is_rect, uint32_t sampler, int texunit)
> -{
> - fs_inst *inst = NULL;
> - bool needs_gl_clamp = true;
> - fs_reg scale_x, scale_y;
> -
> - /* The 965 requires the EU to do the normalization of GL rectangle
> - * texture coordinates. We use the program parameter state
> - * tracking to get the scaling factor.
> - */
> - if (is_rect &&
> - (brw->gen < 6 ||
> - (brw->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
> - key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
> - struct gl_program_parameter_list *params = prog->Parameters;
> - int tokens[STATE_LENGTH] = {
> - STATE_INTERNAL,
> - STATE_TEXRECT_SCALE,
> - texunit,
> - 0,
> - 0
> - };
> -
> - no16("rectangle scale uniform setup not supported on SIMD16\n");
> - if (dispatch_width == 16) {
> - return coordinate;
> - }
> -
> - GLuint index = _mesa_add_state_reference(params,
> - (gl_state_index *)tokens);
> - /* Try to find existing copies of the texrect scale uniforms. */
> - for (unsigned i = 0; i < uniforms; i++) {
> - if (stage_prog_data->param[i] ==
> - &prog->Parameters->ParameterValues[index][0]) {
> - scale_x = fs_reg(UNIFORM, i);
> - scale_y = fs_reg(UNIFORM, i + 1);
> - break;
> - }
> - }
> -
> - /* If we didn't already set them up, do so now. */
> - if (scale_x.file == BAD_FILE) {
> - scale_x = fs_reg(UNIFORM, uniforms);
> - scale_y = fs_reg(UNIFORM, uniforms + 1);
> -
> - stage_prog_data->param[uniforms++] =
> - &prog->Parameters->ParameterValues[index][0];
> - stage_prog_data->param[uniforms++] =
> - &prog->Parameters->ParameterValues[index][1];
> - }
> - }
> -
> - /* The 965 requires the EU to do the normalization of GL rectangle
> - * texture coordinates. We use the program parameter state
> - * tracking to get the scaling factor.
> - */
> - if (brw->gen < 6 && is_rect) {
> - fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
> - fs_reg src = coordinate;
> - coordinate = dst;
> -
> - emit(MUL(dst, src, scale_x));
> - dst = offset(dst, 1);
> - src = offset(src, 1);
> - emit(MUL(dst, src, scale_y));
> - } else if (is_rect) {
> - /* On gen6+, the sampler handles the rectangle coordinates
> - * natively, without needing rescaling. But that means we have
> - * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
> - * not [0, 1] like the default case below.
> - */
> - needs_gl_clamp = false;
> -
> - for (int i = 0; i < 2; i++) {
> - if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
> - fs_reg chan = coordinate;
> - chan = offset(chan, i);
> -
> - inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
> - inst->conditional_mod = BRW_CONDITIONAL_GE;
> -
> - /* Our parameter comes in as 1.0/width or 1.0/height,
> - * because that's what people normally want for doing
> - * texture rectangle handling. We need width or height
> - * for clamping, but we don't care enough to make a new
> - * parameter type, so just invert back.
> - */
> - fs_reg limit = vgrf(glsl_type::float_type);
> - emit(MOV(limit, i == 0 ? scale_x : scale_y));
> - emit(SHADER_OPCODE_RCP, limit, limit);
> -
> - inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
> - inst->conditional_mod = BRW_CONDITIONAL_L;
> - }
> - }
> - }
> -
> - if (coord_components > 0 && needs_gl_clamp) {
> - for (int i = 0; i < MIN2(coord_components, 3); i++) {
> - if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
> - fs_reg chan = coordinate;
> - chan = offset(chan, i);
> -
> - fs_inst *inst = emit(MOV(chan, chan));
> - inst->saturate = true;
> - }
> - }
> - }
> - return coordinate;
> -}
> -
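
A note on the gen6+ rectangle path above: since the uniform comes in as 1.0/width (or 1.0/height), the upper clamp bound has to be recovered with an RCP before the second SEL. In scalar terms the two SELs boil down to this (a sketch assuming scale really is 1/width as set up earlier, not the driver code itself):

    static float clamp_rect_coord(float u, float scale)
    {
       float limit = 1.0f / scale;     /* SHADER_OPCODE_RCP on the uniform */
       u = (u >= 0.0f) ? u : 0.0f;     /* SEL with BRW_CONDITIONAL_GE */
       u = (u < limit) ? u : limit;    /* SEL with BRW_CONDITIONAL_L */
       return u;
    }
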
> -/* Sample from the MCS surface attached to this multisample texture. */
> -fs_reg
> -fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
> -{
> - int reg_width = dispatch_width / 8;
> - fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
> - BRW_REGISTER_TYPE_F);
> - fs_reg dest = vgrf(glsl_type::uvec4_type);
> - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
> -
> - /* parameters are: u, v, r; missing parameters are treated as zero */
> - for (int i = 0; i < components; i++) {
> - sources[i] = vgrf(glsl_type::float_type);
> - emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
> - coordinate = offset(coordinate, 1);
> - }
> -
> - emit(LOAD_PAYLOAD(payload, sources, components));
> -
> - fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
> - inst->base_mrf = -1;
> - inst->mlen = components * reg_width;
> - inst->header_present = false;
> - inst->regs_written = 4 * reg_width; /* we only care about one reg of
> - * response, but the sampler always
> - * writes 4/8
> - */
> -
> - return dest;
> -}
> -
> -void
> -fs_visitor::emit_texture(ir_texture_opcode op,
> - const glsl_type *dest_type,
> - fs_reg coordinate, int coord_components,
> - fs_reg shadow_c,
> - fs_reg lod, fs_reg lod2, int grad_components,
> - fs_reg sample_index,
> - fs_reg offset_value,
> - fs_reg mcs,
> - int gather_component,
> - bool is_cube_array,
> - bool is_rect,
> - uint32_t sampler,
> - fs_reg sampler_reg, int texunit)
> -{
> - fs_inst *inst = NULL;
> -
> - if (op == ir_tg4) {
> - /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
> - * emitting anything other than setting up the constant result.
> - */
> - int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
> - if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
> -
> - fs_reg res = vgrf(glsl_type::vec4_type);
> - this->result = res;
> -
> - for (int i=0; i<4; i++) {
> - emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
> - res = offset(res, 1);
> - }
> - return;
> - }
> - }
> -
> - if (coordinate.file != BAD_FILE) {
> - /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
> - * samplers. This should only be a problem with GL_CLAMP on Gen7.
> - */
> - coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
> - sampler, texunit);
> - }
> -
> - /* Writemasking doesn't eliminate channels on SIMD8 texture
> - * samples, so don't worry about them.
> - */
> - fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
> -
> - if (brw->gen >= 7) {
> - inst = emit_texture_gen7(op, dst, coordinate, coord_components,
> - shadow_c, lod, lod2, grad_components,
> - sample_index, mcs, sampler_reg,
> - offset_value);
> - } else if (brw->gen >= 5) {
> - inst = emit_texture_gen5(op, dst, coordinate, coord_components,
> - shadow_c, lod, lod2, grad_components,
> - sample_index, sampler,
> - offset_value.file != BAD_FILE);
> - } else {
> - inst = emit_texture_gen4(op, dst, coordinate, coord_components,
> - shadow_c, lod, lod2, grad_components,
> - sampler);
> - }
> -
> - if (shadow_c.file != BAD_FILE)
> - inst->shadow_compare = true;
> -
> - if (offset_value.file == IMM)
> - inst->offset = offset_value.fixed_hw_reg.dw1.ud;
> -
> - if (op == ir_tg4) {
> - inst->offset |=
> - gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
> -
> - if (brw->gen == 6)
> - emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
> - }
> -
> - /* fixup #layers for cube map arrays */
> - if (op == ir_txs && is_cube_array) {
> - fs_reg depth = offset(dst, 2);
> - fs_reg fixed_depth = vgrf(glsl_type::int_type);
> - emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
> -
> - fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
> - int components = inst->regs_written / (dst.width / 8);
> - for (int i = 0; i < components; i++) {
> - if (i == 2) {
> - fixed_payload[i] = fixed_depth;
> - } else {
> - fixed_payload[i] = offset(dst, i);
> - }
> - }
> - emit(LOAD_PAYLOAD(dst, fixed_payload, components));
> - }
> -
> - swizzle_result(op, dest_type->vector_elements, dst, sampler);
> -}
> -
> -void
> -fs_visitor::visit(ir_texture *ir)
> -{
> - uint32_t sampler =
> - _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
> -
> - ir_rvalue *nonconst_sampler_index =
> - _mesa_get_sampler_array_nonconst_index(ir->sampler);
> -
> - /* Handle non-constant sampler array indexing */
> - fs_reg sampler_reg;
> - if (nonconst_sampler_index) {
> - /* The highest sampler which may be used by this operation is
> - * the last element of the array. Mark it here, because the generator
> - * doesn't have enough information to determine the bound.
> - */
> - uint32_t array_size = ir->sampler->as_dereference_array()
> - ->array->type->array_size();
> -
> - uint32_t max_used = sampler + array_size - 1;
> - if (ir->op == ir_tg4 && brw->gen < 8) {
> - max_used += stage_prog_data->binding_table.gather_texture_start;
> - } else {
> - max_used += stage_prog_data->binding_table.texture_start;
> - }
> -
> - brw_mark_surface_used(prog_data, max_used);
> -
> - /* Emit code to evaluate the actual indexing expression */
> - nonconst_sampler_index->accept(this);
> - fs_reg temp = vgrf(glsl_type::uint_type);
> - emit(ADD(temp, this->result, fs_reg(sampler)))
> - ->force_writemask_all = true;
> - sampler_reg = temp;
> - } else {
> - /* Single sampler, or constant array index; the indexing expression
> - * is just an immediate.
> - */
> - sampler_reg = fs_reg(sampler);
> - }
> -
> - /* FINISHME: We're failing to recompile our programs when the sampler is
> - * updated. This only matters for the texture rectangle scale parameters
> - * (pre-gen6, or gen6+ with GL_CLAMP).
> - */
> - int texunit = prog->SamplerUnits[sampler];
> -
> - /* Should be lowered by do_lower_texture_projection */
> - assert(!ir->projector);
> -
> - /* Should be lowered */
> - assert(!ir->offset || !ir->offset->type->is_array());
> -
> - /* Generate code to compute all the subexpression trees. This has to be
> - * done before loading any values into MRFs for the sampler message since
> - * generating these values may involve SEND messages that need the MRFs.
> - */
> - fs_reg coordinate;
> - int coord_components = 0;
> - if (ir->coordinate) {
> - coord_components = ir->coordinate->type->vector_elements;
> - ir->coordinate->accept(this);
> - coordinate = this->result;
> - }
> -
> - fs_reg shadow_comparitor;
> - if (ir->shadow_comparitor) {
> - ir->shadow_comparitor->accept(this);
> - shadow_comparitor = this->result;
> - }
> -
> - fs_reg offset_value;
> - if (ir->offset) {
> - ir_constant *const_offset = ir->offset->as_constant();
> - if (const_offset) {
> - /* Store the header bitfield in an IMM register. This allows us to
> - * use offset_value.file to distinguish between no offset, a constant
> - * offset, and a non-constant offset.
> - */
> - offset_value =
> - fs_reg(brw_texture_offset(ctx, const_offset->value.i,
> - const_offset->type->vector_elements));
> - } else {
> - ir->offset->accept(this);
> - offset_value = this->result;
> - }
> - }
> -
> - fs_reg lod, lod2, sample_index, mcs;
> - int grad_components = 0;
> - switch (ir->op) {
> - case ir_tex:
> - case ir_lod:
> - case ir_tg4:
> - case ir_query_levels:
> - break;
> - case ir_txb:
> - ir->lod_info.bias->accept(this);
> - lod = this->result;
> - break;
> - case ir_txd:
> - ir->lod_info.grad.dPdx->accept(this);
> - lod = this->result;
> -
> - ir->lod_info.grad.dPdy->accept(this);
> - lod2 = this->result;
> -
> - grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
> - break;
> - case ir_txf:
> - case ir_txl:
> - case ir_txs:
> - ir->lod_info.lod->accept(this);
> - lod = this->result;
> - break;
> - case ir_txf_ms:
> - ir->lod_info.sample_index->accept(this);
> - sample_index = this->result;
> -
> - if (brw->gen >= 7 &&
> - key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
> - mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
> - sampler_reg);
> - } else {
> - mcs = fs_reg(0u);
> - }
> - break;
> - default:
> - unreachable("Unrecognized texture opcode");
> - };
> -
> - int gather_component = 0;
> - if (ir->op == ir_tg4)
> - gather_component = ir->lod_info.component->as_constant()->value.i[0];
> -
> - bool is_rect =
> - ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
> -
> - bool is_cube_array =
> - ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
> - ir->sampler->type->sampler_array;
> -
> - emit_texture(ir->op, ir->type, coordinate, coord_components,
> - shadow_comparitor, lod, lod2, grad_components,
> - sample_index, offset_value, mcs,
> - gather_component, is_cube_array, is_rect, sampler,
> - sampler_reg, texunit);
> -}
> -
> -/**
> - * Apply workarounds for Gen6 gather with UINT/SINT
> - */
> -void
> -fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
> -{
> - if (!wa)
> - return;
> -
> - int width = (wa & WA_8BIT) ? 8 : 16;
> -
> - for (int i = 0; i < 4; i++) {
> - fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
> - /* Convert from UNORM to UINT */
> - emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
> - emit(MOV(dst, dst_f));
> -
> - if (wa & WA_SIGN) {
> - /* Reinterpret the UINT value as a signed INT value by
> - * shifting the sign bit into place, then shifting back
> - * preserving sign.
> - */
> - emit(SHL(dst, dst, fs_reg(32 - width)));
> - emit(ASR(dst, dst, fs_reg(32 - width)));
> - }
> -
> - dst = offset(dst, 1);
> - }
> -}
> -
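
For anyone else puzzling over the gather workaround just above: the sampler returns the SINT/UINT texel as if it were UNORM, so the fixup rescales by 2^width - 1 and, for signed formats, sign-extends with the SHL/ASR pair. A scalar sketch of the same math, assuming an 8-bit signed surface (illustration only):

    #include <stdint.h>

    static int32_t gen6_gather_wa_example(float unorm_result)
    {
       const int width = 8;                                        /* WA_8BIT */
       int32_t v = (int32_t)(unorm_result * ((1 << width) - 1));   /* UNORM -> integer */
       v = (int32_t)((uint32_t)v << (32 - width)) >> (32 - width); /* WA_SIGN: sign-extend */
       return v;
    }
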
> -/**
> - * Set up the gather channel based on the swizzle, for gather4.
> - */
> -uint32_t
> -fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
> -{
> - int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
> - switch (swiz) {
> - case SWIZZLE_X: return 0;
> - case SWIZZLE_Y:
> - /* gather4 sampler is broken for green channel on RG32F --
> - * we must ask for blue instead.
> - */
> - if (key_tex->gather_channel_quirk_mask & (1 << sampler))
> - return 2;
> - return 1;
> - case SWIZZLE_Z: return 2;
> - case SWIZZLE_W: return 3;
> - default:
> - unreachable("Not reached"); /* zero, one swizzles handled already */
> - }
> -}
> -
> -/**
> - * Swizzle the result of a texture result. This is necessary for
> - * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
> - */
> -void
> -fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
> - fs_reg orig_val, uint32_t sampler)
> -{
> - if (op == ir_query_levels) {
> - /* # levels is in .w */
> - this->result = offset(orig_val, 3);
> - return;
> - }
> -
> - this->result = orig_val;
> -
> - /* txs,lod don't actually sample the texture, so swizzling the result
> - * makes no sense.
> - */
> - if (op == ir_txs || op == ir_lod || op == ir_tg4)
> - return;
> -
> - if (dest_components == 1) {
> - /* Ignore DEPTH_TEXTURE_MODE swizzling. */
> - } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
> - fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
> - swizzled_result.type = orig_val.type;
> -
> - for (int i = 0; i < 4; i++) {
> - int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
> - fs_reg l = swizzled_result;
> - l = offset(l, i);
> -
> - if (swiz == SWIZZLE_ZERO) {
> - emit(MOV(l, fs_reg(0.0f)));
> - } else if (swiz == SWIZZLE_ONE) {
> - emit(MOV(l, fs_reg(1.0f)));
> - } else {
> - emit(MOV(l, offset(orig_val,
> - GET_SWZ(key_tex->swizzles[sampler], i))));
> - }
> - }
> - this->result = swizzled_result;
> - }
> -}
> -
> -void
> -fs_visitor::visit(ir_swizzle *ir)
> -{
> - ir->val->accept(this);
> - fs_reg val = this->result;
> -
> - if (ir->type->vector_elements == 1) {
> - this->result = offset(this->result, ir->mask.x);
> - return;
> - }
> -
> - fs_reg result = vgrf(ir->type);
> - this->result = result;
> -
> - for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
> - fs_reg channel = val;
> - int swiz = 0;
> -
> - switch (i) {
> - case 0:
> - swiz = ir->mask.x;
> - break;
> - case 1:
> - swiz = ir->mask.y;
> - break;
> - case 2:
> - swiz = ir->mask.z;
> - break;
> - case 3:
> - swiz = ir->mask.w;
> - break;
> - }
> -
> - emit(MOV(result, offset(channel, swiz)));
> - result = offset(result, 1);
> - }
> -}
> -
> -void
> -fs_visitor::visit(ir_discard *ir)
> -{
> - /* We track our discarded pixels in f0.1. By predicating on it, we can
> - * update just the flag bits that aren't yet discarded. If there's no
> - * condition, we emit a CMP of g0 != g0, so all currently executing
> - * channels will get turned off.
> - */
> - fs_inst *cmp;
> - if (ir->condition) {
> - emit_bool_to_cond_code(ir->condition);
> - cmp = (fs_inst *) this->instructions.get_tail();
> - cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
> - } else {
> - fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
> - BRW_REGISTER_TYPE_UW));
> - cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
> - }
> - cmp->predicate = BRW_PREDICATE_NORMAL;
> - cmp->flag_subreg = 1;
> -
> - if (brw->gen >= 6) {
> - emit_discard_jump();
> - }
> -}
> -
> -void
> -fs_visitor::visit(ir_constant *ir)
> -{
> - /* Set this->result to reg at the bottom of the function because some code
> - * paths will cause this visitor to be applied to other fields. This will
> - * cause the value stored in this->result to be modified.
> - *
> - * Make reg constant so that it doesn't get accidentally modified along the
> - * way. Yes, I actually had this problem. :(
> - */
> - const fs_reg reg = vgrf(ir->type);
> - fs_reg dst_reg = reg;
> -
> - if (ir->type->is_array()) {
> - const unsigned size = type_size(ir->type->fields.array);
> -
> - for (unsigned i = 0; i < ir->type->length; i++) {
> - ir->array_elements[i]->accept(this);
> - fs_reg src_reg = this->result;
> -
> - dst_reg.type = src_reg.type;
> - for (unsigned j = 0; j < size; j++) {
> - emit(MOV(dst_reg, src_reg));
> - src_reg = offset(src_reg, 1);
> - dst_reg = offset(dst_reg, 1);
> - }
> - }
> - } else if (ir->type->is_record()) {
> - foreach_in_list(ir_constant, field, &ir->components) {
> - const unsigned size = type_size(field->type);
> -
> - field->accept(this);
> - fs_reg src_reg = this->result;
> -
> - dst_reg.type = src_reg.type;
> - for (unsigned j = 0; j < size; j++) {
> - emit(MOV(dst_reg, src_reg));
> - src_reg = offset(src_reg, 1);
> - dst_reg = offset(dst_reg, 1);
> - }
> - }
> - } else {
> - const unsigned size = type_size(ir->type);
> -
> - for (unsigned i = 0; i < size; i++) {
> - switch (ir->type->base_type) {
> - case GLSL_TYPE_FLOAT:
> - emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
> - break;
> - case GLSL_TYPE_UINT:
> - emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
> - break;
> - case GLSL_TYPE_INT:
> - emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
> - break;
> - case GLSL_TYPE_BOOL:
> - emit(MOV(dst_reg,
> - fs_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
> - : 0)));
> - break;
> - default:
> - unreachable("Non-float/uint/int/bool constant");
> - }
> - dst_reg = offset(dst_reg, 1);
> - }
> - }
> -
> - this->result = reg;
> -}
> -
> -void
> -fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
> -{
> - ir_expression *expr = ir->as_expression();
> -
> - if (!expr || expr->operation == ir_binop_ubo_load) {
> - ir->accept(this);
> -
> - fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - return;
> - }
> -
> - fs_reg op[3];
> -
> - assert(expr->get_num_operands() <= 3);
> - for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> - assert(expr->operands[i]->type->is_scalar());
> -
> - expr->operands[i]->accept(this);
> - op[i] = this->result;
> -
> - resolve_ud_negate(&op[i]);
> - }
> -
> - emit_bool_to_cond_code_of_reg(expr, op);
> -}
> -
> -void
> -fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
> -{
> - fs_inst *inst;
> -
> - switch (expr->operation) {
> - case ir_unop_logic_not:
> - inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
> - inst->conditional_mod = BRW_CONDITIONAL_Z;
> - break;
> -
> - case ir_binop_logic_xor:
> - if (brw->gen <= 5) {
> - fs_reg temp = vgrf(expr->type);
> - emit(XOR(temp, op[0], op[1]));
> - inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> - } else {
> - inst = emit(XOR(reg_null_d, op[0], op[1]));
> - }
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - break;
> -
> - case ir_binop_logic_or:
> - if (brw->gen <= 5) {
> - fs_reg temp = vgrf(expr->type);
> - emit(OR(temp, op[0], op[1]));
> - inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> - } else {
> - inst = emit(OR(reg_null_d, op[0], op[1]));
> - }
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - break;
> -
> - case ir_binop_logic_and:
> - if (brw->gen <= 5) {
> - fs_reg temp = vgrf(expr->type);
> - emit(AND(temp, op[0], op[1]));
> - inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> - } else {
> - inst = emit(AND(reg_null_d, op[0], op[1]));
> - }
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - break;
> -
> - case ir_unop_f2b:
> - if (brw->gen >= 6) {
> - emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> - } else {
> - inst = emit(MOV(reg_null_f, op[0]));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - }
> - break;
> -
> - case ir_unop_i2b:
> - if (brw->gen >= 6) {
> - emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> - } else {
> - inst = emit(MOV(reg_null_d, op[0]));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - }
> - break;
> -
> - case ir_binop_greater:
> - case ir_binop_gequal:
> - case ir_binop_less:
> - case ir_binop_lequal:
> - case ir_binop_equal:
> - case ir_binop_all_equal:
> - case ir_binop_nequal:
> - case ir_binop_any_nequal:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(expr->operands[0], &op[0]);
> - resolve_bool_comparison(expr->operands[1], &op[1]);
> - }
> -
> - emit(CMP(reg_null_d, op[0], op[1],
> - brw_conditional_for_comparison(expr->operation)));
> - break;
> -
> - case ir_triop_csel: {
> - /* Expand the boolean condition into the flag register. */
> - inst = emit(MOV(reg_null_d, op[0]));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -
> - /* Select which boolean to return. */
> - fs_reg temp = vgrf(expr->operands[1]->type);
> - inst = emit(SEL(temp, op[1], op[2]));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> -
> - /* Expand the result to a condition code. */
> - inst = emit(MOV(reg_null_d, temp));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - break;
> - }
> -
> - default:
> - unreachable("not reached");
> - }
> -}
> -
> -/**
> - * Emit a gen6 IF statement with the comparison folded into the IF
> - * instruction.
> - */
> -void
> -fs_visitor::emit_if_gen6(ir_if *ir)
> -{
> - ir_expression *expr = ir->condition->as_expression();
> -
> - if (expr && expr->operation != ir_binop_ubo_load) {
> - fs_reg op[3];
> - fs_inst *inst;
> - fs_reg temp;
> -
> - assert(expr->get_num_operands() <= 3);
> - for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> - assert(expr->operands[i]->type->is_scalar());
> -
> - expr->operands[i]->accept(this);
> - op[i] = this->result;
> - }
> -
> - switch (expr->operation) {
> - case ir_unop_logic_not:
> - emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
> - return;
> -
> - case ir_binop_logic_xor:
> - emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
> - return;
> -
> - case ir_binop_logic_or:
> - temp = vgrf(glsl_type::bool_type);
> - emit(OR(temp, op[0], op[1]));
> - emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> - return;
> -
> - case ir_binop_logic_and:
> - temp = vgrf(glsl_type::bool_type);
> - emit(AND(temp, op[0], op[1]));
> - emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> - return;
> -
> - case ir_unop_f2b:
> - inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - return;
> -
> - case ir_unop_i2b:
> - emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> - return;
> -
> - case ir_binop_greater:
> - case ir_binop_gequal:
> - case ir_binop_less:
> - case ir_binop_lequal:
> - case ir_binop_equal:
> - case ir_binop_all_equal:
> - case ir_binop_nequal:
> - case ir_binop_any_nequal:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(expr->operands[0], &op[0]);
> - resolve_bool_comparison(expr->operands[1], &op[1]);
> - }
> -
> - emit(IF(op[0], op[1],
> - brw_conditional_for_comparison(expr->operation)));
> - return;
> -
> - case ir_triop_csel: {
> - /* Expand the boolean condition into the flag register. */
> - fs_inst *inst = emit(MOV(reg_null_d, op[0]));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -
> - /* Select which boolean to use as the result. */
> - fs_reg temp = vgrf(expr->operands[1]->type);
> - inst = emit(SEL(temp, op[1], op[2]));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> -
> - emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> - return;
> - }
> -
> - default:
> - unreachable("not reached");
> - }
> - }
> -
> - ir->condition->accept(this);
> - emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
> -}
> -
> -bool
> -fs_visitor::try_opt_frontfacing_ternary(ir_if *ir)
> -{
> - ir_dereference_variable *deref = ir->condition->as_dereference_variable();
> - if (!deref || strcmp(deref->var->name, "gl_FrontFacing") != 0)
> - return false;
> -
> - if (ir->then_instructions.length() != 1 ||
> - ir->else_instructions.length() != 1)
> - return false;
> -
> - ir_assignment *then_assign =
> - ((ir_instruction *)ir->then_instructions.head)->as_assignment();
> - ir_assignment *else_assign =
> - ((ir_instruction *)ir->else_instructions.head)->as_assignment();
> -
> - if (!then_assign || then_assign->condition ||
> - !else_assign || else_assign->condition ||
> - then_assign->write_mask != else_assign->write_mask ||
> - !then_assign->lhs->equals(else_assign->lhs))
> - return false;
> -
> - ir_constant *then_rhs = then_assign->rhs->as_constant();
> - ir_constant *else_rhs = else_assign->rhs->as_constant();
> -
> - if (!then_rhs || !else_rhs)
> - return false;
> -
> - if (then_rhs->type->base_type != GLSL_TYPE_FLOAT)
> - return false;
> -
> - if ((then_rhs->is_one() && else_rhs->is_negative_one()) ||
> - (else_rhs->is_one() && then_rhs->is_negative_one())) {
> - then_assign->lhs->accept(this);
> - fs_reg dst = this->result;
> - dst.type = BRW_REGISTER_TYPE_D;
> - fs_reg tmp = vgrf(glsl_type::int_type);
> -
> - if (brw->gen >= 6) {
> - /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
> - fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
> -
> - /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
> - *
> - * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
> - * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
> - *
> - * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
> - */
> -
> - if (then_rhs->is_negative_one()) {
> - assert(else_rhs->is_one());
> - g0.negate = true;
> - }
> -
> - tmp.type = BRW_REGISTER_TYPE_W;
> - tmp.subreg_offset = 2;
> - tmp.stride = 2;
> -
> - fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
> - or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
> -
> - tmp.type = BRW_REGISTER_TYPE_D;
> - tmp.subreg_offset = 0;
> - tmp.stride = 1;
> - } else {
> - /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
> - fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
> -
> - /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
> - *
> - * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D
> - * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
> - *
> - * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
> - */
> -
> - if (then_rhs->is_negative_one()) {
> - assert(else_rhs->is_one());
> - g1_6.negate = true;
> - }
> -
> - emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
> - }
> - emit(AND(dst, tmp, fs_reg(0xbf800000)));
> - return true;
> - }
> -
> - return false;
> -}
> -
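
The gl_FrontFacing trick above is worth spelling out in plain integer arithmetic (my illustration, not i965 IR): the OR plants the exponent bits of 1.0f in the high half-word next to the front-facing bit, and the AND then keeps only the sign bit and those exponent bits.

    #include <stdint.h>

    /* g0_word stands in for the gen6+ payload word whose bit 15 is 0 when
     * the polygon is front facing. */
    static uint32_t frontfacing_pm1_bits(uint16_t g0_word)
    {
       uint32_t dw = (uint32_t)(g0_word | 0x3f80) << 16; /* OR into the high half-word */
       return dw & 0xbf800000u;  /* 0x3f800000 (+1.0f) if bit 15 was clear,
                                  * 0xbf800000 (-1.0f) if it was set */
    }
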
> -/**
> - * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
> - *
> - * Many GLSL shaders contain the following pattern:
> - *
> - * x = condition ? foo : bar
> - *
> - * The compiler emits an ir_if tree for this, since each subexpression might be
> - * a complex tree that could have side-effects or short-circuit logic.
> - *
> - * However, the common case is to simply select one of two constants or
> - * variable values---which is exactly what SEL is for. In this case, the
> - * assembly looks like:
> - *
> - * (+f0) IF
> - * MOV dst src0
> - * ELSE
> - * MOV dst src1
> - * ENDIF
> - *
> - * which can be easily translated into:
> - *
> - * (+f0) SEL dst src0 src1
> - *
> - * If src0 is an immediate value, we promote it to a temporary GRF.
> - */
> -bool
> -fs_visitor::try_replace_with_sel()
> -{
> - fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
> - assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
> -
> - /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
> - int opcodes[] = {
> - BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
> - };
> -
> - fs_inst *match = (fs_inst *) endif_inst->prev;
> - for (int i = 0; i < 4; i++) {
> - if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
> - return false;
> - match = (fs_inst *) match->prev;
> - }
> -
> - /* The opcodes match; it looks like the right sequence of instructions. */
> - fs_inst *else_mov = (fs_inst *) endif_inst->prev;
> - fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
> - fs_inst *if_inst = (fs_inst *) then_mov->prev;
> -
> - /* Check that the MOVs are the right form. */
> - if (then_mov->dst.equals(else_mov->dst) &&
> - !then_mov->is_partial_write() &&
> - !else_mov->is_partial_write()) {
> -
> - /* Remove the matched instructions; we'll emit a SEL to replace them. */
> - while (!if_inst->next->is_tail_sentinel())
> - if_inst->next->exec_node::remove();
> - if_inst->exec_node::remove();
> -
> - /* Only the last source register can be a constant, so if the MOV in
> - * the "then" clause uses a constant, we need to put it in a temporary.
> - */
> - fs_reg src0(then_mov->src[0]);
> - if (src0.file == IMM) {
> - src0 = vgrf(glsl_type::float_type);
> - src0.type = then_mov->src[0].type;
> - emit(MOV(src0, then_mov->src[0]));
> - }
> -
> - fs_inst *sel;
> - if (if_inst->conditional_mod) {
> - /* Sandybridge-specific IF with embedded comparison */
> - emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
> - if_inst->conditional_mod));
> - sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
> - sel->predicate = BRW_PREDICATE_NORMAL;
> - } else {
> - /* Separate CMP and IF instructions */
> - sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
> - sel->predicate = if_inst->predicate;
> - sel->predicate_inverse = if_inst->predicate_inverse;
> - }
> -
> - return true;
> - }
> -
> - return false;
> -}
> -
> -void
> -fs_visitor::visit(ir_if *ir)
> -{
> - if (try_opt_frontfacing_ternary(ir))
> - return;
> -
> - /* Don't point the annotation at the if statement, because then it plus
> - * the then and else blocks get printed.
> - */
> - this->base_ir = ir->condition;
> -
> - if (brw->gen == 6) {
> - emit_if_gen6(ir);
> - } else {
> - emit_bool_to_cond_code(ir->condition);
> -
> - emit(IF(BRW_PREDICATE_NORMAL));
> - }
> -
> - foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
> - this->base_ir = ir_;
> - ir_->accept(this);
> - }
> -
> - if (!ir->else_instructions.is_empty()) {
> - emit(BRW_OPCODE_ELSE);
> -
> - foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
> - this->base_ir = ir_;
> - ir_->accept(this);
> - }
> - }
> -
> - emit(BRW_OPCODE_ENDIF);
> -
> - if (!try_replace_with_sel() && brw->gen < 6) {
> - no16("Can't support (non-uniform) control flow on SIMD16\n");
> - }
> -}
> -
> -void
> -fs_visitor::visit(ir_loop *ir)
> -{
> - if (brw->gen < 6) {
> - no16("Can't support (non-uniform) control flow on SIMD16\n");
> - }
> -
> - this->base_ir = NULL;
> - emit(BRW_OPCODE_DO);
> -
> - foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
> - this->base_ir = ir_;
> - ir_->accept(this);
> - }
> -
> - this->base_ir = NULL;
> - emit(BRW_OPCODE_WHILE);
> -}
> -
> -void
> -fs_visitor::visit(ir_loop_jump *ir)
> -{
> - switch (ir->mode) {
> - case ir_loop_jump::jump_break:
> - emit(BRW_OPCODE_BREAK);
> - break;
> - case ir_loop_jump::jump_continue:
> - emit(BRW_OPCODE_CONTINUE);
> - break;
> - }
> -}
> -
> -void
> -fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
> -{
> - ir_dereference *deref = static_cast<ir_dereference *>(
> - ir->actual_parameters.get_head());
> - ir_variable *location = deref->variable_referenced();
> - unsigned surf_index = (stage_prog_data->binding_table.abo_start +
> - location->data.binding);
> -
> - /* Calculate the surface offset */
> - fs_reg offset = vgrf(glsl_type::uint_type);
> - ir_dereference_array *deref_array = deref->as_dereference_array();
> -
> - if (deref_array) {
> - deref_array->array_index->accept(this);
> -
> - fs_reg tmp = vgrf(glsl_type::uint_type);
> - emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
> - emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
> - } else {
> - offset = fs_reg(location->data.atomic.offset);
> - }
> -
> - /* Emit the appropriate machine instruction */
> - const char *callee = ir->callee->function_name();
> - ir->return_deref->accept(this);
> - fs_reg dst = this->result;
> -
> - if (!strcmp("__intrinsic_atomic_read", callee)) {
> - emit_untyped_surface_read(surf_index, dst, offset);
> -
> - } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
> - emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
> - fs_reg(), fs_reg());
> -
> - } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
> - emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
> - fs_reg(), fs_reg());
> - }
> -}
> -
> -void
> -fs_visitor::visit(ir_call *ir)
> -{
> - const char *callee = ir->callee->function_name();
> -
> - if (!strcmp("__intrinsic_atomic_read", callee) ||
> - !strcmp("__intrinsic_atomic_increment", callee) ||
> - !strcmp("__intrinsic_atomic_predecrement", callee)) {
> - visit_atomic_counter_intrinsic(ir);
> - } else {
> - unreachable("Unsupported intrinsic.");
> - }
> -}
> -
> -void
> -fs_visitor::visit(ir_return *)
> -{
> - unreachable("FINISHME");
> -}
> -
> -void
> -fs_visitor::visit(ir_function *ir)
> -{
> - /* Ignore function bodies other than main() -- we shouldn't see calls to
> - * them since they should all be inlined before we get to ir_to_mesa.
> - */
> - if (strcmp(ir->name, "main") == 0) {
> - const ir_function_signature *sig;
> - exec_list empty;
> -
> - sig = ir->matching_signature(NULL, &empty, false);
> -
> - assert(sig);
> -
> - foreach_in_list(ir_instruction, ir_, &sig->body) {
> - this->base_ir = ir_;
> - ir_->accept(this);
> - }
> - }
> -}
> -
> -void
> -fs_visitor::visit(ir_function_signature *)
> -{
> - unreachable("not reached");
> -}
> -
> -void
> -fs_visitor::visit(ir_emit_vertex *)
> -{
> - unreachable("not reached");
> -}
> -
> -void
> -fs_visitor::visit(ir_end_primitive *)
> -{
> - unreachable("not reached");
> -}
> -
> -void
> -fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
> - fs_reg dst, fs_reg offset, fs_reg src0,
> - fs_reg src1)
> -{
> - int reg_width = dispatch_width / 8;
> - int length = 0;
> -
> - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
> -
> - sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> - /* Initialize the sample mask in the message header. */
> - emit(MOV(sources[0], fs_reg(0u)))
> - ->force_writemask_all = true;
> -
> - if (stage == MESA_SHADER_FRAGMENT) {
> - if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
> - emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
> - ->force_writemask_all = true;
> - } else {
> - emit(MOV(component(sources[0], 7),
> - retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
> - ->force_writemask_all = true;
> - }
> - } else {
> - /* The execution mask is part of the side-band information sent together with
> - * the message payload to the data port. It's implicitly ANDed with the sample
> - * mask sent in the header to compute the actual set of channels that execute
> - * the atomic operation.
> - */
> - assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
> - emit(MOV(component(sources[0], 7),
> - fs_reg(0xffffu)))->force_writemask_all = true;
> - }
> - length++;
> -
> - /* Set the atomic operation offset. */
> - sources[1] = vgrf(glsl_type::uint_type);
> - emit(MOV(sources[1], offset));
> - length++;
> -
> - /* Set the atomic operation arguments. */
> - if (src0.file != BAD_FILE) {
> - sources[length] = vgrf(glsl_type::uint_type);
> - emit(MOV(sources[length], src0));
> - length++;
> - }
> -
> - if (src1.file != BAD_FILE) {
> - sources[length] = vgrf(glsl_type::uint_type);
> - emit(MOV(sources[length], src1));
> - length++;
> - }
> -
> - int mlen = 1 + (length - 1) * reg_width;
> - fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> - BRW_REGISTER_TYPE_UD);
> - emit(LOAD_PAYLOAD(src_payload, sources, length));
> -
> - /* Emit the instruction. */
> - fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
> - fs_reg(atomic_op), fs_reg(surf_index));
> - inst->mlen = mlen;
> -}
> -
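
Same message-length pattern as the sampler payloads above: one header register plus reg_width registers per remaining payload slot. A sketch of the arithmetic, not driver code:

    /* e.g. SIMD16 atomic with an offset and one operand: 1 + (3 - 1) * 2 = 5 */
    static int untyped_atomic_mlen(int length, int reg_width)
    {
       return 1 + (length - 1) * reg_width;
    }
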
> -void
> -fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
> - fs_reg offset)
> -{
> - int reg_width = dispatch_width / 8;
> -
> - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
> -
> - sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> - /* Initialize the sample mask in the message header. */
> - emit(MOV(sources[0], fs_reg(0u)))
> - ->force_writemask_all = true;
> -
> - if (stage == MESA_SHADER_FRAGMENT) {
> - if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
> - emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
> - ->force_writemask_all = true;
> - } else {
> - emit(MOV(component(sources[0], 7),
> - retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
> - ->force_writemask_all = true;
> - }
> - } else {
> - /* The execution mask is part of the side-band information sent together with
> - * the message payload to the data port. It's implicitly ANDed with the sample
> - * mask sent in the header to compute the actual set of channels that execute
> - * the atomic operation.
> - */
> - assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
> - emit(MOV(component(sources[0], 7),
> - fs_reg(0xffffu)))->force_writemask_all = true;
> - }
> -
> - /* Set the surface read offset. */
> - sources[1] = vgrf(glsl_type::uint_type);
> - emit(MOV(sources[1], offset));
> -
> - int mlen = 1 + reg_width;
> - fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> - BRW_REGISTER_TYPE_UD);
> - fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2));
> -
> - /* Emit the instruction. */
> - inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
> - fs_reg(surf_index));
> - inst->mlen = mlen;
> -}
> -
> -fs_inst *
> -fs_visitor::emit(fs_inst *inst)
> -{
> - if (dispatch_width == 16 && inst->exec_size == 8)
> - inst->force_uncompressed = true;
> -
> - inst->annotation = this->current_annotation;
> - inst->ir = this->base_ir;
> -
> - this->instructions.push_tail(inst);
> -
> - return inst;
> -}
> -
> -void
> -fs_visitor::emit(exec_list list)
> -{
> - foreach_in_list_safe(fs_inst, inst, &list) {
> - inst->exec_node::remove();
> - emit(inst);
> - }
> -}
> -
> -/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
> -void
> -fs_visitor::emit_dummy_fs()
> -{
> - int reg_width = dispatch_width / 8;
> -
> - /* Everyone's favorite color. */
> - const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
> - for (int i = 0; i < 4; i++) {
> - emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
> - dispatch_width), fs_reg(color[i])));
> - }
> -
> - fs_inst *write;
> - write = emit(FS_OPCODE_FB_WRITE);
> - write->eot = true;
> - if (brw->gen >= 6) {
> - write->base_mrf = 2;
> - write->mlen = 4 * reg_width;
> - } else {
> - write->header_present = true;
> - write->base_mrf = 0;
> - write->mlen = 2 + 4 * reg_width;
> - }
> -
> - /* Tell the SF we don't have any inputs. Gen4-5 require at least one
> - * varying to avoid GPU hangs, so set that.
> - */
> - brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
> - wm_prog_data->num_varying_inputs = brw->gen < 6 ? 1 : 0;
> - memset(wm_prog_data->urb_setup, -1,
> - sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
> -
> - /* We don't have any uniforms. */
> - stage_prog_data->nr_params = 0;
> - stage_prog_data->nr_pull_params = 0;
> - stage_prog_data->curb_read_length = 0;
> - stage_prog_data->dispatch_grf_start_reg = 2;
> - wm_prog_data->dispatch_grf_start_reg_16 = 2;
> - grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
> -
> - calculate_cfg();
> -}
> -
> -/* The register location here is relative to the start of the URB
> - * data. It will get adjusted to be a real location before
> - * generate_code() time.
> - */
> -struct brw_reg
> -fs_visitor::interp_reg(int location, int channel)
> -{
> - assert(stage == MESA_SHADER_FRAGMENT);
> - brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> - int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
> - int stride = (channel & 1) * 4;
> -
> - assert(prog_data->urb_setup[location] != -1);
> -
> - return brw_vec1_grf(regnr, stride);
> -}
> -
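
The addressing above packs two varying channels per register, with four floats of setup data per channel, so (picking hypothetical numbers) a slot with urb_setup[location] == 3 maps as:

    channel 0 -> g6.0    channel 1 -> g6.4
    channel 2 -> g7.0    channel 3 -> g7.4

i.e. regnr = slot * 2 + channel / 2 and the subregister is (channel & 1) * 4.
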
> -/** Emits the interpolation for the varying inputs. */
> -void
> -fs_visitor::emit_interpolation_setup_gen4()
> -{
> - this->current_annotation = "compute pixel centers";
> - this->pixel_x = vgrf(glsl_type::uint_type);
> - this->pixel_y = vgrf(glsl_type::uint_type);
> - this->pixel_x.type = BRW_REGISTER_TYPE_UW;
> - this->pixel_y.type = BRW_REGISTER_TYPE_UW;
> -
> - emit(FS_OPCODE_PIXEL_X, this->pixel_x);
> - emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
> -
> - this->current_annotation = "compute pixel deltas from v0";
> - if (brw->has_pln) {
> - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> - vgrf(glsl_type::vec2_type);
> - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> - offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1);
> - } else {
> - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> - vgrf(glsl_type::float_type);
> - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> - vgrf(glsl_type::float_type);
> - }
> - emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> - this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
> - emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> - this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
> -
> - this->current_annotation = "compute pos.w and 1/pos.w";
> - /* Compute wpos.w. It's always in our setup, since it's needed to
> - * interpolate the other attributes.
> - */
> - this->wpos_w = vgrf(glsl_type::float_type);
> - emit(FS_OPCODE_LINTERP, wpos_w,
> - this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> - this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> - interp_reg(VARYING_SLOT_POS, 3));
> - /* Compute the pixel 1/W value from wpos.w. */
> - this->pixel_w = vgrf(glsl_type::float_type);
> - emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
> - this->current_annotation = NULL;
> -}
> -
> -/** Emits the interpolation for the varying inputs. */
> -void
> -fs_visitor::emit_interpolation_setup_gen6()
> -{
> - struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
> -
> - /* If the pixel centers end up used, the setup is the same as for gen4. */
> - this->current_annotation = "compute pixel centers";
> - fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
> - fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
> - int_pixel_x.type = BRW_REGISTER_TYPE_UW;
> - int_pixel_y.type = BRW_REGISTER_TYPE_UW;
> - emit(ADD(int_pixel_x,
> - fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
> - fs_reg(brw_imm_v(0x10101010))));
> - emit(ADD(int_pixel_y,
> - fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
> - fs_reg(brw_imm_v(0x11001100))));
> -
> - /* As of gen6, we can no longer mix float and int sources. We have
> - * to turn the integer pixel centers into floats for their actual
> - * use.
> - */
> - this->pixel_x = vgrf(glsl_type::float_type);
> - this->pixel_y = vgrf(glsl_type::float_type);
> - emit(MOV(this->pixel_x, int_pixel_x));
> - emit(MOV(this->pixel_y, int_pixel_y));
> -
> - this->current_annotation = "compute pos.w";
> - this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
> - this->wpos_w = vgrf(glsl_type::float_type);
> - emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
> -
> - for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
> - uint8_t reg = payload.barycentric_coord_reg[i];
> - this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
> - this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
> - }
> -
> - this->current_annotation = NULL;
> -}
> -
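
The two vector immediates above are the 2x2 subspan pixel offsets packed as eight 4-bit values, lowest nibble first; a small decode sketch (mine, not driver code) makes the pattern visible:

    #include <stdint.h>

    /* 0x10101010 -> {0,1,0,1,0,1,0,1}  per-pixel X offsets
     * 0x11001100 -> {0,0,1,1,0,0,1,1}  per-pixel Y offsets
     * which get added to the replicated subspan origins read out of g1. */
    static void decode_imm_v(uint32_t v, int out[8])
    {
       for (int i = 0; i < 8; i++)
          out[i] = (int)((v >> (4 * i)) & 0xf); /* all values here are 0 or 1 */
    }
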
> -int
> -fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
> - bool use_2nd_half)
> -{
> - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> - fs_inst *inst;
> -
> - if (color.file == BAD_FILE) {
> - return 4 * (dispatch_width / 8);
> - }
> -
> - uint8_t colors_enabled;
> - if (components == 0) {
> - /* We want to write one component to the alpha channel */
> - colors_enabled = 0x8;
> - } else {
> - /* Enable the first components-many channels */
> - colors_enabled = (1 << components) - 1;
> - }
> -
> - if (dispatch_width == 8 || (brw->gen >= 6 && !do_dual_src)) {
> - /* SIMD8 write looks like:
> - * m + 0: r0
> - * m + 1: r1
> - * m + 2: g0
> - * m + 3: g1
> - *
> - * gen6 SIMD16 DP write looks like:
> - * m + 0: r0
> - * m + 1: r1
> - * m + 2: g0
> - * m + 3: g1
> - * m + 4: b0
> - * m + 5: b1
> - * m + 6: a0
> - * m + 7: a1
> - */
> - int len = 0;
> - for (unsigned i = 0; i < 4; ++i) {
> - if (colors_enabled & (1 << i)) {
> - dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8),
> - color.type, color.width);
> - inst = emit(MOV(dst[len], offset(color, i)));
> - inst->saturate = key->clamp_fragment_color;
> - } else if (color.width == 16) {
> - /* We need two BAD_FILE slots for a 16-wide color */
> - len++;
> - }
> - len++;
> - }
> - return len;
> - } else if (brw->gen >= 6 && do_dual_src) {
> - /* SIMD16 dual source blending for gen6+.
> - *
> - * From the SNB PRM, volume 4, part 1, page 193:
> - *
> - * "The dual source render target messages only have SIMD8 forms due to
> - * maximum message length limitations. SIMD16 pixel shaders must send two
> - * of these messages to cover all of the pixels. Each message contains
> - * two colors (4 channels each) for each pixel in the message payload."
> - *
> - * So in SIMD16 dual source blending we will send 2 SIMD8 messages,
> - * each one will call this function twice (one for each color involved),
> - * so in each pass we only write 4 registers. Notice that the second
> - * SIMD8 message needs to read color data from the 2nd half of the color
> - * registers, so it needs to call this with use_2nd_half = true.
> - */
> - for (unsigned i = 0; i < 4; ++i) {
> - if (colors_enabled & (1 << i)) {
> - dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
> - inst = emit(MOV(dst[i], half(offset(color, i),
> - use_2nd_half ? 1 : 0)));
> - inst->saturate = key->clamp_fragment_color;
> - if (use_2nd_half)
> - inst->force_sechalf = true;
> - }
> - }
> - return 4;
> - } else {
> - /* pre-gen6 SIMD16 single source DP write looks like:
> - * m + 0: r0
> - * m + 1: g0
> - * m + 2: b0
> - * m + 3: a0
> - * m + 4: r1
> - * m + 5: g1
> - * m + 6: b1
> - * m + 7: a1
> - */
> - for (unsigned i = 0; i < 4; ++i) {
> - if (colors_enabled & (1 << i)) {
> - dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
> - inst = emit(MOV(dst[i], half(offset(color, i), 0)));
> - inst->saturate = key->clamp_fragment_color;
> -
> - dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type);
> - inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
> - inst->saturate = key->clamp_fragment_color;
> - inst->force_sechalf = true;
> - }
> - }
> - return 8;
> - }
> -}
> -
> -static enum brw_conditional_mod
> -cond_for_alpha_func(GLenum func)
> -{
> - switch(func) {
> - case GL_GREATER:
> - return BRW_CONDITIONAL_G;
> - case GL_GEQUAL:
> - return BRW_CONDITIONAL_GE;
> - case GL_LESS:
> - return BRW_CONDITIONAL_L;
> - case GL_LEQUAL:
> - return BRW_CONDITIONAL_LE;
> - case GL_EQUAL:
> - return BRW_CONDITIONAL_EQ;
> - case GL_NOTEQUAL:
> - return BRW_CONDITIONAL_NEQ;
> - default:
> - unreachable("Not reached");
> - }
> -}
> -
> -/**
> - * Alpha test support for when we compile it into the shader instead
> - * of using the normal fixed-function alpha test.
> - */
> -void
> -fs_visitor::emit_alpha_test()
> -{
> - assert(stage == MESA_SHADER_FRAGMENT);
> - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> - this->current_annotation = "Alpha test";
> -
> - fs_inst *cmp;
> - if (key->alpha_test_func == GL_ALWAYS)
> - return;
> -
> - if (key->alpha_test_func == GL_NEVER) {
> - /* f0.1 = 0 */
> - fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
> - BRW_REGISTER_TYPE_UW));
> - cmp = emit(CMP(reg_null_f, some_reg, some_reg,
> - BRW_CONDITIONAL_NEQ));
> - } else {
> - /* RT0 alpha */
> - fs_reg color = offset(outputs[0], 3);
> -
> - /* f0.1 &= func(color, ref) */
> - cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
> - cond_for_alpha_func(key->alpha_test_func)));
> - }
> - cmp->predicate = BRW_PREDICATE_NORMAL;
> - cmp->flag_subreg = 1;
> -}
> -
> -fs_inst *
> -fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
> - fs_reg src0_alpha, unsigned components,
> - bool use_2nd_half)
> -{
> - assert(stage == MESA_SHADER_FRAGMENT);
> - brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> -
> - this->current_annotation = "FB write header";
> - bool header_present = true;
> - int reg_size = dispatch_width / 8;
> -
> - /* We can potentially have a message length of up to 15, so we have to set
> - * base_mrf to either 0 or 1 in order to fit in m0..m15.
> - */
> - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
> - int length = 0;
> -
> - /* From the Sandy Bridge PRM, volume 4, page 198:
> - *
> - * "Dispatched Pixel Enables. One bit per pixel indicating
> - * which pixels were originally enabled when the thread was
> - * dispatched. This field is only required for the end-of-
> - * thread message and on all dual-source messages."
> - */
> - if (brw->gen >= 6 &&
> - (brw->is_haswell || brw->gen >= 8 || !prog_data->uses_kill) &&
> - color1.file == BAD_FILE &&
> - key->nr_color_regions == 1) {
> - header_present = false;
> - }
> -
> - if (header_present)
> - /* Allocate 2 registers for a header */
> - length += 2;
> -
> - if (payload.aa_dest_stencil_reg) {
> - sources[length] = fs_reg(GRF, alloc.allocate(1));
> - emit(MOV(sources[length],
> - fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
> - length++;
> - }
> -
> - prog_data->uses_omask =
> - prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
> - if (prog_data->uses_omask) {
> - this->current_annotation = "FB write oMask";
> - assert(this->sample_mask.file != BAD_FILE);
> - /* Hand over gl_SampleMask. Only lower 16 bits are relevant. Since
> - * it's unsigned single words, one vgrf is always 16-wide.
> - */
> - sources[length] = fs_reg(GRF, alloc.allocate(1),
> - BRW_REGISTER_TYPE_UW, 16);
> - emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
> - length++;
> - }
> -
> - if (color0.file == BAD_FILE) {
> - /* Even if there are no color buffers enabled, we still need to send
> - * alpha out the pipeline to our null renderbuffer to support
> - * alpha-testing, alpha-to-coverage, and so on.
> - */
> - length += setup_color_payload(sources + length, this->outputs[0], 0,
> - false);
> - } else if (color1.file == BAD_FILE) {
> - if (src0_alpha.file != BAD_FILE) {
> - sources[length] = fs_reg(GRF, alloc.allocate(reg_size),
> - src0_alpha.type, src0_alpha.width);
> - fs_inst *inst = emit(MOV(sources[length], src0_alpha));
> - inst->saturate = key->clamp_fragment_color;
> - length++;
> - }
> -
> - length += setup_color_payload(sources + length, color0, components,
> - false);
> - } else {
> - length += setup_color_payload(sources + length, color0, components,
> - use_2nd_half);
> - length += setup_color_payload(sources + length, color1, components,
> - use_2nd_half);
> - }
> -
> - if (source_depth_to_render_target) {
> - if (brw->gen == 6) {
> - /* For outputting oDepth on gen6, SIMD8 writes have to be
> - * used. This would require SIMD8 moves of each half to
> - * message regs, kind of like pre-gen5 SIMD16 FB writes.
> - * Just bail on doing so for now.
> - */
> - no16("Missing support for simd16 depth writes on gen6\n");
> - }
> -
> - sources[length] = vgrf(glsl_type::float_type);
> - if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
> - /* Hand over gl_FragDepth. */
> - assert(this->frag_depth.file != BAD_FILE);
> - emit(MOV(sources[length], this->frag_depth));
> - } else {
> - /* Pass through the payload depth. */
> - emit(MOV(sources[length],
> - fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
> - }
> - length++;
> - }
> -
> - if (payload.dest_depth_reg) {
> - sources[length] = vgrf(glsl_type::float_type);
> - emit(MOV(sources[length],
> - fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
> - length++;
> - }
> -
> - fs_inst *load;
> - fs_inst *write;
> - if (brw->gen >= 7) {
> - /* Send from the GRF */
> - fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
> - load = emit(LOAD_PAYLOAD(payload, sources, length));
> - payload.reg = alloc.allocate(load->regs_written);
> - payload.width = dispatch_width;
> - load->dst = payload;
> - write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
> - write->base_mrf = -1;
> - } else {
> - /* Send from the MRF */
> - load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
> - sources, length));
> - write = emit(FS_OPCODE_FB_WRITE);
> - write->exec_size = dispatch_width;
> - write->base_mrf = 1;
> - }
> -
> - write->mlen = load->regs_written;
> - write->header_present = header_present;
> - if (prog_data->uses_kill) {
> - write->predicate = BRW_PREDICATE_NORMAL;
> - write->flag_subreg = 1;
> - }
> - return write;
> -}
> -
> -void
> -fs_visitor::emit_fb_writes()
> -{
> - assert(stage == MESA_SHADER_FRAGMENT);
> - brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> - brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> -
> - fs_inst *inst = NULL;
> - if (do_dual_src) {
> - this->current_annotation = ralloc_asprintf(this->mem_ctx,
> - "FB dual-source write");
> - inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
> - reg_undef, 4);
> - inst->target = 0;
> -
> - /* SIMD16 dual source blending requires sending two SIMD8 dual source
> - * messages, where each message contains color data for 8 pixels. Color
> - * data for the first group of pixels is stored in the "lower" half of
> - * the color registers, so in SIMD16, the previous message did:
> - * m + 0: r0
> - * m + 1: g0
> - * m + 2: b0
> - * m + 3: a0
> - *
> - * Here goes the second message, which packs color data for the
> - * remaining 8 pixels. Color data for these pixels is stored in the
> - * "upper" half of the color registers, so we need to do:
> - * m + 0: r1
> - * m + 1: g1
> - * m + 2: b1
> - * m + 3: a1
> - */
> - if (dispatch_width == 16) {
> - inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
> - reg_undef, 4, true);
> - inst->target = 0;
> - }
> -
> - prog_data->dual_src_blend = true;
> - } else {
> - for (int target = 0; target < key->nr_color_regions; target++) {
> - /* Skip over outputs that weren't written. */
> - if (this->outputs[target].file == BAD_FILE)
> - continue;
> -
> - this->current_annotation = ralloc_asprintf(this->mem_ctx,
> - "FB write target %d",
> - target);
> - fs_reg src0_alpha;
> - if (brw->gen >= 6 && key->replicate_alpha && target != 0)
> - src0_alpha = offset(outputs[0], 3);
> -
> - inst = emit_single_fb_write(this->outputs[target], reg_undef,
> - src0_alpha,
> - this->output_components[target]);
> - inst->target = target;
> - }
> - }
> -
> - if (inst == NULL) {
> - /* Even if there are no color buffers enabled, we still need to send
> - * alpha out the pipeline to our null renderbuffer to support
> - * alpha-testing, alpha-to-coverage, and so on.
> - */
> - inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0);
> - inst->target = 0;
> - }
> -
> - inst->eot = true;
> - this->current_annotation = NULL;
> -}
> -
> -void
> -fs_visitor::setup_uniform_clipplane_values()
> -{
> - gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
> - const struct brw_vue_prog_key *key =
> - (const struct brw_vue_prog_key *) this->key;
> -
> - for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
> - this->userplane[i] = fs_reg(UNIFORM, uniforms);
> - for (int j = 0; j < 4; ++j) {
> - stage_prog_data->param[uniforms + j] =
> - (gl_constant_value *) &clip_planes[i][j];
> - }
> - uniforms += 4;
> - }
> -}
> -
> -void fs_visitor::compute_clip_distance()
> -{
> - struct brw_vue_prog_data *vue_prog_data =
> - (struct brw_vue_prog_data *) prog_data;
> - const struct brw_vue_prog_key *key =
> - (const struct brw_vue_prog_key *) this->key;
> -
> - /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
> - *
> - * "If a linked set of shaders forming the vertex stage contains no
> - * static write to gl_ClipVertex or gl_ClipDistance, but the
> - * application has requested clipping against user clip planes through
> - * the API, then the coordinate written to gl_Position is used for
> - * comparison against the user clip planes."
> - *
> - * This function is only called if the shader didn't write to
> - * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
> - * if the user wrote to it; otherwise we use gl_Position.
> - */
> -
> - gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
> - if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
> - clip_vertex = VARYING_SLOT_POS;
> -
> - /* If the clip vertex isn't written, skip this. Typically this means
> - * the GS will set up clipping. */
> - if (outputs[clip_vertex].file == BAD_FILE)
> - return;
> -
> - setup_uniform_clipplane_values();
> -
> - current_annotation = "user clip distances";
> -
> - this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
> - this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
> -
> - for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
> - fs_reg u = userplane[i];
> - fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
> - output.reg_offset = i & 3;
> -
> - emit(MUL(output, outputs[clip_vertex], u));
> - for (int j = 1; j < 4; j++) {
> - u.reg = userplane[i].reg + j;
> - emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
> - }
> - }
> -}
> -
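
For readers skimming compute_clip_distance() above: the MUL plus three MADs
per plane accumulate a 4-component dot product of the clip vertex with the
plane equation, one user clip plane per destination channel. A CPU-side
sketch of that arithmetic, under the assumption that a non-negative distance
means the point is on the unclipped side (names are illustrative, not driver
code):

    #include <array>
    #include <cstdio>

    static float clip_distance(const std::array<float, 4> &clip_vertex,
                               const std::array<float, 4> &plane)
    {
       float d = clip_vertex[0] * plane[0];        /* MUL        */
       for (int j = 1; j < 4; j++)
          d = clip_vertex[j] * plane[j] + d;       /* three MADs */
       return d;
    }

    int main()
    {
       /* A point at z = 3 against the half-space z >= 2, plane (0, 0, 1, -2). */
       printf("%f\n", clip_distance({1.0f, 2.0f, 3.0f, 1.0f},
                                    {0.0f, 0.0f, 1.0f, -2.0f}));  /* prints 1.0 */
       return 0;
    }
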
> -void
> -fs_visitor::emit_urb_writes()
> -{
> - int slot, urb_offset, length;
> - struct brw_vs_prog_data *vs_prog_data =
> - (struct brw_vs_prog_data *) prog_data;
> - const struct brw_vs_prog_key *key =
> - (const struct brw_vs_prog_key *) this->key;
> - const GLbitfield64 psiz_mask =
> - VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
> - const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
> - bool flush;
> - fs_reg sources[8];
> -
> - /* Lower legacy ff and ClipVertex clipping to clip distances */
> - if (key->base.userclip_active && !prog->UsesClipDistanceOut)
> - compute_clip_distance();
> -
> - /* If we don't have any valid slots to write, just do a minimal urb write
> - * send to terminate the shader. */
> - if (vue_map->slots_valid == 0) {
> -
> - fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> - fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
> - BRW_REGISTER_TYPE_UD))));
> - inst->force_writemask_all = true;
> -
> - inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> - inst->eot = true;
> - inst->mlen = 1;
> - inst->offset = 1;
> - return;
> - }
> -
> - length = 0;
> - urb_offset = 0;
> - flush = false;
> - for (slot = 0; slot < vue_map->num_slots; slot++) {
> - fs_reg reg, src, zero;
> -
> - int varying = vue_map->slot_to_varying[slot];
> - switch (varying) {
> - case VARYING_SLOT_PSIZ:
> -
> - /* The point size varying slot is the vue header and is always in the
> - * vue map. But often none of the special varyings that live there
> - * are written and in that case we can skip writing to the vue
> - * header, provided the corresponding state properly clamps the
> - * values further down the pipeline. */
> - if ((vue_map->slots_valid & psiz_mask) == 0) {
> - assert(length == 0);
> - urb_offset++;
> - break;
> - }
> -
> - zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> - emit(MOV(zero, fs_reg(0u)));
> -
> - sources[length++] = zero;
> - if (vue_map->slots_valid & VARYING_BIT_LAYER)
> - sources[length++] = this->outputs[VARYING_SLOT_LAYER];
> - else
> - sources[length++] = zero;
> -
> - if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
> - sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
> - else
> - sources[length++] = zero;
> -
> - if (vue_map->slots_valid & VARYING_BIT_PSIZ)
> - sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
> - else
> - sources[length++] = zero;
> - break;
> -
> - case BRW_VARYING_SLOT_NDC:
> - case VARYING_SLOT_EDGE:
> - unreachable("unexpected scalar vs output");
> - break;
> -
> - case BRW_VARYING_SLOT_PAD:
> - break;
> -
> - default:
> - /* gl_Position is always in the vue map, but isn't always written by
> - * the shader. Other varyings (clip distances) get added to the vue
> - * map but don't always get written. In those cases, the
> - * corresponding this->outputs[] slot will be invalid and we can skip
> - * the urb write for the varying. If we've already queued up a vue
> - * slot for writing, we flush an mlen 5 urb write; otherwise we just
> - * advance the urb_offset.
> - */
> - if (this->outputs[varying].file == BAD_FILE) {
> - if (length > 0)
> - flush = true;
> - else
> - urb_offset++;
> - break;
> - }
> -
> - if ((varying == VARYING_SLOT_COL0 ||
> - varying == VARYING_SLOT_COL1 ||
> - varying == VARYING_SLOT_BFC0 ||
> - varying == VARYING_SLOT_BFC1) &&
> - key->clamp_vertex_color) {
> - /* We need to clamp these guys, so do a saturating MOV into a
> - * temp register and use that for the payload.
> - */
> - for (int i = 0; i < 4; i++) {
> - reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
> - src = offset(this->outputs[varying], i);
> - fs_inst *inst = emit(MOV(reg, src));
> - inst->saturate = true;
> - sources[length++] = reg;
> - }
> - } else {
> - for (int i = 0; i < 4; i++)
> - sources[length++] = offset(this->outputs[varying], i);
> - }
> - break;
> - }
> -
> - current_annotation = "URB write";
> -
> - /* If we've queued up 8 registers of payload (2 VUE slots), if this is
> - * the last slot, or if we need to flush (see the BAD_FILE varying case
> - * above), emit a URB write send now to flush out the data.
> - */
> - int last = slot == vue_map->num_slots - 1;
> - if (length == 8 || last)
> - flush = true;
> - if (flush) {
> - fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
> - fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
> - BRW_REGISTER_TYPE_F);
> -
> - /* We need WE_all on the MOV for the message header (the URB handles)
> - * so do a MOV to a dummy register and set force_writemask_all on the
> - * MOV. LOAD_PAYLOAD will preserve that.
> - */
> - fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
> - BRW_REGISTER_TYPE_UD);
> - fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
> - BRW_REGISTER_TYPE_UD))));
> - inst->force_writemask_all = true;
> - payload_sources[0] = dummy;
> -
> - memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
> - emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
> -
> - inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> - inst->eot = last;
> - inst->mlen = length + 1;
> - inst->offset = urb_offset;
> - urb_offset = slot + 1;
> - length = 0;
> - flush = false;
> - }
> - }
> -}
> -
> -void
> -fs_visitor::resolve_ud_negate(fs_reg *reg)
> -{
> - if (reg->type != BRW_REGISTER_TYPE_UD ||
> - !reg->negate)
> - return;
> -
> - fs_reg temp = vgrf(glsl_type::uint_type);
> - emit(MOV(temp, *reg));
> - *reg = temp;
> -}
> -
> -/**
> - * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
> - *
> - * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
> - * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
> - */
> -void
> -fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
> -{
> - assert(brw->gen <= 5);
> -
> - if (rvalue->type != glsl_type::bool_type)
> - return;
> -
> - fs_reg and_result = vgrf(glsl_type::bool_type);
> - fs_reg neg_result = vgrf(glsl_type::bool_type);
> - emit(AND(and_result, *reg, fs_reg(1)));
> - emit(MOV(neg_result, negate(and_result)));
> - *reg = neg_result;
> -}
> -
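
The AND plus negated MOV in resolve_bool_comparison() above deserves a gloss:
Gen4-5 CMP only defines the low bit of each channel, so the pass masks that
bit and negates it to get the 0 / ~0 booleans the rest of the IR expects.
The same arithmetic as a CPU-side sketch, assuming two's complement
(illustrative only):

    #include <cstdint>
    #include <cstdio>

    static uint32_t resolve_bool(uint32_t cmp_result)
    {
       uint32_t lsb = cmp_result & 1u;      /* AND(and_result, *reg, 1)     */
       return (uint32_t)(-(int32_t)lsb);    /* MOV(neg_result, -and_result) */
    }

    int main()
    {
       printf("0x%08x\n", resolve_bool(0xdeadbee1u));   /* 0xffffffff, "true"  */
       printf("0x%08x\n", resolve_bool(0xdeadbee0u));   /* 0x00000000, "false" */
       return 0;
    }
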
> -fs_visitor::fs_visitor(struct brw_context *brw,
> - void *mem_ctx,
> - const struct brw_wm_prog_key *key,
> - struct brw_wm_prog_data *prog_data,
> - struct gl_shader_program *shader_prog,
> - struct gl_fragment_program *fp,
> - unsigned dispatch_width)
> - : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base,
> - MESA_SHADER_FRAGMENT),
> - reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
> - reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
> - reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
> - key(key), prog_data(&prog_data->base),
> - dispatch_width(dispatch_width), promoted_constants(0)
> -{
> - this->mem_ctx = mem_ctx;
> - init();
> -}
> -
> -fs_visitor::fs_visitor(struct brw_context *brw,
> - void *mem_ctx,
> - const struct brw_vs_prog_key *key,
> - struct brw_vs_prog_data *prog_data,
> - struct gl_shader_program *shader_prog,
> - struct gl_vertex_program *cp,
> - unsigned dispatch_width)
> - : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
> - MESA_SHADER_VERTEX),
> - reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
> - reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
> - reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
> - key(key), prog_data(&prog_data->base.base),
> - dispatch_width(dispatch_width), promoted_constants(0)
> -{
> - this->mem_ctx = mem_ctx;
> - init();
> -}
> -
> -void
> -fs_visitor::init()
> -{
> - switch (stage) {
> - case MESA_SHADER_FRAGMENT:
> - key_tex = &((const brw_wm_prog_key *) key)->tex;
> - break;
> - case MESA_SHADER_VERTEX:
> - case MESA_SHADER_GEOMETRY:
> - key_tex = &((const brw_vue_prog_key *) key)->tex;
> - break;
> - default:
> - unreachable("unhandled shader stage");
> - }
> -
> - this->failed = false;
> - this->simd16_unsupported = false;
> - this->no16_msg = NULL;
> - this->variable_ht = hash_table_ctor(0,
> - hash_table_pointer_hash,
> - hash_table_pointer_compare);
> -
> - this->nir_locals = NULL;
> - this->nir_globals = NULL;
> -
> - memset(&this->payload, 0, sizeof(this->payload));
> - memset(this->outputs, 0, sizeof(this->outputs));
> - memset(this->output_components, 0, sizeof(this->output_components));
> - this->source_depth_to_render_target = false;
> - this->runtime_check_aads_emit = false;
> - this->first_non_payload_grf = 0;
> - this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
> -
> - this->current_annotation = NULL;
> - this->base_ir = NULL;
> -
> - this->virtual_grf_start = NULL;
> - this->virtual_grf_end = NULL;
> - this->live_intervals = NULL;
> - this->regs_live_at_ip = NULL;
> -
> - this->uniforms = 0;
> - this->last_scratch = 0;
> - this->pull_constant_loc = NULL;
> - this->push_constant_loc = NULL;
> -
> - this->spilled_any_registers = false;
> - this->do_dual_src = false;
> -
> - if (dispatch_width == 8)
> - this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
> -}
> -
> -fs_visitor::~fs_visitor()
> -{
> - hash_table_dtor(this->variable_ht);
> -}
> diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
> index 45c157a..3694811 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs.c
> +++ b/src/mesa/drivers/dri/i965/brw_gs.c
> @@ -29,7 +29,7 @@
>
> #include "brw_gs.h"
> #include "brw_context.h"
> -#include "brw_vec4_gs_visitor.h"
> +#include "brw_vec4_gs_god.h"
> #include "brw_state.h"
> #include "brw_ff_gs.h"
>
> diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
> index d3bd64d..9f3473e 100644
> --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
> @@ -50,8 +50,8 @@ public:
>
> bool equals(const src_reg &r) const;
>
> - src_reg(class vec4_visitor *v, const struct glsl_type *type);
> - src_reg(class vec4_visitor *v, const struct glsl_type *type, int size);
> + src_reg(class vec4_god *v, const struct glsl_type *type);
> + src_reg(class vec4_god *v, const struct glsl_type *type, int size);
>
> explicit src_reg(const dst_reg ®);
>
> @@ -107,7 +107,7 @@ public:
> dst_reg(register_file file, int reg, const glsl_type *type,
> unsigned writemask);
> dst_reg(struct brw_reg reg);
> - dst_reg(class vec4_visitor *v, const struct glsl_type *type);
> + dst_reg(class vec4_god *v, const struct glsl_type *type);
>
> explicit dst_reg(const src_reg ®);
>
> diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> index 56f69ea..120a13d 100644
> --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> @@ -395,7 +395,7 @@ schedule_node::set_latency_gen7(bool is_haswell)
>
> class instruction_scheduler {
> public:
> - instruction_scheduler(backend_visitor *v, int grf_count,
> + instruction_scheduler(backend_god *v, int grf_count,
> instruction_scheduler_mode mode)
> {
> this->bv = v;
> @@ -451,7 +451,7 @@ public:
> int grf_count;
> int time;
> exec_list instructions;
> - backend_visitor *bv;
> + backend_god *bv;
>
> instruction_scheduler_mode mode;
>
> @@ -475,20 +475,20 @@ public:
> class fs_instruction_scheduler : public instruction_scheduler
> {
> public:
> - fs_instruction_scheduler(fs_visitor *v, int grf_count,
> + fs_instruction_scheduler(fs_god *v, int grf_count,
> instruction_scheduler_mode mode);
> void calculate_deps();
> bool is_compressed(fs_inst *inst);
> schedule_node *choose_instruction_to_schedule();
> int issue_time(backend_instruction *inst);
> - fs_visitor *v;
> + fs_god *v;
>
> void count_remaining_grf_uses(backend_instruction *inst);
> void update_register_pressure(backend_instruction *inst);
> int get_register_pressure_benefit(backend_instruction *inst);
> };
>
> -fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
> +fs_instruction_scheduler::fs_instruction_scheduler(fs_god *v,
> int grf_count,
> instruction_scheduler_mode mode)
> : instruction_scheduler(v, grf_count, mode),
> @@ -565,18 +565,18 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
> class vec4_instruction_scheduler : public instruction_scheduler
> {
> public:
> - vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
> + vec4_instruction_scheduler(vec4_god *v, int grf_count);
> void calculate_deps();
> schedule_node *choose_instruction_to_schedule();
> int issue_time(backend_instruction *inst);
> - vec4_visitor *v;
> + vec4_god *v;
>
> void count_remaining_grf_uses(backend_instruction *inst);
> void update_register_pressure(backend_instruction *inst);
> int get_register_pressure_benefit(backend_instruction *inst);
> };
>
> -vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
> +vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_god *v,
> int grf_count)
> : instruction_scheduler(v, grf_count, SCHEDULE_POST),
> v(v)
> @@ -1506,7 +1506,7 @@ instruction_scheduler::run(cfg_t *cfg)
> }
>
> void
> -fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
> +fs_god::schedule_instructions(instruction_scheduler_mode mode)
> {
> int grf_count;
> if (mode == SCHEDULE_POST)
> @@ -1526,7 +1526,7 @@ fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
> }
>
> void
> -vec4_visitor::opt_schedule_instructions()
> +vec4_god::opt_schedule_instructions()
> {
> vec4_instruction_scheduler sched(this, prog_data->total_grf);
> sched.run(cfg);
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index 0dda9bb..24c86a0 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -695,7 +695,7 @@ brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg)
> return false;
> }
>
> -backend_visitor::backend_visitor(struct brw_context *brw,
> +backend_god::backend_god(struct brw_context *brw,
> struct gl_shader_program *shader_prog,
> struct gl_program *prog,
> struct brw_stage_prog_data *stage_prog_data,
> @@ -1083,13 +1083,13 @@ backend_instruction::remove(bblock_t *block)
> }
>
> void
> -backend_visitor::dump_instructions()
> +backend_god::dump_instructions()
> {
> dump_instructions(NULL);
> }
>
> void
> -backend_visitor::dump_instructions(const char *name)
> +backend_god::dump_instructions(const char *name)
> {
> FILE *file = stderr;
> if (name && geteuid() != 0) {
> @@ -1118,7 +1118,7 @@ backend_visitor::dump_instructions(const char *name)
> }
>
> void
> -backend_visitor::calculate_cfg()
> +backend_god::calculate_cfg()
> {
> if (this->cfg)
> return;
> @@ -1126,7 +1126,7 @@ backend_visitor::calculate_cfg()
> }
>
> void
> -backend_visitor::invalidate_cfg()
> +backend_god::invalidate_cfg()
> {
> ralloc_free(this->cfg);
> this->cfg = NULL;
> @@ -1141,7 +1141,7 @@ backend_visitor::invalidate_cfg()
> * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
> */
> void
> -backend_visitor::assign_common_binding_table_offsets(uint32_t next_binding_table_offset)
> +backend_god::assign_common_binding_table_offsets(uint32_t next_binding_table_offset)
> {
> int num_textures = _mesa_fls(prog->SamplersUsed);
>
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
> index 8a3263e..4479002 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.h
> +++ b/src/mesa/drivers/dri/i965/brw_shader.h
> @@ -157,10 +157,10 @@ enum instruction_scheduler_mode {
> SCHEDULE_POST,
> };
>
> -class backend_visitor : public ir_visitor {
> +class backend_god : public ir_visitor {
> protected:
>
> - backend_visitor(struct brw_context *brw,
> + backend_god(struct brw_context *brw,
> struct gl_shader_program *shader_prog,
> struct gl_program *prog,
> struct brw_stage_prog_data *stage_prog_data,
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> index 480e50c..cc85790 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> @@ -260,7 +260,7 @@ vec4_instruction::can_do_source_mods(struct brw_context *brw)
> * for setup.
> */
> int
> -vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
> +vec4_god::implied_mrf_writes(vec4_instruction *inst)
> {
> if (inst->mlen == 0 || inst->is_send_from_grf())
> return 0;
> @@ -328,7 +328,7 @@ src_reg::equals(const src_reg &r) const
> }
>
> bool
> -vec4_visitor::opt_vector_float()
> +vec4_god::opt_vector_float()
> {
> bool progress = false;
>
> @@ -407,7 +407,7 @@ vec4_visitor::opt_vector_float()
> * remove the instructions that wrote them.
> */
> bool
> -vec4_visitor::opt_reduce_swizzle()
> +vec4_god::opt_reduce_swizzle()
> {
> bool progress = false;
>
> @@ -461,7 +461,7 @@ vec4_visitor::opt_reduce_swizzle()
> }
>
> void
> -vec4_visitor::split_uniform_registers()
> +vec4_god::split_uniform_registers()
> {
> /* Prior to this, uniforms have been in an array sized according to
> * the number of vector uniforms present, sparsely filled (so an
> @@ -489,7 +489,7 @@ vec4_visitor::split_uniform_registers()
> }
>
> void
> -vec4_visitor::pack_uniform_registers()
> +vec4_god::pack_uniform_registers()
> {
> bool uniform_used[this->uniforms];
> int new_loc[this->uniforms];
> @@ -583,7 +583,7 @@ vec4_visitor::pack_uniform_registers()
> * instructions involving 0.
> */
> bool
> -vec4_visitor::opt_algebraic()
> +vec4_god::opt_algebraic()
> {
> bool progress = false;
>
> @@ -689,7 +689,7 @@ vec4_visitor::opt_algebraic()
> * pull constants.
> */
> void
> -vec4_visitor::move_push_constants_to_pull_constants()
> +vec4_god::move_push_constants_to_pull_constants()
> {
> int pull_constant_loc[this->uniforms];
>
> @@ -772,7 +772,7 @@ vec4_visitor::move_push_constants_to_pull_constants()
>
> /* Conditions for which we want to avoid setting the dependency control bits */
> bool
> -vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
> +vec4_god::is_dep_ctrl_unsafe(const vec4_instruction *inst)
> {
> #define IS_DWORD(reg) \
> (reg.type == BRW_REGISTER_TYPE_UD || \
> @@ -833,7 +833,7 @@ vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
> * manual fields we can set in the instructions that let it do so.
> */
> void
> -vec4_visitor::opt_set_dependency_control()
> +vec4_god::opt_set_dependency_control()
> {
> vec4_instruction *last_grf_write[BRW_MAX_GRF];
> uint8_t grf_channels_written[BRW_MAX_GRF];
> @@ -958,7 +958,7 @@ vec4_instruction::reswizzle(int dst_writemask, int swizzle)
> * of the GRF write directly to the final destination instead.
> */
> bool
> -vec4_visitor::opt_register_coalesce()
> +vec4_god::opt_register_coalesce()
> {
> bool progress = false;
> int next_ip = 0;
> @@ -1124,7 +1124,7 @@ vec4_visitor::opt_register_coalesce()
> * a GRF on IVB.
> */
> void
> -vec4_visitor::split_virtual_grfs()
> +vec4_god::split_virtual_grfs()
> {
> int num_vars = this->alloc.count;
> int new_virtual_grf[num_vars];
> @@ -1186,13 +1186,13 @@ vec4_visitor::split_virtual_grfs()
> }
>
> void
> -vec4_visitor::dump_instruction(backend_instruction *be_inst)
> +vec4_god::dump_instruction(backend_instruction *be_inst)
> {
> dump_instruction(be_inst, stderr);
> }
>
> void
> -vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
> +vec4_god::dump_instruction(backend_instruction *be_inst, FILE *file)
> {
> vec4_instruction *inst = (vec4_instruction *)be_inst;
>
> @@ -1404,7 +1404,7 @@ attribute_to_hw_reg(int attr, bool interleaved)
> * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
> */
> void
> -vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
> +vec4_god::lower_attributes_to_hw_regs(const int *attribute_map,
> bool interleaved)
> {
> foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
> @@ -1451,7 +1451,7 @@ vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
> }
>
> int
> -vec4_vs_visitor::setup_attributes(int payload_reg)
> +vec4_vs_god::setup_attributes(int payload_reg)
> {
> int nr_attributes;
> int attribute_map[VERT_ATTRIB_MAX + 1];
> @@ -1496,7 +1496,7 @@ vec4_vs_visitor::setup_attributes(int payload_reg)
> }
>
> int
> -vec4_visitor::setup_uniforms(int reg)
> +vec4_god::setup_uniforms(int reg)
> {
> prog_data->base.dispatch_grf_start_reg = reg;
>
> @@ -1530,7 +1530,7 @@ vec4_visitor::setup_uniforms(int reg)
> }
>
> void
> -vec4_vs_visitor::setup_payload(void)
> +vec4_vs_god::setup_payload(void)
> {
> int reg = 0;
>
> @@ -1548,13 +1548,13 @@ vec4_vs_visitor::setup_payload(void)
> }
>
> void
> -vec4_visitor::assign_binding_table_offsets()
> +vec4_god::assign_binding_table_offsets()
> {
> assign_common_binding_table_offsets(0);
> }
>
> src_reg
> -vec4_visitor::get_timestamp()
> +vec4_god::get_timestamp()
> {
> assert(brw->gen >= 7);
>
> @@ -1582,14 +1582,14 @@ vec4_visitor::get_timestamp()
> }
>
> void
> -vec4_visitor::emit_shader_time_begin()
> +vec4_god::emit_shader_time_begin()
> {
> current_annotation = "shader time start";
> shader_start_time = get_timestamp();
> }
>
> void
> -vec4_visitor::emit_shader_time_end()
> +vec4_god::emit_shader_time_end()
> {
> current_annotation = "shader time end";
> src_reg shader_end_time = get_timestamp();
> @@ -1624,7 +1624,7 @@ vec4_visitor::emit_shader_time_end()
> }
>
> void
> -vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
> +vec4_god::emit_shader_time_write(enum shader_time_shader_type type,
> src_reg value)
> {
> int shader_time_index =
> @@ -1649,7 +1649,7 @@ vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
> }
>
> bool
> -vec4_visitor::run()
> +vec4_god::run()
> {
> sanity_param_count = prog->Parameters->NumParameters;
>
> @@ -1710,7 +1710,7 @@ vec4_visitor::run()
> snprintf(filename, 64, "%s-%04d-%02d-%02d-" #pass, \
> stage_name, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
> \
> - backend_visitor::dump_instructions(filename); \
> + backend_god::dump_instructions(filename); \
> } \
> \
> progress = progress || this_progress; \
> @@ -1723,7 +1723,7 @@ vec4_visitor::run()
> snprintf(filename, 64, "%s-%04d-00-start",
> stage_name, shader_prog ? shader_prog->Name : 0);
>
> - backend_visitor::dump_instructions(filename);
> + backend_god::dump_instructions(filename);
> }
>
> bool progress;
> @@ -1824,7 +1824,7 @@ brw_vs_emit(struct brw_context *brw,
> brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);
>
> if (brw->scalar_vs && (prog || brw_env_var_as_boolean("INTEL_USE_NIR", false))) {
> - fs_visitor v(brw, mem_ctx, &c->key, prog_data, prog, &c->vp->program, 8);
> + fs_god v(brw, mem_ctx, &c->key, prog_data, prog, &c->vp->program, 8);
> if (!v.run_vs()) {
> if (prog) {
> prog->LinkStatus = false;
> @@ -1861,7 +1861,7 @@ brw_vs_emit(struct brw_context *brw,
> }
>
> if (!assembly) {
> - vec4_vs_visitor v(brw, c, prog_data, prog, mem_ctx);
> + vec4_vs_god v(brw, c, prog_data, prog, mem_ctx);
> if (!v.run()) {
> if (prog) {
> prog->LinkStatus = false;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
> index 33297ae..b8418b1 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.h
> @@ -73,10 +73,10 @@ class vec4_live_variables;
> * Translates either GLSL IR or Mesa IR (for ARB_vertex_program and
> * fixed-function) into VS IR.
> */
> -class vec4_visitor : public backend_visitor
> +class vec4_god : public backend_god
> {
> public:
> - vec4_visitor(struct brw_context *brw,
> + vec4_god(struct brw_context *brw,
> struct brw_vec4_compile *c,
> struct gl_program *prog,
> const struct brw_vue_prog_key *key,
> @@ -88,7 +88,7 @@ public:
> shader_time_shader_type st_base,
> shader_time_shader_type st_written,
> shader_time_shader_type st_reset);
> - ~vec4_visitor();
> + ~vec4_god();
>
> dst_reg dst_null_f()
> {
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
> index e897be2..5b40ab4 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
> @@ -364,7 +364,7 @@ try_copy_propagate(struct brw_context *brw, vec4_instruction *inst,
> }
>
> bool
> -vec4_visitor::opt_copy_propagation(bool do_constant_prop)
> +vec4_god::opt_copy_propagation(bool do_constant_prop)
> {
> bool progress = false;
> struct copy_entry entries[alloc.total_size];
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> index 100e511..fe6d0bd 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> @@ -121,7 +121,7 @@ instructions_match(vec4_instruction *a, vec4_instruction *b)
> }
>
> bool
> -vec4_visitor::opt_cse_local(bblock_t *block)
> +vec4_god::opt_cse_local(bblock_t *block)
> {
> bool progress = false;
> exec_list aeb;
> @@ -250,7 +250,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
> }
>
> bool
> -vec4_visitor::opt_cse()
> +vec4_god::opt_cse()
> {
> bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> index 980e266..3a8e0b7 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> @@ -60,7 +60,7 @@ can_do_writemask(const struct brw_context *brw,
> }
>
> bool
> -vec4_visitor::dead_code_eliminate()
> +vec4_god::dead_code_eliminate()
> {
> bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_god.cpp b/src/mesa/drivers/dri/i965/brw_vec4_god.cpp
> new file mode 100644
> index 0000000..3483143
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_god.cpp
> @@ -0,0 +1,3658 @@
> +/*
> + * Copyright © 2011 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include "brw_vec4.h"
> +#include "brw_cfg.h"
> +#include "glsl/ir_uniform.h"
> +#include "program/sampler.h"
> +
> +namespace brw {
> +
> +vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
> + const src_reg &src0, const src_reg &src1,
> + const src_reg &src2)
> +{
> + this->opcode = opcode;
> + this->dst = dst;
> + this->src[0] = src0;
> + this->src[1] = src1;
> + this->src[2] = src2;
> + this->saturate = false;
> + this->force_writemask_all = false;
> + this->no_dd_clear = false;
> + this->no_dd_check = false;
> + this->writes_accumulator = false;
> + this->conditional_mod = BRW_CONDITIONAL_NONE;
> + this->predicate = BRW_PREDICATE_NONE;
> + this->predicate_inverse = false;
> + this->target = 0;
> + this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
> + this->shadow_compare = false;
> + this->ir = NULL;
> + this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
> + this->header_present = false;
> + this->flag_subreg = 0;
> + this->mlen = 0;
> + this->base_mrf = 0;
> + this->offset = 0;
> + this->annotation = NULL;
> +}
> +
> +vec4_instruction *
> +vec4_god::emit(vec4_instruction *inst)
> +{
> + inst->ir = this->base_ir;
> + inst->annotation = this->current_annotation;
> +
> + this->instructions.push_tail(inst);
> +
> + return inst;
> +}
> +
> +vec4_instruction *
> +vec4_god::emit_before(bblock_t *block, vec4_instruction *inst,
> + vec4_instruction *new_inst)
> +{
> + new_inst->ir = inst->ir;
> + new_inst->annotation = inst->annotation;
> +
> + inst->insert_before(block, new_inst);
> +
> + return inst;
> +}
> +
> +vec4_instruction *
> +vec4_god::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
> + const src_reg &src1, const src_reg &src2)
> +{
> + return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
> +}
> +
> +
> +vec4_instruction *
> +vec4_god::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
> + const src_reg &src1)
> +{
> + return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
> +}
> +
> +vec4_instruction *
> +vec4_god::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
> +{
> + return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
> +}
> +
> +vec4_instruction *
> +vec4_god::emit(enum opcode opcode, const dst_reg &dst)
> +{
> + return emit(new(mem_ctx) vec4_instruction(opcode, dst));
> +}
> +
> +vec4_instruction *
> +vec4_god::emit(enum opcode opcode)
> +{
> + return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
> +}
> +
> +#define ALU1(op) \
> + vec4_instruction * \
> + vec4_god::op(const dst_reg &dst, const src_reg &src0) \
> + { \
> + return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
> + }
> +
> +#define ALU2(op) \
> + vec4_instruction * \
> + vec4_god::op(const dst_reg &dst, const src_reg &src0, \
> + const src_reg &src1) \
> + { \
> + return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
> + src0, src1); \
> + }
> +
> +#define ALU2_ACC(op) \
> + vec4_instruction * \
> + vec4_god::op(const dst_reg &dst, const src_reg &src0, \
> + const src_reg &src1) \
> + { \
> + vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
> + BRW_OPCODE_##op, dst, src0, src1); \
> + inst->writes_accumulator = true; \
> + return inst; \
> + }
> +
> +#define ALU3(op) \
> + vec4_instruction * \
> + vec4_god::op(const dst_reg &dst, const src_reg &src0, \
> + const src_reg &src1, const src_reg &src2) \
> + { \
> + assert(brw->gen >= 6); \
> + return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
> + src0, src1, src2); \
> + }
> +
> +ALU1(NOT)
> +ALU1(MOV)
> +ALU1(FRC)
> +ALU1(RNDD)
> +ALU1(RNDE)
> +ALU1(RNDZ)
> +ALU1(F32TO16)
> +ALU1(F16TO32)
> +ALU2(ADD)
> +ALU2(MUL)
> +ALU2_ACC(MACH)
> +ALU2(AND)
> +ALU2(OR)
> +ALU2(XOR)
> +ALU2(DP3)
> +ALU2(DP4)
> +ALU2(DPH)
> +ALU2(SHL)
> +ALU2(SHR)
> +ALU2(ASR)
> +ALU3(LRP)
> +ALU1(BFREV)
> +ALU3(BFE)
> +ALU2(BFI1)
> +ALU3(BFI2)
> +ALU1(FBH)
> +ALU1(FBL)
> +ALU1(CBIT)
> +ALU3(MAD)
> +ALU2_ACC(ADDC)
> +ALU2_ACC(SUBB)
> +ALU2(MAC)
> +
> +/** Gen4 predicated IF. */
> +vec4_instruction *
> +vec4_god::IF(enum brw_predicate predicate)
> +{
> + vec4_instruction *inst;
> +
> + inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
> + inst->predicate = predicate;
> +
> + return inst;
> +}
> +
> +/** Gen6 IF with embedded comparison. */
> +vec4_instruction *
> +vec4_god::IF(src_reg src0, src_reg src1,
> + enum brw_conditional_mod condition)
> +{
> + assert(brw->gen == 6);
> +
> + vec4_instruction *inst;
> +
> + resolve_ud_negate(&src0);
> + resolve_ud_negate(&src1);
> +
> + inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
> + src0, src1);
> + inst->conditional_mod = condition;
> +
> + return inst;
> +}
> +
> +/**
> + * CMP: Sets the low bit of the destination channels with the result
> + * of the comparison, while the upper bits are undefined, and updates
> + * the flag register with the packed 16 bits of the result.
> + */
> +vec4_instruction *
> +vec4_god::CMP(dst_reg dst, src_reg src0, src_reg src1,
> + enum brw_conditional_mod condition)
> +{
> + vec4_instruction *inst;
> +
> + /* Take the instruction:
> + *
> + * CMP null<d> src0<f> src1<f>
> + *
> + * Original gen4 does type conversion to the destination type before
> + * comparison, producing garbage results for floating point comparisons.
> + *
> + * The destination type doesn't matter on newer generations, so we set the
> + * type to match src0 so we can compact the instruction.
> + */
> + dst.type = src0.type;
> + if (dst.file == HW_REG)
> + dst.fixed_hw_reg.type = dst.type;
> +
> + resolve_ud_negate(&src0);
> + resolve_ud_negate(&src1);
> +
> + inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
> + inst->conditional_mod = condition;
> +
> + return inst;
> +}
> +
> +vec4_instruction *
> +vec4_god::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
> +{
> + vec4_instruction *inst;
> +
> + inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
> + dst, index);
> + inst->base_mrf = 14;
> + inst->mlen = 2;
> +
> + return inst;
> +}
> +
> +vec4_instruction *
> +vec4_god::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
> + const src_reg &index)
> +{
> + vec4_instruction *inst;
> +
> + inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
> + dst, src, index);
> + inst->base_mrf = 13;
> + inst->mlen = 3;
> +
> + return inst;
> +}
> +
> +void
> +vec4_god::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
> +{
> + static enum opcode dot_opcodes[] = {
> + BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
> + };
> +
> + emit(dot_opcodes[elements - 2], dst, src0, src1);
> +}
> +
> +src_reg
> +vec4_god::fix_3src_operand(src_reg src)
> +{
> + /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
> + * able to use vertical stride of zero to replicate the vec4 uniform, like
> + *
> + * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
> + *
> + * But you can't, since vertical stride is always four in three-source
> + * instructions. Instead, insert a MOV instruction to do the replication so
> + * that the three-source instruction can consume it.
> + */
> +
> + /* The MOV is only needed if the source is a uniform or immediate. */
> + if (src.file != UNIFORM && src.file != IMM)
> + return src;
> +
> + if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
> + return src;
> +
> + dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
> + expanded.type = src.type;
> + emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
> + return src_reg(expanded);
> +}
> +
> +src_reg
> +vec4_god::fix_math_operand(src_reg src)
> +{
> + if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
> + return src;
> +
> + /* The gen6 math instruction ignores the source modifiers --
> + * swizzle, abs, negate, and at least some parts of the register
> + * region description.
> + *
> + * Rather than trying to enumerate all these cases, *always* expand the
> + * operand to a temp GRF for gen6.
> + *
> + * For gen7, keep the operand as-is, except if immediate, which gen7 still
> + * can't use.
> + */
> +
> + if (brw->gen == 7 && src.file != IMM)
> + return src;
> +
> + dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
> + expanded.type = src.type;
> + emit(MOV(expanded, src));
> + return src_reg(expanded);
> +}
> +
> +void
> +vec4_god::emit_math(enum opcode opcode,
> + const dst_reg &dst,
> + const src_reg &src0, const src_reg &src1)
> +{
> + vec4_instruction *math =
> + emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
> +
> + if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
> + /* MATH on Gen6 must be align1, so we can't do writemasks. */
> + math->dst = dst_reg(this, glsl_type::vec4_type);
> + math->dst.type = dst.type;
> + emit(MOV(dst, src_reg(math->dst)));
> + } else if (brw->gen < 6) {
> + math->base_mrf = 1;
> + math->mlen = src1.file == BAD_FILE ? 1 : 2;
> + }
> +}
> +
> +void
> +vec4_god::emit_pack_half_2x16(dst_reg dst, src_reg src0)
> +{
> + if (brw->gen < 7) {
> + unreachable("ir_unop_pack_half_2x16 should be lowered");
> + }
> +
> + assert(dst.type == BRW_REGISTER_TYPE_UD);
> + assert(src0.type == BRW_REGISTER_TYPE_F);
> +
> + /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
> + *
> + * Because this instruction does not have a 16-bit floating-point type,
> + * the destination data type must be Word (W).
> + *
> + * The destination must be DWord-aligned and specify a horizontal stride
> + * (HorzStride) of 2. The 16-bit result is stored in the lower word of
> + * each destination channel and the upper word is not modified.
> + *
> + * The above restriction implies that the f32to16 instruction must use
> + * align1 mode, because only in align1 mode is it possible to specify
> + * horizontal stride. We choose here to defy the hardware docs and emit
> + * align16 instructions.
> + *
> + * (I [chadv] did attempt to emit align1 instructions for VS f32to16
> + * instructions. I was partially successful in that the code passed all
> + * tests. However, the code was dubiously correct and fragile, and the
> + * tests were not harsh enough to probe that frailty. Not trusting the
> + * code, I chose instead to remain in align16 mode in defiance of the hw
> + * docs).
> + *
> + * I've [chadv] experimentally confirmed that, on gen7 hardware and the
> + * simulator, emitting a f32to16 in align16 mode with UD as destination
> + * data type is safe. The behavior differs from that specified in the PRM
> + * in that the upper word of each destination channel is cleared to 0.
> + */
> +
> + dst_reg tmp_dst(this, glsl_type::uvec2_type);
> + src_reg tmp_src(tmp_dst);
> +
> +#if 0
> + /* Verify the undocumented behavior on which the following instructions
> + * rely. If f32to16 fails to clear the upper word of the X and Y channels,
> + * then the result of the bit-or instruction below will be incorrect.
> + *
> + * You should inspect the disasm output in order to verify that the MOV is
> + * not optimized away.
> + */
> + emit(MOV(tmp_dst, src_reg(0x12345678u)));
> +#endif
> +
> + /* Give tmp the form below, where "." means untouched.
> + *
> + * w z y x w z y x
> + * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
> + *
> + * That the upper word of each write-channel be 0 is required for the
> + * following bit-shift and bit-or instructions to work. Note that this
> + * relies on the undocumented hardware behavior mentioned above.
> + */
> + tmp_dst.writemask = WRITEMASK_XY;
> + emit(F32TO16(tmp_dst, src0));
> +
> + /* Give the write-channels of dst the form:
> + * 0xhhhh0000
> + */
> + tmp_src.swizzle = BRW_SWIZZLE_YYYY;
> + emit(SHL(dst, tmp_src, src_reg(16u)));
> +
> + /* Finally, give the write-channels of dst the form of packHalf2x16's
> + * output:
> + * 0xhhhhllll
> + */
> + tmp_src.swizzle = BRW_SWIZZLE_XXXX;
> + emit(OR(dst, src_reg(dst), tmp_src));
> +}
> +
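
The SHL + OR pair closing emit_pack_half_2x16() above is just assembling
packHalf2x16's 0xhhhhllll layout from the two 0x0000hhhh / 0x0000llll
intermediates left by F32TO16. A CPU-side sketch of that last step; the
half-float conversion itself is omitted and the constants below are the
usual IEEE half encodings, given only for illustration:

    #include <cstdint>
    #include <cstdio>

    static uint32_t pack_halves(uint32_t x_half, uint32_t y_half)
    {
       /* y goes to the high word (the SHL by 16), x stays in the low word. */
       return (y_half << 16) | (x_half & 0xffffu);
    }

    int main()
    {
       /* 1.0f -> 0x3c00 and 2.0f -> 0x4000 as IEEE half-floats. */
       printf("0x%08x\n", pack_halves(0x3c00u, 0x4000u));   /* 0x40003c00 */
       return 0;
    }
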
> +void
> +vec4_god::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
> +{
> + if (brw->gen < 7) {
> + unreachable("ir_unop_unpack_half_2x16 should be lowered");
> + }
> +
> + assert(dst.type == BRW_REGISTER_TYPE_F);
> + assert(src0.type == BRW_REGISTER_TYPE_UD);
> +
> + /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
> + *
> + * Because this instruction does not have a 16-bit floating-point type,
> + * the source data type must be Word (W). The destination type must be
> + * F (Float).
> + *
> + * To use W as the source data type, we must adjust horizontal strides,
> + * which is only possible in align1 mode. All my [chadv] attempts at
> + * emitting align1 instructions for unpackHalf2x16 failed to pass the
> + * Piglit tests, so I gave up.
> + *
> + * I've verified that, on gen7 hardware and the simulator, it is safe to
> + * emit f16to32 in align16 mode with UD as source data type.
> + */
> +
> + dst_reg tmp_dst(this, glsl_type::uvec2_type);
> + src_reg tmp_src(tmp_dst);
> +
> + tmp_dst.writemask = WRITEMASK_X;
> + emit(AND(tmp_dst, src0, src_reg(0xffffu)));
> +
> + tmp_dst.writemask = WRITEMASK_Y;
> + emit(SHR(tmp_dst, src0, src_reg(16u)));
> +
> + dst.writemask = WRITEMASK_XY;
> + emit(F16TO32(dst, tmp_src));
> +}
> +
> +void
> +vec4_god::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
> +{
> + /* Instead of splitting the 32-bit integer, shifting, and ORing it back
> + * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
> + * is not suitable to generate the shift values, but we can use the packed
> + * vector float and a type-converting MOV.
> + */
> + dst_reg shift(this, glsl_type::uvec4_type);
> + emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
> +
> + dst_reg shifted(this, glsl_type::uvec4_type);
> + src0.swizzle = BRW_SWIZZLE_XXXX;
> + emit(SHR(shifted, src0, src_reg(shift)));
> +
> + shifted.type = BRW_REGISTER_TYPE_UB;
> + dst_reg f(this, glsl_type::vec4_type);
> + emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
> +
> + emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
> +}
> +
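
What the SHR-by-vector / byte MOV / MUL sequence in emit_unpack_unorm_4x8()
above computes, as a CPU-side sketch: each byte of the word is pulled out
with shifts of 0/8/16/24 and scaled into [0, 1]. (The shift vector comes
from the packed vector-float immediate 0x00/0x60/0x70/0x78, which I read as
encoding 0, 8, 16 and 24 -- treat that decoding as my assumption.)

    #include <cstdint>
    #include <cstdio>

    int main()
    {
       const uint32_t packed = 0xff800000u;
       const unsigned shifts[4] = { 0, 8, 16, 24 };      /* converted shift vector */

       for (int i = 0; i < 4; i++) {
          uint8_t byte = (packed >> shifts[i]) & 0xffu;  /* SHR + UB-typed MOV */
          float unorm = byte * (1.0f / 255.0f);          /* MUL by 1/255       */
          printf("component %d = %f\n", i, unorm);       /* 0, 0, ~0.502, 1.0  */
       }
       return 0;
    }
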
> +void
> +vec4_god::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
> +{
> + /* Instead of splitting the 32-bit integer, shifting, and ORing it back
> + * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
> + * is not suitable to generate the shift values, but we can use the packed
> + * vector float and a type-converting MOV.
> + */
> + dst_reg shift(this, glsl_type::uvec4_type);
> + emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
> +
> + dst_reg shifted(this, glsl_type::uvec4_type);
> + src0.swizzle = BRW_SWIZZLE_XXXX;
> + emit(SHR(shifted, src0, src_reg(shift)));
> +
> + shifted.type = BRW_REGISTER_TYPE_B;
> + dst_reg f(this, glsl_type::vec4_type);
> + emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
> +
> + dst_reg scaled(this, glsl_type::vec4_type);
> + emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
> +
> + dst_reg max(this, glsl_type::vec4_type);
> + emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
> + emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
> +}
> +
> +void
> +vec4_god::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
> +{
> + dst_reg saturated(this, glsl_type::vec4_type);
> + vec4_instruction *inst = emit(MOV(saturated, src0));
> + inst->saturate = true;
> +
> + dst_reg scaled(this, glsl_type::vec4_type);
> + emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
> +
> + dst_reg rounded(this, glsl_type::vec4_type);
> + emit(RNDE(rounded, src_reg(scaled)));
> +
> + dst_reg u(this, glsl_type::uvec4_type);
> + emit(MOV(u, src_reg(rounded)));
> +
> + src_reg bytes(u);
> + emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
> +}
> +
> +void
> +vec4_god::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
> +{
> + dst_reg max(this, glsl_type::vec4_type);
> + emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
> +
> + dst_reg min(this, glsl_type::vec4_type);
> + emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
> +
> + dst_reg scaled(this, glsl_type::vec4_type);
> + emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
> +
> + dst_reg rounded(this, glsl_type::vec4_type);
> + emit(RNDE(rounded, src_reg(scaled)));
> +
> + dst_reg i(this, glsl_type::ivec4_type);
> + emit(MOV(i, src_reg(rounded)));
> +
> + src_reg bytes(i);
> + emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
> +}
> +
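
And the mirror image for emit_pack_snorm_4x8() above: clamp to [-1, 1] with
the two minmax emits, scale by 127, round (RNDE), then pack the four signed
bytes. A CPU-side sketch under those assumptions; std::nearbyint in the
default rounding mode stands in for RNDE, and the byte order follows GLSL's
packSnorm4x8 (x in the low byte), which I am assuming matches
VEC4_OPCODE_PACK_BYTES:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    static uint32_t pack_snorm_4x8(const float v[4])
    {
       uint32_t result = 0;
       for (int i = 0; i < 4; i++) {
          float clamped = std::min(std::max(v[i], -1.0f), 1.0f);  /* minmax pair */
          float scaled  = clamped * 127.0f;                       /* MUL         */
          int8_t byte   = (int8_t)std::nearbyint(scaled);         /* RNDE + MOV  */
          result |= (uint32_t)(uint8_t)byte << (i * 8);           /* PACK_BYTES  */
       }
       return result;
    }

    int main()
    {
       const float v[4] = { 1.0f, -1.0f, 0.0f, 0.5f };
       printf("0x%08x\n", pack_snorm_4x8(v));   /* 0x4000817f */
       return 0;
    }
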
> +void
> +vec4_god::visit_instructions(const exec_list *list)
> +{
> + foreach_in_list(ir_instruction, ir, list) {
> + base_ir = ir;
> + ir->accept(this);
> + }
> +}
> +
> +
> +static int
> +type_size(const struct glsl_type *type)
> +{
> + unsigned int i;
> + int size;
> +
> + switch (type->base_type) {
> + case GLSL_TYPE_UINT:
> + case GLSL_TYPE_INT:
> + case GLSL_TYPE_FLOAT:
> + case GLSL_TYPE_BOOL:
> + if (type->is_matrix()) {
> + return type->matrix_columns;
> + } else {
> + /* Regardless of the size of the vector, it gets a vec4. This is bad
> + * packing for things like floats, but otherwise arrays become a
> + * mess. Hopefully a later pass over the code can pack scalars
> + * down if appropriate.
> + */
> + return 1;
> + }
> + case GLSL_TYPE_ARRAY:
> + assert(type->length > 0);
> + return type_size(type->fields.array) * type->length;
> + case GLSL_TYPE_STRUCT:
> + size = 0;
> + for (i = 0; i < type->length; i++) {
> + size += type_size(type->fields.structure[i].type);
> + }
> + return size;
> + case GLSL_TYPE_SAMPLER:
> + /* Samplers take up no register space, since they're baked in at
> + * link time.
> + */
> + return 0;
> + case GLSL_TYPE_ATOMIC_UINT:
> + return 0;
> + case GLSL_TYPE_IMAGE:
> + case GLSL_TYPE_VOID:
> + case GLSL_TYPE_DOUBLE:
> + case GLSL_TYPE_ERROR:
> + case GLSL_TYPE_INTERFACE:
> + unreachable("not reached");
> + }
> +
> + return 0;
> +}
> +
> +src_reg::src_reg(class vec4_god *v, const struct glsl_type *type)
> +{
> + init();
> +
> + this->file = GRF;
> + this->reg = v->alloc.allocate(type_size(type));
> +
> + if (type->is_array() || type->is_record()) {
> + this->swizzle = BRW_SWIZZLE_NOOP;
> + } else {
> + this->swizzle = brw_swizzle_for_size(type->vector_elements);
> + }
> +
> + this->type = brw_type_for_base_type(type);
> +}
> +
> +src_reg::src_reg(class vec4_god *v, const struct glsl_type *type, int size)
> +{
> + assert(size > 0);
> +
> + init();
> +
> + this->file = GRF;
> + this->reg = v->alloc.allocate(type_size(type) * size);
> +
> + this->swizzle = BRW_SWIZZLE_NOOP;
> +
> + this->type = brw_type_for_base_type(type);
> +}
> +
> +dst_reg::dst_reg(class vec4_god *v, const struct glsl_type *type)
> +{
> + init();
> +
> + this->file = GRF;
> + this->reg = v->alloc.allocate(type_size(type));
> +
> + if (type->is_array() || type->is_record()) {
> + this->writemask = WRITEMASK_XYZW;
> + } else {
> + this->writemask = (1 << type->vector_elements) - 1;
> + }
> +
> + this->type = brw_type_for_base_type(type);
> +}
> +
> +/* Our support for uniforms is piggy-backed on the struct
> + * gl_fragment_program, because that's where the values actually
> + * get stored, rather than in some global gl_shader_program uniform
> + * store.
> + */
> +void
> +vec4_god::setup_uniform_values(ir_variable *ir)
> +{
> + int namelen = strlen(ir->name);
> +
> + /* The data for our (non-builtin) uniforms is stored in a series of
> + * gl_uniform_driver_storage structs for each subcomponent that
> + * glGetUniformLocation() could name. We know it's been set up in the same
> + * order we'd walk the type, so walk the list of storage and find anything
> + * with our name, or the prefix of a component that starts with our name.
> + */
> + for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
> + struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
> +
> + if (strncmp(ir->name, storage->name, namelen) != 0 ||
> + (storage->name[namelen] != 0 &&
> + storage->name[namelen] != '.' &&
> + storage->name[namelen] != '[')) {
> + continue;
> + }
> +
> + gl_constant_value *components = storage->storage;
> + unsigned vector_count = (MAX2(storage->array_elements, 1) *
> + storage->type->matrix_columns);
> +
> + for (unsigned s = 0; s < vector_count; s++) {
> + assert(uniforms < uniform_array_size);
> + uniform_vector_size[uniforms] = storage->type->vector_elements;
> +
> + int i;
> + for (i = 0; i < uniform_vector_size[uniforms]; i++) {
> + stage_prog_data->param[uniforms * 4 + i] = components;
> + components++;
> + }
> + for (; i < 4; i++) {
> + static gl_constant_value zero = { 0.0 };
> + stage_prog_data->param[uniforms * 4 + i] = &zero;
> + }
> +
> + uniforms++;
> + }
> + }
> +}
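> +/* The net effect of setup_uniform_values() is that every logical uniform
> + * vector occupies a full vec4 worth of param slots: for a "uniform mat2 m"
> + * the loop above fills param[0..3] with m[0].x, m[0].y, zero, zero and
> + * param[4..7] with m[1].x, m[1].y, zero, zero.
> + */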
> +
> +void
> +vec4_god::setup_uniform_clipplane_values()
> +{
> + gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
> +
> + for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
> + assert(this->uniforms < uniform_array_size);
> + this->uniform_vector_size[this->uniforms] = 4;
> + this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
> + this->userplane[i].type = BRW_REGISTER_TYPE_F;
> + for (int j = 0; j < 4; ++j) {
> + stage_prog_data->param[this->uniforms * 4 + j] =
> + (gl_constant_value *) &clip_planes[i][j];
> + }
> + ++this->uniforms;
> + }
> +}
> +
> +/* Our support for builtin uniforms is even scarier than non-builtin.
> + * It sits on top of the PROG_STATE_VAR parameters that are
> + * automatically updated from GL context state.
> + */
> +void
> +vec4_god::setup_builtin_uniform_values(ir_variable *ir)
> +{
> + const ir_state_slot *const slots = ir->get_state_slots();
> + assert(slots != NULL);
> +
> + for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
> + /* This state reference has already been set up by ir_to_mesa,
> + * but we'll get the same index back here. We can reference
> + * ParameterValues directly, since unlike brw_fs.cpp, we never
> + * add new state references during compile.
> + */
> + int index = _mesa_add_state_reference(this->prog->Parameters,
> + (gl_state_index *)slots[i].tokens);
> + gl_constant_value *values =
> + &this->prog->Parameters->ParameterValues[index][0];
> +
> + assert(this->uniforms < uniform_array_size);
> +
> + for (unsigned j = 0; j < 4; j++)
> + stage_prog_data->param[this->uniforms * 4 + j] =
> + &values[GET_SWZ(slots[i].swizzle, j)];
> +
> + this->uniform_vector_size[this->uniforms] =
> + (ir->type->is_scalar() || ir->type->is_vector() ||
> + ir->type->is_matrix() ? ir->type->vector_elements : 4);
> +
> + this->uniforms++;
> + }
> +}
> +
> +dst_reg *
> +vec4_god::variable_storage(ir_variable *var)
> +{
> + return (dst_reg *)hash_table_find(this->variable_ht, var);
> +}
> +
> +void
> +vec4_god::emit_bool_to_cond_code(ir_rvalue *ir,
> + enum brw_predicate *predicate)
> +{
> + ir_expression *expr = ir->as_expression();
> +
> + *predicate = BRW_PREDICATE_NORMAL;
> +
> + if (expr && expr->operation != ir_binop_ubo_load) {
> + src_reg op[3];
> + vec4_instruction *inst;
> +
> + assert(expr->get_num_operands() <= 3);
> + for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> + expr->operands[i]->accept(this);
> + op[i] = this->result;
> +
> + resolve_ud_negate(&op[i]);
> + }
> +
> + switch (expr->operation) {
> + case ir_unop_logic_not:
> + inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
> + inst->conditional_mod = BRW_CONDITIONAL_Z;
> + break;
> +
> + case ir_binop_logic_xor:
> + if (brw->gen <= 5) {
> + src_reg temp = src_reg(this, ir->type);
> + emit(XOR(dst_reg(temp), op[0], op[1]));
> + inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> + } else {
> + inst = emit(XOR(dst_null_d(), op[0], op[1]));
> + }
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + break;
> +
> + case ir_binop_logic_or:
> + if (brw->gen <= 5) {
> + src_reg temp = src_reg(this, ir->type);
> + emit(OR(dst_reg(temp), op[0], op[1]));
> + inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> + } else {
> + inst = emit(OR(dst_null_d(), op[0], op[1]));
> + }
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + break;
> +
> + case ir_binop_logic_and:
> + if (brw->gen <= 5) {
> + src_reg temp = src_reg(this, ir->type);
> + emit(AND(dst_reg(temp), op[0], op[1]));
> + inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> + } else {
> + inst = emit(AND(dst_null_d(), op[0], op[1]));
> + }
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + break;
> +
> + case ir_unop_f2b:
> + if (brw->gen >= 6) {
> + emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> + } else {
> + inst = emit(MOV(dst_null_f(), op[0]));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + }
> + break;
> +
> + case ir_unop_i2b:
> + if (brw->gen >= 6) {
> + emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> + } else {
> + inst = emit(MOV(dst_null_d(), op[0]));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + }
> + break;
> +
> + case ir_binop_all_equal:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(expr->operands[0], &op[0]);
> + resolve_bool_comparison(expr->operands[1], &op[1]);
> + }
> + inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> + *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
> + break;
> +
> + case ir_binop_any_nequal:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(expr->operands[0], &op[0]);
> + resolve_bool_comparison(expr->operands[1], &op[1]);
> + }
> + inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> + *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> + break;
> +
> + case ir_unop_any:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(expr->operands[0], &op[0]);
> + }
> + inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> + *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> + break;
> +
> + case ir_binop_greater:
> + case ir_binop_gequal:
> + case ir_binop_less:
> + case ir_binop_lequal:
> + case ir_binop_equal:
> + case ir_binop_nequal:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(expr->operands[0], &op[0]);
> + resolve_bool_comparison(expr->operands[1], &op[1]);
> + }
> + emit(CMP(dst_null_d(), op[0], op[1],
> + brw_conditional_for_comparison(expr->operation)));
> + break;
> +
> + case ir_triop_csel: {
> + /* Expand the boolean condition into the flag register. */
> + inst = emit(MOV(dst_null_d(), op[0]));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +
> + /* Select which boolean to return. */
> + dst_reg temp(this, expr->operands[1]->type);
> + inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
> + inst->predicate = BRW_PREDICATE_NORMAL;
> +
> + /* Expand the result to a condition code. */
> + inst = emit(MOV(dst_null_d(), src_reg(temp)));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> + break;
> + }
> +
> + default:
> + unreachable("not reached");
> + }
> + return;
> + }
> +
> + ir->accept(this);
> +
> + resolve_ud_negate(&this->result);
> +
> + vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +}
> +
> +/**
> + * Emit a gen6 IF statement with the comparison folded into the IF
> + * instruction.
> + */
> +void
> +vec4_god::emit_if_gen6(ir_if *ir)
> +{
> + ir_expression *expr = ir->condition->as_expression();
> +
> + if (expr && expr->operation != ir_binop_ubo_load) {
> + src_reg op[3];
> + dst_reg temp;
> +
> + assert(expr->get_num_operands() <= 3);
> + for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> + expr->operands[i]->accept(this);
> + op[i] = this->result;
> + }
> +
> + switch (expr->operation) {
> + case ir_unop_logic_not:
> + emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
> + return;
> +
> + case ir_binop_logic_xor:
> + emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
> + return;
> +
> + case ir_binop_logic_or:
> + temp = dst_reg(this, glsl_type::bool_type);
> + emit(OR(temp, op[0], op[1]));
> + emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> + return;
> +
> + case ir_binop_logic_and:
> + temp = dst_reg(this, glsl_type::bool_type);
> + emit(AND(temp, op[0], op[1]));
> + emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> + return;
> +
> + case ir_unop_f2b:
> + emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> + return;
> +
> + case ir_unop_i2b:
> + emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> + return;
> +
> + case ir_binop_greater:
> + case ir_binop_gequal:
> + case ir_binop_less:
> + case ir_binop_lequal:
> + case ir_binop_equal:
> + case ir_binop_nequal:
> + emit(IF(op[0], op[1],
> + brw_conditional_for_comparison(expr->operation)));
> + return;
> +
> + case ir_binop_all_equal:
> + emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> + emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
> + return;
> +
> + case ir_binop_any_nequal:
> + emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> + emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
> + return;
> +
> + case ir_unop_any:
> + emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> + emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
> + return;
> +
> + case ir_triop_csel: {
> + /* Expand the boolean condition into the flag register. */
> + vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
> + inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +
> + /* Select which boolean to return. */
> + dst_reg temp(this, expr->operands[1]->type);
> + inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
> + inst->predicate = BRW_PREDICATE_NORMAL;
> +
> + emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> + return;
> + }
> +
> + default:
> + unreachable("not reached");
> + }
> + return;
> + }
> +
> + ir->condition->accept(this);
> +
> + emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
> +}
> +
> +void
> +vec4_god::visit(ir_variable *ir)
> +{
> + dst_reg *reg = NULL;
> +
> + if (variable_storage(ir))
> + return;
> +
> + switch (ir->data.mode) {
> + case ir_var_shader_in:
> + assert(ir->data.location != -1);
> + reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
> + break;
> +
> + case ir_var_shader_out:
> + assert(ir->data.location != -1);
> + reg = new(mem_ctx) dst_reg(this, ir->type);
> +
> + for (int i = 0; i < type_size(ir->type); i++) {
> + output_reg[ir->data.location + i] = *reg;
> + output_reg[ir->data.location + i].reg_offset = i;
> + output_reg[ir->data.location + i].type =
> + brw_type_for_base_type(ir->type->get_scalar_type());
> + output_reg_annotation[ir->data.location + i] = ir->name;
> + }
> + break;
> +
> + case ir_var_auto:
> + case ir_var_temporary:
> + reg = new(mem_ctx) dst_reg(this, ir->type);
> + break;
> +
> + case ir_var_uniform:
> + reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
> +
> + /* Thanks to the lower_ubo_reference pass, we will see only
> + * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
> + * variables, so no need for them to be in variable_ht.
> + *
> + * Some uniforms, such as samplers and atomic counters, have no actual
> + * storage, so we should ignore them.
> + */
> + if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
> + return;
> +
> + /* Track how big the whole uniform variable is, in case we need to put a
> + * copy of its data into pull constants for array access.
> + */
> + assert(this->uniforms < uniform_array_size);
> + this->uniform_size[this->uniforms] = type_size(ir->type);
> +
> + if (!strncmp(ir->name, "gl_", 3)) {
> + setup_builtin_uniform_values(ir);
> + } else {
> + setup_uniform_values(ir);
> + }
> + break;
> +
> + case ir_var_system_value:
> + reg = make_reg_for_system_value(ir);
> + break;
> +
> + default:
> + unreachable("not reached");
> + }
> +
> + reg->type = brw_type_for_base_type(ir->type);
> + hash_table_insert(this->variable_ht, reg, ir);
> +}
> +
> +void
> +vec4_god::visit(ir_loop *ir)
> +{
> + /* We don't want debugging output to print the whole body of the
> + * loop as the annotation.
> + */
> + this->base_ir = NULL;
> +
> + emit(BRW_OPCODE_DO);
> +
> + visit_instructions(&ir->body_instructions);
> +
> + emit(BRW_OPCODE_WHILE);
> +}
> +
> +void
> +vec4_god::visit(ir_loop_jump *ir)
> +{
> + switch (ir->mode) {
> + case ir_loop_jump::jump_break:
> + emit(BRW_OPCODE_BREAK);
> + break;
> + case ir_loop_jump::jump_continue:
> + emit(BRW_OPCODE_CONTINUE);
> + break;
> + }
> +}
> +
> +
> +void
> +vec4_god::visit(ir_function_signature *)
> +{
> + unreachable("not reached");
> +}
> +
> +void
> +vec4_god::visit(ir_function *ir)
> +{
> + /* Ignore function bodies other than main() -- we shouldn't see calls to
> + * them since they should all be inlined.
> + */
> + if (strcmp(ir->name, "main") == 0) {
> + const ir_function_signature *sig;
> + exec_list empty;
> +
> + sig = ir->matching_signature(NULL, &empty, false);
> +
> + assert(sig);
> +
> + visit_instructions(&sig->body);
> + }
> +}
> +
> +bool
> +vec4_god::try_emit_mad(ir_expression *ir)
> +{
> + /* 3-src instructions were introduced in gen6. */
> + if (brw->gen < 6)
> + return false;
> +
> + /* MAD can only handle floating-point data. */
> + if (ir->type->base_type != GLSL_TYPE_FLOAT)
> + return false;
> +
> + ir_rvalue *nonmul;
> + ir_expression *mul;
> + bool mul_negate, mul_abs;
> +
> + for (int i = 0; i < 2; i++) {
> + mul_negate = false;
> + mul_abs = false;
> +
> + mul = ir->operands[i]->as_expression();
> + nonmul = ir->operands[1 - i];
> +
> + if (mul && mul->operation == ir_unop_abs) {
> + mul = mul->operands[0]->as_expression();
> + mul_abs = true;
> + } else if (mul && mul->operation == ir_unop_neg) {
> + mul = mul->operands[0]->as_expression();
> + mul_negate = true;
> + }
> +
> + if (mul && mul->operation == ir_binop_mul)
> + break;
> + }
> +
> + if (!mul || mul->operation != ir_binop_mul)
> + return false;
> +
> + nonmul->accept(this);
> + src_reg src0 = fix_3src_operand(this->result);
> +
> + mul->operands[0]->accept(this);
> + src_reg src1 = fix_3src_operand(this->result);
> + src1.negate ^= mul_negate;
> + src1.abs = mul_abs;
> + if (mul_abs)
> + src1.negate = false;
> +
> + mul->operands[1]->accept(this);
> + src_reg src2 = fix_3src_operand(this->result);
> + src2.abs = mul_abs;
> + if (mul_abs)
> + src2.negate = false;
> +
> + this->result = src_reg(this, ir->type);
> + emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
> +
> + return true;
> +}
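> +/* For example, an expression like x + -(y * z) gets emitted as a single
> + * MAD dst, x, -y, z, with the negate folded onto the first multiplicand
> + * instead of a separate MUL/ADD pair.
> + */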
> +
> +bool
> +vec4_god::try_emit_b2f_of_compare(ir_expression *ir)
> +{
> + /* This optimization relies on CMP setting the destination to 0 when
> + * false. Early hardware only sets the least significant bit, and
> + * leaves the other bits undefined. So we can't use it.
> + */
> + if (brw->gen < 6)
> + return false;
> +
> + ir_expression *const cmp = ir->operands[0]->as_expression();
> +
> + if (cmp == NULL)
> + return false;
> +
> + switch (cmp->operation) {
> + case ir_binop_less:
> + case ir_binop_greater:
> + case ir_binop_lequal:
> + case ir_binop_gequal:
> + case ir_binop_equal:
> + case ir_binop_nequal:
> + break;
> +
> + default:
> + return false;
> + }
> +
> + cmp->operands[0]->accept(this);
> + const src_reg cmp_src0 = this->result;
> +
> + cmp->operands[1]->accept(this);
> + const src_reg cmp_src1 = this->result;
> +
> + this->result = src_reg(this, ir->type);
> +
> + emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
> + brw_conditional_for_comparison(cmp->operation)));
> +
> + /* If the comparison is false, this->result will just happen to be zero.
> + */
> + vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
> + this->result, src_reg(1.0f));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + inst->predicate_inverse = true;
> +
> + return true;
> +}
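> +/* Net result: the CMP writes 0 into the channels where the comparison
> + * fails, and the inverted-predicate SEL fills the remaining channels with
> + * 1.0f, which is b2f(comparison) without going through a separate boolean
> + * temporary.
> + */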
> +
> +void
> +vec4_god::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
> + src_reg src0, src_reg src1)
> +{
> + vec4_instruction *inst;
> +
> + if (brw->gen >= 6) {
> + inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> + inst->conditional_mod = conditionalmod;
> + } else {
> + emit(CMP(dst, src0, src1, conditionalmod));
> +
> + inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + }
> +}
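> +/* On Gen6+ a single SEL with the conditional mod does the comparison and
> + * the selection in one instruction; earlier parts need the explicit CMP to
> + * set the flag register before the predicated SEL can pick between the two
> + * sources.
> + */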
> +
> +void
> +vec4_god::emit_lrp(const dst_reg &dst,
> + const src_reg &x, const src_reg &y, const src_reg &a)
> +{
> + if (brw->gen >= 6) {
> + /* Note that the instruction's argument order is reversed from GLSL
> + * and the IR.
> + */
> + emit(LRP(dst,
> + fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
> + } else {
> + /* Earlier generations don't support three source operations, so we
> + * need to emit x*(1-a) + y*a.
> + */
> + dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
> + dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
> + dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
> + y_times_a.writemask = dst.writemask;
> + one_minus_a.writemask = dst.writemask;
> + x_times_one_minus_a.writemask = dst.writemask;
> +
> + emit(MUL(y_times_a, y, a));
> + emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
> + emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
> + emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
> + }
> +}
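> +/* The pre-Gen6 expansion is just the textbook identity
> + * lrp(x, y, a) = x * (1 - a) + y * a, spread over three temporaries so that
> + * each step stays a two-source operation.
> + */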
> +
> +void
> +vec4_god::visit(ir_expression *ir)
> +{
> + unsigned int operand;
> + src_reg op[ARRAY_SIZE(ir->operands)];
> + vec4_instruction *inst;
> +
> + if (ir->operation == ir_binop_add) {
> + if (try_emit_mad(ir))
> + return;
> + }
> +
> + if (ir->operation == ir_unop_b2f) {
> + if (try_emit_b2f_of_compare(ir))
> + return;
> + }
> +
> + /* Storage for our result. Ideally for an assignment we'd be using
> + * the actual storage for the result here, instead.
> + */
> + dst_reg result_dst(this, ir->type);
> + src_reg result_src(result_dst);
> +
> + if (ir->operation == ir_triop_csel) {
> + ir->operands[1]->accept(this);
> + op[1] = this->result;
> + ir->operands[2]->accept(this);
> + op[2] = this->result;
> +
> + enum brw_predicate predicate;
> + emit_bool_to_cond_code(ir->operands[0], &predicate);
> + inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
> + inst->predicate = predicate;
> + this->result = result_src;
> + return;
> + }
> +
> + for (operand = 0; operand < ir->get_num_operands(); operand++) {
> + this->result.file = BAD_FILE;
> + ir->operands[operand]->accept(this);
> + if (this->result.file == BAD_FILE) {
> + fprintf(stderr, "Failed to get tree for expression operand:\n");
> + ir->operands[operand]->fprint(stderr);
> + exit(1);
> + }
> + op[operand] = this->result;
> +
> + /* Matrix expression operands should have been broken down to vector
> + * operations already.
> + */
> + assert(!ir->operands[operand]->type->is_matrix());
> + }
> +
> + /* If nothing special happens, this is the result. */
> + this->result = result_src;
> +
> + switch (ir->operation) {
> + case ir_unop_logic_not:
> + emit(NOT(result_dst, op[0]));
> + break;
> + case ir_unop_neg:
> + op[0].negate = !op[0].negate;
> + emit(MOV(result_dst, op[0]));
> + break;
> + case ir_unop_abs:
> + op[0].abs = true;
> + op[0].negate = false;
> + emit(MOV(result_dst, op[0]));
> + break;
> +
> + case ir_unop_sign:
> + if (ir->type->is_float()) {
> + /* AND(val, 0x80000000) gives the sign bit.
> + *
> + * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
> + * zero.
> + */
> + emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> +
> + op[0].type = BRW_REGISTER_TYPE_UD;
> + result_dst.type = BRW_REGISTER_TYPE_UD;
> + emit(AND(result_dst, op[0], src_reg(0x80000000u)));
> +
> + inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> +
> + this->result.type = BRW_REGISTER_TYPE_F;
> + } else {
> + /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
> + * -> non-negative val generates 0x00000000.
> + * Predicated OR sets 1 if val is positive.
> + */
> + emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
> +
> + emit(ASR(result_dst, op[0], src_reg(31)));
> +
> + inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + }
> + break;
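> + /* Worked example for the float path: for sign(-3.5f) the AND keeps just
> + * the sign bit (0x80000000), and since the CMP found the value nonzero
> + * the predicated OR merges in 1.0f (0x3f800000), giving 0xbf800000 ==
> + * -1.0f; for sign(0.0f) the OR is skipped and the result stays 0.
> + */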
> +
> + case ir_unop_rcp:
> + emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
> + break;
> +
> + case ir_unop_exp2:
> + emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
> + break;
> + case ir_unop_log2:
> + emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
> + break;
> + case ir_unop_exp:
> + case ir_unop_log:
> + unreachable("not reached: should be handled by ir_explog_to_explog2");
> + case ir_unop_sin:
> + case ir_unop_sin_reduced:
> + emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
> + break;
> + case ir_unop_cos:
> + case ir_unop_cos_reduced:
> + emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
> + break;
> +
> + case ir_unop_dFdx:
> + case ir_unop_dFdx_coarse:
> + case ir_unop_dFdx_fine:
> + case ir_unop_dFdy:
> + case ir_unop_dFdy_coarse:
> + case ir_unop_dFdy_fine:
> + unreachable("derivatives not valid in vertex shader");
> +
> + case ir_unop_bitfield_reverse:
> + emit(BFREV(result_dst, op[0]));
> + break;
> + case ir_unop_bit_count:
> + emit(CBIT(result_dst, op[0]));
> + break;
> + case ir_unop_find_msb: {
> + src_reg temp = src_reg(this, glsl_type::uint_type);
> +
> + inst = emit(FBH(dst_reg(temp), op[0]));
> + inst->dst.writemask = WRITEMASK_XYZW;
> +
> + /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
> + * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
> + * subtract the result from 31 to convert the MSB count into an LSB count.
> + */
> +
> + /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
> + temp.swizzle = BRW_SWIZZLE_NOOP;
> + emit(MOV(result_dst, temp));
> +
> + src_reg src_tmp = src_reg(result_dst);
> + emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
> +
> + src_tmp.negate = true;
> + inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + break;
> + }
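> + /* Assuming FBH returns the bit position counted from the MSB side (i.e.
> + * the number of leading zeros), findMSB(0x00010000) goes FBH -> 15, then
> + * 31 - 15 = 16; an input of 0 keeps FBH's 0xFFFFFFFF (-1 as D) because
> + * the predicated ADD is skipped.
> + */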
> + case ir_unop_find_lsb:
> + emit(FBL(result_dst, op[0]));
> + break;
> + case ir_unop_saturate:
> + inst = emit(MOV(result_dst, op[0]));
> + inst->saturate = true;
> + break;
> +
> + case ir_unop_noise:
> + unreachable("not reached: should be handled by lower_noise");
> +
> + case ir_binop_add:
> + emit(ADD(result_dst, op[0], op[1]));
> + break;
> + case ir_binop_sub:
> + unreachable("not reached: should be handled by ir_sub_to_add_neg");
> +
> + case ir_binop_mul:
> + if (brw->gen < 8 && ir->type->is_integer()) {
> + /* For integer multiplication, the MUL uses the low 16 bits of one of
> + * the operands (src0 through SNB, src1 on IVB and later). The MACH
> + * accumulates the contribution of the upper 16 bits of that
> + * operand. If we can determine that one of the args is in the low
> + * 16 bits, though, we can just emit a single MUL.
> + */
> + if (ir->operands[0]->is_uint16_constant()) {
> + if (brw->gen < 7)
> + emit(MUL(result_dst, op[0], op[1]));
> + else
> + emit(MUL(result_dst, op[1], op[0]));
> + } else if (ir->operands[1]->is_uint16_constant()) {
> + if (brw->gen < 7)
> + emit(MUL(result_dst, op[1], op[0]));
> + else
> + emit(MUL(result_dst, op[0], op[1]));
> + } else {
> + struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> +
> + emit(MUL(acc, op[0], op[1]));
> + emit(MACH(dst_null_d(), op[0], op[1]));
> + emit(MOV(result_dst, src_reg(acc)));
> + }
> + } else {
> + emit(MUL(result_dst, op[0], op[1]));
> + }
> + break;
> + case ir_binop_imul_high: {
> + struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> +
> + emit(MUL(acc, op[0], op[1]));
> + emit(MACH(result_dst, op[0], op[1]));
> + break;
> + }
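> + /* Both multiply paths use the same accumulator trick: MUL leaves a
> + * partial product in the accumulator, MACH completes it and writes the
> + * high 32 bits to its destination (discarded for the plain multiply
> + * above, kept for imul_high), while the low 32 bits of the full product
> + * end up in the accumulator and are read back with a MOV when that is
> + * what the expression wants.
> + */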
> + case ir_binop_div:
> + /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
> + assert(ir->type->is_integer());
> + emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
> + break;
> + case ir_binop_carry: {
> + struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
> +
> + emit(ADDC(dst_null_ud(), op[0], op[1]));
> + emit(MOV(result_dst, src_reg(acc)));
> + break;
> + }
> + case ir_binop_borrow: {
> + struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
> +
> + emit(SUBB(dst_null_ud(), op[0], op[1]));
> + emit(MOV(result_dst, src_reg(acc)));
> + break;
> + }
> + case ir_binop_mod:
> + /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
> + assert(ir->type->is_integer());
> + emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
> + break;
> +
> + case ir_binop_less:
> + case ir_binop_greater:
> + case ir_binop_lequal:
> + case ir_binop_gequal:
> + case ir_binop_equal:
> + case ir_binop_nequal: {
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(ir->operands[0], &op[0]);
> + resolve_bool_comparison(ir->operands[1], &op[1]);
> + }
> + emit(CMP(result_dst, op[0], op[1],
> + brw_conditional_for_comparison(ir->operation)));
> + break;
> + }
> +
> + case ir_binop_all_equal:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(ir->operands[0], &op[0]);
> + resolve_bool_comparison(ir->operands[1], &op[1]);
> + }
> +
> + /* "==" operator producing a scalar boolean. */
> + if (ir->operands[0]->type->is_vector() ||
> + ir->operands[1]->type->is_vector()) {
> + emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> + emit(MOV(result_dst, src_reg(0)));
> + inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> + inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
> + } else {
> + emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
> + }
> + break;
> + case ir_binop_any_nequal:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(ir->operands[0], &op[0]);
> + resolve_bool_comparison(ir->operands[1], &op[1]);
> + }
> +
> + /* "!=" operator producing a scalar boolean. */
> + if (ir->operands[0]->type->is_vector() ||
> + ir->operands[1]->type->is_vector()) {
> + emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> +
> + emit(MOV(result_dst, src_reg(0)));
> + inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> + inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> + } else {
> + emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
> + }
> + break;
> +
> + case ir_unop_any:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(ir->operands[0], &op[0]);
> + }
> + emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> + emit(MOV(result_dst, src_reg(0)));
> +
> + inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> + inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> + break;
> +
> + case ir_binop_logic_xor:
> + emit(XOR(result_dst, op[0], op[1]));
> + break;
> +
> + case ir_binop_logic_or:
> + emit(OR(result_dst, op[0], op[1]));
> + break;
> +
> + case ir_binop_logic_and:
> + emit(AND(result_dst, op[0], op[1]));
> + break;
> +
> + case ir_binop_dot:
> + assert(ir->operands[0]->type->is_vector());
> + assert(ir->operands[0]->type == ir->operands[1]->type);
> + emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
> + break;
> +
> + case ir_unop_sqrt:
> + emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
> + break;
> + case ir_unop_rsq:
> + emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
> + break;
> +
> + case ir_unop_bitcast_i2f:
> + case ir_unop_bitcast_u2f:
> + this->result = op[0];
> + this->result.type = BRW_REGISTER_TYPE_F;
> + break;
> +
> + case ir_unop_bitcast_f2i:
> + this->result = op[0];
> + this->result.type = BRW_REGISTER_TYPE_D;
> + break;
> +
> + case ir_unop_bitcast_f2u:
> + this->result = op[0];
> + this->result.type = BRW_REGISTER_TYPE_UD;
> + break;
> +
> + case ir_unop_i2f:
> + case ir_unop_i2u:
> + case ir_unop_u2i:
> + case ir_unop_u2f:
> + case ir_unop_f2i:
> + case ir_unop_f2u:
> + emit(MOV(result_dst, op[0]));
> + break;
> + case ir_unop_b2i:
> + emit(AND(result_dst, op[0], src_reg(1)));
> + break;
> + case ir_unop_b2f:
> + if (brw->gen <= 5) {
> + resolve_bool_comparison(ir->operands[0], &op[0]);
> + }
> + op[0].type = BRW_REGISTER_TYPE_D;
> + result_dst.type = BRW_REGISTER_TYPE_D;
> + emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
> + result_dst.type = BRW_REGISTER_TYPE_F;
> + break;
> + case ir_unop_f2b:
> + emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> + break;
> + case ir_unop_i2b:
> + emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> + break;
> +
> + case ir_unop_trunc:
> + emit(RNDZ(result_dst, op[0]));
> + break;
> + case ir_unop_ceil: {
> + src_reg tmp = src_reg(this, ir->type);
> + op[0].negate = !op[0].negate;
> + emit(RNDD(dst_reg(tmp), op[0]));
> + tmp.negate = true;
> + emit(MOV(result_dst, tmp));
> + }
> + break;
> + case ir_unop_floor:
> + inst = emit(RNDD(result_dst, op[0]));
> + break;
> + case ir_unop_fract:
> + inst = emit(FRC(result_dst, op[0]));
> + break;
> + case ir_unop_round_even:
> + emit(RNDE(result_dst, op[0]));
> + break;
> +
> + case ir_binop_min:
> + emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
> + break;
> + case ir_binop_max:
> + emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
> + break;
> +
> + case ir_binop_pow:
> + emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
> + break;
> +
> + case ir_unop_bit_not:
> + inst = emit(NOT(result_dst, op[0]));
> + break;
> + case ir_binop_bit_and:
> + inst = emit(AND(result_dst, op[0], op[1]));
> + break;
> + case ir_binop_bit_xor:
> + inst = emit(XOR(result_dst, op[0], op[1]));
> + break;
> + case ir_binop_bit_or:
> + inst = emit(OR(result_dst, op[0], op[1]));
> + break;
> +
> + case ir_binop_lshift:
> + inst = emit(SHL(result_dst, op[0], op[1]));
> + break;
> +
> + case ir_binop_rshift:
> + if (ir->type->base_type == GLSL_TYPE_INT)
> + inst = emit(ASR(result_dst, op[0], op[1]));
> + else
> + inst = emit(SHR(result_dst, op[0], op[1]));
> + break;
> +
> + case ir_binop_bfm:
> + emit(BFI1(result_dst, op[0], op[1]));
> + break;
> +
> + case ir_binop_ubo_load: {
> + ir_constant *const_uniform_block = ir->operands[0]->as_constant();
> + ir_constant *const_offset_ir = ir->operands[1]->as_constant();
> + unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
> + src_reg offset;
> +
> + /* Now, load the vector from that offset. */
> + assert(ir->type->is_vector() || ir->type->is_scalar());
> +
> + src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
> + packed_consts.type = result.type;
> + src_reg surf_index;
> +
> + if (const_uniform_block) {
> + /* The block index is a constant, so just emit the binding table entry
> + * as an immediate.
> + */
> + surf_index = src_reg(prog_data->base.binding_table.ubo_start +
> + const_uniform_block->value.u[0]);
> + } else {
> + /* The block index is not a constant. Evaluate the index expression
> + * per-channel and add the base UBO index; the generator will select
> + * a value from any live channel.
> + */
> + surf_index = src_reg(this, glsl_type::uint_type);
> + emit(ADD(dst_reg(surf_index), op[0],
> + src_reg(prog_data->base.binding_table.ubo_start)));
> +
> + /* Assume this may touch any UBO. It would be nice to provide
> + * a tighter bound, but the array information is already lowered away.
> + */
> + brw_mark_surface_used(&prog_data->base,
> + prog_data->base.binding_table.ubo_start +
> + shader_prog->NumUniformBlocks - 1);
> + }
> +
> + if (const_offset_ir) {
> + if (brw->gen >= 8) {
> + /* Store the offset in a GRF so we can send-from-GRF. */
> + offset = src_reg(this, glsl_type::int_type);
> + emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
> + } else {
> + /* Immediates are fine on older generations since they'll be moved
> + * to a (potentially fake) MRF at the generator level.
> + */
> + offset = src_reg(const_offset / 16);
> + }
> + } else {
> + offset = src_reg(this, glsl_type::uint_type);
> + emit(SHR(dst_reg(offset), op[1], src_reg(4)));
> + }
> +
> + if (brw->gen >= 7) {
> + dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
> +
> + /* We have to use a message header on Skylake to get SIMD4x2 mode.
> + * Reserve space for the register.
> + */
> + if (brw->gen >= 9) {
> + grf_offset.reg_offset++;
> + alloc.sizes[grf_offset.reg] = 2;
> + }
> +
> + grf_offset.type = offset.type;
> +
> + emit(MOV(grf_offset, offset));
> +
> + vec4_instruction *pull =
> + emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
> + dst_reg(packed_consts),
> + surf_index,
> + src_reg(grf_offset)));
> + pull->mlen = 1;
> + } else {
> + vec4_instruction *pull =
> + emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
> + dst_reg(packed_consts),
> + surf_index,
> + offset));
> + pull->base_mrf = 14;
> + pull->mlen = 1;
> + }
> +
> + packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> + packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
> + const_offset % 16 / 4,
> + const_offset % 16 / 4,
> + const_offset % 16 / 4);
> +
> + /* UBO bools are any nonzero int. We need to convert them to use the
> + * value of true stored in ctx->Const.UniformBooleanTrue.
> + */
> + if (ir->type->base_type == GLSL_TYPE_BOOL) {
> + emit(CMP(result_dst, packed_consts, src_reg(0u),
> + BRW_CONDITIONAL_NZ));
> + } else {
> + emit(MOV(result_dst, packed_consts));
> + }
> + break;
> + }
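> + /* The swizzle arithmetic above turns the vec4-aligned pull into a
> + * component select: e.g. a float UBO member at byte offset 20 reads the
> + * vec4 at offset 16, and adding BRW_SWIZZLE4(1,1,1,1) to the XXXX
> + * swizzle shifts it to YYYY so the right dword gets replicated into the
> + * result.
> + */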
> +
> + case ir_binop_vector_extract:
> + unreachable("should have been lowered by vec_index_to_cond_assign");
> +
> + case ir_triop_fma:
> + op[0] = fix_3src_operand(op[0]);
> + op[1] = fix_3src_operand(op[1]);
> + op[2] = fix_3src_operand(op[2]);
> + /* Note that the instruction's argument order is reversed from GLSL
> + * and the IR.
> + */
> + emit(MAD(result_dst, op[2], op[1], op[0]));
> + break;
> +
> + case ir_triop_lrp:
> + emit_lrp(result_dst, op[0], op[1], op[2]);
> + break;
> +
> + case ir_triop_csel:
> + unreachable("already handled above");
> + break;
> +
> + case ir_triop_bfi:
> + op[0] = fix_3src_operand(op[0]);
> + op[1] = fix_3src_operand(op[1]);
> + op[2] = fix_3src_operand(op[2]);
> + emit(BFI2(result_dst, op[0], op[1], op[2]));
> + break;
> +
> + case ir_triop_bitfield_extract:
> + op[0] = fix_3src_operand(op[0]);
> + op[1] = fix_3src_operand(op[1]);
> + op[2] = fix_3src_operand(op[2]);
> + /* Note that the instruction's argument order is reversed from GLSL
> + * and the IR.
> + */
> + emit(BFE(result_dst, op[2], op[1], op[0]));
> + break;
> +
> + case ir_triop_vector_insert:
> + unreachable("should have been lowered by lower_vector_insert");
> +
> + case ir_quadop_bitfield_insert:
> + unreachable("not reached: should be handled by "
> + "bitfield_insert_to_bfm_bfi\n");
> +
> + case ir_quadop_vector:
> + unreachable("not reached: should be handled by lower_quadop_vector");
> +
> + case ir_unop_pack_half_2x16:
> + emit_pack_half_2x16(result_dst, op[0]);
> + break;
> + case ir_unop_unpack_half_2x16:
> + emit_unpack_half_2x16(result_dst, op[0]);
> + break;
> + case ir_unop_unpack_unorm_4x8:
> + emit_unpack_unorm_4x8(result_dst, op[0]);
> + break;
> + case ir_unop_unpack_snorm_4x8:
> + emit_unpack_snorm_4x8(result_dst, op[0]);
> + break;
> + case ir_unop_pack_unorm_4x8:
> + emit_pack_unorm_4x8(result_dst, op[0]);
> + break;
> + case ir_unop_pack_snorm_4x8:
> + emit_pack_snorm_4x8(result_dst, op[0]);
> + break;
> + case ir_unop_pack_snorm_2x16:
> + case ir_unop_pack_unorm_2x16:
> + case ir_unop_unpack_snorm_2x16:
> + case ir_unop_unpack_unorm_2x16:
> + unreachable("not reached: should be handled by lower_packing_builtins");
> + case ir_unop_unpack_half_2x16_split_x:
> + case ir_unop_unpack_half_2x16_split_y:
> + case ir_binop_pack_half_2x16_split:
> + case ir_unop_interpolate_at_centroid:
> + case ir_binop_interpolate_at_sample:
> + case ir_binop_interpolate_at_offset:
> + unreachable("not reached: should not occur in vertex shader");
> + case ir_binop_ldexp:
> + unreachable("not reached: should be handled by ldexp_to_arith()");
> + case ir_unop_d2f:
> + case ir_unop_f2d:
> + case ir_unop_d2i:
> + case ir_unop_i2d:
> + case ir_unop_d2u:
> + case ir_unop_u2d:
> + case ir_unop_d2b:
> + case ir_unop_pack_double_2x32:
> + case ir_unop_unpack_double_2x32:
> + case ir_unop_frexp_sig:
> + case ir_unop_frexp_exp:
> + unreachable("fp64 todo");
> + }
> +}
> +
> +
> +void
> +vec4_god::visit(ir_swizzle *ir)
> +{
> + /* Note that this is only swizzles in expressions, not those on the left
> + * hand side of an assignment, which do write masking. See ir_assignment
> + * for that.
> + */
> + const unsigned swz = brw_compose_swizzle(
> + brw_swizzle_for_size(ir->type->vector_elements),
> + BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
> +
> + ir->val->accept(this);
> + this->result = swizzle(this->result, swz);
> +}
> +
> +void
> +vec4_god::visit(ir_dereference_variable *ir)
> +{
> + const struct glsl_type *type = ir->type;
> + dst_reg *reg = variable_storage(ir->var);
> +
> + if (!reg) {
> + fail("Failed to find variable storage for %s\n", ir->var->name);
> + this->result = src_reg(brw_null_reg());
> + return;
> + }
> +
> + this->result = src_reg(*reg);
> +
> + /* System values get their swizzle from the dst_reg writemask */
> + if (ir->var->data.mode == ir_var_system_value)
> + return;
> +
> + if (type->is_scalar() || type->is_vector() || type->is_matrix())
> + this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
> +}
> +
> +
> +int
> +vec4_god::compute_array_stride(ir_dereference_array *ir)
> +{
> + /* Under normal circumstances array elements are stored consecutively, so
> + * the stride is equal to the size of the array element.
> + */
> + return type_size(ir->type);
> +}
> +
> +
> +void
> +vec4_god::visit(ir_dereference_array *ir)
> +{
> + ir_constant *constant_index;
> + src_reg src;
> + int array_stride = compute_array_stride(ir);
> +
> + constant_index = ir->array_index->constant_expression_value();
> +
> + ir->array->accept(this);
> + src = this->result;
> +
> + if (constant_index) {
> + src.reg_offset += constant_index->value.i[0] * array_stride;
> + } else {
> + /* Variable index array dereference. It eats the "vec4" of the
> + * base of the array and an index that offsets the Mesa register
> + * index.
> + */
> + ir->array_index->accept(this);
> +
> + src_reg index_reg;
> +
> + if (array_stride == 1) {
> + index_reg = this->result;
> + } else {
> + index_reg = src_reg(this, glsl_type::int_type);
> +
> + emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
> + }
> +
> + if (src.reladdr) {
> + src_reg temp = src_reg(this, glsl_type::int_type);
> +
> + emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
> +
> + index_reg = temp;
> + }
> +
> + src.reladdr = ralloc(mem_ctx, src_reg);
> + memcpy(src.reladdr, &index_reg, sizeof(index_reg));
> + }
> +
> + /* If the type is smaller than a vec4, replicate the last channel out. */
> + if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
> + src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> + else
> + src.swizzle = BRW_SWIZZLE_NOOP;
> + src.type = brw_type_for_base_type(ir->type);
> +
> + this->result = src;
> +}
> +
> +void
> +vec4_god::visit(ir_dereference_record *ir)
> +{
> + unsigned int i;
> + const glsl_type *struct_type = ir->record->type;
> + int offset = 0;
> +
> + ir->record->accept(this);
> +
> + for (i = 0; i < struct_type->length; i++) {
> + if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
> + break;
> + offset += type_size(struct_type->fields.structure[i].type);
> + }
> +
> + /* If the type is smaller than a vec4, replicate the last channel out. */
> + if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
> + this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> + else
> + this->result.swizzle = BRW_SWIZZLE_NOOP;
> + this->result.type = brw_type_for_base_type(ir->type);
> +
> + this->result.reg_offset += offset;
> +}
> +
> +/**
> + * We want to be careful in assignment setup to hit the actual storage
> + * instead of potentially using a temporary like we might with the
> + * ir_dereference handler.
> + */
> +static dst_reg
> +get_assignment_lhs(ir_dereference *ir, vec4_god *v)
> +{
> + /* The LHS must be a dereference. If the LHS is a variable indexed array
> + * access of a vector, it must be separated into a series of conditional
> + * moves
> + * before reaching this point (see ir_vec_index_to_cond_assign).
> + */
> + assert(ir->as_dereference());
> + ir_dereference_array *deref_array = ir->as_dereference_array();
> + if (deref_array) {
> + assert(!deref_array->array->type->is_vector());
> + }
> +
> + /* Use the rvalue deref handler for the most part. We'll ignore
> + * swizzles in it and write swizzles using writemask, though.
> + */
> + ir->accept(v);
> + return dst_reg(v->result);
> +}
> +
> +void
> +vec4_god::emit_block_move(dst_reg *dst, src_reg *src,
> + const struct glsl_type *type,
> + enum brw_predicate predicate)
> +{
> + if (type->base_type == GLSL_TYPE_STRUCT) {
> + for (unsigned int i = 0; i < type->length; i++) {
> + emit_block_move(dst, src, type->fields.structure[i].type, predicate);
> + }
> + return;
> + }
> +
> + if (type->is_array()) {
> + for (unsigned int i = 0; i < type->length; i++) {
> + emit_block_move(dst, src, type->fields.array, predicate);
> + }
> + return;
> + }
> +
> + if (type->is_matrix()) {
> + const struct glsl_type *vec_type;
> +
> + vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
> + type->vector_elements, 1);
> +
> + for (int i = 0; i < type->matrix_columns; i++) {
> + emit_block_move(dst, src, vec_type, predicate);
> + }
> + return;
> + }
> +
> + assert(type->is_scalar() || type->is_vector());
> +
> + dst->type = brw_type_for_base_type(type);
> + src->type = dst->type;
> +
> + dst->writemask = (1 << type->vector_elements) - 1;
> +
> + src->swizzle = brw_swizzle_for_size(type->vector_elements);
> +
> + vec4_instruction *inst = emit(MOV(*dst, *src));
> + inst->predicate = predicate;
> +
> + dst->reg_offset++;
> + src->reg_offset++;
> +}
> +
> +
> +/* If the RHS processing resulted in an instruction generating a
> + * temporary value, and it would be easy to rewrite the instruction to
> + * generate its result right into the LHS instead, do so. This ends
> + * up reliably removing instructions where it can be tricky to do so
> + * later without real UD chain information.
> + */
> +bool
> +vec4_god::try_rewrite_rhs_to_dst(ir_assignment *ir,
> + dst_reg dst,
> + src_reg src,
> + vec4_instruction *pre_rhs_inst,
> + vec4_instruction *last_rhs_inst)
> +{
> + /* This could be supported, but it would take more smarts. */
> + if (ir->condition)
> + return false;
> +
> + if (pre_rhs_inst == last_rhs_inst)
> + return false; /* No instructions generated to work with. */
> +
> + /* Make sure the last instruction generated our source reg. */
> + if (src.file != GRF ||
> + src.file != last_rhs_inst->dst.file ||
> + src.reg != last_rhs_inst->dst.reg ||
> + src.reg_offset != last_rhs_inst->dst.reg_offset ||
> + src.reladdr ||
> + src.abs ||
> + src.negate ||
> + last_rhs_inst->predicate != BRW_PREDICATE_NONE)
> + return false;
> +
> + /* Check that the last instruction fully initialized the channels
> + * we want to use, in the order we want to use them. We could
> + * potentially reswizzle the operands of many instructions so that
> + * we could handle out of order channels, but don't yet.
> + */
> +
> + for (unsigned i = 0; i < 4; i++) {
> + if (dst.writemask & (1 << i)) {
> + if (!(last_rhs_inst->dst.writemask & (1 << i)))
> + return false;
> +
> + if (BRW_GET_SWZ(src.swizzle, i) != i)
> + return false;
> + }
> + }
> +
> + /* Success! Rewrite the instruction. */
> + last_rhs_inst->dst.file = dst.file;
> + last_rhs_inst->dst.reg = dst.reg;
> + last_rhs_inst->dst.reg_offset = dst.reg_offset;
> + last_rhs_inst->dst.reladdr = dst.reladdr;
> + last_rhs_inst->dst.writemask &= dst.writemask;
> +
> + return true;
> +}
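> +/* When this succeeds, e.g. for "v.xyzw = a + b", the ADD that produced the
> + * RHS temporary is retargeted to write v directly and the caller skips the
> + * copy MOVs it would otherwise emit.
> + */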
> +
> +void
> +vec4_god::visit(ir_assignment *ir)
> +{
> + dst_reg dst = get_assignment_lhs(ir->lhs, this);
> + enum brw_predicate predicate = BRW_PREDICATE_NONE;
> +
> + if (!ir->lhs->type->is_scalar() &&
> + !ir->lhs->type->is_vector()) {
> + ir->rhs->accept(this);
> + src_reg src = this->result;
> +
> + if (ir->condition) {
> + emit_bool_to_cond_code(ir->condition, &predicate);
> + }
> +
> + /* emit_block_move doesn't account for swizzles in the source register.
> + * This should be ok, since the source register is a structure or an
> + * array, and those can't be swizzled. But double-check to be sure.
> + */
> + assert(src.swizzle ==
> + (ir->rhs->type->is_matrix()
> + ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
> + : BRW_SWIZZLE_NOOP));
> +
> + emit_block_move(&dst, &src, ir->rhs->type, predicate);
> + return;
> + }
> +
> + /* Now we're down to just a scalar/vector with writemasks. */
> + int i;
> +
> + vec4_instruction *pre_rhs_inst, *last_rhs_inst;
> + pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
> +
> + ir->rhs->accept(this);
> +
> + last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
> +
> + int swizzles[4];
> + int src_chan = 0;
> +
> + assert(ir->lhs->type->is_vector() ||
> + ir->lhs->type->is_scalar());
> + dst.writemask = ir->write_mask;
> +
> + /* Swizzle a small RHS vector into the channels being written.
> + *
> + * glsl ir treats write_mask as dictating how many channels are
> + * present on the RHS while in our instructions we need to make
> + * those channels appear in the slots of the vec4 they're written to.
> + */
> + for (int i = 0; i < 4; i++)
> + swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
> +
> + src_reg src = swizzle(this->result,
> + BRW_SWIZZLE4(swizzles[0], swizzles[1],
> + swizzles[2], swizzles[3]));
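> + /* e.g. for "v.zw = foo" the write_mask is 0xc, so swizzles[] ends up
> + * {0, 0, 0, 1} and the RHS .xy channels are routed into .zw of the
> + * destination vec4.
> + */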
> +
> + if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
> + return;
> + }
> +
> + if (ir->condition) {
> + emit_bool_to_cond_code(ir->condition, &predicate);
> + }
> +
> + for (i = 0; i < type_size(ir->lhs->type); i++) {
> + vec4_instruction *inst = emit(MOV(dst, src));
> + inst->predicate = predicate;
> +
> + dst.reg_offset++;
> + src.reg_offset++;
> + }
> +}
> +
> +void
> +vec4_god::emit_constant_values(dst_reg *dst, ir_constant *ir)
> +{
> + if (ir->type->base_type == GLSL_TYPE_STRUCT) {
> + foreach_in_list(ir_constant, field_value, &ir->components) {
> + emit_constant_values(dst, field_value);
> + }
> + return;
> + }
> +
> + if (ir->type->is_array()) {
> + for (unsigned int i = 0; i < ir->type->length; i++) {
> + emit_constant_values(dst, ir->array_elements[i]);
> + }
> + return;
> + }
> +
> + if (ir->type->is_matrix()) {
> + for (int i = 0; i < ir->type->matrix_columns; i++) {
> + float *vec = &ir->value.f[i * ir->type->vector_elements];
> +
> + for (int j = 0; j < ir->type->vector_elements; j++) {
> + dst->writemask = 1 << j;
> + dst->type = BRW_REGISTER_TYPE_F;
> +
> + emit(MOV(*dst, src_reg(vec[j])));
> + }
> + dst->reg_offset++;
> + }
> + return;
> + }
> +
> + int remaining_writemask = (1 << ir->type->vector_elements) - 1;
> +
> + for (int i = 0; i < ir->type->vector_elements; i++) {
> + if (!(remaining_writemask & (1 << i)))
> + continue;
> +
> + dst->writemask = 1 << i;
> + dst->type = brw_type_for_base_type(ir->type);
> +
> + /* Find other components that match the one we're about to
> + * write. Emits fewer instructions for things like vec4(0.5,
> + * 1.5, 1.5, 1.5).
> + */
> + for (int j = i + 1; j < ir->type->vector_elements; j++) {
> + if (ir->type->base_type == GLSL_TYPE_BOOL) {
> + if (ir->value.b[i] == ir->value.b[j])
> + dst->writemask |= (1 << j);
> + } else {
> + /* u, i, and f storage all line up, so no need for a
> + * switch case for comparing each type.
> + */
> + if (ir->value.u[i] == ir->value.u[j])
> + dst->writemask |= (1 << j);
> + }
> + }
> +
> + switch (ir->type->base_type) {
> + case GLSL_TYPE_FLOAT:
> + emit(MOV(*dst, src_reg(ir->value.f[i])));
> + break;
> + case GLSL_TYPE_INT:
> + emit(MOV(*dst, src_reg(ir->value.i[i])));
> + break;
> + case GLSL_TYPE_UINT:
> + emit(MOV(*dst, src_reg(ir->value.u[i])));
> + break;
> + case GLSL_TYPE_BOOL:
> + emit(MOV(*dst,
> + src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
> + : 0)));
> + break;
> + default:
> + unreachable("Non-float/uint/int/bool constant");
> + }
> +
> + remaining_writemask &= ~dst->writemask;
> + }
> + dst->reg_offset++;
> +}
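> +/* e.g. for vec4(0.5, 1.5, 1.5, 1.5) this emits just two MOVs: one with
> + * writemask .x loading 0.5f and one with writemask .yzw loading 1.5f.
> + */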
> +
> +void
> +vec4_god::visit(ir_constant *ir)
> +{
> + dst_reg dst = dst_reg(this, ir->type);
> + this->result = src_reg(dst);
> +
> + emit_constant_values(&dst, ir);
> +}
> +
> +void
> +vec4_god::visit_atomic_counter_intrinsic(ir_call *ir)
> +{
> + ir_dereference *deref = static_cast<ir_dereference *>(
> + ir->actual_parameters.get_head());
> + ir_variable *location = deref->variable_referenced();
> + unsigned surf_index = (prog_data->base.binding_table.abo_start +
> + location->data.binding);
> +
> + /* Calculate the surface offset */
> + src_reg offset(this, glsl_type::uint_type);
> + ir_dereference_array *deref_array = deref->as_dereference_array();
> + if (deref_array) {
> + deref_array->array_index->accept(this);
> +
> + src_reg tmp(this, glsl_type::uint_type);
> + emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
> + emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
> + } else {
> + offset = location->data.atomic.offset;
> + }
> +
> + /* Emit the appropriate machine instruction */
> + const char *callee = ir->callee->function_name();
> + dst_reg dst = get_assignment_lhs(ir->return_deref, this);
> +
> + if (!strcmp("__intrinsic_atomic_read", callee)) {
> + emit_untyped_surface_read(surf_index, dst, offset);
> +
> + } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
> + emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
> + src_reg(), src_reg());
> +
> + } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
> + emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
> + src_reg(), src_reg());
> + }
> +}
> +
> +void
> +vec4_god::visit(ir_call *ir)
> +{
> + const char *callee = ir->callee->function_name();
> +
> + if (!strcmp("__intrinsic_atomic_read", callee) ||
> + !strcmp("__intrinsic_atomic_increment", callee) ||
> + !strcmp("__intrinsic_atomic_predecrement", callee)) {
> + visit_atomic_counter_intrinsic(ir);
> + } else {
> + unreachable("Unsupported intrinsic.");
> + }
> +}
> +
> +src_reg
> +vec4_god::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
> +{
> + vec4_instruction *inst =
> + new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
> + dst_reg(this, glsl_type::uvec4_type));
> + inst->base_mrf = 2;
> + inst->mlen = 1;
> + inst->src[1] = sampler;
> +
> + /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
> + int param_base = inst->base_mrf;
> + int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
> + int zero_mask = 0xf & ~coord_mask;
> +
> + emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
> + coordinate));
> +
> + emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
> + src_reg(0)));
> +
> + emit(inst);
> + return src_reg(inst->dst);
> +}
> +
> +static bool
> +is_high_sampler(struct brw_context *brw, src_reg sampler)
> +{
> + if (brw->gen < 8 && !brw->is_haswell)
> + return false;
> +
> + return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
> +}
> +
> +void
> +vec4_god::visit(ir_texture *ir)
> +{
> + uint32_t sampler =
> + _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
> +
> + ir_rvalue *nonconst_sampler_index =
> + _mesa_get_sampler_array_nonconst_index(ir->sampler);
> +
> + /* Handle non-constant sampler array indexing */
> + src_reg sampler_reg;
> + if (nonconst_sampler_index) {
> + /* The highest sampler which may be used by this operation is
> + * the last element of the array. Mark it here, because the generator
> + * doesn't have enough information to determine the bound.
> + */
> + uint32_t array_size = ir->sampler->as_dereference_array()
> + ->array->type->array_size();
> +
> + uint32_t max_used = sampler + array_size - 1;
> + if (ir->op == ir_tg4 && brw->gen < 8) {
> + max_used += prog_data->base.binding_table.gather_texture_start;
> + } else {
> + max_used += prog_data->base.binding_table.texture_start;
> + }
> +
> + brw_mark_surface_used(&prog_data->base, max_used);
> +
> + /* Emit code to evaluate the actual indexing expression */
> + nonconst_sampler_index->accept(this);
> + dst_reg temp(this, glsl_type::uint_type);
> + emit(ADD(temp, this->result, src_reg(sampler)))
> + ->force_writemask_all = true;
> + sampler_reg = src_reg(temp);
> + } else {
> + /* Single sampler, or constant array index; the indexing expression
> + * is just an immediate.
> + */
> + sampler_reg = src_reg(sampler);
> + }
> +
> + /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
> + * emitting anything other than setting up the constant result.
> + */
> + if (ir->op == ir_tg4) {
> + ir_constant *chan = ir->lod_info.component->as_constant();
> + int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
> + if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
> + dst_reg result(this, ir->type);
> + this->result = src_reg(result);
> + emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
> + return;
> + }
> + }
> +
> + /* Should be lowered by do_lower_texture_projection */
> + assert(!ir->projector);
> +
> + /* Should be lowered */
> + assert(!ir->offset || !ir->offset->type->is_array());
> +
> + /* Generate code to compute all the subexpression trees. This has to be
> + * done before loading any values into MRFs for the sampler message since
> + * generating these values may involve SEND messages that need the MRFs.
> + */
> + src_reg coordinate;
> + if (ir->coordinate) {
> + ir->coordinate->accept(this);
> + coordinate = this->result;
> + }
> +
> + src_reg shadow_comparitor;
> + if (ir->shadow_comparitor) {
> + ir->shadow_comparitor->accept(this);
> + shadow_comparitor = this->result;
> + }
> +
> + bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
> + src_reg offset_value;
> + if (has_nonconstant_offset) {
> + ir->offset->accept(this);
> + offset_value = src_reg(this->result);
> + }
> +
> + const glsl_type *lod_type = NULL, *sample_index_type = NULL;
> + src_reg lod, dPdx, dPdy, sample_index, mcs;
> + switch (ir->op) {
> + case ir_tex:
> + lod = src_reg(0.0f);
> + lod_type = glsl_type::float_type;
> + break;
> + case ir_txf:
> + case ir_txl:
> + case ir_txs:
> + ir->lod_info.lod->accept(this);
> + lod = this->result;
> + lod_type = ir->lod_info.lod->type;
> + break;
> + case ir_query_levels:
> + lod = src_reg(0);
> + lod_type = glsl_type::int_type;
> + break;
> + case ir_txf_ms:
> + ir->lod_info.sample_index->accept(this);
> + sample_index = this->result;
> + sample_index_type = ir->lod_info.sample_index->type;
> +
> + if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
> + mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
> + else
> + mcs = src_reg(0u);
> + break;
> + case ir_txd:
> + ir->lod_info.grad.dPdx->accept(this);
> + dPdx = this->result;
> +
> + ir->lod_info.grad.dPdy->accept(this);
> + dPdy = this->result;
> +
> + lod_type = ir->lod_info.grad.dPdx->type;
> + break;
> + case ir_txb:
> + case ir_lod:
> + case ir_tg4:
> + break;
> + }
> +
> + enum opcode opcode;
> + switch (ir->op) {
> + case ir_tex: opcode = SHADER_OPCODE_TXL; break;
> + case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> + case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> + case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> + case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
> + case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> + case ir_tg4: opcode = has_nonconstant_offset
> + ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
> + case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
> + case ir_txb:
> + unreachable("TXB is not valid for vertex shaders.");
> + case ir_lod:
> + unreachable("LOD is not valid for vertex shaders.");
> + default:
> + unreachable("Unrecognized tex op");
> + }
> +
> + vec4_instruction *inst = new(mem_ctx) vec4_instruction(
> + opcode, dst_reg(this, ir->type));
> +
> + if (ir->offset != NULL && !has_nonconstant_offset) {
> + inst->offset =
> + brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
> + ir->offset->type->vector_elements);
> + }
> +
> + /* Stuff the channel select bits in the top of the texture offset */
> + if (ir->op == ir_tg4)
> + inst->offset |= gather_channel(ir, sampler) << 16;
> +
> + /* The message header is necessary for:
> + * - Gen4 (always)
> + * - Gen9+ for selecting SIMD4x2
> + * - Texel offsets
> + * - Gather channel selection
> + * - Sampler indices too large to fit in a 4-bit value.
> + */
> + inst->header_present =
> + brw->gen < 5 || brw->gen >= 9 ||
> + inst->offset != 0 || ir->op == ir_tg4 ||
> + is_high_sampler(brw, sampler_reg);
> + inst->base_mrf = 2;
> + inst->mlen = inst->header_present + 1; /* always at least one */
> + inst->dst.writemask = WRITEMASK_XYZW;
> + inst->shadow_compare = ir->shadow_comparitor != NULL;
> +
> + inst->src[1] = sampler_reg;
> +
> + /* MRF for the first parameter */
> + int param_base = inst->base_mrf + inst->header_present;
> +
> + if (ir->op == ir_txs || ir->op == ir_query_levels) {
> + int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
> + emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
> + } else {
> + /* Load the coordinate */
> + /* FINISHME: gl_clamp_mask and saturate */
> + int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
> + int zero_mask = 0xf & ~coord_mask;
> +
> + emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
> + coordinate));
> +
> + if (zero_mask != 0) {
> + emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
> + src_reg(0)));
> + }
> + /* Load the shadow comparitor */
> + if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
> + emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
> + WRITEMASK_X),
> + shadow_comparitor));
> + inst->mlen++;
> + }
> +
> + /* Load the LOD info */
> + if (ir->op == ir_tex || ir->op == ir_txl) {
> + int mrf, writemask;
> + if (brw->gen >= 5) {
> + mrf = param_base + 1;
> + if (ir->shadow_comparitor) {
> + writemask = WRITEMASK_Y;
> + /* mlen already incremented */
> + } else {
> + writemask = WRITEMASK_X;
> + inst->mlen++;
> + }
> + } else /* brw->gen == 4 */ {
> + mrf = param_base;
> + writemask = WRITEMASK_W;
> + }
> + emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
> + } else if (ir->op == ir_txf) {
> + emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
> + } else if (ir->op == ir_txf_ms) {
> + emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
> + sample_index));
> + if (brw->gen >= 7) {
> + /* MCS data is in the first channel of `mcs`, but we need to get it into
> + * the .y channel of the second vec4 of params, so replicate .x across
> + * the whole vec4 and then mask off everything except .y
> + */
> + mcs.swizzle = BRW_SWIZZLE_XXXX;
> + emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
> + mcs));
> + }
> + inst->mlen++;
> + } else if (ir->op == ir_txd) {
> + const glsl_type *type = lod_type;
> +
> + if (brw->gen >= 5) {
> + dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
> + dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
> + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
> + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
> + inst->mlen++;
> +
> + if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
> + dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
> + dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
> + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
> + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
> + inst->mlen++;
> +
> + if (ir->shadow_comparitor) {
> + emit(MOV(dst_reg(MRF, param_base + 2,
> + ir->shadow_comparitor->type, WRITEMASK_Z),
> + shadow_comparitor));
> + }
> + }
> + } else /* brw->gen == 4 */ {
> + emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
> + emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
> + inst->mlen += 2;
> + }
> + } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
> + if (ir->shadow_comparitor) {
> + emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
> + shadow_comparitor));
> + }
> +
> + emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
> + offset_value));
> + inst->mlen++;
> + }
> + }
> +
> + emit(inst);
> +
> + /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
> + * spec requires layers.
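> + * For example, a cube map array with 4 layers reports 6 * 4 = 24 in the
> + * .z component, so the INT_QUOTIENT by 6 below recovers 4.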
> + */
> + if (ir->op == ir_txs) {
> + glsl_type const *type = ir->sampler->type;
> + if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
> + type->sampler_array) {
> + emit_math(SHADER_OPCODE_INT_QUOTIENT,
> + writemask(inst->dst, WRITEMASK_Z),
> + src_reg(inst->dst), src_reg(6));
> + }
> + }
> +
> + if (brw->gen == 6 && ir->op == ir_tg4) {
> + emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
> + }
> +
> + swizzle_result(ir, src_reg(inst->dst), sampler);
> +}
> +
> +/**
> + * Apply workarounds for Gen6 gather with UINT/SINT
> + */
> +void
> +vec4_god::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
> +{
> + if (!wa)
> + return;
> +
> + int width = (wa & WA_8BIT) ? 8 : 16;
> + dst_reg dst_f = dst;
> + dst_f.type = BRW_REGISTER_TYPE_F;
> +
> + /* Convert from UNORM to UINT */
> + emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
> + emit(MOV(dst, src_reg(dst_f)));
> +
> + if (wa & WA_SIGN) {
> + /* Reinterpret the UINT value as a signed INT value by
> + * shifting the sign bit into place, then shifting back
> + * preserving sign.
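> + * For an 8-bit result this is a shift left then right by 24, i.e. a
> + * sign extension from 8 to 32 bits.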
> + */
> + emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
> + emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
> + }
> +}
> +
> +/**
> + * Set up the gather channel based on the swizzle, for gather4.
> + */
> +uint32_t
> +vec4_god::gather_channel(ir_texture *ir, uint32_t sampler)
> +{
> + ir_constant *chan = ir->lod_info.component->as_constant();
> + int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
> + switch (swiz) {
> + case SWIZZLE_X: return 0;
> + case SWIZZLE_Y:
> + /* gather4 sampler is broken for green channel on RG32F --
> + * we must ask for blue instead.
> + */
> + if (key->tex.gather_channel_quirk_mask & (1<<sampler))
> + return 2;
> + return 1;
> + case SWIZZLE_Z: return 2;
> + case SWIZZLE_W: return 3;
> + default:
> + unreachable("Not reached"); /* zero, one swizzles handled already */
> + }
> +}
> +
> +void
> +vec4_god::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
> +{
> + int s = key->tex.swizzles[sampler];
> +
> + this->result = src_reg(this, ir->type);
> + dst_reg swizzled_result(this->result);
> +
> + if (ir->op == ir_query_levels) {
> + /* # levels is in .w */
> + orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
> + emit(MOV(swizzled_result, orig_val));
> + return;
> + }
> +
> + if (ir->op == ir_txs || ir->type == glsl_type::float_type
> + || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
> + emit(MOV(swizzled_result, orig_val));
> + return;
> + }
> +
> +
> + int zero_mask = 0, one_mask = 0, copy_mask = 0;
> + int swizzle[4] = {0};
> +
> + for (int i = 0; i < 4; i++) {
> + switch (GET_SWZ(s, i)) {
> + case SWIZZLE_ZERO:
> + zero_mask |= (1 << i);
> + break;
> + case SWIZZLE_ONE:
> + one_mask |= (1 << i);
> + break;
> + default:
> + copy_mask |= (1 << i);
> + swizzle[i] = GET_SWZ(s, i);
> + break;
> + }
> + }
> +
> + if (copy_mask) {
> + orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
> + swizzled_result.writemask = copy_mask;
> + emit(MOV(swizzled_result, orig_val));
> + }
> +
> + if (zero_mask) {
> + swizzled_result.writemask = zero_mask;
> + emit(MOV(swizzled_result, src_reg(0.0f)));
> + }
> +
> + if (one_mask) {
> + swizzled_result.writemask = one_mask;
> + emit(MOV(swizzled_result, src_reg(1.0f)));
> + }
> +}
> +
> +void
> +vec4_god::visit(ir_return *)
> +{
> + unreachable("not reached");
> +}
> +
> +void
> +vec4_god::visit(ir_discard *)
> +{
> + unreachable("not reached");
> +}
> +
> +void
> +vec4_god::visit(ir_if *ir)
> +{
> + /* Don't point the annotation at the if statement, because then it plus
> + * the then and else blocks get printed.
> + */
> + this->base_ir = ir->condition;
> +
> + if (brw->gen == 6) {
> + emit_if_gen6(ir);
> + } else {
> + enum brw_predicate predicate;
> + emit_bool_to_cond_code(ir->condition, &predicate);
> + emit(IF(predicate));
> + }
> +
> + visit_instructions(&ir->then_instructions);
> +
> + if (!ir->else_instructions.is_empty()) {
> + this->base_ir = ir->condition;
> + emit(BRW_OPCODE_ELSE);
> +
> + visit_instructions(&ir->else_instructions);
> + }
> +
> + this->base_ir = ir->condition;
> + emit(BRW_OPCODE_ENDIF);
> +}
> +
> +void
> +vec4_god::visit(ir_emit_vertex *)
> +{
> + unreachable("not reached");
> +}
> +
> +void
> +vec4_god::visit(ir_end_primitive *)
> +{
> + unreachable("not reached");
> +}
> +
> +void
> +vec4_god::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
> + dst_reg dst, src_reg offset,
> + src_reg src0, src_reg src1)
> +{
> + unsigned mlen = 0;
> +
> + /* Set the atomic operation offset. */
> + emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
> + mlen++;
> +
> + /* Set the atomic operation arguments. */
> + if (src0.file != BAD_FILE) {
> + emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
> + mlen++;
> + }
> +
> + if (src1.file != BAD_FILE) {
> + emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
> + mlen++;
> + }
> +
> + /* Emit the instruction. Note that this maps to the normal SIMD8
> + * untyped atomic message on Ivy Bridge, but that's OK because
> + * unused channels will be masked out.
> + */
> + vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
> + src_reg(atomic_op), src_reg(surf_index));
> + inst->base_mrf = 0;
> + inst->mlen = mlen;
> +}
> +
> +void
> +vec4_god::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
> + src_reg offset)
> +{
> + /* Set the surface read offset. */
> + emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
> +
> + /* Emit the instruction. Note that this maps to the normal SIMD8
> + * untyped surface read message, but that's OK because unused
> + * channels will be masked out.
> + */
> + vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
> + dst, src_reg(surf_index));
> + inst->base_mrf = 0;
> + inst->mlen = 1;
> +}
> +
> +void
> +vec4_god::emit_ndc_computation()
> +{
> + /* Get the position */
> + src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
> +
> + /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
> + dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
> + output_reg[BRW_VARYING_SLOT_NDC] = ndc;
> +
> + current_annotation = "NDC";
> + dst_reg ndc_w = ndc;
> + ndc_w.writemask = WRITEMASK_W;
> + src_reg pos_w = pos;
> + pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
> + emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
> +
> + dst_reg ndc_xyz = ndc;
> + ndc_xyz.writemask = WRITEMASK_XYZ;
> +
> + emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
> +}
> +
> +void
> +vec4_god::emit_psiz_and_flags(dst_reg reg)
> +{
> + if (brw->gen < 6 &&
> + ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
> + key->userclip_active || brw->has_negative_rhw_bug)) {
> + dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
> + dst_reg header1_w = header1;
> + header1_w.writemask = WRITEMASK_W;
> +
> + emit(MOV(header1, 0u));
> +
> + if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
> + src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
> +
> + current_annotation = "Point size";
> + emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
> + emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
> + }
> +
> + if (key->userclip_active) {
> + current_annotation = "Clipping flags";
> + dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
> + dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
> +
> + emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
> + emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
> + emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
> +
> + emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
> + emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
> + emit(SHL(flags1, src_reg(flags1), src_reg(4)));
> + emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
> + }
> +
> + /* i965 clipping workaround:
> + * 1) Test for -ve rhw
> + * 2) If set,
> + * set ndc = (0,0,0,0)
> + * set ucp[6] = 1
> + *
> + * Later, clipping will detect ucp[6] and ensure the primitive is
> + * clipped against all fixed planes.
> + */
> + if (brw->has_negative_rhw_bug) {
> + src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
> + ndc_w.swizzle = BRW_SWIZZLE_WWWW;
> + emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
> + vec4_instruction *inst;
> + inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + }
> +
> + emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
> + } else if (brw->gen < 6) {
> + emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
> + } else {
> + emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
> + if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
> + dst_reg reg_w = reg;
> + reg_w.writemask = WRITEMASK_W;
> + emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
> + }
> + if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
> + dst_reg reg_y = reg;
> + reg_y.writemask = WRITEMASK_Y;
> + reg_y.type = BRW_REGISTER_TYPE_D;
> + emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
> + }
> + if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
> + dst_reg reg_z = reg;
> + reg_z.writemask = WRITEMASK_Z;
> + reg_z.type = BRW_REGISTER_TYPE_D;
> + emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
> + }
> + }
> +}
> +
> +void
> +vec4_god::emit_clip_distances(dst_reg reg, int offset)
> +{
> + /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
> + *
> + * "If a linked set of shaders forming the vertex stage contains no
> + * static write to gl_ClipVertex or gl_ClipDistance, but the
> + * application has requested clipping against user clip planes through
> + * the API, then the coordinate written to gl_Position is used for
> + * comparison against the user clip planes."
> + *
> + * This function is only called if the shader didn't write to
> + * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
> + * if the user wrote to it; otherwise we use gl_Position.
> + */
> + gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
> + if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
> + clip_vertex = VARYING_SLOT_POS;
> + }
> +
> + for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
> + ++i) {
> + reg.writemask = 1 << i;
> + emit(DP4(reg,
> + src_reg(output_reg[clip_vertex]),
> + src_reg(this->userplane[i + offset])));
> + }
> +}
> +
> +vec4_instruction *
> +vec4_god::emit_generic_urb_slot(dst_reg reg, int varying)
> +{
> + assert (varying < VARYING_SLOT_MAX);
> + reg.type = output_reg[varying].type;
> + current_annotation = output_reg_annotation[varying];
> + /* Copy the register, saturating if necessary */
> + return emit(MOV(reg, src_reg(output_reg[varying])));
> +}
> +
> +void
> +vec4_god::emit_urb_slot(dst_reg reg, int varying)
> +{
> + reg.type = BRW_REGISTER_TYPE_F;
> +
> + switch (varying) {
> + case VARYING_SLOT_PSIZ:
> + {
> + /* PSIZ is always in slot 0, and is coupled with other flags. */
> + current_annotation = "indices, point width, clip flags";
> + emit_psiz_and_flags(reg);
> + break;
> + }
> + case BRW_VARYING_SLOT_NDC:
> + current_annotation = "NDC";
> + emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
> + break;
> + case VARYING_SLOT_POS:
> + current_annotation = "gl_Position";
> + emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
> + break;
> + case VARYING_SLOT_EDGE:
> + /* This is present when doing unfilled polygons. We're supposed to copy
> + * the edge flag from the user-provided vertex array
> + * (glEdgeFlagPointer), or otherwise we'll copy from the current value
> + * of that attribute (starts as 1.0f). This is then used in clipping to
> + * determine which edges should be drawn as wireframe.
> + */
> + current_annotation = "edge flag";
> + emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
> + glsl_type::float_type, WRITEMASK_XYZW))));
> + break;
> + case BRW_VARYING_SLOT_PAD:
> + /* No need to write to this slot */
> + break;
> + case VARYING_SLOT_COL0:
> + case VARYING_SLOT_COL1:
> + case VARYING_SLOT_BFC0:
> + case VARYING_SLOT_BFC1: {
> + /* These built-in varyings are only supported in compatibility mode,
> + * and we only support GS in core profile. So, this must be a vertex
> + * shader.
> + */
> + assert(stage == MESA_SHADER_VERTEX);
> + vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
> + if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
> + inst->saturate = true;
> + break;
> + }
> +
> + default:
> + emit_generic_urb_slot(reg, varying);
> + break;
> + }
> +}
> +
> +static int
> +align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
> +{
> + if (brw->gen >= 6) {
> + /* URB data written (does not include the message header reg) must
> + * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
> + * section 5.4.3.2.2: URB_INTERLEAVED.
> + *
> + * URB entries are allocated on a multiple of 1024 bits, so an
> + * extra 128 bits written here to make the end align to 256 is
> + * no problem.
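> + *
> + * Since mlen counts the header plus the data, this means mlen must end
> + * up odd; e.g. a header plus three data registers (mlen 4) gets padded
> + * to mlen 5.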
> + */
> + if ((mlen % 2) != 1)
> + mlen++;
> + }
> +
> + return mlen;
> +}
> +
> +
> +/**
> + * Generates the VUE payload plus the necessary URB write instructions to
> + * output it.
> + *
> + * The VUE layout is documented in Volume 2a.
> + */
> +void
> +vec4_god::emit_vertex()
> +{
> + /* MRF 0 is reserved for the debugger, so start with message header
> + * in MRF 1.
> + */
> + int base_mrf = 1;
> + int mrf = base_mrf;
> + /* In the process of generating our URB write message contents, we
> + * may need to unspill a register or load from an array. Those
> + * reads would use MRFs 14-15.
> + */
> + int max_usable_mrf = 13;
> +
> + /* The following assertion verifies that max_usable_mrf causes an
> + * even-numbered amount of URB write data, which will meet gen6's
> + * requirements for length alignment.
> + */
> + assert ((max_usable_mrf - base_mrf) % 2 == 0);
> +
> + /* First mrf is the g0-based message header containing URB handles and
> + * such.
> + */
> + emit_urb_write_header(mrf++);
> +
> + if (brw->gen < 6) {
> + emit_ndc_computation();
> + }
> +
> + /* Lower legacy ff and ClipVertex clipping to clip distances */
> + if (key->userclip_active && !prog->UsesClipDistanceOut) {
> + current_annotation = "user clip distances";
> +
> + output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
> + output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
> +
> + emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
> + emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
> + }
> +
> + /* We may need to split this up into several URB writes, so do them in a
> + * loop.
> + */
> + int slot = 0;
> + bool complete = false;
> + do {
> + /* URB offset is in URB row increments, and each of our MRFs is half of
> + * one of those, since we're doing interleaved writes.
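> + * E.g. if the VUE has more than 12 slots, the second URB write starts
> + * at slot 12 and therefore at URB row offset 6.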
> + */
> + int offset = slot / 2;
> +
> + mrf = base_mrf + 1;
> + for (; slot < prog_data->vue_map.num_slots; ++slot) {
> + emit_urb_slot(dst_reg(MRF, mrf++),
> + prog_data->vue_map.slot_to_varying[slot]);
> +
> + /* If this was max_usable_mrf, we can't fit anything more into this
> + * URB WRITE.
> + */
> + if (mrf > max_usable_mrf) {
> + slot++;
> + break;
> + }
> + }
> +
> + complete = slot >= prog_data->vue_map.num_slots;
> + current_annotation = "URB write";
> + vec4_instruction *inst = emit_urb_write_opcode(complete);
> + inst->base_mrf = base_mrf;
> + inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
> + inst->offset += offset;
> + } while(!complete);
> +}
> +
> +
> +src_reg
> +vec4_god::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
> + src_reg *reladdr, int reg_offset)
> +{
> + /* Because we store the values to scratch interleaved like our
> + * vertex data, we need to scale the vec4 index by 2.
> + */
> + int message_header_scale = 2;
> +
> + /* Pre-gen6, the message header uses byte offsets instead of vec4
> + * (16-byte) offset units.
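> + * So a vec4 index of 3 becomes interleaved-row offset 3 * 2 = 6 on
> + * gen6+, and byte offset 3 * 2 * 16 = 96 on older generations.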
> + */
> + if (brw->gen < 6)
> + message_header_scale *= 16;
> +
> + if (reladdr) {
> + src_reg index = src_reg(this, glsl_type::int_type);
> +
> + emit_before(block, inst, ADD(dst_reg(index), *reladdr,
> + src_reg(reg_offset)));
> + emit_before(block, inst, MUL(dst_reg(index), index,
> + src_reg(message_header_scale)));
> +
> + return index;
> + } else {
> + return src_reg(reg_offset * message_header_scale);
> + }
> +}
> +
> +src_reg
> +vec4_god::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
> + src_reg *reladdr, int reg_offset)
> +{
> + if (reladdr) {
> + src_reg index = src_reg(this, glsl_type::int_type);
> +
> + emit_before(block, inst, ADD(dst_reg(index), *reladdr,
> + src_reg(reg_offset)));
> +
> + /* Pre-gen6, the message header uses byte offsets instead of vec4
> + * (16-byte) offset units.
> + */
> + if (brw->gen < 6) {
> + emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
> + }
> +
> + return index;
> + } else if (brw->gen >= 8) {
> + /* Store the offset in a GRF so we can send-from-GRF. */
> + src_reg offset = src_reg(this, glsl_type::int_type);
> + emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
> + return offset;
> + } else {
> + int message_header_scale = brw->gen < 6 ? 16 : 1;
> + return src_reg(reg_offset * message_header_scale);
> + }
> +}
> +
> +/**
> + * Emits an instruction before @inst to load the value named by @orig_src
> + * from scratch space at @base_offset to @temp.
> + *
> + * @base_offset is measured in 32-byte units (the size of a register).
> + */
> +void
> +vec4_god::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
> + dst_reg temp, src_reg orig_src,
> + int base_offset)
> +{
> + int reg_offset = base_offset + orig_src.reg_offset;
> + src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
> + reg_offset);
> +
> + emit_before(block, inst, SCRATCH_READ(temp, index));
> +}
> +
> +/**
> + * Emits an instruction after @inst to store the value to be written
> + * to @orig_dst to scratch space at @base_offset, from @temp.
> + *
> + * @base_offset is measured in 32-byte units (the size of a register).
> + */
> +void
> +vec4_god::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
> + int base_offset)
> +{
> + int reg_offset = base_offset + inst->dst.reg_offset;
> + src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
> + reg_offset);
> +
> + /* Create a temporary register to store *inst's result in.
> + *
> + * We have to be careful in MOVing from our temporary result register in
> + * the scratch write. If we swizzle from channels of the temporary that
> + * weren't initialized, it will confuse live interval analysis, which will
> + * make spilling fail to make progress.
> + */
> + const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
> + inst->dst.type),
> + brw_swizzle_for_mask(inst->dst.writemask));
> + dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
> + inst->dst.writemask));
> + vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
> + write->predicate = inst->predicate;
> + write->ir = inst->ir;
> + write->annotation = inst->annotation;
> + inst->insert_after(block, write);
> +
> + inst->dst.file = temp.file;
> + inst->dst.reg = temp.reg;
> + inst->dst.reg_offset = temp.reg_offset;
> + inst->dst.reladdr = NULL;
> +}
> +
> +/**
> + * We can't generally support array access in GRF space, because a
> + * single instruction's destination can only span 2 contiguous
> + * registers. So, we send all GRF arrays that get variable index
> + * access to scratch space.
> + */
> +void
> +vec4_god::move_grf_array_access_to_scratch()
> +{
> + int scratch_loc[this->alloc.count];
> + memset(scratch_loc, -1, sizeof(scratch_loc));
> +
> + /* First, calculate the set of virtual GRFs that need to be punted
> + * to scratch due to having any array access on them, and where in
> + * scratch.
> + */
> + foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
> + if (inst->dst.file == GRF && inst->dst.reladdr &&
> + scratch_loc[inst->dst.reg] == -1) {
> + scratch_loc[inst->dst.reg] = c->last_scratch;
> + c->last_scratch += this->alloc.sizes[inst->dst.reg];
> + }
> +
> + for (int i = 0 ; i < 3; i++) {
> + src_reg *src = &inst->src[i];
> +
> + if (src->file == GRF && src->reladdr &&
> + scratch_loc[src->reg] == -1) {
> + scratch_loc[src->reg] = c->last_scratch;
> + c->last_scratch += this->alloc.sizes[src->reg];
> + }
> + }
> + }
> +
> + /* Now, for anything that will be accessed through scratch, rewrite
> + * it to load/store. Note that this is a _safe list walk, because
> + * we may generate a new scratch_write instruction after the one
> + * we're processing.
> + */
> + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
> + /* Set up the annotation tracking for new generated instructions. */
> + base_ir = inst->ir;
> + current_annotation = inst->annotation;
> +
> + if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
> + emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
> + }
> +
> + for (int i = 0 ; i < 3; i++) {
> + if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
> + continue;
> +
> + dst_reg temp = dst_reg(this, glsl_type::vec4_type);
> +
> + emit_scratch_read(block, inst, temp, inst->src[i],
> + scratch_loc[inst->src[i].reg]);
> +
> + inst->src[i].file = temp.file;
> + inst->src[i].reg = temp.reg;
> + inst->src[i].reg_offset = temp.reg_offset;
> + inst->src[i].reladdr = NULL;
> + }
> + }
> +}
> +
> +/**
> + * Emits an instruction before @inst to load the value named by @orig_src
> + * from the pull constant buffer (surface) at @base_offset to @temp.
> + */
> +void
> +vec4_god::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
> + dst_reg temp, src_reg orig_src,
> + int base_offset)
> +{
> + int reg_offset = base_offset + orig_src.reg_offset;
> + src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
> + src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
> + reg_offset);
> + vec4_instruction *load;
> +
> + if (brw->gen >= 7) {
> + dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
> +
> + /* We have to use a message header on Skylake to get SIMD4x2 mode.
> + * Reserve space for the register.
> + */
> + if (brw->gen >= 9) {
> + grf_offset.reg_offset++;
> + alloc.sizes[grf_offset.reg] = 2;
> + }
> +
> + grf_offset.type = offset.type;
> + emit_before(block, inst, MOV(grf_offset, offset));
> +
> + load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
> + temp, index, src_reg(grf_offset));
> + load->mlen = 1;
> + } else {
> + load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
> + temp, index, offset);
> + load->base_mrf = 14;
> + load->mlen = 1;
> + }
> + emit_before(block, inst, load);
> +}
> +
> +/**
> + * Implements array access of uniforms by inserting a
> + * PULL_CONSTANT_LOAD instruction.
> + *
> + * Unlike temporary GRF array access (where we don't support it due to
> + * the difficulty of doing relative addressing on instruction
> + * destinations), we could potentially do array access of uniforms
> + * that were loaded in GRF space as push constants. In real-world
> + * usage we've seen, though, the arrays being used are always larger
> + * than we could load as push constants, so just always move all
> + * uniform array access out to a pull constant buffer.
> + */
> +void
> +vec4_god::move_uniform_array_access_to_pull_constants()
> +{
> + int pull_constant_loc[this->uniforms];
> + memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
> + bool nested_reladdr;
> +
> + /* Walk through and find array access of uniforms. Put a copy of that
> + * uniform in the pull constant buffer.
> + *
> + * Note that we don't move constant-indexed accesses to arrays. No
> + * testing has been done of the performance impact of this choice.
> + */
> + do {
> + nested_reladdr = false;
> +
> + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
> + for (int i = 0 ; i < 3; i++) {
> + if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
> + continue;
> +
> + int uniform = inst->src[i].reg;
> +
> + if (inst->src[i].reladdr->reladdr)
> + nested_reladdr = true; /* will need another pass */
> +
> + /* If this array isn't already present in the pull constant buffer,
> + * add it.
> + */
> + if (pull_constant_loc[uniform] == -1) {
> + const gl_constant_value **values =
> + &stage_prog_data->param[uniform * 4];
> +
> + pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
> +
> + assert(uniform < uniform_array_size);
> + for (int j = 0; j < uniform_size[uniform] * 4; j++) {
> + stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
> + = values[j];
> + }
> + }
> +
> + /* Set up the annotation tracking for new generated instructions. */
> + base_ir = inst->ir;
> + current_annotation = inst->annotation;
> +
> + dst_reg temp = dst_reg(this, glsl_type::vec4_type);
> +
> + emit_pull_constant_load(block, inst, temp, inst->src[i],
> + pull_constant_loc[uniform]);
> +
> + inst->src[i].file = temp.file;
> + inst->src[i].reg = temp.reg;
> + inst->src[i].reg_offset = temp.reg_offset;
> + inst->src[i].reladdr = NULL;
> + }
> + }
> + } while (nested_reladdr);
> +
> + /* Now there are no accesses of the UNIFORM file with a reladdr, so
> + * no need to track them as larger-than-vec4 objects. This will be
> + * relied on in cutting out unused uniform vectors from push
> + * constants.
> + */
> + split_uniform_registers();
> +}
> +
> +void
> +vec4_god::resolve_ud_negate(src_reg *reg)
> +{
> + if (reg->type != BRW_REGISTER_TYPE_UD ||
> + !reg->negate)
> + return;
> +
> + src_reg temp = src_reg(this, glsl_type::uvec4_type);
> + emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
> + *reg = temp;
> +}
> +
> +/**
> + * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
> + *
> + * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
> + * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
> + */
> +void
> +vec4_god::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
> +{
> + assert(brw->gen <= 5);
> +
> + if (!rvalue->type->is_boolean())
> + return;
> +
> + src_reg and_result = src_reg(this, rvalue->type);
> + src_reg neg_result = src_reg(this, rvalue->type);
> + emit(AND(dst_reg(and_result), *reg, src_reg(1)));
> + emit(MOV(dst_reg(neg_result), negate(and_result)));
> + *reg = neg_result;
> +}
> +
> +vec4_god::vec4_god(struct brw_context *brw,
> + struct brw_vec4_compile *c,
> + struct gl_program *prog,
> + const struct brw_vue_prog_key *key,
> + struct brw_vue_prog_data *prog_data,
> + struct gl_shader_program *shader_prog,
> + gl_shader_stage stage,
> + void *mem_ctx,
> + bool no_spills,
> + shader_time_shader_type st_base,
> + shader_time_shader_type st_written,
> + shader_time_shader_type st_reset)
> + : backend_god(brw, shader_prog, prog, &prog_data->base, stage),
> + c(c),
> + key(key),
> + prog_data(prog_data),
> + sanity_param_count(0),
> + fail_msg(NULL),
> + first_non_payload_grf(0),
> + need_all_constants_in_pull_buffer(false),
> + no_spills(no_spills),
> + st_base(st_base),
> + st_written(st_written),
> + st_reset(st_reset)
> +{
> + this->mem_ctx = mem_ctx;
> + this->failed = false;
> +
> + this->base_ir = NULL;
> + this->current_annotation = NULL;
> + memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
> +
> + this->variable_ht = hash_table_ctor(0,
> + hash_table_pointer_hash,
> + hash_table_pointer_compare);
> +
> + this->virtual_grf_start = NULL;
> + this->virtual_grf_end = NULL;
> + this->live_intervals = NULL;
> +
> + this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
> +
> + this->uniforms = 0;
> +
> + /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
> + * at least one. See setup_uniforms() in brw_vec4.cpp.
> + */
> + this->uniform_array_size = 1;
> + if (prog_data) {
> + this->uniform_array_size =
> + MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
> + }
> +
> + this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
> + this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
> +}
> +
> +vec4_god::~vec4_god()
> +{
> + hash_table_dtor(this->variable_ht);
> +}
> +
> +
> +void
> +vec4_god::fail(const char *format, ...)
> +{
> + va_list va;
> + char *msg;
> +
> + if (failed)
> + return;
> +
> + failed = true;
> +
> + va_start(va, format);
> + msg = ralloc_vasprintf(mem_ctx, format, va);
> + va_end(va);
> + msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
> +
> + this->fail_msg = msg;
> +
> + if (debug_enabled) {
> + fprintf(stderr, "%s", msg);
> + }
> +}
> +
> +} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_god.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_god.cpp
> new file mode 100644
> index 0000000..cbb83e3
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_god.cpp
> @@ -0,0 +1,706 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file brw_vec4_gs_god.cpp
> + *
> + * Geometry-shader-specific code derived from the vec4_god class.
> + */
> +
> +#include "brw_vec4_gs_god.h"
> +#include "gen6_gs_god.h"
> +
> +const unsigned MAX_GS_INPUT_VERTICES = 6;
> +
> +namespace brw {
> +
> +vec4_gs_god::vec4_gs_god(struct brw_context *brw,
> + struct brw_gs_compile *c,
> + struct gl_shader_program *prog,
> + void *mem_ctx,
> + bool no_spills)
> + : vec4_god(brw, &c->base, &c->gp->program.Base, &c->key.base,
> + &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
> + no_spills,
> + ST_GS, ST_GS_WRITTEN, ST_GS_RESET),
> + c(c)
> +{
> +}
> +
> +
> +dst_reg *
> +vec4_gs_god::make_reg_for_system_value(ir_variable *ir)
> +{
> + dst_reg *reg = new(mem_ctx) dst_reg(this, ir->type);
> +
> + switch (ir->data.location) {
> + case SYSTEM_VALUE_INVOCATION_ID:
> + this->current_annotation = "initialize gl_InvocationID";
> + emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
> + break;
> + default:
> + unreachable("not reached");
> + }
> +
> + return reg;
> +}
> +
> +
> +int
> +vec4_gs_god::setup_varying_inputs(int payload_reg, int *attribute_map,
> + int attributes_per_reg)
> +{
> + /* For geometry shaders there are N copies of the input attributes, where N
> + * is the number of input vertices. attribute_map[BRW_VARYING_SLOT_COUNT *
> + * i + j] represents attribute j for vertex i.
> + *
> + * Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time,
> + * so the total number of input slots that will be delivered to the GS (and
> + * thus the stride of the input arrays) is urb_read_length * 2.
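> + * For example, with urb_read_length == 4 each vertex's inputs span
> + * 8 slots in the payload.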
> + */
> + const unsigned num_input_vertices = c->gp->program.VerticesIn;
> + assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
> + unsigned input_array_stride = c->prog_data.base.urb_read_length * 2;
> +
> + for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
> + int varying = c->input_vue_map.slot_to_varying[slot];
> + for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) {
> + attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
> + attributes_per_reg * payload_reg + input_array_stride * vertex +
> + slot;
> + }
> + }
> +
> + int regs_used = ALIGN(input_array_stride * num_input_vertices,
> + attributes_per_reg) / attributes_per_reg;
> + return payload_reg + regs_used;
> +}
> +
> +
> +void
> +vec4_gs_god::setup_payload()
> +{
> + int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
> +
> + /* If we are in dual instanced or single mode, then attributes are going
> + * to be interleaved, so one register contains two attribute slots.
> + */
> + int attributes_per_reg =
> + c->prog_data.dispatch_mode == GEN7_GS_DISPATCH_MODE_DUAL_OBJECT ? 1 : 2;
> +
> + /* If a geometry shader tries to read from an input that wasn't written by
> + * the vertex shader, that produces undefined results, but it shouldn't
> + * crash anything. So initialize attribute_map to zeros--that ensures that
> + * these undefined results are read from r0.
> + */
> + memset(attribute_map, 0, sizeof(attribute_map));
> +
> + int reg = 0;
> +
> + /* The payload always contains important data in r0, which contains
> + * the URB handles that are passed on to the URB write at the end
> + * of the thread.
> + */
> + reg++;
> +
> + /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
> + if (c->prog_data.include_primitive_id)
> + attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
> +
> + reg = setup_uniforms(reg);
> +
> + reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
> +
> + lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1);
> +
> + this->first_non_payload_grf = reg;
> +}
> +
> +
> +void
> +vec4_gs_god::emit_prolog()
> +{
> + /* In vertex shaders, r0.2 is guaranteed to be initialized to zero. In
> + * geometry shaders, it isn't (it contains a bunch of information we don't
> + * need, like the input primitive type). We need r0.2 to be zero in order
> + * to build scratch read/write messages correctly (otherwise this value
> + * will be interpreted as a global offset, causing us to do our scratch
> + * reads/writes to garbage memory). So just set it to zero at the top of
> + * the shader.
> + */
> + this->current_annotation = "clear r0.2";
> + dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
> + vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, 0u);
> + inst->force_writemask_all = true;
> +
> + /* Create a virtual register to hold the vertex count */
> + this->vertex_count = src_reg(this, glsl_type::uint_type);
> +
> + /* Initialize the vertex_count register to 0 */
> + this->current_annotation = "initialize vertex_count";
> + inst = emit(MOV(dst_reg(this->vertex_count), 0u));
> + inst->force_writemask_all = true;
> +
> + if (c->control_data_header_size_bits > 0) {
> + /* Create a virtual register to hold the current set of control data
> + * bits.
> + */
> + this->control_data_bits = src_reg(this, glsl_type::uint_type);
> +
> + /* If we're outputting more than 32 control data bits, then EmitVertex()
> + * will set control_data_bits to 0 after emitting the first vertex.
> + * Otherwise, we need to initialize it to 0 here.
> + */
> + if (c->control_data_header_size_bits <= 32) {
> + this->current_annotation = "initialize control data bits";
> + inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
> + inst->force_writemask_all = true;
> + }
> + }
> +
> + /* If the geometry shader uses the gl_PointSize input, we need to fix it up
> + * to account for the fact that the vertex shader stored it in the w
> + * component of VARYING_SLOT_PSIZ.
> + */
> + if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) {
> + this->current_annotation = "swizzle gl_PointSize input";
> + for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) {
> + dst_reg dst(ATTR,
> + BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ);
> + dst.type = BRW_REGISTER_TYPE_F;
> + src_reg src(dst);
> + dst.writemask = WRITEMASK_X;
> + src.swizzle = BRW_SWIZZLE_WWWW;
> + inst = emit(MOV(dst, src));
> +
> + /* In dual instanced dispatch mode, dst has a width of 4, so we need
> + * to make sure the MOV happens regardless of which channels are
> + * enabled.
> + */
> + inst->force_writemask_all = true;
> + }
> + }
> +
> + this->current_annotation = NULL;
> +}
> +
> +
> +void
> +vec4_gs_god::emit_program_code()
> +{
> + /* We don't support NV_geometry_program4. */
> + unreachable("Unreached");
> +}
> +
> +
> +void
> +vec4_gs_god::emit_thread_end()
> +{
> + if (c->control_data_header_size_bits > 0) {
> + /* During shader execution, we only ever call emit_control_data_bits()
> + * just prior to outputting a vertex. Therefore, the control data bits
> + * corresponding to the most recently output vertex still need to be
> + * emitted.
> + */
> + current_annotation = "thread end: emit control data bits";
> + emit_control_data_bits();
> + }
> +
> + /* MRF 0 is reserved for the debugger, so start with message header
> + * in MRF 1.
> + */
> + int base_mrf = 1;
> +
> + current_annotation = "thread end";
> + dst_reg mrf_reg(MRF, base_mrf);
> + src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> + vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> + inst->force_writemask_all = true;
> + emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count);
> + if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> + emit_shader_time_end();
> + inst = emit(GS_OPCODE_THREAD_END);
> + inst->base_mrf = base_mrf;
> + inst->mlen = 1;
> +}
> +
> +
> +void
> +vec4_gs_god::emit_urb_write_header(int mrf)
> +{
> + /* The SEND instruction that writes the vertex data to the VUE will use
> + * per_slot_offset=true, which means that DWORDs 3 and 4 of the message
> + * header specify an offset (in multiples of 256 bits) into the URB entry
> + * at which the write should take place.
> + *
> + * So we have to prepare a message header with the appropriate offset
> + * values.
> + */
> + dst_reg mrf_reg(MRF, mrf);
> + src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> + this->current_annotation = "URB write header";
> + vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> + inst->force_writemask_all = true;
> + emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
> + (uint32_t) c->prog_data.output_vertex_size_hwords);
> +}
> +
> +
> +vec4_instruction *
> +vec4_gs_god::emit_urb_write_opcode(bool complete)
> +{
> + /* We don't care whether the vertex is complete, because in general
> + * geometry shaders output multiple vertices, and we don't terminate the
> + * thread until all vertices are complete.
> + */
> + (void) complete;
> +
> + vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
> + inst->offset = c->prog_data.control_data_header_size_hwords;
> +
> + /* We need to increment Global Offset by 1 to make room for Broadwell's
> + * extra "Vertex Count" payload at the beginning of the URB entry.
> + */
> + if (brw->gen >= 8)
> + inst->offset++;
> +
> + inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
> + return inst;
> +}
> +
> +
> +int
> +vec4_gs_god::compute_array_stride(ir_dereference_array *ir)
> +{
> + /* Geometry shader inputs are arrays, but they use an unusual array layout:
> + * instead of all array elements for a given geometry shader input being
> + * stored consecutively, all geometry shader inputs are interleaved into
> + * one giant array. At this stage of compilation, we assume that the
> + * stride of the array is BRW_VARYING_SLOT_COUNT. Later,
> + * setup_attributes() will remap our accesses to the actual input array.
> + */
> + ir_dereference_variable *deref_var = ir->array->as_dereference_variable();
> + if (deref_var && deref_var->var->data.mode == ir_var_shader_in)
> + return BRW_VARYING_SLOT_COUNT;
> + else
> + return vec4_god::compute_array_stride(ir);
> +}
> +
> +
> +/**
> + * Write out a batch of 32 control data bits from the control_data_bits
> + * register to the URB.
> + *
> + * The current value of the vertex_count register determines which DWORD in
> + * the URB receives the control data bits. The control_data_bits register is
> + * assumed to contain the correct data for the vertex that was most recently
> + * output, and all previous vertices that share the same DWORD.
> + *
> + * This function takes care of ensuring that if no vertices have been output
> + * yet, no control bits are emitted.
> + */
> +void
> +vec4_gs_god::emit_control_data_bits()
> +{
> + assert(c->control_data_bits_per_vertex != 0);
> +
> + /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
> + * granularity, we need to use two tricks to ensure that the batch of 32
> + * control data bits is written to the appropriate DWORD in the URB. To
> + * select which vec4 we are writing to, we use the "slot {0,1} offset"
> + * fields of the message header. To select which DWORD in the vec4 we are
> + * writing to, we use the channel mask fields of the message header. To
> + * avoid penalizing geometry shaders that emit a small number of vertices
> + * with extra bookkeeping, we only do each of these tricks when
> + * c->prog_data.control_data_header_size_bits is large enough to make it
> + * necessary.
> + *
> + * Note: this means that if we're outputting just a single DWORD of control
> + * data bits, we'll actually replicate it four times since we won't do any
> + * channel masking. But that's not a problem since in this case the
> + * hardware only pays attention to the first DWORD.
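> + *
> + * Concretely, a DWORD holds 32 control data bits and a vec4 (one OWord)
> + * holds 128, which is where the 32- and 128-bit thresholds below come
> + * from.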
> + */
> + enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
> + if (c->control_data_header_size_bits > 32)
> + urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
> + if (c->control_data_header_size_bits > 128)
> + urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
> +
> + /* If vertex_count is 0, then no control data bits have been accumulated
> + * yet, so we should do nothing.
> + */
> + emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
> + emit(IF(BRW_PREDICATE_NORMAL));
> + {
> + /* If we are using either channel masks or a per-slot offset, then we
> + * need to figure out which DWORD we are trying to write to, using the
> + * formula:
> + *
> + * dword_index = (vertex_count - 1) * bits_per_vertex / 32
> + *
> + * Since bits_per_vertex is a power of two, and is known at compile
> + * time, this can be optimized to:
> + *
> + * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
> + */
> + src_reg dword_index(this, glsl_type::uint_type);
> + if (urb_write_flags) {
> + src_reg prev_count(this, glsl_type::uint_type);
> + emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
> + unsigned log2_bits_per_vertex =
> + _mesa_fls(c->control_data_bits_per_vertex);
> + emit(SHR(dst_reg(dword_index), prev_count,
> + (uint32_t) (6 - log2_bits_per_vertex)));
> + }
> +
> + /* Start building the URB write message. The first MRF gets a copy of
> + * R0.
> + */
> + int base_mrf = 1;
> + dst_reg mrf_reg(MRF, base_mrf);
> + src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> + vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> + inst->force_writemask_all = true;
> +
> + if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
> + /* Set the per-slot offset to dword_index / 4, so that we'll write to
> + * the appropriate OWORD within the control data header.
> + */
> + src_reg per_slot_offset(this, glsl_type::uint_type);
> + emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
> + emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
> + }
> +
> + if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
> + /* Set the channel masks to 1 << (dword_index % 4), so that we'll
> + * write to the appropriate DWORD within the OWORD. We need to do
> + * this computation with force_writemask_all, otherwise garbage data
> + * from invocation 0 might clobber the mask for invocation 1 when
> + * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
> + * together.
> + */
> + src_reg channel(this, glsl_type::uint_type);
> + inst = emit(AND(dst_reg(channel), dword_index, 3u));
> + inst->force_writemask_all = true;
> + src_reg one(this, glsl_type::uint_type);
> + inst = emit(MOV(dst_reg(one), 1u));
> + inst->force_writemask_all = true;
> + src_reg channel_mask(this, glsl_type::uint_type);
> + inst = emit(SHL(dst_reg(channel_mask), one, channel));
> + inst->force_writemask_all = true;
> + emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
> + channel_mask);
> + emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
> + }
> +
> + /* Store the control data bits in the message payload and send it. */
> + dst_reg mrf_reg2(MRF, base_mrf + 1);
> + inst = emit(MOV(mrf_reg2, this->control_data_bits));
> + inst->force_writemask_all = true;
> + inst = emit(GS_OPCODE_URB_WRITE);
> + inst->urb_write_flags = urb_write_flags;
> + /* We need to increment Global Offset by 256 bits to make room for
> + * Broadwell's extra "Vertex Count" payload at the beginning of the
> + * URB entry. Since this is an OWord message, Global Offset is counted
> + * in 128-bit units, so we must set it to 2.
> + */
> + if (brw->gen >= 8)
> + inst->offset = 2;
> + inst->base_mrf = base_mrf;
> + inst->mlen = 2;
> + }
> + emit(BRW_OPCODE_ENDIF);
> +}
> +
> +void
> +vec4_gs_god::set_stream_control_data_bits(unsigned stream_id)
> +{
> + /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
> +
> + /* Note: we are calling this *before* increasing vertex_count, so
> + * this->vertex_count == vertex_count - 1 in the formula above.
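> + *
> + * For example, routing the third vertex (this->vertex_count == 2) to
> + * stream 2 ORs 2 << 4 == 0x20 into control_data_bits.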
> + */
> +
> + /* Stream mode uses 2 bits per vertex */
> + assert(c->control_data_bits_per_vertex == 2);
> +
> + /* Must be a valid stream */
> + assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
> +
> + /* Control data bits are initialized to 0 so we don't have to set any
> + * bits when sending vertices to stream 0.
> + */
> + if (stream_id == 0)
> + return;
> +
> + /* reg::sid = stream_id */
> + src_reg sid(this, glsl_type::uint_type);
> + emit(MOV(dst_reg(sid), stream_id));
> +
> + /* reg:shift_count = 2 * (vertex_count - 1) */
> + src_reg shift_count(this, glsl_type::uint_type);
> + emit(SHL(dst_reg(shift_count), this->vertex_count, 1u));
> +
> + /* Note: we're relying on the fact that the GEN SHL instruction only pays
> + * attention to the lower 5 bits of its second source argument, so on this
> + * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
> + * stream_id << ((2 * (vertex_count - 1)) % 32).
> + */
> + src_reg mask(this, glsl_type::uint_type);
> + emit(SHL(dst_reg(mask), sid, shift_count));
> + emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
> +}
> +
> +void
> +vec4_gs_god::visit(ir_emit_vertex *ir)
> +{
> + this->current_annotation = "emit vertex: safety check";
> +
> + /* To ensure that we don't output more vertices than the shader specified
> + * using max_vertices, do the logic inside a conditional of the form "if
> + * (vertex_count < MAX)"
> + */
> + unsigned num_output_vertices = c->gp->program.VerticesOut;
> + emit(CMP(dst_null_d(), this->vertex_count,
> + src_reg(num_output_vertices), BRW_CONDITIONAL_L));
> + emit(IF(BRW_PREDICATE_NORMAL));
> + {
> + /* If we're outputting 32 control data bits or less, then we can wait
> + * until the shader is over to output them all. Otherwise we need to
> + * output them as we go. Now is the time to do it, since we're about to
> + * output the vertex_count'th vertex, so it's guaranteed that the
> + * control data bits associated with the (vertex_count - 1)th vertex are
> + * correct.
> + */
> + if (c->control_data_header_size_bits > 32) {
> + this->current_annotation = "emit vertex: emit control data bits";
> + /* Only emit control data bits if we've finished accumulating a batch
> + * of 32 bits. This is the case when:
> + *
> + * (vertex_count * bits_per_vertex) % 32 == 0
> + *
> + * (in other words, when the last 5 bits of vertex_count *
> + * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
> + * integer n (which is always the case, since bits_per_vertex is
> + * always 1 or 2), this is equivalent to requiring that the last 5-n
> + * bits of vertex_count are 0:
> + *
> + * vertex_count & (2^(5-n) - 1) == 0
> + *
> + * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
> + * equivalent to:
> + *
> + * vertex_count & (32 / bits_per_vertex - 1) == 0
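> + *
> + * For example, with 2 control data bits per vertex the mask is 15, so
> + * a batch is flushed after every 16th vertex.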
> + */
> + vec4_instruction *inst =
> + emit(AND(dst_null_d(), this->vertex_count,
> + (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
> + inst->conditional_mod = BRW_CONDITIONAL_Z;
> + emit(IF(BRW_PREDICATE_NORMAL));
> + {
> + emit_control_data_bits();
> +
> + /* Reset control_data_bits to 0 so we can start accumulating a new
> + * batch.
> + *
> + * Note: in the case where vertex_count == 0, this neutralizes the
> + * effect of any call to EndPrimitive() that the shader may have
> + * made before outputting its first vertex.
> + */
> + inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
> + inst->force_writemask_all = true;
> + }
> + emit(BRW_OPCODE_ENDIF);
> + }
> +
> + this->current_annotation = "emit vertex: vertex data";
> + emit_vertex();
> +
> + /* In stream mode we have to set control data bits for all vertices
> + * unless we have disabled control data bits completely (which we do for
> + * GL_POINTS outputs that don't use streams).
> + */
> + if (c->control_data_header_size_bits > 0 &&
> + c->prog_data.control_data_format ==
> + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
> + this->current_annotation = "emit vertex: Stream control data bits";
> + set_stream_control_data_bits(ir->stream_id());
> + }
> +
> + this->current_annotation = "emit vertex: increment vertex count";
> + emit(ADD(dst_reg(this->vertex_count), this->vertex_count,
> + src_reg(1u)));
> + }
> + emit(BRW_OPCODE_ENDIF);
> +
> + this->current_annotation = NULL;
> +}
> +
> +void
> +vec4_gs_god::visit(ir_end_primitive *)
> +{
> + /* We can only do EndPrimitive() functionality when the control data
> + * consists of cut bits. Fortunately, the only time it isn't is when the
> + * output type is points, in which case EndPrimitive() is a no-op.
> + */
> + if (c->prog_data.control_data_format !=
> + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
> + return;
> + }
> +
> + /* Cut bits use one bit per vertex. */
> + assert(c->control_data_bits_per_vertex == 1);
> +
> + /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
> + * vertex n, 0 otherwise. So all we need to do here is mark bit
> + * (vertex_count - 1) % 32 in the cut_bits register to indicate that
> + * EndPrimitive() was called after emitting vertex (vertex_count - 1);
> + * vec4_gs_god::emit_control_data_bits() will take care of the rest.
> + *
> + * Note that if EndPrimitive() is called before emitting any vertices, this
> + * will cause us to set bit 31 of the control_data_bits register to 1.
> + * That's fine because:
> + *
> + * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
> + * output, so the hardware will ignore cut bit 31.
> + *
> + * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
> + * last vertex, so setting cut bit 31 has no effect (since the primitive
> + * is automatically ended when the GS terminates).
> + *
> + * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
> + * control_data_bits register to 0 when the first vertex is emitted.
> + */
> +
> + /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
> + src_reg one(this, glsl_type::uint_type);
> + emit(MOV(dst_reg(one), 1u));
> + src_reg prev_count(this, glsl_type::uint_type);
> + emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
> + src_reg mask(this, glsl_type::uint_type);
> + /* Note: we're relying on the fact that the GEN SHL instruction only pays
> + * attention to the lower 5 bits of its second source argument, so on this
> + * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
> + * ((vertex_count - 1) % 32).
> + */
> + emit(SHL(dst_reg(mask), one, prev_count));
> + emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
> +}
> +
> +static const unsigned *
> +generate_assembly(struct brw_context *brw,
> + struct gl_shader_program *shader_prog,
> + struct gl_program *prog,
> + struct brw_vue_prog_data *prog_data,
> + void *mem_ctx,
> + const cfg_t *cfg,
> + unsigned *final_assembly_size)
> +{
> + vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx,
> + INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
> + return g.generate_assembly(cfg, final_assembly_size);
> +}
> +
> +extern "C" const unsigned *
> +brw_gs_emit(struct brw_context *brw,
> + struct gl_shader_program *prog,
> + struct brw_gs_compile *c,
> + void *mem_ctx,
> + unsigned *final_assembly_size)
> +{
> + if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
> + struct brw_shader *shader =
> + (brw_shader *) prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
> +
> + brw_dump_ir("geometry", prog, &shader->base, NULL);
> + }
> +
> + if (brw->gen >= 7) {
> + /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
> + * so without spilling. If the GS invocations count > 1, then we can't use
> + * dual object mode.
> + */
> + if (c->prog_data.invocations <= 1 &&
> + likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
> + c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_OBJECT;
> +
> + vec4_gs_god v(brw, c, prog, mem_ctx, true /* no_spills */);
> + if (v.run()) {
> + return generate_assembly(brw, prog, &c->gp->program.Base,
> + &c->prog_data.base, mem_ctx, v.cfg,
> + final_assembly_size);
> + }
> + }
> + }
> +
> + /* Either we failed to compile in DUAL_OBJECT mode (probably because it
> + * would have required spilling) or DUAL_OBJECT mode is disabled. So fall
> + * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
> + *
> + * FIXME: Single dispatch mode requires that the driver can handle
> + * interleaving of input registers, but this is already supported (dual
> + * instance mode has the same requirement). However, to take full advantage
> + * of single dispatch mode to reduce register pressure we would also need to
> + * do interleaved outputs, but currently, the vec4 visitor and generator
> + * classes do not support this, so at the moment register pressure in
> + * single and dual instance modes is the same.
> + *
> + * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
> + * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
> + * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
> + * is also supported. When InstanceCount=1 (one instance per object) software
> + * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
> + * the best choice for performance, followed by SINGLE mode."
> + *
> + * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
> + * mode is more performant when invocations > 1. Gen6 only supports
> + * SINGLE mode.
> + */
> + if (c->prog_data.invocations <= 1 || brw->gen < 7)
> + c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_SINGLE;
> + else
> + c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE;
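
(For readers skimming the fallback logic: collapsing the whole dispatch-mode
choice into one place gives roughly the pseudo-C below. It glosses over the
retry; the real code attempts DUAL_OBJECT first and only falls back if that
compile spills.)

    if (gen >= 7 && invocations <= 1 && !debug_no_dual_object_gs)
       mode = DUAL_OBJECT;       /* preferred; dropped on spills            */
    else if (invocations <= 1 || gen < 7)
       mode = SINGLE;            /* Gen6 only supports SINGLE               */
    else
       mode = DUAL_INSTANCE;     /* invocations > 1 on Gen7+                */
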
> +
> + vec4_gs_god *gs = NULL;
> + const unsigned *ret = NULL;
> +
> + if (brw->gen >= 7)
> + gs = new vec4_gs_god(brw, c, prog, mem_ctx, false /* no_spills */);
> + else
> + gs = new gen6_gs_god(brw, c, prog, mem_ctx, false /* no_spills */);
> +
> + if (!gs->run()) {
> + prog->LinkStatus = false;
> + ralloc_strcat(&prog->InfoLog, gs->fail_msg);
> + } else {
> + ret = generate_assembly(brw, prog, &c->gp->program.Base,
> + &c->prog_data.base, mem_ctx, gs->cfg,
> + final_assembly_size);
> + }
> +
> + delete gs;
> + return ret;
> +}
> +
> +
> +} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_god.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_god.h
> new file mode 100644
> index 0000000..18d849e
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_god.h
> @@ -0,0 +1,103 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file brw_vec4_gs_god.h
> + *
> + * Geometry-shader-specific code derived from the vec4_god class.
> + */
> +
> +#ifndef BRW_VEC4_GS_VISITOR_H
> +#define BRW_VEC4_GS_VISITOR_H
> +
> +#include "brw_vec4.h"
> +
> +/**
> + * Scratch data used when compiling a GLSL geometry shader.
> + */
> +struct brw_gs_compile
> +{
> + struct brw_vec4_compile base;
> + struct brw_gs_prog_key key;
> + struct brw_gs_prog_data prog_data;
> + struct brw_vue_map input_vue_map;
> +
> + struct brw_geometry_program *gp;
> +
> + unsigned control_data_bits_per_vertex;
> + unsigned control_data_header_size_bits;
> +};
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +const unsigned *brw_gs_emit(struct brw_context *brw,
> + struct gl_shader_program *prog,
> + struct brw_gs_compile *c,
> + void *mem_ctx,
> + unsigned *final_assembly_size);
> +
> +#ifdef __cplusplus
> +} /* extern "C" */
> +#endif
> +
> +#ifdef __cplusplus
> +namespace brw {
> +
> +class vec4_gs_god : public vec4_god
> +{
> +public:
> + vec4_gs_god(struct brw_context *brw,
> + struct brw_gs_compile *c,
> + struct gl_shader_program *prog,
> + void *mem_ctx,
> + bool no_spills);
> +
> +protected:
> + virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
> + virtual void setup_payload();
> + virtual void emit_prolog();
> + virtual void emit_program_code();
> + virtual void emit_thread_end();
> + virtual void emit_urb_write_header(int mrf);
> + virtual vec4_instruction *emit_urb_write_opcode(bool complete);
> + virtual int compute_array_stride(ir_dereference_array *ir);
> + virtual void visit(ir_emit_vertex *);
> + virtual void visit(ir_end_primitive *);
> +
> +protected:
> + int setup_varying_inputs(int payload_reg, int *attribute_map,
> + int attributes_per_reg);
> + void emit_control_data_bits();
> + void set_stream_control_data_bits(unsigned stream_id);
> +
> + src_reg vertex_count;
> + src_reg control_data_bits;
> + const struct brw_gs_compile * const c;
> +};
> +
> +} /* namespace brw */
> +#endif /* __cplusplus */
> +
> +#endif /* BRW_VEC4_GS_VISITOR_H */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> deleted file mode 100644
> index 2002ffd..0000000
> --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> +++ /dev/null
> @@ -1,706 +0,0 @@
> -/*
> - * Copyright © 2013 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - */
> -
> -/**
> - * \file brw_vec4_gs_visitor.cpp
> - *
> - * Geometry-shader-specific code derived from the vec4_visitor class.
> - */
> -
> -#include "brw_vec4_gs_visitor.h"
> -#include "gen6_gs_visitor.h"
> -
> -const unsigned MAX_GS_INPUT_VERTICES = 6;
> -
> -namespace brw {
> -
> -vec4_gs_visitor::vec4_gs_visitor(struct brw_context *brw,
> - struct brw_gs_compile *c,
> - struct gl_shader_program *prog,
> - void *mem_ctx,
> - bool no_spills)
> - : vec4_visitor(brw, &c->base, &c->gp->program.Base, &c->key.base,
> - &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
> - no_spills,
> - ST_GS, ST_GS_WRITTEN, ST_GS_RESET),
> - c(c)
> -{
> -}
> -
> -
> -dst_reg *
> -vec4_gs_visitor::make_reg_for_system_value(ir_variable *ir)
> -{
> - dst_reg *reg = new(mem_ctx) dst_reg(this, ir->type);
> -
> - switch (ir->data.location) {
> - case SYSTEM_VALUE_INVOCATION_ID:
> - this->current_annotation = "initialize gl_InvocationID";
> - emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
> - break;
> - default:
> - unreachable("not reached");
> - }
> -
> - return reg;
> -}
> -
> -
> -int
> -vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
> - int attributes_per_reg)
> -{
> - /* For geometry shaders there are N copies of the input attributes, where N
> - * is the number of input vertices. attribute_map[BRW_VARYING_SLOT_COUNT *
> - * i + j] represents attribute j for vertex i.
> - *
> - * Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time,
> - * so the total number of input slots that will be delivered to the GS (and
> - * thus the stride of the input arrays) is urb_read_length * 2.
> - */
> - const unsigned num_input_vertices = c->gp->program.VerticesIn;
> - assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
> - unsigned input_array_stride = c->prog_data.base.urb_read_length * 2;
> -
> - for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
> - int varying = c->input_vue_map.slot_to_varying[slot];
> - for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) {
> - attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
> - attributes_per_reg * payload_reg + input_array_stride * vertex +
> - slot;
> - }
> - }
> -
> - int regs_used = ALIGN(input_array_stride * num_input_vertices,
> - attributes_per_reg) / attributes_per_reg;
> - return payload_reg + regs_used;
> -}
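
(A concrete example may help with the indexing above; the numbers are mine,
purely illustrative: triangle input, so 3 vertices, urb_read_length == 2,
dual-object dispatch so attributes_per_reg == 1, and payload_reg == 2.)

    /* input_array_stride = urb_read_length * 2 = 4 slots per vertex, so the
     * attribute in VUE slot s of input vertex i lands in GRF
     *   payload_reg + 4 * i + s
     * i.e. vertex 0 -> g2..g5, vertex 1 -> g6..g9, vertex 2 -> g10..g13,
     * and setup_varying_inputs() returns payload_reg + 12. */
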
> -
> -
> -void
> -vec4_gs_visitor::setup_payload()
> -{
> - int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
> -
> - /* If we are in dual instanced or single mode, then attributes are going
> - * to be interleaved, so one register contains two attribute slots.
> - */
> - int attributes_per_reg =
> - c->prog_data.dispatch_mode == GEN7_GS_DISPATCH_MODE_DUAL_OBJECT ? 1 : 2;
> -
> - /* If a geometry shader tries to read from an input that wasn't written by
> - * the vertex shader, that produces undefined results, but it shouldn't
> - * crash anything. So initialize attribute_map to zeros--that ensures that
> - * these undefined results are read from r0.
> - */
> - memset(attribute_map, 0, sizeof(attribute_map));
> -
> - int reg = 0;
> -
> - /* The payload always contains important data in r0, which contains
> - * the URB handles that are passed on to the URB write at the end
> - * of the thread.
> - */
> - reg++;
> -
> - /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
> - if (c->prog_data.include_primitive_id)
> - attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
> -
> - reg = setup_uniforms(reg);
> -
> - reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
> -
> - lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1);
> -
> - this->first_non_payload_grf = reg;
> -}
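
(Rough picture of the payload this sets up, with hypothetical numbers: a
dual-object GS that reads gl_PrimitiveIDIn and has two registers' worth of
push constants.)

    /* g0        URB handles and other fixed-function data (always present)
     * g1        gl_PrimitiveIDIn        (only if include_primitive_id)
     * g2..g3    push constants          (setup_uniforms)
     * g4..      per-vertex attributes   (setup_varying_inputs)
     * first_non_payload_grf then points just past the attribute block. */
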
> -
> -
> -void
> -vec4_gs_visitor::emit_prolog()
> -{
> - /* In vertex shaders, r0.2 is guaranteed to be initialized to zero. In
> - * geometry shaders, it isn't (it contains a bunch of information we don't
> - * need, like the input primitive type). We need r0.2 to be zero in order
> - * to build scratch read/write messages correctly (otherwise this value
> - * will be interpreted as a global offset, causing us to do our scratch
> - * reads/writes to garbage memory). So just set it to zero at the top of
> - * the shader.
> - */
> - this->current_annotation = "clear r0.2";
> - dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
> - vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, 0u);
> - inst->force_writemask_all = true;
> -
> - /* Create a virtual register to hold the vertex count */
> - this->vertex_count = src_reg(this, glsl_type::uint_type);
> -
> - /* Initialize the vertex_count register to 0 */
> - this->current_annotation = "initialize vertex_count";
> - inst = emit(MOV(dst_reg(this->vertex_count), 0u));
> - inst->force_writemask_all = true;
> -
> - if (c->control_data_header_size_bits > 0) {
> - /* Create a virtual register to hold the current set of control data
> - * bits.
> - */
> - this->control_data_bits = src_reg(this, glsl_type::uint_type);
> -
> - /* If we're outputting more than 32 control data bits, then EmitVertex()
> - * will set control_data_bits to 0 after emitting the first vertex.
> - * Otherwise, we need to initialize it to 0 here.
> - */
> - if (c->control_data_header_size_bits <= 32) {
> - this->current_annotation = "initialize control data bits";
> - inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
> - inst->force_writemask_all = true;
> - }
> - }
> -
> - /* If the geometry shader uses the gl_PointSize input, we need to fix it up
> - * to account for the fact that the vertex shader stored it in the w
> - * component of VARYING_SLOT_PSIZ.
> - */
> - if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) {
> - this->current_annotation = "swizzle gl_PointSize input";
> - for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) {
> - dst_reg dst(ATTR,
> - BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ);
> - dst.type = BRW_REGISTER_TYPE_F;
> - src_reg src(dst);
> - dst.writemask = WRITEMASK_X;
> - src.swizzle = BRW_SWIZZLE_WWWW;
> - inst = emit(MOV(dst, src));
> -
> - /* In dual instanced dispatch mode, dst has a width of 4, so we need
> - * to make sure the MOV happens regardless of which channels are
> - * enabled.
> - */
> - inst->force_writemask_all = true;
> - }
> - }
> -
> - this->current_annotation = NULL;
> -}
> -
> -
> -void
> -vec4_gs_visitor::emit_program_code()
> -{
> - /* We don't support NV_geometry_program4. */
> - unreachable("Unreached");
> -}
> -
> -
> -void
> -vec4_gs_visitor::emit_thread_end()
> -{
> - if (c->control_data_header_size_bits > 0) {
> - /* During shader execution, we only ever call emit_control_data_bits()
> - * just prior to outputting a vertex. Therefore, the control data bits
> - * corresponding to the most recently output vertex still need to be
> - * emitted.
> - */
> - current_annotation = "thread end: emit control data bits";
> - emit_control_data_bits();
> - }
> -
> - /* MRF 0 is reserved for the debugger, so start with message header
> - * in MRF 1.
> - */
> - int base_mrf = 1;
> -
> - current_annotation = "thread end";
> - dst_reg mrf_reg(MRF, base_mrf);
> - src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> - vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> - inst->force_writemask_all = true;
> - emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count);
> - if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> - emit_shader_time_end();
> - inst = emit(GS_OPCODE_THREAD_END);
> - inst->base_mrf = base_mrf;
> - inst->mlen = 1;
> -}
> -
> -
> -void
> -vec4_gs_visitor::emit_urb_write_header(int mrf)
> -{
> - /* The SEND instruction that writes the vertex data to the VUE will use
> - * per_slot_offset=true, which means that DWORDs 3 and 4 of the message
> - * header specify an offset (in multiples of 256 bits) into the URB entry
> - * at which the write should take place.
> - *
> - * So we have to prepare a message header with the appropriate offset
> - * values.
> - */
> - dst_reg mrf_reg(MRF, mrf);
> - src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> - this->current_annotation = "URB write header";
> - vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> - inst->force_writemask_all = true;
> - emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
> - (uint32_t) c->prog_data.output_vertex_size_hwords);
> -}
> -
> -
> -vec4_instruction *
> -vec4_gs_visitor::emit_urb_write_opcode(bool complete)
> -{
> - /* We don't care whether the vertex is complete, because in general
> - * geometry shaders output multiple vertices, and we don't terminate the
> - * thread until all vertices are complete.
> - */
> - (void) complete;
> -
> - vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
> - inst->offset = c->prog_data.control_data_header_size_hwords;
> -
> - /* We need to increment Global Offset by 1 to make room for Broadwell's
> - * extra "Vertex Count" payload at the beginning of the URB entry.
> - */
> - if (brw->gen >= 8)
> - inst->offset++;
> -
> - inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
> - return inst;
> -}
> -
> -
> -int
> -vec4_gs_visitor::compute_array_stride(ir_dereference_array *ir)
> -{
> - /* Geometry shader inputs are arrays, but they use an unusual array layout:
> - * instead of all array elements for a given geometry shader input being
> - * stored consecutively, all geometry shader inputs are interleaved into
> - * one giant array. At this stage of compilation, we assume that the
> - * stride of the array is BRW_VARYING_SLOT_COUNT. Later,
> - * setup_attributes() will remap our accesses to the actual input array.
> - */
> - ir_dereference_variable *deref_var = ir->array->as_dereference_variable();
> - if (deref_var && deref_var->var->data.mode == ir_var_shader_in)
> - return BRW_VARYING_SLOT_COUNT;
> - else
> - return vec4_visitor::compute_array_stride(ir);
> -}
> -
> -
> -/**
> - * Write out a batch of 32 control data bits from the control_data_bits
> - * register to the URB.
> - *
> - * The current value of the vertex_count register determines which DWORD in
> - * the URB receives the control data bits. The control_data_bits register is
> - * assumed to contain the correct data for the vertex that was most recently
> - * output, and all previous vertices that share the same DWORD.
> - *
> - * This function takes care of ensuring that if no vertices have been output
> - * yet, no control bits are emitted.
> - */
> -void
> -vec4_gs_visitor::emit_control_data_bits()
> -{
> - assert(c->control_data_bits_per_vertex != 0);
> -
> - /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
> - * granularity, we need to use two tricks to ensure that the batch of 32
> - * control data bits is written to the appropriate DWORD in the URB. To
> - * select which vec4 we are writing to, we use the "slot {0,1} offset"
> - * fields of the message header. To select which DWORD in the vec4 we are
> - * writing to, we use the channel mask fields of the message header. To
> - * avoid penalizing geometry shaders that emit a small number of vertices
> - * with extra bookkeeping, we only do each of these tricks when
> - * c->prog_data.control_data_header_size_bits is large enough to make it
> - * necessary.
> - *
> - * Note: this means that if we're outputting just a single DWORD of control
> - * data bits, we'll actually replicate it four times since we won't do any
> - * channel masking. But that's not a problem since in this case the
> - * hardware only pays attention to the first DWORD.
> - */
> - enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
> - if (c->control_data_header_size_bits > 32)
> - urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
> - if (c->control_data_header_size_bits > 128)
> - urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
> -
> - /* If vertex_count is 0, then no control data bits have been accumulated
> - * yet, so we should do nothing.
> - */
> - emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
> - emit(IF(BRW_PREDICATE_NORMAL));
> - {
> - /* If we are using either channel masks or a per-slot offset, then we
> - * need to figure out which DWORD we are trying to write to, using the
> - * formula:
> - *
> - * dword_index = (vertex_count - 1) * bits_per_vertex / 32
> - *
> - * Since bits_per_vertex is a power of two, and is known at compile
> - * time, this can be optimized to:
> - *
> - * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
> - */
> - src_reg dword_index(this, glsl_type::uint_type);
> - if (urb_write_flags) {
> - src_reg prev_count(this, glsl_type::uint_type);
> - emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
> - unsigned log2_bits_per_vertex =
> - _mesa_fls(c->control_data_bits_per_vertex);
> - emit(SHR(dst_reg(dword_index), prev_count,
> - (uint32_t) (6 - log2_bits_per_vertex)));
> - }
> -
> - /* Start building the URB write message. The first MRF gets a copy of
> - * R0.
> - */
> - int base_mrf = 1;
> - dst_reg mrf_reg(MRF, base_mrf);
> - src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> - vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> - inst->force_writemask_all = true;
> -
> - if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
> - /* Set the per-slot offset to dword_index / 4, so that we'll write to
> - * the appropriate OWORD within the control data header.
> - */
> - src_reg per_slot_offset(this, glsl_type::uint_type);
> - emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
> - emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
> - }
> -
> - if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
> - /* Set the channel masks to 1 << (dword_index % 4), so that we'll
> - * write to the appropriate DWORD within the OWORD. We need to do
> - * this computation with force_writemask_all, otherwise garbage data
> - * from invocation 0 might clobber the mask for invocation 1 when
> - * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
> - * together.
> - */
> - src_reg channel(this, glsl_type::uint_type);
> - inst = emit(AND(dst_reg(channel), dword_index, 3u));
> - inst->force_writemask_all = true;
> - src_reg one(this, glsl_type::uint_type);
> - inst = emit(MOV(dst_reg(one), 1u));
> - inst->force_writemask_all = true;
> - src_reg channel_mask(this, glsl_type::uint_type);
> - inst = emit(SHL(dst_reg(channel_mask), one, channel));
> - inst->force_writemask_all = true;
> - emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
> - channel_mask);
> - emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
> - }
> -
> - /* Store the control data bits in the message payload and send it. */
> - dst_reg mrf_reg2(MRF, base_mrf + 1);
> - inst = emit(MOV(mrf_reg2, this->control_data_bits));
> - inst->force_writemask_all = true;
> - inst = emit(GS_OPCODE_URB_WRITE);
> - inst->urb_write_flags = urb_write_flags;
> - /* We need to increment Global Offset by 256-bits to make room for
> - * Broadwell's extra "Vertex Count" payload at the beginning of the
> - * URB entry. Since this is an OWord message, Global Offset is counted
> - * in 128-bit units, so we must set it to 2.
> - */
> - if (brw->gen >= 8)
> - inst->offset = 2;
> - inst->base_mrf = base_mrf;
> - inst->mlen = 2;
> - }
> - emit(BRW_OPCODE_ENDIF);
> -}
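
(This is the densest part of the control-data machinery, so here is the
address arithmetic restated as scalar C. A sketch of what the emitted
instructions compute, not driver code; per the flags set at the top of the
function, the per-slot offset is only actually used when the header exceeds
128 bits and the channel mask only above 32 bits.)

    #include <stdint.h>

    static void
    control_data_write_address(uint32_t vertex_count,
                               uint32_t bits_per_vertex,   /* 1 or 2         */
                               uint32_t *per_slot_offset,  /* which OWord    */
                               uint32_t *channel)          /* DWORD in OWord */
    {
       uint32_t log2_bpv = (bits_per_vertex == 2) ? 1 : 0;
       /* Equals the code's (6 - _mesa_fls(bpv)), since fls(bpv) == log2(bpv) + 1. */
       uint32_t dword_index = (vertex_count - 1) >> (5 - log2_bpv);
       *per_slot_offset = dword_index / 4;   /* SHR by 2                     */
       *channel = dword_index % 4;           /* AND with 3, then SHL for mask */
    }
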
> -
> -void
> -vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
> -{
> - /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
> -
> - /* Note: we are calling this *before* increasing vertex_count, so
> - * this->vertex_count == vertex_count - 1 in the formula above.
> - */
> -
> - /* Stream mode uses 2 bits per vertex */
> - assert(c->control_data_bits_per_vertex == 2);
> -
> - /* Must be a valid stream */
> - assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
> -
> - /* Control data bits are initialized to 0 so we don't have to set any
> - * bits when sending vertices to stream 0.
> - */
> - if (stream_id == 0)
> - return;
> -
> - /* reg::sid = stream_id */
> - src_reg sid(this, glsl_type::uint_type);
> - emit(MOV(dst_reg(sid), stream_id));
> -
> - /* reg:shift_count = 2 * (vertex_count - 1) */
> - src_reg shift_count(this, glsl_type::uint_type);
> - emit(SHL(dst_reg(shift_count), this->vertex_count, 1u));
> -
> - /* Note: we're relying on the fact that the GEN SHL instruction only pays
> - * attention to the lower 5 bits of its second source argument, so on this
> - * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
> - * stream_id << ((2 * (vertex_count - 1)) % 32).
> - */
> - src_reg mask(this, glsl_type::uint_type);
> - emit(SHL(dst_reg(mask), sid, shift_count));
> - emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
> -}
> -
> -void
> -vec4_gs_visitor::visit(ir_emit_vertex *ir)
> -{
> - this->current_annotation = "emit vertex: safety check";
> -
> - /* To ensure that we don't output more vertices than the shader specified
> - * using max_vertices, do the logic inside a conditional of the form "if
> - * (vertex_count < MAX)"
> - */
> - unsigned num_output_vertices = c->gp->program.VerticesOut;
> - emit(CMP(dst_null_d(), this->vertex_count,
> - src_reg(num_output_vertices), BRW_CONDITIONAL_L));
> - emit(IF(BRW_PREDICATE_NORMAL));
> - {
> - /* If we're outputting 32 control data bits or less, then we can wait
> - * until the shader is over to output them all. Otherwise we need to
> - * output them as we go. Now is the time to do it, since we're about to
> - * output the vertex_count'th vertex, so it's guaranteed that the
> - * control data bits associated with the (vertex_count - 1)th vertex are
> - * correct.
> - */
> - if (c->control_data_header_size_bits > 32) {
> - this->current_annotation = "emit vertex: emit control data bits";
> - /* Only emit control data bits if we've finished accumulating a batch
> - * of 32 bits. This is the case when:
> - *
> - * (vertex_count * bits_per_vertex) % 32 == 0
> - *
> - * (in other words, when the last 5 bits of vertex_count *
> - * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
> - * integer n (which is always the case, since bits_per_vertex is
> - * always 1 or 2), this is equivalent to requiring that the last 5-n
> - * bits of vertex_count are 0:
> - *
> - * vertex_count & (2^(5-n) - 1) == 0
> - *
> - * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
> - * equivalent to:
> - *
> - * vertex_count & (32 / bits_per_vertex - 1) == 0
> - */
> - vec4_instruction *inst =
> - emit(AND(dst_null_d(), this->vertex_count,
> - (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
> - inst->conditional_mod = BRW_CONDITIONAL_Z;
> - emit(IF(BRW_PREDICATE_NORMAL));
> - {
> - emit_control_data_bits();
> -
> - /* Reset control_data_bits to 0 so we can start accumulating a new
> - * batch.
> - *
> - * Note: in the case where vertex_count == 0, this neutralizes the
> - * effect of any call to EndPrimitive() that the shader may have
> - * made before outputting its first vertex.
> - */
> - inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
> - inst->force_writemask_all = true;
> - }
> - emit(BRW_OPCODE_ENDIF);
> - }
> -
> - this->current_annotation = "emit vertex: vertex data";
> - emit_vertex();
> -
> - /* In stream mode we have to set control data bits for all vertices
> - * unless we have disabled control data bits completely (which we do
> - * for GL_POINTS outputs that don't use streams).
> - */
> - if (c->control_data_header_size_bits > 0 &&
> - c->prog_data.control_data_format ==
> - GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
> - this->current_annotation = "emit vertex: Stream control data bits";
> - set_stream_control_data_bits(ir->stream_id());
> - }
> -
> - this->current_annotation = "emit vertex: increment vertex count";
> - emit(ADD(dst_reg(this->vertex_count), this->vertex_count,
> - src_reg(1u)));
> - }
> - emit(BRW_OPCODE_ENDIF);
> -
> - this->current_annotation = NULL;
> -}
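
(The "finished a batch of 32 bits" test above is compact enough that a helper
with a couple of worked numbers might be clearer; the function name is mine.)

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    finished_control_data_batch(uint32_t vertex_count, uint32_t bits_per_vertex)
    {
       /* (vertex_count * bits_per_vertex) % 32 == 0 without the multiply:
        * true every 32 vertices for cut bits (bpv == 1), every 16 vertices
        * for stream IDs (bpv == 2), and also at vertex_count == 0, which is
        * what resets control_data_bits before the very first vertex. */
       return (vertex_count & (32 / bits_per_vertex - 1)) == 0;
    }
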
> -
> -void
> -vec4_gs_visitor::visit(ir_end_primitive *)
> -{
> - /* We can only do EndPrimitive() functionality when the control data
> - * consists of cut bits. Fortunately, the only time it isn't is when the
> - * output type is points, in which case EndPrimitive() is a no-op.
> - */
> - if (c->prog_data.control_data_format !=
> - GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
> - return;
> - }
> -
> - /* Cut bits use one bit per vertex. */
> - assert(c->control_data_bits_per_vertex == 1);
> -
> - /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
> - * vertex n, 0 otherwise. So all we need to do here is mark bit
> - * (vertex_count - 1) % 32 in the cut_bits register to indicate that
> - * EndPrimitive() was called after emitting vertex (vertex_count - 1);
> - * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
> - *
> - * Note that if EndPrimitive() is called before emitting any vertices, this
> - * will cause us to set bit 31 of the control_data_bits register to 1.
> - * That's fine because:
> - *
> - * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
> - * output, so the hardware will ignore cut bit 31.
> - *
> - * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
> - * last vertex, so setting cut bit 31 has no effect (since the primitive
> - * is automatically ended when the GS terminates).
> - *
> - * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
> - * control_data_bits register to 0 when the first vertex is emitted.
> - */
> -
> - /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
> - src_reg one(this, glsl_type::uint_type);
> - emit(MOV(dst_reg(one), 1u));
> - src_reg prev_count(this, glsl_type::uint_type);
> - emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
> - src_reg mask(this, glsl_type::uint_type);
> - /* Note: we're relying on the fact that the GEN SHL instruction only pays
> - * attention to the lower 5 bits of its second source argument, so on this
> - * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
> - * ((vertex_count - 1) % 32).
> - */
> - emit(SHL(dst_reg(mask), one, prev_count));
> - emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
> -}
> -
> -static const unsigned *
> -generate_assembly(struct brw_context *brw,
> - struct gl_shader_program *shader_prog,
> - struct gl_program *prog,
> - struct brw_vue_prog_data *prog_data,
> - void *mem_ctx,
> - const cfg_t *cfg,
> - unsigned *final_assembly_size)
> -{
> - vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx,
> - INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
> - return g.generate_assembly(cfg, final_assembly_size);
> -}
> -
> -extern "C" const unsigned *
> -brw_gs_emit(struct brw_context *brw,
> - struct gl_shader_program *prog,
> - struct brw_gs_compile *c,
> - void *mem_ctx,
> - unsigned *final_assembly_size)
> -{
> - if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
> - struct brw_shader *shader =
> - (brw_shader *) prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
> -
> - brw_dump_ir("geometry", prog, &shader->base, NULL);
> - }
> -
> - if (brw->gen >= 7) {
> - /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
> - * so without spilling. If the GS invocations count > 1, then we can't use
> - * dual object mode.
> - */
> - if (c->prog_data.invocations <= 1 &&
> - likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
> - c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_OBJECT;
> -
> - vec4_gs_visitor v(brw, c, prog, mem_ctx, true /* no_spills */);
> - if (v.run()) {
> - return generate_assembly(brw, prog, &c->gp->program.Base,
> - &c->prog_data.base, mem_ctx, v.cfg,
> - final_assembly_size);
> - }
> - }
> - }
> -
> - /* Either we failed to compile in DUAL_OBJECT mode (probably because it
> - * would have required spilling) or DUAL_OBJECT mode is disabled. So fall
> - * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
> - *
> - * FIXME: Single dispatch mode requires that the driver can handle
> - * interleaving of input registers, but this is already supported (dual
> - * instance mode has the same requirement). However, to take full advantage
> - * of single dispatch mode to reduce register pressure we would also need to
> - * do interleaved outputs, but currently, the vec4 visitor and generator
> - * classes do not support this, so at the moment register pressure in
> - * single and dual instance modes is the same.
> - *
> - * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
> - * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
> - * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
> - * is also supported. When InstanceCount=1 (one instance per object) software
> - * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
> - * the best choice for performance, followed by SINGLE mode."
> - *
> - * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
> - * mode is more performant when invocations > 1. Gen6 only supports
> - * SINGLE mode.
> - */
> - if (c->prog_data.invocations <= 1 || brw->gen < 7)
> - c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_SINGLE;
> - else
> - c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE;
> -
> - vec4_gs_visitor *gs = NULL;
> - const unsigned *ret = NULL;
> -
> - if (brw->gen >= 7)
> - gs = new vec4_gs_visitor(brw, c, prog, mem_ctx, false /* no_spills */);
> - else
> - gs = new gen6_gs_visitor(brw, c, prog, mem_ctx, false /* no_spills */);
> -
> - if (!gs->run()) {
> - prog->LinkStatus = false;
> - ralloc_strcat(&prog->InfoLog, gs->fail_msg);
> - } else {
> - ret = generate_assembly(brw, prog, &c->gp->program.Base,
> - &c->prog_data.base, mem_ctx, gs->cfg,
> - final_assembly_size);
> - }
> -
> - delete gs;
> - return ret;
> -}
> -
> -
> -} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
> deleted file mode 100644
> index bcb5a2b..0000000
> --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
> +++ /dev/null
> @@ -1,103 +0,0 @@
> -/*
> - * Copyright © 2013 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - */
> -
> -/**
> - * \file brw_vec4_gs_visitor.h
> - *
> - * Geometry-shader-specific code derived from the vec4_visitor class.
> - */
> -
> -#ifndef BRW_VEC4_GS_VISITOR_H
> -#define BRW_VEC4_GS_VISITOR_H
> -
> -#include "brw_vec4.h"
> -
> -/**
> - * Scratch data used when compiling a GLSL geometry shader.
> - */
> -struct brw_gs_compile
> -{
> - struct brw_vec4_compile base;
> - struct brw_gs_prog_key key;
> - struct brw_gs_prog_data prog_data;
> - struct brw_vue_map input_vue_map;
> -
> - struct brw_geometry_program *gp;
> -
> - unsigned control_data_bits_per_vertex;
> - unsigned control_data_header_size_bits;
> -};
> -
> -#ifdef __cplusplus
> -extern "C" {
> -#endif
> -
> -const unsigned *brw_gs_emit(struct brw_context *brw,
> - struct gl_shader_program *prog,
> - struct brw_gs_compile *c,
> - void *mem_ctx,
> - unsigned *final_assembly_size);
> -
> -#ifdef __cplusplus
> -} /* extern "C" */
> -#endif
> -
> -#ifdef __cplusplus
> -namespace brw {
> -
> -class vec4_gs_visitor : public vec4_visitor
> -{
> -public:
> - vec4_gs_visitor(struct brw_context *brw,
> - struct brw_gs_compile *c,
> - struct gl_shader_program *prog,
> - void *mem_ctx,
> - bool no_spills);
> -
> -protected:
> - virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
> - virtual void setup_payload();
> - virtual void emit_prolog();
> - virtual void emit_program_code();
> - virtual void emit_thread_end();
> - virtual void emit_urb_write_header(int mrf);
> - virtual vec4_instruction *emit_urb_write_opcode(bool complete);
> - virtual int compute_array_stride(ir_dereference_array *ir);
> - virtual void visit(ir_emit_vertex *);
> - virtual void visit(ir_end_primitive *);
> -
> -protected:
> - int setup_varying_inputs(int payload_reg, int *attribute_map,
> - int attributes_per_reg);
> - void emit_control_data_bits();
> - void set_stream_control_data_bits(unsigned stream_id);
> -
> - src_reg vertex_count;
> - src_reg control_data_bits;
> - const struct brw_gs_compile * const c;
> -};
> -
> -} /* namespace brw */
> -#endif /* __cplusplus */
> -
> -#endif /* BRW_VEC4_GS_VISITOR_H */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
> index 95b9d90..8ef0acb 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
> @@ -228,7 +228,7 @@ vec4_live_variables::~vec4_live_variables()
> * for register allocation performance.
> */
> void
> -vec4_visitor::calculate_live_intervals()
> +vec4_god::calculate_live_intervals()
> {
> if (this->live_intervals)
> return;
> @@ -304,14 +304,14 @@ vec4_visitor::calculate_live_intervals()
> }
>
> void
> -vec4_visitor::invalidate_live_intervals()
> +vec4_god::invalidate_live_intervals()
> {
> ralloc_free(live_intervals);
> live_intervals = NULL;
> }
>
> int
> -vec4_visitor::var_range_start(unsigned v, unsigned n) const
> +vec4_god::var_range_start(unsigned v, unsigned n) const
> {
> int start = INT_MAX;
>
> @@ -322,7 +322,7 @@ vec4_visitor::var_range_start(unsigned v, unsigned n) const
> }
>
> int
> -vec4_visitor::var_range_end(unsigned v, unsigned n) const
> +vec4_god::var_range_end(unsigned v, unsigned n) const
> {
> int end = INT_MIN;
>
> @@ -333,7 +333,7 @@ vec4_visitor::var_range_end(unsigned v, unsigned n) const
> }
>
> bool
> -vec4_visitor::virtual_grf_interferes(int a, int b)
> +vec4_god::virtual_grf_interferes(int a, int b)
> {
> return !((var_range_end(4 * alloc.offsets[a], 4 * alloc.sizes[a]) <=
> var_range_start(4 * alloc.offsets[b], 4 * alloc.sizes[b])) ||
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
> index 3186824..5016a7c 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
> @@ -42,7 +42,7 @@ assign(unsigned int *reg_hw_locations, backend_reg *reg)
> }
>
> bool
> -vec4_visitor::reg_allocate_trivial()
> +vec4_god::reg_allocate_trivial()
> {
> unsigned int hw_reg_mapping[this->alloc.count];
> bool virtual_grf_used[this->alloc.count];
> @@ -166,7 +166,7 @@ brw_vec4_alloc_reg_set(struct intel_screen *screen)
> }
>
> void
> -vec4_visitor::setup_payload_interference(struct ra_graph *g,
> +vec4_god::setup_payload_interference(struct ra_graph *g,
> int first_payload_node,
> int reg_node_count)
> {
> @@ -190,7 +190,7 @@ vec4_visitor::setup_payload_interference(struct ra_graph *g,
> }
>
> bool
> -vec4_visitor::reg_allocate()
> +vec4_god::reg_allocate()
> {
> struct intel_screen *screen = brw->intelScreen;
> unsigned int hw_reg_mapping[alloc.count];
> @@ -267,7 +267,7 @@ vec4_visitor::reg_allocate()
> }
>
> void
> -vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
> +vec4_god::evaluate_spill_costs(float *spill_costs, bool *no_spill)
> {
> float loop_scale = 1.0;
>
> @@ -322,7 +322,7 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
> }
>
> int
> -vec4_visitor::choose_spill_reg(struct ra_graph *g)
> +vec4_god::choose_spill_reg(struct ra_graph *g)
> {
> float spill_costs[this->alloc.count];
> bool no_spill[this->alloc.count];
> @@ -338,7 +338,7 @@ vec4_visitor::choose_spill_reg(struct ra_graph *g)
> }
>
> void
> -vec4_visitor::spill_reg(int spill_reg_nr)
> +vec4_god::spill_reg(int spill_reg_nr)
> {
> assert(alloc.sizes[spill_reg_nr] == 1);
> unsigned int spill_offset = c->last_scratch++;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> deleted file mode 100644
> index 26a3b9f..0000000
> --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> +++ /dev/null
> @@ -1,3658 +0,0 @@
> -/*
> - * Copyright © 2011 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> - * IN THE SOFTWARE.
> - */
> -
> -#include "brw_vec4.h"
> -#include "brw_cfg.h"
> -#include "glsl/ir_uniform.h"
> -#include "program/sampler.h"
> -
> -namespace brw {
> -
> -vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
> - const src_reg &src0, const src_reg &src1,
> - const src_reg &src2)
> -{
> - this->opcode = opcode;
> - this->dst = dst;
> - this->src[0] = src0;
> - this->src[1] = src1;
> - this->src[2] = src2;
> - this->saturate = false;
> - this->force_writemask_all = false;
> - this->no_dd_clear = false;
> - this->no_dd_check = false;
> - this->writes_accumulator = false;
> - this->conditional_mod = BRW_CONDITIONAL_NONE;
> - this->predicate = BRW_PREDICATE_NONE;
> - this->predicate_inverse = false;
> - this->target = 0;
> - this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
> - this->shadow_compare = false;
> - this->ir = NULL;
> - this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
> - this->header_present = false;
> - this->flag_subreg = 0;
> - this->mlen = 0;
> - this->base_mrf = 0;
> - this->offset = 0;
> - this->annotation = NULL;
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit(vec4_instruction *inst)
> -{
> - inst->ir = this->base_ir;
> - inst->annotation = this->current_annotation;
> -
> - this->instructions.push_tail(inst);
> -
> - return inst;
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
> - vec4_instruction *new_inst)
> -{
> - new_inst->ir = inst->ir;
> - new_inst->annotation = inst->annotation;
> -
> - inst->insert_before(block, new_inst);
> -
> - return inst;
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
> - const src_reg &src1, const src_reg &src2)
> -{
> - return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
> -}
> -
> -
> -vec4_instruction *
> -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
> - const src_reg &src1)
> -{
> - return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
> -{
> - return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
> -{
> - return emit(new(mem_ctx) vec4_instruction(opcode, dst));
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit(enum opcode opcode)
> -{
> - return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
> -}
> -
> -#define ALU1(op) \
> - vec4_instruction * \
> - vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
> - { \
> - return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
> - }
> -
> -#define ALU2(op) \
> - vec4_instruction * \
> - vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
> - const src_reg &src1) \
> - { \
> - return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
> - src0, src1); \
> - }
> -
> -#define ALU2_ACC(op) \
> - vec4_instruction * \
> - vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
> - const src_reg &src1) \
> - { \
> - vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
> - BRW_OPCODE_##op, dst, src0, src1); \
> - inst->writes_accumulator = true; \
> - return inst; \
> - }
> -
> -#define ALU3(op) \
> - vec4_instruction * \
> - vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
> - const src_reg &src1, const src_reg &src2) \
> - { \
> - assert(brw->gen >= 6); \
> - return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
> - src0, src1, src2); \
> - }
> -
> -ALU1(NOT)
> -ALU1(MOV)
> -ALU1(FRC)
> -ALU1(RNDD)
> -ALU1(RNDE)
> -ALU1(RNDZ)
> -ALU1(F32TO16)
> -ALU1(F16TO32)
> -ALU2(ADD)
> -ALU2(MUL)
> -ALU2_ACC(MACH)
> -ALU2(AND)
> -ALU2(OR)
> -ALU2(XOR)
> -ALU2(DP3)
> -ALU2(DP4)
> -ALU2(DPH)
> -ALU2(SHL)
> -ALU2(SHR)
> -ALU2(ASR)
> -ALU3(LRP)
> -ALU1(BFREV)
> -ALU3(BFE)
> -ALU2(BFI1)
> -ALU3(BFI2)
> -ALU1(FBH)
> -ALU1(FBL)
> -ALU1(CBIT)
> -ALU3(MAD)
> -ALU2_ACC(ADDC)
> -ALU2_ACC(SUBB)
> -ALU2(MAC)
> -
> -/** Gen4 predicated IF. */
> -vec4_instruction *
> -vec4_visitor::IF(enum brw_predicate predicate)
> -{
> - vec4_instruction *inst;
> -
> - inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
> - inst->predicate = predicate;
> -
> - return inst;
> -}
> -
> -/** Gen6 IF with embedded comparison. */
> -vec4_instruction *
> -vec4_visitor::IF(src_reg src0, src_reg src1,
> - enum brw_conditional_mod condition)
> -{
> - assert(brw->gen == 6);
> -
> - vec4_instruction *inst;
> -
> - resolve_ud_negate(&src0);
> - resolve_ud_negate(&src1);
> -
> - inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
> - src0, src1);
> - inst->conditional_mod = condition;
> -
> - return inst;
> -}
> -
> -/**
> - * CMP: Sets the low bit of the destination channels with the result
> - * of the comparison, while the upper bits are undefined, and updates
> - * the flag register with the packed 16 bits of the result.
> - */
> -vec4_instruction *
> -vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
> - enum brw_conditional_mod condition)
> -{
> - vec4_instruction *inst;
> -
> - /* Take the instruction:
> - *
> - * CMP null<d> src0<f> src1<f>
> - *
> - * Original gen4 does type conversion to the destination type before
> - * comparison, producing garbage results for floating point comparisons.
> - *
> - * The destination type doesn't matter on newer generations, so we set the
> - * type to match src0 so we can compact the instruction.
> - */
> - dst.type = src0.type;
> - if (dst.file == HW_REG)
> - dst.fixed_hw_reg.type = dst.type;
> -
> - resolve_ud_negate(&src0);
> - resolve_ud_negate(&src1);
> -
> - inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
> - inst->conditional_mod = condition;
> -
> - return inst;
> -}
> -
> -vec4_instruction *
> -vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
> -{
> - vec4_instruction *inst;
> -
> - inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
> - dst, index);
> - inst->base_mrf = 14;
> - inst->mlen = 2;
> -
> - return inst;
> -}
> -
> -vec4_instruction *
> -vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
> - const src_reg &index)
> -{
> - vec4_instruction *inst;
> -
> - inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
> - dst, src, index);
> - inst->base_mrf = 13;
> - inst->mlen = 3;
> -
> - return inst;
> -}
> -
> -void
> -vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
> -{
> - static enum opcode dot_opcodes[] = {
> - BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
> - };
> -
> - emit(dot_opcodes[elements - 2], dst, src0, src1);
> -}
> -
> -src_reg
> -vec4_visitor::fix_3src_operand(src_reg src)
> -{
> - /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
> - * able to use vertical stride of zero to replicate the vec4 uniform, like
> - *
> - * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
> - *
> - * But you can't, since vertical stride is always four in three-source
> - * instructions. Instead, insert a MOV instruction to do the replication so
> - * that the three-source instruction can consume it.
> - */
> -
> - /* The MOV is only needed if the source is a uniform or immediate. */
> - if (src.file != UNIFORM && src.file != IMM)
> - return src;
> -
> - if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
> - return src;
> -
> - dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
> - expanded.type = src.type;
> - emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
> - return src_reg(expanded);
> -}
> -
> -src_reg
> -vec4_visitor::fix_math_operand(src_reg src)
> -{
> - if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
> - return src;
> -
> - /* The gen6 math instruction ignores the source modifiers --
> - * swizzle, abs, negate, and at least some parts of the register
> - * region description.
> - *
> - * Rather than trying to enumerate all these cases, *always* expand the
> - * operand to a temp GRF for gen6.
> - *
> - * For gen7, keep the operand as-is, except if immediate, which gen7 still
> - * can't use.
> - */
> -
> - if (brw->gen == 7 && src.file != IMM)
> - return src;
> -
> - dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
> - expanded.type = src.type;
> - emit(MOV(expanded, src));
> - return src_reg(expanded);
> -}
> -
> -void
> -vec4_visitor::emit_math(enum opcode opcode,
> - const dst_reg &dst,
> - const src_reg &src0, const src_reg &src1)
> -{
> - vec4_instruction *math =
> - emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
> -
> - if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
> - /* MATH on Gen6 must be align1, so we can't do writemasks. */
> - math->dst = dst_reg(this, glsl_type::vec4_type);
> - math->dst.type = dst.type;
> - emit(MOV(dst, src_reg(math->dst)));
> - } else if (brw->gen < 6) {
> - math->base_mrf = 1;
> - math->mlen = src1.file == BAD_FILE ? 1 : 2;
> - }
> -}
> -
> -void
> -vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
> -{
> - if (brw->gen < 7) {
> - unreachable("ir_unop_pack_half_2x16 should be lowered");
> - }
> -
> - assert(dst.type == BRW_REGISTER_TYPE_UD);
> - assert(src0.type == BRW_REGISTER_TYPE_F);
> -
> - /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
> - *
> - * Because this instruction does not have a 16-bit floating-point type,
> - * the destination data type must be Word (W).
> - *
> - * The destination must be DWord-aligned and specify a horizontal stride
> - * (HorzStride) of 2. The 16-bit result is stored in the lower word of
> - * each destination channel and the upper word is not modified.
> - *
> - * The above restriction implies that the f32to16 instruction must use
> - * align1 mode, because only in align1 mode is it possible to specify
> - * horizontal stride. We choose here to defy the hardware docs and emit
> - * align16 instructions.
> - *
> - * (I [chadv] did attempt to emit align1 instructions for VS f32to16
> - * instructions. I was partially successful in that the code passed all
> - * tests. However, the code was dubiously correct and fragile, and the
> - * tests were not harsh enough to probe that frailty. Not trusting the
> - * code, I chose instead to remain in align16 mode in defiance of the hw
> - * docs).
> - *
> - * I've [chadv] experimentally confirmed that, on gen7 hardware and the
> - * simulator, emitting a f32to16 in align16 mode with UD as destination
> - * data type is safe. The behavior differs from that specified in the PRM
> - * in that the upper word of each destination channel is cleared to 0.
> - */
> -
> - dst_reg tmp_dst(this, glsl_type::uvec2_type);
> - src_reg tmp_src(tmp_dst);
> -
> -#if 0
> - /* Verify the undocumented behavior on which the following instructions
> - * rely. If f32to16 fails to clear the upper word of the X and Y channels,
> - * then the result of the bit-or instruction below will be incorrect.
> - *
> - * You should inspect the disasm output in order to verify that the MOV is
> - * not optimized away.
> - */
> - emit(MOV(tmp_dst, src_reg(0x12345678u)));
> -#endif
> -
> - /* Give tmp the form below, where "." means untouched.
> - *
> - * w z y x w z y x
> - * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
> - *
> - * That the upper word of each write-channel be 0 is required for the
> - * following bit-shift and bit-or instructions to work. Note that this
> - * relies on the undocumented hardware behavior mentioned above.
> - */
> - tmp_dst.writemask = WRITEMASK_XY;
> - emit(F32TO16(tmp_dst, src0));
> -
> - /* Give the write-channels of dst the form:
> - * 0xhhhh0000
> - */
> - tmp_src.swizzle = BRW_SWIZZLE_YYYY;
> - emit(SHL(dst, tmp_src, src_reg(16u)));
> -
> - /* Finally, give the write-channels of dst the form of packHalf2x16's
> - * output:
> - * 0xhhhhllll
> - */
> - tmp_src.swizzle = BRW_SWIZZLE_XXXX;
> - emit(OR(dst, src_reg(dst), tmp_src));
> -}
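
(The end result of the align16 dance above has the usual packHalf2x16 shape;
as a sketch, assuming half_x/half_y are the f32to16 conversions of the
source's x and y channels:)

    #include <stdint.h>

    static inline uint32_t
    pack_half_2x16(uint16_t half_x, uint16_t half_y)
    {
       /* tmp.x = 0x0000xxxx, tmp.y = 0x0000yyyy after F32TO16; then SHL + OR: */
       return ((uint32_t)half_y << 16) | half_x;
    }
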
> -
> -void
> -vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
> -{
> - if (brw->gen < 7) {
> - unreachable("ir_unop_unpack_half_2x16 should be lowered");
> - }
> -
> - assert(dst.type == BRW_REGISTER_TYPE_F);
> - assert(src0.type == BRW_REGISTER_TYPE_UD);
> -
> - /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
> - *
> - * Because this instruction does not have a 16-bit floating-point type,
> - * the source data type must be Word (W). The destination type must be
> - * F (Float).
> - *
> - * To use W as the source data type, we must adjust horizontal strides,
> - * which is only possible in align1 mode. All my [chadv] attempts at
> - * emitting align1 instructions for unpackHalf2x16 failed to pass the
> - * Piglit tests, so I gave up.
> - *
> - * I've verified that, on gen7 hardware and the simulator, it is safe to
> - * emit f16to32 in align16 mode with UD as source data type.
> - */
> -
> - dst_reg tmp_dst(this, glsl_type::uvec2_type);
> - src_reg tmp_src(tmp_dst);
> -
> - tmp_dst.writemask = WRITEMASK_X;
> - emit(AND(tmp_dst, src0, src_reg(0xffffu)));
> -
> - tmp_dst.writemask = WRITEMASK_Y;
> - emit(SHR(tmp_dst, src0, src_reg(16u)));
> -
> - dst.writemask = WRITEMASK_XY;
> - emit(F16TO32(dst, tmp_src));
> -}
> -
> -void
> -vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
> -{
> - /* Instead of splitting the 32-bit integer, shifting, and ORing it back
> - * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
> - * is not suitable to generate the shift values, but we can use the packed
> - * vector float and a type-converting MOV.
> - */
> - dst_reg shift(this, glsl_type::uvec4_type);
> - emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
> -
> - dst_reg shifted(this, glsl_type::uvec4_type);
> - src0.swizzle = BRW_SWIZZLE_XXXX;
> - emit(SHR(shifted, src0, src_reg(shift)));
> -
> - shifted.type = BRW_REGISTER_TYPE_UB;
> - dst_reg f(this, glsl_type::vec4_type);
> - emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
> -
> - emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
> -}
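
(Scalar equivalent of the sequence above, for reference; the
0x00/0x60/0x70/0x78 immediates are 0, 8, 16 and 24 in the packed vector-float
encoding, hence the per-channel shifts below.)

    #include <stdint.h>

    static inline void
    unpack_unorm_4x8(uint32_t v, float dst[4])
    {
       for (int i = 0; i < 4; i++)
          dst[i] = (float)((v >> (8 * i)) & 0xff) / 255.0f;  /* SHR, UB MOV, MUL */
    }
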
> -
> -void
> -vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
> -{
> - /* Instead of splitting the 32-bit integer, shifting, and ORing it back
> - * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
> - * is not suitable to generate the shift values, but we can use the packed
> - * vector float and a type-converting MOV.
> - */
> - dst_reg shift(this, glsl_type::uvec4_type);
> - emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
> -
> - dst_reg shifted(this, glsl_type::uvec4_type);
> - src0.swizzle = BRW_SWIZZLE_XXXX;
> - emit(SHR(shifted, src0, src_reg(shift)));
> -
> - shifted.type = BRW_REGISTER_TYPE_B;
> - dst_reg f(this, glsl_type::vec4_type);
> - emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
> -
> - dst_reg scaled(this, glsl_type::vec4_type);
> - emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
> -
> - dst_reg max(this, glsl_type::vec4_type);
> - emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
> - emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
> -}
> -
> -void
> -vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
> -{
> - dst_reg saturated(this, glsl_type::vec4_type);
> - vec4_instruction *inst = emit(MOV(saturated, src0));
> - inst->saturate = true;
> -
> - dst_reg scaled(this, glsl_type::vec4_type);
> - emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
> -
> - dst_reg rounded(this, glsl_type::vec4_type);
> - emit(RNDE(rounded, src_reg(scaled)));
> -
> - dst_reg u(this, glsl_type::uvec4_type);
> - emit(MOV(u, src_reg(rounded)));
> -
> - src_reg bytes(u);
> - emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
> -}
> -
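
The pack path above is the mirror image of the unpack sketched earlier:
saturate, scale by 255, round to even, pack bytes. A scalar reference
for those semantics, again standalone C rather than anything from the
patch itself:

   #include <math.h>
   #include <stdint.h>

   /* Clamp each component to [0, 1], scale by 255, round to nearest
    * even (matching RNDE's behaviour) and pack it into byte i. */
   static uint32_t pack_unorm_4x8_ref(const float v[4])
   {
      uint32_t u = 0;
      for (int i = 0; i < 4; i++) {
         float c = fminf(fmaxf(v[i], 0.0f), 1.0f);
         u |= (uint32_t)rintf(c * 255.0f) << (8 * i);
      }
      return u;
   }
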
> -void
> -vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
> -{
> - dst_reg max(this, glsl_type::vec4_type);
> - emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
> -
> - dst_reg min(this, glsl_type::vec4_type);
> - emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
> -
> - dst_reg scaled(this, glsl_type::vec4_type);
> - emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
> -
> - dst_reg rounded(this, glsl_type::vec4_type);
> - emit(RNDE(rounded, src_reg(scaled)));
> -
> - dst_reg i(this, glsl_type::ivec4_type);
> - emit(MOV(i, src_reg(rounded)));
> -
> - src_reg bytes(i);
> - emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
> -}
> -
> -void
> -vec4_visitor::visit_instructions(const exec_list *list)
> -{
> - foreach_in_list(ir_instruction, ir, list) {
> - base_ir = ir;
> - ir->accept(this);
> - }
> -}
> -
> -
> -static int
> -type_size(const struct glsl_type *type)
> -{
> - unsigned int i;
> - int size;
> -
> - switch (type->base_type) {
> - case GLSL_TYPE_UINT:
> - case GLSL_TYPE_INT:
> - case GLSL_TYPE_FLOAT:
> - case GLSL_TYPE_BOOL:
> - if (type->is_matrix()) {
> - return type->matrix_columns;
> - } else {
> - /* Regardless of the size of the vector, it gets a vec4. This is bad
> - * packing for things like floats, but otherwise arrays become a
> - * mess. Hopefully a later pass over the code can pack scalars
> - * down if appropriate.
> - */
> - return 1;
> - }
> - case GLSL_TYPE_ARRAY:
> - assert(type->length > 0);
> - return type_size(type->fields.array) * type->length;
> - case GLSL_TYPE_STRUCT:
> - size = 0;
> - for (i = 0; i < type->length; i++) {
> - size += type_size(type->fields.structure[i].type);
> - }
> - return size;
> - case GLSL_TYPE_SAMPLER:
> - /* Samplers take up no register space, since they're baked in at
> - * link time.
> - */
> - return 0;
> - case GLSL_TYPE_ATOMIC_UINT:
> - return 0;
> - case GLSL_TYPE_IMAGE:
> - case GLSL_TYPE_VOID:
> - case GLSL_TYPE_DOUBLE:
> - case GLSL_TYPE_ERROR:
> - case GLSL_TYPE_INTERFACE:
> - unreachable("not reached");
> - }
> -
> - return 0;
> -}
> -
> -src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
> -{
> - init();
> -
> - this->file = GRF;
> - this->reg = v->alloc.allocate(type_size(type));
> -
> - if (type->is_array() || type->is_record()) {
> - this->swizzle = BRW_SWIZZLE_NOOP;
> - } else {
> - this->swizzle = brw_swizzle_for_size(type->vector_elements);
> - }
> -
> - this->type = brw_type_for_base_type(type);
> -}
> -
> -src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
> -{
> - assert(size > 0);
> -
> - init();
> -
> - this->file = GRF;
> - this->reg = v->alloc.allocate(type_size(type) * size);
> -
> - this->swizzle = BRW_SWIZZLE_NOOP;
> -
> - this->type = brw_type_for_base_type(type);
> -}
> -
> -dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
> -{
> - init();
> -
> - this->file = GRF;
> - this->reg = v->alloc.allocate(type_size(type));
> -
> - if (type->is_array() || type->is_record()) {
> - this->writemask = WRITEMASK_XYZW;
> - } else {
> - this->writemask = (1 << type->vector_elements) - 1;
> - }
> -
> - this->type = brw_type_for_base_type(type);
> -}
> -
> -/* Our support for uniforms is piggy-backed on the struct
> - * gl_fragment_program, because that's where the values actually
> - * get stored, rather than in some global gl_shader_program uniform
> - * store.
> - */
> -void
> -vec4_visitor::setup_uniform_values(ir_variable *ir)
> -{
> - int namelen = strlen(ir->name);
> -
> - /* The data for our (non-builtin) uniforms is stored in a series of
> - * gl_uniform_driver_storage structs for each subcomponent that
> - * glGetUniformLocation() could name. We know it's been set up in the same
> - * order we'd walk the type, so walk the list of storage and find anything
> - * with our name, or the prefix of a component that starts with our name.
> - */
> - for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
> - struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
> -
> - if (strncmp(ir->name, storage->name, namelen) != 0 ||
> - (storage->name[namelen] != 0 &&
> - storage->name[namelen] != '.' &&
> - storage->name[namelen] != '[')) {
> - continue;
> - }
> -
> - gl_constant_value *components = storage->storage;
> - unsigned vector_count = (MAX2(storage->array_elements, 1) *
> - storage->type->matrix_columns);
> -
> - for (unsigned s = 0; s < vector_count; s++) {
> - assert(uniforms < uniform_array_size);
> - uniform_vector_size[uniforms] = storage->type->vector_elements;
> -
> - int i;
> - for (i = 0; i < uniform_vector_size[uniforms]; i++) {
> - stage_prog_data->param[uniforms * 4 + i] = components;
> - components++;
> - }
> - for (; i < 4; i++) {
> - static gl_constant_value zero = { 0.0 };
> - stage_prog_data->param[uniforms * 4 + i] = &zero;
> - }
> -
> - uniforms++;
> - }
> - }
> -}
> -
> -void
> -vec4_visitor::setup_uniform_clipplane_values()
> -{
> - gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
> -
> - for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
> - assert(this->uniforms < uniform_array_size);
> - this->uniform_vector_size[this->uniforms] = 4;
> - this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
> - this->userplane[i].type = BRW_REGISTER_TYPE_F;
> - for (int j = 0; j < 4; ++j) {
> - stage_prog_data->param[this->uniforms * 4 + j] =
> - (gl_constant_value *) &clip_planes[i][j];
> - }
> - ++this->uniforms;
> - }
> -}
> -
> -/* Our support for builtin uniforms is even scarier than non-builtin.
> - * It sits on top of the PROG_STATE_VAR parameters that are
> - * automatically updated from GL context state.
> - */
> -void
> -vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
> -{
> - const ir_state_slot *const slots = ir->get_state_slots();
> - assert(slots != NULL);
> -
> - for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
> - /* This state reference has already been setup by ir_to_mesa,
> - * but we'll get the same index back here. We can reference
> - * ParameterValues directly, since unlike brw_fs.cpp, we never
> - * add new state references during compile.
> - */
> - int index = _mesa_add_state_reference(this->prog->Parameters,
> - (gl_state_index *)slots[i].tokens);
> - gl_constant_value *values =
> - &this->prog->Parameters->ParameterValues[index][0];
> -
> - assert(this->uniforms < uniform_array_size);
> -
> - for (unsigned j = 0; j < 4; j++)
> - stage_prog_data->param[this->uniforms * 4 + j] =
> - &values[GET_SWZ(slots[i].swizzle, j)];
> -
> - this->uniform_vector_size[this->uniforms] =
> - (ir->type->is_scalar() || ir->type->is_vector() ||
> - ir->type->is_matrix() ? ir->type->vector_elements : 4);
> -
> - this->uniforms++;
> - }
> -}
> -
> -dst_reg *
> -vec4_visitor::variable_storage(ir_variable *var)
> -{
> - return (dst_reg *)hash_table_find(this->variable_ht, var);
> -}
> -
> -void
> -vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
> - enum brw_predicate *predicate)
> -{
> - ir_expression *expr = ir->as_expression();
> -
> - *predicate = BRW_PREDICATE_NORMAL;
> -
> - if (expr && expr->operation != ir_binop_ubo_load) {
> - src_reg op[3];
> - vec4_instruction *inst;
> -
> - assert(expr->get_num_operands() <= 3);
> - for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> - expr->operands[i]->accept(this);
> - op[i] = this->result;
> -
> - resolve_ud_negate(&op[i]);
> - }
> -
> - switch (expr->operation) {
> - case ir_unop_logic_not:
> - inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
> - inst->conditional_mod = BRW_CONDITIONAL_Z;
> - break;
> -
> - case ir_binop_logic_xor:
> - if (brw->gen <= 5) {
> - src_reg temp = src_reg(this, ir->type);
> - emit(XOR(dst_reg(temp), op[0], op[1]));
> - inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> - } else {
> - inst = emit(XOR(dst_null_d(), op[0], op[1]));
> - }
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - break;
> -
> - case ir_binop_logic_or:
> - if (brw->gen <= 5) {
> - src_reg temp = src_reg(this, ir->type);
> - emit(OR(dst_reg(temp), op[0], op[1]));
> - inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> - } else {
> - inst = emit(OR(dst_null_d(), op[0], op[1]));
> - }
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - break;
> -
> - case ir_binop_logic_and:
> - if (brw->gen <= 5) {
> - src_reg temp = src_reg(this, ir->type);
> - emit(AND(dst_reg(temp), op[0], op[1]));
> - inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> - } else {
> - inst = emit(AND(dst_null_d(), op[0], op[1]));
> - }
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - break;
> -
> - case ir_unop_f2b:
> - if (brw->gen >= 6) {
> - emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> - } else {
> - inst = emit(MOV(dst_null_f(), op[0]));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - }
> - break;
> -
> - case ir_unop_i2b:
> - if (brw->gen >= 6) {
> - emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> - } else {
> - inst = emit(MOV(dst_null_d(), op[0]));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - }
> - break;
> -
> - case ir_binop_all_equal:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(expr->operands[0], &op[0]);
> - resolve_bool_comparison(expr->operands[1], &op[1]);
> - }
> - inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> - *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
> - break;
> -
> - case ir_binop_any_nequal:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(expr->operands[0], &op[0]);
> - resolve_bool_comparison(expr->operands[1], &op[1]);
> - }
> - inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> - *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> - break;
> -
> - case ir_unop_any:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(expr->operands[0], &op[0]);
> - }
> - inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> - *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> - break;
> -
> - case ir_binop_greater:
> - case ir_binop_gequal:
> - case ir_binop_less:
> - case ir_binop_lequal:
> - case ir_binop_equal:
> - case ir_binop_nequal:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(expr->operands[0], &op[0]);
> - resolve_bool_comparison(expr->operands[1], &op[1]);
> - }
> - emit(CMP(dst_null_d(), op[0], op[1],
> - brw_conditional_for_comparison(expr->operation)));
> - break;
> -
> - case ir_triop_csel: {
> - /* Expand the boolean condition into the flag register. */
> - inst = emit(MOV(dst_null_d(), op[0]));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -
> - /* Select which boolean to return. */
> - dst_reg temp(this, expr->operands[1]->type);
> - inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
> - inst->predicate = BRW_PREDICATE_NORMAL;
> -
> - /* Expand the result to a condition code. */
> - inst = emit(MOV(dst_null_d(), src_reg(temp)));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> - break;
> - }
> -
> - default:
> - unreachable("not reached");
> - }
> - return;
> - }
> -
> - ir->accept(this);
> -
> - resolve_ud_negate(&this->result);
> -
> - vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -}
> -
> -/**
> - * Emit a gen6 IF statement with the comparison folded into the IF
> - * instruction.
> - */
> -void
> -vec4_visitor::emit_if_gen6(ir_if *ir)
> -{
> - ir_expression *expr = ir->condition->as_expression();
> -
> - if (expr && expr->operation != ir_binop_ubo_load) {
> - src_reg op[3];
> - dst_reg temp;
> -
> - assert(expr->get_num_operands() <= 3);
> - for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> - expr->operands[i]->accept(this);
> - op[i] = this->result;
> - }
> -
> - switch (expr->operation) {
> - case ir_unop_logic_not:
> - emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
> - return;
> -
> - case ir_binop_logic_xor:
> - emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
> - return;
> -
> - case ir_binop_logic_or:
> - temp = dst_reg(this, glsl_type::bool_type);
> - emit(OR(temp, op[0], op[1]));
> - emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> - return;
> -
> - case ir_binop_logic_and:
> - temp = dst_reg(this, glsl_type::bool_type);
> - emit(AND(temp, op[0], op[1]));
> - emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> - return;
> -
> - case ir_unop_f2b:
> - emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> - return;
> -
> - case ir_unop_i2b:
> - emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> - return;
> -
> - case ir_binop_greater:
> - case ir_binop_gequal:
> - case ir_binop_less:
> - case ir_binop_lequal:
> - case ir_binop_equal:
> - case ir_binop_nequal:
> - emit(IF(op[0], op[1],
> - brw_conditional_for_comparison(expr->operation)));
> - return;
> -
> - case ir_binop_all_equal:
> - emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> - emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
> - return;
> -
> - case ir_binop_any_nequal:
> - emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> - emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
> - return;
> -
> - case ir_unop_any:
> - emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> - emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
> - return;
> -
> - case ir_triop_csel: {
> - /* Expand the boolean condition into the flag register. */
> - vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
> - inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -
> - /* Select which boolean to return. */
> - dst_reg temp(this, expr->operands[1]->type);
> - inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
> - inst->predicate = BRW_PREDICATE_NORMAL;
> -
> - emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> - return;
> - }
> -
> - default:
> - unreachable("not reached");
> - }
> - return;
> - }
> -
> - ir->condition->accept(this);
> -
> - emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
> -}
> -
> -void
> -vec4_visitor::visit(ir_variable *ir)
> -{
> - dst_reg *reg = NULL;
> -
> - if (variable_storage(ir))
> - return;
> -
> - switch (ir->data.mode) {
> - case ir_var_shader_in:
> - assert(ir->data.location != -1);
> - reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
> - break;
> -
> - case ir_var_shader_out:
> - assert(ir->data.location != -1);
> - reg = new(mem_ctx) dst_reg(this, ir->type);
> -
> - for (int i = 0; i < type_size(ir->type); i++) {
> - output_reg[ir->data.location + i] = *reg;
> - output_reg[ir->data.location + i].reg_offset = i;
> - output_reg[ir->data.location + i].type =
> - brw_type_for_base_type(ir->type->get_scalar_type());
> - output_reg_annotation[ir->data.location + i] = ir->name;
> - }
> - break;
> -
> - case ir_var_auto:
> - case ir_var_temporary:
> - reg = new(mem_ctx) dst_reg(this, ir->type);
> - break;
> -
> - case ir_var_uniform:
> - reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
> -
> - /* Thanks to the lower_ubo_reference pass, we will see only
> - * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
> - * variables, so no need for them to be in variable_ht.
> - *
> - * Some uniforms, such as samplers and atomic counters, have no actual
> - * storage, so we should ignore them.
> - */
> - if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
> - return;
> -
> - /* Track how big the whole uniform variable is, in case we need to put a
> - * copy of its data into pull constants for array access.
> - */
> - assert(this->uniforms < uniform_array_size);
> - this->uniform_size[this->uniforms] = type_size(ir->type);
> -
> - if (!strncmp(ir->name, "gl_", 3)) {
> - setup_builtin_uniform_values(ir);
> - } else {
> - setup_uniform_values(ir);
> - }
> - break;
> -
> - case ir_var_system_value:
> - reg = make_reg_for_system_value(ir);
> - break;
> -
> - default:
> - unreachable("not reached");
> - }
> -
> - reg->type = brw_type_for_base_type(ir->type);
> - hash_table_insert(this->variable_ht, reg, ir);
> -}
> -
> -void
> -vec4_visitor::visit(ir_loop *ir)
> -{
> - /* We don't want debugging output to print the whole body of the
> - * loop as the annotation.
> - */
> - this->base_ir = NULL;
> -
> - emit(BRW_OPCODE_DO);
> -
> - visit_instructions(&ir->body_instructions);
> -
> - emit(BRW_OPCODE_WHILE);
> -}
> -
> -void
> -vec4_visitor::visit(ir_loop_jump *ir)
> -{
> - switch (ir->mode) {
> - case ir_loop_jump::jump_break:
> - emit(BRW_OPCODE_BREAK);
> - break;
> - case ir_loop_jump::jump_continue:
> - emit(BRW_OPCODE_CONTINUE);
> - break;
> - }
> -}
> -
> -
> -void
> -vec4_visitor::visit(ir_function_signature *)
> -{
> - unreachable("not reached");
> -}
> -
> -void
> -vec4_visitor::visit(ir_function *ir)
> -{
> - /* Ignore function bodies other than main() -- we shouldn't see calls to
> - * them since they should all be inlined.
> - */
> - if (strcmp(ir->name, "main") == 0) {
> - const ir_function_signature *sig;
> - exec_list empty;
> -
> - sig = ir->matching_signature(NULL, &empty, false);
> -
> - assert(sig);
> -
> - visit_instructions(&sig->body);
> - }
> -}
> -
> -bool
> -vec4_visitor::try_emit_mad(ir_expression *ir)
> -{
> - /* 3-src instructions were introduced in gen6. */
> - if (brw->gen < 6)
> - return false;
> -
> - /* MAD can only handle floating-point data. */
> - if (ir->type->base_type != GLSL_TYPE_FLOAT)
> - return false;
> -
> - ir_rvalue *nonmul;
> - ir_expression *mul;
> - bool mul_negate, mul_abs;
> -
> - for (int i = 0; i < 2; i++) {
> - mul_negate = false;
> - mul_abs = false;
> -
> - mul = ir->operands[i]->as_expression();
> - nonmul = ir->operands[1 - i];
> -
> - if (mul && mul->operation == ir_unop_abs) {
> - mul = mul->operands[0]->as_expression();
> - mul_abs = true;
> - } else if (mul && mul->operation == ir_unop_neg) {
> - mul = mul->operands[0]->as_expression();
> - mul_negate = true;
> - }
> -
> - if (mul && mul->operation == ir_binop_mul)
> - break;
> - }
> -
> - if (!mul || mul->operation != ir_binop_mul)
> - return false;
> -
> - nonmul->accept(this);
> - src_reg src0 = fix_3src_operand(this->result);
> -
> - mul->operands[0]->accept(this);
> - src_reg src1 = fix_3src_operand(this->result);
> - src1.negate ^= mul_negate;
> - src1.abs = mul_abs;
> - if (mul_abs)
> - src1.negate = false;
> -
> - mul->operands[1]->accept(this);
> - src_reg src2 = fix_3src_operand(this->result);
> - src2.abs = mul_abs;
> - if (mul_abs)
> - src2.negate = false;
> -
> - this->result = src_reg(this, ir->type);
> - emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
> -
> - return true;
> -}
> -
> -bool
> -vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
> -{
> - /* This optimization relies on CMP setting the destination to 0 when
> - * false. Early hardware only sets the least significant bit, and
> - * leaves the other bits undefined. So we can't use it.
> - */
> - if (brw->gen < 6)
> - return false;
> -
> - ir_expression *const cmp = ir->operands[0]->as_expression();
> -
> - if (cmp == NULL)
> - return false;
> -
> - switch (cmp->operation) {
> - case ir_binop_less:
> - case ir_binop_greater:
> - case ir_binop_lequal:
> - case ir_binop_gequal:
> - case ir_binop_equal:
> - case ir_binop_nequal:
> - break;
> -
> - default:
> - return false;
> - }
> -
> - cmp->operands[0]->accept(this);
> - const src_reg cmp_src0 = this->result;
> -
> - cmp->operands[1]->accept(this);
> - const src_reg cmp_src1 = this->result;
> -
> - this->result = src_reg(this, ir->type);
> -
> - emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
> - brw_conditional_for_comparison(cmp->operation)));
> -
> - /* If the comparison is false, this->result will just happen to be zero.
> - */
> - vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
> - this->result, src_reg(1.0f));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - inst->predicate_inverse = true;
> -
> - return true;
> -}
> -
> -void
> -vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
> - src_reg src0, src_reg src1)
> -{
> - vec4_instruction *inst;
> -
> - if (brw->gen >= 6) {
> - inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> - inst->conditional_mod = conditionalmod;
> - } else {
> - emit(CMP(dst, src0, src1, conditionalmod));
> -
> - inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - }
> -}
> -
> -void
> -vec4_visitor::emit_lrp(const dst_reg &dst,
> - const src_reg &x, const src_reg &y, const src_reg &a)
> -{
> - if (brw->gen >= 6) {
> - /* Note that the instruction's argument order is reversed from GLSL
> - * and the IR.
> - */
> - emit(LRP(dst,
> - fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
> - } else {
> - /* Earlier generations don't support three source operations, so we
> - * need to emit x*(1-a) + y*a.
> - */
> - dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
> - dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
> - dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
> - y_times_a.writemask = dst.writemask;
> - one_minus_a.writemask = dst.writemask;
> - x_times_one_minus_a.writemask = dst.writemask;
> -
> - emit(MUL(y_times_a, y, a));
> - emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
> - emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
> - emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
> - }
> -}
> -
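
The pre-gen6 fallback in emit_lrp() above is just the textbook expansion
of mix(); as a scalar sketch of what the MUL/ADD/MUL/ADD sequence
computes per enabled channel (standalone C, purely illustrative):

   /* lrp(x, y, a) = x * (1 - a) + y * a.  Gen6+ gets this as a single
    * LRP instruction (with the argument order reversed), earlier parts
    * get the expanded four-instruction sequence. */
   static inline float lrp_ref(float x, float y, float a)
   {
      return x * (1.0f - a) + y * a;
   }
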
> -void
> -vec4_visitor::visit(ir_expression *ir)
> -{
> - unsigned int operand;
> - src_reg op[ARRAY_SIZE(ir->operands)];
> - vec4_instruction *inst;
> -
> - if (ir->operation == ir_binop_add) {
> - if (try_emit_mad(ir))
> - return;
> - }
> -
> - if (ir->operation == ir_unop_b2f) {
> - if (try_emit_b2f_of_compare(ir))
> - return;
> - }
> -
> - /* Storage for our result. Ideally for an assignment we'd be using
> - * the actual storage for the result here, instead.
> - */
> - dst_reg result_dst(this, ir->type);
> - src_reg result_src(result_dst);
> -
> - if (ir->operation == ir_triop_csel) {
> - ir->operands[1]->accept(this);
> - op[1] = this->result;
> - ir->operands[2]->accept(this);
> - op[2] = this->result;
> -
> - enum brw_predicate predicate;
> - emit_bool_to_cond_code(ir->operands[0], &predicate);
> - inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
> - inst->predicate = predicate;
> - this->result = result_src;
> - return;
> - }
> -
> - for (operand = 0; operand < ir->get_num_operands(); operand++) {
> - this->result.file = BAD_FILE;
> - ir->operands[operand]->accept(this);
> - if (this->result.file == BAD_FILE) {
> - fprintf(stderr, "Failed to get tree for expression operand:\n");
> - ir->operands[operand]->fprint(stderr);
> - exit(1);
> - }
> - op[operand] = this->result;
> -
> - /* Matrix expression operands should have been broken down to vector
> - * operations already.
> - */
> - assert(!ir->operands[operand]->type->is_matrix());
> - }
> -
> - /* If nothing special happens, this is the result. */
> - this->result = result_src;
> -
> - switch (ir->operation) {
> - case ir_unop_logic_not:
> - emit(NOT(result_dst, op[0]));
> - break;
> - case ir_unop_neg:
> - op[0].negate = !op[0].negate;
> - emit(MOV(result_dst, op[0]));
> - break;
> - case ir_unop_abs:
> - op[0].abs = true;
> - op[0].negate = false;
> - emit(MOV(result_dst, op[0]));
> - break;
> -
> - case ir_unop_sign:
> - if (ir->type->is_float()) {
> - /* AND(val, 0x80000000) gives the sign bit.
> - *
> - * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
> - * zero.
> - */
> - emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> -
> - op[0].type = BRW_REGISTER_TYPE_UD;
> - result_dst.type = BRW_REGISTER_TYPE_UD;
> - emit(AND(result_dst, op[0], src_reg(0x80000000u)));
> -
> - inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> -
> - this->result.type = BRW_REGISTER_TYPE_F;
> - } else {
> - /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
> - * -> non-negative val generates 0x00000000.
> - * Predicated OR sets 1 if val is positive.
> - */
> - emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
> -
> - emit(ASR(result_dst, op[0], src_reg(31)));
> -
> - inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - }
> - break;
> -
> - case ir_unop_rcp:
> - emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
> - break;
> -
> - case ir_unop_exp2:
> - emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
> - break;
> - case ir_unop_log2:
> - emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
> - break;
> - case ir_unop_exp:
> - case ir_unop_log:
> - unreachable("not reached: should be handled by ir_explog_to_explog2");
> - case ir_unop_sin:
> - case ir_unop_sin_reduced:
> - emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
> - break;
> - case ir_unop_cos:
> - case ir_unop_cos_reduced:
> - emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
> - break;
> -
> - case ir_unop_dFdx:
> - case ir_unop_dFdx_coarse:
> - case ir_unop_dFdx_fine:
> - case ir_unop_dFdy:
> - case ir_unop_dFdy_coarse:
> - case ir_unop_dFdy_fine:
> - unreachable("derivatives not valid in vertex shader");
> -
> - case ir_unop_bitfield_reverse:
> - emit(BFREV(result_dst, op[0]));
> - break;
> - case ir_unop_bit_count:
> - emit(CBIT(result_dst, op[0]));
> - break;
> - case ir_unop_find_msb: {
> - src_reg temp = src_reg(this, glsl_type::uint_type);
> -
> - inst = emit(FBH(dst_reg(temp), op[0]));
> - inst->dst.writemask = WRITEMASK_XYZW;
> -
> - /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
> - * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
> - * subtract the result from 31 to convert the MSB count into an LSB count.
> - */
> -
> - /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
> - temp.swizzle = BRW_SWIZZLE_NOOP;
> - emit(MOV(result_dst, temp));
> -
> - src_reg src_tmp = src_reg(result_dst);
> - emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
> -
> - src_tmp.negate = true;
> - inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - break;
> - }
> - case ir_unop_find_lsb:
> - emit(FBL(result_dst, op[0]));
> - break;
> - case ir_unop_saturate:
> - inst = emit(MOV(result_dst, op[0]));
> - inst->saturate = true;
> - break;
> -
> - case ir_unop_noise:
> - unreachable("not reached: should be handled by lower_noise");
> -
> - case ir_binop_add:
> - emit(ADD(result_dst, op[0], op[1]));
> - break;
> - case ir_binop_sub:
> - unreachable("not reached: should be handled by ir_sub_to_add_neg");
> -
> - case ir_binop_mul:
> - if (brw->gen < 8 && ir->type->is_integer()) {
> - /* For integer multiplication, the MUL uses the low 16 bits of one of
> - * the operands (src0 through SNB, src1 on IVB and later). The MACH
> - * accumulates in the contribution of the upper 16 bits of that
> - * operand. If we can determine that one of the args is in the low
> - * 16 bits, though, we can just emit a single MUL.
> - */
> - if (ir->operands[0]->is_uint16_constant()) {
> - if (brw->gen < 7)
> - emit(MUL(result_dst, op[0], op[1]));
> - else
> - emit(MUL(result_dst, op[1], op[0]));
> - } else if (ir->operands[1]->is_uint16_constant()) {
> - if (brw->gen < 7)
> - emit(MUL(result_dst, op[1], op[0]));
> - else
> - emit(MUL(result_dst, op[0], op[1]));
> - } else {
> - struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> -
> - emit(MUL(acc, op[0], op[1]));
> - emit(MACH(dst_null_d(), op[0], op[1]));
> - emit(MOV(result_dst, src_reg(acc)));
> - }
> - } else {
> - emit(MUL(result_dst, op[0], op[1]));
> - }
> - break;
> - case ir_binop_imul_high: {
> - struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> -
> - emit(MUL(acc, op[0], op[1]));
> - emit(MACH(result_dst, op[0], op[1]));
> - break;
> - }
> - case ir_binop_div:
> - /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
> - assert(ir->type->is_integer());
> - emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
> - break;
> - case ir_binop_carry: {
> - struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
> -
> - emit(ADDC(dst_null_ud(), op[0], op[1]));
> - emit(MOV(result_dst, src_reg(acc)));
> - break;
> - }
> - case ir_binop_borrow: {
> - struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
> -
> - emit(SUBB(dst_null_ud(), op[0], op[1]));
> - emit(MOV(result_dst, src_reg(acc)));
> - break;
> - }
> - case ir_binop_mod:
> - /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
> - assert(ir->type->is_integer());
> - emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
> - break;
> -
> - case ir_binop_less:
> - case ir_binop_greater:
> - case ir_binop_lequal:
> - case ir_binop_gequal:
> - case ir_binop_equal:
> - case ir_binop_nequal: {
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(ir->operands[0], &op[0]);
> - resolve_bool_comparison(ir->operands[1], &op[1]);
> - }
> - emit(CMP(result_dst, op[0], op[1],
> - brw_conditional_for_comparison(ir->operation)));
> - break;
> - }
> -
> - case ir_binop_all_equal:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(ir->operands[0], &op[0]);
> - resolve_bool_comparison(ir->operands[1], &op[1]);
> - }
> -
> - /* "==" operator producing a scalar boolean. */
> - if (ir->operands[0]->type->is_vector() ||
> - ir->operands[1]->type->is_vector()) {
> - emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> - emit(MOV(result_dst, src_reg(0)));
> - inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> - inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
> - } else {
> - emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
> - }
> - break;
> - case ir_binop_any_nequal:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(ir->operands[0], &op[0]);
> - resolve_bool_comparison(ir->operands[1], &op[1]);
> - }
> -
> - /* "!=" operator producing a scalar boolean. */
> - if (ir->operands[0]->type->is_vector() ||
> - ir->operands[1]->type->is_vector()) {
> - emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> -
> - emit(MOV(result_dst, src_reg(0)));
> - inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> - inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> - } else {
> - emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
> - }
> - break;
> -
> - case ir_unop_any:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(ir->operands[0], &op[0]);
> - }
> - emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> - emit(MOV(result_dst, src_reg(0)));
> -
> - inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> - inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> - break;
> -
> - case ir_binop_logic_xor:
> - emit(XOR(result_dst, op[0], op[1]));
> - break;
> -
> - case ir_binop_logic_or:
> - emit(OR(result_dst, op[0], op[1]));
> - break;
> -
> - case ir_binop_logic_and:
> - emit(AND(result_dst, op[0], op[1]));
> - break;
> -
> - case ir_binop_dot:
> - assert(ir->operands[0]->type->is_vector());
> - assert(ir->operands[0]->type == ir->operands[1]->type);
> - emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
> - break;
> -
> - case ir_unop_sqrt:
> - emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
> - break;
> - case ir_unop_rsq:
> - emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
> - break;
> -
> - case ir_unop_bitcast_i2f:
> - case ir_unop_bitcast_u2f:
> - this->result = op[0];
> - this->result.type = BRW_REGISTER_TYPE_F;
> - break;
> -
> - case ir_unop_bitcast_f2i:
> - this->result = op[0];
> - this->result.type = BRW_REGISTER_TYPE_D;
> - break;
> -
> - case ir_unop_bitcast_f2u:
> - this->result = op[0];
> - this->result.type = BRW_REGISTER_TYPE_UD;
> - break;
> -
> - case ir_unop_i2f:
> - case ir_unop_i2u:
> - case ir_unop_u2i:
> - case ir_unop_u2f:
> - case ir_unop_f2i:
> - case ir_unop_f2u:
> - emit(MOV(result_dst, op[0]));
> - break;
> - case ir_unop_b2i:
> - emit(AND(result_dst, op[0], src_reg(1)));
> - break;
> - case ir_unop_b2f:
> - if (brw->gen <= 5) {
> - resolve_bool_comparison(ir->operands[0], &op[0]);
> - }
> - op[0].type = BRW_REGISTER_TYPE_D;
> - result_dst.type = BRW_REGISTER_TYPE_D;
> - emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
> - result_dst.type = BRW_REGISTER_TYPE_F;
> - break;
> - case ir_unop_f2b:
> - emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> - break;
> - case ir_unop_i2b:
> - emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> - break;
> -
> - case ir_unop_trunc:
> - emit(RNDZ(result_dst, op[0]));
> - break;
> - case ir_unop_ceil: {
> - src_reg tmp = src_reg(this, ir->type);
> - op[0].negate = !op[0].negate;
> - emit(RNDD(dst_reg(tmp), op[0]));
> - tmp.negate = true;
> - emit(MOV(result_dst, tmp));
> - }
> - break;
> - case ir_unop_floor:
> - inst = emit(RNDD(result_dst, op[0]));
> - break;
> - case ir_unop_fract:
> - inst = emit(FRC(result_dst, op[0]));
> - break;
> - case ir_unop_round_even:
> - emit(RNDE(result_dst, op[0]));
> - break;
> -
> - case ir_binop_min:
> - emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
> - break;
> - case ir_binop_max:
> - emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
> - break;
> -
> - case ir_binop_pow:
> - emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
> - break;
> -
> - case ir_unop_bit_not:
> - inst = emit(NOT(result_dst, op[0]));
> - break;
> - case ir_binop_bit_and:
> - inst = emit(AND(result_dst, op[0], op[1]));
> - break;
> - case ir_binop_bit_xor:
> - inst = emit(XOR(result_dst, op[0], op[1]));
> - break;
> - case ir_binop_bit_or:
> - inst = emit(OR(result_dst, op[0], op[1]));
> - break;
> -
> - case ir_binop_lshift:
> - inst = emit(SHL(result_dst, op[0], op[1]));
> - break;
> -
> - case ir_binop_rshift:
> - if (ir->type->base_type == GLSL_TYPE_INT)
> - inst = emit(ASR(result_dst, op[0], op[1]));
> - else
> - inst = emit(SHR(result_dst, op[0], op[1]));
> - break;
> -
> - case ir_binop_bfm:
> - emit(BFI1(result_dst, op[0], op[1]));
> - break;
> -
> - case ir_binop_ubo_load: {
> - ir_constant *const_uniform_block = ir->operands[0]->as_constant();
> - ir_constant *const_offset_ir = ir->operands[1]->as_constant();
> - unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
> - src_reg offset;
> -
> - /* Now, load the vector from that offset. */
> - assert(ir->type->is_vector() || ir->type->is_scalar());
> -
> - src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
> - packed_consts.type = result.type;
> - src_reg surf_index;
> -
> - if (const_uniform_block) {
> - /* The block index is a constant, so just emit the binding table entry
> - * as an immediate.
> - */
> - surf_index = src_reg(prog_data->base.binding_table.ubo_start +
> - const_uniform_block->value.u[0]);
> - } else {
> - /* The block index is not a constant. Evaluate the index expression
> - * per-channel and add the base UBO index; the generator will select
> - * a value from any live channel.
> - */
> - surf_index = src_reg(this, glsl_type::uint_type);
> - emit(ADD(dst_reg(surf_index), op[0],
> - src_reg(prog_data->base.binding_table.ubo_start)));
> -
> - /* Assume this may touch any UBO. It would be nice to provide
> - * a tighter bound, but the array information is already lowered away.
> - */
> - brw_mark_surface_used(&prog_data->base,
> - prog_data->base.binding_table.ubo_start +
> - shader_prog->NumUniformBlocks - 1);
> - }
> -
> - if (const_offset_ir) {
> - if (brw->gen >= 8) {
> - /* Store the offset in a GRF so we can send-from-GRF. */
> - offset = src_reg(this, glsl_type::int_type);
> - emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
> - } else {
> - /* Immediates are fine on older generations since they'll be moved
> - * to a (potentially fake) MRF at the generator level.
> - */
> - offset = src_reg(const_offset / 16);
> - }
> - } else {
> - offset = src_reg(this, glsl_type::uint_type);
> - emit(SHR(dst_reg(offset), op[1], src_reg(4)));
> - }
> -
> - if (brw->gen >= 7) {
> - dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
> -
> - /* We have to use a message header on Skylake to get SIMD4x2 mode.
> - * Reserve space for the register.
> - */
> - if (brw->gen >= 9) {
> - grf_offset.reg_offset++;
> - alloc.sizes[grf_offset.reg] = 2;
> - }
> -
> - grf_offset.type = offset.type;
> -
> - emit(MOV(grf_offset, offset));
> -
> - vec4_instruction *pull =
> - emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
> - dst_reg(packed_consts),
> - surf_index,
> - src_reg(grf_offset)));
> - pull->mlen = 1;
> - } else {
> - vec4_instruction *pull =
> - emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
> - dst_reg(packed_consts),
> - surf_index,
> - offset));
> - pull->base_mrf = 14;
> - pull->mlen = 1;
> - }
> -
> - packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> - packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
> - const_offset % 16 / 4,
> - const_offset % 16 / 4,
> - const_offset % 16 / 4);
> -
> - /* UBO bools are any nonzero int. We need to convert them to use the
> - * value of true stored in ctx->Const.UniformBooleanTrue.
> - */
> - if (ir->type->base_type == GLSL_TYPE_BOOL) {
> - emit(CMP(result_dst, packed_consts, src_reg(0u),
> - BRW_CONDITIONAL_NZ));
> - } else {
> - emit(MOV(result_dst, packed_consts));
> - }
> - break;
> - }
> -
> - case ir_binop_vector_extract:
> - unreachable("should have been lowered by vec_index_to_cond_assign");
> -
> - case ir_triop_fma:
> - op[0] = fix_3src_operand(op[0]);
> - op[1] = fix_3src_operand(op[1]);
> - op[2] = fix_3src_operand(op[2]);
> - /* Note that the instruction's argument order is reversed from GLSL
> - * and the IR.
> - */
> - emit(MAD(result_dst, op[2], op[1], op[0]));
> - break;
> -
> - case ir_triop_lrp:
> - emit_lrp(result_dst, op[0], op[1], op[2]);
> - break;
> -
> - case ir_triop_csel:
> - unreachable("already handled above");
> - break;
> -
> - case ir_triop_bfi:
> - op[0] = fix_3src_operand(op[0]);
> - op[1] = fix_3src_operand(op[1]);
> - op[2] = fix_3src_operand(op[2]);
> - emit(BFI2(result_dst, op[0], op[1], op[2]));
> - break;
> -
> - case ir_triop_bitfield_extract:
> - op[0] = fix_3src_operand(op[0]);
> - op[1] = fix_3src_operand(op[1]);
> - op[2] = fix_3src_operand(op[2]);
> - /* Note that the instruction's argument order is reversed from GLSL
> - * and the IR.
> - */
> - emit(BFE(result_dst, op[2], op[1], op[0]));
> - break;
> -
> - case ir_triop_vector_insert:
> - unreachable("should have been lowered by lower_vector_insert");
> -
> - case ir_quadop_bitfield_insert:
> - unreachable("not reached: should be handled by "
> - "bitfield_insert_to_bfm_bfi\n");
> -
> - case ir_quadop_vector:
> - unreachable("not reached: should be handled by lower_quadop_vector");
> -
> - case ir_unop_pack_half_2x16:
> - emit_pack_half_2x16(result_dst, op[0]);
> - break;
> - case ir_unop_unpack_half_2x16:
> - emit_unpack_half_2x16(result_dst, op[0]);
> - break;
> - case ir_unop_unpack_unorm_4x8:
> - emit_unpack_unorm_4x8(result_dst, op[0]);
> - break;
> - case ir_unop_unpack_snorm_4x8:
> - emit_unpack_snorm_4x8(result_dst, op[0]);
> - break;
> - case ir_unop_pack_unorm_4x8:
> - emit_pack_unorm_4x8(result_dst, op[0]);
> - break;
> - case ir_unop_pack_snorm_4x8:
> - emit_pack_snorm_4x8(result_dst, op[0]);
> - break;
> - case ir_unop_pack_snorm_2x16:
> - case ir_unop_pack_unorm_2x16:
> - case ir_unop_unpack_snorm_2x16:
> - case ir_unop_unpack_unorm_2x16:
> - unreachable("not reached: should be handled by lower_packing_builtins");
> - case ir_unop_unpack_half_2x16_split_x:
> - case ir_unop_unpack_half_2x16_split_y:
> - case ir_binop_pack_half_2x16_split:
> - case ir_unop_interpolate_at_centroid:
> - case ir_binop_interpolate_at_sample:
> - case ir_binop_interpolate_at_offset:
> - unreachable("not reached: should not occur in vertex shader");
> - case ir_binop_ldexp:
> - unreachable("not reached: should be handled by ldexp_to_arith()");
> - case ir_unop_d2f:
> - case ir_unop_f2d:
> - case ir_unop_d2i:
> - case ir_unop_i2d:
> - case ir_unop_d2u:
> - case ir_unop_u2d:
> - case ir_unop_d2b:
> - case ir_unop_pack_double_2x32:
> - case ir_unop_unpack_double_2x32:
> - case ir_unop_frexp_sig:
> - case ir_unop_frexp_exp:
> - unreachable("fp64 todo");
> - }
> -}
> -
> -
> -void
> -vec4_visitor::visit(ir_swizzle *ir)
> -{
> - /* Note that this is only swizzles in expressions, not those on the left
> - * hand side of an assignment, which do write masking. See ir_assignment
> - * for that.
> - */
> - const unsigned swz = brw_compose_swizzle(
> - brw_swizzle_for_size(ir->type->vector_elements),
> - BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
> -
> - ir->val->accept(this);
> - this->result = swizzle(this->result, swz);
> -}
> -
> -void
> -vec4_visitor::visit(ir_dereference_variable *ir)
> -{
> - const struct glsl_type *type = ir->type;
> - dst_reg *reg = variable_storage(ir->var);
> -
> - if (!reg) {
> - fail("Failed to find variable storage for %s\n", ir->var->name);
> - this->result = src_reg(brw_null_reg());
> - return;
> - }
> -
> - this->result = src_reg(*reg);
> -
> - /* System values get their swizzle from the dst_reg writemask */
> - if (ir->var->data.mode == ir_var_system_value)
> - return;
> -
> - if (type->is_scalar() || type->is_vector() || type->is_matrix())
> - this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
> -}
> -
> -
> -int
> -vec4_visitor::compute_array_stride(ir_dereference_array *ir)
> -{
> - /* Under normal circumstances array elements are stored consecutively, so
> - * the stride is equal to the size of the array element.
> - */
> - return type_size(ir->type);
> -}
> -
> -
> -void
> -vec4_visitor::visit(ir_dereference_array *ir)
> -{
> - ir_constant *constant_index;
> - src_reg src;
> - int array_stride = compute_array_stride(ir);
> -
> - constant_index = ir->array_index->constant_expression_value();
> -
> - ir->array->accept(this);
> - src = this->result;
> -
> - if (constant_index) {
> - src.reg_offset += constant_index->value.i[0] * array_stride;
> - } else {
> - /* Variable index array dereference. It eats the "vec4" of the
> - * base of the array and an index that offsets the Mesa register
> - * index.
> - */
> - ir->array_index->accept(this);
> -
> - src_reg index_reg;
> -
> - if (array_stride == 1) {
> - index_reg = this->result;
> - } else {
> - index_reg = src_reg(this, glsl_type::int_type);
> -
> - emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
> - }
> -
> - if (src.reladdr) {
> - src_reg temp = src_reg(this, glsl_type::int_type);
> -
> - emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
> -
> - index_reg = temp;
> - }
> -
> - src.reladdr = ralloc(mem_ctx, src_reg);
> - memcpy(src.reladdr, &index_reg, sizeof(index_reg));
> - }
> -
> - /* If the type is smaller than a vec4, replicate the last channel out. */
> - if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
> - src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> - else
> - src.swizzle = BRW_SWIZZLE_NOOP;
> - src.type = brw_type_for_base_type(ir->type);
> -
> - this->result = src;
> -}
> -
> -void
> -vec4_visitor::visit(ir_dereference_record *ir)
> -{
> - unsigned int i;
> - const glsl_type *struct_type = ir->record->type;
> - int offset = 0;
> -
> - ir->record->accept(this);
> -
> - for (i = 0; i < struct_type->length; i++) {
> - if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
> - break;
> - offset += type_size(struct_type->fields.structure[i].type);
> - }
> -
> - /* If the type is smaller than a vec4, replicate the last channel out. */
> - if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
> - this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> - else
> - this->result.swizzle = BRW_SWIZZLE_NOOP;
> - this->result.type = brw_type_for_base_type(ir->type);
> -
> - this->result.reg_offset += offset;
> -}
> -
> -/**
> - * We want to be careful in assignment setup to hit the actual storage
> - * instead of potentially using a temporary like we might with the
> - * ir_dereference handler.
> - */
> -static dst_reg
> -get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
> -{
> - /* The LHS must be a dereference. If the LHS is a variable indexed array
> - * access of a vector, it must be separated into a series of conditional moves
> - * before reaching this point (see ir_vec_index_to_cond_assign).
> - */
> - assert(ir->as_dereference());
> - ir_dereference_array *deref_array = ir->as_dereference_array();
> - if (deref_array) {
> - assert(!deref_array->array->type->is_vector());
> - }
> -
> - /* Use the rvalue deref handler for the most part. We'll ignore
> - * swizzles in it and write swizzles using writemask, though.
> - */
> - ir->accept(v);
> - return dst_reg(v->result);
> -}
> -
> -void
> -vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
> - const struct glsl_type *type,
> - enum brw_predicate predicate)
> -{
> - if (type->base_type == GLSL_TYPE_STRUCT) {
> - for (unsigned int i = 0; i < type->length; i++) {
> - emit_block_move(dst, src, type->fields.structure[i].type, predicate);
> - }
> - return;
> - }
> -
> - if (type->is_array()) {
> - for (unsigned int i = 0; i < type->length; i++) {
> - emit_block_move(dst, src, type->fields.array, predicate);
> - }
> - return;
> - }
> -
> - if (type->is_matrix()) {
> - const struct glsl_type *vec_type;
> -
> - vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
> - type->vector_elements, 1);
> -
> - for (int i = 0; i < type->matrix_columns; i++) {
> - emit_block_move(dst, src, vec_type, predicate);
> - }
> - return;
> - }
> -
> - assert(type->is_scalar() || type->is_vector());
> -
> - dst->type = brw_type_for_base_type(type);
> - src->type = dst->type;
> -
> - dst->writemask = (1 << type->vector_elements) - 1;
> -
> - src->swizzle = brw_swizzle_for_size(type->vector_elements);
> -
> - vec4_instruction *inst = emit(MOV(*dst, *src));
> - inst->predicate = predicate;
> -
> - dst->reg_offset++;
> - src->reg_offset++;
> -}
> -
> -
> -/* If the RHS processing resulted in an instruction generating a
> - * temporary value, and it would be easy to rewrite the instruction to
> - * generate its result right into the LHS instead, do so. This ends
> - * up reliably removing instructions where it can be tricky to do so
> - * later without real UD chain information.
> - */
> -bool
> -vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
> - dst_reg dst,
> - src_reg src,
> - vec4_instruction *pre_rhs_inst,
> - vec4_instruction *last_rhs_inst)
> -{
> - /* This could be supported, but it would take more smarts. */
> - if (ir->condition)
> - return false;
> -
> - if (pre_rhs_inst == last_rhs_inst)
> - return false; /* No instructions generated to work with. */
> -
> - /* Make sure the last instruction generated our source reg. */
> - if (src.file != GRF ||
> - src.file != last_rhs_inst->dst.file ||
> - src.reg != last_rhs_inst->dst.reg ||
> - src.reg_offset != last_rhs_inst->dst.reg_offset ||
> - src.reladdr ||
> - src.abs ||
> - src.negate ||
> - last_rhs_inst->predicate != BRW_PREDICATE_NONE)
> - return false;
> -
> - /* Check that that last instruction fully initialized the channels
> - * we want to use, in the order we want to use them. We could
> - * potentially reswizzle the operands of many instructions so that
> - * we could handle out of order channels, but don't yet.
> - */
> -
> - for (unsigned i = 0; i < 4; i++) {
> - if (dst.writemask & (1 << i)) {
> - if (!(last_rhs_inst->dst.writemask & (1 << i)))
> - return false;
> -
> - if (BRW_GET_SWZ(src.swizzle, i) != i)
> - return false;
> - }
> - }
> -
> - /* Success! Rewrite the instruction. */
> - last_rhs_inst->dst.file = dst.file;
> - last_rhs_inst->dst.reg = dst.reg;
> - last_rhs_inst->dst.reg_offset = dst.reg_offset;
> - last_rhs_inst->dst.reladdr = dst.reladdr;
> - last_rhs_inst->dst.writemask &= dst.writemask;
> -
> - return true;
> -}
> -
> -void
> -vec4_visitor::visit(ir_assignment *ir)
> -{
> - dst_reg dst = get_assignment_lhs(ir->lhs, this);
> - enum brw_predicate predicate = BRW_PREDICATE_NONE;
> -
> - if (!ir->lhs->type->is_scalar() &&
> - !ir->lhs->type->is_vector()) {
> - ir->rhs->accept(this);
> - src_reg src = this->result;
> -
> - if (ir->condition) {
> - emit_bool_to_cond_code(ir->condition, &predicate);
> - }
> -
> - /* emit_block_move doesn't account for swizzles in the source register.
> - * This should be ok, since the source register is a structure or an
> - * array, and those can't be swizzled. But double-check to be sure.
> - */
> - assert(src.swizzle ==
> - (ir->rhs->type->is_matrix()
> - ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
> - : BRW_SWIZZLE_NOOP));
> -
> - emit_block_move(&dst, &src, ir->rhs->type, predicate);
> - return;
> - }
> -
> - /* Now we're down to just a scalar/vector with writemasks. */
> - int i;
> -
> - vec4_instruction *pre_rhs_inst, *last_rhs_inst;
> - pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
> -
> - ir->rhs->accept(this);
> -
> - last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
> -
> - int swizzles[4];
> - int src_chan = 0;
> -
> - assert(ir->lhs->type->is_vector() ||
> - ir->lhs->type->is_scalar());
> - dst.writemask = ir->write_mask;
> -
> - /* Swizzle a small RHS vector into the channels being written.
> - *
> - * glsl ir treats write_mask as dictating how many channels are
> - * present on the RHS while in our instructions we need to make
> - * those channels appear in the slots of the vec4 they're written to.
> - */
> - for (int i = 0; i < 4; i++)
> - swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
> -
> - src_reg src = swizzle(this->result,
> - BRW_SWIZZLE4(swizzles[0], swizzles[1],
> - swizzles[2], swizzles[3]));
> -
> - if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
> - return;
> - }
> -
> - if (ir->condition) {
> - emit_bool_to_cond_code(ir->condition, &predicate);
> - }
> -
> - for (i = 0; i < type_size(ir->lhs->type); i++) {
> - vec4_instruction *inst = emit(MOV(dst, src));
> - inst->predicate = predicate;
> -
> - dst.reg_offset++;
> - src.reg_offset++;
> - }
> -}
> -
> -void
> -vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
> -{
> - if (ir->type->base_type == GLSL_TYPE_STRUCT) {
> - foreach_in_list(ir_constant, field_value, &ir->components) {
> - emit_constant_values(dst, field_value);
> - }
> - return;
> - }
> -
> - if (ir->type->is_array()) {
> - for (unsigned int i = 0; i < ir->type->length; i++) {
> - emit_constant_values(dst, ir->array_elements[i]);
> - }
> - return;
> - }
> -
> - if (ir->type->is_matrix()) {
> - for (int i = 0; i < ir->type->matrix_columns; i++) {
> - float *vec = &ir->value.f[i * ir->type->vector_elements];
> -
> - for (int j = 0; j < ir->type->vector_elements; j++) {
> - dst->writemask = 1 << j;
> - dst->type = BRW_REGISTER_TYPE_F;
> -
> - emit(MOV(*dst, src_reg(vec[j])));
> - }
> - dst->reg_offset++;
> - }
> - return;
> - }
> -
> - int remaining_writemask = (1 << ir->type->vector_elements) - 1;
> -
> - for (int i = 0; i < ir->type->vector_elements; i++) {
> - if (!(remaining_writemask & (1 << i)))
> - continue;
> -
> - dst->writemask = 1 << i;
> - dst->type = brw_type_for_base_type(ir->type);
> -
> - /* Find other components that match the one we're about to
> - * write. Emits fewer instructions for things like vec4(0.5,
> - * 1.5, 1.5, 1.5).
> - */
> - for (int j = i + 1; j < ir->type->vector_elements; j++) {
> - if (ir->type->base_type == GLSL_TYPE_BOOL) {
> - if (ir->value.b[i] == ir->value.b[j])
> - dst->writemask |= (1 << j);
> - } else {
> - /* u, i, and f storage all line up, so no need for a
> - * switch case for comparing each type.
> - */
> - if (ir->value.u[i] == ir->value.u[j])
> - dst->writemask |= (1 << j);
> - }
> - }
> -
> - switch (ir->type->base_type) {
> - case GLSL_TYPE_FLOAT:
> - emit(MOV(*dst, src_reg(ir->value.f[i])));
> - break;
> - case GLSL_TYPE_INT:
> - emit(MOV(*dst, src_reg(ir->value.i[i])));
> - break;
> - case GLSL_TYPE_UINT:
> - emit(MOV(*dst, src_reg(ir->value.u[i])));
> - break;
> - case GLSL_TYPE_BOOL:
> - emit(MOV(*dst,
> - src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
> - : 0)));
> - break;
> - default:
> - unreachable("Non-float/uint/int/bool constant");
> - }
> -
> - remaining_writemask &= ~dst->writemask;
> - }
> - dst->reg_offset++;
> -}
> -
> -void
> -vec4_visitor::visit(ir_constant *ir)
> -{
> - dst_reg dst = dst_reg(this, ir->type);
> - this->result = src_reg(dst);
> -
> - emit_constant_values(&dst, ir);
> -}
> -
> -void
> -vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
> -{
> - ir_dereference *deref = static_cast<ir_dereference *>(
> - ir->actual_parameters.get_head());
> - ir_variable *location = deref->variable_referenced();
> - unsigned surf_index = (prog_data->base.binding_table.abo_start +
> - location->data.binding);
> -
> - /* Calculate the surface offset */
> - src_reg offset(this, glsl_type::uint_type);
> - ir_dereference_array *deref_array = deref->as_dereference_array();
> - if (deref_array) {
> - deref_array->array_index->accept(this);
> -
> - src_reg tmp(this, glsl_type::uint_type);
> - emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
> - emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
> - } else {
> - offset = location->data.atomic.offset;
> - }
> -
> - /* Emit the appropriate machine instruction */
> - const char *callee = ir->callee->function_name();
> - dst_reg dst = get_assignment_lhs(ir->return_deref, this);
> -
> - if (!strcmp("__intrinsic_atomic_read", callee)) {
> - emit_untyped_surface_read(surf_index, dst, offset);
> -
> - } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
> - emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
> - src_reg(), src_reg());
> -
> - } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
> - emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
> - src_reg(), src_reg());
> - }
> -}
> -
> -void
> -vec4_visitor::visit(ir_call *ir)
> -{
> - const char *callee = ir->callee->function_name();
> -
> - if (!strcmp("__intrinsic_atomic_read", callee) ||
> - !strcmp("__intrinsic_atomic_increment", callee) ||
> - !strcmp("__intrinsic_atomic_predecrement", callee)) {
> - visit_atomic_counter_intrinsic(ir);
> - } else {
> - unreachable("Unsupported intrinsic.");
> - }
> -}
> -
> -src_reg
> -vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
> -{
> - vec4_instruction *inst =
> - new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
> - dst_reg(this, glsl_type::uvec4_type));
> - inst->base_mrf = 2;
> - inst->mlen = 1;
> - inst->src[1] = sampler;
> -
> - /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
> - int param_base = inst->base_mrf;
> - int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
> - int zero_mask = 0xf & ~coord_mask;
> -
> - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
> - coordinate));
> -
> - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
> - src_reg(0)));
> -
> - emit(inst);
> - return src_reg(inst->dst);
> -}
> -
> -static bool
> -is_high_sampler(struct brw_context *brw, src_reg sampler)
> -{
> - if (brw->gen < 8 && !brw->is_haswell)
> - return false;
> -
> - return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
> -}
> -
> -void
> -vec4_visitor::visit(ir_texture *ir)
> -{
> - uint32_t sampler =
> - _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
> -
> - ir_rvalue *nonconst_sampler_index =
> - _mesa_get_sampler_array_nonconst_index(ir->sampler);
> -
> - /* Handle non-constant sampler array indexing */
> - src_reg sampler_reg;
> - if (nonconst_sampler_index) {
> - /* The highest sampler which may be used by this operation is
> - * the last element of the array. Mark it here, because the generator
> - * doesn't have enough information to determine the bound.
> - */
> - uint32_t array_size = ir->sampler->as_dereference_array()
> - ->array->type->array_size();
> -
> - uint32_t max_used = sampler + array_size - 1;
> - if (ir->op == ir_tg4 && brw->gen < 8) {
> - max_used += prog_data->base.binding_table.gather_texture_start;
> - } else {
> - max_used += prog_data->base.binding_table.texture_start;
> - }
> -
> - brw_mark_surface_used(&prog_data->base, max_used);
> -
> - /* Emit code to evaluate the actual indexing expression */
> - nonconst_sampler_index->accept(this);
> - dst_reg temp(this, glsl_type::uint_type);
> - emit(ADD(temp, this->result, src_reg(sampler)))
> - ->force_writemask_all = true;
> - sampler_reg = src_reg(temp);
> - } else {
> - /* Single sampler, or constant array index; the indexing expression
> - * is just an immediate.
> - */
> - sampler_reg = src_reg(sampler);
> - }
> -
> - /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
> - * emitting anything other than setting up the constant result.
> - */
> - if (ir->op == ir_tg4) {
> - ir_constant *chan = ir->lod_info.component->as_constant();
> - int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
> - if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
> - dst_reg result(this, ir->type);
> - this->result = src_reg(result);
> - emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
> - return;
> - }
> - }
> -
> - /* Should be lowered by do_lower_texture_projection */
> - assert(!ir->projector);
> -
> - /* Should be lowered */
> - assert(!ir->offset || !ir->offset->type->is_array());
> -
> - /* Generate code to compute all the subexpression trees. This has to be
> - * done before loading any values into MRFs for the sampler message since
> - * generating these values may involve SEND messages that need the MRFs.
> - */
> - src_reg coordinate;
> - if (ir->coordinate) {
> - ir->coordinate->accept(this);
> - coordinate = this->result;
> - }
> -
> - src_reg shadow_comparitor;
> - if (ir->shadow_comparitor) {
> - ir->shadow_comparitor->accept(this);
> - shadow_comparitor = this->result;
> - }
> -
> - bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
> - src_reg offset_value;
> - if (has_nonconstant_offset) {
> - ir->offset->accept(this);
> - offset_value = src_reg(this->result);
> - }
> -
> - const glsl_type *lod_type = NULL, *sample_index_type = NULL;
> - src_reg lod, dPdx, dPdy, sample_index, mcs;
> - switch (ir->op) {
> - case ir_tex:
> - lod = src_reg(0.0f);
> - lod_type = glsl_type::float_type;
> - break;
> - case ir_txf:
> - case ir_txl:
> - case ir_txs:
> - ir->lod_info.lod->accept(this);
> - lod = this->result;
> - lod_type = ir->lod_info.lod->type;
> - break;
> - case ir_query_levels:
> - lod = src_reg(0);
> - lod_type = glsl_type::int_type;
> - break;
> - case ir_txf_ms:
> - ir->lod_info.sample_index->accept(this);
> - sample_index = this->result;
> - sample_index_type = ir->lod_info.sample_index->type;
> -
> - if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
> - mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
> - else
> - mcs = src_reg(0u);
> - break;
> - case ir_txd:
> - ir->lod_info.grad.dPdx->accept(this);
> - dPdx = this->result;
> -
> - ir->lod_info.grad.dPdy->accept(this);
> - dPdy = this->result;
> -
> - lod_type = ir->lod_info.grad.dPdx->type;
> - break;
> - case ir_txb:
> - case ir_lod:
> - case ir_tg4:
> - break;
> - }
> -
> - enum opcode opcode;
> - switch (ir->op) {
> - case ir_tex: opcode = SHADER_OPCODE_TXL; break;
> - case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> - case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> - case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> - case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
> - case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> - case ir_tg4: opcode = has_nonconstant_offset
> - ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
> - case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
> - case ir_txb:
> - unreachable("TXB is not valid for vertex shaders.");
> - case ir_lod:
> - unreachable("LOD is not valid for vertex shaders.");
> - default:
> - unreachable("Unrecognized tex op");
> - }
> -
> - vec4_instruction *inst = new(mem_ctx) vec4_instruction(
> - opcode, dst_reg(this, ir->type));
> -
> - if (ir->offset != NULL && !has_nonconstant_offset) {
> - inst->offset =
> - brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
> - ir->offset->type->vector_elements);
> - }
> -
> - /* Stuff the channel select bits in the top of the texture offset */
> - if (ir->op == ir_tg4)
> - inst->offset |= gather_channel(ir, sampler) << 16;
> -
> - /* The message header is necessary for:
> - * - Gen4 (always)
> - * - Gen9+ for selecting SIMD4x2
> - * - Texel offsets
> - * - Gather channel selection
> - * - Sampler indices too large to fit in a 4-bit value.
> - */
> - inst->header_present =
> - brw->gen < 5 || brw->gen >= 9 ||
> - inst->offset != 0 || ir->op == ir_tg4 ||
> - is_high_sampler(brw, sampler_reg);
> - inst->base_mrf = 2;
> - inst->mlen = inst->header_present + 1; /* always at least one */
> - inst->dst.writemask = WRITEMASK_XYZW;
> - inst->shadow_compare = ir->shadow_comparitor != NULL;
> -
> - inst->src[1] = sampler_reg;
> -
> - /* MRF for the first parameter */
> - int param_base = inst->base_mrf + inst->header_present;
> -
> - if (ir->op == ir_txs || ir->op == ir_query_levels) {
> - int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
> - emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
> - } else {
> - /* Load the coordinate */
> - /* FINISHME: gl_clamp_mask and saturate */
> - int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
> - int zero_mask = 0xf & ~coord_mask;
> -
> - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
> - coordinate));
> -
> - if (zero_mask != 0) {
> - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
> - src_reg(0)));
> - }
> - /* Load the shadow comparitor */
> - if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
> - emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
> - WRITEMASK_X),
> - shadow_comparitor));
> - inst->mlen++;
> - }
> -
> - /* Load the LOD info */
> - if (ir->op == ir_tex || ir->op == ir_txl) {
> - int mrf, writemask;
> - if (brw->gen >= 5) {
> - mrf = param_base + 1;
> - if (ir->shadow_comparitor) {
> - writemask = WRITEMASK_Y;
> - /* mlen already incremented */
> - } else {
> - writemask = WRITEMASK_X;
> - inst->mlen++;
> - }
> - } else /* brw->gen == 4 */ {
> - mrf = param_base;
> - writemask = WRITEMASK_W;
> - }
> - emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
> - } else if (ir->op == ir_txf) {
> - emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
> - } else if (ir->op == ir_txf_ms) {
> - emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
> - sample_index));
> - if (brw->gen >= 7) {
> - /* MCS data is in the first channel of `mcs`, but we need to get it into
> - * the .y channel of the second vec4 of params, so replicate .x across
> - * the whole vec4 and then mask off everything except .y
> - */
> - mcs.swizzle = BRW_SWIZZLE_XXXX;
> - emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
> - mcs));
> - }
> - inst->mlen++;
> - } else if (ir->op == ir_txd) {
> - const glsl_type *type = lod_type;
> -
> - if (brw->gen >= 5) {
> - dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
> - dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
> - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
> - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
> - inst->mlen++;
> -
> - if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
> - dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
> - dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
> - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
> - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
> - inst->mlen++;
> -
> - if (ir->shadow_comparitor) {
> - emit(MOV(dst_reg(MRF, param_base + 2,
> - ir->shadow_comparitor->type, WRITEMASK_Z),
> - shadow_comparitor));
> - }
> - }
> - } else /* brw->gen == 4 */ {
> - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
> - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
> - inst->mlen += 2;
> - }
> - } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
> - if (ir->shadow_comparitor) {
> - emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
> - shadow_comparitor));
> - }
> -
> - emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
> - offset_value));
> - inst->mlen++;
> - }
> - }
> -
> - emit(inst);
> -
> - /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
> - * spec requires layers.
> - */
> - if (ir->op == ir_txs) {
> - glsl_type const *type = ir->sampler->type;
> - if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
> - type->sampler_array) {
> - emit_math(SHADER_OPCODE_INT_QUOTIENT,
> - writemask(inst->dst, WRITEMASK_Z),
> - src_reg(inst->dst), src_reg(6));
> - }
> - }
> -
> - if (brw->gen == 6 && ir->op == ir_tg4) {
> - emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
> - }
> -
> - swizzle_result(ir, src_reg(inst->dst), sampler);
> -}
> -
> -/**
> - * Apply workarounds for Gen6 gather with UINT/SINT
> - */
> -void
> -vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
> -{
> - if (!wa)
> - return;
> -
> - int width = (wa & WA_8BIT) ? 8 : 16;
> - dst_reg dst_f = dst;
> - dst_f.type = BRW_REGISTER_TYPE_F;
> -
> - /* Convert from UNORM to UINT */
> - emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
> - emit(MOV(dst, src_reg(dst_f)));
> -
> - if (wa & WA_SIGN) {
> - /* Reinterpret the UINT value as a signed INT value by
> - * shifting the sign bit into place, then shifting back
> - * preserving sign.
> - */
> - emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
> - emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
> - }
> -}
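
As an aside, the workaround above is just the usual shift-based sign
extension after rescaling the UNORM result back to an integer.  A rough
scalar sketch in C, with made-up names (width comes from WA_8BIT,
is_signed from WA_SIGN):

   #include <stdbool.h>
   #include <stdint.h>

   static int32_t gather_wa_fixup(float unorm, int width, bool is_signed)
   {
      /* Undo the UNORM encoding: scale by 2^width - 1 and truncate,
       * matching the MUL/MOV pair above.
       */
      int32_t v = (int32_t)(unorm * (float)((1 << width) - 1));

      if (is_signed) {
         /* Sign-extend the low 'width' bits: shift the sign bit up to
          * bit 31, then arithmetic-shift back down (the SHL/ASR pair).
          */
         v = (int32_t)((uint32_t)v << (32 - width)) >> (32 - width);
      }
      return v;
   }
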
> -
> -/**
> - * Set up the gather channel based on the swizzle, for gather4.
> - */
> -uint32_t
> -vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
> -{
> - ir_constant *chan = ir->lod_info.component->as_constant();
> - int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
> - switch (swiz) {
> - case SWIZZLE_X: return 0;
> - case SWIZZLE_Y:
> - /* gather4 sampler is broken for green channel on RG32F --
> - * we must ask for blue instead.
> - */
> - if (key->tex.gather_channel_quirk_mask & (1<<sampler))
> - return 2;
> - return 1;
> - case SWIZZLE_Z: return 2;
> - case SWIZZLE_W: return 3;
> - default:
> - unreachable("Not reached"); /* zero, one swizzles handled already */
> - }
> -}
> -
> -void
> -vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
> -{
> - int s = key->tex.swizzles[sampler];
> -
> - this->result = src_reg(this, ir->type);
> - dst_reg swizzled_result(this->result);
> -
> - if (ir->op == ir_query_levels) {
> - /* # levels is in .w */
> - orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
> - emit(MOV(swizzled_result, orig_val));
> - return;
> - }
> -
> - if (ir->op == ir_txs || ir->type == glsl_type::float_type
> - || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
> - emit(MOV(swizzled_result, orig_val));
> - return;
> - }
> -
> -
> - int zero_mask = 0, one_mask = 0, copy_mask = 0;
> - int swizzle[4] = {0};
> -
> - for (int i = 0; i < 4; i++) {
> - switch (GET_SWZ(s, i)) {
> - case SWIZZLE_ZERO:
> - zero_mask |= (1 << i);
> - break;
> - case SWIZZLE_ONE:
> - one_mask |= (1 << i);
> - break;
> - default:
> - copy_mask |= (1 << i);
> - swizzle[i] = GET_SWZ(s, i);
> - break;
> - }
> - }
> -
> - if (copy_mask) {
> - orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
> - swizzled_result.writemask = copy_mask;
> - emit(MOV(swizzled_result, orig_val));
> - }
> -
> - if (zero_mask) {
> - swizzled_result.writemask = zero_mask;
> - emit(MOV(swizzled_result, src_reg(0.0f)));
> - }
> -
> - if (one_mask) {
> - swizzled_result.writemask = one_mask;
> - emit(MOV(swizzled_result, src_reg(1.0f)));
> - }
> -}
> -
> -void
> -vec4_visitor::visit(ir_return *)
> -{
> - unreachable("not reached");
> -}
> -
> -void
> -vec4_visitor::visit(ir_discard *)
> -{
> - unreachable("not reached");
> -}
> -
> -void
> -vec4_visitor::visit(ir_if *ir)
> -{
> - /* Don't point the annotation at the if statement, because then it plus
> - * the then and else blocks get printed.
> - */
> - this->base_ir = ir->condition;
> -
> - if (brw->gen == 6) {
> - emit_if_gen6(ir);
> - } else {
> - enum brw_predicate predicate;
> - emit_bool_to_cond_code(ir->condition, &predicate);
> - emit(IF(predicate));
> - }
> -
> - visit_instructions(&ir->then_instructions);
> -
> - if (!ir->else_instructions.is_empty()) {
> - this->base_ir = ir->condition;
> - emit(BRW_OPCODE_ELSE);
> -
> - visit_instructions(&ir->else_instructions);
> - }
> -
> - this->base_ir = ir->condition;
> - emit(BRW_OPCODE_ENDIF);
> -}
> -
> -void
> -vec4_visitor::visit(ir_emit_vertex *)
> -{
> - unreachable("not reached");
> -}
> -
> -void
> -vec4_visitor::visit(ir_end_primitive *)
> -{
> - unreachable("not reached");
> -}
> -
> -void
> -vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
> - dst_reg dst, src_reg offset,
> - src_reg src0, src_reg src1)
> -{
> - unsigned mlen = 0;
> -
> - /* Set the atomic operation offset. */
> - emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
> - mlen++;
> -
> - /* Set the atomic operation arguments. */
> - if (src0.file != BAD_FILE) {
> - emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
> - mlen++;
> - }
> -
> - if (src1.file != BAD_FILE) {
> - emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
> - mlen++;
> - }
> -
> - /* Emit the instruction. Note that this maps to the normal SIMD8
> - * untyped atomic message on Ivy Bridge, but that's OK because
> - * unused channels will be masked out.
> - */
> - vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
> - src_reg(atomic_op), src_reg(surf_index));
> - inst->base_mrf = 0;
> - inst->mlen = mlen;
> -}
> -
> -void
> -vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
> - src_reg offset)
> -{
> - /* Set the surface read offset. */
> - emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
> -
> - /* Emit the instruction. Note that this maps to the normal SIMD8
> - * untyped surface read message, but that's OK because unused
> - * channels will be masked out.
> - */
> - vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
> - dst, src_reg(surf_index));
> - inst->base_mrf = 0;
> - inst->mlen = 1;
> -}
> -
> -void
> -vec4_visitor::emit_ndc_computation()
> -{
> - /* Get the position */
> - src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
> -
> - /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
> - dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
> - output_reg[BRW_VARYING_SLOT_NDC] = ndc;
> -
> - current_annotation = "NDC";
> - dst_reg ndc_w = ndc;
> - ndc_w.writemask = WRITEMASK_W;
> - src_reg pos_w = pos;
> - pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
> - emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
> -
> - dst_reg ndc_xyz = ndc;
> - ndc_xyz.writemask = WRITEMASK_XYZ;
> -
> - emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
> -}
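
For reference, the NDC setup above is just the perspective divide done by
hand.  A scalar sketch, with an illustrative struct:

   struct vec4f { float x, y, z, w; };

   static struct vec4f compute_ndc(struct vec4f pos)
   {
      struct vec4f ndc;
      ndc.w = 1.0f / pos.w;     /* the SHADER_OPCODE_RCP on pos.wwww */
      ndc.x = pos.x * ndc.w;    /* the MUL of pos.xyz by 1/w */
      ndc.y = pos.y * ndc.w;
      ndc.z = pos.z * ndc.w;
      return ndc;
   }
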
> -
> -void
> -vec4_visitor::emit_psiz_and_flags(dst_reg reg)
> -{
> - if (brw->gen < 6 &&
> - ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
> - key->userclip_active || brw->has_negative_rhw_bug)) {
> - dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
> - dst_reg header1_w = header1;
> - header1_w.writemask = WRITEMASK_W;
> -
> - emit(MOV(header1, 0u));
> -
> - if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
> - src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
> -
> - current_annotation = "Point size";
> - emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
> - emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
> - }
> -
> - if (key->userclip_active) {
> - current_annotation = "Clipping flags";
> - dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
> - dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
> -
> - emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
> - emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
> - emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
> -
> - emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
> - emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
> - emit(SHL(flags1, src_reg(flags1), src_reg(4)));
> - emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
> - }
> -
> - /* i965 clipping workaround:
> - * 1) Test for -ve rhw
> - * 2) If set,
> - * set ndc = (0,0,0,0)
> - * set ucp[6] = 1
> - *
> - * Later, clipping will detect ucp[6] and ensure the primitive is
> - * clipped against all fixed planes.
> - */
> - if (brw->has_negative_rhw_bug) {
> - src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
> - ndc_w.swizzle = BRW_SWIZZLE_WWWW;
> - emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
> - vec4_instruction *inst;
> - inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - }
> -
> - emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
> - } else if (brw->gen < 6) {
> - emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
> - } else {
> - emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
> - if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
> - dst_reg reg_w = reg;
> - reg_w.writemask = WRITEMASK_W;
> - emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
> - }
> - if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
> - dst_reg reg_y = reg;
> - reg_y.writemask = WRITEMASK_Y;
> - reg_y.type = BRW_REGISTER_TYPE_D;
> - emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
> - }
> - if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
> - dst_reg reg_z = reg;
> - reg_z.writemask = WRITEMASK_Z;
> - reg_z.type = BRW_REGISTER_TYPE_D;
> - emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
> - }
> - }
> -}
> -
> -void
> -vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
> -{
> - /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
> - *
> - * "If a linked set of shaders forming the vertex stage contains no
> - * static write to gl_ClipVertex or gl_ClipDistance, but the
> - * application has requested clipping against user clip planes through
> - * the API, then the coordinate written to gl_Position is used for
> - * comparison against the user clip planes."
> - *
> - * This function is only called if the shader didn't write to
> - * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
> - * if the user wrote to it; otherwise we use gl_Position.
> - */
> - gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
> - if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
> - clip_vertex = VARYING_SLOT_POS;
> - }
> -
> - for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
> - ++i) {
> - reg.writemask = 1 << i;
> - emit(DP4(reg,
> - src_reg(output_reg[clip_vertex]),
> - src_reg(this->userplane[i + offset])));
> - }
> -}
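
Each DP4 above is the usual user-clip-plane distance, i.e. the dot product
of the clip vertex with one plane.  In scalar terms (names illustrative):

   static float clip_distance(const float plane[4], const float clip_vertex[4])
   {
      return plane[0] * clip_vertex[0] + plane[1] * clip_vertex[1] +
             plane[2] * clip_vertex[2] + plane[3] * clip_vertex[3];
   }
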
> -
> -vec4_instruction *
> -vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
> -{
> - assert (varying < VARYING_SLOT_MAX);
> - reg.type = output_reg[varying].type;
> - current_annotation = output_reg_annotation[varying];
> - /* Copy the register, saturating if necessary */
> - return emit(MOV(reg, src_reg(output_reg[varying])));
> -}
> -
> -void
> -vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
> -{
> - reg.type = BRW_REGISTER_TYPE_F;
> -
> - switch (varying) {
> - case VARYING_SLOT_PSIZ:
> - {
> - /* PSIZ is always in slot 0, and is coupled with other flags. */
> - current_annotation = "indices, point width, clip flags";
> - emit_psiz_and_flags(reg);
> - break;
> - }
> - case BRW_VARYING_SLOT_NDC:
> - current_annotation = "NDC";
> - emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
> - break;
> - case VARYING_SLOT_POS:
> - current_annotation = "gl_Position";
> - emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
> - break;
> - case VARYING_SLOT_EDGE:
> - /* This is present when doing unfilled polygons. We're supposed to copy
> - * the edge flag from the user-provided vertex array
> - * (glEdgeFlagPointer), or otherwise copy the current value of that
> - * attribute (which starts as 1.0f). This is then used in clipping to
> - * determine which edges should be drawn as wireframe.
> - */
> - current_annotation = "edge flag";
> - emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
> - glsl_type::float_type, WRITEMASK_XYZW))));
> - break;
> - case BRW_VARYING_SLOT_PAD:
> - /* No need to write to this slot */
> - break;
> - case VARYING_SLOT_COL0:
> - case VARYING_SLOT_COL1:
> - case VARYING_SLOT_BFC0:
> - case VARYING_SLOT_BFC1: {
> - /* These built-in varyings are only supported in compatibility mode,
> - * and we only support GS in core profile. So, this must be a vertex
> - * shader.
> - */
> - assert(stage == MESA_SHADER_VERTEX);
> - vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
> - if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
> - inst->saturate = true;
> - break;
> - }
> -
> - default:
> - emit_generic_urb_slot(reg, varying);
> - break;
> - }
> -}
> -
> -static int
> -align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
> -{
> - if (brw->gen >= 6) {
> - /* URB data written (does not include the message header reg) must
> - * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
> - * section 5.4.3.2.2: URB_INTERLEAVED.
> - *
> - * URB entries are allocated on a multiple of 1024 bits, so an
> - * extra 128 bits written here to make the end align to 256 is
> - * no problem.
> - */
> - if ((mlen % 2) != 1)
> - mlen++;
> - }
> -
> - return mlen;
> -}
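
If I'm reading this right, mlen here still includes the header register, so
the data portion is mlen - 1 and rounding it up to an even number of
registers means making mlen odd, e.g.:

   align_interleaved_urb_mlen(brw, 5) == 5   /* 4 data regs, already aligned */
   align_interleaved_urb_mlen(brw, 6) == 7   /* 5 data regs, padded to 6 */
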
> -
> -
> -/**
> - * Generates the VUE payload plus the necessary URB write instructions to
> - * output it.
> - *
> - * The VUE layout is documented in Volume 2a.
> - */
> -void
> -vec4_visitor::emit_vertex()
> -{
> - /* MRF 0 is reserved for the debugger, so start with message header
> - * in MRF 1.
> - */
> - int base_mrf = 1;
> - int mrf = base_mrf;
> - /* In the process of generating our URB write message contents, we
> - * may need to unspill a register or load from an array. Those
> - * reads would use MRFs 14-15.
> - */
> - int max_usable_mrf = 13;
> -
> - /* The following assertion verifies that max_usable_mrf causes an
> - * even-numbered amount of URB write data, which will meet gen6's
> - * requirements for length alignment.
> - */
> - assert ((max_usable_mrf - base_mrf) % 2 == 0);
> -
> - /* First mrf is the g0-based message header containing URB handles and
> - * such.
> - */
> - emit_urb_write_header(mrf++);
> -
> - if (brw->gen < 6) {
> - emit_ndc_computation();
> - }
> -
> - /* Lower legacy ff and ClipVertex clipping to clip distances */
> - if (key->userclip_active && !prog->UsesClipDistanceOut) {
> - current_annotation = "user clip distances";
> -
> - output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
> - output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
> -
> - emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
> - emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
> - }
> -
> - /* We may need to split this up into several URB writes, so do them in a
> - * loop.
> - */
> - int slot = 0;
> - bool complete = false;
> - do {
> - /* URB offset is in URB row increments, and each of our MRFs is half of
> - * one of those, since we're doing interleaved writes.
> - */
> - int offset = slot / 2;
> -
> - mrf = base_mrf + 1;
> - for (; slot < prog_data->vue_map.num_slots; ++slot) {
> - emit_urb_slot(dst_reg(MRF, mrf++),
> - prog_data->vue_map.slot_to_varying[slot]);
> -
> - /* If this was max_usable_mrf, we can't fit anything more into this
> - * URB WRITE.
> - */
> - if (mrf > max_usable_mrf) {
> - slot++;
> - break;
> - }
> - }
> -
> - complete = slot >= prog_data->vue_map.num_slots;
> - current_annotation = "URB write";
> - vec4_instruction *inst = emit_urb_write_opcode(complete);
> - inst->base_mrf = base_mrf;
> - inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
> - inst->offset += offset;
> - } while(!complete);
> -}
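
A worked example of the splitting above, just restating the loop: with
base_mrf = 1 and max_usable_mrf = 13, the first URB_WRITE carries slot MRFs
2..13 (twelve slots), and since two interleaved slots share one URB row the
second write, if any, starts at offset = slot / 2 = 6 rows.
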
> -
> -
> -src_reg
> -vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
> - src_reg *reladdr, int reg_offset)
> -{
> - /* Because we store the values to scratch interleaved like our
> - * vertex data, we need to scale the vec4 index by 2.
> - */
> - int message_header_scale = 2;
> -
> - /* Pre-gen6, the message header uses byte offsets instead of vec4
> - * (16-byte) offset units.
> - */
> - if (brw->gen < 6)
> - message_header_scale *= 16;
> -
> - if (reladdr) {
> - src_reg index = src_reg(this, glsl_type::int_type);
> -
> - emit_before(block, inst, ADD(dst_reg(index), *reladdr,
> - src_reg(reg_offset)));
> - emit_before(block, inst, MUL(dst_reg(index), index,
> - src_reg(message_header_scale)));
> -
> - return index;
> - } else {
> - return src_reg(reg_offset * message_header_scale);
> - }
> -}
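
In other words (a worked example, not new behaviour): for a constant
reg_offset of 3 this returns the immediate 6 on gen6+ (the vec4 index
scaled by 2 for the interleaved layout), and 96 on older parts, where the
header wants byte offsets and each interleaved row is 2 * 16 bytes.
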
> -
> -src_reg
> -vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
> - src_reg *reladdr, int reg_offset)
> -{
> - if (reladdr) {
> - src_reg index = src_reg(this, glsl_type::int_type);
> -
> - emit_before(block, inst, ADD(dst_reg(index), *reladdr,
> - src_reg(reg_offset)));
> -
> - /* Pre-gen6, the message header uses byte offsets instead of vec4
> - * (16-byte) offset units.
> - */
> - if (brw->gen < 6) {
> - emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
> - }
> -
> - return index;
> - } else if (brw->gen >= 8) {
> - /* Store the offset in a GRF so we can send-from-GRF. */
> - src_reg offset = src_reg(this, glsl_type::int_type);
> - emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
> - return offset;
> - } else {
> - int message_header_scale = brw->gen < 6 ? 16 : 1;
> - return src_reg(reg_offset * message_header_scale);
> - }
> -}
> -
> -/**
> - * Emits an instruction before @inst to load the value named by @orig_src
> - * from scratch space at @base_offset to @temp.
> - *
> - * @base_offset is measured in 32-byte units (the size of a register).
> - */
> -void
> -vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
> - dst_reg temp, src_reg orig_src,
> - int base_offset)
> -{
> - int reg_offset = base_offset + orig_src.reg_offset;
> - src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
> - reg_offset);
> -
> - emit_before(block, inst, SCRATCH_READ(temp, index));
> -}
> -
> -/**
> - * Emits an instruction after @inst to store the value to be written
> - * to @orig_dst to scratch space at @base_offset, from @temp.
> - *
> - * @base_offset is measured in 32-byte units (the size of a register).
> - */
> -void
> -vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
> - int base_offset)
> -{
> - int reg_offset = base_offset + inst->dst.reg_offset;
> - src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
> - reg_offset);
> -
> - /* Create a temporary register to store *inst's result in.
> - *
> - * We have to be careful in MOVing from our temporary result register in
> - * the scratch write. If we swizzle from channels of the temporary that
> - * weren't initialized, it will confuse live interval analysis, which will
> - * make spilling fail to make progress.
> - */
> - const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
> - inst->dst.type),
> - brw_swizzle_for_mask(inst->dst.writemask));
> - dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
> - inst->dst.writemask));
> - vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
> - write->predicate = inst->predicate;
> - write->ir = inst->ir;
> - write->annotation = inst->annotation;
> - inst->insert_after(block, write);
> -
> - inst->dst.file = temp.file;
> - inst->dst.reg = temp.reg;
> - inst->dst.reg_offset = temp.reg_offset;
> - inst->dst.reladdr = NULL;
> -}
> -
> -/**
> - * We can't generally support array access in GRF space, because a
> - * single instruction's destination can only span 2 contiguous
> - * registers. So, we send all GRF arrays that get variable index
> - * access to scratch space.
> - */
> -void
> -vec4_visitor::move_grf_array_access_to_scratch()
> -{
> - int scratch_loc[this->alloc.count];
> - memset(scratch_loc, -1, sizeof(scratch_loc));
> -
> - /* First, calculate the set of virtual GRFs that need to be punted
> - * to scratch due to having any array access on them, and where in
> - * scratch.
> - */
> - foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
> - if (inst->dst.file == GRF && inst->dst.reladdr &&
> - scratch_loc[inst->dst.reg] == -1) {
> - scratch_loc[inst->dst.reg] = c->last_scratch;
> - c->last_scratch += this->alloc.sizes[inst->dst.reg];
> - }
> -
> - for (int i = 0 ; i < 3; i++) {
> - src_reg *src = &inst->src[i];
> -
> - if (src->file == GRF && src->reladdr &&
> - scratch_loc[src->reg] == -1) {
> - scratch_loc[src->reg] = c->last_scratch;
> - c->last_scratch += this->alloc.sizes[src->reg];
> - }
> - }
> - }
> -
> - /* Now, for anything that will be accessed through scratch, rewrite
> - * it to load/store. Note that this is a _safe list walk, because
> - * we may generate a new scratch_write instruction after the one
> - * we're processing.
> - */
> - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
> - /* Set up the annotation tracking for new generated instructions. */
> - base_ir = inst->ir;
> - current_annotation = inst->annotation;
> -
> - if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
> - emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
> - }
> -
> - for (int i = 0 ; i < 3; i++) {
> - if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
> - continue;
> -
> - dst_reg temp = dst_reg(this, glsl_type::vec4_type);
> -
> - emit_scratch_read(block, inst, temp, inst->src[i],
> - scratch_loc[inst->src[i].reg]);
> -
> - inst->src[i].file = temp.file;
> - inst->src[i].reg = temp.reg;
> - inst->src[i].reg_offset = temp.reg_offset;
> - inst->src[i].reladdr = NULL;
> - }
> - }
> -}
> -
> -/**
> - * Emits an instruction before @inst to load the value named by @orig_src
> - * from the pull constant buffer (surface) at @base_offset to @temp.
> - */
> -void
> -vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
> - dst_reg temp, src_reg orig_src,
> - int base_offset)
> -{
> - int reg_offset = base_offset + orig_src.reg_offset;
> - src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
> - src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
> - reg_offset);
> - vec4_instruction *load;
> -
> - if (brw->gen >= 7) {
> - dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
> -
> - /* We have to use a message header on Skylake to get SIMD4x2 mode.
> - * Reserve space for the register.
> - */
> - if (brw->gen >= 9) {
> - grf_offset.reg_offset++;
> - alloc.sizes[grf_offset.reg] = 2;
> - }
> -
> - grf_offset.type = offset.type;
> - emit_before(block, inst, MOV(grf_offset, offset));
> -
> - load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
> - temp, index, src_reg(grf_offset));
> - load->mlen = 1;
> - } else {
> - load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
> - temp, index, offset);
> - load->base_mrf = 14;
> - load->mlen = 1;
> - }
> - emit_before(block, inst, load);
> -}
> -
> -/**
> - * Implements array access of uniforms by inserting a
> - * PULL_CONSTANT_LOAD instruction.
> - *
> - * Unlike temporary GRF array access (where we don't support it due to
> - * the difficulty of doing relative addressing on instruction
> - * destinations), we could potentially do array access of uniforms
> - * that were loaded in GRF space as push constants. In real-world
> - * usage we've seen, though, the arrays being used are always larger
> - * than we could load as push constants, so just always move all
> - * uniform array access out to a pull constant buffer.
> - */
> -void
> -vec4_visitor::move_uniform_array_access_to_pull_constants()
> -{
> - int pull_constant_loc[this->uniforms];
> - memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
> - bool nested_reladdr;
> -
> - /* Walk through and find array access of uniforms. Put a copy of that
> - * uniform in the pull constant buffer.
> - *
> - * Note that we don't move constant-indexed accesses to arrays. No
> - * testing has been done of the performance impact of this choice.
> - */
> - do {
> - nested_reladdr = false;
> -
> - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
> - for (int i = 0 ; i < 3; i++) {
> - if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
> - continue;
> -
> - int uniform = inst->src[i].reg;
> -
> - if (inst->src[i].reladdr->reladdr)
> - nested_reladdr = true; /* will need another pass */
> -
> - /* If this array isn't already present in the pull constant buffer,
> - * add it.
> - */
> - if (pull_constant_loc[uniform] == -1) {
> - const gl_constant_value **values =
> - &stage_prog_data->param[uniform * 4];
> -
> - pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
> -
> - assert(uniform < uniform_array_size);
> - for (int j = 0; j < uniform_size[uniform] * 4; j++) {
> - stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
> - = values[j];
> - }
> - }
> -
> - /* Set up the annotation tracking for new generated instructions. */
> - base_ir = inst->ir;
> - current_annotation = inst->annotation;
> -
> - dst_reg temp = dst_reg(this, glsl_type::vec4_type);
> -
> - emit_pull_constant_load(block, inst, temp, inst->src[i],
> - pull_constant_loc[uniform]);
> -
> - inst->src[i].file = temp.file;
> - inst->src[i].reg = temp.reg;
> - inst->src[i].reg_offset = temp.reg_offset;
> - inst->src[i].reladdr = NULL;
> - }
> - }
> - } while (nested_reladdr);
> -
> - /* Now there are no accesses of the UNIFORM file with a reladdr, so
> - * no need to track them as larger-than-vec4 objects. This will be
> - * relied on in cutting out unused uniform vectors from push
> - * constants.
> - */
> - split_uniform_registers();
> -}
> -
> -void
> -vec4_visitor::resolve_ud_negate(src_reg *reg)
> -{
> - if (reg->type != BRW_REGISTER_TYPE_UD ||
> - !reg->negate)
> - return;
> -
> - src_reg temp = src_reg(this, glsl_type::uvec4_type);
> - emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
> - *reg = temp;
> -}
> -
> -/**
> - * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
> - *
> - * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
> - * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
> - */
> -void
> -vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
> -{
> - assert(brw->gen <= 5);
> -
> - if (!rvalue->type->is_boolean())
> - return;
> -
> - src_reg and_result = src_reg(this, rvalue->type);
> - src_reg neg_result = src_reg(this, rvalue->type);
> - emit(AND(dst_reg(and_result), *reg, src_reg(1)));
> - emit(MOV(dst_reg(neg_result), negate(and_result)));
> - *reg = neg_result;
> -}
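
For what it's worth, the scalar equivalent of the fixup above is: keep the
one bit CMP actually defines, then negate it so true becomes all ones:

   #include <stdint.h>

   static int32_t resolve_bool(int32_t cmp_result)
   {
      return -(cmp_result & 1);   /* 0 stays 0, 1 becomes 0xffffffff */
   }
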
> -
> -vec4_visitor::vec4_visitor(struct brw_context *brw,
> - struct brw_vec4_compile *c,
> - struct gl_program *prog,
> - const struct brw_vue_prog_key *key,
> - struct brw_vue_prog_data *prog_data,
> - struct gl_shader_program *shader_prog,
> - gl_shader_stage stage,
> - void *mem_ctx,
> - bool no_spills,
> - shader_time_shader_type st_base,
> - shader_time_shader_type st_written,
> - shader_time_shader_type st_reset)
> - : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
> - c(c),
> - key(key),
> - prog_data(prog_data),
> - sanity_param_count(0),
> - fail_msg(NULL),
> - first_non_payload_grf(0),
> - need_all_constants_in_pull_buffer(false),
> - no_spills(no_spills),
> - st_base(st_base),
> - st_written(st_written),
> - st_reset(st_reset)
> -{
> - this->mem_ctx = mem_ctx;
> - this->failed = false;
> -
> - this->base_ir = NULL;
> - this->current_annotation = NULL;
> - memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
> -
> - this->variable_ht = hash_table_ctor(0,
> - hash_table_pointer_hash,
> - hash_table_pointer_compare);
> -
> - this->virtual_grf_start = NULL;
> - this->virtual_grf_end = NULL;
> - this->live_intervals = NULL;
> -
> - this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
> -
> - this->uniforms = 0;
> -
> - /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
> - * at least one. See setup_uniforms() in brw_vec4.cpp.
> - */
> - this->uniform_array_size = 1;
> - if (prog_data) {
> - this->uniform_array_size =
> - MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
> - }
> -
> - this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
> - this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
> -}
> -
> -vec4_visitor::~vec4_visitor()
> -{
> - hash_table_dtor(this->variable_ht);
> -}
> -
> -
> -void
> -vec4_visitor::fail(const char *format, ...)
> -{
> - va_list va;
> - char *msg;
> -
> - if (failed)
> - return;
> -
> - failed = true;
> -
> - va_start(va, format);
> - msg = ralloc_vasprintf(mem_ctx, format, va);
> - va_end(va);
> - msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
> -
> - this->fail_msg = msg;
> -
> - if (debug_enabled) {
> - fprintf(stderr, "%s", msg);
> - }
> -}
> -
> -} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
> index c3b0233..c60e532 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
> @@ -37,7 +37,7 @@ extern "C" {
> using namespace brw;
>
> void
> -vec4_visitor::emit_vp_sop(enum brw_conditional_mod conditional_mod,
> +vec4_god::emit_vp_sop(enum brw_conditional_mod conditional_mod,
> dst_reg dst, src_reg src0, src_reg src1,
> src_reg one)
> {
> @@ -50,7 +50,7 @@ vec4_visitor::emit_vp_sop(enum brw_conditional_mod conditional_mod,
> }
>
> void
> -vec4_vs_visitor::emit_program_code()
> +vec4_vs_god::emit_program_code()
> {
> this->need_all_constants_in_pull_buffer = false;
>
> @@ -407,7 +407,7 @@ vec4_vs_visitor::emit_program_code()
> }
>
> void
> -vec4_vs_visitor::setup_vp_regs()
> +vec4_vs_god::setup_vp_regs()
> {
> /* PROGRAM_TEMPORARY */
> int num_temp = prog->NumTemporaries;
> @@ -452,7 +452,7 @@ vec4_vs_visitor::setup_vp_regs()
> }
>
> dst_reg
> -vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register &dst)
> +vec4_vs_god::get_vp_dst_reg(const prog_dst_register &dst)
> {
> dst_reg result;
>
> @@ -485,7 +485,7 @@ vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register &dst)
> }
>
> src_reg
> -vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src)
> +vec4_vs_god::get_vp_src_reg(const prog_src_register &src)
> {
> struct gl_program_parameter_list *plist =
> vs_compile->vp->program.Base.Parameters;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_god.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_god.cpp
> new file mode 100644
> index 0000000..0b69409
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_god.cpp
> @@ -0,0 +1,231 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +
> +#include "brw_vs.h"
> +#include "main/context.h"
> +
> +
> +namespace brw {
> +
> +void
> +vec4_vs_god::emit_prolog()
> +{
> + dst_reg sign_recovery_shift;
> + dst_reg normalize_factor;
> + dst_reg es3_normalize_factor;
> +
> + for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
> + if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
> + uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
> + dst_reg reg(ATTR, i);
> + dst_reg reg_d = reg;
> + reg_d.type = BRW_REGISTER_TYPE_D;
> + dst_reg reg_ud = reg;
> + reg_ud.type = BRW_REGISTER_TYPE_UD;
> +
> + /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
> + * come in as floating point conversions of the integer values.
> + */
> + if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
> + dst_reg dst = reg;
> + dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> + dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
> + emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
> + }
> +
> + /* Do sign recovery for 2101010 formats if required. */
> + if (wa_flags & BRW_ATTRIB_WA_SIGN) {
> + if (sign_recovery_shift.file == BAD_FILE) {
> + /* shift constant: <22,22,22,30> */
> + sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
> + emit(MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
> + emit(MOV(writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
> + }
> +
> + emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
> + emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
> + }
> +
> + /* Apply BGRA swizzle if required. */
> + if (wa_flags & BRW_ATTRIB_WA_BGRA) {
> + src_reg temp = src_reg(reg);
> + temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
> + emit(MOV(reg, temp));
> + }
> +
> + if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
> + /* ES 3.0 has different rules for converting signed normalized
> + * fixed-point numbers than desktop GL.
> + */
> + if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
> + /* According to equation 2.2 of the ES 3.0 specification,
> + * signed normalization conversion is done by:
> + *
> + * f = c / (2^(b-1)-1)
> + */
> + if (es3_normalize_factor.file == BAD_FILE) {
> + /* mul constant: 1 / (2^(b-1) - 1) */
> + es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
> + emit(MOV(writemask(es3_normalize_factor, WRITEMASK_XYZ),
> + src_reg(1.0f / ((1<<9) - 1))));
> + emit(MOV(writemask(es3_normalize_factor, WRITEMASK_W),
> + src_reg(1.0f / ((1<<1) - 1))));
> + }
> +
> + dst_reg dst = reg;
> + dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> + emit(MOV(dst, src_reg(reg_d)));
> + emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
> + emit_minmax(BRW_CONDITIONAL_GE, dst, src_reg(dst), src_reg(-1.0f));
> + } else {
> + /* The following equations are from the OpenGL 3.2 specification:
> + *
> + * 2.1 unsigned normalization
> + * f = c/(2^n-1)
> + *
> + * 2.2 signed normalization
> + * f = (2c+1)/(2^n-1)
> + *
> + * Both of these share a common divisor, which is represented by
> + * "normalize_factor" in the code below.
> + */
> + if (normalize_factor.file == BAD_FILE) {
> + /* 1 / (2^b - 1) for b=<10,10,10,2> */
> + normalize_factor = dst_reg(this, glsl_type::vec4_type);
> + emit(MOV(writemask(normalize_factor, WRITEMASK_XYZ),
> + src_reg(1.0f / ((1<<10) - 1))));
> + emit(MOV(writemask(normalize_factor, WRITEMASK_W),
> + src_reg(1.0f / ((1<<2) - 1))));
> + }
> +
> + dst_reg dst = reg;
> + dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> + emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
> +
> + /* For signed normalization, we want the numerator to be 2c+1. */
> + if (wa_flags & BRW_ATTRIB_WA_SIGN) {
> + emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
> + emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
> + }
> +
> + emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
> + }
> + }
> +
> + if (wa_flags & BRW_ATTRIB_WA_SCALE) {
> + dst_reg dst = reg;
> + dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> + emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
> + }
> + }
> + }
> +}
> +
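
For reference, the scalar form of the conversions this prolog implements
(b is 10 for xyz and 2 for w in the 2101010 formats; function names are
made up):

   #include <stdint.h>

   static float norm_unsigned(uint32_t c, int b)
   {
      return (float)c / (float)((1u << b) - 1);           /* f = c/(2^b - 1) */
   }

   static float norm_signed_gl(int32_t c, int b)
   {
      return (2.0f * c + 1.0f) / (float)((1u << b) - 1);  /* f = (2c+1)/(2^b - 1) */
   }

   static float norm_signed_es3(int32_t c, int b)
   {
      float f = (float)c / (float)((1u << (b - 1)) - 1);  /* f = c/(2^(b-1) - 1) */
      return f < -1.0f ? -1.0f : f;                       /* clamp at -1, as emit_minmax does */
   }
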
> +
> +dst_reg *
> +vec4_vs_god::make_reg_for_system_value(ir_variable *ir)
> +{
> + /* VertexID is stored by the VF as the last vertex element, but
> + * we don't represent it with a flag in inputs_read, so we call
> + * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
> + */
> + dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
> +
> + switch (ir->data.location) {
> + case SYSTEM_VALUE_BASE_VERTEX:
> + reg->writemask = WRITEMASK_X;
> + vs_prog_data->uses_vertexid = true;
> + break;
> + case SYSTEM_VALUE_VERTEX_ID:
> + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> + reg->writemask = WRITEMASK_Z;
> + vs_prog_data->uses_vertexid = true;
> + break;
> + case SYSTEM_VALUE_INSTANCE_ID:
> + reg->writemask = WRITEMASK_W;
> + vs_prog_data->uses_instanceid = true;
> + break;
> + default:
> + unreachable("not reached");
> + }
> +
> + return reg;
> +}
> +
> +
> +void
> +vec4_vs_god::emit_urb_write_header(int mrf)
> +{
> + /* No need to do anything for VS; an implied write to this MRF will be
> + * performed by VS_OPCODE_URB_WRITE.
> + */
> + (void) mrf;
> +}
> +
> +
> +vec4_instruction *
> +vec4_vs_god::emit_urb_write_opcode(bool complete)
> +{
> + /* For VS, the URB writes end the thread. */
> + if (complete) {
> + if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> + emit_shader_time_end();
> + }
> +
> + vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
> + inst->urb_write_flags = complete ?
> + BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
> +
> + return inst;
> +}
> +
> +
> +void
> +vec4_vs_god::emit_thread_end()
> +{
> + /* For VS, we always end the thread by emitting a single vertex.
> + * emit_urb_write_opcode() will take care of setting the eot flag on the
> + * SEND instruction.
> + */
> + emit_vertex();
> +}
> +
> +
> +vec4_vs_god::vec4_vs_god(struct brw_context *brw,
> + struct brw_vs_compile *vs_compile,
> + struct brw_vs_prog_data *vs_prog_data,
> + struct gl_shader_program *prog,
> + void *mem_ctx)
> + : vec4_god(brw, &vs_compile->base, &vs_compile->vp->program.Base,
> + &vs_compile->key.base, &vs_prog_data->base, prog,
> + MESA_SHADER_VERTEX,
> + mem_ctx, false /* no_spills */,
> + ST_VS, ST_VS_WRITTEN, ST_VS_RESET),
> + vs_compile(vs_compile),
> + vs_prog_data(vs_prog_data)
> +{
> +}
> +
> +
> +} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
> deleted file mode 100644
> index 4baf73e..0000000
> --- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
> +++ /dev/null
> @@ -1,231 +0,0 @@
> -/*
> - * Copyright © 2013 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - */
> -
> -
> -#include "brw_vs.h"
> -#include "main/context.h"
> -
> -
> -namespace brw {
> -
> -void
> -vec4_vs_visitor::emit_prolog()
> -{
> - dst_reg sign_recovery_shift;
> - dst_reg normalize_factor;
> - dst_reg es3_normalize_factor;
> -
> - for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
> - if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
> - uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
> - dst_reg reg(ATTR, i);
> - dst_reg reg_d = reg;
> - reg_d.type = BRW_REGISTER_TYPE_D;
> - dst_reg reg_ud = reg;
> - reg_ud.type = BRW_REGISTER_TYPE_UD;
> -
> - /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
> - * come in as floating point conversions of the integer values.
> - */
> - if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
> - dst_reg dst = reg;
> - dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> - dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
> - emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
> - }
> -
> - /* Do sign recovery for 2101010 formats if required. */
> - if (wa_flags & BRW_ATTRIB_WA_SIGN) {
> - if (sign_recovery_shift.file == BAD_FILE) {
> - /* shift constant: <22,22,22,30> */
> - sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
> - emit(MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
> - emit(MOV(writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
> - }
> -
> - emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
> - emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
> - }
> -
> - /* Apply BGRA swizzle if required. */
> - if (wa_flags & BRW_ATTRIB_WA_BGRA) {
> - src_reg temp = src_reg(reg);
> - temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
> - emit(MOV(reg, temp));
> - }
> -
> - if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
> - /* ES 3.0 has different rules for converting signed normalized
> - * fixed-point numbers than desktop GL.
> - */
> - if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
> - /* According to equation 2.2 of the ES 3.0 specification,
> - * signed normalization conversion is done by:
> - *
> - * f = c / (2^(b-1)-1)
> - */
> - if (es3_normalize_factor.file == BAD_FILE) {
> - /* mul constant: 1 / (2^(b-1) - 1) */
> - es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
> - emit(MOV(writemask(es3_normalize_factor, WRITEMASK_XYZ),
> - src_reg(1.0f / ((1<<9) - 1))));
> - emit(MOV(writemask(es3_normalize_factor, WRITEMASK_W),
> - src_reg(1.0f / ((1<<1) - 1))));
> - }
> -
> - dst_reg dst = reg;
> - dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> - emit(MOV(dst, src_reg(reg_d)));
> - emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
> - emit_minmax(BRW_CONDITIONAL_GE, dst, src_reg(dst), src_reg(-1.0f));
> - } else {
> - /* The following equations are from the OpenGL 3.2 specification:
> - *
> - * 2.1 unsigned normalization
> - * f = c/(2^n-1)
> - *
> - * 2.2 signed normalization
> - * f = (2c+1)/(2^n-1)
> - *
> - * Both of these share a common divisor, which is represented by
> - * "normalize_factor" in the code below.
> - */
> - if (normalize_factor.file == BAD_FILE) {
> - /* 1 / (2^b - 1) for b=<10,10,10,2> */
> - normalize_factor = dst_reg(this, glsl_type::vec4_type);
> - emit(MOV(writemask(normalize_factor, WRITEMASK_XYZ),
> - src_reg(1.0f / ((1<<10) - 1))));
> - emit(MOV(writemask(normalize_factor, WRITEMASK_W),
> - src_reg(1.0f / ((1<<2) - 1))));
> - }
> -
> - dst_reg dst = reg;
> - dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> - emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
> -
> - /* For signed normalization, we want the numerator to be 2c+1. */
> - if (wa_flags & BRW_ATTRIB_WA_SIGN) {
> - emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
> - emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
> - }
> -
> - emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
> - }
> - }
> -
> - if (wa_flags & BRW_ATTRIB_WA_SCALE) {
> - dst_reg dst = reg;
> - dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> - emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
> - }
> - }
> - }
> -}
> -
> -
> -dst_reg *
> -vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
> -{
> - /* VertexID is stored by the VF as the last vertex element, but
> - * we don't represent it with a flag in inputs_read, so we call
> - * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
> - */
> - dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
> -
> - switch (ir->data.location) {
> - case SYSTEM_VALUE_BASE_VERTEX:
> - reg->writemask = WRITEMASK_X;
> - vs_prog_data->uses_vertexid = true;
> - break;
> - case SYSTEM_VALUE_VERTEX_ID:
> - case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> - reg->writemask = WRITEMASK_Z;
> - vs_prog_data->uses_vertexid = true;
> - break;
> - case SYSTEM_VALUE_INSTANCE_ID:
> - reg->writemask = WRITEMASK_W;
> - vs_prog_data->uses_instanceid = true;
> - break;
> - default:
> - unreachable("not reached");
> - }
> -
> - return reg;
> -}
> -
> -
> -void
> -vec4_vs_visitor::emit_urb_write_header(int mrf)
> -{
> - /* No need to do anything for VS; an implied write to this MRF will be
> - * performed by VS_OPCODE_URB_WRITE.
> - */
> - (void) mrf;
> -}
> -
> -
> -vec4_instruction *
> -vec4_vs_visitor::emit_urb_write_opcode(bool complete)
> -{
> - /* For VS, the URB writes end the thread. */
> - if (complete) {
> - if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> - emit_shader_time_end();
> - }
> -
> - vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
> - inst->urb_write_flags = complete ?
> - BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
> -
> - return inst;
> -}
> -
> -
> -void
> -vec4_vs_visitor::emit_thread_end()
> -{
> - /* For VS, we always end the thread by emitting a single vertex.
> - * emit_urb_write_opcode() will take care of setting the eot flag on the
> - * SEND instruction.
> - */
> - emit_vertex();
> -}
> -
> -
> -vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
> - struct brw_vs_compile *vs_compile,
> - struct brw_vs_prog_data *vs_prog_data,
> - struct gl_shader_program *prog,
> - void *mem_ctx)
> - : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
> - &vs_compile->key.base, &vs_prog_data->base, prog,
> - MESA_SHADER_VERTEX,
> - mem_ctx, false /* no_spills */,
> - ST_VS, ST_VS_WRITTEN, ST_VS_RESET),
> - vs_compile(vs_compile),
> - vs_prog_data(vs_prog_data)
> -{
> -}
> -
> -
> -} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
> index ba2c23d..137990c 100644
> --- a/src/mesa/drivers/dri/i965/brw_vs.c
> +++ b/src/mesa/drivers/dri/i965/brw_vs.c
> @@ -232,7 +232,7 @@ do_vs_prog(struct brw_context *brw,
> } else {
> param_count = vp->program.Base.Parameters->NumParameters * 4;
> }
> - /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip
> + /* vec4_god::setup_uniform_clipplane_values() also uploads user clip
> * planes as uniforms.
> */
> param_count += c.key.base.nr_userclip_plane_consts * 4;
> diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
> index bad0f07..96ed4ce 100644
> --- a/src/mesa/drivers/dri/i965/brw_vs.h
> +++ b/src/mesa/drivers/dri/i965/brw_vs.h
> @@ -81,10 +81,10 @@ brw_upload_vs_prog(struct brw_context *brw);
>
> namespace brw {
>
> -class vec4_vs_visitor : public vec4_visitor
> +class vec4_vs_god : public vec4_god
> {
> public:
> - vec4_vs_visitor(struct brw_context *brw,
> + vec4_vs_god(struct brw_context *brw,
> struct brw_vs_compile *vs_compile,
> struct brw_vs_prog_data *vs_prog_data,
> struct gl_shader_program *prog,
> diff --git a/src/mesa/drivers/dri/i965/brw_wm_iz.cpp b/src/mesa/drivers/dri/i965/brw_wm_iz.cpp
> index 14930eb..8b0efda 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm_iz.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_wm_iz.cpp
> @@ -120,7 +120,7 @@ static const struct {
> * \param line_aa AA_NEVER, AA_ALWAYS or AA_SOMETIMES
> * \param lookup bitmask of IZ_* flags
> */
> -void fs_visitor::setup_payload_gen4()
> +void fs_god::setup_payload_gen4()
> {
> assert(stage == MESA_SHADER_FRAGMENT);
> brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_god.cpp b/src/mesa/drivers/dri/i965/gen6_gs_god.cpp
> new file mode 100644
> index 0000000..2ea3e6f
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/gen6_gs_god.cpp
> @@ -0,0 +1,776 @@
> +/*
> + * Copyright © 2014 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + * This code is based on original work by Ilia Mirkin.
> + */
> +
> +/**
> + * \file gen6_gs_god.cpp
> + *
> + * Gen6 geometry shader implementation
> + */
> +
> +#include "gen6_gs_god.h"
> +
> +const unsigned MAX_GS_INPUT_VERTICES = 6;
> +
> +namespace brw {
> +
> +void
> +gen6_gs_god::assign_binding_table_offsets()
> +{
> + /* In gen6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform
> + * feedback surfaces.
> + */
> + assign_common_binding_table_offsets(BRW_MAX_SOL_BINDINGS);
> +}
> +
> +void
> +gen6_gs_god::emit_prolog()
> +{
> + vec4_gs_god::emit_prolog();
> +
> + /* Gen6 geometry shaders must allocate an initial VUE handle via an
> + * FF_SYNC message. However, the documentation remarks that only one thread
> + * can write to the URB at a time and the FF_SYNC message provides the
> + * synchronization mechanism for this, so using this message effectively
> + * stalls the thread until it is its turn to write to the URB. Because of
> + * this, the best way to implement geometry shader algorithms in gen6 is to
> + * execute the algorithm before the FF_SYNC message to maximize parallelism.
> + *
> + * To achieve this we buffer the geometry shader outputs for each emitted
> + * vertex in vertex_output during operation. Then, when we have processed
> + * the last vertex (that is, at thread end time), we send the FF_SYNC
> + * message to allocate the initial VUE handle and write all buffered vertex
> + * data to the URB in one go.
> + *
> + * For each emitted vertex, vertex_output will hold vue_map.num_slots
> + * data items plus one additional item to hold required flags
> + * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
> + * which come right after the data items for that vertex. Vertex data and
> + * flags for the next vertex come right after the data items and flags for
> + * the previous vertex.
> + */
> + this->current_annotation = "gen6 prolog";
> + this->vertex_output = src_reg(this,
> + glsl_type::uint_type,
> + (prog_data->vue_map.num_slots + 1) *
> + c->gp->program.VerticesOut);
> + this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
> + emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
> +
> + /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
> + * so initialize it once to R0.
> + */
> + vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
> + retype(brw_vec8_grf(0, 0),
> + BRW_REGISTER_TYPE_UD)));
> + inst->force_writemask_all = true;
> +
> + /* This will be used as a temporary to store writeback data of FF_SYNC
> + * and URB_WRITE messages.
> + */
> + this->temp = src_reg(this, glsl_type::uint_type);
> +
> + /* This will be used to know when we are processing the first vertex of
> + * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
> + * that we are processing the first vertex in the primitive and to zero
> + * otherwise. This way we can use its value directly in the URB write
> + * headers.
> + */
> + this->first_vertex = src_reg(this, glsl_type::uint_type);
> + emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
> +
> + /* The FF_SYNC message needs to know the number of primitives generated,
> + * so keep a counter for this.
> + */
> + this->prim_count = src_reg(this, glsl_type::uint_type);
> + emit(MOV(dst_reg(this->prim_count), 0u));
> +
> + if (c->prog_data.gen6_xfb_enabled) {
> + /* Create a virtual register to hold destination indices in SOL */
> + this->destination_indices = src_reg(this, glsl_type::uvec4_type);
> + /* Create a virtual register to hold number of written primitives */
> + this->sol_prim_written = src_reg(this, glsl_type::uint_type);
> + /* Create a virtual register to hold Streamed Vertex Buffer Indices */
> + this->svbi = src_reg(this, glsl_type::uvec4_type);
> + /* Create a virtual register to hold max values of SVBI */
> + this->max_svbi = src_reg(this, glsl_type::uvec4_type);
> + emit(MOV(dst_reg(this->max_svbi),
> + src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
> +
> + xfb_setup();
> + }
> +
> + /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
> + * needs it we have to move it to a separate register where we can map
> + * the attribute.
> + *
> + * Notice that we cannot use a virtual register for this, because we need to
> + * map all input attributes to hardware registers in setup_payload(),
> + * which happens before virtual registers are mapped to hardware registers.
> + * We could work around that issue if we were able to compute the first
> + * non-payload register here and move the PrimitiveID information to that
> + * register, but we can't because at this point we don't know the final
> + * number of uniforms that will be included in the payload.
> + *
> + * So, what we do is to place PrimitiveID information in r1, which is always
> + * delivered as part of the payload, but it's only populated with data
> + * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
> + * in the 3DSTATE_GS state packet. That information can be obtained by other
> + * means though, so we can safely use r1 for this purpose.
> + */
> + if (c->prog_data.include_primitive_id) {
> + this->primitive_id =
> + src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
> + emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
> + }
> +}
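
(Aside: the vertex_output layout described in the comment block above, i.e. vue_map.num_slots data dwords per vertex followed by one flags dword, is easy to sanity-check with a tiny stand-alone sketch. The num_slots value and the indices below are made up for illustration and are not taken from the driver.)

    /* Stand-alone sketch of the vertex_output layout: per vertex,
     * num_slots data items followed by one flags item.
     */
    #include <cstdio>

    static int vertex_data_offset(int vertex, int slot, int num_slots)
    {
       /* Data item 'slot' of vertex 'vertex'. */
       return vertex * (num_slots + 1) + slot;
    }

    static int vertex_flags_offset(int vertex, int num_slots)
    {
       /* The flags dword sits right after the data items of the vertex. */
       return vertex * (num_slots + 1) + num_slots;
    }

    int main()
    {
       const int num_slots = 8; /* hypothetical vue_map.num_slots */
       printf("vertex 2, slot 3 -> offset %d\n", vertex_data_offset(2, 3, num_slots));
       printf("vertex 2, flags  -> offset %d\n", vertex_flags_offset(2, num_slots));
       return 0;
    }
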
> +
> +void
> +gen6_gs_god::visit(ir_emit_vertex *)
> +{
> + this->current_annotation = "gen6 emit vertex";
> + /* Honor the max_vertices layout qualifier in the geometry shader by
> + * ignoring any vertices emitted after c->gp->program.VerticesOut.
> + */
> + unsigned num_output_vertices = c->gp->program.VerticesOut;
> + emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
> + BRW_CONDITIONAL_L));
> + emit(IF(BRW_PREDICATE_NORMAL));
> + {
> + /* Buffer all output slots for this vertex in vertex_output */
> + for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
> + int varying = prog_data->vue_map.slot_to_varying[slot];
> + if (varying != VARYING_SLOT_PSIZ) {
> + dst_reg dst(this->vertex_output);
> + dst.reladdr = ralloc(mem_ctx, src_reg);
> + memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> + emit_urb_slot(dst, varying);
> + } else {
> + /* The PSIZ slot can pack multiple varyings in different channels
> + * and emit_urb_slot() will produce a MOV instruction for each of
> + * them. Since we are writing to an array, that will translate to
> + * possibly multiple MOV instructions with an array destination and
> + * each will generate a scratch write with the same offset into
> + * scratch space (thus, each one overwriting the previous). This is
> + * not what we want. What we will do instead is emit PSIZ to a
> + * regular temporary register, then move that register into the
> + * array. This way we only have one instruction with an array
> + * destination and we only produce a single scratch write.
> + */
> + dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
> + emit_urb_slot(tmp, varying);
> + dst_reg dst(this->vertex_output);
> + dst.reladdr = ralloc(mem_ctx, src_reg);
> + memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> + vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
> + inst->force_writemask_all = true;
> + }
> +
> + emit(ADD(dst_reg(this->vertex_output_offset),
> + this->vertex_output_offset, 1u));
> + }
> +
> + /* Now buffer flags for this vertex */
> + dst_reg dst(this->vertex_output);
> + dst.reladdr = ralloc(mem_ctx, src_reg);
> + memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> + if (c->gp->program.OutputType == GL_POINTS) {
> + /* If we are outputting points, then every vertex has PrimStart and
> + * PrimEnd set.
> + */
> + emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
> + URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
> + emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
> + } else {
> + /* Otherwise, we can only set the PrimStart flag, which we have stored
> + * in the first_vertex register. We will have to wait until we execute
> + * EndPrimitive() or we end the thread to set the PrimEnd flag on a
> + * vertex.
> + */
> + emit(OR(dst, this->first_vertex,
> + (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
> + emit(MOV(dst_reg(this->first_vertex), 0u));
> + }
> + emit(ADD(dst_reg(this->vertex_output_offset),
> + this->vertex_output_offset, 1u));
> +
> + /* Update vertex count */
> + emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
> + }
> + emit(BRW_OPCODE_ENDIF);
> +}
> +
> +void
> +gen6_gs_god::visit(ir_end_primitive *)
> +{
> + this->current_annotation = "gen6 end primitive";
> + /* Calling EndPrimitive() is optional for point output. In this case we set
> + * the PrimEnd flag when we process EmitVertex().
> + */
> + if (c->gp->program.OutputType == GL_POINTS)
> + return;
> +
> + /* Otherwise we know that the last vertex we have processed was the last
> + * vertex in the primitive and we need to set its PrimEnd flag, so do this
> + * provided we have emitted at least one vertex (vertex_count != 0).
> + *
> + * Notice that we have already incremented vertex_count when we processed
> + * the last emit_vertex, so we need to take that into account in the
> + * comparison below (hence the num_output_vertices + 1).
> + */
> + unsigned num_output_vertices = c->gp->program.VerticesOut;
> + emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
> + BRW_CONDITIONAL_L));
> + vec4_instruction *inst = emit(CMP(dst_null_d(),
> + this->vertex_count, 0u,
> + BRW_CONDITIONAL_NEQ));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + emit(IF(BRW_PREDICATE_NORMAL));
> + {
> + /* vertex_output_offset is already pointing at the first entry of the
> + * next vertex. So subtract 1 to modify the flags for the previous
> + * vertex.
> + */
> + src_reg offset(this, glsl_type::uint_type);
> + emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));
> +
> + src_reg dst(this->vertex_output);
> + dst.reladdr = ralloc(mem_ctx, src_reg);
> + memcpy(dst.reladdr, &offset, sizeof(src_reg));
> +
> + emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
> + emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
> +
> + /* Set the first vertex flag to indicate that the next vertex will start
> + * a primitive.
> + */
> + emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
> + }
> + emit(BRW_OPCODE_ENDIF);
> +}
> +
> +void
> +gen6_gs_god::emit_urb_write_header(int mrf)
> +{
> + this->current_annotation = "gen6 urb header";
> + /* Compute offset of the flags for the current vertex in vertex_output and
> + * write them in dw2 of the message header.
> + *
> + * Notice that by the time that emit_thread_end() calls here
> + * vertex_output_offset should point to the first data item of the current
> + * vertex in vertex_output, thus we only need to add the number of output
> + * slots per vertex to that offset to obtain the flags data offset.
> + */
> + src_reg flags_offset(this, glsl_type::uint_type);
> + emit(ADD(dst_reg(flags_offset),
> + this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
> +
> + src_reg flags_data(this->vertex_output);
> + flags_data.reladdr = ralloc(mem_ctx, src_reg);
> + memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
> +
> + emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
> +}
> +
> +void
> +gen6_gs_god::emit_urb_write_opcode(bool complete, int base_mrf,
> + int last_mrf, int urb_offset)
> +{
> + vec4_instruction *inst = NULL;
> +
> + if (!complete) {
> + /* If the vertex is not complete we don't have to do anything special */
> + inst = emit(GS_OPCODE_URB_WRITE);
> + inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
> + } else {
> + /* Otherwise we always request to allocate a new VUE handle. If this is
> + * the last write before the EOT message and the new handle never gets
> + * used it will be dereferenced when we send the EOT message. This is
> + * necessary to avoid different setups for the EOT message (one for the
> + * case when there is no output and another for the case when there is)
> + * which would require to end the program with an IF/ELSE/ENDIF block,
> + * something we do not want.
> + */
> + inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
> + inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
> + inst->dst = dst_reg(MRF, base_mrf);
> + inst->src[0] = this->temp;
> + }
> +
> + inst->base_mrf = base_mrf;
> + /* URB data written (does not include the message header reg) must
> + * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
> + * section 5.4.3.2.2: URB_INTERLEAVED.
> + */
> + int mlen = last_mrf - base_mrf;
> + if ((mlen % 2) != 1)
> + mlen++;
> + inst->mlen = mlen;
> + inst->offset = urb_offset;
> +}
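
(In case the mlen rounding above looks odd at first glance: mlen counts the header register plus the data registers, and only the data portion has to be a multiple of two registers, i.e. 256 bits, so mlen itself must end up odd. A stand-alone sketch of just that arithmetic, with hypothetical MRF numbers:)

    /* Sketch of the URB write message length rule: one header register
     * plus a data payload padded to an even number of registers.
     */
    #include <cassert>

    static int urb_write_mlen(int base_mrf, int last_mrf)
    {
       int mlen = last_mrf - base_mrf;   /* header register + data registers */
       if ((mlen % 2) != 1)
          mlen++;                        /* pad the data to an even register count */
       return mlen;
    }

    int main()
    {
       assert(urb_write_mlen(1, 4) == 3); /* 2 data registers, no padding needed */
       assert(urb_write_mlen(1, 5) == 5); /* 3 data registers, padded to 4 */
       return 0;
    }
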
> +
> +void
> +gen6_gs_god::emit_thread_end()
> +{
> + /* Make sure the current primitive is ended: we know it is not ended when
> + * first_vertex is not zero. This is only relevant for outputs other than
> + * points because in the point case we set PrimEnd on all vertices.
> + */
> + if (c->gp->program.OutputType != GL_POINTS) {
> + emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
> + emit(IF(BRW_PREDICATE_NORMAL));
> + {
> + visit((ir_end_primitive *) NULL);
> + }
> + emit(BRW_OPCODE_ENDIF);
> + }
> +
> + /* Here we have to:
> + * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
> + * 2) Loop over all buffered vertex data and write it to corresponding
> + * URB entries.
> + * 3) Allocate new VUE handles for all vertices other than the first.
> + * 4) Send a final EOT message.
> + */
> +
> + /* MRF 0 is reserved for the debugger, so start with message header
> + * in MRF 1.
> + */
> + int base_mrf = 1;
> +
> + /* In the process of generating our URB write message contents, we
> + * may need to unspill a register or load from an array. Those
> + * reads would use MRFs 14-15.
> + */
> + int max_usable_mrf = 13;
> +
> + /* Issue the FF_SYNC message and obtain the initial VUE handle. */
> + emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
> + emit(IF(BRW_PREDICATE_NORMAL));
> + {
> + this->current_annotation = "gen6 thread end: ff_sync";
> +
> + vec4_instruction *inst;
> + if (c->prog_data.gen6_xfb_enabled) {
> + src_reg sol_temp(this, glsl_type::uvec4_type);
> + emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
> + dst_reg(this->svbi),
> + this->vertex_count,
> + this->prim_count,
> + sol_temp);
> + inst = emit(GS_OPCODE_FF_SYNC,
> + dst_reg(this->temp), this->prim_count, this->svbi);
> + } else {
> + inst = emit(GS_OPCODE_FF_SYNC,
> + dst_reg(this->temp), this->prim_count, src_reg(0u));
> + }
> + inst->base_mrf = base_mrf;
> +
> + /* Loop over all buffered vertices and emit URB write messages */
> + this->current_annotation = "gen6 thread end: urb writes init";
> + src_reg vertex(this, glsl_type::uint_type);
> + emit(MOV(dst_reg(vertex), 0u));
> + emit(MOV(dst_reg(this->vertex_output_offset), 0u));
> +
> + this->current_annotation = "gen6 thread end: urb writes";
> + emit(BRW_OPCODE_DO);
> + {
> + emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
> + inst = emit(BRW_OPCODE_BREAK);
> + inst->predicate = BRW_PREDICATE_NORMAL;
> +
> + /* First we prepare the message header */
> + emit_urb_write_header(base_mrf);
> +
> + /* Then add vertex data to the message in interleaved fashion */
> + int slot = 0;
> + bool complete = false;
> + do {
> + int mrf = base_mrf + 1;
> +
> + /* URB offset is in URB row increments, and each of our MRFs is half
> + * of one of those, since we're doing interleaved writes.
> + */
> + int urb_offset = slot / 2;
> +
> + for (; slot < prog_data->vue_map.num_slots; ++slot) {
> + int varying = prog_data->vue_map.slot_to_varying[slot];
> + current_annotation = output_reg_annotation[varying];
> +
> + /* Compute offset of this slot for the current vertex
> + * in vertex_output
> + */
> + src_reg data(this->vertex_output);
> + data.reladdr = ralloc(mem_ctx, src_reg);
> + memcpy(data.reladdr, &this->vertex_output_offset,
> + sizeof(src_reg));
> +
> + /* Copy this slot to the appropriate message register */
> + dst_reg reg = dst_reg(MRF, mrf);
> + reg.type = output_reg[varying].type;
> + data.type = reg.type;
> + vec4_instruction *inst = emit(MOV(reg, data));
> + inst->force_writemask_all = true;
> +
> + mrf++;
> + emit(ADD(dst_reg(this->vertex_output_offset),
> + this->vertex_output_offset, 1u));
> +
> + /* If this was max_usable_mrf, we can't fit anything more into
> + * this URB WRITE.
> + */
> + if (mrf > max_usable_mrf) {
> + slot++;
> + break;
> + }
> + }
> +
> + complete = slot >= prog_data->vue_map.num_slots;
> + emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
> + } while (!complete);
> +
> + /* Skip over the flags data item so that vertex_output_offset points
> + * to the first data item of the next vertex, so that we can start
> + * writing the next vertex.
> + */
> + emit(ADD(dst_reg(this->vertex_output_offset),
> + this->vertex_output_offset, 1u));
> +
> + emit(ADD(dst_reg(vertex), vertex, 1u));
> + }
> + emit(BRW_OPCODE_WHILE);
> +
> + if (c->prog_data.gen6_xfb_enabled)
> + xfb_write();
> + }
> + emit(BRW_OPCODE_ENDIF);
> +
> + /* Finally, emit EOT message.
> + *
> + * In gen6 we need to end the thread differently depending on whether we have
> + * emitted at least one vertex or not. In case we did, the EOT message must
> + * always include the COMPLETE flag or else the GPU hangs. If we have not
> + * produced any output we can't use the COMPLETE flag.
> + *
> + * However, this would lead us to end the program with an ENDIF opcode,
> + * which we want to avoid, so what we do is that we always request a new
> + * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
> + * With this we make sure that whether we have emitted at least one vertex
> + * or none at all, we have to finish the thread without writing to the URB,
> + * which works for both cases by setting the COMPLETE and UNUSED flags in
> + * the EOT message.
> + */
> + this->current_annotation = "gen6 thread end: EOT";
> +
> + if (c->prog_data.gen6_xfb_enabled) {
> + /* When emitting EOT, set SONumPrimsWritten Increment Value. */
> + src_reg data(this, glsl_type::uint_type);
> + emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
> + emit(SHL(dst_reg(data), data, src_reg(16u)));
> + emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
> + }
> +
> + vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
> + inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
> + inst->base_mrf = base_mrf;
> + inst->mlen = 1;
> +}
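
(The interleaved write loop above is the dense part of this file. The following stand-alone sketch only mimics its bookkeeping, i.e. two slots per URB row, the MRF window from base_mrf + 1 up to max_usable_mrf, and the split into multiple URB writes, using a made-up num_slots; the actual message emission is left out.)

    /* Bookkeeping sketch for the interleaved URB writes; illustrative only. */
    #include <cstdio>

    int main()
    {
       const int base_mrf = 1;        /* MRF 1 carries the message header */
       const int max_usable_mrf = 13; /* MRFs 14-15 are kept free for spill reads */
       const int num_slots = 20;      /* hypothetical vue_map.num_slots */

       int slot = 0;
       bool complete = false;
       do {
          int mrf = base_mrf + 1;
          int urb_offset = slot / 2;  /* URB offset is in row (2-slot) increments */
          for (; slot < num_slots; ++slot) {
             printf("slot %2d -> MRF %2d, URB row %d\n", slot, mrf, slot / 2);
             mrf++;
             if (mrf > max_usable_mrf) {
                slot++;
                break;
             }
          }
          complete = slot >= num_slots;
          printf("URB write starting at row %d with %d data MRFs\n",
                 urb_offset, mrf - (base_mrf + 1));
       } while (!complete);
       return 0;
    }
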
> +
> +void
> +gen6_gs_god::setup_payload()
> +{
> + int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
> +
> + /* Attributes are going to be interleaved, so one register contains two
> + * attribute slots.
> + */
> + int attributes_per_reg = 2;
> +
> + /* If a geometry shader tries to read from an input that wasn't written by
> + * the vertex shader, that produces undefined results, but it shouldn't
> + * crash anything. So initialize attribute_map to zeros--that ensures that
> + * these undefined results are read from r0.
> + */
> + memset(attribute_map, 0, sizeof(attribute_map));
> +
> + int reg = 0;
> +
> + /* The payload always contains important data in r0. */
> + reg++;
> +
> + /* r1 is always part of the payload and it holds information relevant
> + * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
> + * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
> + * information (and move the original value to a virtual register if
> + * necessary).
> + */
> + if (c->prog_data.include_primitive_id)
> + attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
> + reg++;
> +
> + reg = setup_uniforms(reg);
> +
> + reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
> +
> + lower_attributes_to_hw_regs(attribute_map, true);
> +
> + this->first_non_payload_grf = reg;
> +}
> +
> +void
> +gen6_gs_god::xfb_setup()
> +{
> + static const unsigned swizzle_for_offset[4] = {
> + BRW_SWIZZLE4(0, 1, 2, 3),
> + BRW_SWIZZLE4(1, 2, 3, 3),
> + BRW_SWIZZLE4(2, 3, 3, 3),
> + BRW_SWIZZLE4(3, 3, 3, 3)
> + };
> +
> + struct brw_gs_prog_data *prog_data =
> + (struct brw_gs_prog_data *) &c->prog_data;
> +
> + const struct gl_transform_feedback_info *linked_xfb_info =
> + &this->shader_prog->LinkedTransformFeedback;
> + int i;
> +
> + /* Make sure that the VUE slots won't overflow the unsigned chars in
> + * prog_data->transform_feedback_bindings[].
> + */
> + STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
> +
> + /* Make sure that we don't need more binding table entries than we've
> + * set aside for use in transform feedback. (We shouldn't, since we
> + * set aside enough binding table entries to have one per component).
> + */
> + assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
> +
> + prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
> + for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) {
> + prog_data->transform_feedback_bindings[i] =
> + linked_xfb_info->Outputs[i].OutputRegister;
> + prog_data->transform_feedback_swizzles[i] =
> + swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
> + }
> +}
> +
> +void
> +gen6_gs_god::xfb_write()
> +{
> + unsigned num_verts;
> + struct brw_gs_prog_data *prog_data =
> + (struct brw_gs_prog_data *) &c->prog_data;
> +
> + if (!prog_data->num_transform_feedback_bindings)
> + return;
> +
> + switch (c->prog_data.output_topology) {
> + case _3DPRIM_POINTLIST:
> + num_verts = 1;
> + break;
> + case _3DPRIM_LINELIST:
> + case _3DPRIM_LINESTRIP:
> + case _3DPRIM_LINELOOP:
> + num_verts = 2;
> + break;
> + case _3DPRIM_TRILIST:
> + case _3DPRIM_TRIFAN:
> + case _3DPRIM_TRISTRIP:
> + case _3DPRIM_RECTLIST:
> + num_verts = 3;
> + break;
> + case _3DPRIM_QUADLIST:
> + case _3DPRIM_QUADSTRIP:
> + case _3DPRIM_POLYGON:
> + num_verts = 3;
> + break;
> + default:
> + unreachable("Unexpected primitive type in Gen6 SOL program.");
> + }
> +
> + this->current_annotation = "gen6 thread end: svb writes init";
> +
> + emit(MOV(dst_reg(this->vertex_output_offset), 0u));
> + emit(MOV(dst_reg(this->sol_prim_written), 0u));
> +
> + /* Check that at least one primitive can be written
> + *
> + * Note: since we use the binding table to keep track of buffer offsets
> + * and stride, the GS doesn't need to keep track of a separate pointer
> + * into each buffer; it uses a single pointer which increments by 1 for
> + * each vertex. So we use SVBI0 for this pointer, regardless of whether
> + * transform feedback is in interleaved or separate attribs mode.
> + */
> + src_reg sol_temp(this, glsl_type::uvec4_type);
> + emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));
> +
> + /* Compare SVBI calculated number with the maximum value, which is
> + * in R1.4 (previously saved in this->max_svbi) for gen6.
> + */
> + emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
> + emit(IF(BRW_PREDICATE_NORMAL));
> + {
> + src_reg destination_indices_uw =
> + retype(destination_indices, BRW_REGISTER_TYPE_UW);
> +
> + vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
> + brw_imm_v(0x00020100))); /* (0, 1, 2) */
> + inst->force_writemask_all = true;
> +
> + emit(ADD(dst_reg(this->destination_indices),
> + this->destination_indices,
> + this->svbi));
> + }
> + emit(BRW_OPCODE_ENDIF);
> +
> + /* Write transform feedback data for all processed vertices. */
> + for (int i = 0; i < c->gp->program.VerticesOut; i++) {
> + emit(MOV(dst_reg(sol_temp), i));
> + emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
> + BRW_CONDITIONAL_L));
> + emit(IF(BRW_PREDICATE_NORMAL));
> + {
> + xfb_program(i, num_verts);
> + }
> + emit(BRW_OPCODE_ENDIF);
> + }
> +}
> +
> +void
> +gen6_gs_god::xfb_program(unsigned vertex, unsigned num_verts)
> +{
> + struct brw_gs_prog_data *prog_data =
> + (struct brw_gs_prog_data *) &c->prog_data;
> + unsigned binding;
> + unsigned num_bindings = prog_data->num_transform_feedback_bindings;
> + src_reg sol_temp(this, glsl_type::uvec4_type);
> +
> + /* Check for buffer overflow: we need room to write the complete primitive
> + * (all vertices). Otherwise, avoid writing any vertices for it.
> + */
> + emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
> + emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
> + emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
> + emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
> + emit(IF(BRW_PREDICATE_NORMAL));
> + {
> + /* Avoid overwriting MRF 1 as it is used as URB write message header */
> + dst_reg mrf_reg(MRF, 2);
> +
> + this->current_annotation = "gen6: emit SOL vertex data";
> + /* For each vertex, generate code to output each varying using the
> + * appropriate binding table entry.
> + */
> + for (binding = 0; binding < num_bindings; ++binding) {
> + unsigned char varying =
> + prog_data->transform_feedback_bindings[binding];
> +
> + /* Set up the correct destination index for this vertex */
> + vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
> + mrf_reg,
> + this->destination_indices);
> + inst->sol_vertex = vertex % num_verts;
> +
> + /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
> + *
> + * "Prior to End of Thread with a URB_WRITE, the kernel must
> + * ensure that all writes are complete by sending the final
> + * write as a committed write."
> + */
> + bool final_write = binding == (unsigned) num_bindings - 1 &&
> + inst->sol_vertex == num_verts - 1;
> +
> + /* Compute offset of this varying for the current vertex
> + * in vertex_output
> + */
> + this->current_annotation = output_reg_annotation[varying];
> + src_reg data(this->vertex_output);
> + data.reladdr = ralloc(mem_ctx, src_reg);
> + int offset = get_vertex_output_offset_for_varying(vertex, varying);
> + emit(MOV(dst_reg(this->vertex_output_offset), offset));
> + memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> + data.type = output_reg[varying].type;
> +
> + /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
> + * same slot, so make sure we write the appropriate channel
> + */
> + if (varying == VARYING_SLOT_PSIZ)
> + data.swizzle = BRW_SWIZZLE_WWWW;
> + else if (varying == VARYING_SLOT_LAYER)
> + data.swizzle = BRW_SWIZZLE_YYYY;
> + else if (varying == VARYING_SLOT_VIEWPORT)
> + data.swizzle = BRW_SWIZZLE_ZZZZ;
> + else
> + data.swizzle = prog_data->transform_feedback_swizzles[binding];
> +
> + /* Write data */
> + inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
> + inst->sol_binding = binding;
> + inst->sol_final_write = final_write;
> +
> + if (final_write) {
> + /* This is the last vertex of the primitive, so increment the
> + * SO primitive counter and the destination indices.
> + */
> + emit(ADD(dst_reg(this->destination_indices),
> + this->destination_indices,
> + src_reg(num_verts)));
> + emit(ADD(dst_reg(this->sol_prim_written),
> + this->sol_prim_written, 1u));
> + }
> +
> + }
> + this->current_annotation = NULL;
> + }
> + emit(BRW_OPCODE_ENDIF);
> +}
> +
> +int
> +gen6_gs_god::get_vertex_output_offset_for_varying(int vertex, int varying)
> +{
> + /* Find the output slot assigned to this varying.
> + *
> + * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
> + * as VARYING_SLOT_PSIZ.
> + */
> + if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
> + varying = VARYING_SLOT_PSIZ;
> + int slot = prog_data->vue_map.varying_to_slot[varying];
> +
> + if (slot < 0) {
> + /* This varying does not exist in the VUE so we are not writing to it
> + * and its value is undefined. We still want to return a valid offset
> + * into vertex_output though, to prevent any out-of-bound accesses into
> + * the vertex_output array. Since the value for this varying is undefined
> + * we don't really care for the value we assign to it, so any offset
> + * within the limits of vertex_output will do.
> + */
> + slot = 0;
> + }
> +
> + return vertex * (prog_data->vue_map.num_slots + 1) + slot;
> +}
> +
> +} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_god.h b/src/mesa/drivers/dri/i965/gen6_gs_god.h
> new file mode 100644
> index 0000000..f99f2cc
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/gen6_gs_god.h
> @@ -0,0 +1,82 @@
> +/*
> + * Copyright © 2014 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef GEN6_GS_VISITOR_H
> +#define GEN6_GS_VISITOR_H
> +
> +#include "brw_vec4.h"
> +#include "brw_vec4_gs_god.h"
> +
> +#ifdef __cplusplus
> +
> +namespace brw {
> +
> +class gen6_gs_god : public vec4_gs_god
> +{
> +public:
> + gen6_gs_god(struct brw_context *brw,
> + struct brw_gs_compile *c,
> + struct gl_shader_program *prog,
> + void *mem_ctx,
> + bool no_spills) :
> + vec4_gs_god(brw, c, prog, mem_ctx, no_spills) {}
> +
> +protected:
> + virtual void assign_binding_table_offsets();
> + virtual void emit_prolog();
> + virtual void emit_thread_end();
> + virtual void visit(ir_emit_vertex *);
> + virtual void visit(ir_end_primitive *);
> + virtual void emit_urb_write_header(int mrf);
> + virtual void emit_urb_write_opcode(bool complete,
> + int base_mrf,
> + int last_mrf,
> + int urb_offset);
> + virtual void setup_payload();
> +
> +private:
> + void xfb_write();
> + void xfb_program(unsigned vertex, unsigned num_verts);
> + void xfb_setup();
> + int get_vertex_output_offset_for_varying(int vertex, int varying);
> +
> + src_reg vertex_output;
> + src_reg vertex_output_offset;
> + src_reg temp;
> + src_reg first_vertex;
> + src_reg prim_count;
> + src_reg primitive_id;
> +
> + /* Transform Feedback members */
> + src_reg sol_prim_written;
> + src_reg svbi;
> + src_reg max_svbi;
> + src_reg destination_indices;
> +};
> +
> +} /* namespace brw */
> +
> +#endif /* __cplusplus */
> +
> +#endif /* GEN6_GS_VISITOR_H */
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
> deleted file mode 100644
> index 782687a..0000000
> --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
> +++ /dev/null
> @@ -1,776 +0,0 @@
> -/*
> - * Copyright © 2014 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> - * IN THE SOFTWARE.
> - *
> - * This code is based on original work by Ilia Mirkin.
> - */
> -
> -/**
> - * \file gen6_gs_visitor.cpp
> - *
> - * Gen6 geometry shader implementation
> - */
> -
> -#include "gen6_gs_visitor.h"
> -
> -const unsigned MAX_GS_INPUT_VERTICES = 6;
> -
> -namespace brw {
> -
> -void
> -gen6_gs_visitor::assign_binding_table_offsets()
> -{
> - /* In gen6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform
> - * feedback surfaces.
> - */
> - assign_common_binding_table_offsets(BRW_MAX_SOL_BINDINGS);
> -}
> -
> -void
> -gen6_gs_visitor::emit_prolog()
> -{
> - vec4_gs_visitor::emit_prolog();
> -
> - /* Gen6 geometry shaders require to allocate an initial VUE handle via
> - * FF_SYNC message, however the documentation remarks that only one thread
> - * can write to the URB simultaneously and the FF_SYNC message provides the
> - * synchronization mechanism for this, so using this message effectively
> - * stalls the thread until it is its turn to write to the URB. Because of
> - * this, the best way to implement geometry shader algorithms in gen6 is to
> - * execute the algorithm before the FF_SYNC message to maximize parallelism.
> - *
> - * To achieve this we buffer the geometry shader outputs for each emitted
> - * vertex in vertex_output during operation. Then, when we have processed
> - * the last vertex (that is, at thread end time), we send the FF_SYNC
> - * message to allocate the initial VUE handle and write all buffered vertex
> - * data to the URB in one go.
> - *
> - * For each emitted vertex, vertex_output will hold vue_map.num_slots
> - * data items plus one additional item to hold required flags
> - * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
> - * which come right after the data items for that vertex. Vertex data and
> - * flags for the next vertex come right after the data items and flags for
> - * the previous vertex.
> - */
> - this->current_annotation = "gen6 prolog";
> - this->vertex_output = src_reg(this,
> - glsl_type::uint_type,
> - (prog_data->vue_map.num_slots + 1) *
> - c->gp->program.VerticesOut);
> - this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
> - emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
> -
> - /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
> - * so initialize it once to R0.
> - */
> - vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
> - retype(brw_vec8_grf(0, 0),
> - BRW_REGISTER_TYPE_UD)));
> - inst->force_writemask_all = true;
> -
> - /* This will be used as a temporary to store writeback data of FF_SYNC
> - * and URB_WRITE messages.
> - */
> - this->temp = src_reg(this, glsl_type::uint_type);
> -
> - /* This will be used to know when we are processing the first vertex of
> - * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
> - * that we are processing the first vertex in the primitive and to zero
> - * otherwise. This way we can use its value directly in the URB write
> - * headers.
> - */
> - this->first_vertex = src_reg(this, glsl_type::uint_type);
> - emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
> -
> - /* The FF_SYNC message requires to know the number of primitives generated,
> - * so keep a counter for this.
> - */
> - this->prim_count = src_reg(this, glsl_type::uint_type);
> - emit(MOV(dst_reg(this->prim_count), 0u));
> -
> - if (c->prog_data.gen6_xfb_enabled) {
> - /* Create a virtual register to hold destination indices in SOL */
> - this->destination_indices = src_reg(this, glsl_type::uvec4_type);
> - /* Create a virtual register to hold number of written primitives */
> - this->sol_prim_written = src_reg(this, glsl_type::uint_type);
> - /* Create a virtual register to hold Streamed Vertex Buffer Indices */
> - this->svbi = src_reg(this, glsl_type::uvec4_type);
> - /* Create a virtual register to hold max values of SVBI */
> - this->max_svbi = src_reg(this, glsl_type::uvec4_type);
> - emit(MOV(dst_reg(this->max_svbi),
> - src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
> -
> - xfb_setup();
> - }
> -
> - /* PrimitveID is delivered in r0.1 of the thread payload. If the program
> - * needs it we have to move it to a separate register where we can map
> - * the atttribute.
> - *
> - * Notice that we cannot use a virtual register for this, because we need to
> - * map all input attributes to hardware registers in setup_payload(),
> - * which happens before virtual registers are mapped to hardware registers.
> - * We could work around that issue if we were able to compute the first
> - * non-payload register here and move the PrimitiveID information to that
> - * register, but we can't because at this point we don't know the final
> - * number uniforms that will be included in the payload.
> - *
> - * So, what we do is to place PrimitiveID information in r1, which is always
> - * delivered as part of the payload, but its only populated with data
> - * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
> - * in the 3DSTATE_GS state packet. That information can be obtained by other
> - * means though, so we can safely use r1 for this purpose.
> - */
> - if (c->prog_data.include_primitive_id) {
> - this->primitive_id =
> - src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
> - emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
> - }
> -}
> -
> -void
> -gen6_gs_visitor::visit(ir_emit_vertex *)
> -{
> - this->current_annotation = "gen6 emit vertex";
> - /* Honor max_vertex layout indication in geometry shader by ignoring any
> - * vertices coming after c->gp->program.VerticesOut.
> - */
> - unsigned num_output_vertices = c->gp->program.VerticesOut;
> - emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
> - BRW_CONDITIONAL_L));
> - emit(IF(BRW_PREDICATE_NORMAL));
> - {
> - /* Buffer all output slots for this vertex in vertex_output */
> - for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
> - int varying = prog_data->vue_map.slot_to_varying[slot];
> - if (varying != VARYING_SLOT_PSIZ) {
> - dst_reg dst(this->vertex_output);
> - dst.reladdr = ralloc(mem_ctx, src_reg);
> - memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> - emit_urb_slot(dst, varying);
> - } else {
> - /* The PSIZ slot can pack multiple varyings in different channels
> - * and emit_urb_slot() will produce a MOV instruction for each of
> - * them. Since we are writing to an array, that will translate to
> - * possibly multiple MOV instructions with an array destination and
> - * each will generate a scratch write with the same offset into
> - * scratch space (thus, each one overwriting the previous). This is
> - * not what we want. What we will do instead is emit PSIZ to a
> - * a regular temporary register, then move that resgister into the
> - * array. This way we only have one instruction with an array
> - * destination and we only produce a single scratch write.
> - */
> - dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
> - emit_urb_slot(tmp, varying);
> - dst_reg dst(this->vertex_output);
> - dst.reladdr = ralloc(mem_ctx, src_reg);
> - memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> - vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
> - inst->force_writemask_all = true;
> - }
> -
> - emit(ADD(dst_reg(this->vertex_output_offset),
> - this->vertex_output_offset, 1u));
> - }
> -
> - /* Now buffer flags for this vertex */
> - dst_reg dst(this->vertex_output);
> - dst.reladdr = ralloc(mem_ctx, src_reg);
> - memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> - if (c->gp->program.OutputType == GL_POINTS) {
> - /* If we are outputting points, then every vertex has PrimStart and
> - * PrimEnd set.
> - */
> - emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
> - URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
> - emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
> - } else {
> - /* Otherwise, we can only set the PrimStart flag, which we have stored
> - * in the first_vertex register. We will have to wait until we execute
> - * EndPrimitive() or we end the thread to set the PrimEnd flag on a
> - * vertex.
> - */
> - emit(OR(dst, this->first_vertex,
> - (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
> - emit(MOV(dst_reg(this->first_vertex), 0u));
> - }
> - emit(ADD(dst_reg(this->vertex_output_offset),
> - this->vertex_output_offset, 1u));
> -
> - /* Update vertex count */
> - emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
> - }
> - emit(BRW_OPCODE_ENDIF);
> -}
> -
> -void
> -gen6_gs_visitor::visit(ir_end_primitive *)
> -{
> - this->current_annotation = "gen6 end primitive";
> - /* Calling EndPrimitive() is optional for point output. In this case we set
> - * the PrimEnd flag when we process EmitVertex().
> - */
> - if (c->gp->program.OutputType == GL_POINTS)
> - return;
> -
> - /* Otherwise we know that the last vertex we have processed was the last
> - * vertex in the primitive and we need to set its PrimEnd flag, so do this
> - * unless we haven't emitted that vertex at all (vertex_count != 0).
> - *
> - * Notice that we have already incremented vertex_count when we processed
> - * the last emit_vertex, so we need to take that into account in the
> - * comparison below (hence the num_output_vertices + 1 in the comparison
> - * below).
> - */
> - unsigned num_output_vertices = c->gp->program.VerticesOut;
> - emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
> - BRW_CONDITIONAL_L));
> - vec4_instruction *inst = emit(CMP(dst_null_d(),
> - this->vertex_count, 0u,
> - BRW_CONDITIONAL_NEQ));
> - inst->predicate = BRW_PREDICATE_NORMAL;
> - emit(IF(BRW_PREDICATE_NORMAL));
> - {
> - /* vertex_output_offset is already pointing at the first entry of the
> - * next vertex. So subtract 1 to modify the flags for the previous
> - * vertex.
> - */
> - src_reg offset(this, glsl_type::uint_type);
> - emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));
> -
> - src_reg dst(this->vertex_output);
> - dst.reladdr = ralloc(mem_ctx, src_reg);
> - memcpy(dst.reladdr, &offset, sizeof(src_reg));
> -
> - emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
> - emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
> -
> - /* Set the first vertex flag to indicate that the next vertex will start
> - * a primitive.
> - */
> - emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
> - }
> - emit(BRW_OPCODE_ENDIF);
> -}
> -
> -void
> -gen6_gs_visitor::emit_urb_write_header(int mrf)
> -{
> - this->current_annotation = "gen6 urb header";
> - /* Compute offset of the flags for the current vertex in vertex_output and
> - * write them in dw2 of the message header.
> - *
> - * Notice that by the time that emit_thread_end() calls here
> - * vertex_output_offset should point to the first data item of the current
> - * vertex in vertex_output, thus we only need to add the number of output
> - * slots per vertex to that offset to obtain the flags data offset.
> - */
> - src_reg flags_offset(this, glsl_type::uint_type);
> - emit(ADD(dst_reg(flags_offset),
> - this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
> -
> - src_reg flags_data(this->vertex_output);
> - flags_data.reladdr = ralloc(mem_ctx, src_reg);
> - memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
> -
> - emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
> -}
> -
> -void
> -gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
> - int last_mrf, int urb_offset)
> -{
> - vec4_instruction *inst = NULL;
> -
> - if (!complete) {
> - /* If the vertex is not complete we don't have to do anything special */
> - inst = emit(GS_OPCODE_URB_WRITE);
> - inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
> - } else {
> - /* Otherwise we always request to allocate a new VUE handle. If this is
> - * the last write before the EOT message and the new handle never gets
> - * used it will be dereferenced when we send the EOT message. This is
> - * necessary to avoid different setups for the EOT message (one for the
> - * case when there is no output and another for the case when there is)
> - * which would require to end the program with an IF/ELSE/ENDIF block,
> - * something we do not want.
> - */
> - inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
> - inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
> - inst->dst = dst_reg(MRF, base_mrf);
> - inst->src[0] = this->temp;
> - }
> -
> - inst->base_mrf = base_mrf;
> - /* URB data written (does not include the message header reg) must
> - * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
> - * section 5.4.3.2.2: URB_INTERLEAVED.
> - */
> - int mlen = last_mrf - base_mrf;
> - if ((mlen % 2) != 1)
> - mlen++;
> - inst->mlen = mlen;
> - inst->offset = urb_offset;
> -}
> -
> -void
> -gen6_gs_visitor::emit_thread_end()
> -{
> - /* Make sure the current primitive is ended: we know it is not ended when
> - * first_vertex is not zero. This is only relevant for outputs other than
> - * points because in the point case we set PrimEnd on all vertices.
> - */
> - if (c->gp->program.OutputType != GL_POINTS) {
> - emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
> - emit(IF(BRW_PREDICATE_NORMAL));
> - {
> - visit((ir_end_primitive *) NULL);
> - }
> - emit(BRW_OPCODE_ENDIF);
> - }
> -
> - /* Here we have to:
> - * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
> - * 2) Loop over all buffered vertex data and write it to corresponding
> - * URB entries.
> - * 3) Allocate new VUE handles for all vertices other than the first.
> - * 4) Send a final EOT message.
> - */
> -
> - /* MRF 0 is reserved for the debugger, so start with message header
> - * in MRF 1.
> - */
> - int base_mrf = 1;
> -
> - /* In the process of generating our URB write message contents, we
> - * may need to unspill a register or load from an array. Those
> - * reads would use MRFs 14-15.
> - */
> - int max_usable_mrf = 13;
> -
> - /* Issue the FF_SYNC message and obtain the initial VUE handle. */
> - emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
> - emit(IF(BRW_PREDICATE_NORMAL));
> - {
> - this->current_annotation = "gen6 thread end: ff_sync";
> -
> - vec4_instruction *inst;
> - if (c->prog_data.gen6_xfb_enabled) {
> - src_reg sol_temp(this, glsl_type::uvec4_type);
> - emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
> - dst_reg(this->svbi),
> - this->vertex_count,
> - this->prim_count,
> - sol_temp);
> - inst = emit(GS_OPCODE_FF_SYNC,
> - dst_reg(this->temp), this->prim_count, this->svbi);
> - } else {
> - inst = emit(GS_OPCODE_FF_SYNC,
> - dst_reg(this->temp), this->prim_count, src_reg(0u));
> - }
> - inst->base_mrf = base_mrf;
> -
> - /* Loop over all buffered vertices and emit URB write messages */
> - this->current_annotation = "gen6 thread end: urb writes init";
> - src_reg vertex(this, glsl_type::uint_type);
> - emit(MOV(dst_reg(vertex), 0u));
> - emit(MOV(dst_reg(this->vertex_output_offset), 0u));
> -
> - this->current_annotation = "gen6 thread end: urb writes";
> - emit(BRW_OPCODE_DO);
> - {
> - emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
> - inst = emit(BRW_OPCODE_BREAK);
> - inst->predicate = BRW_PREDICATE_NORMAL;
> -
> - /* First we prepare the message header */
> - emit_urb_write_header(base_mrf);
> -
> - /* Then add vertex data to the message in interleaved fashion */
> - int slot = 0;
> - bool complete = false;
> - do {
> - int mrf = base_mrf + 1;
> -
> - /* URB offset is in URB row increments, and each of our MRFs is half
> - * of one of those, since we're doing interleaved writes.
> - */
> - int urb_offset = slot / 2;
> -
> - for (; slot < prog_data->vue_map.num_slots; ++slot) {
> - int varying = prog_data->vue_map.slot_to_varying[slot];
> - current_annotation = output_reg_annotation[varying];
> -
> - /* Compute offset of this slot for the current vertex
> - * in vertex_output
> - */
> - src_reg data(this->vertex_output);
> - data.reladdr = ralloc(mem_ctx, src_reg);
> - memcpy(data.reladdr, &this->vertex_output_offset,
> - sizeof(src_reg));
> -
> - /* Copy this slot to the appropriate message register */
> - dst_reg reg = dst_reg(MRF, mrf);
> - reg.type = output_reg[varying].type;
> - data.type = reg.type;
> - vec4_instruction *inst = emit(MOV(reg, data));
> - inst->force_writemask_all = true;
> -
> - mrf++;
> - emit(ADD(dst_reg(this->vertex_output_offset),
> - this->vertex_output_offset, 1u));
> -
> - /* If this was max_usable_mrf, we can't fit anything more into
> - * this URB WRITE.
> - */
> - if (mrf > max_usable_mrf) {
> - slot++;
> - break;
> - }
> - }
> -
> - complete = slot >= prog_data->vue_map.num_slots;
> - emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
> - } while (!complete);
> -
> - /* Skip over the flags data item so that vertex_output_offset points
> - * to the first data item of the next vertex, so that we can start
> - * writing the next vertex.
> - */
> - emit(ADD(dst_reg(this->vertex_output_offset),
> - this->vertex_output_offset, 1u));
> -
> - emit(ADD(dst_reg(vertex), vertex, 1u));
> - }
> - emit(BRW_OPCODE_WHILE);
> -
> - if (c->prog_data.gen6_xfb_enabled)
> - xfb_write();
> - }
> - emit(BRW_OPCODE_ENDIF);
> -
> - /* Finally, emit EOT message.
> - *
> - * In gen6 we need to end the thread differently depending on whether we have
> - * emitted at least one vertex or not. In case we did, the EOT message must
> - * always include the COMPLETE flag or else the GPU hangs. If we have not
> - * produced any output we can't use the COMPLETE flag.
> - *
> - * However, this would lead us to end the program with an ENDIF opcode,
> - * which we want to avoid, so what we do is that we always request a new
> - * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
> - * With this we make sure that whether we have emitted at least one vertex
> - * or none at all, we have to finish the thread without writing to the URB,
> - * which works for both cases by setting the COMPLETE and UNUSED flags in
> - * the EOT message.
> - */
> - this->current_annotation = "gen6 thread end: EOT";
> -
> - if (c->prog_data.gen6_xfb_enabled) {
> - /* When emitting EOT, set SONumPrimsWritten Increment Value. */
> - src_reg data(this, glsl_type::uint_type);
> - emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
> - emit(SHL(dst_reg(data), data, src_reg(16u)));
> - emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
> - }
> -
> - vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
> - inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
> - inst->base_mrf = base_mrf;
> - inst->mlen = 1;
> -}
> -
> -void
> -gen6_gs_visitor::setup_payload()
> -{
> - int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
> -
> - /* Attributes are going to be interleaved, so one register contains two
> - * attribute slots.
> - */
> - int attributes_per_reg = 2;
> -
> - /* If a geometry shader tries to read from an input that wasn't written by
> - * the vertex shader, that produces undefined results, but it shouldn't
> - * crash anything. So initialize attribute_map to zeros--that ensures that
> - * these undefined results are read from r0.
> - */
> - memset(attribute_map, 0, sizeof(attribute_map));
> -
> - int reg = 0;
> -
> - /* The payload always contains important data in r0. */
> - reg++;
> -
> - /* r1 is always part of the payload and it holds information relevant
> - * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
> - * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
> - * information (and move the original value to a virtual register if
> - * necessary).
> - */
> - if (c->prog_data.include_primitive_id)
> - attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
> - reg++;
> -
> - reg = setup_uniforms(reg);
> -
> - reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
> -
> - lower_attributes_to_hw_regs(attribute_map, true);
> -
> - this->first_non_payload_grf = reg;
> -}
> -
> -void
> -gen6_gs_visitor::xfb_setup()
> -{
> - static const unsigned swizzle_for_offset[4] = {
> - BRW_SWIZZLE4(0, 1, 2, 3),
> - BRW_SWIZZLE4(1, 2, 3, 3),
> - BRW_SWIZZLE4(2, 3, 3, 3),
> - BRW_SWIZZLE4(3, 3, 3, 3)
> - };
> -
> - struct brw_gs_prog_data *prog_data =
> - (struct brw_gs_prog_data *) &c->prog_data;
> -
> - const struct gl_transform_feedback_info *linked_xfb_info =
> - &this->shader_prog->LinkedTransformFeedback;
> - int i;
> -
> - /* Make sure that the VUE slots won't overflow the unsigned chars in
> - * prog_data->transform_feedback_bindings[].
> - */
> - STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
> -
> - /* Make sure that we don't need more binding table entries than we've
> - * set aside for use in transform feedback. (We shouldn't, since we
> - * set aside enough binding table entries to have one per component).
> - */
> - assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
> -
> - prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
> - for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) {
> - prog_data->transform_feedback_bindings[i] =
> - linked_xfb_info->Outputs[i].OutputRegister;
> - prog_data->transform_feedback_swizzles[i] =
> - swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
> - }
> -}
> -
> -void
> -gen6_gs_visitor::xfb_write()
> -{
> - unsigned num_verts;
> - struct brw_gs_prog_data *prog_data =
> - (struct brw_gs_prog_data *) &c->prog_data;
> -
> - if (!prog_data->num_transform_feedback_bindings)
> - return;
> -
> - switch (c->prog_data.output_topology) {
> - case _3DPRIM_POINTLIST:
> - num_verts = 1;
> - break;
> - case _3DPRIM_LINELIST:
> - case _3DPRIM_LINESTRIP:
> - case _3DPRIM_LINELOOP:
> - num_verts = 2;
> - break;
> - case _3DPRIM_TRILIST:
> - case _3DPRIM_TRIFAN:
> - case _3DPRIM_TRISTRIP:
> - case _3DPRIM_RECTLIST:
> - num_verts = 3;
> - break;
> - case _3DPRIM_QUADLIST:
> - case _3DPRIM_QUADSTRIP:
> - case _3DPRIM_POLYGON:
> - num_verts = 3;
> - break;
> - default:
> - unreachable("Unexpected primitive type in Gen6 SOL program.");
> - }
> -
> - this->current_annotation = "gen6 thread end: svb writes init";
> -
> - emit(MOV(dst_reg(this->vertex_output_offset), 0u));
> - emit(MOV(dst_reg(this->sol_prim_written), 0u));
> -
> - /* Check that at least one primitive can be written
> - *
> - * Note: since we use the binding table to keep track of buffer offsets
> - * and stride, the GS doesn't need to keep track of a separate pointer
> - * into each buffer; it uses a single pointer which increments by 1 for
> - * each vertex. So we use SVBI0 for this pointer, regardless of whether
> - * transform feedback is in interleaved or separate attribs mode.
> - */
> - src_reg sol_temp(this, glsl_type::uvec4_type);
> - emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));
> -
> - /* Compare SVBI calculated number with the maximum value, which is
> - * in R1.4 (previously saved in this->max_svbi) for gen6.
> - */
> - emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
> - emit(IF(BRW_PREDICATE_NORMAL));
> - {
> - src_reg destination_indices_uw =
> - retype(destination_indices, BRW_REGISTER_TYPE_UW);
> -
> - vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
> - brw_imm_v(0x00020100))); /* (0, 1, 2) */
> - inst->force_writemask_all = true;
> -
> - emit(ADD(dst_reg(this->destination_indices),
> - this->destination_indices,
> - this->svbi));
> - }
> - emit(BRW_OPCODE_ENDIF);
> -
> - /* Write transform feedback data for all processed vertices. */
> - for (int i = 0; i < c->gp->program.VerticesOut; i++) {
> - emit(MOV(dst_reg(sol_temp), i));
> - emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
> - BRW_CONDITIONAL_L));
> - emit(IF(BRW_PREDICATE_NORMAL));
> - {
> - xfb_program(i, num_verts);
> - }
> - emit(BRW_OPCODE_ENDIF);
> - }
> -}
> -
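
xfb_write() only emits vec4 IR; nothing here runs on the CPU. As a rough
CPU-side model of the control flow the generated code implements (all names
below are hypothetical, and the per-binding SVB writes are elided):

  /* Hypothetical CPU-side model of the emitted streamed-output loop.  SVBI 0
   * is the single streamed-vertex-buffer index shared by all buffers. */
  struct xfb_state {
     unsigned svbi;           /* current streamed vertex buffer index (SVBI 0) */
     unsigned max_svbi;       /* largest index that still fits in the buffers */
     unsigned vertex_count;   /* vertices actually emitted by this GS thread */
     unsigned prims_written;  /* primitives streamed out so far */
  };

  static void xfb_write_model(struct xfb_state *s, unsigned max_gs_verts,
                              unsigned num_verts /* verts per output prim */)
  {
     unsigned dst_index = s->svbi;   /* destination_indices = (0,1,2) + SVBI */

     for (unsigned vertex = 0; vertex < max_gs_verts; vertex++) {
        if (vertex >= s->vertex_count)
           continue;   /* this vertex was never emitted */

        /* Skip the write unless the complete primitive still fits. */
        if ((s->prims_written + 1) * num_verts + s->svbi > s->max_svbi)
           continue;

        /* ...per-binding SVB writes for this vertex go here... */

        if (vertex % num_verts == num_verts - 1) {
           /* Last vertex of the primitive: advance the counters. */
           dst_index += num_verts;
           s->prims_written++;
        }
     }
     (void) dst_index;
  }
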
> -void
> -gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
> -{
> - struct brw_gs_prog_data *prog_data =
> - (struct brw_gs_prog_data *) &c->prog_data;
> - unsigned binding;
> - unsigned num_bindings = prog_data->num_transform_feedback_bindings;
> - src_reg sol_temp(this, glsl_type::uvec4_type);
> -
> - /* Check for buffer overflow: we need room to write the complete primitive
> - * (all vertices); otherwise, avoid writing any vertices for it.
> - */
> - emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
> - emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
> - emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
> - emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
> - emit(IF(BRW_PREDICATE_NORMAL));
> - {
> - /* Avoid overwriting MRF 1 as it is used as URB write message header */
> - dst_reg mrf_reg(MRF, 2);
> -
> - this->current_annotation = "gen6: emit SOL vertex data";
> - /* For each vertex, generate code to output each varying using the
> - * appropriate binding table entry.
> - */
> - for (binding = 0; binding < num_bindings; ++binding) {
> - unsigned char varying =
> - prog_data->transform_feedback_bindings[binding];
> -
> - /* Set up the correct destination index for this vertex */
> - vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
> - mrf_reg,
> - this->destination_indices);
> - inst->sol_vertex = vertex % num_verts;
> -
> - /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
> - *
> - * "Prior to End of Thread with a URB_WRITE, the kernel must
> - * ensure that all writes are complete by sending the final
> - * write as a committed write."
> - */
> - bool final_write = binding == (unsigned) num_bindings - 1 &&
> - inst->sol_vertex == num_verts - 1;
> -
> - /* Compute offset of this varying for the current vertex
> - * in vertex_output
> - */
> - this->current_annotation = output_reg_annotation[varying];
> - src_reg data(this->vertex_output);
> - data.reladdr = ralloc(mem_ctx, src_reg);
> - int offset = get_vertex_output_offset_for_varying(vertex, varying);
> - emit(MOV(dst_reg(this->vertex_output_offset), offset));
> - memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> - data.type = output_reg[varying].type;
> -
> - /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
> - * same slot, so make sure we write the appropriate channel
> - */
> - if (varying == VARYING_SLOT_PSIZ)
> - data.swizzle = BRW_SWIZZLE_WWWW;
> - else if (varying == VARYING_SLOT_LAYER)
> - data.swizzle = BRW_SWIZZLE_YYYY;
> - else if (varying == VARYING_SLOT_VIEWPORT)
> - data.swizzle = BRW_SWIZZLE_ZZZZ;
> - else
> - data.swizzle = prog_data->transform_feedback_swizzles[binding];
> -
> - /* Write data */
> - inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
> - inst->sol_binding = binding;
> - inst->sol_final_write = final_write;
> -
> - if (final_write) {
> - /* This is the last vertex of the primitive, so increment the
> - * SO primitive counter and the destination indices.
> - */
> - emit(ADD(dst_reg(this->destination_indices),
> - this->destination_indices,
> - src_reg(num_verts)));
> - emit(ADD(dst_reg(this->sol_prim_written),
> - this->sol_prim_written, 1u));
> - }
> -
> - }
> - this->current_annotation = NULL;
> - }
> - emit(BRW_OPCODE_ENDIF);
> -}
> -
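
One detail worth spelling out from the PRM quote above: only the very last SVB
write the thread issues for a primitive -- the last binding of its last vertex
-- is sent as a committed write. A throwaway illustration of the final_write
computation (the counts are made up):

  #include <stdbool.h>
  #include <stdio.h>

  int main(void)
  {
     const unsigned num_verts = 3;     /* e.g. triangles */
     const unsigned num_bindings = 2;  /* two varyings streamed out */

     for (unsigned vertex = 0; vertex < num_verts; vertex++) {
        for (unsigned binding = 0; binding < num_bindings; binding++) {
           /* Same condition as in xfb_program() above. */
           bool final_write = binding == num_bindings - 1 &&
                              vertex % num_verts == num_verts - 1;
           printf("vertex %u binding %u: %s\n", vertex, binding,
                  final_write ? "committed write" : "normal write");
        }
     }
     return 0;
  }
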
> -int
> -gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
> -{
> - /* Find the output slot assigned to this varying.
> - *
> - * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
> - * as VARYING_SLOT_PSIZ.
> - */
> - if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
> - varying = VARYING_SLOT_PSIZ;
> - int slot = prog_data->vue_map.varying_to_slot[varying];
> -
> - if (slot < 0) {
> - /* This varying does not exist in the VUE so we are not writing to it
> - * and its value is undefined. We still want to return a valid offset
> - * into vertex_output though, to prevent any out-of-bound accesses into
> - * the vertex_output array. Since the value for this varying is undefined
> - * we don't really care for the value we assign to it, so any offset
> - * within the limits of vertex_output will do.
> - */
> - slot = 0;
> - }
> -
> - return vertex * (prog_data->vue_map.num_slots + 1) + slot;
> -}
> -
> -} /* namespace brw */
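
The addressing above is just vertex * (num_slots + 1) + slot, with LAYER and
VIEWPORT aliased onto PSIZ's slot and missing varyings clamped to slot 0 so the
reladdr access stays in bounds. A tiny sketch with an illustrative slot count
(not the real VUE map; the extra slot per vertex is reserved for other
per-vertex data elsewhere in this file):

  #include <stdio.h>

  #define NUM_SLOTS 4   /* hypothetical number of VUE slots per vertex */

  static int vertex_output_offset(int vertex, int slot)
  {
     if (slot < 0)
        slot = 0;   /* varying not in the VUE: any in-bounds offset will do */
     return vertex * (NUM_SLOTS + 1) + slot;
  }

  int main(void)
  {
     /* Vertex 2, slot 3 -> 2 * (4 + 1) + 3 = 13. */
     printf("%d\n", vertex_output_offset(2, 3));
     return 0;
  }
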
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
> deleted file mode 100644
> index 28f23c9..0000000
> --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
> +++ /dev/null
> @@ -1,82 +0,0 @@
> -/*
> - * Copyright © 2014 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> - * IN THE SOFTWARE.
> - *
> - */
> -
> -#ifndef GEN6_GS_VISITOR_H
> -#define GEN6_GS_VISITOR_H
> -
> -#include "brw_vec4.h"
> -#include "brw_vec4_gs_visitor.h"
> -
> -#ifdef __cplusplus
> -
> -namespace brw {
> -
> -class gen6_gs_visitor : public vec4_gs_visitor
> -{
> -public:
> - gen6_gs_visitor(struct brw_context *brw,
> - struct brw_gs_compile *c,
> - struct gl_shader_program *prog,
> - void *mem_ctx,
> - bool no_spills) :
> - vec4_gs_visitor(brw, c, prog, mem_ctx, no_spills) {}
> -
> -protected:
> - virtual void assign_binding_table_offsets();
> - virtual void emit_prolog();
> - virtual void emit_thread_end();
> - virtual void visit(ir_emit_vertex *);
> - virtual void visit(ir_end_primitive *);
> - virtual void emit_urb_write_header(int mrf);
> - virtual void emit_urb_write_opcode(bool complete,
> - int base_mrf,
> - int last_mrf,
> - int urb_offset);
> - virtual void setup_payload();
> -
> -private:
> - void xfb_write();
> - void xfb_program(unsigned vertex, unsigned num_verts);
> - void xfb_setup();
> - int get_vertex_output_offset_for_varying(int vertex, int varying);
> -
> - src_reg vertex_output;
> - src_reg vertex_output_offset;
> - src_reg temp;
> - src_reg first_vertex;
> - src_reg prim_count;
> - src_reg primitive_id;
> -
> - /* Transform Feedback members */
> - src_reg sol_prim_written;
> - src_reg svbi;
> - src_reg max_svbi;
> - src_reg destination_indices;
> -};
> -
> -} /* namespace brw */
> -
> -#endif /* __cplusplus */
> -
> -#endif /* GEN6_GS_VISITOR_H */
> diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
> index ed8744d..239d225 100644
> --- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
> @@ -35,16 +35,16 @@ public:
> struct brw_wm_prog_data *prog_data;
> struct gl_shader_program *shader_prog;
> struct brw_fragment_program *fp;
> - fs_visitor *v;
> + fs_god *v;
> };
>
> -class cmod_propagation_fs_visitor : public fs_visitor
> +class cmod_propagation_fs_god : public fs_god
> {
> public:
> - cmod_propagation_fs_visitor(struct brw_context *brw,
> + cmod_propagation_fs_god(struct brw_context *brw,
> struct brw_wm_prog_data *prog_data,
> struct gl_shader_program *shader_prog)
> - : fs_visitor(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
> + : fs_god(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
> };
>
>
> @@ -57,7 +57,7 @@ void cmod_propagation_test::SetUp()
> prog_data = ralloc(NULL, struct brw_wm_prog_data);
> shader_prog = ralloc(NULL, struct gl_shader_program);
>
> - v = new cmod_propagation_fs_visitor(brw, prog_data, shader_prog);
> + v = new cmod_propagation_fs_god(brw, prog_data, shader_prog);
>
> _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0);
>
> @@ -75,7 +75,7 @@ instruction(bblock_t *block, int num)
> }
>
> static bool
> -cmod_propagation(fs_visitor *v)
> +cmod_propagation(fs_god *v)
> {
> const bool print = false;
>
> diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
> index 6f762bc..7ad0bd2 100644
> --- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
> @@ -35,16 +35,16 @@ public:
> struct brw_wm_prog_data *prog_data;
> struct gl_shader_program *shader_prog;
> struct brw_fragment_program *fp;
> - fs_visitor *v;
> + fs_god *v;
> };
>
> -class saturate_propagation_fs_visitor : public fs_visitor
> +class saturate_propagation_fs_god : public fs_god
> {
> public:
> - saturate_propagation_fs_visitor(struct brw_context *brw,
> + saturate_propagation_fs_god(struct brw_context *brw,
> struct brw_wm_prog_data *prog_data,
> struct gl_shader_program *shader_prog)
> - : fs_visitor(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
> + : fs_god(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
> };
>
>
> @@ -57,7 +57,7 @@ void saturate_propagation_test::SetUp()
> prog_data = ralloc(NULL, struct brw_wm_prog_data);
> shader_prog = ralloc(NULL, struct gl_shader_program);
>
> - v = new saturate_propagation_fs_visitor(brw, prog_data, shader_prog);
> + v = new saturate_propagation_fs_god(brw, prog_data, shader_prog);
>
> _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0);
>
> @@ -75,7 +75,7 @@ instruction(bblock_t *block, int num)
> }
>
> static bool
> -saturate_propagation(fs_visitor *v)
> +saturate_propagation(fs_god *v)
> {
> const bool print = false;
>
> diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
> index f9e4ce1..4913c30 100644
> --- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
> @@ -37,15 +37,15 @@ public:
> struct gl_context *ctx;
> struct gl_shader_program *shader_prog;
> struct brw_vertex_program *vp;
> - vec4_visitor *v;
> + vec4_god *v;
> };
>
> -class copy_propagation_vec4_visitor : public vec4_visitor
> +class copy_propagation_vec4_god : public vec4_god
> {
> public:
> - copy_propagation_vec4_visitor(struct brw_context *brw,
> + copy_propagation_vec4_god(struct brw_context *brw,
> struct gl_shader_program *shader_prog)
> - : vec4_visitor(brw, NULL, NULL, NULL, NULL, shader_prog,
> + : vec4_god(brw, NULL, NULL, NULL, NULL, shader_prog,
> MESA_SHADER_VERTEX, NULL,
> false /* no_spills */,
> ST_NONE, ST_NONE, ST_NONE)
> @@ -99,7 +99,7 @@ void copy_propagation_test::SetUp()
>
> shader_prog = ralloc(NULL, struct gl_shader_program);
>
> - v = new copy_propagation_vec4_visitor(brw, shader_prog);
> + v = new copy_propagation_vec4_god(brw, shader_prog);
>
> _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0);
>
> @@ -107,7 +107,7 @@ void copy_propagation_test::SetUp()
> }
>
> static void
> -copy_propagation(vec4_visitor *v)
> +copy_propagation(vec4_god *v)
> {
> bool print = false;
>
> diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
> index 0c27162..bab3532 100644
> --- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
> +++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
> @@ -39,16 +39,16 @@ public:
> struct gl_context *ctx;
> struct gl_shader_program *shader_prog;
> struct brw_vertex_program *vp;
> - vec4_visitor *v;
> + vec4_god *v;
> };
>
>
> -class register_coalesce_vec4_visitor : public vec4_visitor
> +class register_coalesce_vec4_god : public vec4_god
> {
> public:
> - register_coalesce_vec4_visitor(struct brw_context *brw,
> + register_coalesce_vec4_god(struct brw_context *brw,
> struct gl_shader_program *shader_prog)
> - : vec4_visitor(brw, NULL, NULL, NULL, NULL, shader_prog,
> + : vec4_god(brw, NULL, NULL, NULL, NULL, shader_prog,
> MESA_SHADER_VERTEX, NULL,
> false /* no_spills */,
> ST_NONE, ST_NONE, ST_NONE)
> @@ -102,7 +102,7 @@ void register_coalesce_test::SetUp()
>
> shader_prog = ralloc(NULL, struct gl_shader_program);
>
> - v = new register_coalesce_vec4_visitor(brw, shader_prog);
> + v = new register_coalesce_vec4_god(brw, shader_prog);
>
> _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0);
>
> @@ -110,7 +110,7 @@ void register_coalesce_test::SetUp()
> }
>
> static void
> -_register_coalesce(vec4_visitor *v, const char *func)
> +_register_coalesce(vec4_god *v, const char *func)
> {
> bool print = false;
>
> --
> 2.1.3
>