[Mesa-dev] [PATCH] i965: Give the FS and VEC4 visitors more descriptive names.

Connor Abbott cwabbott0 at gmail.com
Wed Apr 1 09:06:24 PDT 2015


Laughed-at-by: Connor Abbott <cwabbott0 at gmail.com>

On Wed, Apr 1, 2015 at 5:43 AM, Francisco Jerez <currojerez at riseup.net> wrote:
> It has always struck me as odd that these objects are both called
> visitors even though visiting makes up only a tiny fraction of their
> job.  Other, no less important, tasks seem misrepresented, like
> optimizing, analyzing, emitting and pretty-printing the IR,
> translating NIR and ARB assembly programs, applying all sorts of
> hardware workarounds, calculating the binding table, URB, push and
> pull constant layout, etc.
>
> The new names should better depict the extraordinary power of these
> objects, and have the additional advantage of being up to 40% shorter
> than the old ones, reducing the number of keystrokes required to refer
> to these frequently used objects and hopefully increasing everyone's
> productivity.
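
(Checking the headline claim -- a standalone snippet, not part of the
patch, and admittedly reviewed with more rigor than it deserves:)

   #include <cassert>
   #include <cstring>

   int main()
   {
      /* fs_visitor -> fs_god: 10 -> 6 characters, exactly 40% off. */
      assert(std::strlen("fs_visitor") == 10 && std::strlen("fs_god") == 6);
      /* backend_visitor -> backend_god: 15 -> 11, a mere ~27%. */
      assert(std::strlen("backend_visitor") == 15 &&
             std::strlen("backend_god") == 11);
      return 0;
   }
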
> ---
>  src/mesa/drivers/dri/i965/Makefile.sources         |   14 +-
>  src/mesa/drivers/dri/i965/brw_cfg.cpp              |    4 +-
>  src/mesa/drivers/dri/i965/brw_cfg.h                |    4 +-
>  .../drivers/dri/i965/brw_dead_control_flow.cpp     |    2 +-
>  src/mesa/drivers/dri/i965/brw_dead_control_flow.h  |    2 +-
>  src/mesa/drivers/dri/i965/brw_fs.cpp               |  164 +-
>  src/mesa/drivers/dri/i965/brw_fs.h                 |   10 +-
>  .../drivers/dri/i965/brw_fs_cmod_propagation.cpp   |    4 +-
>  .../drivers/dri/i965/brw_fs_combine_constants.cpp  |    2 +-
>  .../drivers/dri/i965/brw_fs_copy_propagation.cpp   |    8 +-
>  src/mesa/drivers/dri/i965/brw_fs_cse.cpp           |    4 +-
>  .../dri/i965/brw_fs_dead_code_eliminate.cpp        |    2 +-
>  src/mesa/drivers/dri/i965/brw_fs_fp.cpp            |   20 +-
>  src/mesa/drivers/dri/i965/brw_fs_god.cpp           | 4157 ++++++++++++++++++++
>  .../drivers/dri/i965/brw_fs_live_variables.cpp     |    8 +-
>  src/mesa/drivers/dri/i965/brw_fs_live_variables.h  |    4 +-
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp           |   46 +-
>  .../dri/i965/brw_fs_peephole_predicated_break.cpp  |    2 +-
>  src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp  |   18 +-
>  .../drivers/dri/i965/brw_fs_register_coalesce.cpp  |    6 +-
>  .../dri/i965/brw_fs_saturate_propagation.cpp       |    4 +-
>  src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp  |    2 +-
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp       | 4157 --------------------
>  src/mesa/drivers/dri/i965/brw_gs.c                 |    2 +-
>  src/mesa/drivers/dri/i965/brw_ir_vec4.h            |    6 +-
>  .../drivers/dri/i965/brw_schedule_instructions.cpp |   20 +-
>  src/mesa/drivers/dri/i965/brw_shader.cpp           |   12 +-
>  src/mesa/drivers/dri/i965/brw_shader.h             |    4 +-
>  src/mesa/drivers/dri/i965/brw_vec4.cpp             |   54 +-
>  src/mesa/drivers/dri/i965/brw_vec4.h               |    6 +-
>  .../drivers/dri/i965/brw_vec4_copy_propagation.cpp |    2 +-
>  src/mesa/drivers/dri/i965/brw_vec4_cse.cpp         |    4 +-
>  .../dri/i965/brw_vec4_dead_code_eliminate.cpp      |    2 +-
>  src/mesa/drivers/dri/i965/brw_vec4_god.cpp         | 3658 +++++++++++++++++
>  src/mesa/drivers/dri/i965/brw_vec4_gs_god.cpp      |  706 ++++
>  src/mesa/drivers/dri/i965/brw_vec4_gs_god.h        |  103 +
>  src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp  |  706 ----
>  src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h    |  103 -
>  .../drivers/dri/i965/brw_vec4_live_variables.cpp   |   10 +-
>  .../drivers/dri/i965/brw_vec4_reg_allocate.cpp     |   12 +-
>  src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp     | 3658 -----------------
>  src/mesa/drivers/dri/i965/brw_vec4_vp.cpp          |   10 +-
>  src/mesa/drivers/dri/i965/brw_vec4_vs_god.cpp      |  231 ++
>  src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp  |  231 --
>  src/mesa/drivers/dri/i965/brw_vs.c                 |    2 +-
>  src/mesa/drivers/dri/i965/brw_vs.h                 |    4 +-
>  src/mesa/drivers/dri/i965/brw_wm_iz.cpp            |    2 +-
>  src/mesa/drivers/dri/i965/gen6_gs_god.cpp          |  776 ++++
>  src/mesa/drivers/dri/i965/gen6_gs_god.h            |   82 +
>  src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp      |  776 ----
>  src/mesa/drivers/dri/i965/gen6_gs_visitor.h        |   82 -
>  .../drivers/dri/i965/test_fs_cmod_propagation.cpp  |   12 +-
>  .../dri/i965/test_fs_saturate_propagation.cpp      |   12 +-
>  .../dri/i965/test_vec4_copy_propagation.cpp        |   12 +-
>  .../dri/i965/test_vec4_register_coalesce.cpp       |   12 +-
>  55 files changed, 9978 insertions(+), 9978 deletions(-)
>  create mode 100644 src/mesa/drivers/dri/i965/brw_fs_god.cpp
>  delete mode 100644 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>  create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_god.cpp
>  create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_gs_god.cpp
>  create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_gs_god.h
>  delete mode 100644 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
>  delete mode 100644 src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
>  delete mode 100644 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
>  create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_vs_god.cpp
>  delete mode 100644 src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
>  create mode 100644 src/mesa/drivers/dri/i965/gen6_gs_god.cpp
>  create mode 100644 src/mesa/drivers/dri/i965/gen6_gs_god.h
>  delete mode 100644 src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
>  delete mode 100644 src/mesa/drivers/dri/i965/gen6_gs_visitor.h
>
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
> index 498d5a7..82477c0 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -57,7 +57,7 @@ i965_FILES = \
>         brw_fs_saturate_propagation.cpp \
>         brw_fs_sel_peephole.cpp \
>         brw_fs_vector_splitting.cpp \
> -       brw_fs_visitor.cpp \
> +       brw_fs_god.cpp \
>         brw_gs.c \
>         brw_gs.h \
>         brw_gs_state.c \
> @@ -112,15 +112,15 @@ i965_FILES = \
>         brw_vec4_cse.cpp \
>         brw_vec4_dead_code_eliminate.cpp \
>         brw_vec4_generator.cpp \
> -       brw_vec4_gs_visitor.cpp \
> -       brw_vec4_gs_visitor.h \
> +       brw_vec4_gs_god.cpp \
> +       brw_vec4_gs_god.h \
>         brw_vec4.h \
>         brw_vec4_live_variables.cpp \
>         brw_vec4_live_variables.h \
>         brw_vec4_reg_allocate.cpp \
> -       brw_vec4_visitor.cpp \
> +       brw_vec4_god.cpp \
>         brw_vec4_vp.cpp \
> -       brw_vec4_vs_visitor.cpp \
> +       brw_vec4_vs_god.cpp \
>         brw_vs.c \
>         brw_vs.h \
>         brw_vs_state.c \
> @@ -137,8 +137,8 @@ i965_FILES = \
>         gen6_depth_state.c \
>         gen6_depthstencil.c \
>         gen6_gs_state.c \
> -       gen6_gs_visitor.cpp \
> -       gen6_gs_visitor.h \
> +       gen6_gs_god.cpp \
> +       gen6_gs_god.h \
>         gen6_multisample_state.c \
>         gen6_queryobj.c \
>         gen6_sampler_state.c \
> diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp
> index 7e7770e..8eaf276 100644
> --- a/src/mesa/drivers/dri/i965/brw_cfg.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp
> @@ -141,7 +141,7 @@ bblock_t::combine_with(bblock_t *that)
>  }
>
>  void
> -bblock_t::dump(backend_visitor *v) const
> +bblock_t::dump(backend_god *v) const
>  {
>     int ip = this->start_ip;
>     foreach_inst_in_block(backend_instruction, inst, this) {
> @@ -411,7 +411,7 @@ cfg_t::make_block_array()
>  }
>
>  void
> -cfg_t::dump(backend_visitor *v)
> +cfg_t::dump(backend_god *v)
>  {
>     if (idom_dirty)
>        calculate_idom();
> diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h
> index 56d7d07..961876f 100644
> --- a/src/mesa/drivers/dri/i965/brw_cfg.h
> +++ b/src/mesa/drivers/dri/i965/brw_cfg.h
> @@ -60,7 +60,7 @@ struct bblock_t {
>     bool is_successor_of(const bblock_t *block) const;
>     bool can_combine_with(const bblock_t *that) const;
>     void combine_with(bblock_t *that);
> -   void dump(backend_visitor *v) const;
> +   void dump(backend_god *v) const;
>
>     backend_instruction *start();
>     const backend_instruction *start() const;
> @@ -273,7 +273,7 @@ struct cfg_t {
>     void calculate_idom();
>     static bblock_t *intersect(bblock_t *b1, bblock_t *b2);
>
> -   void dump(backend_visitor *v);
> +   void dump(backend_god *v);
>     void dump_cfg();
>     void dump_domtree();
>  #endif
> diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
> index 03f838d..256dee6 100644
> --- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
> @@ -36,7 +36,7 @@
>   *   - if/else/endif
>   */
>  bool
> -dead_control_flow_eliminate(backend_visitor *v)
> +dead_control_flow_eliminate(backend_god *v)
>  {
>     bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.h b/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
> index 57a4dab..754a870 100644
> --- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
> +++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
> @@ -23,4 +23,4 @@
>
>  #include "brw_shader.h"
>
> -bool dead_control_flow_eliminate(backend_visitor *v);
> +bool dead_control_flow_eliminate(backend_god *v);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 9c2ccce..8be13af 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -239,14 +239,14 @@ fs_inst::resize_sources(uint8_t num_sources)
>
>  #define ALU1(op)                                                        \
>     fs_inst *                                                            \
> -   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
> +   fs_god::op(const fs_reg &dst, const fs_reg &src0)                \
>     {                                                                    \
>        return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
>     }
>
>  #define ALU2(op)                                                        \
>     fs_inst *                                                            \
> -   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
> +   fs_god::op(const fs_reg &dst, const fs_reg &src0,                \
>                    const fs_reg &src1)                                   \
>     {                                                                    \
>        return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
> @@ -254,7 +254,7 @@ fs_inst::resize_sources(uint8_t num_sources)
>
>  #define ALU2_ACC(op)                                                    \
>     fs_inst *                                                            \
> -   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
> +   fs_god::op(const fs_reg &dst, const fs_reg &src0,                \
>                    const fs_reg &src1)                                   \
>     {                                                                    \
>        fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
> @@ -264,7 +264,7 @@ fs_inst::resize_sources(uint8_t num_sources)
>
>  #define ALU3(op)                                                        \
>     fs_inst *                                                            \
> -   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
> +   fs_god::op(const fs_reg &dst, const fs_reg &src0,                \
>                    const fs_reg &src1, const fs_reg &src2)               \
>     {                                                                    \
>        return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
> @@ -301,7 +301,7 @@ ALU2(MAC)
>
>  /** Gen4 predicated IF. */
>  fs_inst *
> -fs_visitor::IF(enum brw_predicate predicate)
> +fs_god::IF(enum brw_predicate predicate)
>  {
>     fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
>     inst->predicate = predicate;
> @@ -310,7 +310,7 @@ fs_visitor::IF(enum brw_predicate predicate)
>
>  /** Gen6 IF with embedded comparison. */
>  fs_inst *
> -fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
> +fs_god::IF(const fs_reg &src0, const fs_reg &src1,
>                 enum brw_conditional_mod condition)
>  {
>     assert(brw->gen == 6);
> @@ -326,7 +326,7 @@ fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
>   * the flag register with the packed 16 bits of the result.
>   */
>  fs_inst *
> -fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
> +fs_god::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
>                  enum brw_conditional_mod condition)
>  {
>     fs_inst *inst;
> @@ -355,7 +355,7 @@ fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
>  }
>
>  fs_inst *
> -fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
> +fs_god::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
>  {
>     uint8_t exec_size = dst.width;
>     for (int i = 0; i < sources; ++i) {
> @@ -381,7 +381,7 @@ fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
>  }
>
>  exec_list
> -fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
> +fs_god::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
>                                         const fs_reg &surf_index,
>                                         const fs_reg &varying_offset,
>                                         uint32_t const_offset)
> @@ -448,7 +448,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
>   * handling.
>   */
>  fs_inst *
> -fs_visitor::DEP_RESOLVE_MOV(int grf)
> +fs_god::DEP_RESOLVE_MOV(int grf)
>  {
>     fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
>
> @@ -638,7 +638,7 @@ fs_reg::is_contiguous() const
>  }
>
>  int
> -fs_visitor::type_size(const struct glsl_type *type)
> +fs_god::type_size(const struct glsl_type *type)
>  {
>     unsigned int size, i;
>
> @@ -681,7 +681,7 @@ fs_visitor::type_size(const struct glsl_type *type)
>   * the destination of the MOV, with extra parameters set.
>   */
>  fs_reg
> -fs_visitor::get_timestamp(fs_inst **out_mov)
> +fs_god::get_timestamp(fs_inst **out_mov)
>  {
>     assert(brw->gen >= 7);
>
> @@ -715,7 +715,7 @@ fs_visitor::get_timestamp(fs_inst **out_mov)
>  }
>
>  void
> -fs_visitor::emit_shader_time_begin()
> +fs_god::emit_shader_time_begin()
>  {
>     current_annotation = "shader time start";
>     fs_inst *mov;
> @@ -724,7 +724,7 @@ fs_visitor::emit_shader_time_begin()
>  }
>
>  void
> -fs_visitor::emit_shader_time_end()
> +fs_god::emit_shader_time_end()
>  {
>     current_annotation = "shader time end";
>
> @@ -753,7 +753,7 @@ fs_visitor::emit_shader_time_end()
>        }
>        break;
>     default:
> -      unreachable("fs_visitor::emit_shader_time_end missing code");
> +      unreachable("fs_god::emit_shader_time_end missing code");
>     }
>
>     /* Insert our code just before the final SEND with EOT. */
> @@ -799,7 +799,7 @@ fs_visitor::emit_shader_time_end()
>  }
>
>  fs_inst *
> -fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
> +fs_god::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
>  {
>     int shader_time_index =
>        brw_get_shader_time_index(brw, shader_prog, prog, type);
> @@ -816,7 +816,7 @@ fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
>  }
>
>  void
> -fs_visitor::vfail(const char *format, va_list va)
> +fs_god::vfail(const char *format, va_list va)
>  {
>     char *msg;
>
> @@ -836,7 +836,7 @@ fs_visitor::vfail(const char *format, va_list va)
>  }
>
>  void
> -fs_visitor::fail(const char *format, ...)
> +fs_god::fail(const char *format, ...)
>  {
>     va_list va;
>
> @@ -855,7 +855,7 @@ fs_visitor::fail(const char *format, ...)
>   * During a SIMD16 compile (if one happens anyway), this just calls fail().
>   */
>  void
> -fs_visitor::no16(const char *format, ...)
> +fs_god::no16(const char *format, ...)
>  {
>     va_list va;
>
> @@ -878,39 +878,39 @@ fs_visitor::no16(const char *format, ...)
>  }
>
>  fs_inst *
> -fs_visitor::emit(enum opcode opcode)
> +fs_god::emit(enum opcode opcode)
>  {
>     return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
>  }
>
>  fs_inst *
> -fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
> +fs_god::emit(enum opcode opcode, const fs_reg &dst)
>  {
>     return emit(new(mem_ctx) fs_inst(opcode, dst));
>  }
>
>  fs_inst *
> -fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
> +fs_god::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
>  {
>     return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
>  }
>
>  fs_inst *
> -fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
> +fs_god::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
>                   const fs_reg &src1)
>  {
>     return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
>  }
>
>  fs_inst *
> -fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
> +fs_god::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
>                   const fs_reg &src1, const fs_reg &src2)
>  {
>     return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
>  }
>
>  fs_inst *
> -fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
> +fs_god::emit(enum opcode opcode, const fs_reg &dst,
>                   fs_reg src[], int sources)
>  {
>     return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
> @@ -991,7 +991,7 @@ fs_inst::writes_flag() const
>   * instruction -- the FS opcodes often generate MOVs in addition.
>   */
>  int
> -fs_visitor::implied_mrf_writes(fs_inst *inst)
> +fs_god::implied_mrf_writes(fs_inst *inst)
>  {
>     if (inst->mlen == 0)
>        return 0;
> @@ -1047,7 +1047,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
>  }
>
>  fs_reg
> -fs_visitor::vgrf(const glsl_type *const type)
> +fs_god::vgrf(const glsl_type *const type)
>  {
>     int reg_width = dispatch_width / 8;
>     return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
> @@ -1055,7 +1055,7 @@ fs_visitor::vgrf(const glsl_type *const type)
>  }
>
>  fs_reg
> -fs_visitor::vgrf(int num_components)
> +fs_god::vgrf(int num_components)
>  {
>     int reg_width = dispatch_width / 8;
>     return fs_reg(GRF, alloc.allocate(num_components * reg_width),
> @@ -1108,7 +1108,7 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
>  }
>
>  fs_reg *
> -fs_visitor::variable_storage(ir_variable *var)
> +fs_god::variable_storage(ir_variable *var)
>  {
>     return (fs_reg *)hash_table_find(this->variable_ht, var);
>  }
> @@ -1131,7 +1131,7 @@ import_uniforms_callback(const void *key,
>   * This brings in those uniform definitions
>   */
>  void
> -fs_visitor::import_uniforms(fs_visitor *v)
> +fs_god::import_uniforms(fs_god *v)
>  {
>     hash_table_call_foreach(v->variable_ht,
>                            import_uniforms_callback,
> @@ -1148,7 +1148,7 @@ fs_visitor::import_uniforms(fs_visitor *v)
>   * store.
>   */
>  void
> -fs_visitor::setup_uniform_values(ir_variable *ir)
> +fs_god::setup_uniform_values(ir_variable *ir)
>  {
>     int namelen = strlen(ir->name);
>
> @@ -1189,7 +1189,7 @@ fs_visitor::setup_uniform_values(ir_variable *ir)
>   * automatically updated from GL context state.
>   */
>  void
> -fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
> +fs_god::setup_builtin_uniform_values(ir_variable *ir)
>  {
>     const ir_state_slot *const slots = ir->get_state_slots();
>     assert(slots != NULL);
> @@ -1219,7 +1219,7 @@ fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
>  }
>
>  fs_reg *
> -fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
> +fs_god::emit_fragcoord_interpolation(bool pixel_center_integer,
>                                           bool origin_upper_left)
>  {
>     assert(stage == MESA_SHADER_FRAGMENT);
> @@ -1270,7 +1270,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
>  }
>
>  fs_inst *
> -fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
> +fs_god::emit_linterp(const fs_reg &attr, const fs_reg &interp,
>                           glsl_interp_qualifier interpolation_mode,
>                           bool is_centroid, bool is_sample)
>  {
> @@ -1305,7 +1305,7 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
>  }
>
>  void
> -fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
> +fs_god::emit_general_interpolation(fs_reg attr, const char *name,
>                                         const glsl_type *type,
>                                         glsl_interp_qualifier interpolation_mode,
>                                         int location, bool mod_centroid,
> @@ -1408,7 +1408,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
>  }
>
>  fs_reg *
> -fs_visitor::emit_frontfacing_interpolation()
> +fs_god::emit_frontfacing_interpolation()
>  {
>     fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
>
> @@ -1449,7 +1449,7 @@ fs_visitor::emit_frontfacing_interpolation()
>  }
>
>  void
> -fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
> +fs_god::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
>  {
>     assert(stage == MESA_SHADER_FRAGMENT);
>     brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> @@ -1472,7 +1472,7 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
>  }
>
>  fs_reg *
> -fs_visitor::emit_samplepos_setup()
> +fs_god::emit_samplepos_setup()
>  {
>     assert(brw->gen >= 6);
>
> @@ -1521,7 +1521,7 @@ fs_visitor::emit_samplepos_setup()
>  }
>
>  fs_reg *
> -fs_visitor::emit_sampleid_setup()
> +fs_god::emit_sampleid_setup()
>  {
>     assert(stage == MESA_SHADER_FRAGMENT);
>     brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> @@ -1580,7 +1580,7 @@ fs_visitor::emit_sampleid_setup()
>  }
>
>  void
> -fs_visitor::resolve_source_modifiers(fs_reg *src)
> +fs_god::resolve_source_modifiers(fs_reg *src)
>  {
>     if (!src->abs && !src->negate)
>        return;
> @@ -1591,7 +1591,7 @@ fs_visitor::resolve_source_modifiers(fs_reg *src)
>  }
>
>  fs_reg
> -fs_visitor::fix_math_operand(fs_reg src)
> +fs_god::fix_math_operand(fs_reg src)
>  {
>     /* Can't do hstride == 0 args on gen6 math, so expand it out. We
>      * might be able to do better by doing execsize = 1 math and then
> @@ -1618,7 +1618,7 @@ fs_visitor::fix_math_operand(fs_reg src)
>  }
>
>  fs_inst *
> -fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
> +fs_god::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
>  {
>     switch (opcode) {
>     case SHADER_OPCODE_RCP:
> @@ -1655,7 +1655,7 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
>  }
>
>  fs_inst *
> -fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
> +fs_god::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
>  {
>     int base_mrf = 2;
>     fs_inst *inst;
> @@ -1691,7 +1691,7 @@ fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
>  }
>
>  void
> -fs_visitor::emit_discard_jump()
> +fs_god::emit_discard_jump()
>  {
>     /* For performance, after a discard, jump to the end of the
>      * shader if all relevant channels have been discarded.
> @@ -1706,7 +1706,7 @@ fs_visitor::emit_discard_jump()
>  }
>
>  void
> -fs_visitor::assign_curb_setup()
> +fs_god::assign_curb_setup()
>  {
>     if (dispatch_width == 8) {
>        prog_data->dispatch_grf_start_reg = payload.num_regs;
> @@ -1749,7 +1749,7 @@ fs_visitor::assign_curb_setup()
>  }
>
>  void
> -fs_visitor::calculate_urb_setup()
> +fs_god::calculate_urb_setup()
>  {
>     assert(stage == MESA_SHADER_FRAGMENT);
>     brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> @@ -1837,7 +1837,7 @@ fs_visitor::calculate_urb_setup()
>  }
>
>  void
> -fs_visitor::assign_urb_setup()
> +fs_god::assign_urb_setup()
>  {
>     assert(stage == MESA_SHADER_FRAGMENT);
>     brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> @@ -1865,7 +1865,7 @@ fs_visitor::assign_urb_setup()
>  }
>
>  void
> -fs_visitor::assign_vs_urb_setup()
> +fs_god::assign_vs_urb_setup()
>  {
>     brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
>     int grf, count, slot, channel, attr;
> @@ -1938,7 +1938,7 @@ fs_visitor::assign_vs_urb_setup()
>   * live intervals and better dead code elimination and coalescing.
>   */
>  void
> -fs_visitor::split_virtual_grfs()
> +fs_god::split_virtual_grfs()
>  {
>     int num_vars = this->alloc.count;
>
> @@ -2069,7 +2069,7 @@ fs_visitor::split_virtual_grfs()
>   * overhead.
>   */
>  bool
> -fs_visitor::compact_virtual_grfs()
> +fs_god::compact_virtual_grfs()
>  {
>     bool progress = false;
>     int remap_table[this->alloc.count];
> @@ -2154,7 +2154,7 @@ fs_visitor::compact_virtual_grfs()
>   * uniform array access out to a pull constant buffer.
>   */
>  void
> -fs_visitor::move_uniform_array_access_to_pull_constants()
> +fs_god::move_uniform_array_access_to_pull_constants()
>  {
>     if (dispatch_width != 8)
>        return;
> @@ -2204,7 +2204,7 @@ fs_visitor::move_uniform_array_access_to_pull_constants()
>   * update the program to load them.
>   */
>  void
> -fs_visitor::assign_constant_locations()
> +fs_god::assign_constant_locations()
>  {
>     /* Only the first compile (SIMD8 mode) gets to decide on locations. */
>     if (dispatch_width != 8)
> @@ -2286,7 +2286,7 @@ fs_visitor::assign_constant_locations()
>   * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
>   */
>  void
> -fs_visitor::demote_pull_constants()
> +fs_god::demote_pull_constants()
>  {
>     foreach_block_and_inst (block, fs_inst, inst, cfg) {
>        for (int i = 0; i < inst->sources; i++) {
> @@ -2338,7 +2338,7 @@ fs_visitor::demote_pull_constants()
>  }
>
>  bool
> -fs_visitor::opt_algebraic()
> +fs_god::opt_algebraic()
>  {
>     bool progress = false;
>
> @@ -2548,7 +2548,7 @@ fs_visitor::opt_algebraic()
>  }
>
>  bool
> -fs_visitor::opt_register_renaming()
> +fs_god::opt_register_renaming()
>  {
>     bool progress = false;
>     int depth = 0;
> @@ -2623,7 +2623,7 @@ fs_visitor::opt_register_renaming()
>   * placeholder-halt
>   */
>  bool
> -fs_visitor::opt_redundant_discard_jumps()
> +fs_god::opt_redundant_discard_jumps()
>  {
>     bool progress = false;
>
> @@ -2655,7 +2655,7 @@ fs_visitor::opt_redundant_discard_jumps()
>  }
>
>  bool
> -fs_visitor::compute_to_mrf()
> +fs_god::compute_to_mrf()
>  {
>     bool progress = false;
>     int next_ip = 0;
> @@ -2819,7 +2819,7 @@ fs_visitor::compute_to_mrf()
>   * instructions to FS_OPCODE_REP_FB_WRITE.
>   */
>  void
> -fs_visitor::emit_repclear_shader()
> +fs_god::emit_repclear_shader()
>  {
>     brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
>     int base_mrf = 1;
> @@ -2865,7 +2865,7 @@ fs_visitor::emit_repclear_shader()
>   * removing the later ones.
>   */
>  bool
> -fs_visitor::remove_duplicate_mrf_writes()
> +fs_god::remove_duplicate_mrf_writes()
>  {
>     fs_inst *last_mrf_move[16];
>     bool progress = false;
> @@ -2970,7 +2970,7 @@ clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
>   *      same time that both consider ‘r3’ as the target of their final writes.
>   */
>  void
> -fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
> +fs_god::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
>                                                          fs_inst *inst)
>  {
>     int write_len = inst->regs_written;
> @@ -3042,7 +3042,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
>   *      instruction with a different destination register.
>   */
>  void
> -fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
> +fs_god::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
>  {
>     int write_len = inst->regs_written;
>     int first_write_grf = inst->dst.reg;
> @@ -3091,7 +3091,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
>  }
>
>  void
> -fs_visitor::insert_gen4_send_dependency_workarounds()
> +fs_god::insert_gen4_send_dependency_workarounds()
>  {
>     if (brw->gen != 4 || brw->is_g4x)
>        return;
> @@ -3131,7 +3131,7 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
>   * source operand for all 8 or 16 of its channels.
>   */
>  void
> -fs_visitor::lower_uniform_pull_constant_loads()
> +fs_god::lower_uniform_pull_constant_loads()
>  {
>     foreach_block_and_inst (block, fs_inst, inst, cfg) {
>        if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
> @@ -3189,7 +3189,7 @@ fs_visitor::lower_uniform_pull_constant_loads()
>  }
>
>  bool
> -fs_visitor::lower_load_payload()
> +fs_god::lower_load_payload()
>  {
>     bool progress = false;
>
> @@ -3295,13 +3295,13 @@ fs_visitor::lower_load_payload()
>  }
>
>  void
> -fs_visitor::dump_instructions()
> +fs_god::dump_instructions()
>  {
>     dump_instructions(NULL);
>  }
>
>  void
> -fs_visitor::dump_instructions(const char *name)
> +fs_god::dump_instructions(const char *name)
>  {
>     FILE *file = stderr;
>     if (name && geteuid() != 0) {
> @@ -3334,13 +3334,13 @@ fs_visitor::dump_instructions(const char *name)
>  }
>
>  void
> -fs_visitor::dump_instruction(backend_instruction *be_inst)
> +fs_god::dump_instruction(backend_instruction *be_inst)
>  {
>     dump_instruction(be_inst, stderr);
>  }
>
>  void
> -fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
> +fs_god::dump_instruction(backend_instruction *be_inst, FILE *file)
>  {
>     fs_inst *inst = (fs_inst *)be_inst;
>
> @@ -3552,7 +3552,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
>   * only reg -- it might be the size=4 destination of a texture instruction.
>   */
>  fs_inst *
> -fs_visitor::get_instruction_generating_reg(fs_inst *start,
> +fs_god::get_instruction_generating_reg(fs_inst *start,
>                                            fs_inst *end,
>                                            const fs_reg &reg)
>  {
> @@ -3567,7 +3567,7 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
>  }
>
>  void
> -fs_visitor::setup_payload_gen6()
> +fs_god::setup_payload_gen6()
>  {
>     bool uses_depth =
>        (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
> @@ -3648,14 +3648,14 @@ fs_visitor::setup_payload_gen6()
>  }
>
>  void
> -fs_visitor::setup_vs_payload()
> +fs_god::setup_vs_payload()
>  {
>     /* R0: thread header, R1: urb handles */
>     payload.num_regs = 2;
>  }
>
>  void
> -fs_visitor::assign_binding_table_offsets()
> +fs_god::assign_binding_table_offsets()
>  {
>     assert(stage == MESA_SHADER_FRAGMENT);
>     brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> @@ -3672,7 +3672,7 @@ fs_visitor::assign_binding_table_offsets()
>  }
>
>  void
> -fs_visitor::calculate_register_pressure()
> +fs_god::calculate_register_pressure()
>  {
>     invalidate_live_intervals();
>     calculate_live_intervals();
> @@ -3690,7 +3690,7 @@ fs_visitor::calculate_register_pressure()
>  }
>
>  void
> -fs_visitor::optimize()
> +fs_god::optimize()
>  {
>     const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
>
> @@ -3709,7 +3709,7 @@ fs_visitor::optimize()
>           snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
>                    stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
>                                                                          \
> -         backend_visitor::dump_instructions(filename);                  \
> +         backend_god::dump_instructions(filename);                  \
>        }                                                                 \
>                                                                          \
>        progress = progress || this_progress;                             \
> @@ -3721,7 +3721,7 @@ fs_visitor::optimize()
>        snprintf(filename, 64, "%s%d-%04d-00-start",
>                 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
>
> -      backend_visitor::dump_instructions(filename);
> +      backend_god::dump_instructions(filename);
>     }
>
>     bool progress;
> @@ -3770,7 +3770,7 @@ fs_visitor::optimize()
>   * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
>   */
>  void
> -fs_visitor::fixup_3src_null_dest()
> +fs_god::fixup_3src_null_dest()
>  {
>     foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
>        if (inst->is_3src() && inst->dst.is_null()) {
> @@ -3781,7 +3781,7 @@ fs_visitor::fixup_3src_null_dest()
>  }
>
>  void
> -fs_visitor::allocate_registers()
> +fs_god::allocate_registers()
>  {
>     bool allocated_without_spills;
>
> @@ -3851,7 +3851,7 @@ fs_visitor::allocate_registers()
>  }
>
>  bool
> -fs_visitor::run_vs()
> +fs_god::run_vs()
>  {
>     assert(stage == MESA_SHADER_VERTEX);
>
> @@ -3891,7 +3891,7 @@ fs_visitor::run_vs()
>  }
>
>  bool
> -fs_visitor::run_fs()
> +fs_god::run_fs()
>  {
>     brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
>     brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
> @@ -4015,7 +4015,7 @@ brw_wm_fs_emit(struct brw_context *brw,
>
>     /* Now the main event: Visit the shader IR and generate our FS IR for it.
>      */
> -   fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
> +   fs_god v(brw, mem_ctx, key, prog_data, prog, fp, 8);
>     if (!v.run_fs()) {
>        if (prog) {
>           prog->LinkStatus = false;
> @@ -4029,7 +4029,7 @@ brw_wm_fs_emit(struct brw_context *brw,
>     }
>
>     cfg_t *simd16_cfg = NULL;
> -   fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
> +   fs_god v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
>     if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
>                                 brw->use_rep_send)) {
>        if (!v.simd16_unsupported) {
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index 278a8ee..ff1a8b8 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -66,14 +66,14 @@ namespace brw {
>   *
>   * Translates either GLSL IR or Mesa IR (for ARB_fragment_program) into FS IR.
>   */
> -class fs_visitor : public backend_visitor
> +class fs_god : public backend_god
>  {
>  public:
>     const fs_reg reg_null_f;
>     const fs_reg reg_null_d;
>     const fs_reg reg_null_ud;
>
> -   fs_visitor(struct brw_context *brw,
> +   fs_god(struct brw_context *brw,
>                void *mem_ctx,
>                const struct brw_wm_prog_key *key,
>                struct brw_wm_prog_data *prog_data,
> @@ -81,7 +81,7 @@ public:
>                struct gl_fragment_program *fp,
>                unsigned dispatch_width);
>
> -   fs_visitor(struct brw_context *brw,
> +   fs_god(struct brw_context *brw,
>                void *mem_ctx,
>                const struct brw_vs_prog_key *key,
>                struct brw_vs_prog_data *prog_data,
> @@ -89,13 +89,13 @@ public:
>                struct gl_vertex_program *cp,
>                unsigned dispatch_width);
>
> -   ~fs_visitor();
> +   ~fs_god();
>     void init();
>
>     fs_reg *variable_storage(ir_variable *var);
>     fs_reg vgrf(const glsl_type *const type);
>     fs_reg vgrf(int num_components);
> -   void import_uniforms(fs_visitor *v);
> +   void import_uniforms(fs_god *v);
>     void setup_uniform_clipplane_values();
>     void compute_clip_distance();
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
> index 798fef3..db62f51 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
> @@ -67,7 +67,7 @@ opt_cmod_propagation_local(bblock_t *block)
>           continue;
>
>        /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
> -       * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
> +       * generated (for ir_unop_not in fs_god::emit_bool_to_cond_code).
>         * Propagating those would require inverting the condition on the CMP.
>         * This changes both the flag value and the register destination of the
>         * CMP.  That result may be used elsewhere, so we can't change its value
> @@ -153,7 +153,7 @@ opt_cmod_propagation_local(bblock_t *block)
>  }
>
>  bool
> -fs_visitor::opt_cmod_propagation()
> +fs_god::opt_cmod_propagation()
>  {
>     bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
> index ebde8df..5b84b34 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
> @@ -188,7 +188,7 @@ compare(const void *_a, const void *_b)
>  }
>
>  bool
> -fs_visitor::opt_combine_constants()
> +fs_god::opt_combine_constants()
>  {
>     void *const_ctx = ralloc_context(NULL);
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
> index 764741d..b059849 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
> @@ -276,7 +276,7 @@ is_logic_op(enum opcode opcode)
>  }
>
>  bool
> -fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
> +fs_god::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
>  {
>     if (inst->src[arg].file != GRF)
>        return false;
> @@ -422,7 +422,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
>
>
>  bool
> -fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
> +fs_god::try_constant_propagate(fs_inst *inst, acp_entry *entry)
>  {
>     bool progress = false;
>
> @@ -608,7 +608,7 @@ can_propagate_from(fs_inst *inst)
>   * list.
>   */
>  bool
> -fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
> +fs_god::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
>                                       exec_list *acp)
>  {
>     bool progress = false;
> @@ -687,7 +687,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
>  }
>
>  bool
> -fs_visitor::opt_copy_propagate()
> +fs_god::opt_copy_propagate()
>  {
>     bool progress = false;
>     void *copy_prop_ctx = ralloc_context(NULL);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> index ca5b32f..ba4dbde 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> @@ -155,7 +155,7 @@ instructions_match(fs_inst *a, fs_inst *b)
>  }
>
>  bool
> -fs_visitor::opt_cse_local(bblock_t *block)
> +fs_god::opt_cse_local(bblock_t *block)
>  {
>     bool progress = false;
>     exec_list aeb;
> @@ -300,7 +300,7 @@ fs_visitor::opt_cse_local(bblock_t *block)
>  }
>
>  bool
> -fs_visitor::opt_cse()
> +fs_god::opt_cse()
>  {
>     bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
> index 4b5548a..669257b 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
> @@ -35,7 +35,7 @@
>   */
>
>  bool
> -fs_visitor::dead_code_eliminate()
> +fs_god::dead_code_eliminate()
>  {
>     bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> index c4064da..0a10e74 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
> @@ -31,7 +31,7 @@
>  #include "brw_fs.h"
>
>  void
> -fs_visitor::emit_fp_alu1(enum opcode opcode,
> +fs_god::emit_fp_alu1(enum opcode opcode,
>                           const struct prog_instruction *fpi,
>                           fs_reg dst, fs_reg src)
>  {
> @@ -42,7 +42,7 @@ fs_visitor::emit_fp_alu1(enum opcode opcode,
>  }
>
>  void
> -fs_visitor::emit_fp_alu2(enum opcode opcode,
> +fs_god::emit_fp_alu2(enum opcode opcode,
>                           const struct prog_instruction *fpi,
>                           fs_reg dst, fs_reg src0, fs_reg src1)
>  {
> @@ -54,7 +54,7 @@ fs_visitor::emit_fp_alu2(enum opcode opcode,
>  }
>
>  void
> -fs_visitor::emit_fp_minmax(const prog_instruction *fpi,
> +fs_god::emit_fp_minmax(const prog_instruction *fpi,
>                             fs_reg dst, fs_reg src0, fs_reg src1)
>  {
>     enum brw_conditional_mod conditionalmod;
> @@ -72,7 +72,7 @@ fs_visitor::emit_fp_minmax(const prog_instruction *fpi,
>  }
>
>  void
> -fs_visitor::emit_fp_sop(enum brw_conditional_mod conditional_mod,
> +fs_god::emit_fp_sop(enum brw_conditional_mod conditional_mod,
>                          const struct prog_instruction *fpi,
>                          fs_reg dst, fs_reg src0, fs_reg src1,
>                          fs_reg one)
> @@ -91,7 +91,7 @@ fs_visitor::emit_fp_sop(enum brw_conditional_mod conditional_mod,
>  }
>
>  void
> -fs_visitor::emit_fp_scalar_write(const struct prog_instruction *fpi,
> +fs_god::emit_fp_scalar_write(const struct prog_instruction *fpi,
>                                   fs_reg dst, fs_reg src)
>  {
>     for (int i = 0; i < 4; i++) {
> @@ -101,7 +101,7 @@ fs_visitor::emit_fp_scalar_write(const struct prog_instruction *fpi,
>  }
>
>  void
> -fs_visitor::emit_fp_scalar_math(enum opcode opcode,
> +fs_god::emit_fp_scalar_math(enum opcode opcode,
>                                  const struct prog_instruction *fpi,
>                                  fs_reg dst, fs_reg src)
>  {
> @@ -111,7 +111,7 @@ fs_visitor::emit_fp_scalar_math(enum opcode opcode,
>  }
>
>  void
> -fs_visitor::emit_fragment_program_code()
> +fs_god::emit_fragment_program_code()
>  {
>     setup_fp_regs();
>
> @@ -552,7 +552,7 @@ fs_visitor::emit_fragment_program_code()
>  }
>
>  void
> -fs_visitor::setup_fp_regs()
> +fs_god::setup_fp_regs()
>  {
>     /* PROGRAM_TEMPORARY */
>     int num_temp = prog->NumTemporaries;
> @@ -612,7 +612,7 @@ fs_visitor::setup_fp_regs()
>  }
>
>  fs_reg
> -fs_visitor::get_fp_dst_reg(const prog_dst_register *dst)
> +fs_god::get_fp_dst_reg(const prog_dst_register *dst)
>  {
>     assert(stage == MESA_SHADER_FRAGMENT);
>     brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> @@ -660,7 +660,7 @@ fs_visitor::get_fp_dst_reg(const prog_dst_register *dst)
>  }
>
>  fs_reg
> -fs_visitor::get_fp_src_reg(const prog_src_register *src)
> +fs_god::get_fp_src_reg(const prog_src_register *src)
>  {
>     struct gl_program_parameter_list *plist = prog->Parameters;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_god.cpp b/src/mesa/drivers/dri/i965/brw_fs_god.cpp
> new file mode 100644
> index 0000000..e3d8b3a
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_fs_god.cpp
> @@ -0,0 +1,4157 @@
> +/*
> + * Copyright © 2010 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +/** @file brw_fs_god.cpp
> + *
> + * This file supports generating the FS LIR from the GLSL IR.  The LIR
> + * makes it easier to do backend-specific optimizations than doing so
> + * in the GLSL IR or in the native code.
> + */
> +#include <sys/types.h>
> +
> +#include "main/macros.h"
> +#include "main/shaderobj.h"
> +#include "program/prog_parameter.h"
> +#include "program/prog_print.h"
> +#include "program/prog_optimize.h"
> +#include "util/register_allocate.h"
> +#include "program/hash_table.h"
> +#include "brw_context.h"
> +#include "brw_eu.h"
> +#include "brw_wm.h"
> +#include "brw_vec4.h"
> +#include "brw_fs.h"
> +#include "main/uniforms.h"
> +#include "glsl/glsl_types.h"
> +#include "glsl/ir_optimization.h"
> +#include "program/sampler.h"
> +
> +
> +fs_reg *
> +fs_god::emit_vs_system_value(int location)
> +{
> +   fs_reg *reg = new(this->mem_ctx)
> +      fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
> +   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
> +
> +   switch (location) {
> +   case SYSTEM_VALUE_BASE_VERTEX:
> +      reg->reg_offset = 0;
> +      vs_prog_data->uses_vertexid = true;
> +      break;
> +   case SYSTEM_VALUE_VERTEX_ID:
> +   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> +      reg->reg_offset = 2;
> +      vs_prog_data->uses_vertexid = true;
> +      break;
> +   case SYSTEM_VALUE_INSTANCE_ID:
> +      reg->reg_offset = 3;
> +      vs_prog_data->uses_instanceid = true;
> +      break;
> +   default:
> +      unreachable("not reached");
> +   }
> +
> +   return reg;
> +}
> +
> +void
> +fs_god::visit(ir_variable *ir)
> +{
> +   fs_reg *reg = NULL;
> +
> +   if (variable_storage(ir))
> +      return;
> +
> +   if (ir->data.mode == ir_var_shader_in) {
> +      assert(ir->data.location != -1);
> +      if (stage == MESA_SHADER_VERTEX) {
> +         reg = new(this->mem_ctx)
> +            fs_reg(ATTR, ir->data.location,
> +                   brw_type_for_base_type(ir->type->get_scalar_type()));
> +      } else if (ir->data.location == VARYING_SLOT_POS) {
> +         reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
> +                                            ir->data.origin_upper_left);
> +      } else if (ir->data.location == VARYING_SLOT_FACE) {
> +        reg = emit_frontfacing_interpolation();
> +      } else {
> +         reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> +         emit_general_interpolation(*reg, ir->name, ir->type,
> +                                    (glsl_interp_qualifier) ir->data.interpolation,
> +                                    ir->data.location, ir->data.centroid,
> +                                    ir->data.sample);
> +      }
> +      assert(reg);
> +      hash_table_insert(this->variable_ht, reg, ir);
> +      return;
> +   } else if (ir->data.mode == ir_var_shader_out) {
> +      reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> +
> +      if (stage == MESA_SHADER_VERTEX) {
> +        int vector_elements =
> +           ir->type->is_array() ? ir->type->fields.array->vector_elements
> +                                : ir->type->vector_elements;
> +
> +        for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
> +           int output = ir->data.location + i;
> +           this->outputs[output] = *reg;
> +           this->outputs[output].reg_offset = i * 4;
> +           this->output_components[output] = vector_elements;
> +        }
> +
> +      } else if (ir->data.index > 0) {
> +        assert(ir->data.location == FRAG_RESULT_DATA0);
> +        assert(ir->data.index == 1);
> +        this->dual_src_output = *reg;
> +         this->do_dual_src = true;
> +      } else if (ir->data.location == FRAG_RESULT_COLOR) {
> +        /* Writing gl_FragColor outputs to all color regions. */
> +         assert(stage == MESA_SHADER_FRAGMENT);
> +         brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> +        for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
> +           this->outputs[i] = *reg;
> +           this->output_components[i] = 4;
> +        }
> +      } else if (ir->data.location == FRAG_RESULT_DEPTH) {
> +        this->frag_depth = *reg;
> +      } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
> +         this->sample_mask = *reg;
> +      } else {
> +        /* gl_FragData or a user-defined FS output */
> +        assert(ir->data.location >= FRAG_RESULT_DATA0 &&
> +               ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
> +
> +        int vector_elements =
> +           ir->type->is_array() ? ir->type->fields.array->vector_elements
> +                                : ir->type->vector_elements;
> +
> +        /* General color output. */
> +        for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
> +           int output = ir->data.location - FRAG_RESULT_DATA0 + i;
> +           this->outputs[output] = offset(*reg, vector_elements * i);
> +           this->output_components[output] = vector_elements;
> +        }
> +      }
> +   } else if (ir->data.mode == ir_var_uniform) {
> +      int param_index = uniforms;
> +
> +      /* Thanks to the lower_ubo_reference pass, we will see only
> +       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
> +       * variables, so no need for them to be in variable_ht.
> +       *
> +       * Some uniforms, such as samplers and atomic counters, have no actual
> +       * storage, so we should ignore them.
> +       */
> +      if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
> +         return;
> +
> +      if (dispatch_width == 16) {
> +        if (!variable_storage(ir)) {
> +           fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
> +        }
> +        return;
> +      }
> +
> +      param_size[param_index] = type_size(ir->type);
> +      if (!strncmp(ir->name, "gl_", 3)) {
> +        setup_builtin_uniform_values(ir);
> +      } else {
> +        setup_uniform_values(ir);
> +      }
> +
> +      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
> +      reg->type = brw_type_for_base_type(ir->type);
> +
> +   } else if (ir->data.mode == ir_var_system_value) {
> +      switch (ir->data.location) {
> +      case SYSTEM_VALUE_BASE_VERTEX:
> +      case SYSTEM_VALUE_VERTEX_ID:
> +      case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> +      case SYSTEM_VALUE_INSTANCE_ID:
> +         reg = emit_vs_system_value(ir->data.location);
> +         break;
> +      case SYSTEM_VALUE_SAMPLE_POS:
> +        reg = emit_samplepos_setup();
> +         break;
> +      case SYSTEM_VALUE_SAMPLE_ID:
> +        reg = emit_sampleid_setup();
> +         break;
> +      case SYSTEM_VALUE_SAMPLE_MASK_IN:
> +         assert(brw->gen >= 7);
> +         reg = new(mem_ctx)
> +            fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
> +                          BRW_REGISTER_TYPE_D));
> +         break;
> +      }
> +   }
> +
> +   if (!reg)
> +      reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> +
> +   hash_table_insert(this->variable_ht, reg, ir);
> +}
> +
> +void
> +fs_god::visit(ir_dereference_variable *ir)
> +{
> +   fs_reg *reg = variable_storage(ir->var);
> +
> +   if (!reg) {
> +      fail("Failed to find variable storage for %s\n", ir->var->name);
> +      this->result = fs_reg(reg_null_d);
> +      return;
> +   }
> +   this->result = *reg;
> +}
> +
> +void
> +fs_god::visit(ir_dereference_record *ir)
> +{
> +   const glsl_type *struct_type = ir->record->type;
> +
> +   ir->record->accept(this);
> +
> +   unsigned int off = 0;
> +   for (unsigned int i = 0; i < struct_type->length; i++) {
> +      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
> +        break;
> +      off += type_size(struct_type->fields.structure[i].type);
> +   }
> +   this->result = offset(this->result, off);
> +   this->result.type = brw_type_for_base_type(ir->type);
> +}
> +
> +void
> +fs_god::visit(ir_dereference_array *ir)
> +{
> +   ir_constant *constant_index;
> +   fs_reg src;
> +   int element_size = type_size(ir->type);
> +
> +   constant_index = ir->array_index->as_constant();
> +
> +   ir->array->accept(this);
> +   src = this->result;
> +   src.type = brw_type_for_base_type(ir->type);
> +
> +   if (constant_index) {
> +      if (src.file == ATTR) {
> +         /* Attribute arrays get loaded as one vec4 per element.  In that case
> +          * offset the source register.
> +          */
> +         src.reg += constant_index->value.i[0];
> +      } else {
> +         assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
> +         src = offset(src, constant_index->value.i[0] * element_size);
> +      }
> +   } else {
> +      /* Variable index array dereference.  We attach the variable index
> +       * component to the reg as a pointer to a register containing the
> +       * offset.  Currently only uniform arrays are supported in this patch,
> +       * and that reladdr pointer is resolved by
> +       * move_uniform_array_access_to_pull_constants().  All other array types
> +       * are lowered by lower_variable_index_to_cond_assign().
> +       */
> +      ir->array_index->accept(this);
> +
> +      fs_reg index_reg;
> +      index_reg = vgrf(glsl_type::int_type);
> +      emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
> +
> +      if (src.reladdr) {
> +         emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
> +      }
> +
> +      src.reladdr = ralloc(mem_ctx, fs_reg);
> +      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
> +   }
> +   this->result = src;
> +}
> +
> +fs_inst *
> +fs_god::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
> +                     const fs_reg &a)
> +{
> +   if (brw->gen < 6) {
> +      /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
> +      fs_reg y_times_a           = vgrf(glsl_type::float_type);
> +      fs_reg one_minus_a         = vgrf(glsl_type::float_type);
> +      fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
> +
> +      emit(MUL(y_times_a, y, a));
> +
> +      fs_reg negative_a = a;
> +      negative_a.negate = !a.negate;
> +      emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
> +      emit(MUL(x_times_one_minus_a, x, one_minus_a));
> +
> +      return emit(ADD(dst, x_times_one_minus_a, y_times_a));
> +   } else {
> +      /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
> +       * we need to reorder the operands.
> +       */
> +      return emit(LRP(dst, a, y, x));
> +   }
> +}
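
The operand reordering in the gen6 path is easy to get backwards, so
here's a standalone sanity check (not Mesa code) that both paths agree,
taking the LRP semantics from the comment above, op1 * op0 + op2 * (1 - op0):

   #include <cassert>
   #include <cmath>

   /* LRP as the comment above describes it. */
   static float lrp_hw(float op0, float op1, float op2)
   { return op1 * op0 + op2 * (1.0f - op0); }

   int main()
   {
      float x = 2.0f, y = 6.0f, a = 0.25f;
      float pre_gen6 = x * (1.0f - a) + y * a;    /* expansion path    */
      float gen6     = lrp_hw(a, y, x);           /* LRP(dst, a, y, x) */
      assert(std::fabs(pre_gen6 - gen6) < 1e-6f); /* both 3.0          */
      return 0;
   }
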
> +
> +void
> +fs_god::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
> +                        const fs_reg &src0, const fs_reg &src1)
> +{
> +   assert(conditionalmod == BRW_CONDITIONAL_GE ||
> +          conditionalmod == BRW_CONDITIONAL_L);
> +
> +   fs_inst *inst;
> +
> +   if (brw->gen >= 6) {
> +      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> +      inst->conditional_mod = conditionalmod;
> +   } else {
> +      emit(CMP(reg_null_d, src0, src1, conditionalmod));
> +
> +      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> +      inst->predicate = BRW_PREDICATE_NORMAL;
> +   }
> +}
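
A scalar model (again, not Mesa code) of what the two paths above agree
on: SEL.l keeps the smaller source for MIN, SEL.ge the larger for MAX;
on gen4/5 the same comparison just runs through CMP plus a predicated SEL:

   #include <cassert>

   /* SEL with a conditional mod: keep src0 when the comparison holds. */
   static float sel_cmod(bool ge, float src0, float src1)
   { return (ge ? src0 >= src1 : src0 < src1) ? src0 : src1; }

   int main()
   {
      assert(sel_cmod(false, 4.0f, 9.0f) == 4.0f); /* SEL.l  == MIN */
      assert(sel_cmod(true,  4.0f, 9.0f) == 9.0f); /* SEL.ge == MAX */
      return 0;
   }
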
> +
> +bool
> +fs_god::try_emit_saturate(ir_expression *ir)
> +{
> +   if (ir->operation != ir_unop_saturate)
> +      return false;
> +
> +   ir_rvalue *sat_val = ir->operands[0];
> +
> +   fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
> +
> +   sat_val->accept(this);
> +   fs_reg src = this->result;
> +
> +   fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
> +
> +   /* If the last instruction from our accept() generated our
> +    * src, just set the saturate flag instead of emitting a separate mov.
> +    */
> +   fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
> +   if (modify && modify->regs_written == modify->dst.width / 8 &&
> +       modify->can_do_saturate()) {
> +      modify->saturate = true;
> +      this->result = src;
> +      return true;
> +   }
> +
> +   return false;
> +}
> +
> +bool
> +fs_god::try_emit_line(ir_expression *ir)
> +{
> +   /* LINE's src0 must be of type float. */
> +   if (ir->type != glsl_type::float_type)
> +      return false;
> +
> +   ir_rvalue *nonmul = ir->operands[1];
> +   ir_expression *mul = ir->operands[0]->as_expression();
> +
> +   if (!mul || mul->operation != ir_binop_mul) {
> +      nonmul = ir->operands[0];
> +      mul = ir->operands[1]->as_expression();
> +
> +      if (!mul || mul->operation != ir_binop_mul)
> +         return false;
> +   }
> +
> +   ir_constant *const_add = nonmul->as_constant();
> +   if (!const_add)
> +      return false;
> +
> +   int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
> +   if (add_operand_vf == -1)
> +      return false;
> +
> +   ir_rvalue *non_const_mul = mul->operands[1];
> +   ir_constant *const_mul = mul->operands[0]->as_constant();
> +   if (!const_mul) {
> +      const_mul = mul->operands[1]->as_constant();
> +
> +      if (!const_mul)
> +         return false;
> +
> +      non_const_mul = mul->operands[0];
> +   }
> +
> +   int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
> +   if (mul_operand_vf == -1)
> +      return false;
> +
> +   non_const_mul->accept(this);
> +   fs_reg src1 = this->result;
> +
> +   fs_reg src0 = vgrf(ir->type);
> +   emit(BRW_OPCODE_MOV, src0,
> +        fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));
> +
> +   this->result = vgrf(ir->type);
> +   emit(BRW_OPCODE_LINE, this->result, src0, src1);
> +   return true;
> +}
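> +
> +/* The pattern matched above is a*x + b with both constants encodable in a
> + * single VF immediate (four packed 8-bit restricted floats, hence the
> + * brw_float_to_vf() == -1 bail-outs); LINE then computes roughly
> + * dst = src0.0 * src1 + src0.3, with the multiplier in component 0 and the
> + * addend in component 3 of the immediate.
> + */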
> +
> +bool
> +fs_god::try_emit_mad(ir_expression *ir)
> +{
> +   /* 3-src instructions were introduced in gen6. */
> +   if (brw->gen < 6)
> +      return false;
> +
> +   /* MAD can only handle floating-point data. */
> +   if (ir->type != glsl_type::float_type)
> +      return false;
> +
> +   ir_rvalue *nonmul;
> +   ir_expression *mul;
> +   bool mul_negate, mul_abs;
> +
> +   for (int i = 0; i < 2; i++) {
> +      mul_negate = false;
> +      mul_abs = false;
> +
> +      mul = ir->operands[i]->as_expression();
> +      nonmul = ir->operands[1 - i];
> +
> +      if (mul && mul->operation == ir_unop_abs) {
> +         mul = mul->operands[0]->as_expression();
> +         mul_abs = true;
> +      } else if (mul && mul->operation == ir_unop_neg) {
> +         mul = mul->operands[0]->as_expression();
> +         mul_negate = true;
> +      }
> +
> +      if (mul && mul->operation == ir_binop_mul)
> +         break;
> +   }
> +
> +   if (!mul || mul->operation != ir_binop_mul)
> +      return false;
> +
> +   nonmul->accept(this);
> +   fs_reg src0 = this->result;
> +
> +   mul->operands[0]->accept(this);
> +   fs_reg src1 = this->result;
> +   src1.negate ^= mul_negate;
> +   src1.abs = mul_abs;
> +   if (mul_abs)
> +      src1.negate = false;
> +
> +   mul->operands[1]->accept(this);
> +   fs_reg src2 = this->result;
> +   src2.abs = mul_abs;
> +   if (mul_abs)
> +      src2.negate = false;
> +
> +   this->result = vgrf(ir->type);
> +   emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
> +
> +   return true;
> +}
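> +
> +/* E.g. z + x*y (in either operand order) becomes MAD(dst, z, x, y), since
> + * the hardware MAD computes src1 * src2 + src0.  The loop above also peels
> + * a single neg()/abs() off the multiply so that z + -(x*y) and
> + * z + abs(x*y) can be expressed with source modifiers instead of extra
> + * instructions.
> + */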
> +
> +bool
> +fs_god::try_emit_b2f_of_comparison(ir_expression *ir)
> +{
> +   /* On platforms that do not natively generate 0u and ~0u for Boolean
> +    * results, b2f expressions that look like
> +    *
> +    *     f = b2f(expr cmp 0)
> +    *
> +    * will generate better code by pretending the expression is
> +    *
> +    *     f = ir_triop_csel(0.0, 1.0, expr cmp 0)
> +    *
> +    * This is because the last instruction of "expr" can generate the
> +    * condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
> +    * trick to generate 0u or ~0u for the Boolean result.  This means code like
> +    *
> +    *     mov(16)         g16<1>F         1F
> +    *     mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
> +    *     (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F
> +    *
> +    * will be generated instead of
> +    *
> +    *     mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
> +    *     cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
> +    *     and(16)         g4<1>D          g2<8,8,1>D      1D
> +    *     and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD
> +    *
> +    * When the comparison is either == 0.0 or != 0.0, using the knowledge
> +    * that the true (or false) case already results in zero would allow
> +    * better code generation by possibly avoiding a load-immediate
> +    * instruction.
> +    */
> +   ir_expression *cmp = ir->operands[0]->as_expression();
> +   if (cmp == NULL)
> +      return false;
> +
> +   if (cmp->operation == ir_binop_equal || cmp->operation == ir_binop_nequal) {
> +      for (unsigned i = 0; i < 2; i++) {
> +         ir_constant *c = cmp->operands[i]->as_constant();
> +         if (c == NULL || !c->is_zero())
> +            continue;
> +
> +         ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
> +         if (expr != NULL) {
> +            fs_reg op[2];
> +
> +            for (unsigned j = 0; j < 2; j++) {
> +               cmp->operands[j]->accept(this);
> +               op[j] = this->result;
> +
> +               resolve_ud_negate(&op[j]);
> +            }
> +
> +            emit_bool_to_cond_code_of_reg(cmp, op);
> +
> +            /* In this case we know when the condition is true, op[i ^ 1]
> +             * contains zero.  Invert the predicate, use op[i ^ 1] as src0,
> +             * and immediate 1.0f as src1.
> +             */
> +            this->result = vgrf(ir->type);
> +            op[i ^ 1].type = BRW_REGISTER_TYPE_F;
> +
> +            fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
> +            inst->predicate = BRW_PREDICATE_NORMAL;
> +            inst->predicate_inverse = cmp->operation == ir_binop_equal;
> +            return true;
> +         }
> +      }
> +   }
> +
> +   emit_bool_to_cond_code(cmp);
> +
> +   fs_reg temp = vgrf(ir->type);
> +   emit(MOV(temp, fs_reg(1.0f)));
> +
> +   this->result = vgrf(ir->type);
> +   fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
> +   inst->predicate = BRW_PREDICATE_NORMAL;
> +
> +   return true;
> +}
> +
> +static int
> +pack_pixel_offset(float x)
> +{
> +   /* Clamp upper end of the range to +7/16. See explanation in non-constant
> +    * offset case below. */
> +   int n = MIN2((int)(x * 16), 7);
> +   return n & 0xf;
> +}
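> +
> +/* E.g. pack_pixel_offset(0.5f) is clamped to 7 (+7/16), while
> + * pack_pixel_offset(-0.5f) yields 0x8, the S0.4 encoding of -8/16; only
> + * the upper end of the range needs clamping.
> + */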
> +
> +void
> +fs_god::emit_interpolate_expression(ir_expression *ir)
> +{
> +   /* In SIMD16 mode, the pixel interpolator returns coords interleaved
> +    * 8 channels at a time, same as the barycentric coords presented in
> +    * the FS payload.  This requires a bit of extra work to support.
> +    */
> +   no16("interpolate_at_* not yet supported in SIMD16 mode.");
> +
> +   assert(stage == MESA_SHADER_FRAGMENT);
> +   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> +
> +   ir_dereference * deref = ir->operands[0]->as_dereference();
> +   ir_swizzle * swiz = NULL;
> +   if (!deref) {
> +      /* The API does not allow a swizzle here, but the varying packing
> +       * code may have pushed one in.
> +       */
> +      swiz = ir->operands[0]->as_swizzle();
> +      assert(swiz);
> +      deref = swiz->val->as_dereference();
> +   }
> +   assert(deref);
> +   ir_variable * var = deref->variable_referenced();
> +   assert(var);
> +
> +   /* 1. collect interpolation factors */
> +
> +   fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
> +   fs_reg dst_y = offset(dst_x, 1);
> +
> +   /* For most messages, we need one reg of ignored data; the hardware
> +    * requires mlen==1 even when there is no payload.  In the per-slot
> +    * offset case, we'll replace this with the proper source data.
> +    */
> +   fs_reg src = vgrf(glsl_type::float_type);
> +   int mlen = 1;     /* one reg unless overridden */
> +   int reg_width = dispatch_width / 8;
> +   fs_inst *inst;
> +
> +   switch (ir->operation) {
> +   case ir_unop_interpolate_at_centroid:
> +      inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
> +      break;
> +
> +   case ir_binop_interpolate_at_sample: {
> +      ir_constant *sample_num = ir->operands[1]->as_constant();
> +      assert(sample_num || !"nonconstant sample number should have been lowered.");
> +
> +      unsigned msg_data = sample_num->value.i[0] << 4;
> +      inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
> +      break;
> +   }
> +
> +   case ir_binop_interpolate_at_offset: {
> +      ir_constant *const_offset = ir->operands[1]->as_constant();
> +      if (const_offset) {
> +         unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
> +                            (pack_pixel_offset(const_offset->value.f[1]) << 4);
> +         inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
> +                     fs_reg(msg_data));
> +      } else {
> +         /* Pack the operands: the hardware wants offsets as 4-bit signed ints. */
> +         ir->operands[1]->accept(this);
> +         src = vgrf(glsl_type::ivec2_type);
> +         fs_reg src2 = src;
> +         for (int i = 0; i < 2; i++) {
> +            fs_reg temp = vgrf(glsl_type::float_type);
> +            emit(MUL(temp, this->result, fs_reg(16.0f)));
> +            emit(MOV(src2, temp));  /* float to int */
> +
> +            /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
> +             * that we support a maximum offset of +0.5, which isn't representable
> +             * in a S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
> +             * which is the opposite of what the shader author wanted.
> +             *
> +             * This is legal due to ARB_gpu_shader5's quantization rules:
> +             *
> +             * "Not all values of <offset> may be supported; x and y offsets may
> +             * be rounded to fixed-point values with the number of fraction bits
> +             * given by the implementation-dependent constant
> +             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> +             */
> +
> +            fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
> +            inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
> +
> +            src2 = offset(src2, 1);
> +            this->result = offset(this->result, 1);
> +         }
> +
> +         mlen = 2 * reg_width;
> +         inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
> +                     fs_reg(0u));
> +      }
> +      break;
> +   }
> +
> +   default:
> +      unreachable("not reached");
> +   }
> +
> +   inst->mlen = mlen;
> +   inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
> +   inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
> +         INTERP_QUALIFIER_NOPERSPECTIVE;
> +
> +   /* 2. emit linterp */
> +
> +   fs_reg res = vgrf(ir->type);
> +   this->result = res;
> +
> +   for (int i = 0; i < ir->type->vector_elements; i++) {
> +      int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
> +      emit(FS_OPCODE_LINTERP, res,
> +           dst_x, dst_y,
> +           fs_reg(interp_reg(var->data.location, ch)));
> +      res = offset(res, 1);
> +   }
> +}
> +
> +void
> +fs_god::visit(ir_expression *ir)
> +{
> +   unsigned int operand;
> +   fs_reg op[3], temp;
> +   fs_inst *inst;
> +   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
> +
> +   assert(ir->get_num_operands() <= 3);
> +
> +   if (try_emit_saturate(ir))
> +      return;
> +
> +   /* Deal with the real oddball stuff first */
> +   switch (ir->operation) {
> +   case ir_binop_add:
> +      if (brw->gen <= 5 && try_emit_line(ir))
> +         return;
> +      if (try_emit_mad(ir))
> +         return;
> +      break;
> +
> +   case ir_triop_csel:
> +      ir->operands[1]->accept(this);
> +      op[1] = this->result;
> +      ir->operands[2]->accept(this);
> +      op[2] = this->result;
> +
> +      emit_bool_to_cond_code(ir->operands[0]);
> +
> +      this->result = vgrf(ir->type);
> +      inst = emit(SEL(this->result, op[1], op[2]));
> +      inst->predicate = BRW_PREDICATE_NORMAL;
> +      return;
> +
> +   case ir_unop_b2f:
> +      if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir))
> +         return;
> +      break;
> +
> +   case ir_unop_interpolate_at_centroid:
> +   case ir_binop_interpolate_at_offset:
> +   case ir_binop_interpolate_at_sample:
> +      emit_interpolate_expression(ir);
> +      return;
> +
> +   default:
> +      break;
> +   }
> +
> +   for (operand = 0; operand < ir->get_num_operands(); operand++) {
> +      ir->operands[operand]->accept(this);
> +      if (this->result.file == BAD_FILE) {
> +        fail("Failed to get tree for expression operand:\n");
> +        ir->operands[operand]->fprint(stderr);
> +         fprintf(stderr, "\n");
> +      }
> +      assert(this->result.file == GRF ||
> +             this->result.file == UNIFORM || this->result.file == ATTR);
> +      op[operand] = this->result;
> +
> +      /* Matrix expression operands should have been broken down to vector
> +       * operations already.
> +       */
> +      assert(!ir->operands[operand]->type->is_matrix());
> +      /* And then those vector operands should have been broken down to scalar.
> +       */
> +      assert(!ir->operands[operand]->type->is_vector());
> +   }
> +
> +   /* Storage for our result.  If our result goes into an assignment, it will
> +    * just get copy-propagated out, so no worries.
> +    */
> +   this->result = vgrf(ir->type);
> +
> +   switch (ir->operation) {
> +   case ir_unop_logic_not:
> +      emit(NOT(this->result, op[0]));
> +      break;
> +   case ir_unop_neg:
> +      op[0].negate = !op[0].negate;
> +      emit(MOV(this->result, op[0]));
> +      break;
> +   case ir_unop_abs:
> +      op[0].abs = true;
> +      op[0].negate = false;
> +      emit(MOV(this->result, op[0]));
> +      break;
> +   case ir_unop_sign:
> +      if (ir->type->is_float()) {
> +         /* AND(val, 0x80000000) gives the sign bit.
> +          *
> +          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
> +          * zero.
> +          */
> +         emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> +
> +         op[0].type = BRW_REGISTER_TYPE_UD;
> +         this->result.type = BRW_REGISTER_TYPE_UD;
> +         emit(AND(this->result, op[0], fs_reg(0x80000000u)));
> +
> +         inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
> +         inst->predicate = BRW_PREDICATE_NORMAL;
> +
> +         this->result.type = BRW_REGISTER_TYPE_F;
> +      } else {
> +         /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
> +          *               -> non-negative val generates 0x00000000.
> +          *  Predicated OR sets 1 if val is positive.
> +          */
> +         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
> +
> +         emit(ASR(this->result, op[0], fs_reg(31)));
> +
> +         inst = emit(OR(this->result, this->result, fs_reg(1)));
> +         inst->predicate = BRW_PREDICATE_NORMAL;
> +      }
> +      break;
> +   case ir_unop_rcp:
> +      emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
> +      break;
> +
> +   case ir_unop_exp2:
> +      emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
> +      break;
> +   case ir_unop_log2:
> +      emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
> +      break;
> +   case ir_unop_exp:
> +   case ir_unop_log:
> +      unreachable("not reached: should be handled by ir_explog_to_explog2");
> +   case ir_unop_sin:
> +   case ir_unop_sin_reduced:
> +      emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
> +      break;
> +   case ir_unop_cos:
> +   case ir_unop_cos_reduced:
> +      emit_math(SHADER_OPCODE_COS, this->result, op[0]);
> +      break;
> +
> +   case ir_unop_dFdx:
> +      /* Select one of the two opcodes based on the glHint value. */
> +      if (fs_key->high_quality_derivatives)
> +         emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
> +      else
> +         emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
> +      break;
> +
> +   case ir_unop_dFdx_coarse:
> +      emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
> +      break;
> +
> +   case ir_unop_dFdx_fine:
> +      emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
> +      break;
> +
> +   case ir_unop_dFdy:
> +      /* Select one of the two opcodes based on the glHint value. */
> +      if (fs_key->high_quality_derivatives)
> +         emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
> +      else
> +         emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
> +      break;
> +
> +   case ir_unop_dFdy_coarse:
> +      emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
> +      break;
> +
> +   case ir_unop_dFdy_fine:
> +      emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
> +      break;
> +
> +   case ir_binop_add:
> +      emit(ADD(this->result, op[0], op[1]));
> +      break;
> +   case ir_binop_sub:
> +      unreachable("not reached: should be handled by ir_sub_to_add_neg");
> +
> +   case ir_binop_mul:
> +      if (brw->gen < 8 && ir->type->is_integer()) {
> +        /* For integer multiplication, the MUL uses the low 16 bits of one
> +         * of the operands (src0 on gen6, src1 on gen7).  MACH accumulates
> +         * the contribution of the upper 16 bits of that operand.
> +         */
> +         if (ir->operands[0]->is_uint16_constant()) {
> +            if (brw->gen < 7)
> +               emit(MUL(this->result, op[0], op[1]));
> +            else
> +               emit(MUL(this->result, op[1], op[0]));
> +         } else if (ir->operands[1]->is_uint16_constant()) {
> +            if (brw->gen < 7)
> +               emit(MUL(this->result, op[1], op[0]));
> +            else
> +               emit(MUL(this->result, op[0], op[1]));
> +         } else {
> +            if (brw->gen >= 7)
> +               no16("SIMD16 explicit accumulator operands unsupported\n");
> +
> +            struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> +                                        this->result.type);
> +
> +            emit(MUL(acc, op[0], op[1]));
> +            emit(MACH(reg_null_d, op[0], op[1]));
> +            emit(MOV(this->result, fs_reg(acc)));
> +         }
> +      } else {
> +        emit(MUL(this->result, op[0], op[1]));
> +      }
> +      break;
> +   case ir_binop_imul_high: {
> +      if (brw->gen == 7)
> +         no16("SIMD16 explicit accumulator operands unsupported\n");
> +
> +      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> +                                  this->result.type);
> +
> +      fs_inst *mul = emit(MUL(acc, op[0], op[1]));
> +      emit(MACH(this->result, op[0], op[1]));
> +
> +      /* Until Gen8, integer multiplies read 32 bits from one source and
> +       * 16 bits from the other, relying on the MACH instruction to
> +       * generate the high bits of the result.
> +       *
> +       * On Gen8, the multiply instruction does a full 32x32-bit multiply,
> +       * but in order to compute the high 32 bits of the 64-bit result we
> +       * have to simulate the previous behavior and then use a MACH
> +       * instruction.
> +       *
> +       * FINISHME: Don't use source modifiers on src1.
> +       */
> +      if (brw->gen >= 8) {
> +         assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
> +                mul->src[1].type == BRW_REGISTER_TYPE_UD);
> +         if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
> +            mul->src[1].type = BRW_REGISTER_TYPE_W;
> +         } else {
> +            mul->src[1].type = BRW_REGISTER_TYPE_UW;
> +         }
> +      }
> +
> +      break;
> +   }
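> +   /* (For intuition: with unsigned operands, 0x10000 * 0x10000 = 2^32, so
> +    * the MACH above writes the high dword, 1, to the result.)
> +    */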
> +   case ir_binop_div:
> +      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
> +      assert(ir->type->is_integer());
> +      emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
> +      break;
> +   case ir_binop_carry: {
> +      if (brw->gen == 7)
> +         no16("SIMD16 explicit accumulator operands unsupported\n");
> +
> +      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> +                                  BRW_REGISTER_TYPE_UD);
> +
> +      emit(ADDC(reg_null_ud, op[0], op[1]));
> +      emit(MOV(this->result, fs_reg(acc)));
> +      break;
> +   }
> +   case ir_binop_borrow: {
> +      if (brw->gen == 7)
> +         no16("SIMD16 explicit accumulator operands unsupported\n");
> +
> +      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> +                                  BRW_REGISTER_TYPE_UD);
> +
> +      emit(SUBB(reg_null_ud, op[0], op[1]));
> +      emit(MOV(this->result, fs_reg(acc)));
> +      break;
> +   }
> +   case ir_binop_mod:
> +      /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
> +      assert(ir->type->is_integer());
> +      emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
> +      break;
> +
> +   case ir_binop_less:
> +   case ir_binop_greater:
> +   case ir_binop_lequal:
> +   case ir_binop_gequal:
> +   case ir_binop_equal:
> +   case ir_binop_all_equal:
> +   case ir_binop_nequal:
> +   case ir_binop_any_nequal:
> +      if (brw->gen <= 5) {
> +         resolve_bool_comparison(ir->operands[0], &op[0]);
> +         resolve_bool_comparison(ir->operands[1], &op[1]);
> +      }
> +
> +      emit(CMP(this->result, op[0], op[1],
> +               brw_conditional_for_comparison(ir->operation)));
> +      break;
> +
> +   case ir_binop_logic_xor:
> +      emit(XOR(this->result, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_logic_or:
> +      emit(OR(this->result, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_logic_and:
> +      emit(AND(this->result, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_dot:
> +   case ir_unop_any:
> +      unreachable("not reached: should be handled by brw_fs_channel_expressions");
> +
> +   case ir_unop_noise:
> +      unreachable("not reached: should be handled by lower_noise");
> +
> +   case ir_quadop_vector:
> +      unreachable("not reached: should be handled by lower_quadop_vector");
> +
> +   case ir_binop_vector_extract:
> +      unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
> +
> +   case ir_triop_vector_insert:
> +      unreachable("not reached: should be handled by lower_vector_insert()");
> +
> +   case ir_binop_ldexp:
> +      unreachable("not reached: should be handled by ldexp_to_arith()");
> +
> +   case ir_unop_sqrt:
> +      emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
> +      break;
> +
> +   case ir_unop_rsq:
> +      emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
> +      break;
> +
> +   case ir_unop_bitcast_i2f:
> +   case ir_unop_bitcast_u2f:
> +      op[0].type = BRW_REGISTER_TYPE_F;
> +      this->result = op[0];
> +      break;
> +   case ir_unop_i2u:
> +   case ir_unop_bitcast_f2u:
> +      op[0].type = BRW_REGISTER_TYPE_UD;
> +      this->result = op[0];
> +      break;
> +   case ir_unop_u2i:
> +   case ir_unop_bitcast_f2i:
> +      op[0].type = BRW_REGISTER_TYPE_D;
> +      this->result = op[0];
> +      break;
> +   case ir_unop_i2f:
> +   case ir_unop_u2f:
> +   case ir_unop_f2i:
> +   case ir_unop_f2u:
> +      emit(MOV(this->result, op[0]));
> +      break;
> +
> +   case ir_unop_b2i:
> +      emit(AND(this->result, op[0], fs_reg(1)));
> +      break;
> +   case ir_unop_b2f:
> +      if (brw->gen <= 5) {
> +         resolve_bool_comparison(ir->operands[0], &op[0]);
> +      }
> +      op[0].type = BRW_REGISTER_TYPE_D;
> +      this->result.type = BRW_REGISTER_TYPE_D;
> +      emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
> +      this->result.type = BRW_REGISTER_TYPE_F;
> +      break;
> +
> +   case ir_unop_f2b:
> +      emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> +      break;
> +   case ir_unop_i2b:
> +      emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> +      break;
> +
> +   case ir_unop_trunc:
> +      emit(RNDZ(this->result, op[0]));
> +      break;
> +   case ir_unop_ceil: {
> +         fs_reg tmp = vgrf(ir->type);
> +         op[0].negate = !op[0].negate;
> +         emit(RNDD(tmp, op[0]));
> +         tmp.negate = true;
> +         emit(MOV(this->result, tmp));
> +      }
> +      break;
> +   case ir_unop_floor:
> +      emit(RNDD(this->result, op[0]));
> +      break;
> +   case ir_unop_fract:
> +      emit(FRC(this->result, op[0]));
> +      break;
> +   case ir_unop_round_even:
> +      emit(RNDE(this->result, op[0]));
> +      break;
> +
> +   case ir_binop_min:
> +   case ir_binop_max:
> +      resolve_ud_negate(&op[0]);
> +      resolve_ud_negate(&op[1]);
> +      emit_minmax(ir->operation == ir_binop_min ?
> +                  BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
> +                  this->result, op[0], op[1]);
> +      break;
> +   case ir_unop_pack_snorm_2x16:
> +   case ir_unop_pack_snorm_4x8:
> +   case ir_unop_pack_unorm_2x16:
> +   case ir_unop_pack_unorm_4x8:
> +   case ir_unop_unpack_snorm_2x16:
> +   case ir_unop_unpack_snorm_4x8:
> +   case ir_unop_unpack_unorm_2x16:
> +   case ir_unop_unpack_unorm_4x8:
> +   case ir_unop_unpack_half_2x16:
> +   case ir_unop_pack_half_2x16:
> +      unreachable("not reached: should be handled by lower_packing_builtins");
> +   case ir_unop_unpack_half_2x16_split_x:
> +      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
> +      break;
> +   case ir_unop_unpack_half_2x16_split_y:
> +      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
> +      break;
> +   case ir_binop_pow:
> +      emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
> +      break;
> +
> +   case ir_unop_bitfield_reverse:
> +      emit(BFREV(this->result, op[0]));
> +      break;
> +   case ir_unop_bit_count:
> +      emit(CBIT(this->result, op[0]));
> +      break;
> +   case ir_unop_find_msb:
> +      temp = vgrf(glsl_type::uint_type);
> +      emit(FBH(temp, op[0]));
> +
> +      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
> +       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
> +       * subtract the result from 31 to convert the MSB count into an LSB count.
> +       */
> +
> +      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
> +      emit(MOV(this->result, temp));
> +      emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
> +
> +      temp.negate = true;
> +      inst = emit(ADD(this->result, temp, fs_reg(31)));
> +      inst->predicate = BRW_PREDICATE_NORMAL;
> +      break;
> +   case ir_unop_find_lsb:
> +      emit(FBL(this->result, op[0]));
> +      break;
> +   case ir_unop_saturate:
> +      inst = emit(MOV(this->result, op[0]));
> +      inst->saturate = true;
> +      break;
> +   case ir_triop_bitfield_extract:
> +      /* Note that the instruction's argument order is reversed from GLSL
> +       * and the IR.
> +       */
> +      emit(BFE(this->result, op[2], op[1], op[0]));
> +      break;
> +   case ir_binop_bfm:
> +      emit(BFI1(this->result, op[0], op[1]));
> +      break;
> +   case ir_triop_bfi:
> +      emit(BFI2(this->result, op[0], op[1], op[2]));
> +      break;
> +   case ir_quadop_bitfield_insert:
> +      unreachable("not reached: should be handled by "
> +              "lower_instructions::bitfield_insert_to_bfm_bfi");
> +
> +   case ir_unop_bit_not:
> +      emit(NOT(this->result, op[0]));
> +      break;
> +   case ir_binop_bit_and:
> +      emit(AND(this->result, op[0], op[1]));
> +      break;
> +   case ir_binop_bit_xor:
> +      emit(XOR(this->result, op[0], op[1]));
> +      break;
> +   case ir_binop_bit_or:
> +      emit(OR(this->result, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_lshift:
> +      emit(SHL(this->result, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_rshift:
> +      if (ir->type->base_type == GLSL_TYPE_INT)
> +        emit(ASR(this->result, op[0], op[1]));
> +      else
> +        emit(SHR(this->result, op[0], op[1]));
> +      break;
> +   case ir_binop_pack_half_2x16_split:
> +      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
> +      break;
> +   case ir_binop_ubo_load: {
> +      /* This IR node takes a constant uniform block and a constant or
> +       * variable byte offset within the block and loads a vector from that.
> +       */
> +      ir_constant *const_uniform_block = ir->operands[0]->as_constant();
> +      ir_constant *const_offset = ir->operands[1]->as_constant();
> +      fs_reg surf_index;
> +
> +      if (const_uniform_block) {
> +         /* The block index is a constant, so just emit the binding table entry
> +          * as an immediate.
> +          */
> +         surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
> +                                 const_uniform_block->value.u[0]);
> +      } else {
> +         /* The block index is not a constant. Evaluate the index expression
> +          * per-channel and add the base UBO index; the generator will select
> +          * a value from any live channel.
> +          */
> +         surf_index = vgrf(glsl_type::uint_type);
> +         emit(ADD(surf_index, op[0],
> +                  fs_reg(stage_prog_data->binding_table.ubo_start)))
> +            ->force_writemask_all = true;
> +
> +         /* Assume this may touch any UBO. It would be nice to provide
> +          * a tighter bound, but the array information is already lowered away.
> +          */
> +         brw_mark_surface_used(prog_data,
> +                               stage_prog_data->binding_table.ubo_start +
> +                               shader_prog->NumUniformBlocks - 1);
> +      }
> +
> +      if (const_offset) {
> +         fs_reg packed_consts = vgrf(glsl_type::float_type);
> +         packed_consts.type = result.type;
> +
> +         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
> +         emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
> +                                   packed_consts, surf_index, const_offset_reg));
> +
> +         for (int i = 0; i < ir->type->vector_elements; i++) {
> +            packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
> +
> +            /* The std140 packing rules don't allow vectors to cross 16-byte
> +             * boundaries, and a reg is 32 bytes.
> +             */
> +            assert(packed_consts.subreg_offset < 32);
> +
> +            /* UBO bools are any nonzero value.  We consider bools to be
> +             * values with the low bit set to 1.  Convert them using CMP.
> +             */
> +            if (ir->type->base_type == GLSL_TYPE_BOOL) {
> +               emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
> +            } else {
> +               emit(MOV(result, packed_consts));
> +            }
> +
> +            result = offset(result, 1);
> +         }
> +      } else {
> +         /* Turn the byte offset into a dword offset. */
> +         fs_reg base_offset = vgrf(glsl_type::int_type);
> +         emit(SHR(base_offset, op[1], fs_reg(2)));
> +
> +         for (int i = 0; i < ir->type->vector_elements; i++) {
> +            emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
> +                                            base_offset, i));
> +
> +            if (ir->type->base_type == GLSL_TYPE_BOOL)
> +               emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
> +
> +            result = offset(result, 1);
> +         }
> +      }
> +
> +      result.reg_offset = 0;
> +      break;
> +   }
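> +   /* (For the constant-offset path above: a byte offset of 20 pulls the
> +    * 16-byte chunk at offset 16 and then smears out dword 20 % 16 / 4 = 1,
> +    * plus i for each vector element.)
> +    */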
> +
> +   case ir_triop_fma:
> +      /* Note that the instruction's argument order is reversed from GLSL
> +       * and the IR.
> +       */
> +      emit(MAD(this->result, op[2], op[1], op[0]));
> +      break;
> +
> +   case ir_triop_lrp:
> +      emit_lrp(this->result, op[0], op[1], op[2]);
> +      break;
> +
> +   case ir_triop_csel:
> +   case ir_unop_interpolate_at_centroid:
> +   case ir_binop_interpolate_at_offset:
> +   case ir_binop_interpolate_at_sample:
> +      unreachable("already handled above");
> +      break;
> +
> +   case ir_unop_d2f:
> +   case ir_unop_f2d:
> +   case ir_unop_d2i:
> +   case ir_unop_i2d:
> +   case ir_unop_d2u:
> +   case ir_unop_u2d:
> +   case ir_unop_d2b:
> +   case ir_unop_pack_double_2x32:
> +   case ir_unop_unpack_double_2x32:
> +   case ir_unop_frexp_sig:
> +   case ir_unop_frexp_exp:
> +      unreachable("fp64 todo");
> +      break;
> +   }
> +}
> +
> +void
> +fs_god::emit_assignment_writes(fs_reg &l, fs_reg &r,
> +                                  const glsl_type *type, bool predicated)
> +{
> +   switch (type->base_type) {
> +   case GLSL_TYPE_FLOAT:
> +   case GLSL_TYPE_UINT:
> +   case GLSL_TYPE_INT:
> +   case GLSL_TYPE_BOOL:
> +      for (unsigned int i = 0; i < type->components(); i++) {
> +        l.type = brw_type_for_base_type(type);
> +        r.type = brw_type_for_base_type(type);
> +
> +        if (predicated || !l.equals(r)) {
> +           fs_inst *inst = emit(MOV(l, r));
> +           inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
> +        }
> +
> +        l = offset(l, 1);
> +        r = offset(r, 1);
> +      }
> +      break;
> +   case GLSL_TYPE_ARRAY:
> +      for (unsigned int i = 0; i < type->length; i++) {
> +        emit_assignment_writes(l, r, type->fields.array, predicated);
> +      }
> +      break;
> +
> +   case GLSL_TYPE_STRUCT:
> +      for (unsigned int i = 0; i < type->length; i++) {
> +        emit_assignment_writes(l, r, type->fields.structure[i].type,
> +                               predicated);
> +      }
> +      break;
> +
> +   case GLSL_TYPE_SAMPLER:
> +   case GLSL_TYPE_IMAGE:
> +   case GLSL_TYPE_ATOMIC_UINT:
> +      break;
> +
> +   case GLSL_TYPE_DOUBLE:
> +   case GLSL_TYPE_VOID:
> +   case GLSL_TYPE_ERROR:
> +   case GLSL_TYPE_INTERFACE:
> +      unreachable("not reached");
> +   }
> +}
> +
> +/* If the RHS processing resulted in an instruction generating a
> + * temporary value, and it would be easy to rewrite the instruction to
> + * generate its result right into the LHS instead, do so.  This ends
> + * up reliably removing instructions where it can be tricky to do so
> + * later without real UD chain information.
> + */
> +bool
> +fs_god::try_rewrite_rhs_to_dst(ir_assignment *ir,
> +                                   fs_reg dst,
> +                                   fs_reg src,
> +                                   fs_inst *pre_rhs_inst,
> +                                   fs_inst *last_rhs_inst)
> +{
> +   /* Only attempt if we're doing a direct assignment. */
> +   if (ir->condition ||
> +       !(ir->lhs->type->is_scalar() ||
> +        (ir->lhs->type->is_vector() &&
> +         ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
> +      return false;
> +
> +   /* Make sure the last instruction generated our source reg. */
> +   fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
> +                                                   last_rhs_inst,
> +                                                   src);
> +   if (!modify)
> +      return false;
> +
> +   /* If last_rhs_inst wrote a different number of components than our LHS,
> +    * we can't safely rewrite it.
> +    */
> +   if (alloc.sizes[dst.reg] != modify->regs_written)
> +      return false;
> +
> +   /* Success!  Rewrite the instruction. */
> +   modify->dst = dst;
> +
> +   return true;
> +}
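> +
> +/* E.g. for a direct assignment like v = a + b, the ADD that produced the
> + * RHS temporary is retargeted to write v itself, so no copying MOV is
> + * ever emitted.
> + */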
> +
> +void
> +fs_god::visit(ir_assignment *ir)
> +{
> +   fs_reg l, r;
> +   fs_inst *inst;
> +
> +   /* FINISHME: arrays on the lhs */
> +   ir->lhs->accept(this);
> +   l = this->result;
> +
> +   fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
> +
> +   ir->rhs->accept(this);
> +   r = this->result;
> +
> +   fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
> +
> +   assert(l.file != BAD_FILE);
> +   assert(r.file != BAD_FILE);
> +
> +   if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
> +      return;
> +
> +   if (ir->condition) {
> +      emit_bool_to_cond_code(ir->condition);
> +   }
> +
> +   if (ir->lhs->type->is_scalar() ||
> +       ir->lhs->type->is_vector()) {
> +      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
> +        if (ir->write_mask & (1 << i)) {
> +           inst = emit(MOV(l, r));
> +           if (ir->condition)
> +              inst->predicate = BRW_PREDICATE_NORMAL;
> +           r = offset(r, 1);
> +        }
> +        l = offset(l, 1);
> +      }
> +   } else {
> +      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
> +   }
> +}
> +
> +fs_inst *
> +fs_god::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
> +                              fs_reg coordinate, int coord_components,
> +                              fs_reg shadow_c,
> +                              fs_reg lod, fs_reg dPdy, int grad_components,
> +                              uint32_t sampler)
> +{
> +   int mlen;
> +   int base_mrf = 1;
> +   bool simd16 = false;
> +   fs_reg orig_dst;
> +
> +   /* g0 header. */
> +   mlen = 1;
> +
> +   if (shadow_c.file != BAD_FILE) {
> +      for (int i = 0; i < coord_components; i++) {
> +        emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> +        coordinate = offset(coordinate, 1);
> +      }
> +
> +      /* gen4's SIMD8 sampler always has the slots for u,v,r present.
> +       * The unused slots must be zeroed.
> +       */
> +      for (int i = coord_components; i < 3; i++) {
> +         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
> +      }
> +      mlen += 3;
> +
> +      if (op == ir_tex) {
> +        /* There's no plain shadow compare message, so we use shadow
> +         * compare with a bias of 0.0.
> +         */
> +        emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
> +        mlen++;
> +      } else if (op == ir_txb || op == ir_txl) {
> +        emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
> +        mlen++;
> +      } else {
> +         unreachable("Should not get here.");
> +      }
> +
> +      emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
> +      mlen++;
> +   } else if (op == ir_tex) {
> +      for (int i = 0; i < coord_components; i++) {
> +        emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> +        coordinate = offset(coordinate, 1);
> +      }
> +      /* Zero the others. */
> +      for (int i = coord_components; i < 3; i++) {
> +         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
> +      }
> +      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
> +      mlen += 3;
> +   } else if (op == ir_txd) {
> +      fs_reg &dPdx = lod;
> +
> +      for (int i = 0; i < coord_components; i++) {
> +        emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> +        coordinate = offset(coordinate, 1);
> +      }
> +      /* the slots for u and v are always present, but r is optional */
> +      mlen += MAX2(coord_components, 2);
> +
> +      /*  P   = u, v, r
> +       * dPdx = dudx, dvdx, drdx
> +       * dPdy = dudy, dvdy, drdy
> +       *
> +       * 1-arg: Does not exist.
> +       *
> +       * 2-arg: dudx   dvdx   dudy   dvdy
> +       *        dPdx.x dPdx.y dPdy.x dPdy.y
> +       *        m4     m5     m6     m7
> +       *
> +       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
> +       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
> +       *        m5     m6     m7     m8     m9     m10
> +       */
> +      for (int i = 0; i < grad_components; i++) {
> +        emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
> +        dPdx = offset(dPdx, 1);
> +      }
> +      mlen += MAX2(grad_components, 2);
> +
> +      for (int i = 0; i < grad_components; i++) {
> +        emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
> +        dPdy = offset(dPdy, 1);
> +      }
> +      mlen += MAX2(grad_components, 2);
> +   } else if (op == ir_txs) {
> +      /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
> +      simd16 = true;
> +      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
> +      mlen += 2;
> +   } else {
> +      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
> +       * instructions.  We'll need to do SIMD16 here.
> +       */
> +      simd16 = true;
> +      assert(op == ir_txb || op == ir_txl || op == ir_txf);
> +
> +      for (int i = 0; i < coord_components; i++) {
> +        emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
> +                  coordinate));
> +        coordinate = offset(coordinate, 1);
> +      }
> +
> +      /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
> +       * be necessary for TXF (ld), but seems wise to do for all messages.
> +       */
> +      for (int i = coord_components; i < 3; i++) {
> +        emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
> +      }
> +
> +      /* lod/bias appears after u/v/r. */
> +      mlen += 6;
> +
> +      emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
> +      mlen++;
> +
> +      /* The unused upper half. */
> +      mlen++;
> +   }
> +
> +   if (simd16) {
> +      /* Now, since we're doing SIMD16, the return is 2 interleaved
> +       * vec4s where the odd-indexed ones are junk.  We'll need to move
> +       * this weirdness around to the expected layout.
> +       */
> +      orig_dst = dst;
> +      dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
> +   }
> +
> +   enum opcode opcode;
> +   switch (op) {
> +   case ir_tex: opcode = SHADER_OPCODE_TEX; break;
> +   case ir_txb: opcode = FS_OPCODE_TXB; break;
> +   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> +   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> +   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> +   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> +   default:
> +      unreachable("not reached");
> +   }
> +
> +   fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
> +   inst->base_mrf = base_mrf;
> +   inst->mlen = mlen;
> +   inst->header_present = true;
> +   inst->regs_written = simd16 ? 8 : 4;
> +
> +   if (simd16) {
> +      for (int i = 0; i < 4; i++) {
> +        emit(MOV(orig_dst, dst));
> +        orig_dst = offset(orig_dst, 1);
> +        dst = offset(dst, 2);
> +      }
> +   }
> +
> +   return inst;
> +}
> +
> +/* gen5's sampler has slots for u, v, r, array index, then optional
> + * parameters like the shadow comparator or LOD bias.  If the optional
> + * parameters aren't present, those trailing base slots don't need to be
> + * included in the message.
> + *
> + * We don't fill in the unnecessary slots in any case, which may look
> + * surprising in the disassembly.
> + */
> +fs_inst *
> +fs_god::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
> +                              fs_reg coordinate, int vector_elements,
> +                              fs_reg shadow_c,
> +                              fs_reg lod, fs_reg lod2, int grad_components,
> +                              fs_reg sample_index, uint32_t sampler,
> +                              bool has_offset)
> +{
> +   int reg_width = dispatch_width / 8;
> +   bool header_present = false;
> +
> +   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
> +   fs_reg msg_coords = message;
> +
> +   if (has_offset) {
> +      /* The offsets set up by the ir_texture visitor are in the
> +       * m1 header, so we can't go headerless.
> +       */
> +      header_present = true;
> +      message.reg--;
> +   }
> +
> +   for (int i = 0; i < vector_elements; i++) {
> +      emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
> +      coordinate = offset(coordinate, 1);
> +   }
> +   fs_reg msg_end = offset(msg_coords, vector_elements);
> +   fs_reg msg_lod = offset(msg_coords, 4);
> +
> +   if (shadow_c.file != BAD_FILE) {
> +      fs_reg msg_shadow = msg_lod;
> +      emit(MOV(msg_shadow, shadow_c));
> +      msg_lod = offset(msg_shadow, 1);
> +      msg_end = msg_lod;
> +   }
> +
> +   enum opcode opcode;
> +   switch (op) {
> +   case ir_tex:
> +      opcode = SHADER_OPCODE_TEX;
> +      break;
> +   case ir_txb:
> +      emit(MOV(msg_lod, lod));
> +      msg_end = offset(msg_lod, 1);
> +
> +      opcode = FS_OPCODE_TXB;
> +      break;
> +   case ir_txl:
> +      emit(MOV(msg_lod, lod));
> +      msg_end = offset(msg_lod, 1);
> +
> +      opcode = SHADER_OPCODE_TXL;
> +      break;
> +   case ir_txd: {
> +      /**
> +       *  P   =  u,    v,    r
> +       * dPdx = dudx, dvdx, drdx
> +       * dPdy = dudy, dvdy, drdy
> +       *
> +       * Load up these values:
> +       * - dudx   dudy   dvdx   dvdy   drdx   drdy
> +       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
> +       */
> +      msg_end = msg_lod;
> +      for (int i = 0; i < grad_components; i++) {
> +         emit(MOV(msg_end, lod));
> +         lod = offset(lod, 1);
> +         msg_end = offset(msg_end, 1);
> +
> +         emit(MOV(msg_end, lod2));
> +         lod2 = offset(lod2, 1);
> +         msg_end = offset(msg_end, 1);
> +      }
> +
> +      opcode = SHADER_OPCODE_TXD;
> +      break;
> +   }
> +   case ir_txs:
> +      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
> +      emit(MOV(msg_lod, lod));
> +      msg_end = offset(msg_lod, 1);
> +
> +      opcode = SHADER_OPCODE_TXS;
> +      break;
> +   case ir_query_levels:
> +      msg_lod = msg_end;
> +      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> +      msg_end = offset(msg_lod, 1);
> +
> +      opcode = SHADER_OPCODE_TXS;
> +      break;
> +   case ir_txf:
> +      msg_lod = offset(msg_coords, 3);
> +      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
> +      msg_end = offset(msg_lod, 1);
> +
> +      opcode = SHADER_OPCODE_TXF;
> +      break;
> +   case ir_txf_ms:
> +      msg_lod = offset(msg_coords, 3);
> +      /* lod */
> +      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> +      /* sample index */
> +      emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
> +      msg_end = offset(msg_lod, 2);
> +
> +      opcode = SHADER_OPCODE_TXF_CMS;
> +      break;
> +   case ir_lod:
> +      opcode = SHADER_OPCODE_LOD;
> +      break;
> +   case ir_tg4:
> +      opcode = SHADER_OPCODE_TG4;
> +      break;
> +   default:
> +      unreachable("not reached");
> +   }
> +
> +   fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
> +   inst->base_mrf = message.reg;
> +   inst->mlen = msg_end.reg - message.reg;
> +   inst->header_present = header_present;
> +   inst->regs_written = 4 * reg_width;
> +
> +   if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
> +      fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
> +           " disallowed by hardware\n");
> +   }
> +
> +   return inst;
> +}
> +
> +static bool
> +is_high_sampler(struct brw_context *brw, fs_reg sampler)
> +{
> +   if (brw->gen < 8 && !brw->is_haswell)
> +      return false;
> +
> +   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
> +}
> +
> +fs_inst *
> +fs_god::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
> +                              fs_reg coordinate, int coord_components,
> +                              fs_reg shadow_c,
> +                              fs_reg lod, fs_reg lod2, int grad_components,
> +                              fs_reg sample_index, fs_reg mcs, fs_reg sampler,
> +                              fs_reg offset_value)
> +{
> +   int reg_width = dispatch_width / 8;
> +   bool header_present = false;
> +
> +   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
> +   for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
> +      sources[i] = vgrf(glsl_type::float_type);
> +   }
> +   int length = 0;
> +
> +   if (op == ir_tg4 || offset_value.file != BAD_FILE ||
> +       is_high_sampler(brw, sampler)) {
> +      /* For general texture offsets (no txf workaround), we need a header to
> +       * put them in.  Note that for SIMD16 we're making space for two actual
> +       * hardware registers here, so the emit will have to fix up for this.
> +       *
> +       * ir_tg4 needs to place its channel select in the header, for
> +       * interaction with ARB_texture_swizzle.
> +       *
> +       * The sampler index is only 4 bits, so for larger sampler numbers we
> +       * need to offset the Sampler State Pointer in the header.
> +       */
> +      header_present = true;
> +      sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> +      length++;
> +   }
> +
> +   if (shadow_c.file != BAD_FILE) {
> +      emit(MOV(sources[length], shadow_c));
> +      length++;
> +   }
> +
> +   bool has_nonconstant_offset =
> +      offset_value.file != BAD_FILE && offset_value.file != IMM;
> +   bool coordinate_done = false;
> +
> +   /* Set up the LOD info */
> +   switch (op) {
> +   case ir_tex:
> +   case ir_lod:
> +      break;
> +   case ir_txb:
> +      emit(MOV(sources[length], lod));
> +      length++;
> +      break;
> +   case ir_txl:
> +      emit(MOV(sources[length], lod));
> +      length++;
> +      break;
> +   case ir_txd: {
> +      no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
> +
> +      /* Load dPdx and the coordinate together:
> +       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
> +       */
> +      for (int i = 0; i < coord_components; i++) {
> +        emit(MOV(sources[length], coordinate));
> +        coordinate = offset(coordinate, 1);
> +        length++;
> +
> +         /* For cube map array, the coordinate is (u,v,r,ai) but there are
> +          * only derivatives for (u, v, r).
> +          */
> +         if (i < grad_components) {
> +            emit(MOV(sources[length], lod));
> +            lod = offset(lod, 1);
> +            length++;
> +
> +            emit(MOV(sources[length], lod2));
> +            lod2 = offset(lod2, 1);
> +            length++;
> +         }
> +      }
> +
> +      coordinate_done = true;
> +      break;
> +   }
> +   case ir_txs:
> +      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
> +      length++;
> +      break;
> +   case ir_query_levels:
> +      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> +      length++;
> +      break;
> +   case ir_txf:
> +      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
> +      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> +      coordinate = offset(coordinate, 1);
> +      length++;
> +
> +      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
> +      length++;
> +
> +      for (int i = 1; i < coord_components; i++) {
> +        emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> +        coordinate = offset(coordinate, 1);
> +        length++;
> +      }
> +
> +      coordinate_done = true;
> +      break;
> +   case ir_txf_ms:
> +      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
> +      length++;
> +
> +      /* data from the multisample control surface */
> +      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
> +      length++;
> +
> +      /* there is no offsetting for this message; just copy in the integer
> +       * texture coordinates
> +       */
> +      for (int i = 0; i < coord_components; i++) {
> +         emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> +         coordinate = offset(coordinate, 1);
> +         length++;
> +      }
> +
> +      coordinate_done = true;
> +      break;
> +   case ir_tg4:
> +      if (has_nonconstant_offset) {
> +         if (shadow_c.file != BAD_FILE)
> +            no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
> +
> +         /* More crazy intermixing */
> +         for (int i = 0; i < 2; i++) { /* u, v */
> +            emit(MOV(sources[length], coordinate));
> +            coordinate = offset(coordinate, 1);
> +            length++;
> +         }
> +
> +         for (int i = 0; i < 2; i++) { /* offu, offv */
> +            emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
> +            offset_value = offset(offset_value, 1);
> +            length++;
> +         }
> +
> +         if (coord_components == 3) { /* r if present */
> +            emit(MOV(sources[length], coordinate));
> +            coordinate = offset(coordinate, 1);
> +            length++;
> +         }
> +
> +         coordinate_done = true;
> +      }
> +      break;
> +   }
> +
> +   /* Set up the coordinate (except for cases where it was done above) */
> +   if (!coordinate_done) {
> +      for (int i = 0; i < coord_components; i++) {
> +         emit(MOV(sources[length], coordinate));
> +         coordinate = offset(coordinate, 1);
> +         length++;
> +      }
> +   }
> +
> +   int mlen;
> +   if (reg_width == 2)
> +      mlen = length * reg_width - header_present;
> +   else
> +      mlen = length * reg_width;
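> +   /* (In SIMD16 each payload slot occupies two registers but the header is
> +    * still only one, hence the header_present correction: e.g. five
> +    * sources including the header give mlen = 5 * 2 - 1 = 9.)
> +    */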
> +
> +   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> +                               BRW_REGISTER_TYPE_F);
> +   emit(LOAD_PAYLOAD(src_payload, sources, length));
> +
> +   /* Generate the SEND */
> +   enum opcode opcode;
> +   switch (op) {
> +   case ir_tex: opcode = SHADER_OPCODE_TEX; break;
> +   case ir_txb: opcode = FS_OPCODE_TXB; break;
> +   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> +   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> +   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> +   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
> +   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> +   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
> +   case ir_lod: opcode = SHADER_OPCODE_LOD; break;
> +   case ir_tg4:
> +      if (has_nonconstant_offset)
> +         opcode = SHADER_OPCODE_TG4_OFFSET;
> +      else
> +         opcode = SHADER_OPCODE_TG4;
> +      break;
> +   default:
> +      unreachable("not reached");
> +   }
> +   fs_inst *inst = emit(opcode, dst, src_payload, sampler);
> +   inst->base_mrf = -1;
> +   inst->mlen = mlen;
> +   inst->header_present = header_present;
> +   inst->regs_written = 4 * reg_width;
> +
> +   if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
> +      fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
> +           " disallowed by hardware\n");
> +   }
> +
> +   return inst;
> +}
> +
> +fs_reg
> +fs_god::rescale_texcoord(fs_reg coordinate, int coord_components,
> +                             bool is_rect, uint32_t sampler, int texunit)
> +{
> +   fs_inst *inst = NULL;
> +   bool needs_gl_clamp = true;
> +   fs_reg scale_x, scale_y;
> +
> +   /* The 965 requires the EU to do the normalization of GL rectangle
> +    * texture coordinates.  We use the program parameter state
> +    * tracking to get the scaling factor.
> +    */
> +   if (is_rect &&
> +       (brw->gen < 6 ||
> +        (brw->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
> +                           key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
> +      struct gl_program_parameter_list *params = prog->Parameters;
> +      int tokens[STATE_LENGTH] = {
> +        STATE_INTERNAL,
> +        STATE_TEXRECT_SCALE,
> +        texunit,
> +        0,
> +        0
> +      };
> +
> +      no16("rectangle scale uniform setup not supported on SIMD16\n");
> +      if (dispatch_width == 16) {
> +        return coordinate;
> +      }
> +
> +      GLuint index = _mesa_add_state_reference(params,
> +                                              (gl_state_index *)tokens);
> +      /* Try to find existing copies of the texrect scale uniforms. */
> +      for (unsigned i = 0; i < uniforms; i++) {
> +         if (stage_prog_data->param[i] ==
> +             &prog->Parameters->ParameterValues[index][0]) {
> +            scale_x = fs_reg(UNIFORM, i);
> +            scale_y = fs_reg(UNIFORM, i + 1);
> +            break;
> +         }
> +      }
> +
> +      /* If we didn't already set them up, do so now. */
> +      if (scale_x.file == BAD_FILE) {
> +         scale_x = fs_reg(UNIFORM, uniforms);
> +         scale_y = fs_reg(UNIFORM, uniforms + 1);
> +
> +         stage_prog_data->param[uniforms++] =
> +            &prog->Parameters->ParameterValues[index][0];
> +         stage_prog_data->param[uniforms++] =
> +            &prog->Parameters->ParameterValues[index][1];
> +      }
> +   }
> +
> +   /* On gen < 6 the EU must do the normalization itself, so multiply the
> +    * coordinates through by the texrect scale factors set up above.
> +    */
> +   if (brw->gen < 6 && is_rect) {
> +      fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
> +      fs_reg src = coordinate;
> +      coordinate = dst;
> +
> +      emit(MUL(dst, src, scale_x));
> +      dst = offset(dst, 1);
> +      src = offset(src, 1);
> +      emit(MUL(dst, src, scale_y));
> +   } else if (is_rect) {
> +      /* On gen6+, the sampler handles the rectangle coordinates
> +       * natively, without needing rescaling.  But that means we have
> +       * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
> +       * not [0, 1] like the default case below.
> +       */
> +      needs_gl_clamp = false;
> +
> +      for (int i = 0; i < 2; i++) {
> +        if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
> +           fs_reg chan = coordinate;
> +           chan = offset(chan, i);
> +
> +           inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
> +           inst->conditional_mod = BRW_CONDITIONAL_GE;
> +
> +           /* Our parameter comes in as 1.0/width or 1.0/height,
> +            * because that's what people normally want for doing
> +            * texture rectangle handling.  We need width or height
> +            * for clamping, but we don't care enough to make a new
> +            * parameter type, so just invert back.
> +            */
> +           fs_reg limit = vgrf(glsl_type::float_type);
> +           emit(MOV(limit, i == 0 ? scale_x : scale_y));
> +           emit(SHADER_OPCODE_RCP, limit, limit);
> +
> +           inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
> +           inst->conditional_mod = BRW_CONDITIONAL_L;
> +        }
> +      }
> +   }
> +
> +   if (coord_components > 0 && needs_gl_clamp) {
> +      for (int i = 0; i < MIN2(coord_components, 3); i++) {
> +        if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
> +           fs_reg chan = coordinate;
> +           chan = offset(chan, i);
> +
> +           fs_inst *inst = emit(MOV(chan, chan));
> +           inst->saturate = true;
> +        }
> +      }
> +   }
> +   return coordinate;
> +}
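
The arithmetic here is easy to lose among the emit() calls, so a
compilable host-side sketch of what the generated instructions compute
may help (the texture size and variable names are mine, not the
patch's):

    #include <cassert>

    int main()
    {
       /* The driver stores the texrect scale as 1/width, 1/height. */
       const float scale_x = 1.0f / 128.0f, scale_y = 1.0f / 64.0f;

       /* Pre-gen6 MULs: texel-space (u, v) -> normalized [0, 1]. */
       float u = 96.0f, v = 16.0f;
       assert(u * scale_x == 0.75f && v * scale_y == 0.25f);

       /* Gen6+ GL_CLAMP: recover the clamp limit with a reciprocal
        * (the SHADER_OPCODE_RCP above), then SEL against 0 and it.
        */
       float limit = 1.0f / scale_x;   /* 128.0f */
       float clamped = u < 0.0f ? 0.0f : (u > limit ? limit : u);
       assert(clamped == 96.0f);
       return 0;
    }
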
> +
> +/* Sample from the MCS surface attached to this multisample texture. */
> +fs_reg
> +fs_god::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
> +{
> +   int reg_width = dispatch_width / 8;
> +   fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
> +                           BRW_REGISTER_TYPE_F);
> +   fs_reg dest = vgrf(glsl_type::uvec4_type);
> +   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
> +
> +   /* parameters are: u, v, r; missing parameters are treated as zero */
> +   for (int i = 0; i < components; i++) {
> +      sources[i] = vgrf(glsl_type::float_type);
> +      emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
> +      coordinate = offset(coordinate, 1);
> +   }
> +
> +   emit(LOAD_PAYLOAD(payload, sources, components));
> +
> +   fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
> +   inst->base_mrf = -1;
> +   inst->mlen = components * reg_width;
> +   inst->header_present = false;
> +   inst->regs_written = 4 * reg_width; /* we only care about one reg of
> +                                        * response, but the sampler always
> +                                        * writes 4/8
> +                                        */
> +
> +   return dest;
> +}
> +
> +void
> +fs_god::emit_texture(ir_texture_opcode op,
> +                     const glsl_type *dest_type,
> +                     fs_reg coordinate, int coord_components,
> +                     fs_reg shadow_c,
> +                     fs_reg lod, fs_reg lod2, int grad_components,
> +                     fs_reg sample_index,
> +                     fs_reg offset_value,
> +                     fs_reg mcs,
> +                     int gather_component,
> +                     bool is_cube_array,
> +                     bool is_rect,
> +                     uint32_t sampler,
> +                     fs_reg sampler_reg, int texunit)
> +{
> +   fs_inst *inst = NULL;
> +
> +   if (op == ir_tg4) {
> +      /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
> +       * emitting anything other than setting up the constant result.
> +       */
> +      int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
> +      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
> +
> +         fs_reg res = vgrf(glsl_type::vec4_type);
> +         this->result = res;
> +
> +         for (int i = 0; i < 4; i++) {
> +            emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
> +            res = offset(res, 1);
> +         }
> +         return;
> +      }
> +   }
> +
> +   if (coordinate.file != BAD_FILE) {
> +      /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
> +       * samplers.  This should only be a problem with GL_CLAMP on Gen7.
> +       */
> +      coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
> +                                    sampler, texunit);
> +   }
> +
> +   /* Writemasking doesn't eliminate channels on SIMD8 texture
> +    * samples, so don't worry about them.
> +    */
> +   fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
> +
> +   if (brw->gen >= 7) {
> +      inst = emit_texture_gen7(op, dst, coordinate, coord_components,
> +                               shadow_c, lod, lod2, grad_components,
> +                               sample_index, mcs, sampler_reg,
> +                               offset_value);
> +   } else if (brw->gen >= 5) {
> +      inst = emit_texture_gen5(op, dst, coordinate, coord_components,
> +                               shadow_c, lod, lod2, grad_components,
> +                               sample_index, sampler,
> +                               offset_value.file != BAD_FILE);
> +   } else {
> +      inst = emit_texture_gen4(op, dst, coordinate, coord_components,
> +                               shadow_c, lod, lod2, grad_components,
> +                               sampler);
> +   }
> +
> +   if (shadow_c.file != BAD_FILE)
> +      inst->shadow_compare = true;
> +
> +   if (offset_value.file == IMM)
> +      inst->offset = offset_value.fixed_hw_reg.dw1.ud;
> +
> +   if (op == ir_tg4) {
> +      inst->offset |=
> +         gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
> +
> +      if (brw->gen == 6)
> +         emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
> +   }
> +
> +   /* Fix up the #layers for cube map arrays: the sampler reports the
> +    * total number of layer-faces, so divide by 6 to get the number of
> +    * cubes GL expects.
> +    */
> +   if (op == ir_txs && is_cube_array) {
> +      fs_reg depth = offset(dst, 2);
> +      fs_reg fixed_depth = vgrf(glsl_type::int_type);
> +      emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
> +
> +      fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
> +      int components = inst->regs_written / (dst.width / 8);
> +      for (int i = 0; i < components; i++) {
> +         if (i == 2) {
> +            fixed_payload[i] = fixed_depth;
> +         } else {
> +            fixed_payload[i] = offset(dst, i);
> +         }
> +      }
> +      emit(LOAD_PAYLOAD(dst, fixed_payload, components));
> +   }
> +
> +   swizzle_result(op, dest_type->vector_elements, dst, sampler);
> +}
> +
> +void
> +fs_god::visit(ir_texture *ir)
> +{
> +   uint32_t sampler =
> +      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
> +
> +   ir_rvalue *nonconst_sampler_index =
> +      _mesa_get_sampler_array_nonconst_index(ir->sampler);
> +
> +   /* Handle non-constant sampler array indexing */
> +   fs_reg sampler_reg;
> +   if (nonconst_sampler_index) {
> +      /* The highest sampler which may be used by this operation is
> +       * the last element of the array. Mark it here, because the generator
> +       * doesn't have enough information to determine the bound.
> +       */
> +      uint32_t array_size = ir->sampler->as_dereference_array()
> +         ->array->type->array_size();
> +
> +      uint32_t max_used = sampler + array_size - 1;
> +      if (ir->op == ir_tg4 && brw->gen < 8) {
> +         max_used += stage_prog_data->binding_table.gather_texture_start;
> +      } else {
> +         max_used += stage_prog_data->binding_table.texture_start;
> +      }
> +
> +      brw_mark_surface_used(prog_data, max_used);
> +
> +      /* Emit code to evaluate the actual indexing expression */
> +      nonconst_sampler_index->accept(this);
> +      fs_reg temp = vgrf(glsl_type::uint_type);
> +      emit(ADD(temp, this->result, fs_reg(sampler)))
> +            ->force_writemask_all = true;
> +      sampler_reg = temp;
> +   } else {
> +      /* Single sampler, or constant array index; the indexing expression
> +       * is just an immediate.
> +       */
> +      sampler_reg = fs_reg(sampler);
> +   }
> +
> +   /* FINISHME: We're failing to recompile our programs when the sampler is
> +    * updated.  This only matters for the texture rectangle scale parameters
> +    * (pre-gen6, or gen6+ with GL_CLAMP).
> +    */
> +   int texunit = prog->SamplerUnits[sampler];
> +
> +   /* Should be lowered by do_lower_texture_projection */
> +   assert(!ir->projector);
> +
> +   /* Should be lowered */
> +   assert(!ir->offset || !ir->offset->type->is_array());
> +
> +   /* Generate code to compute all the subexpression trees.  This has to be
> +    * done before loading any values into MRFs for the sampler message since
> +    * generating these values may involve SEND messages that need the MRFs.
> +    */
> +   fs_reg coordinate;
> +   int coord_components = 0;
> +   if (ir->coordinate) {
> +      coord_components = ir->coordinate->type->vector_elements;
> +      ir->coordinate->accept(this);
> +      coordinate = this->result;
> +   }
> +
> +   fs_reg shadow_comparitor;
> +   if (ir->shadow_comparitor) {
> +      ir->shadow_comparitor->accept(this);
> +      shadow_comparitor = this->result;
> +   }
> +
> +   fs_reg offset_value;
> +   if (ir->offset) {
> +      ir_constant *const_offset = ir->offset->as_constant();
> +      if (const_offset) {
> +         /* Store the header bitfield in an IMM register.  This allows us to
> +          * use offset_value.file to distinguish between no offset, a constant
> +          * offset, and a non-constant offset.
> +          */
> +         offset_value =
> +            fs_reg(brw_texture_offset(ctx, const_offset->value.i,
> +                                      const_offset->type->vector_elements));
> +      } else {
> +         ir->offset->accept(this);
> +         offset_value = this->result;
> +      }
> +   }
> +
> +   fs_reg lod, lod2, sample_index, mcs;
> +   int grad_components = 0;
> +   switch (ir->op) {
> +   case ir_tex:
> +   case ir_lod:
> +   case ir_tg4:
> +   case ir_query_levels:
> +      break;
> +   case ir_txb:
> +      ir->lod_info.bias->accept(this);
> +      lod = this->result;
> +      break;
> +   case ir_txd:
> +      ir->lod_info.grad.dPdx->accept(this);
> +      lod = this->result;
> +
> +      ir->lod_info.grad.dPdy->accept(this);
> +      lod2 = this->result;
> +
> +      grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
> +      break;
> +   case ir_txf:
> +   case ir_txl:
> +   case ir_txs:
> +      ir->lod_info.lod->accept(this);
> +      lod = this->result;
> +      break;
> +   case ir_txf_ms:
> +      ir->lod_info.sample_index->accept(this);
> +      sample_index = this->result;
> +
> +      if (brw->gen >= 7 &&
> +          key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
> +         mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
> +                              sampler_reg);
> +      } else {
> +         mcs = fs_reg(0u);
> +      }
> +      break;
> +   default:
> +      unreachable("Unrecognized texture opcode");
> +   }
> +
> +   int gather_component = 0;
> +   if (ir->op == ir_tg4)
> +      gather_component = ir->lod_info.component->as_constant()->value.i[0];
> +
> +   bool is_rect =
> +      ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
> +
> +   bool is_cube_array =
> +      ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
> +      ir->sampler->type->sampler_array;
> +
> +   emit_texture(ir->op, ir->type, coordinate, coord_components,
> +                shadow_comparitor, lod, lod2, grad_components,
> +                sample_index, offset_value, mcs,
> +                gather_component, is_cube_array, is_rect, sampler,
> +                sampler_reg, texunit);
> +}
> +
> +/**
> + * Apply workarounds for Gen6 gather with UINT/SINT
> + */
> +void
> +fs_god::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
> +{
> +   if (!wa)
> +      return;
> +
> +   int width = (wa & WA_8BIT) ? 8 : 16;
> +
> +   for (int i = 0; i < 4; i++) {
> +      fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
> +      /* Convert from UNORM to UINT */
> +      emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
> +      emit(MOV(dst, dst_f));
> +
> +      if (wa & WA_SIGN) {
> +         /* Reinterpret the UINT value as a signed INT value by
> +          * shifting the sign bit into place, then shifting back
> +          * preserving sign.
> +          */
> +         emit(SHL(dst, dst, fs_reg(32 - width)));
> +         emit(ASR(dst, dst, fs_reg(32 - width)));
> +      }
> +
> +      dst = offset(dst, 1);
> +   }
> +}
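
The SHL/ASR pair is the usual sign-extension idiom.  A compilable
sketch of the whole fixup for one channel, assuming the WA_8BIT |
WA_SIGN case (the explicit rounding is mine; the hardware works on the
float directly):

    #include <cassert>
    #include <cstdint>

    int main()
    {
       const int width = 8;
       float unorm = 200.0f / 255.0f;   /* what the sampler returned */

       /* MUL + MOV: scale back to an integer in [0, 255]. */
       int32_t v = (int32_t)(unorm * ((1 << width) - 1) + 0.5f);

       /* SHL + ASR: shift the sign bit into place, then back. */
       v = (int32_t)((uint32_t)v << (32 - width)) >> (32 - width);

       assert(v == -56);   /* 200 reinterpreted as a signed byte */
       return 0;
    }
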
> +
> +/**
> + * Set up the gather channel based on the swizzle, for gather4.
> + */
> +uint32_t
> +fs_god::gather_channel(int orig_chan, uint32_t sampler)
> +{
> +   int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
> +   switch (swiz) {
> +      case SWIZZLE_X: return 0;
> +      case SWIZZLE_Y:
> +         /* gather4 sampler is broken for green channel on RG32F --
> +          * we must ask for blue instead.
> +          */
> +         if (key_tex->gather_channel_quirk_mask & (1 << sampler))
> +            return 2;
> +         return 1;
> +      case SWIZZLE_Z: return 2;
> +      case SWIZZLE_W: return 3;
> +      default:
> +         unreachable("Not reached"); /* zero, one swizzles handled already */
> +   }
> +}
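
GET_SWZ is just a field extraction; assuming Mesa's usual packing of
four 3-bit selectors per swizzle (X=0 ... W=3, ZERO=4, ONE=5), the
lookup this function performs amounts to:

    #include <cassert>
    #include <cstdint>

    enum { SWZ_X, SWZ_Y, SWZ_Z, SWZ_W, SWZ_ZERO, SWZ_ONE };

    static unsigned get_swz(uint32_t swizzle, int chan)
    {
       return (swizzle >> (3 * chan)) & 0x7;
    }

    int main()
    {
       /* A .bgra swizzle: channels 0-3 read Z, Y, X, W. */
       uint32_t bgra = SWZ_Z | (SWZ_Y << 3) | (SWZ_X << 6) | (SWZ_W << 9);
       assert(get_swz(bgra, 0) == SWZ_Z);
       assert(get_swz(bgra, 2) == SWZ_X);
       return 0;
    }
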
> +
> +/**
> + * Swizzle the result of a texture lookup.  This is necessary for
> + * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
> + */
> +void
> +fs_god::swizzle_result(ir_texture_opcode op, int dest_components,
> +                       fs_reg orig_val, uint32_t sampler)
> +{
> +   if (op == ir_query_levels) {
> +      /* # levels is in .w */
> +      this->result = offset(orig_val, 3);
> +      return;
> +   }
> +
> +   this->result = orig_val;
> +
> +   /* txs and lod don't actually sample the texture, so swizzling the
> +    * result makes no sense.  tg4 was already handled: its swizzle is
> +    * folded into the gather channel selection above.
> +    */
> +   if (op == ir_txs || op == ir_lod || op == ir_tg4)
> +      return;
> +
> +   if (dest_components == 1) {
> +      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
> +   } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
> +      fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
> +      swizzled_result.type = orig_val.type;
> +
> +      for (int i = 0; i < 4; i++) {
> +        int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
> +        fs_reg l = swizzled_result;
> +        l = offset(l, i);
> +
> +        if (swiz == SWIZZLE_ZERO) {
> +           emit(MOV(l, fs_reg(0.0f)));
> +        } else if (swiz == SWIZZLE_ONE) {
> +           emit(MOV(l, fs_reg(1.0f)));
> +        } else {
> +            emit(MOV(l, offset(orig_val,
> +                               GET_SWZ(key_tex->swizzles[sampler], i))));
> +        }
> +      }
> +      this->result = swizzled_result;
> +   }
> +}
> +
> +void
> +fs_god::visit(ir_swizzle *ir)
> +{
> +   ir->val->accept(this);
> +   fs_reg val = this->result;
> +
> +   if (ir->type->vector_elements == 1) {
> +      this->result = offset(this->result, ir->mask.x);
> +      return;
> +   }
> +
> +   fs_reg result = vgrf(ir->type);
> +   this->result = result;
> +
> +   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
> +      fs_reg channel = val;
> +      int swiz = 0;
> +
> +      switch (i) {
> +      case 0:
> +        swiz = ir->mask.x;
> +        break;
> +      case 1:
> +        swiz = ir->mask.y;
> +        break;
> +      case 2:
> +        swiz = ir->mask.z;
> +        break;
> +      case 3:
> +        swiz = ir->mask.w;
> +        break;
> +      }
> +
> +      emit(MOV(result, offset(channel, swiz)));
> +      result = offset(result, 1);
> +   }
> +}
> +
> +void
> +fs_god::visit(ir_discard *ir)
> +{
> +   /* We track our discarded pixels in f0.1.  By predicating on it, we can
> +    * update just the flag bits that aren't yet discarded.  If there's no
> +    * condition, we emit a CMP of g0 != g0, so all currently executing
> +    * channels will get turned off.
> +    */
> +   fs_inst *cmp;
> +   if (ir->condition) {
> +      emit_bool_to_cond_code(ir->condition);
> +      cmp = (fs_inst *) this->instructions.get_tail();
> +      cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
> +   } else {
> +      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
> +                                      BRW_REGISTER_TYPE_UW));
> +      cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
> +   }
> +   cmp->predicate = BRW_PREDICATE_NORMAL;
> +   cmp->flag_subreg = 1;
> +
> +   if (brw->gen >= 6) {
> +      emit_discard_jump();
> +   }
> +}
> +
> +void
> +fs_god::visit(ir_constant *ir)
> +{
> +   /* Set this->result to reg at the bottom of the function because some code
> +    * paths will cause this visitor to be applied to other fields.  This will
> +    * cause the value stored in this->result to be modified.
> +    *
> +    * Make reg constant so that it doesn't get accidentally modified along the
> +    * way.  Yes, I actually had this problem. :(
> +    */
> +   const fs_reg reg = vgrf(ir->type);
> +   fs_reg dst_reg = reg;
> +
> +   if (ir->type->is_array()) {
> +      const unsigned size = type_size(ir->type->fields.array);
> +
> +      for (unsigned i = 0; i < ir->type->length; i++) {
> +        ir->array_elements[i]->accept(this);
> +        fs_reg src_reg = this->result;
> +
> +        dst_reg.type = src_reg.type;
> +        for (unsigned j = 0; j < size; j++) {
> +           emit(MOV(dst_reg, src_reg));
> +           src_reg = offset(src_reg, 1);
> +           dst_reg = offset(dst_reg, 1);
> +        }
> +      }
> +   } else if (ir->type->is_record()) {
> +      foreach_in_list(ir_constant, field, &ir->components) {
> +        const unsigned size = type_size(field->type);
> +
> +        field->accept(this);
> +        fs_reg src_reg = this->result;
> +
> +        dst_reg.type = src_reg.type;
> +        for (unsigned j = 0; j < size; j++) {
> +           emit(MOV(dst_reg, src_reg));
> +           src_reg = offset(src_reg, 1);
> +           dst_reg = offset(dst_reg, 1);
> +        }
> +      }
> +   } else {
> +      const unsigned size = type_size(ir->type);
> +
> +      for (unsigned i = 0; i < size; i++) {
> +        switch (ir->type->base_type) {
> +        case GLSL_TYPE_FLOAT:
> +           emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
> +           break;
> +        case GLSL_TYPE_UINT:
> +           emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
> +           break;
> +        case GLSL_TYPE_INT:
> +           emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
> +           break;
> +        case GLSL_TYPE_BOOL:
> +            emit(MOV(dst_reg,
> +                     fs_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
> +                                                : 0)));
> +           break;
> +        default:
> +           unreachable("Non-float/uint/int/bool constant");
> +        }
> +        dst_reg = offset(dst_reg, 1);
> +      }
> +   }
> +
> +   this->result = reg;
> +}
> +
> +void
> +fs_god::emit_bool_to_cond_code(ir_rvalue *ir)
> +{
> +   ir_expression *expr = ir->as_expression();
> +
> +   if (!expr || expr->operation == ir_binop_ubo_load) {
> +      ir->accept(this);
> +
> +      fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
> +      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +      return;
> +   }
> +
> +   fs_reg op[3];
> +
> +   assert(expr->get_num_operands() <= 3);
> +   for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> +      assert(expr->operands[i]->type->is_scalar());
> +
> +      expr->operands[i]->accept(this);
> +      op[i] = this->result;
> +
> +      resolve_ud_negate(&op[i]);
> +   }
> +
> +   emit_bool_to_cond_code_of_reg(expr, op);
> +}
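
If I'm reading the boolean conventions right, the AND with 1 here
works for either encoding the driver uses: with 0/1 booleans (gen4-5)
the low bit is the whole value, and with 0/~0 booleans (gen6+, via
UniformBooleanTrue) the low bit is still set exactly when the value is
true -- e.g. 0xffffffff & 1 == 1 and 1 & 1 == 1 -- so the .nz
condition on the AND result tests the right thing either way.
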
> +
> +void
> +fs_god::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
> +{
> +   fs_inst *inst;
> +
> +   switch (expr->operation) {
> +   case ir_unop_logic_not:
> +      inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
> +      inst->conditional_mod = BRW_CONDITIONAL_Z;
> +      break;
> +
> +   case ir_binop_logic_xor:
> +      if (brw->gen <= 5) {
> +         fs_reg temp = vgrf(expr->type);
> +         emit(XOR(temp, op[0], op[1]));
> +         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> +      } else {
> +         inst = emit(XOR(reg_null_d, op[0], op[1]));
> +      }
> +      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +      break;
> +
> +   case ir_binop_logic_or:
> +      if (brw->gen <= 5) {
> +         fs_reg temp = vgrf(expr->type);
> +         emit(OR(temp, op[0], op[1]));
> +         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> +      } else {
> +         inst = emit(OR(reg_null_d, op[0], op[1]));
> +      }
> +      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +      break;
> +
> +   case ir_binop_logic_and:
> +      if (brw->gen <= 5) {
> +         fs_reg temp = vgrf(expr->type);
> +         emit(AND(temp, op[0], op[1]));
> +         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> +      } else {
> +         inst = emit(AND(reg_null_d, op[0], op[1]));
> +      }
> +      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +      break;
> +
> +   case ir_unop_f2b:
> +      if (brw->gen >= 6) {
> +         emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> +      } else {
> +         inst = emit(MOV(reg_null_f, op[0]));
> +         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +      }
> +      break;
> +
> +   case ir_unop_i2b:
> +      if (brw->gen >= 6) {
> +         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> +      } else {
> +         inst = emit(MOV(reg_null_d, op[0]));
> +         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +      }
> +      break;
> +
> +   case ir_binop_greater:
> +   case ir_binop_gequal:
> +   case ir_binop_less:
> +   case ir_binop_lequal:
> +   case ir_binop_equal:
> +   case ir_binop_all_equal:
> +   case ir_binop_nequal:
> +   case ir_binop_any_nequal:
> +      if (brw->gen <= 5) {
> +         resolve_bool_comparison(expr->operands[0], &op[0]);
> +         resolve_bool_comparison(expr->operands[1], &op[1]);
> +      }
> +
> +      emit(CMP(reg_null_d, op[0], op[1],
> +               brw_conditional_for_comparison(expr->operation)));
> +      break;
> +
> +   case ir_triop_csel: {
> +      /* Expand the boolean condition into the flag register. */
> +      inst = emit(MOV(reg_null_d, op[0]));
> +      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +
> +      /* Select which boolean to return. */
> +      fs_reg temp = vgrf(expr->operands[1]->type);
> +      inst = emit(SEL(temp, op[1], op[2]));
> +      inst->predicate = BRW_PREDICATE_NORMAL;
> +
> +      /* Expand the result to a condition code. */
> +      inst = emit(MOV(reg_null_d, temp));
> +      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +      break;
> +   }
> +
> +   default:
> +      unreachable("not reached");
> +   }
> +}
> +
> +/**
> + * Emit a gen6 IF statement with the comparison folded into the IF
> + * instruction.
> + */
> +void
> +fs_god::emit_if_gen6(ir_if *ir)
> +{
> +   ir_expression *expr = ir->condition->as_expression();
> +
> +   if (expr && expr->operation != ir_binop_ubo_load) {
> +      fs_reg op[3];
> +      fs_inst *inst;
> +      fs_reg temp;
> +
> +      assert(expr->get_num_operands() <= 3);
> +      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> +        assert(expr->operands[i]->type->is_scalar());
> +
> +        expr->operands[i]->accept(this);
> +        op[i] = this->result;
> +      }
> +
> +      switch (expr->operation) {
> +      case ir_unop_logic_not:
> +         emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
> +         return;
> +
> +      case ir_binop_logic_xor:
> +         emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
> +         return;
> +
> +      case ir_binop_logic_or:
> +         temp = vgrf(glsl_type::bool_type);
> +         emit(OR(temp, op[0], op[1]));
> +         emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> +         return;
> +
> +      case ir_binop_logic_and:
> +         temp = vgrf(glsl_type::bool_type);
> +         emit(AND(temp, op[0], op[1]));
> +         emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> +         return;
> +
> +      case ir_unop_f2b:
> +        inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
> +        inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +        return;
> +
> +      case ir_unop_i2b:
> +        emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> +        return;
> +
> +      case ir_binop_greater:
> +      case ir_binop_gequal:
> +      case ir_binop_less:
> +      case ir_binop_lequal:
> +      case ir_binop_equal:
> +      case ir_binop_all_equal:
> +      case ir_binop_nequal:
> +      case ir_binop_any_nequal:
> +         if (brw->gen <= 5) {
> +            resolve_bool_comparison(expr->operands[0], &op[0]);
> +            resolve_bool_comparison(expr->operands[1], &op[1]);
> +         }
> +
> +        emit(IF(op[0], op[1],
> +                 brw_conditional_for_comparison(expr->operation)));
> +        return;
> +
> +      case ir_triop_csel: {
> +         /* Expand the boolean condition into the flag register. */
> +         fs_inst *inst = emit(MOV(reg_null_d, op[0]));
> +         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +
> +         /* Select which boolean to use as the result. */
> +         fs_reg temp = vgrf(expr->operands[1]->type);
> +         inst = emit(SEL(temp, op[1], op[2]));
> +         inst->predicate = BRW_PREDICATE_NORMAL;
> +
> +        emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> +        return;
> +      }
> +
> +      default:
> +        unreachable("not reached");
> +      }
> +   }
> +
> +   ir->condition->accept(this);
> +   emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
> +}
> +
> +bool
> +fs_god::try_opt_frontfacing_ternary(ir_if *ir)
> +{
> +   ir_dereference_variable *deref = ir->condition->as_dereference_variable();
> +   if (!deref || strcmp(deref->var->name, "gl_FrontFacing") != 0)
> +      return false;
> +
> +   if (ir->then_instructions.length() != 1 ||
> +       ir->else_instructions.length() != 1)
> +      return false;
> +
> +   ir_assignment *then_assign =
> +         ((ir_instruction *)ir->then_instructions.head)->as_assignment();
> +   ir_assignment *else_assign =
> +         ((ir_instruction *)ir->else_instructions.head)->as_assignment();
> +
> +   if (!then_assign || then_assign->condition ||
> +       !else_assign || else_assign->condition ||
> +       then_assign->write_mask != else_assign->write_mask ||
> +       !then_assign->lhs->equals(else_assign->lhs))
> +      return false;
> +
> +   ir_constant *then_rhs = then_assign->rhs->as_constant();
> +   ir_constant *else_rhs = else_assign->rhs->as_constant();
> +
> +   if (!then_rhs || !else_rhs)
> +      return false;
> +
> +   if (then_rhs->type->base_type != GLSL_TYPE_FLOAT)
> +      return false;
> +
> +   if ((then_rhs->is_one() && else_rhs->is_negative_one()) ||
> +       (else_rhs->is_one() && then_rhs->is_negative_one())) {
> +      then_assign->lhs->accept(this);
> +      fs_reg dst = this->result;
> +      dst.type = BRW_REGISTER_TYPE_D;
> +      fs_reg tmp = vgrf(glsl_type::int_type);
> +
> +      if (brw->gen >= 6) {
> +         /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
> +         fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
> +
> +         /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
> +          *
> +          *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
> +          *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
> +          *
> +          * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
> +          */
> +
> +         if (then_rhs->is_negative_one()) {
> +            assert(else_rhs->is_one());
> +            g0.negate = true;
> +         }
> +
> +         tmp.type = BRW_REGISTER_TYPE_W;
> +         tmp.subreg_offset = 2;
> +         tmp.stride = 2;
> +
> +         fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
> +         or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
> +
> +         tmp.type = BRW_REGISTER_TYPE_D;
> +         tmp.subreg_offset = 0;
> +         tmp.stride = 1;
> +      } else {
> +         /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
> +         fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
> +
> +         /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
> +          *
> +          *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
> +          *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
> +          *
> +          * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
> +          */
> +
> +         if (then_rhs->is_negative_one()) {
> +            assert(else_rhs->is_one());
> +            g1_6.negate = true;
> +         }
> +
> +         emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
> +      }
> +      emit(AND(dst, tmp, fs_reg(0xbf800000)));
> +      return true;
> +   }
> +
> +   return false;
> +}
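
For anyone puzzling over the OR/AND pair: 0x3f800000 is the bit
pattern of 1.0f and 0xbf800000 that of -1.0f, so OR-ing the exponent
bits of 1.0f into the payload register and masking with 0xbf800000
leaves exactly the sign bit (clear for front-facing) plus 1.0f's
exponent.  The gen6 path plays the same trick at 16-bit granularity
(0x3f80 OR-ed into the high word of each dword).  A host-side sketch
of the gen4/5 variant:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static float frontfacing_select(uint32_t g1_6)
    {
       /* or(8) tmp, g1.6, 0x3f800000; and(8) dst, tmp, 0xbf800000 */
       uint32_t bits = (g1_6 | 0x3f800000u) & 0xbf800000u;
       float f;
       std::memcpy(&f, &bits, sizeof(f));
       return f;
    }

    int main()
    {
       assert(frontfacing_select(0x00000000u) == 1.0f);   /* front */
       assert(frontfacing_select(0x80001234u) == -1.0f);  /* back */
       return 0;
    }
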
> +
> +/**
> + * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
> + *
> + * Many GLSL shaders contain the following pattern:
> + *
> + *    x = condition ? foo : bar
> + *
> + * The compiler emits an ir_if tree for this, since each subexpression might be
> + * a complex tree that could have side-effects or short-circuit logic.
> + *
> + * However, the common case is to simply select one of two constants or
> + * variable values---which is exactly what SEL is for.  In this case, the
> + * assembly looks like:
> + *
> + *    (+f0) IF
> + *    MOV dst src0
> + *    ELSE
> + *    MOV dst src1
> + *    ENDIF
> + *
> + * which can be easily translated into:
> + *
> + *    (+f0) SEL dst src0 src1
> + *
> + * If src0 is an immediate value, we promote it to a temporary GRF.
> + */
> +bool
> +fs_god::try_replace_with_sel()
> +{
> +   fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
> +   assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
> +
> +   /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
> +   int opcodes[] = {
> +      BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
> +   };
> +
> +   fs_inst *match = (fs_inst *) endif_inst->prev;
> +   for (int i = 0; i < 4; i++) {
> +      if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
> +         return false;
> +      match = (fs_inst *) match->prev;
> +   }
> +
> +   /* The opcodes match; it looks like the right sequence of instructions. */
> +   fs_inst *else_mov = (fs_inst *) endif_inst->prev;
> +   fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
> +   fs_inst *if_inst = (fs_inst *) then_mov->prev;
> +
> +   /* Check that the MOVs are the right form. */
> +   if (then_mov->dst.equals(else_mov->dst) &&
> +       !then_mov->is_partial_write() &&
> +       !else_mov->is_partial_write()) {
> +
> +      /* Remove the matched instructions; we'll emit a SEL to replace them. */
> +      while (!if_inst->next->is_tail_sentinel())
> +         if_inst->next->exec_node::remove();
> +      if_inst->exec_node::remove();
> +
> +      /* Only the last source register can be a constant, so if the MOV in
> +       * the "then" clause uses a constant, we need to put it in a temporary.
> +       */
> +      fs_reg src0(then_mov->src[0]);
> +      if (src0.file == IMM) {
> +         src0 = vgrf(glsl_type::float_type);
> +         src0.type = then_mov->src[0].type;
> +         emit(MOV(src0, then_mov->src[0]));
> +      }
> +
> +      fs_inst *sel;
> +      if (if_inst->conditional_mod) {
> +         /* Sandybridge-specific IF with embedded comparison */
> +         emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
> +                  if_inst->conditional_mod));
> +         sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
> +         sel->predicate = BRW_PREDICATE_NORMAL;
> +      } else {
> +         /* Separate CMP and IF instructions */
> +         sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
> +         sel->predicate = if_inst->predicate;
> +         sel->predicate_inverse = if_inst->predicate_inverse;
> +      }
> +
> +      return true;
> +   }
> +
> +   return false;
> +}
> +
> +void
> +fs_god::visit(ir_if *ir)
> +{
> +   if (try_opt_frontfacing_ternary(ir))
> +      return;
> +
> +   /* Don't point the annotation at the if statement, because then it plus
> +    * the then and else blocks get printed.
> +    */
> +   this->base_ir = ir->condition;
> +
> +   if (brw->gen == 6) {
> +      emit_if_gen6(ir);
> +   } else {
> +      emit_bool_to_cond_code(ir->condition);
> +
> +      emit(IF(BRW_PREDICATE_NORMAL));
> +   }
> +
> +   foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
> +      this->base_ir = ir_;
> +      ir_->accept(this);
> +   }
> +
> +   if (!ir->else_instructions.is_empty()) {
> +      emit(BRW_OPCODE_ELSE);
> +
> +      foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
> +        this->base_ir = ir_;
> +        ir_->accept(this);
> +      }
> +   }
> +
> +   emit(BRW_OPCODE_ENDIF);
> +
> +   if (!try_replace_with_sel() && brw->gen < 6) {
> +      no16("Can't support (non-uniform) control flow on SIMD16\n");
> +   }
> +}
> +
> +void
> +fs_god::visit(ir_loop *ir)
> +{
> +   if (brw->gen < 6) {
> +      no16("Can't support (non-uniform) control flow on SIMD16\n");
> +   }
> +
> +   this->base_ir = NULL;
> +   emit(BRW_OPCODE_DO);
> +
> +   foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
> +      this->base_ir = ir_;
> +      ir_->accept(this);
> +   }
> +
> +   this->base_ir = NULL;
> +   emit(BRW_OPCODE_WHILE);
> +}
> +
> +void
> +fs_god::visit(ir_loop_jump *ir)
> +{
> +   switch (ir->mode) {
> +   case ir_loop_jump::jump_break:
> +      emit(BRW_OPCODE_BREAK);
> +      break;
> +   case ir_loop_jump::jump_continue:
> +      emit(BRW_OPCODE_CONTINUE);
> +      break;
> +   }
> +}
> +
> +void
> +fs_god::visit_atomic_counter_intrinsic(ir_call *ir)
> +{
> +   ir_dereference *deref = static_cast<ir_dereference *>(
> +      ir->actual_parameters.get_head());
> +   ir_variable *location = deref->variable_referenced();
> +   unsigned surf_index = (stage_prog_data->binding_table.abo_start +
> +                          location->data.binding);
> +
> +   /* Calculate the surface offset */
> +   fs_reg offset = vgrf(glsl_type::uint_type);
> +   ir_dereference_array *deref_array = deref->as_dereference_array();
> +
> +   if (deref_array) {
> +      deref_array->array_index->accept(this);
> +
> +      fs_reg tmp = vgrf(glsl_type::uint_type);
> +      emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
> +      emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
> +   } else {
> +      offset = fs_reg(location->data.atomic.offset);
> +   }
> +
> +   /* Emit the appropriate machine instruction */
> +   const char *callee = ir->callee->function_name();
> +   ir->return_deref->accept(this);
> +   fs_reg dst = this->result;
> +
> +   if (!strcmp("__intrinsic_atomic_read", callee)) {
> +      emit_untyped_surface_read(surf_index, dst, offset);
> +
> +   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
> +      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
> +                          fs_reg(), fs_reg());
> +
> +   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
> +      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
> +                          fs_reg(), fs_reg());
> +   }
> +}
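
The surface offset is just index * counter-size + declared offset.  A
tiny sketch, assuming 4-byte counters (ATOMIC_COUNTER_SIZE) and a
hypothetical declaration layout(binding = 0, offset = 8) uniform
atomic_uint c[4]:

    #include <cassert>

    static unsigned abo_offset(unsigned array_index)
    {
       const unsigned counter_size = 4;  /* ATOMIC_COUNTER_SIZE, assumed */
       const unsigned base_offset = 8;   /* location->data.atomic.offset */
       return array_index * counter_size + base_offset;
    }

    int main()
    {
       assert(abo_offset(3) == 20);   /* c[3] sits 20 bytes into the ABO */
       return 0;
    }
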
> +
> +void
> +fs_god::visit(ir_call *ir)
> +{
> +   const char *callee = ir->callee->function_name();
> +
> +   if (!strcmp("__intrinsic_atomic_read", callee) ||
> +       !strcmp("__intrinsic_atomic_increment", callee) ||
> +       !strcmp("__intrinsic_atomic_predecrement", callee)) {
> +      visit_atomic_counter_intrinsic(ir);
> +   } else {
> +      unreachable("Unsupported intrinsic.");
> +   }
> +}
> +
> +void
> +fs_god::visit(ir_return *)
> +{
> +   unreachable("FINISHME");
> +}
> +
> +void
> +fs_god::visit(ir_function *ir)
> +{
> +   /* Ignore function bodies other than main() -- we shouldn't see calls to
> +    * them, since they should all have been inlined by the time we get
> +    * here.
> +    */
> +   if (strcmp(ir->name, "main") == 0) {
> +      const ir_function_signature *sig;
> +      exec_list empty;
> +
> +      sig = ir->matching_signature(NULL, &empty, false);
> +
> +      assert(sig);
> +
> +      foreach_in_list(ir_instruction, ir_, &sig->body) {
> +        this->base_ir = ir_;
> +        ir_->accept(this);
> +      }
> +   }
> +}
> +
> +void
> +fs_god::visit(ir_function_signature *)
> +{
> +   unreachable("not reached");
> +}
> +
> +void
> +fs_god::visit(ir_emit_vertex *)
> +{
> +   unreachable("not reached");
> +}
> +
> +void
> +fs_god::visit(ir_end_primitive *)
> +{
> +   unreachable("not reached");
> +}
> +
> +void
> +fs_god::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
> +                            fs_reg dst, fs_reg offset, fs_reg src0,
> +                            fs_reg src1)
> +{
> +   int reg_width = dispatch_width / 8;
> +   int length = 0;
> +
> +   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
> +
> +   sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> +   /* Initialize the sample mask in the message header. */
> +   emit(MOV(sources[0], fs_reg(0u)))
> +      ->force_writemask_all = true;
> +
> +   if (stage == MESA_SHADER_FRAGMENT) {
> +      if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
> +         emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
> +            ->force_writemask_all = true;
> +      } else {
> +         emit(MOV(component(sources[0], 7),
> +                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
> +            ->force_writemask_all = true;
> +      }
> +   } else {
> +      /* The execution mask is part of the side-band information sent together with
> +       * the message payload to the data port. It's implicitly ANDed with the sample
> +       * mask sent in the header to compute the actual set of channels that execute
> +       * the atomic operation.
> +       */
> +      assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
> +      emit(MOV(component(sources[0], 7),
> +               fs_reg(0xffffu)))->force_writemask_all = true;
> +   }
> +   length++;
> +
> +   /* Set the atomic operation offset. */
> +   sources[1] = vgrf(glsl_type::uint_type);
> +   emit(MOV(sources[1], offset));
> +   length++;
> +
> +   /* Set the atomic operation arguments. */
> +   if (src0.file != BAD_FILE) {
> +      sources[length] = vgrf(glsl_type::uint_type);
> +      emit(MOV(sources[length], src0));
> +      length++;
> +   }
> +
> +   if (src1.file != BAD_FILE) {
> +      sources[length] = vgrf(glsl_type::uint_type);
> +      emit(MOV(sources[length], src1));
> +      length++;
> +   }
> +
> +   int mlen = 1 + (length - 1) * reg_width;
> +   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> +                               BRW_REGISTER_TYPE_UD);
> +   emit(LOAD_PAYLOAD(src_payload, sources, length));
> +
> +   /* Emit the instruction. */
> +   fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
> +                        fs_reg(atomic_op), fs_reg(surf_index));
> +   inst->mlen = mlen;
> +}
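
Decoding the mlen math: the header always occupies one register, while
each of the remaining (length - 1) payload elements occupies reg_width
registers.  So a SIMD16 atomic increment (header plus offset,
length == 2) gets mlen = 1 + 1 * 2 = 3, and a SIMD8 two-source atomic
(header, offset, src0, src1, length == 4) would get
mlen = 1 + 3 * 1 = 4.
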
> +
> +void
> +fs_god::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
> +                                  fs_reg offset)
> +{
> +   int reg_width = dispatch_width / 8;
> +
> +   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
> +
> +   sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> +   /* Initialize the sample mask in the message header. */
> +   emit(MOV(sources[0], fs_reg(0u)))
> +      ->force_writemask_all = true;
> +
> +   if (stage == MESA_SHADER_FRAGMENT) {
> +      if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
> +         emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
> +            ->force_writemask_all = true;
> +      } else {
> +         emit(MOV(component(sources[0], 7),
> +                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
> +            ->force_writemask_all = true;
> +      }
> +   } else {
> +      /* The execution mask is part of the side-band information sent together with
> +       * the message payload to the data port. It's implicitly ANDed with the sample
> +       * mask sent in the header to compute the actual set of channels that execute
> +       * the surface read.
> +       */
> +      assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
> +      emit(MOV(component(sources[0], 7),
> +               fs_reg(0xffffu)))->force_writemask_all = true;
> +   }
> +
> +   /* Set the surface read offset. */
> +   sources[1] = vgrf(glsl_type::uint_type);
> +   emit(MOV(sources[1], offset));
> +
> +   int mlen = 1 + reg_width;
> +   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> +                               BRW_REGISTER_TYPE_UD);
> +   fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2));
> +
> +   /* Emit the instruction. */
> +   inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
> +               fs_reg(surf_index));
> +   inst->mlen = mlen;
> +}
> +
> +fs_inst *
> +fs_god::emit(fs_inst *inst)
> +{
> +   if (dispatch_width == 16 && inst->exec_size == 8)
> +      inst->force_uncompressed = true;
> +
> +   inst->annotation = this->current_annotation;
> +   inst->ir = this->base_ir;
> +
> +   this->instructions.push_tail(inst);
> +
> +   return inst;
> +}
> +
> +void
> +fs_god::emit(exec_list list)
> +{
> +   foreach_in_list_safe(fs_inst, inst, &list) {
> +      inst->exec_node::remove();
> +      emit(inst);
> +   }
> +}
> +
> +/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
> +void
> +fs_god::emit_dummy_fs()
> +{
> +   int reg_width = dispatch_width / 8;
> +
> +   /* Everyone's favorite color. */
> +   const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
> +   for (int i = 0; i < 4; i++) {
> +      emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
> +                      dispatch_width), fs_reg(color[i])));
> +   }
> +
> +   fs_inst *write;
> +   write = emit(FS_OPCODE_FB_WRITE);
> +   write->eot = true;
> +   if (brw->gen >= 6) {
> +      write->base_mrf = 2;
> +      write->mlen = 4 * reg_width;
> +   } else {
> +      write->header_present = true;
> +      write->base_mrf = 0;
> +      write->mlen = 2 + 4 * reg_width;
> +   }
> +
> +   /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
> +    * varying to avoid GPU hangs, so set that.
> +    */
> +   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
> +   wm_prog_data->num_varying_inputs = brw->gen < 6 ? 1 : 0;
> +   memset(wm_prog_data->urb_setup, -1,
> +          sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
> +
> +   /* We don't have any uniforms. */
> +   stage_prog_data->nr_params = 0;
> +   stage_prog_data->nr_pull_params = 0;
> +   stage_prog_data->curb_read_length = 0;
> +   stage_prog_data->dispatch_grf_start_reg = 2;
> +   wm_prog_data->dispatch_grf_start_reg_16 = 2;
> +   grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
> +
> +   calculate_cfg();
> +}
> +
> +/* The register location here is relative to the start of the URB
> + * data.  It will get adjusted to be a real location before
> + * generate_code() time.
> + */
> +struct brw_reg
> +fs_god::interp_reg(int location, int channel)
> +{
> +   assert(stage == MESA_SHADER_FRAGMENT);
> +   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> +   int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
> +   int stride = (channel & 1) * 4;
> +
> +   assert(prog_data->urb_setup[location] != -1);
> +
> +   return brw_vec1_grf(regnr, stride);
> +}
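
Put differently, each varying slot occupies two registers with two
channels packed per register: channels 0 and 1 live in the slot's
first register at sub-offsets 0 and 4 floats, channels 2 and 3 in the
second.  E.g. with urb_setup[location] == 1, channel 3 lands at
regnr = 1 * 2 + 3 / 2 = 3 with sub-offset (3 & 1) * 4 = 4.
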
> +
> +/** Emits the interpolation for the varying inputs. */
> +void
> +fs_god::emit_interpolation_setup_gen4()
> +{
> +   this->current_annotation = "compute pixel centers";
> +   this->pixel_x = vgrf(glsl_type::uint_type);
> +   this->pixel_y = vgrf(glsl_type::uint_type);
> +   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
> +   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
> +
> +   emit(FS_OPCODE_PIXEL_X, this->pixel_x);
> +   emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
> +
> +   this->current_annotation = "compute pixel deltas from v0";
> +   if (brw->has_pln) {
> +      this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> +         vgrf(glsl_type::vec2_type);
> +      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> +         offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1);
> +   } else {
> +      this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> +         vgrf(glsl_type::float_type);
> +      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> +         vgrf(glsl_type::float_type);
> +   }
> +   emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> +            this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
> +   emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> +            this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
> +
> +   this->current_annotation = "compute pos.w and 1/pos.w";
> +   /* Compute wpos.w.  It's always in our setup, since it's needed to
> +    * interpolate the other attributes.
> +    */
> +   this->wpos_w = vgrf(glsl_type::float_type);
> +   emit(FS_OPCODE_LINTERP, wpos_w,
> +        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> +        this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> +       interp_reg(VARYING_SLOT_POS, 3));
> +   /* Compute the pixel 1/W value from wpos.w. */
> +   this->pixel_w = vgrf(glsl_type::float_type);
> +   emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
> +   this->current_annotation = NULL;
> +}
> +
> +/** Emits the interpolation for the varying inputs. */
> +void
> +fs_god::emit_interpolation_setup_gen6()
> +{
> +   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
> +
> +   /* If the pixel centers end up used, the setup is the same as for gen4. */
> +   this->current_annotation = "compute pixel centers";
> +   fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
> +   fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
> +   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
> +   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
> +   emit(ADD(int_pixel_x,
> +            fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
> +            fs_reg(brw_imm_v(0x10101010))));
> +   emit(ADD(int_pixel_y,
> +            fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
> +            fs_reg(brw_imm_v(0x11001100))));
> +
> +   /* As of gen6, we can no longer mix float and int sources.  We have
> +    * to turn the integer pixel centers into floats for their actual
> +    * use.
> +    */
> +   this->pixel_x = vgrf(glsl_type::float_type);
> +   this->pixel_y = vgrf(glsl_type::float_type);
> +   emit(MOV(this->pixel_x, int_pixel_x));
> +   emit(MOV(this->pixel_y, int_pixel_y));
> +
> +   this->current_annotation = "compute pos.w";
> +   this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
> +   this->wpos_w = vgrf(glsl_type::float_type);
> +   emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
> +
> +   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
> +      uint8_t reg = payload.barycentric_coord_reg[i];
> +      this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
> +      this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
> +   }
> +
> +   this->current_annotation = NULL;
> +}
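
The two magic constants are V-type immediates (eight signed 4-bit
lanes packed into a dword, lowest lane first, if I recall the V format
correctly).  Decoding them shows the per-pixel offsets added to each
replicated 2x2 subspan origin:

    #include <cassert>
    #include <cstdint>

    static int nibble(uint32_t v, int lane)
    {
       return (v >> (4 * lane)) & 0xf;
    }

    int main()
    {
       /* X offsets alternate 0,1 across a quad; Y offsets go 0,0,1,1. */
       for (int i = 0; i < 8; i++) {
          assert(nibble(0x10101010u, i) == (i & 1));
          assert(nibble(0x11001100u, i) == ((i >> 1) & 1));
       }
       return 0;
    }
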
> +
> +int
> +fs_god::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
> +                            bool use_2nd_half)
> +{
> +   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> +   fs_inst *inst;
> +
> +   if (color.file == BAD_FILE) {
> +      return 4 * (dispatch_width / 8);
> +   }
> +
> +   uint8_t colors_enabled;
> +   if (components == 0) {
> +      /* We want to write one component to the alpha channel */
> +      colors_enabled = 0x8;
> +   } else {
> +      /* Enable the first 'components' channels */
> +      colors_enabled = (1 << components) - 1;
> +   }
> +
> +   if (dispatch_width == 8 || (brw->gen >= 6 && !do_dual_src)) {
> +      /* SIMD8 write looks like:
> +       * m + 0: r0
> +       * m + 1: r1
> +       * m + 2: g0
> +       * m + 3: g1
> +       *
> +       * gen6 SIMD16 DP write looks like:
> +       * m + 0: r0
> +       * m + 1: r1
> +       * m + 2: g0
> +       * m + 3: g1
> +       * m + 4: b0
> +       * m + 5: b1
> +       * m + 6: a0
> +       * m + 7: a1
> +       */
> +      int len = 0;
> +      for (unsigned i = 0; i < 4; ++i) {
> +         if (colors_enabled & (1 << i)) {
> +            dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8),
> +                              color.type, color.width);
> +            inst = emit(MOV(dst[len], offset(color, i)));
> +            inst->saturate = key->clamp_fragment_color;
> +         } else if (color.width == 16) {
> +            /* We need two BAD_FILE slots for a 16-wide color */
> +            len++;
> +         }
> +         len++;
> +      }
> +      return len;
> +   } else if (brw->gen >= 6 && do_dual_src) {
> +      /* SIMD16 dual source blending for gen6+.
> +       *
> +       * From the SNB PRM, volume 4, part 1, page 193:
> +       *
> +       * "The dual source render target messages only have SIMD8 forms due to
> +       *  maximum message length limitations. SIMD16 pixel shaders must send two
> +       *  of these messages to cover all of the pixels. Each message contains
> +       *  two colors (4 channels each) for each pixel in the message payload."
> +       *
> +       * So in SIMD16 dual source blending we send two SIMD8 messages,
> +       * and each message calls this function twice (once per color
> +       * involved), so each call writes only 4 registers.  Note that the
> +       * second SIMD8 message needs to read color data from the 2nd half
> +       * of the color registers, so it calls this with use_2nd_half = true.
> +       */
> +      for (unsigned i = 0; i < 4; ++i) {
> +         if (colors_enabled & (1 << i)) {
> +            dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
> +            inst = emit(MOV(dst[i], half(offset(color, i),
> +                                         use_2nd_half ? 1 : 0)));
> +            inst->saturate = key->clamp_fragment_color;
> +            if (use_2nd_half)
> +               inst->force_sechalf = true;
> +         }
> +      }
> +      return 4;
> +   } else {
> +      /* pre-gen6 SIMD16 single source DP write looks like:
> +       * m + 0: r0
> +       * m + 1: g0
> +       * m + 2: b0
> +       * m + 3: a0
> +       * m + 4: r1
> +       * m + 5: g1
> +       * m + 6: b1
> +       * m + 7: a1
> +       */
> +      for (unsigned i = 0; i < 4; ++i) {
> +         if (colors_enabled & (1 << i)) {
> +            dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
> +            inst = emit(MOV(dst[i], half(offset(color, i), 0)));
> +            inst->saturate = key->clamp_fragment_color;
> +
> +            dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type);
> +            inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
> +            inst->saturate = key->clamp_fragment_color;
> +            inst->force_sechalf = true;
> +         }
> +      }
> +      return 8;
> +   }
> +}
> +
> +static enum brw_conditional_mod
> +cond_for_alpha_func(GLenum func)
> +{
> +   switch (func) {
> +      case GL_GREATER:
> +         return BRW_CONDITIONAL_G;
> +      case GL_GEQUAL:
> +         return BRW_CONDITIONAL_GE;
> +      case GL_LESS:
> +         return BRW_CONDITIONAL_L;
> +      case GL_LEQUAL:
> +         return BRW_CONDITIONAL_LE;
> +      case GL_EQUAL:
> +         return BRW_CONDITIONAL_EQ;
> +      case GL_NOTEQUAL:
> +         return BRW_CONDITIONAL_NEQ;
> +      default:
> +         unreachable("Not reached");
> +   }
> +}
> +
> +/**
> + * Alpha test support for when we compile it into the shader instead
> + * of using the normal fixed-function alpha test.
> + */
> +void
> +fs_god::emit_alpha_test()
> +{
> +   assert(stage == MESA_SHADER_FRAGMENT);
> +   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> +   this->current_annotation = "Alpha test";
> +
> +   fs_inst *cmp;
> +   if (key->alpha_test_func == GL_ALWAYS)
> +      return;
> +
> +   if (key->alpha_test_func == GL_NEVER) {
> +      /* f0.1 = 0 */
> +      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
> +                                      BRW_REGISTER_TYPE_UW));
> +      cmp = emit(CMP(reg_null_f, some_reg, some_reg,
> +                     BRW_CONDITIONAL_NEQ));
> +   } else {
> +      /* RT0 alpha */
> +      fs_reg color = offset(outputs[0], 3);
> +
> +      /* f0.1 &= func(color, ref) */
> +      cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
> +                     cond_for_alpha_func(key->alpha_test_func)));
> +   }
> +   cmp->predicate = BRW_PREDICATE_NORMAL;
> +   cmp->flag_subreg = 1;
> +}
> +
> +fs_inst *
> +fs_god::emit_single_fb_write(fs_reg color0, fs_reg color1,
> +                             fs_reg src0_alpha, unsigned components,
> +                             bool use_2nd_half)
> +{
> +   assert(stage == MESA_SHADER_FRAGMENT);
> +   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> +   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> +
> +   this->current_annotation = "FB write header";
> +   bool header_present = true;
> +   int reg_size = dispatch_width / 8;
> +
> +   /* We can potentially have a message length of up to 15, so we have to set
> +    * base_mrf to either 0 or 1 in order to fit in m0..m15.
> +    */
> +   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
> +   int length = 0;
> +
> +   /* From the Sandy Bridge PRM, volume 4, page 198:
> +    *
> +    *     "Dispatched Pixel Enables. One bit per pixel indicating
> +    *      which pixels were originally enabled when the thread was
> +    *      dispatched. This field is only required for the end-of-
> +    *      thread message and on all dual-source messages."
> +    */
> +   if (brw->gen >= 6 &&
> +       (brw->is_haswell || brw->gen >= 8 || !prog_data->uses_kill) &&
> +       color1.file == BAD_FILE &&
> +       key->nr_color_regions == 1) {
> +      header_present = false;
> +   }
> +
> +   if (header_present)
> +      /* Allocate 2 registers for a header */
> +      length += 2;
> +
> +   if (payload.aa_dest_stencil_reg) {
> +      sources[length] = fs_reg(GRF, alloc.allocate(1));
> +      emit(MOV(sources[length],
> +               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
> +      length++;
> +   }
> +
> +   prog_data->uses_omask =
> +      prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
> +   if (prog_data->uses_omask) {
> +      this->current_annotation = "FB write oMask";
> +      assert(this->sample_mask.file != BAD_FILE);
> +      /* Hand over gl_SampleMask.  Only the lower 16 bits are relevant.
> +       * Since it's unsigned single words, one vgrf is always 16-wide.
> +       */
> +      sources[length] = fs_reg(GRF, alloc.allocate(1),
> +                               BRW_REGISTER_TYPE_UW, 16);
> +      emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
> +      length++;
> +   }
> +
> +   if (color0.file == BAD_FILE) {
> +      /* Even if there are no color buffers enabled, we still need to send
> +       * alpha out the pipeline to our null renderbuffer to support
> +       * alpha-testing, alpha-to-coverage, and so on.
> +       */
> +      length += setup_color_payload(sources + length, this->outputs[0], 0,
> +                                    false);
> +   } else if (color1.file == BAD_FILE) {
> +      if (src0_alpha.file != BAD_FILE) {
> +         sources[length] = fs_reg(GRF, alloc.allocate(reg_size),
> +                                  src0_alpha.type, src0_alpha.width);
> +         fs_inst *inst = emit(MOV(sources[length], src0_alpha));
> +         inst->saturate = key->clamp_fragment_color;
> +         length++;
> +      }
> +
> +      length += setup_color_payload(sources + length, color0, components,
> +                                    false);
> +   } else {
> +      length += setup_color_payload(sources + length, color0, components,
> +                                    use_2nd_half);
> +      length += setup_color_payload(sources + length, color1, components,
> +                                    use_2nd_half);
> +   }
> +
> +   if (source_depth_to_render_target) {
> +      if (brw->gen == 6) {
> +        /* For outputting oDepth on gen6, SIMD8 writes have to be
> +         * used.  This would require SIMD8 moves of each half to
> +         * message regs, kind of like pre-gen5 SIMD16 FB writes.
> +         * Just bail on doing so for now.
> +         */
> +        no16("Missing support for simd16 depth writes on gen6\n");
> +      }
> +
> +      sources[length] = vgrf(glsl_type::float_type);
> +      if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
> +        /* Hand over gl_FragDepth. */
> +        assert(this->frag_depth.file != BAD_FILE);
> +        emit(MOV(sources[length], this->frag_depth));
> +      } else {
> +        /* Pass through the payload depth. */
> +        emit(MOV(sources[length],
> +                  fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
> +      }
> +      length++;
> +   }
> +
> +   if (payload.dest_depth_reg) {
> +      sources[length] = vgrf(glsl_type::float_type);
> +      emit(MOV(sources[length],
> +               fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
> +      length++;
> +   }
> +
> +   fs_inst *load;
> +   fs_inst *write;
> +   if (brw->gen >= 7) {
> +      /* Send from the GRF */
> +      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
> +      load = emit(LOAD_PAYLOAD(payload, sources, length));
> +      payload.reg = alloc.allocate(load->regs_written);
> +      payload.width = dispatch_width;
> +      load->dst = payload;
> +      write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
> +      write->base_mrf = -1;
> +   } else {
> +      /* Send from the MRF */
> +      load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
> +                               sources, length));
> +      write = emit(FS_OPCODE_FB_WRITE);
> +      write->exec_size = dispatch_width;
> +      write->base_mrf = 1;
> +   }
> +
> +   write->mlen = load->regs_written;
> +   write->header_present = header_present;
> +   if (prog_data->uses_kill) {
> +      write->predicate = BRW_PREDICATE_NORMAL;
> +      write->flag_subreg = 1;
> +   }
> +   return write;
> +}
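
The length bookkeeping above is easy to lose in the diff, so here is a
minimal standalone model of it (the type and function names below are
made up for illustration; they are not the driver's):

    /* Each optional section of the FB write message appends to a running
     * length, which has to fit in m0..m15 with base_mrf 0 or 1.
     */
    #include <cassert>

    struct fb_write_layout {
       bool header_present;   /* 2 regs of message header        */
       bool aa_dest_stencil;  /* 1 reg                           */
       bool omask;            /* 1 reg of unsigned words         */
       int  color_regs;       /* regs taken by the color payload */
       bool source_depth;     /* 1 reg                           */
       bool dest_depth;       /* 1 reg                           */
    };

    static int
    fb_write_length(const fb_write_layout &l)
    {
       int length = 0;
       if (l.header_present)
          length += 2;
       if (l.aa_dest_stencil)
          length++;
       if (l.omask)
          length++;
       length += l.color_regs;
       if (l.source_depth)
          length++;
       if (l.dest_depth)
          length++;
       assert(length <= 15);
       return length;
    }
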
> +
> +void
> +fs_god::emit_fb_writes()
> +{
> +   assert(stage == MESA_SHADER_FRAGMENT);
> +   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> +   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> +
> +   fs_inst *inst = NULL;
> +   if (do_dual_src) {
> +      this->current_annotation = ralloc_asprintf(this->mem_ctx,
> +                                                "FB dual-source write");
> +      inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
> +                                  reg_undef, 4);
> +      inst->target = 0;
> +
> +      /* SIMD16 dual source blending requires sending two SIMD8 dual source
> +       * messages, where each message contains color data for 8 pixels. Color
> +       * data for the first group of pixels is stored in the "lower" half of
> +       * the color registers, so in SIMD16, the previous message did:
> +       * m + 0: r0
> +       * m + 1: g0
> +       * m + 2: b0
> +       * m + 3: a0
> +       *
> +       * Here goes the second message, which packs color data for the
> +       * remaining 8 pixels. Color data for these pixels is stored in the
> +       * "upper" half of the color registers, so we need to do:
> +       * m + 0: r1
> +       * m + 1: g1
> +       * m + 2: b1
> +       * m + 3: a1
> +       */
> +      if (dispatch_width == 16) {
> +         inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
> +                                     reg_undef, 4, true);
> +         inst->target = 0;
> +      }
> +
> +      prog_data->dual_src_blend = true;
> +   } else {
> +      for (int target = 0; target < key->nr_color_regions; target++) {
> +         /* Skip over outputs that weren't written. */
> +         if (this->outputs[target].file == BAD_FILE)
> +            continue;
> +
> +         this->current_annotation = ralloc_asprintf(this->mem_ctx,
> +                                                    "FB write target %d",
> +                                                    target);
> +         fs_reg src0_alpha;
> +         if (brw->gen >= 6 && key->replicate_alpha && target != 0)
> +            src0_alpha = offset(outputs[0], 3);
> +
> +         inst = emit_single_fb_write(this->outputs[target], reg_undef,
> +                                     src0_alpha,
> +                                     this->output_components[target]);
> +         inst->target = target;
> +      }
> +   }
> +
> +   if (inst == NULL) {
> +      /* Even if there are no color buffers enabled, we still need to send
> +       * alpha out the pipeline to our null renderbuffer to support
> +       * alpha-testing, alpha-to-coverage, and so on.
> +       */
> +      inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0);
> +      inst->target = 0;
> +   }
> +
> +   inst->eot = true;
> +   this->current_annotation = NULL;
> +}
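
To make the SIMD16 dual-source split comment concrete, a small sketch in
plain C++ (the channel-major storage and the send stub are assumptions
for illustration, not the actual message layout):

    #include <cstdio>

    /* Stand-in for one SIMD8 dual-source FB-write message. */
    static void
    send_simd8_dual_src(const float *r, const float *g, const float *b,
                        const float *a, bool eot)
    {
       std::printf("dual-src write r=%p g=%p b=%p a=%p eot=%d\n",
                   (const void *)r, (const void *)g, (const void *)b,
                   (const void *)a, (int)eot);
    }

    static void
    emit_dual_src_simd16(const float r[16], const float g[16],
                         const float b[16], const float a[16])
    {
       send_simd8_dual_src(&r[0], &g[0], &b[0], &a[0], false); /* pixels 0-7  */
       send_simd8_dual_src(&r[8], &g[8], &b[8], &a[8], true);  /* pixels 8-15 */
    }
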
> +
> +void
> +fs_god::setup_uniform_clipplane_values()
> +{
> +   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
> +   const struct brw_vue_prog_key *key =
> +      (const struct brw_vue_prog_key *) this->key;
> +
> +   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
> +      this->userplane[i] = fs_reg(UNIFORM, uniforms);
> +      for (int j = 0; j < 4; ++j) {
> +         stage_prog_data->param[uniforms + j] =
> +            (gl_constant_value *) &clip_planes[i][j];
> +      }
> +      uniforms += 4;
> +   }
> +}
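
The resulting uniform layout, modeled standalone (names invented for the
sketch): plane i's four components occupy uniform slots 4*i .. 4*i+3,
each slot pointing at the corresponding GL clip-plane component so the
value is picked up when the constants are uploaded:

    static void
    layout_clip_plane_uniforms(const float (*clip_planes)[4], int nr_planes,
                               const float **param, int *uniforms)
    {
       for (int i = 0; i < nr_planes; i++) {
          for (int j = 0; j < 4; j++)
             param[*uniforms + j] = &clip_planes[i][j]; /* one component per slot */
          *uniforms += 4;
       }
    }
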
> +
> +void
> +fs_god::compute_clip_distance()
> +{
> +   struct brw_vue_prog_data *vue_prog_data =
> +      (struct brw_vue_prog_data *) prog_data;
> +   const struct brw_vue_prog_key *key =
> +      (const struct brw_vue_prog_key *) this->key;
> +
> +   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
> +    *
> +    *     "If a linked set of shaders forming the vertex stage contains no
> +    *     static write to gl_ClipVertex or gl_ClipDistance, but the
> +    *     application has requested clipping against user clip planes through
> +    *     the API, then the coordinate written to gl_Position is used for
> +    *     comparison against the user clip planes."
> +    *
> +    * This function is only called if the shader didn't write to
> +    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
> +    * if the user wrote to it; otherwise we use gl_Position.
> +    */
> +
> +   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
> +   if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
> +      clip_vertex = VARYING_SLOT_POS;
> +
> +   /* If the clip vertex isn't written, skip this.  Typically this means
> +    * the GS will set up clipping. */
> +   if (outputs[clip_vertex].file == BAD_FILE)
> +      return;
> +
> +   setup_uniform_clipplane_values();
> +
> +   current_annotation = "user clip distances";
> +
> +   this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
> +   this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
> +
> +   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
> +      fs_reg u = userplane[i];
> +      fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
> +      output.reg_offset = i & 3;
> +
> +      emit(MUL(output, outputs[clip_vertex], u));
> +      for (int j = 1; j < 4; j++) {
> +         u.reg = userplane[i].reg + j;
> +         emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
> +      }
> +   }
> +}
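
Arithmetically, the MUL plus three MADs per plane compute a 4-component
dot product; as a scalar reference:

    /* clip_distance = dot(clip_vertex, plane) */
    static float
    clip_distance(const float clip_vertex[4], const float plane[4])
    {
       float d = clip_vertex[0] * plane[0];       /* MUL               */
       for (int j = 1; j < 4; j++)
          d = clip_vertex[j] * plane[j] + d;      /* MAD: a*b + accum  */
       return d;
    }
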
> +
> +void
> +fs_god::emit_urb_writes()
> +{
> +   int slot, urb_offset, length;
> +   struct brw_vs_prog_data *vs_prog_data =
> +      (struct brw_vs_prog_data *) prog_data;
> +   const struct brw_vs_prog_key *key =
> +      (const struct brw_vs_prog_key *) this->key;
> +   const GLbitfield64 psiz_mask =
> +      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
> +   const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
> +   bool flush;
> +   fs_reg sources[8];
> +
> +   /* Lower legacy fixed-function and gl_ClipVertex clipping to clip
> +    * distances */
> +   if (key->base.userclip_active && !prog->UsesClipDistanceOut)
> +      compute_clip_distance();
> +
> +   /* If we don't have any valid slots to write, just do a minimal urb write
> +    * send to terminate the shader. */
> +   if (vue_map->slots_valid == 0) {
> +
> +      fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> +      fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
> +                                                      BRW_REGISTER_TYPE_UD))));
> +      inst->force_writemask_all = true;
> +
> +      inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> +      inst->eot = true;
> +      inst->mlen = 1;
> +      inst->offset = 1;
> +      return;
> +   }
> +
> +   length = 0;
> +   urb_offset = 0;
> +   flush = false;
> +   for (slot = 0; slot < vue_map->num_slots; slot++) {
> +      fs_reg reg, src, zero;
> +
> +      int varying = vue_map->slot_to_varying[slot];
> +      switch (varying) {
> +      case VARYING_SLOT_PSIZ:
> +
> +         /* The point size varying slot is the vue header and is always in
> +          * the vue map.  But often none of the special varyings that live
> +          * there are written, and in that case we can skip writing to the
> +          * vue header, provided the corresponding state properly clamps the
> +          * values further down the pipeline. */
> +         if ((vue_map->slots_valid & psiz_mask) == 0) {
> +            assert(length == 0);
> +            urb_offset++;
> +            break;
> +         }
> +
> +         zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> +         emit(MOV(zero, fs_reg(0u)));
> +
> +         sources[length++] = zero;
> +         if (vue_map->slots_valid & VARYING_BIT_LAYER)
> +            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
> +         else
> +            sources[length++] = zero;
> +
> +         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
> +            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
> +         else
> +            sources[length++] = zero;
> +
> +         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
> +            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
> +         else
> +            sources[length++] = zero;
> +         break;
> +
> +      case BRW_VARYING_SLOT_NDC:
> +      case VARYING_SLOT_EDGE:
> +         unreachable("unexpected scalar vs output");
> +         break;
> +
> +      case BRW_VARYING_SLOT_PAD:
> +         break;
> +
> +      default:
> +         /* gl_Position is always in the vue map, but isn't always written by
> +          * the shader.  Other varyings (clip distances) get added to the vue
> +          * map but don't always get written.  In those cases, the
> +          * corresponding this->outputs[] slot will be invalid and we can
> +          * skip the urb write for the varying.  If we've already queued up
> +          * a vue slot for writing we flush a mlen 5 urb write, otherwise we
> +          * just advance the urb_offset.
> +          */
> +         if (this->outputs[varying].file == BAD_FILE) {
> +            if (length > 0)
> +               flush = true;
> +            else
> +               urb_offset++;
> +            break;
> +         }
> +
> +         if ((varying == VARYING_SLOT_COL0 ||
> +              varying == VARYING_SLOT_COL1 ||
> +              varying == VARYING_SLOT_BFC0 ||
> +              varying == VARYING_SLOT_BFC1) &&
> +             key->clamp_vertex_color) {
> +            /* We need to clamp these guys, so do a saturating MOV into a
> +             * temp register and use that for the payload.
> +             */
> +            for (int i = 0; i < 4; i++) {
> +               reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
> +               src = offset(this->outputs[varying], i);
> +               fs_inst *inst = emit(MOV(reg, src));
> +               inst->saturate = true;
> +               sources[length++] = reg;
> +            }
> +         } else {
> +            for (int i = 0; i < 4; i++)
> +               sources[length++] = offset(this->outputs[varying], i);
> +         }
> +         break;
> +      }
> +
> +      current_annotation = "URB write";
> +
> +      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
> +       * the last slot, or if we need to flush (see the BAD_FILE varying case
> +       * above), emit a URB write send now to flush out the data.
> +       */
> +      bool last = slot == vue_map->num_slots - 1;
> +      if (length == 8 || last)
> +         flush = true;
> +      if (flush) {
> +         fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
> +         fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
> +                                 BRW_REGISTER_TYPE_F);
> +
> +         /* We need WE_all on the MOV for the message header (the URB handles)
> +          * so do a MOV to a dummy register and set force_writemask_all on the
> +          * MOV.  LOAD_PAYLOAD will preserve that.
> +          */
> +         fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
> +                               BRW_REGISTER_TYPE_UD);
> +         fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
> +                                                       BRW_REGISTER_TYPE_UD))));
> +         inst->force_writemask_all = true;
> +         payload_sources[0] = dummy;
> +
> +         memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
> +         emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
> +
> +         inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> +         inst->eot = last;
> +         inst->mlen = length + 1;
> +         inst->offset = urb_offset;
> +         urb_offset = slot + 1;
> +         length = 0;
> +         flush = false;
> +      }
> +   }
> +}
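
The flush policy in the loop above is worth restating outside the diff.
A simplified standalone model (it omits the message-header setup and the
header-only write for an empty VUE map; names are illustrative):

    #include <vector>

    /* Stand-in for the SHADER_OPCODE_URB_WRITE_SIMD8 send. */
    static void
    flush_urb_send(int urb_offset, int mlen, bool eot)
    {
       (void)urb_offset; (void)mlen; (void)eot;
    }

    static void
    write_vue_slots(const std::vector<bool> &slot_written)
    {
       int length = 0, urb_offset = 0;
       for (int slot = 0; slot < (int)slot_written.size(); slot++) {
          bool flush = false;
          if (!slot_written[slot]) {
             if (length > 0)
                flush = true;     /* gap with data queued: flush it  */
             else
                urb_offset++;     /* nothing queued: just skip ahead */
          } else {
             length += 4;         /* one vec4 slot = 4 registers     */
          }
          const bool last = slot == (int)slot_written.size() - 1;
          if (length == 8 || last)
             flush = true;        /* two slots queued, or out of slots */
          if (flush && length > 0) {
             flush_urb_send(urb_offset, length + 1 /* header */, last);
             urb_offset = slot + 1;
             length = 0;
          }
       }
    }
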
> +
> +void
> +fs_god::resolve_ud_negate(fs_reg *reg)
> +{
> +   if (reg->type != BRW_REGISTER_TYPE_UD ||
> +       !reg->negate)
> +      return;
> +
> +   fs_reg temp = vgrf(glsl_type::uint_type);
> +   emit(MOV(temp, *reg));
> +   *reg = temp;
> +}
> +
> +/**
> + * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
> + *
> + * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
> + * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
> + */
> +void
> +fs_god::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
> +{
> +   assert(brw->gen <= 5);
> +
> +   if (rvalue->type != glsl_type::bool_type)
> +      return;
> +
> +   fs_reg and_result = vgrf(glsl_type::bool_type);
> +   fs_reg neg_result = vgrf(glsl_type::bool_type);
> +   emit(AND(and_result, *reg, fs_reg(1)));
> +   emit(MOV(neg_result, negate(and_result)));
> +   *reg = neg_result;
> +}
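
For reference, the fixup reduces to one line of integer arithmetic: mask
off everything but the defined LSB, then negate, which yields 0 for
false and ~0 for true:

    static unsigned
    resolve_bool(unsigned raw_cmp)
    {
       return (unsigned)-(int)(raw_cmp & 1);   /* 0 -> 0x0, 1 -> 0xffffffff */
    }
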
> +
> +fs_god::fs_god(struct brw_context *brw,
> +               void *mem_ctx,
> +               const struct brw_wm_prog_key *key,
> +               struct brw_wm_prog_data *prog_data,
> +               struct gl_shader_program *shader_prog,
> +               struct gl_fragment_program *fp,
> +               unsigned dispatch_width)
> +   : backend_god(brw, shader_prog, &fp->Base, &prog_data->base,
> +                 MESA_SHADER_FRAGMENT),
> +     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
> +     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
> +     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
> +     key(key), prog_data(&prog_data->base),
> +     dispatch_width(dispatch_width), promoted_constants(0)
> +{
> +   this->mem_ctx = mem_ctx;
> +   init();
> +}
> +
> +fs_god::fs_god(struct brw_context *brw,
> +               void *mem_ctx,
> +               const struct brw_vs_prog_key *key,
> +               struct brw_vs_prog_data *prog_data,
> +               struct gl_shader_program *shader_prog,
> +               struct gl_vertex_program *cp,
> +               unsigned dispatch_width)
> +   : backend_god(brw, shader_prog, &cp->Base, &prog_data->base.base,
> +                 MESA_SHADER_VERTEX),
> +     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
> +     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
> +     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
> +     key(key), prog_data(&prog_data->base.base),
> +     dispatch_width(dispatch_width), promoted_constants(0)
> +{
> +   this->mem_ctx = mem_ctx;
> +   init();
> +}
> +
> +void
> +fs_god::init()
> +{
> +   switch (stage) {
> +   case MESA_SHADER_FRAGMENT:
> +      key_tex = &((const brw_wm_prog_key *) key)->tex;
> +      break;
> +   case MESA_SHADER_VERTEX:
> +   case MESA_SHADER_GEOMETRY:
> +      key_tex = &((const brw_vue_prog_key *) key)->tex;
> +      break;
> +   default:
> +      unreachable("unhandled shader stage");
> +   }
> +
> +   this->failed = false;
> +   this->simd16_unsupported = false;
> +   this->no16_msg = NULL;
> +   this->variable_ht = hash_table_ctor(0,
> +                                       hash_table_pointer_hash,
> +                                       hash_table_pointer_compare);
> +
> +   this->nir_locals = NULL;
> +   this->nir_globals = NULL;
> +
> +   memset(&this->payload, 0, sizeof(this->payload));
> +   memset(this->outputs, 0, sizeof(this->outputs));
> +   memset(this->output_components, 0, sizeof(this->output_components));
> +   this->source_depth_to_render_target = false;
> +   this->runtime_check_aads_emit = false;
> +   this->first_non_payload_grf = 0;
> +   this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
> +
> +   this->current_annotation = NULL;
> +   this->base_ir = NULL;
> +
> +   this->virtual_grf_start = NULL;
> +   this->virtual_grf_end = NULL;
> +   this->live_intervals = NULL;
> +   this->regs_live_at_ip = NULL;
> +
> +   this->uniforms = 0;
> +   this->last_scratch = 0;
> +   this->pull_constant_loc = NULL;
> +   this->push_constant_loc = NULL;
> +
> +   this->spilled_any_registers = false;
> +   this->do_dual_src = false;
> +
> +   if (dispatch_width == 8)
> +      this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
> +}
> +
> +fs_god::~fs_god()
> +{
> +   hash_table_dtor(this->variable_ht);
> +}
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
> index 502161d..dca6f56 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
> @@ -273,7 +273,7 @@ fs_live_variables::compute_start_end()
>     }
>  }
>
> -fs_live_variables::fs_live_variables(fs_visitor *v, const cfg_t *cfg)
> +fs_live_variables::fs_live_variables(fs_god *v, const cfg_t *cfg)
>     : v(v), cfg(cfg)
>  {
>     mem_ctx = ralloc_context(NULL);
> @@ -326,7 +326,7 @@ fs_live_variables::~fs_live_variables()
>  }
>
>  void
> -fs_visitor::invalidate_live_intervals()
> +fs_god::invalidate_live_intervals()
>  {
>     ralloc_free(live_intervals);
>     live_intervals = NULL;
> @@ -339,7 +339,7 @@ fs_visitor::invalidate_live_intervals()
>   * information about whole VGRFs.
>   */
>  void
> -fs_visitor::calculate_live_intervals()
> +fs_god::calculate_live_intervals()
>  {
>     if (this->live_intervals)
>        return;
> @@ -375,7 +375,7 @@ fs_live_variables::vars_interfere(int a, int b)
>  }
>
>  bool
> -fs_visitor::virtual_grf_interferes(int a, int b)
> +fs_god::virtual_grf_interferes(int a, int b)
>  {
>     return !(virtual_grf_end[a] <= virtual_grf_start[b] ||
>              virtual_grf_end[b] <= virtual_grf_start[a]);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.h b/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
> index c745706..27512de 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
> @@ -62,7 +62,7 @@ class fs_live_variables {
>  public:
>     DECLARE_RALLOC_CXX_OPERATORS(fs_live_variables)
>
> -   fs_live_variables(fs_visitor *v, const cfg_t *cfg);
> +   fs_live_variables(fs_god *v, const cfg_t *cfg);
>     ~fs_live_variables();
>
>     bool vars_interfere(int a, int b);
> @@ -106,7 +106,7 @@ protected:
>     void compute_live_variables();
>     void compute_start_end();
>
> -   fs_visitor *v;
> +   fs_god *v;
>     const cfg_t *cfg;
>     void *mem_ctx;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index 21e52fe..a720f55 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -82,7 +82,7 @@ count_nir_instrs(nir_shader *nir)
>  }
>
>  void
> -fs_visitor::emit_nir_code()
> +fs_god::emit_nir_code()
>  {
>     const nir_shader_compiler_options *options =
>        ctx->Const.ShaderCompilerOptions[stage].NirOptions;
> @@ -226,7 +226,7 @@ fs_visitor::emit_nir_code()
>  }
>
>  void
> -fs_visitor::nir_setup_inputs(nir_shader *shader)
> +fs_god::nir_setup_inputs(nir_shader *shader)
>  {
>     foreach_list_typed(nir_variable, var, node, &shader->inputs) {
>        enum brw_reg_type type = brw_type_for_base_type(var->type);
> @@ -257,7 +257,7 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
>        }
>        case MESA_SHADER_GEOMETRY:
>        case MESA_SHADER_COMPUTE:
> -         unreachable("fs_visitor not used for these stages yet.");
> +         unreachable("fs_god not used for these stages yet.");
>           break;
>        case MESA_SHADER_FRAGMENT:
>           if (var->data.location == VARYING_SLOT_POS) {
> @@ -276,7 +276,7 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
>  }
>
>  void
> -fs_visitor::nir_setup_outputs(nir_shader *shader)
> +fs_god::nir_setup_outputs(nir_shader *shader)
>  {
>     brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
>
> @@ -324,7 +324,7 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
>  }
>
>  void
> -fs_visitor::nir_setup_uniforms(nir_shader *shader)
> +fs_god::nir_setup_uniforms(nir_shader *shader)
>  {
>     uniforms = shader->num_uniforms;
>
> @@ -361,7 +361,7 @@ fs_visitor::nir_setup_uniforms(nir_shader *shader)
>  }
>
>  void
> -fs_visitor::nir_setup_uniform(nir_variable *var)
> +fs_god::nir_setup_uniform(nir_variable *var)
>  {
>     int namelen = strlen(var->name);
>
> @@ -397,7 +397,7 @@ fs_visitor::nir_setup_uniform(nir_variable *var)
>  }
>
>  void
> -fs_visitor::nir_setup_builtin_uniform(nir_variable *var)
> +fs_god::nir_setup_builtin_uniform(nir_variable *var)
>  {
>     const nir_state_slot *const slots = var->state_slots;
>     assert(var->state_slots != NULL);
> @@ -430,7 +430,7 @@ fs_visitor::nir_setup_builtin_uniform(nir_variable *var)
>  static bool
>  emit_system_values_block(nir_block *block, void *void_visitor)
>  {
> -   fs_visitor *v = (fs_visitor *)void_visitor;
> +   fs_god *v = (fs_god *)void_visitor;
>     fs_reg *reg;
>
>     nir_foreach_instr(block, instr) {
> @@ -495,7 +495,7 @@ emit_system_values_block(nir_block *block, void *void_visitor)
>  }
>
>  void
> -fs_visitor::nir_emit_system_values(nir_shader *shader)
> +fs_god::nir_emit_system_values(nir_shader *shader)
>  {
>     nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
>     nir_foreach_overload(shader, overload) {
> @@ -506,7 +506,7 @@ fs_visitor::nir_emit_system_values(nir_shader *shader)
>  }
>
>  void
> -fs_visitor::nir_emit_impl(nir_function_impl *impl)
> +fs_god::nir_emit_impl(nir_function_impl *impl)
>  {
>     nir_locals = reralloc(mem_ctx, nir_locals, fs_reg, impl->reg_alloc);
>     foreach_list_typed(nir_register, reg, node, &impl->registers) {
> @@ -520,7 +520,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
>  }
>
>  void
> -fs_visitor::nir_emit_cf_list(exec_list *list)
> +fs_god::nir_emit_cf_list(exec_list *list)
>  {
>     exec_list_validate(list);
>     foreach_list_typed(nir_cf_node, node, node, list) {
> @@ -544,7 +544,7 @@ fs_visitor::nir_emit_cf_list(exec_list *list)
>  }
>
>  void
> -fs_visitor::nir_emit_if(nir_if *if_stmt)
> +fs_god::nir_emit_if(nir_if *if_stmt)
>  {
>     /* first, put the condition into f0 */
>     fs_inst *inst = emit(MOV(reg_null_d,
> @@ -569,7 +569,7 @@ fs_visitor::nir_emit_if(nir_if *if_stmt)
>  }
>
>  void
> -fs_visitor::nir_emit_loop(nir_loop *loop)
> +fs_god::nir_emit_loop(nir_loop *loop)
>  {
>     if (brw->gen < 6) {
>        no16("Can't support (non-uniform) control flow on SIMD16\n");
> @@ -583,7 +583,7 @@ fs_visitor::nir_emit_loop(nir_loop *loop)
>  }
>
>  void
> -fs_visitor::nir_emit_block(nir_block *block)
> +fs_god::nir_emit_block(nir_block *block)
>  {
>     nir_foreach_instr(block, instr) {
>        nir_emit_instr(instr);
> @@ -591,7 +591,7 @@ fs_visitor::nir_emit_block(nir_block *block)
>  }
>
>  void
> -fs_visitor::nir_emit_instr(nir_instr *instr)
> +fs_god::nir_emit_instr(nir_instr *instr)
>  {
>     switch (instr->type) {
>     case nir_instr_type_alu:
> @@ -640,7 +640,7 @@ brw_type_for_nir_type(nir_alu_type type)
>  }
>
>  bool
> -fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
> -                                         const fs_reg &result)
> +fs_god::optimize_frontfacing_ternary(nir_alu_instr *instr,
> +                                     const fs_reg &result)
>  {
>     if (instr->src[0].src.is_ssa ||
> @@ -724,7 +724,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
>  }
>
>  void
> -fs_visitor::nir_emit_alu(nir_alu_instr *instr)
> +fs_god::nir_emit_alu(nir_alu_instr *instr)
>  {
>     struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
>     fs_inst *inst;
> @@ -1311,7 +1311,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
>  }
>
>  fs_reg
> -fs_visitor::get_nir_src(nir_src src)
> +fs_god::get_nir_src(nir_src src)
>  {
>     if (src.is_ssa) {
>        assert(src.ssa->parent_instr->type == nir_instr_type_load_const);
> @@ -1346,7 +1346,7 @@ fs_visitor::get_nir_src(nir_src src)
>  }
>
>  fs_reg
> -fs_visitor::get_nir_dest(nir_dest dest)
> +fs_god::get_nir_dest(nir_dest dest)
>  {
>     fs_reg reg;
>     if (dest.reg.reg->is_global)
> @@ -1365,7 +1365,7 @@ fs_visitor::get_nir_dest(nir_dest dest)
>  }
>
>  void
> -fs_visitor::emit_percomp(fs_inst *inst, unsigned wr_mask)
> +fs_god::emit_percomp(fs_inst *inst, unsigned wr_mask)
>  {
>     for (unsigned i = 0; i < 4; i++) {
>        if (!((wr_mask >> i) & 1))
> @@ -1382,7 +1382,7 @@ fs_visitor::emit_percomp(fs_inst *inst, unsigned wr_mask)
>  }
>
>  void
> -fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
> +fs_god::nir_emit_intrinsic(nir_intrinsic_instr *instr)
>  {
>     fs_reg dest;
>     if (nir_intrinsic_infos[instr->intrinsic].has_dest)
> @@ -1750,7 +1750,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
>  }
>
>  void
> -fs_visitor::nir_emit_texture(nir_tex_instr *instr)
> +fs_god::nir_emit_texture(nir_tex_instr *instr)
>  {
>     unsigned sampler = instr->sampler_index;
>     fs_reg sampler_reg(sampler);
> @@ -1920,7 +1920,7 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
>  }
>
>  void
> -fs_visitor::nir_emit_jump(nir_jump_instr *instr)
> +fs_god::nir_emit_jump(nir_jump_instr *instr)
>  {
>     switch (instr->type) {
>     case nir_jump_break:
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
> index 047c2c0..adadb51 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
> @@ -53,7 +53,7 @@
>   */
>
>  bool
> -fs_visitor::opt_peephole_predicated_break()
> +fs_god::opt_peephole_predicated_break()
>  {
>     bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> index 72c490b..578951c 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> @@ -41,7 +41,7 @@ assign_reg(unsigned *reg_hw_locations, fs_reg *reg)
>  }
>
>  void
> -fs_visitor::assign_regs_trivial()
> +fs_god::assign_regs_trivial()
>  {
>     unsigned hw_reg_mapping[this->alloc.count + 1];
>     unsigned i;
> @@ -332,7 +332,7 @@ count_to_loop_end(const bblock_t *block)
>   * (note that in SIMD16, a node is two registers).
>   */
>  void
> -fs_visitor::setup_payload_interference(struct ra_graph *g,
> -                                       int payload_node_count,
> -                                       int first_payload_node)
> +fs_god::setup_payload_interference(struct ra_graph *g,
> +                                   int payload_node_count,
> +                                   int first_payload_node)
>  {
> @@ -466,7 +466,7 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
>   * contents.
>   */
>  void
> -fs_visitor::get_used_mrfs(bool *mrf_used)
> +fs_god::get_used_mrfs(bool *mrf_used)
>  {
>     int reg_width = dispatch_width / 8;
>
> @@ -498,7 +498,7 @@ fs_visitor::get_used_mrfs(bool *mrf_used)
>   * messages (treated as MRFs in code generation).
>   */
>  void
> -fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
> +fs_god::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
>  {
>     bool mrf_used[BRW_MAX_MRF];
>     get_used_mrfs(mrf_used);
> @@ -523,7 +523,7 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
>  }
>
>  bool
> -fs_visitor::assign_regs(bool allow_spilling)
> +fs_god::assign_regs(bool allow_spilling)
>  {
>     struct intel_screen *screen = brw->intelScreen;
>     /* Most of this allocation was written for a reg_width of 1
> @@ -684,7 +684,7 @@ fs_visitor::assign_regs(bool allow_spilling)
>  }
>
>  void
> -fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
> -                         uint32_t spill_offset, int count)
> +fs_god::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
> +                     uint32_t spill_offset, int count)
>  {
>     int reg_size = 1;
> @@ -719,7 +719,7 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
>  }
>
>  void
> -fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
> -                       uint32_t spill_offset, int count)
> +fs_god::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
> +                   uint32_t spill_offset, int count)
>  {
>     int reg_size = 1;
> @@ -744,7 +744,7 @@ fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
>  }
>
>  int
> -fs_visitor::choose_spill_reg(struct ra_graph *g)
> +fs_god::choose_spill_reg(struct ra_graph *g)
>  {
>     float loop_scale = 1.0;
>     float spill_costs[this->alloc.count];
> @@ -820,7 +820,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
>  }
>
>  void
> -fs_visitor::spill_reg(int spill_reg)
> +fs_god::spill_reg(int spill_reg)
>  {
>     int size = alloc.sizes[spill_reg];
>     unsigned int spill_offset = last_scratch;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
> index 09f0fad..44cbf76 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
> @@ -64,7 +64,7 @@ is_nop_mov(const fs_inst *inst)
>  }
>
>  static bool
> -is_copy_payload(const fs_visitor *v, const fs_inst *inst)
> +is_copy_payload(const fs_god *v, const fs_inst *inst)
>  {
>     if (v->alloc.sizes[inst->src[0].reg] != inst->regs_written)
>        return false;
> @@ -79,7 +79,7 @@ is_copy_payload(const fs_visitor *v, const fs_inst *inst)
>  }
>
>  static bool
> -is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
> +is_coalesce_candidate(const fs_god *v, const fs_inst *inst)
>  {
>     if ((inst->opcode != BRW_OPCODE_MOV &&
>          inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) ||
> @@ -152,7 +152,7 @@ can_coalesce_vars(brw::fs_live_variables *live_intervals,
>  }
>
>  bool
> -fs_visitor::register_coalesce()
> +fs_god::register_coalesce()
>  {
>     bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
> index e406c28..0bda2d3 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
> @@ -43,7 +43,7 @@
>   */
>
>  static bool
> -opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
> +opt_saturate_propagation_local(fs_god *v, bblock_t *block)
>  {
>     bool progress = false;
>     int ip = block->end_ip + 1;
> @@ -103,7 +103,7 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
>  }
>
>  bool
> -fs_visitor::opt_saturate_propagation()
> +fs_god::opt_saturate_propagation()
>  {
>     bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
> index 740ba67..0a3e32d 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
> @@ -120,7 +120,7 @@ count_movs_from_if(fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS],
>   * If src0 is an immediate value, we promote it to a temporary GRF.
>   */
>  bool
> -fs_visitor::opt_peephole_sel()
> +fs_god::opt_peephole_sel()
>  {
>     bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> deleted file mode 100644
> index e6fb0cb..0000000
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ /dev/null
> @@ -1,4157 +0,0 @@
> -/*
> - * Copyright © 2010 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> - * IN THE SOFTWARE.
> - */
> -
> -/** @file brw_fs_visitor.cpp
> - *
> - * This file supports generating the FS LIR from the GLSL IR.  The LIR
> - * makes it easier to do backend-specific optimizations than doing so
> - * in the GLSL IR or in the native code.
> - */
> -#include <sys/types.h>
> -
> -#include "main/macros.h"
> -#include "main/shaderobj.h"
> -#include "program/prog_parameter.h"
> -#include "program/prog_print.h"
> -#include "program/prog_optimize.h"
> -#include "util/register_allocate.h"
> -#include "program/hash_table.h"
> -#include "brw_context.h"
> -#include "brw_eu.h"
> -#include "brw_wm.h"
> -#include "brw_vec4.h"
> -#include "brw_fs.h"
> -#include "main/uniforms.h"
> -#include "glsl/glsl_types.h"
> -#include "glsl/ir_optimization.h"
> -#include "program/sampler.h"
> -
> -
> -fs_reg *
> -fs_visitor::emit_vs_system_value(int location)
> -{
> -   fs_reg *reg = new(this->mem_ctx)
> -      fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
> -   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
> -
> -   switch (location) {
> -   case SYSTEM_VALUE_BASE_VERTEX:
> -      reg->reg_offset = 0;
> -      vs_prog_data->uses_vertexid = true;
> -      break;
> -   case SYSTEM_VALUE_VERTEX_ID:
> -   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> -      reg->reg_offset = 2;
> -      vs_prog_data->uses_vertexid = true;
> -      break;
> -   case SYSTEM_VALUE_INSTANCE_ID:
> -      reg->reg_offset = 3;
> -      vs_prog_data->uses_instanceid = true;
> -      break;
> -   default:
> -      unreachable("not reached");
> -   }
> -
> -   return reg;
> -}
> -
> -void
> -fs_visitor::visit(ir_variable *ir)
> -{
> -   fs_reg *reg = NULL;
> -
> -   if (variable_storage(ir))
> -      return;
> -
> -   if (ir->data.mode == ir_var_shader_in) {
> -      assert(ir->data.location != -1);
> -      if (stage == MESA_SHADER_VERTEX) {
> -         reg = new(this->mem_ctx)
> -            fs_reg(ATTR, ir->data.location,
> -                   brw_type_for_base_type(ir->type->get_scalar_type()));
> -      } else if (ir->data.location == VARYING_SLOT_POS) {
> -         reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
> -                                            ir->data.origin_upper_left);
> -      } else if (ir->data.location == VARYING_SLOT_FACE) {
> -        reg = emit_frontfacing_interpolation();
> -      } else {
> -         reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> -         emit_general_interpolation(*reg, ir->name, ir->type,
> -                                    (glsl_interp_qualifier) ir->data.interpolation,
> -                                    ir->data.location, ir->data.centroid,
> -                                    ir->data.sample);
> -      }
> -      assert(reg);
> -      hash_table_insert(this->variable_ht, reg, ir);
> -      return;
> -   } else if (ir->data.mode == ir_var_shader_out) {
> -      reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> -
> -      if (stage == MESA_SHADER_VERTEX) {
> -        int vector_elements =
> -           ir->type->is_array() ? ir->type->fields.array->vector_elements
> -                                : ir->type->vector_elements;
> -
> -        for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
> -           int output = ir->data.location + i;
> -           this->outputs[output] = *reg;
> -           this->outputs[output].reg_offset = i * 4;
> -           this->output_components[output] = vector_elements;
> -        }
> -
> -      } else if (ir->data.index > 0) {
> -        assert(ir->data.location == FRAG_RESULT_DATA0);
> -        assert(ir->data.index == 1);
> -        this->dual_src_output = *reg;
> -         this->do_dual_src = true;
> -      } else if (ir->data.location == FRAG_RESULT_COLOR) {
> -        /* Writing gl_FragColor outputs to all color regions. */
> -         assert(stage == MESA_SHADER_FRAGMENT);
> -         brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> -        for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
> -           this->outputs[i] = *reg;
> -           this->output_components[i] = 4;
> -        }
> -      } else if (ir->data.location == FRAG_RESULT_DEPTH) {
> -        this->frag_depth = *reg;
> -      } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
> -         this->sample_mask = *reg;
> -      } else {
> -        /* gl_FragData or a user-defined FS output */
> -        assert(ir->data.location >= FRAG_RESULT_DATA0 &&
> -               ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
> -
> -        int vector_elements =
> -           ir->type->is_array() ? ir->type->fields.array->vector_elements
> -                                : ir->type->vector_elements;
> -
> -        /* General color output. */
> -        for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
> -           int output = ir->data.location - FRAG_RESULT_DATA0 + i;
> -           this->outputs[output] = offset(*reg, vector_elements * i);
> -           this->output_components[output] = vector_elements;
> -        }
> -      }
> -   } else if (ir->data.mode == ir_var_uniform) {
> -      int param_index = uniforms;
> -
> -      /* Thanks to the lower_ubo_reference pass, we will see only
> -       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
> -       * variables, so no need for them to be in variable_ht.
> -       *
> -       * Some uniforms, such as samplers and atomic counters, have no actual
> -       * storage, so we should ignore them.
> -       */
> -      if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
> -         return;
> -
> -      if (dispatch_width == 16) {
> -        if (!variable_storage(ir)) {
> -           fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
> -        }
> -        return;
> -      }
> -
> -      param_size[param_index] = type_size(ir->type);
> -      if (!strncmp(ir->name, "gl_", 3)) {
> -        setup_builtin_uniform_values(ir);
> -      } else {
> -        setup_uniform_values(ir);
> -      }
> -
> -      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
> -      reg->type = brw_type_for_base_type(ir->type);
> -
> -   } else if (ir->data.mode == ir_var_system_value) {
> -      switch (ir->data.location) {
> -      case SYSTEM_VALUE_BASE_VERTEX:
> -      case SYSTEM_VALUE_VERTEX_ID:
> -      case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> -      case SYSTEM_VALUE_INSTANCE_ID:
> -         reg = emit_vs_system_value(ir->data.location);
> -         break;
> -      case SYSTEM_VALUE_SAMPLE_POS:
> -        reg = emit_samplepos_setup();
> -         break;
> -      case SYSTEM_VALUE_SAMPLE_ID:
> -        reg = emit_sampleid_setup();
> -         break;
> -      case SYSTEM_VALUE_SAMPLE_MASK_IN:
> -         assert(brw->gen >= 7);
> -         reg = new(mem_ctx)
> -            fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
> -                          BRW_REGISTER_TYPE_D));
> -         break;
> -      }
> -   }
> -
> -   if (!reg)
> -      reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
> -
> -   hash_table_insert(this->variable_ht, reg, ir);
> -}
> -
> -void
> -fs_visitor::visit(ir_dereference_variable *ir)
> -{
> -   fs_reg *reg = variable_storage(ir->var);
> -
> -   if (!reg) {
> -      fail("Failed to find variable storage for %s\n", ir->var->name);
> -      this->result = fs_reg(reg_null_d);
> -      return;
> -   }
> -   this->result = *reg;
> -}
> -
> -void
> -fs_visitor::visit(ir_dereference_record *ir)
> -{
> -   const glsl_type *struct_type = ir->record->type;
> -
> -   ir->record->accept(this);
> -
> -   unsigned int off = 0;
> -   for (unsigned int i = 0; i < struct_type->length; i++) {
> -      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
> -        break;
> -      off += type_size(struct_type->fields.structure[i].type);
> -   }
> -   this->result = offset(this->result, off);
> -   this->result.type = brw_type_for_base_type(ir->type);
> -}
> -
> -void
> -fs_visitor::visit(ir_dereference_array *ir)
> -{
> -   ir_constant *constant_index;
> -   fs_reg src;
> -   int element_size = type_size(ir->type);
> -
> -   constant_index = ir->array_index->as_constant();
> -
> -   ir->array->accept(this);
> -   src = this->result;
> -   src.type = brw_type_for_base_type(ir->type);
> -
> -   if (constant_index) {
> -      if (src.file == ATTR) {
> -         /* Attribute arrays get loaded as one vec4 per element.  In that case
> -          * offset the source register.
> -          */
> -         src.reg += constant_index->value.i[0];
> -      } else {
> -         assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
> -         src = offset(src, constant_index->value.i[0] * element_size);
> -      }
> -   } else {
> -      /* Variable index array dereference.  We attach the variable index
> -       * component to the reg as a pointer to a register containing the
> -       * offset.  Currently only uniform arrays are supported in this patch,
> -       * and that reladdr pointer is resolved by
> -       * move_uniform_array_access_to_pull_constants().  All other array types
> -       * are lowered by lower_variable_index_to_cond_assign().
> -       */
> -      ir->array_index->accept(this);
> -
> -      fs_reg index_reg;
> -      index_reg = vgrf(glsl_type::int_type);
> -      emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
> -
> -      if (src.reladdr) {
> -         emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
> -      }
> -
> -      src.reladdr = ralloc(mem_ctx, fs_reg);
> -      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
> -   }
> -   this->result = src;
> -}
> -
> -fs_inst *
> -fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
> -                     const fs_reg &a)
> -{
> -   if (brw->gen < 6) {
> -      /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
> -      fs_reg y_times_a           = vgrf(glsl_type::float_type);
> -      fs_reg one_minus_a         = vgrf(glsl_type::float_type);
> -      fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
> -
> -      emit(MUL(y_times_a, y, a));
> -
> -      fs_reg negative_a = a;
> -      negative_a.negate = !a.negate;
> -      emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
> -      emit(MUL(x_times_one_minus_a, x, one_minus_a));
> -
> -      return emit(ADD(dst, x_times_one_minus_a, y_times_a));
> -   } else {
> -      /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
> -       * we need to reorder the operands.
> -       */
> -      return emit(LRP(dst, a, y, x));
> -   }
> -}
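
Both paths compute the same interpolation; as a scalar reference, with
the gen6+ operand order spelled out (LRP(dst, a, y, x) = y*a + x*(1-a)):

    static float
    lerp_ref(float x, float y, float a)
    {
       return x * (1.0f - a) + y * a;
    }
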
> -
> -void
> -fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
> -                        const fs_reg &src0, const fs_reg &src1)
> -{
> -   assert(conditionalmod == BRW_CONDITIONAL_GE ||
> -          conditionalmod == BRW_CONDITIONAL_L);
> -
> -   fs_inst *inst;
> -
> -   if (brw->gen >= 6) {
> -      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> -      inst->conditional_mod = conditionalmod;
> -   } else {
> -      emit(CMP(reg_null_d, src0, src1, conditionalmod));
> -
> -      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> -      inst->predicate = BRW_PREDICATE_NORMAL;
> -   }
> -}
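
Both branches reduce to a compare-and-select; gen6+ folds the compare
into SEL's conditional modifier, while gen4-5 needs the explicit CMP
into the flag register first.  As a scalar reference:

    static float
    minmax_ref(float src0, float src1, bool use_ge)
    {
       const bool flag = use_ge ? src0 >= src1 : src0 < src1;  /* CMP / cmod */
       return flag ? src0 : src1;                              /* (+f0) SEL  */
    }
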
> -
> -bool
> -fs_visitor::try_emit_saturate(ir_expression *ir)
> -{
> -   if (ir->operation != ir_unop_saturate)
> -      return false;
> -
> -   ir_rvalue *sat_val = ir->operands[0];
> -
> -   fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
> -
> -   sat_val->accept(this);
> -   fs_reg src = this->result;
> -
> -   fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
> -
> -   /* If the last instruction from our accept() generated our
> -    * src, just set the saturate flag instead of emitting a separate mov.
> -    */
> -   fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
> -   if (modify && modify->regs_written == modify->dst.width / 8 &&
> -       modify->can_do_saturate()) {
> -      modify->saturate = true;
> -      this->result = src;
> -      return true;
> -   }
> -
> -   return false;
> -}
> -
> -bool
> -fs_visitor::try_emit_line(ir_expression *ir)
> -{
> -   /* LINE's src0 must be of type float. */
> -   if (ir->type != glsl_type::float_type)
> -      return false;
> -
> -   ir_rvalue *nonmul = ir->operands[1];
> -   ir_expression *mul = ir->operands[0]->as_expression();
> -
> -   if (!mul || mul->operation != ir_binop_mul) {
> -      nonmul = ir->operands[0];
> -      mul = ir->operands[1]->as_expression();
> -
> -      if (!mul || mul->operation != ir_binop_mul)
> -         return false;
> -   }
> -
> -   ir_constant *const_add = nonmul->as_constant();
> -   if (!const_add)
> -      return false;
> -
> -   int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
> -   if (add_operand_vf == -1)
> -      return false;
> -
> -   ir_rvalue *non_const_mul = mul->operands[1];
> -   ir_constant *const_mul = mul->operands[0]->as_constant();
> -   if (!const_mul) {
> -      const_mul = mul->operands[1]->as_constant();
> -
> -      if (!const_mul)
> -         return false;
> -
> -      non_const_mul = mul->operands[0];
> -   }
> -
> -   int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
> -   if (mul_operand_vf == -1)
> -      return false;
> -
> -   non_const_mul->accept(this);
> -   fs_reg src1 = this->result;
> -
> -   fs_reg src0 = vgrf(ir->type);
> -   emit(BRW_OPCODE_MOV, src0,
> -        fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));
> -
> -   this->result = vgrf(ir->type);
> -   emit(BRW_OPCODE_LINE, this->result, src0, src1);
> -   return true;
> -}
> -
> -bool
> -fs_visitor::try_emit_mad(ir_expression *ir)
> -{
> -   /* 3-src instructions were introduced in gen6. */
> -   if (brw->gen < 6)
> -      return false;
> -
> -   /* MAD can only handle floating-point data. */
> -   if (ir->type != glsl_type::float_type)
> -      return false;
> -
> -   ir_rvalue *nonmul;
> -   ir_expression *mul;
> -   bool mul_negate, mul_abs;
> -
> -   for (int i = 0; i < 2; i++) {
> -      mul_negate = false;
> -      mul_abs = false;
> -
> -      mul = ir->operands[i]->as_expression();
> -      nonmul = ir->operands[1 - i];
> -
> -      if (mul && mul->operation == ir_unop_abs) {
> -         mul = mul->operands[0]->as_expression();
> -         mul_abs = true;
> -      } else if (mul && mul->operation == ir_unop_neg) {
> -         mul = mul->operands[0]->as_expression();
> -         mul_negate = true;
> -      }
> -
> -      if (mul && mul->operation == ir_binop_mul)
> -         break;
> -   }
> -
> -   if (!mul || mul->operation != ir_binop_mul)
> -      return false;
> -
> -   nonmul->accept(this);
> -   fs_reg src0 = this->result;
> -
> -   mul->operands[0]->accept(this);
> -   fs_reg src1 = this->result;
> -   src1.negate ^= mul_negate;
> -   src1.abs = mul_abs;
> -   if (mul_abs)
> -      src1.negate = false;
> -
> -   mul->operands[1]->accept(this);
> -   fs_reg src2 = this->result;
> -   src2.abs = mul_abs;
> -   if (mul_abs)
> -      src2.negate = false;
> -
> -   this->result = vgrf(ir->type);
> -   emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
> -
> -   return true;
> -}
> -
> -bool
> -fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
> -{
> -   /* On platforms that do not natively generate 0u and ~0u for Boolean
> -    * results, b2f expressions that look like
> -    *
> -    *     f = b2f(expr cmp 0)
> -    *
> -    * will generate better code by pretending the expression is
> -    *
> -    *     f = ir_triop_csel(0.0, 1.0, expr cmp 0)
> -    *
> -    * This is because the last instruction of "expr" can generate the
> -    * condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
> -    * trick to generate 0u or ~0u for the Boolean result.  This means code like
> -    *
> -    *     mov(16)         g16<1>F         1F
> -    *     mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
> -    *     (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F
> -    *
> -    * will be generated instead of
> -    *
> -    *     mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
> -    *     cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
> -    *     and(16)         g4<1>D          g2<8,8,1>D      1D
> -    *     and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD
> -    *
> -    * When the comparison is either == 0.0 or != 0.0, using the knowledge that
> -    * the true (or false) case already results in zero would allow better code
> -    * generation by possibly avoiding a load-immediate instruction.
> -    */
> -   ir_expression *cmp = ir->operands[0]->as_expression();
> -   if (cmp == NULL)
> -      return false;
> -
> -   if (cmp->operation == ir_binop_equal || cmp->operation == ir_binop_nequal) {
> -      for (unsigned i = 0; i < 2; i++) {
> -         ir_constant *c = cmp->operands[i]->as_constant();
> -         if (c == NULL || !c->is_zero())
> -            continue;
> -
> -         ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
> -         if (expr != NULL) {
> -            fs_reg op[2];
> -
> -            for (unsigned j = 0; j < 2; j++) {
> -               cmp->operands[j]->accept(this);
> -               op[j] = this->result;
> -
> -               resolve_ud_negate(&op[j]);
> -            }
> -
> -            emit_bool_to_cond_code_of_reg(cmp, op);
> -
> -            /* In this case we know when the condition is true, op[i ^ 1]
> -             * contains zero.  Invert the predicate, use op[i ^ 1] as src0,
> -             * and immediate 1.0f as src1.
> -             */
> -            this->result = vgrf(ir->type);
> -            op[i ^ 1].type = BRW_REGISTER_TYPE_F;
> -
> -            fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
> -            inst->predicate = BRW_PREDICATE_NORMAL;
> -            inst->predicate_inverse = cmp->operation == ir_binop_equal;
> -            return true;
> -         }
> -      }
> -   }
> -
> -   emit_bool_to_cond_code(cmp);
> -
> -   fs_reg temp = vgrf(ir->type);
> -   emit(MOV(temp, fs_reg(1.0f)));
> -
> -   this->result = vgrf(ir->type);
> -   fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
> -   inst->predicate = BRW_PREDICATE_NORMAL;
> -
> -   return true;
> -}
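
Reduced to scalars, the rewrite the comment describes is just selecting
between the two float constants on the comparison itself, instead of
materializing a 0/~0 boolean and converting it (the >= comparison below
is an arbitrary stand-in for "expr cmp 0"):

    static float
    b2f_of_cmp(float expr)
    {
       return expr >= 0.0f ? 1.0f : 0.0f;   /* predicated SEL of 1.0f / 0.0f */
    }
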
> -
> -static int
> -pack_pixel_offset(float x)
> -{
> -   /* Clamp upper end of the range to +7/16. See explanation in non-constant
> -    * offset case below. */
> -   int n = MIN2((int)(x * 16), 7);
> -   return n & 0xf;
> -}
> -
> -void
> -fs_visitor::emit_interpolate_expression(ir_expression *ir)
> -{
> -   /* in SIMD16 mode, the pixel interpolator returns coords interleaved
> -    * 8 channels at a time, same as the barycentric coords presented in
> -    * the FS payload. this requires a bit of extra work to support.
> -    */
> -   no16("interpolate_at_* not yet supported in SIMD16 mode.");
> -
> -   assert(stage == MESA_SHADER_FRAGMENT);
> -   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> -
> -   ir_dereference * deref = ir->operands[0]->as_dereference();
> -   ir_swizzle * swiz = NULL;
> -   if (!deref) {
> -      /* the api does not allow a swizzle here, but the varying packing code
> -       * may have pushed one into here.
> -       */
> -      swiz = ir->operands[0]->as_swizzle();
> -      assert(swiz);
> -      deref = swiz->val->as_dereference();
> -   }
> -   assert(deref);
> -   ir_variable * var = deref->variable_referenced();
> -   assert(var);
> -
> -   /* 1. collect interpolation factors */
> -
> -   fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
> -   fs_reg dst_y = offset(dst_x, 1);
> -
> -   /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
> -    * even when there is no payload. in the per-slot offset case, we'll replace this with
> -    * the proper source data. */
> -   fs_reg src = vgrf(glsl_type::float_type);
> -   int mlen = 1;     /* one reg unless overridden */
> -   int reg_width = dispatch_width / 8;
> -   fs_inst *inst;
> -
> -   switch (ir->operation) {
> -   case ir_unop_interpolate_at_centroid:
> -      inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
> -      break;
> -
> -   case ir_binop_interpolate_at_sample: {
> -      ir_constant *sample_num = ir->operands[1]->as_constant();
> -      assert(sample_num || !"nonconstant sample number should have been lowered.");
> -
> -      unsigned msg_data = sample_num->value.i[0] << 4;
> -      inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
> -      break;
> -   }
> -
> -   case ir_binop_interpolate_at_offset: {
> -      ir_constant *const_offset = ir->operands[1]->as_constant();
> -      if (const_offset) {
> -         unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
> -                            (pack_pixel_offset(const_offset->value.f[1]) << 4);
> -         inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
> -                     fs_reg(msg_data));
> -      } else {
> -         /* pack the operands: hw wants offsets as 4 bit signed ints */
> -         ir->operands[1]->accept(this);
> -         src = vgrf(glsl_type::ivec2_type);
> -         fs_reg src2 = src;
> -         for (int i = 0; i < 2; i++) {
> -            fs_reg temp = vgrf(glsl_type::float_type);
> -            emit(MUL(temp, this->result, fs_reg(16.0f)));
> -            emit(MOV(src2, temp));  /* float to int */
> -
> -            /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
> -             * that we support a maximum offset of +0.5, which isn't representable
> -             * in an S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
> -             * which is the opposite of what the shader author wanted.
> -             *
> -             * This is legal due to ARB_gpu_shader5's quantization rules:
> -             *
> -             * "Not all values of <offset> may be supported; x and y offsets may
> -             * be rounded to fixed-point values with the number of fraction bits
> -             * given by the implementation-dependent constant
> -             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
> -             */
> -
> -            fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
> -            inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
> -
> -            src2 = offset(src2, 1);
> -            this->result = offset(this->result, 1);
> -         }
> -
> -         mlen = 2 * reg_width;
> -         inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
> -                     fs_reg(0u));
> -      }
> -      break;
> -   }
> -
> -   default:
> -      unreachable("not reached");
> -   }
> -
> -   inst->mlen = mlen;
> -   inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
> -   inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
> -         INTERP_QUALIFIER_NOPERSPECTIVE;
> -
> -   /* 2. emit linterp */
> -
> -   fs_reg res = vgrf(ir->type);
> -   this->result = res;
> -
> -   for (int i = 0; i < ir->type->vector_elements; i++) {
> -      int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
> -      emit(FS_OPCODE_LINTERP, res,
> -           dst_x, dst_y,
> -           fs_reg(interp_reg(var->data.location, ch)));
> -      res = offset(res, 1);
> -   }
> -}
> -
> -void
> -fs_visitor::visit(ir_expression *ir)
> -{
> -   unsigned int operand;
> -   fs_reg op[3], temp;
> -   fs_inst *inst;
> -   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
> -
> -   assert(ir->get_num_operands() <= 3);
> -
> -   if (try_emit_saturate(ir))
> -      return;
> -
> -   /* Deal with the real oddball stuff first */
> -   switch (ir->operation) {
> -   case ir_binop_add:
> -      if (brw->gen <= 5 && try_emit_line(ir))
> -         return;
> -      if (try_emit_mad(ir))
> -         return;
> -      break;
> -
> -   case ir_triop_csel:
> -      ir->operands[1]->accept(this);
> -      op[1] = this->result;
> -      ir->operands[2]->accept(this);
> -      op[2] = this->result;
> -
> -      emit_bool_to_cond_code(ir->operands[0]);
> -
> -      this->result = vgrf(ir->type);
> -      inst = emit(SEL(this->result, op[1], op[2]));
> -      inst->predicate = BRW_PREDICATE_NORMAL;
> -      return;
> -
> -   case ir_unop_b2f:
> -      if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir))
> -         return;
> -      break;
> -
> -   case ir_unop_interpolate_at_centroid:
> -   case ir_binop_interpolate_at_offset:
> -   case ir_binop_interpolate_at_sample:
> -      emit_interpolate_expression(ir);
> -      return;
> -
> -   default:
> -      break;
> -   }
> -
> -   for (operand = 0; operand < ir->get_num_operands(); operand++) {
> -      ir->operands[operand]->accept(this);
> -      if (this->result.file == BAD_FILE) {
> -        fail("Failed to get tree for expression operand:\n");
> -        ir->operands[operand]->fprint(stderr);
> -         fprintf(stderr, "\n");
> -      }
> -      assert(this->result.file == GRF ||
> -             this->result.file == UNIFORM || this->result.file == ATTR);
> -      op[operand] = this->result;
> -
> -      /* Matrix expression operands should have been broken down to vector
> -       * operations already.
> -       */
> -      assert(!ir->operands[operand]->type->is_matrix());
> -      /* And then those vector operands should have been broken down to scalar.
> -       */
> -      assert(!ir->operands[operand]->type->is_vector());
> -   }
> -
> -   /* Storage for our result.  If our result goes into an assignment, it will
> -    * just get copy-propagated out, so no worries.
> -    */
> -   this->result = vgrf(ir->type);
> -
> -   switch (ir->operation) {
> -   case ir_unop_logic_not:
> -      emit(NOT(this->result, op[0]));
> -      break;
> -   case ir_unop_neg:
> -      op[0].negate = !op[0].negate;
> -      emit(MOV(this->result, op[0]));
> -      break;
> -   case ir_unop_abs:
> -      op[0].abs = true;
> -      op[0].negate = false;
> -      emit(MOV(this->result, op[0]));
> -      break;
> -   case ir_unop_sign:
> -      if (ir->type->is_float()) {
> -         /* AND(val, 0x80000000) gives the sign bit.
> -          *
> -          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
> -          * zero.
> -          */
> -         emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> -
> -         op[0].type = BRW_REGISTER_TYPE_UD;
> -         this->result.type = BRW_REGISTER_TYPE_UD;
> -         emit(AND(this->result, op[0], fs_reg(0x80000000u)));
> -
> -         inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
> -         inst->predicate = BRW_PREDICATE_NORMAL;
> -
> -         this->result.type = BRW_REGISTER_TYPE_F;
> -      } else {
> -         /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
> -          *               -> non-negative val generates 0x00000000.
> -          *  Predicated OR sets 1 if val is positive.
> -          */
> -         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
> -
> -         emit(ASR(this->result, op[0], fs_reg(31)));
> -
> -         inst = emit(OR(this->result, this->result, fs_reg(1)));
> -         inst->predicate = BRW_PREDICATE_NORMAL;
> -      }
> -      break;
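
The float path is the usual copy-the-sign-bit trick; a sketch of what the
CMP/AND/predicated-OR sequence computes, not the emitted code:

    #include <cstring>

    static float float_sign(float x)
    {
       unsigned bits;
       memcpy(&bits, &x, sizeof(bits));
       bits &= 0x80000000u;      /* the AND: isolate the sign bit */
       if (x != 0.0f)            /* the CMP sets the predicate    */
          bits |= 0x3f800000u;   /* the predicated OR: +/-1.0f    */
       memcpy(&x, &bits, sizeof(x));
       return x;
    }
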
> -   case ir_unop_rcp:
> -      emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
> -      break;
> -
> -   case ir_unop_exp2:
> -      emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
> -      break;
> -   case ir_unop_log2:
> -      emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
> -      break;
> -   case ir_unop_exp:
> -   case ir_unop_log:
> -      unreachable("not reached: should be handled by ir_explog_to_explog2");
> -   case ir_unop_sin:
> -   case ir_unop_sin_reduced:
> -      emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
> -      break;
> -   case ir_unop_cos:
> -   case ir_unop_cos_reduced:
> -      emit_math(SHADER_OPCODE_COS, this->result, op[0]);
> -      break;
> -
> -   case ir_unop_dFdx:
> -      /* Select one of the two opcodes based on the glHint value. */
> -      if (fs_key->high_quality_derivatives)
> -         emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
> -      else
> -         emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
> -      break;
> -
> -   case ir_unop_dFdx_coarse:
> -      emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
> -      break;
> -
> -   case ir_unop_dFdx_fine:
> -      emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
> -      break;
> -
> -   case ir_unop_dFdy:
> -      /* Select one of the two opcodes based on the glHint value. */
> -      if (fs_key->high_quality_derivatives)
> -         emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
> -      else
> -         emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
> -      break;
> -
> -   case ir_unop_dFdy_coarse:
> -      emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
> -      break;
> -
> -   case ir_unop_dFdy_fine:
> -      emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
> -      break;
> -
> -   case ir_binop_add:
> -      emit(ADD(this->result, op[0], op[1]));
> -      break;
> -   case ir_binop_sub:
> -      unreachable("not reached: should be handled by ir_sub_to_add_neg");
> -
> -   case ir_binop_mul:
> -      if (brw->gen < 8 && ir->type->is_integer()) {
> -        /* For integer multiplication, the MUL uses the low 16 bits
> -         * of one of the operands (src0 on gen6, src1 on gen7).  The
> -         * MACH accumulates the contribution of the upper 16 bits
> -         * of that operand.
> -         */
> -         if (ir->operands[0]->is_uint16_constant()) {
> -            if (brw->gen < 7)
> -               emit(MUL(this->result, op[0], op[1]));
> -            else
> -               emit(MUL(this->result, op[1], op[0]));
> -         } else if (ir->operands[1]->is_uint16_constant()) {
> -            if (brw->gen < 7)
> -               emit(MUL(this->result, op[1], op[0]));
> -            else
> -               emit(MUL(this->result, op[0], op[1]));
> -         } else {
> -            if (brw->gen >= 7)
> -               no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> -            struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> -                                        this->result.type);
> -
> -            emit(MUL(acc, op[0], op[1]));
> -            emit(MACH(reg_null_d, op[0], op[1]));
> -            emit(MOV(this->result, fs_reg(acc)));
> -         }
> -      } else {
> -        emit(MUL(this->result, op[0], op[1]));
> -      }
> -      break;
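
Why the uint16-constant special case can skip MACH entirely: the MACH
partial is src * high16(other) << 16, which vanishes when one operand is
known to fit in 16 bits. A quick sanity check with made-up values:

    #include <cstdint>
    #include <cassert>

    int main()
    {
       uint32_t a = 0xdeadbeefu, b = 1000u;  /* b fits in 16 bits */
       /* The low-precision MUL alone already gives the full result
        * mod 2^32, because the high-16 partial is zero. */
       uint32_t low_only = (uint32_t)((uint64_t)a * (uint16_t)b);
       assert(low_only == a * b);
       return 0;
    }
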
> -   case ir_binop_imul_high: {
> -      if (brw->gen == 7)
> -         no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> -      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> -                                  this->result.type);
> -
> -      fs_inst *mul = emit(MUL(acc, op[0], op[1]));
> -      emit(MACH(this->result, op[0], op[1]));
> -
> -      /* Until Gen8, integer multiplies read 32 bits from one source and
> -       * 16 bits from the other, relying on the MACH instruction to
> -       * generate the high bits of the result.
> -       *
> -       * On Gen8, the multiply instruction does a full 32x32-bit multiply,
> -       * but to get the high 32 bits of the 64-bit result we have to
> -       * simulate the previous behavior and then use a MACH instruction.
> -       *
> -       * FINISHME: Don't use source modifiers on src1.
> -       */
> -      if (brw->gen >= 8) {
> -         assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
> -                mul->src[1].type == BRW_REGISTER_TYPE_UD);
> -         if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
> -            mul->src[1].type = BRW_REGISTER_TYPE_W;
> -         } else {
> -            mul->src[1].type = BRW_REGISTER_TYPE_UW;
> -         }
> -      }
> -
> -      break;
> -   }
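
And the identity the W/UW retype is exploiting for the general case,
checked with unsigned operands:

    #include <cstdint>
    #include <cassert>

    int main()
    {
       uint32_t a = 0x12345678u, b = 0x9abcdef0u;
       uint64_t mul  = (uint64_t)a * (uint16_t)b;        /* MUL: 32 x low 16 */
       uint64_t mach = ((uint64_t)a * (b >> 16)) << 16;  /* MACH's partial   */
       /* Together they form the full product; MACH hands back bits 63:32. */
       assert(mul + mach == (uint64_t)a * b);
       return 0;
    }
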
> -   case ir_binop_div:
> -      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
> -      assert(ir->type->is_integer());
> -      emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
> -      break;
> -   case ir_binop_carry: {
> -      if (brw->gen == 7)
> -         no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> -      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> -                                  BRW_REGISTER_TYPE_UD);
> -
> -      emit(ADDC(reg_null_ud, op[0], op[1]));
> -      emit(MOV(this->result, fs_reg(acc)));
> -      break;
> -   }
> -   case ir_binop_borrow: {
> -      if (brw->gen == 7)
> -         no16("SIMD16 explicit accumulator operands unsupported\n");
> -
> -      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
> -                                  BRW_REGISTER_TYPE_UD);
> -
> -      emit(SUBB(reg_null_ud, op[0], op[1]));
> -      emit(MOV(this->result, fs_reg(acc)));
> -      break;
> -   }
> -   case ir_binop_mod:
> -      /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
> -      assert(ir->type->is_integer());
> -      emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
> -      break;
> -
> -   case ir_binop_less:
> -   case ir_binop_greater:
> -   case ir_binop_lequal:
> -   case ir_binop_gequal:
> -   case ir_binop_equal:
> -   case ir_binop_all_equal:
> -   case ir_binop_nequal:
> -   case ir_binop_any_nequal:
> -      if (brw->gen <= 5) {
> -         resolve_bool_comparison(ir->operands[0], &op[0]);
> -         resolve_bool_comparison(ir->operands[1], &op[1]);
> -      }
> -
> -      emit(CMP(this->result, op[0], op[1],
> -               brw_conditional_for_comparison(ir->operation)));
> -      break;
> -
> -   case ir_binop_logic_xor:
> -      emit(XOR(this->result, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_logic_or:
> -      emit(OR(this->result, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_logic_and:
> -      emit(AND(this->result, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_dot:
> -   case ir_unop_any:
> -      unreachable("not reached: should be handled by brw_fs_channel_expressions");
> -
> -   case ir_unop_noise:
> -      unreachable("not reached: should be handled by lower_noise");
> -
> -   case ir_quadop_vector:
> -      unreachable("not reached: should be handled by lower_quadop_vector");
> -
> -   case ir_binop_vector_extract:
> -      unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
> -
> -   case ir_triop_vector_insert:
> -      unreachable("not reached: should be handled by lower_vector_insert()");
> -
> -   case ir_binop_ldexp:
> -      unreachable("not reached: should be handled by ldexp_to_arith()");
> -
> -   case ir_unop_sqrt:
> -      emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
> -      break;
> -
> -   case ir_unop_rsq:
> -      emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
> -      break;
> -
> -   case ir_unop_bitcast_i2f:
> -   case ir_unop_bitcast_u2f:
> -      op[0].type = BRW_REGISTER_TYPE_F;
> -      this->result = op[0];
> -      break;
> -   case ir_unop_i2u:
> -   case ir_unop_bitcast_f2u:
> -      op[0].type = BRW_REGISTER_TYPE_UD;
> -      this->result = op[0];
> -      break;
> -   case ir_unop_u2i:
> -   case ir_unop_bitcast_f2i:
> -      op[0].type = BRW_REGISTER_TYPE_D;
> -      this->result = op[0];
> -      break;
> -   case ir_unop_i2f:
> -   case ir_unop_u2f:
> -   case ir_unop_f2i:
> -   case ir_unop_f2u:
> -      emit(MOV(this->result, op[0]));
> -      break;
> -
> -   case ir_unop_b2i:
> -      emit(AND(this->result, op[0], fs_reg(1)));
> -      break;
> -   case ir_unop_b2f:
> -      if (brw->gen <= 5) {
> -         resolve_bool_comparison(ir->operands[0], &op[0]);
> -      }
> -      op[0].type = BRW_REGISTER_TYPE_D;
> -      this->result.type = BRW_REGISTER_TYPE_D;
> -      emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
> -      this->result.type = BRW_REGISTER_TYPE_F;
> -      break;
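
The AND-with-0x3f800000 works because resolved booleans here are 0 or ~0,
so masking with the bit pattern of 1.0f yields exactly 0.0f or 1.0f:

    #include <cstring>
    #include <cassert>

    int main()
    {
       const unsigned bools[]  = { 0u, ~0u };   /* false / true */
       const float    expect[] = { 0.0f, 1.0f };
       for (int i = 0; i < 2; i++) {
          unsigned bits = bools[i] & 0x3f800000u;  /* the AND above */
          float f;
          memcpy(&f, &bits, sizeof(f));
          assert(f == expect[i]);
       }
       return 0;
    }
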
> -
> -   case ir_unop_f2b:
> -      emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> -      break;
> -   case ir_unop_i2b:
> -      emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> -      break;
> -
> -   case ir_unop_trunc:
> -      emit(RNDZ(this->result, op[0]));
> -      break;
> -   case ir_unop_ceil: {
> -         fs_reg tmp = vgrf(ir->type);
> -         op[0].negate = !op[0].negate;
> -         emit(RNDD(tmp, op[0]));
> -         tmp.negate = true;
> -         emit(MOV(this->result, tmp));
> -      }
> -      break;
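
The hardware only has round-down (RNDD), so ceil() is emitted via the
identity ceil(x) = -floor(-x), which is exactly the negate/RNDD/negate
sequence above:

    #include <cmath>
    #include <cassert>

    int main()
    {
       const float xs[] = { 1.25f, -1.25f, 2.0f };
       for (float x : xs)
          assert(std::ceil(x) == -std::floor(-x));
       return 0;
    }
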
> -   case ir_unop_floor:
> -      emit(RNDD(this->result, op[0]));
> -      break;
> -   case ir_unop_fract:
> -      emit(FRC(this->result, op[0]));
> -      break;
> -   case ir_unop_round_even:
> -      emit(RNDE(this->result, op[0]));
> -      break;
> -
> -   case ir_binop_min:
> -   case ir_binop_max:
> -      resolve_ud_negate(&op[0]);
> -      resolve_ud_negate(&op[1]);
> -      emit_minmax(ir->operation == ir_binop_min ?
> -                  BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
> -                  this->result, op[0], op[1]);
> -      break;
> -   case ir_unop_pack_snorm_2x16:
> -   case ir_unop_pack_snorm_4x8:
> -   case ir_unop_pack_unorm_2x16:
> -   case ir_unop_pack_unorm_4x8:
> -   case ir_unop_unpack_snorm_2x16:
> -   case ir_unop_unpack_snorm_4x8:
> -   case ir_unop_unpack_unorm_2x16:
> -   case ir_unop_unpack_unorm_4x8:
> -   case ir_unop_unpack_half_2x16:
> -   case ir_unop_pack_half_2x16:
> -      unreachable("not reached: should be handled by lower_packing_builtins");
> -   case ir_unop_unpack_half_2x16_split_x:
> -      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
> -      break;
> -   case ir_unop_unpack_half_2x16_split_y:
> -      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
> -      break;
> -   case ir_binop_pow:
> -      emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
> -      break;
> -
> -   case ir_unop_bitfield_reverse:
> -      emit(BFREV(this->result, op[0]));
> -      break;
> -   case ir_unop_bit_count:
> -      emit(CBIT(this->result, op[0]));
> -      break;
> -   case ir_unop_find_msb:
> -      temp = vgrf(glsl_type::uint_type);
> -      emit(FBH(temp, op[0]));
> -
> -      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
> -       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
> -       * subtract the result from 31 to convert the MSB count into an LSB count.
> -       */
> -
> -      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
> -      emit(MOV(this->result, temp));
> -      emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
> -
> -      temp.negate = true;
> -      inst = emit(ADD(this->result, temp, fs_reg(31)));
> -      inst->predicate = BRW_PREDICATE_NORMAL;
> -      break;
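
FBH counts from the MSB side and returns ~0 for a zero input, so the
predicated ADD is the 31 - n fixup. A software model, with GCC's
__builtin_clz standing in for FBH:

    #include <cassert>

    static int find_msb(unsigned v)
    {
       if (v == 0)
          return -1;                   /* FBH gave ~0; CMP leaves -1 alone */
       return 31 - __builtin_clz(v);   /* MSB-side count -> LSB-based bit  */
    }

    int main()
    {
       assert(find_msb(0x00010000u) == 16);
       assert(find_msb(1u) == 0);
       assert(find_msb(0u) == -1);
       return 0;
    }
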
> -   case ir_unop_find_lsb:
> -      emit(FBL(this->result, op[0]));
> -      break;
> -   case ir_unop_saturate:
> -      inst = emit(MOV(this->result, op[0]));
> -      inst->saturate = true;
> -      break;
> -   case ir_triop_bitfield_extract:
> -      /* Note that the instruction's argument order is reversed from GLSL
> -       * and the IR.
> -       */
> -      emit(BFE(this->result, op[2], op[1], op[0]));
> -      break;
> -   case ir_binop_bfm:
> -      emit(BFI1(this->result, op[0], op[1]));
> -      break;
> -   case ir_triop_bfi:
> -      emit(BFI2(this->result, op[0], op[1], op[2]));
> -      break;
> -   case ir_quadop_bitfield_insert:
> -      unreachable("not reached: should be handled by "
> -              "lower_instructions::bitfield_insert_to_bfm_bfi");
> -
> -   case ir_unop_bit_not:
> -      emit(NOT(this->result, op[0]));
> -      break;
> -   case ir_binop_bit_and:
> -      emit(AND(this->result, op[0], op[1]));
> -      break;
> -   case ir_binop_bit_xor:
> -      emit(XOR(this->result, op[0], op[1]));
> -      break;
> -   case ir_binop_bit_or:
> -      emit(OR(this->result, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_lshift:
> -      emit(SHL(this->result, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_rshift:
> -      if (ir->type->base_type == GLSL_TYPE_INT)
> -        emit(ASR(this->result, op[0], op[1]));
> -      else
> -        emit(SHR(this->result, op[0], op[1]));
> -      break;
> -   case ir_binop_pack_half_2x16_split:
> -      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
> -      break;
> -   case ir_binop_ubo_load: {
> -      /* This IR node takes a constant uniform block and a constant or
> -       * variable byte offset within the block and loads a vector from that.
> -       */
> -      ir_constant *const_uniform_block = ir->operands[0]->as_constant();
> -      ir_constant *const_offset = ir->operands[1]->as_constant();
> -      fs_reg surf_index;
> -
> -      if (const_uniform_block) {
> -         /* The block index is a constant, so just emit the binding table entry
> -          * as an immediate.
> -          */
> -         surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
> -                                 const_uniform_block->value.u[0]);
> -      } else {
> -         /* The block index is not a constant. Evaluate the index expression
> -          * per-channel and add the base UBO index; the generator will select
> -          * a value from any live channel.
> -          */
> -         surf_index = vgrf(glsl_type::uint_type);
> -         emit(ADD(surf_index, op[0],
> -                  fs_reg(stage_prog_data->binding_table.ubo_start)))
> -            ->force_writemask_all = true;
> -
> -         /* Assume this may touch any UBO. It would be nice to provide
> -          * a tighter bound, but the array information is already lowered away.
> -          */
> -         brw_mark_surface_used(prog_data,
> -                               stage_prog_data->binding_table.ubo_start +
> -                               shader_prog->NumUniformBlocks - 1);
> -      }
> -
> -      if (const_offset) {
> -         fs_reg packed_consts = vgrf(glsl_type::float_type);
> -         packed_consts.type = result.type;
> -
> -         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
> -         emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
> -                                   packed_consts, surf_index, const_offset_reg));
> -
> -         for (int i = 0; i < ir->type->vector_elements; i++) {
> -            packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
> -
> -            /* The std140 packing rules don't allow vectors to cross 16-byte
> -             * boundaries, and a reg is 32 bytes.
> -             */
> -            assert(packed_consts.subreg_offset < 32);
> -
> -            /* UBO bools are any nonzero value.  We consider bools to be
> -             * values with the low bit set to 1.  Convert them using CMP.
> -             */
> -            if (ir->type->base_type == GLSL_TYPE_BOOL) {
> -               emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
> -            } else {
> -               emit(MOV(result, packed_consts));
> -            }
> -
> -            result = offset(result, 1);
> -         }
> -      } else {
> -         /* Turn the byte offset into a dword offset. */
> -         fs_reg base_offset = vgrf(glsl_type::int_type);
> -         emit(SHR(base_offset, op[1], fs_reg(2)));
> -
> -         for (int i = 0; i < ir->type->vector_elements; i++) {
> -            emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
> -                                            base_offset, i));
> -
> -            if (ir->type->base_type == GLSL_TYPE_BOOL)
> -               emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
> -
> -            result = offset(result, 1);
> -         }
> -      }
> -
> -      result.reg_offset = 0;
> -      break;
> -   }
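
The smear arithmetic in the constant-offset path: the pull-constant load
fetches a 16-byte-aligned block, and value.u[0] % 16 / 4 + i then selects
dwords within it. Worked through for a hypothetical vec3 at byte offset 20:

    #include <cassert>

    int main()
    {
       unsigned byte_offset = 20;             /* hypothetical UBO offset */
       assert((byte_offset & ~15u) == 16u);   /* const_offset_reg: block base */
       for (unsigned i = 0; i < 3; i++)       /* vec3 -> three dwords */
          assert(byte_offset % 16 / 4 + i == 1 + i);  /* dwords 1, 2, 3 */
       return 0;
    }
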
> -
> -   case ir_triop_fma:
> -      /* Note that the instruction's argument order is reversed from GLSL
> -       * and the IR.
> -       */
> -      emit(MAD(this->result, op[2], op[1], op[0]));
> -      break;
> -
> -   case ir_triop_lrp:
> -      emit_lrp(this->result, op[0], op[1], op[2]);
> -      break;
> -
> -   case ir_triop_csel:
> -   case ir_unop_interpolate_at_centroid:
> -   case ir_binop_interpolate_at_offset:
> -   case ir_binop_interpolate_at_sample:
> -      unreachable("already handled above");
> -      break;
> -
> -   case ir_unop_d2f:
> -   case ir_unop_f2d:
> -   case ir_unop_d2i:
> -   case ir_unop_i2d:
> -   case ir_unop_d2u:
> -   case ir_unop_u2d:
> -   case ir_unop_d2b:
> -   case ir_unop_pack_double_2x32:
> -   case ir_unop_unpack_double_2x32:
> -   case ir_unop_frexp_sig:
> -   case ir_unop_frexp_exp:
> -      unreachable("fp64 todo");
> -      break;
> -   }
> -}
> -
> -void
> -fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
> -                                  const glsl_type *type, bool predicated)
> -{
> -   switch (type->base_type) {
> -   case GLSL_TYPE_FLOAT:
> -   case GLSL_TYPE_UINT:
> -   case GLSL_TYPE_INT:
> -   case GLSL_TYPE_BOOL:
> -      for (unsigned int i = 0; i < type->components(); i++) {
> -        l.type = brw_type_for_base_type(type);
> -        r.type = brw_type_for_base_type(type);
> -
> -        if (predicated || !l.equals(r)) {
> -           fs_inst *inst = emit(MOV(l, r));
> -           inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
> -        }
> -
> -        l = offset(l, 1);
> -        r = offset(r, 1);
> -      }
> -      break;
> -   case GLSL_TYPE_ARRAY:
> -      for (unsigned int i = 0; i < type->length; i++) {
> -        emit_assignment_writes(l, r, type->fields.array, predicated);
> -      }
> -      break;
> -
> -   case GLSL_TYPE_STRUCT:
> -      for (unsigned int i = 0; i < type->length; i++) {
> -        emit_assignment_writes(l, r, type->fields.structure[i].type,
> -                               predicated);
> -      }
> -      break;
> -
> -   case GLSL_TYPE_SAMPLER:
> -   case GLSL_TYPE_IMAGE:
> -   case GLSL_TYPE_ATOMIC_UINT:
> -      break;
> -
> -   case GLSL_TYPE_DOUBLE:
> -   case GLSL_TYPE_VOID:
> -   case GLSL_TYPE_ERROR:
> -   case GLSL_TYPE_INTERFACE:
> -      unreachable("not reached");
> -   }
> -}
> -
> -/* If the RHS processing resulted in an instruction generating a
> - * temporary value, and it would be easy to rewrite the instruction to
> - * generate its result right into the LHS instead, do so.  This ends
> - * up reliably removing instructions where it can be tricky to do so
> - * later without real UD chain information.
> - */
> -bool
> -fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
> -                                   fs_reg dst,
> -                                   fs_reg src,
> -                                   fs_inst *pre_rhs_inst,
> -                                   fs_inst *last_rhs_inst)
> -{
> -   /* Only attempt if we're doing a direct assignment. */
> -   if (ir->condition ||
> -       !(ir->lhs->type->is_scalar() ||
> -        (ir->lhs->type->is_vector() &&
> -         ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
> -      return false;
> -
> -   /* Make sure the last instruction generated our source reg. */
> -   fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
> -                                                   last_rhs_inst,
> -                                                   src);
> -   if (!modify)
> -      return false;
> -
> -   /* If last_rhs_inst wrote a different number of components than our LHS,
> -    * we can't safely rewrite it.
> -    */
> -   if (alloc.sizes[dst.reg] != modify->regs_written)
> -      return false;
> -
> -   /* Success!  Rewrite the instruction. */
> -   modify->dst = dst;
> -
> -   return true;
> -}
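
In other words, the classic eager copy-elimination: instead of

    ADD(tmp, a, b)
    MOV(dst, tmp)

the ADD is retargeted to write dst directly and the MOV is never emitted,
which -- as the comment says -- spares us needing real use/def chain
information later.
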
> -
> -void
> -fs_visitor::visit(ir_assignment *ir)
> -{
> -   fs_reg l, r;
> -   fs_inst *inst;
> -
> -   /* FINISHME: arrays on the lhs */
> -   ir->lhs->accept(this);
> -   l = this->result;
> -
> -   fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
> -
> -   ir->rhs->accept(this);
> -   r = this->result;
> -
> -   fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
> -
> -   assert(l.file != BAD_FILE);
> -   assert(r.file != BAD_FILE);
> -
> -   if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
> -      return;
> -
> -   if (ir->condition) {
> -      emit_bool_to_cond_code(ir->condition);
> -   }
> -
> -   if (ir->lhs->type->is_scalar() ||
> -       ir->lhs->type->is_vector()) {
> -      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
> -        if (ir->write_mask & (1 << i)) {
> -           inst = emit(MOV(l, r));
> -           if (ir->condition)
> -              inst->predicate = BRW_PREDICATE_NORMAL;
> -           r = offset(r, 1);
> -        }
> -        l = offset(l, 1);
> -      }
> -   } else {
> -      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
> -   }
> -}
> -
> -fs_inst *
> -fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
> -                              fs_reg coordinate, int coord_components,
> -                              fs_reg shadow_c,
> -                              fs_reg lod, fs_reg dPdy, int grad_components,
> -                              uint32_t sampler)
> -{
> -   int mlen;
> -   int base_mrf = 1;
> -   bool simd16 = false;
> -   fs_reg orig_dst;
> -
> -   /* g0 header. */
> -   mlen = 1;
> -
> -   if (shadow_c.file != BAD_FILE) {
> -      for (int i = 0; i < coord_components; i++) {
> -        emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> -        coordinate = offset(coordinate, 1);
> -      }
> -
> -      /* gen4's SIMD8 sampler always has the slots for u,v,r present;
> -       * the unused slots must be zeroed.
> -       */
> -      for (int i = coord_components; i < 3; i++) {
> -         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
> -      }
> -      mlen += 3;
> -
> -      if (op == ir_tex) {
> -        /* There's no plain shadow compare message, so we use shadow
> -         * compare with a bias of 0.0.
> -         */
> -        emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
> -        mlen++;
> -      } else if (op == ir_txb || op == ir_txl) {
> -        emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
> -        mlen++;
> -      } else {
> -         unreachable("Should not get here.");
> -      }
> -
> -      emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
> -      mlen++;
> -   } else if (op == ir_tex) {
> -      for (int i = 0; i < coord_components; i++) {
> -        emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> -        coordinate = offset(coordinate, 1);
> -      }
> -      /* Zero the others. */
> -      for (int i = coord_components; i < 3; i++) {
> -         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
> -      }
> -      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
> -      mlen += 3;
> -   } else if (op == ir_txd) {
> -      fs_reg &dPdx = lod;
> -
> -      for (int i = 0; i < coord_components; i++) {
> -        emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
> -        coordinate = offset(coordinate, 1);
> -      }
> -      /* the slots for u and v are always present, but r is optional */
> -      mlen += MAX2(coord_components, 2);
> -
> -      /*  P   = u, v, r
> -       * dPdx = dudx, dvdx, drdx
> -       * dPdy = dudy, dvdy, drdy
> -       *
> -       * 1-arg: Does not exist.
> -       *
> -       * 2-arg: dudx   dvdx   dudy   dvdy
> -       *        dPdx.x dPdx.y dPdy.x dPdy.y
> -       *        m4     m5     m6     m7
> -       *
> -       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
> -       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
> -       *        m5     m6     m7     m8     m9     m10
> -       */
> -      for (int i = 0; i < grad_components; i++) {
> -        emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
> -        dPdx = offset(dPdx, 1);
> -      }
> -      mlen += MAX2(grad_components, 2);
> -
> -      for (int i = 0; i < grad_components; i++) {
> -        emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
> -        dPdy = offset(dPdy, 1);
> -      }
> -      mlen += MAX2(grad_components, 2);
> -   } else if (op == ir_txs) {
> -      /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
> -      simd16 = true;
> -      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
> -      mlen += 2;
> -   } else {
> -      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
> -       * instructions.  We'll need to do SIMD16 here.
> -       */
> -      simd16 = true;
> -      assert(op == ir_txb || op == ir_txl || op == ir_txf);
> -
> -      for (int i = 0; i < coord_components; i++) {
> -        emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
> -                  coordinate));
> -        coordinate = offset(coordinate, 1);
> -      }
> -
> -      /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
> -       * be necessary for TXF (ld), but seems wise to do for all messages.
> -       */
> -      for (int i = coord_components; i < 3; i++) {
> -        emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
> -      }
> -
> -      /* lod/bias appears after u/v/r. */
> -      mlen += 6;
> -
> -      emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
> -      mlen++;
> -
> -      /* The unused upper half. */
> -      mlen++;
> -   }
> -
> -   if (simd16) {
> -      /* Now, since we're doing SIMD16, the return is 2 interleaved
> -       * vec4s where the odd-indexed ones are junk. We'll need to move
> -       * this weirdness around to the expected layout.
> -       */
> -      orig_dst = dst;
> -      dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
> -   }
> -
> -   enum opcode opcode;
> -   switch (op) {
> -   case ir_tex: opcode = SHADER_OPCODE_TEX; break;
> -   case ir_txb: opcode = FS_OPCODE_TXB; break;
> -   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> -   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> -   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> -   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> -   default:
> -      unreachable("not reached");
> -   }
> -
> -   fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
> -   inst->base_mrf = base_mrf;
> -   inst->mlen = mlen;
> -   inst->header_present = true;
> -   inst->regs_written = simd16 ? 8 : 4;
> -
> -   if (simd16) {
> -      for (int i = 0; i < 4; i++) {
> -        emit(MOV(orig_dst, dst));
> -        orig_dst = offset(orig_dst, 1);
> -        dst = offset(dst, 2);
> -      }
> -   }
> -
> -   return inst;
> -}
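
The SIMD16 deinterleave at the end copies every other register back out
(the offset(dst, 2) stepping); modeled on plain arrays:

    #include <cassert>

    int main()
    {
       const int ret[8] = { 10, 99, 11, 99, 12, 99, 13, 99 };  /* odd = junk */
       int dst[4];
       for (int i = 0; i < 4; i++)
          dst[i] = ret[i * 2];     /* stride-2 copy, as above */
       for (int i = 0; i < 4; i++)
          assert(dst[i] == 10 + i);
       return 0;
    }
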
> -
> -/* gen5's sampler has slots for u, v, r, array index, then optional
> - * parameters like shadow comparator or LOD bias.  If optional
> - * parameters aren't present, those base slots are optional and don't
> - * need to be included in the message.
> - *
> - * Even then, we don't fill in the unnecessary slots, which may look
> - * surprising in the disassembly.
> - */
> -fs_inst *
> -fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
> -                              fs_reg coordinate, int vector_elements,
> -                              fs_reg shadow_c,
> -                              fs_reg lod, fs_reg lod2, int grad_components,
> -                              fs_reg sample_index, uint32_t sampler,
> -                              bool has_offset)
> -{
> -   int reg_width = dispatch_width / 8;
> -   bool header_present = false;
> -
> -   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
> -   fs_reg msg_coords = message;
> -
> -   if (has_offset) {
> -      /* The offsets set up by the ir_texture visitor are in the
> -       * m1 header, so we can't go headerless.
> -       */
> -      header_present = true;
> -      message.reg--;
> -   }
> -
> -   for (int i = 0; i < vector_elements; i++) {
> -      emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
> -      coordinate = offset(coordinate, 1);
> -   }
> -   fs_reg msg_end = offset(msg_coords, vector_elements);
> -   fs_reg msg_lod = offset(msg_coords, 4);
> -
> -   if (shadow_c.file != BAD_FILE) {
> -      fs_reg msg_shadow = msg_lod;
> -      emit(MOV(msg_shadow, shadow_c));
> -      msg_lod = offset(msg_shadow, 1);
> -      msg_end = msg_lod;
> -   }
> -
> -   enum opcode opcode;
> -   switch (op) {
> -   case ir_tex:
> -      opcode = SHADER_OPCODE_TEX;
> -      break;
> -   case ir_txb:
> -      emit(MOV(msg_lod, lod));
> -      msg_end = offset(msg_lod, 1);
> -
> -      opcode = FS_OPCODE_TXB;
> -      break;
> -   case ir_txl:
> -      emit(MOV(msg_lod, lod));
> -      msg_end = offset(msg_lod, 1);
> -
> -      opcode = SHADER_OPCODE_TXL;
> -      break;
> -   case ir_txd: {
> -      /**
> -       *  P   =  u,    v,    r
> -       * dPdx = dudx, dvdx, drdx
> -       * dPdy = dudy, dvdy, drdy
> -       *
> -       * Load up these values:
> -       * - dudx   dudy   dvdx   dvdy   drdx   drdy
> -       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
> -       */
> -      msg_end = msg_lod;
> -      for (int i = 0; i < grad_components; i++) {
> -         emit(MOV(msg_end, lod));
> -         lod = offset(lod, 1);
> -         msg_end = offset(msg_end, 1);
> -
> -         emit(MOV(msg_end, lod2));
> -         lod2 = offset(lod2, 1);
> -         msg_end = offset(msg_end, 1);
> -      }
> -
> -      opcode = SHADER_OPCODE_TXD;
> -      break;
> -   }
> -   case ir_txs:
> -      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
> -      emit(MOV(msg_lod, lod));
> -      msg_end = offset(msg_lod, 1);
> -
> -      opcode = SHADER_OPCODE_TXS;
> -      break;
> -   case ir_query_levels:
> -      msg_lod = msg_end;
> -      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> -      msg_end = offset(msg_lod, 1);
> -
> -      opcode = SHADER_OPCODE_TXS;
> -      break;
> -   case ir_txf:
> -      msg_lod = offset(msg_coords, 3);
> -      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
> -      msg_end = offset(msg_lod, 1);
> -
> -      opcode = SHADER_OPCODE_TXF;
> -      break;
> -   case ir_txf_ms:
> -      msg_lod = offset(msg_coords, 3);
> -      /* lod */
> -      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> -      /* sample index */
> -      emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
> -      msg_end = offset(msg_lod, 2);
> -
> -      opcode = SHADER_OPCODE_TXF_CMS;
> -      break;
> -   case ir_lod:
> -      opcode = SHADER_OPCODE_LOD;
> -      break;
> -   case ir_tg4:
> -      opcode = SHADER_OPCODE_TG4;
> -      break;
> -   default:
> -      unreachable("not reached");
> -   }
> -
> -   fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
> -   inst->base_mrf = message.reg;
> -   inst->mlen = msg_end.reg - message.reg;
> -   inst->header_present = header_present;
> -   inst->regs_written = 4 * reg_width;
> -
> -   if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
> -      fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
> -           " disallowed by hardware\n");
> -   }
> -
> -   return inst;
> -}
> -
> -static bool
> -is_high_sampler(struct brw_context *brw, fs_reg sampler)
> -{
> -   if (brw->gen < 8 && !brw->is_haswell)
> -      return false;
> -
> -   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
> -}
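
Context for is_high_sampler(): the sampler index field in the message
descriptor is only 4 bits, so samplers >= 16 can't be encoded directly and
need the Sampler State Pointer adjusted in the header instead:

    #include <cassert>

    int main()
    {
       unsigned sampler = 20;    /* hypothetical array element */
       assert(sampler > 0xf);    /* doesn't fit the 4-bit descriptor field */
       assert(sampler >= 16);    /* so is_high_sampler() returns true */
       return 0;
    }
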
> -
> -fs_inst *
> -fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
> -                              fs_reg coordinate, int coord_components,
> -                              fs_reg shadow_c,
> -                              fs_reg lod, fs_reg lod2, int grad_components,
> -                              fs_reg sample_index, fs_reg mcs, fs_reg sampler,
> -                              fs_reg offset_value)
> -{
> -   int reg_width = dispatch_width / 8;
> -   bool header_present = false;
> -
> -   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
> -   for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
> -      sources[i] = vgrf(glsl_type::float_type);
> -   }
> -   int length = 0;
> -
> -   if (op == ir_tg4 || offset_value.file != BAD_FILE ||
> -       is_high_sampler(brw, sampler)) {
> -      /* For general texture offsets (no txf workaround), we need a header to
> -       * put them in.  Note that for SIMD16 we're making space for two actual
> -       * hardware registers here, so the emit will have to fix up for this.
> -       *
> -       * ir_tg4 needs to place its channel select in the header,
> -       * for interaction with ARB_texture_swizzle.
> -       *
> -       * The sampler index is only 4 bits, so for larger sampler numbers we
> -       * need to offset the Sampler State Pointer in the header.
> -       */
> -      header_present = true;
> -      sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> -      length++;
> -   }
> -
> -   if (shadow_c.file != BAD_FILE) {
> -      emit(MOV(sources[length], shadow_c));
> -      length++;
> -   }
> -
> -   bool has_nonconstant_offset =
> -      offset_value.file != BAD_FILE && offset_value.file != IMM;
> -   bool coordinate_done = false;
> -
> -   /* Set up the LOD info */
> -   switch (op) {
> -   case ir_tex:
> -   case ir_lod:
> -      break;
> -   case ir_txb:
> -      emit(MOV(sources[length], lod));
> -      length++;
> -      break;
> -   case ir_txl:
> -      emit(MOV(sources[length], lod));
> -      length++;
> -      break;
> -   case ir_txd: {
> -      no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
> -
> -      /* Load dPdx and the coordinate together:
> -       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
> -       */
> -      for (int i = 0; i < coord_components; i++) {
> -        emit(MOV(sources[length], coordinate));
> -        coordinate = offset(coordinate, 1);
> -        length++;
> -
> -         /* For cube map array, the coordinate is (u,v,r,ai) but there are
> -          * only derivatives for (u, v, r).
> -          */
> -         if (i < grad_components) {
> -            emit(MOV(sources[length], lod));
> -            lod = offset(lod, 1);
> -            length++;
> -
> -            emit(MOV(sources[length], lod2));
> -            lod2 = offset(lod2, 1);
> -            length++;
> -         }
> -      }
> -
> -      coordinate_done = true;
> -      break;
> -   }
> -   case ir_txs:
> -      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
> -      length++;
> -      break;
> -   case ir_query_levels:
> -      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
> -      length++;
> -      break;
> -   case ir_txf:
> -      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
> -      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> -      coordinate = offset(coordinate, 1);
> -      length++;
> -
> -      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
> -      length++;
> -
> -      for (int i = 1; i < coord_components; i++) {
> -        emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> -        coordinate = offset(coordinate, 1);
> -        length++;
> -      }
> -
> -      coordinate_done = true;
> -      break;
> -   case ir_txf_ms:
> -      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
> -      length++;
> -
> -      /* data from the multisample control surface */
> -      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
> -      length++;
> -
> -      /* there is no offsetting for this message; just copy in the integer
> -       * texture coordinates
> -       */
> -      for (int i = 0; i < coord_components; i++) {
> -         emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
> -         coordinate = offset(coordinate, 1);
> -         length++;
> -      }
> -
> -      coordinate_done = true;
> -      break;
> -   case ir_tg4:
> -      if (has_nonconstant_offset) {
> -         if (shadow_c.file != BAD_FILE)
> -            no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
> -
> -         /* More crazy intermixing */
> -         for (int i = 0; i < 2; i++) { /* u, v */
> -            emit(MOV(sources[length], coordinate));
> -            coordinate = offset(coordinate, 1);
> -            length++;
> -         }
> -
> -         for (int i = 0; i < 2; i++) { /* offu, offv */
> -            emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
> -            offset_value = offset(offset_value, 1);
> -            length++;
> -         }
> -
> -         if (coord_components == 3) { /* r if present */
> -            emit(MOV(sources[length], coordinate));
> -            coordinate = offset(coordinate, 1);
> -            length++;
> -         }
> -
> -         coordinate_done = true;
> -      }
> -      break;
> -   }
> -
> -   /* Set up the coordinate (except for cases where it was done above) */
> -   if (!coordinate_done) {
> -      for (int i = 0; i < coord_components; i++) {
> -         emit(MOV(sources[length], coordinate));
> -         coordinate = offset(coordinate, 1);
> -         length++;
> -      }
> -   }
> -
> -   int mlen;
> -   if (reg_width == 2)
> -      mlen = length * reg_width - header_present;
> -   else
> -      mlen = length * reg_width;
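
The mlen accounting here: in SIMD16 every logical payload source occupies
reg_width == 2 hardware registers except the header, which is always a
single register -- hence the "- header_present". For example:

    #include <cassert>

    int main()
    {
       int reg_width = 2;        /* SIMD16 */
       int header_present = 1;
       int length = 5;           /* e.g. header + shadow_c + 3 coordinates */
       int mlen = length * reg_width - header_present;
       assert(mlen == 9);        /* 1 header reg + 4 sources x 2 regs each */
       return 0;
    }
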
> -
> -   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> -                               BRW_REGISTER_TYPE_F);
> -   emit(LOAD_PAYLOAD(src_payload, sources, length));
> -
> -   /* Generate the SEND */
> -   enum opcode opcode;
> -   switch (op) {
> -   case ir_tex: opcode = SHADER_OPCODE_TEX; break;
> -   case ir_txb: opcode = FS_OPCODE_TXB; break;
> -   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> -   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> -   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> -   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
> -   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> -   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
> -   case ir_lod: opcode = SHADER_OPCODE_LOD; break;
> -   case ir_tg4:
> -      if (has_nonconstant_offset)
> -         opcode = SHADER_OPCODE_TG4_OFFSET;
> -      else
> -         opcode = SHADER_OPCODE_TG4;
> -      break;
> -   default:
> -      unreachable("not reached");
> -   }
> -   fs_inst *inst = emit(opcode, dst, src_payload, sampler);
> -   inst->base_mrf = -1;
> -   inst->mlen = mlen;
> -   inst->header_present = header_present;
> -   inst->regs_written = 4 * reg_width;
> -
> -   if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
> -      fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
> -           " disallowed by hardware\n");
> -   }
> -
> -   return inst;
> -}
> -
> -fs_reg
> -fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
> -                             bool is_rect, uint32_t sampler, int texunit)
> -{
> -   fs_inst *inst = NULL;
> -   bool needs_gl_clamp = true;
> -   fs_reg scale_x, scale_y;
> -
> -   /* The 965 requires the EU to do the normalization of GL rectangle
> -    * texture coordinates.  We use the program parameter state
> -    * tracking to get the scaling factor.
> -    */
> -   if (is_rect &&
> -       (brw->gen < 6 ||
> -        (brw->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
> -                           key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
> -      struct gl_program_parameter_list *params = prog->Parameters;
> -      int tokens[STATE_LENGTH] = {
> -        STATE_INTERNAL,
> -        STATE_TEXRECT_SCALE,
> -        texunit,
> -        0,
> -        0
> -      };
> -
> -      no16("rectangle scale uniform setup not supported on SIMD16\n");
> -      if (dispatch_width == 16) {
> -        return coordinate;
> -      }
> -
> -      GLuint index = _mesa_add_state_reference(params,
> -                                              (gl_state_index *)tokens);
> -      /* Try to find existing copies of the texrect scale uniforms. */
> -      for (unsigned i = 0; i < uniforms; i++) {
> -         if (stage_prog_data->param[i] ==
> -             &prog->Parameters->ParameterValues[index][0]) {
> -            scale_x = fs_reg(UNIFORM, i);
> -            scale_y = fs_reg(UNIFORM, i + 1);
> -            break;
> -         }
> -      }
> -
> -      /* If we didn't already set them up, do so now. */
> -      if (scale_x.file == BAD_FILE) {
> -         scale_x = fs_reg(UNIFORM, uniforms);
> -         scale_y = fs_reg(UNIFORM, uniforms + 1);
> -
> -         stage_prog_data->param[uniforms++] =
> -            &prog->Parameters->ParameterValues[index][0];
> -         stage_prog_data->param[uniforms++] =
> -            &prog->Parameters->ParameterValues[index][1];
> -      }
> -   }
> -
> -   /* The 965 requires the EU to do the normalization of GL rectangle
> -    * texture coordinates.  We use the program parameter state
> -    * tracking to get the scaling factor.
> -    */
> -   if (brw->gen < 6 && is_rect) {
> -      fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
> -      fs_reg src = coordinate;
> -      coordinate = dst;
> -
> -      emit(MUL(dst, src, scale_x));
> -      dst = offset(dst, 1);
> -      src = offset(src, 1);
> -      emit(MUL(dst, src, scale_y));
> -   } else if (is_rect) {
> -      /* On gen6+, the sampler handles the rectangle coordinates
> -       * natively, without needing rescaling.  But that means we have
> -       * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
> -       * not [0, 1] like the default case below.
> -       */
> -      needs_gl_clamp = false;
> -
> -      for (int i = 0; i < 2; i++) {
> -        if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
> -           fs_reg chan = coordinate;
> -           chan = offset(chan, i);
> -
> -           inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
> -           inst->conditional_mod = BRW_CONDITIONAL_GE;
> -
> -           /* Our parameter comes in as 1.0/width or 1.0/height,
> -            * because that's what people normally want for doing
> -            * texture rectangle handling.  We need width or height
> -            * for clamping, but we don't care enough to make a new
> -            * parameter type, so just invert back.
> -            */
> -           fs_reg limit = vgrf(glsl_type::float_type);
> -           emit(MOV(limit, i == 0 ? scale_x : scale_y));
> -           emit(SHADER_OPCODE_RCP, limit, limit);
> -
> -           inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
> -           inst->conditional_mod = BRW_CONDITIONAL_L;
> -        }
> -      }
> -   }
> -
> -   if (coord_components > 0 && needs_gl_clamp) {
> -      for (int i = 0; i < MIN2(coord_components, 3); i++) {
> -        if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
> -           fs_reg chan = coordinate;
> -           chan = offset(chan, i);
> -
> -           fs_inst *inst = emit(MOV(chan, chan));
> -           inst->saturate = true;
> -        }
> -      }
> -   }
> -   return coordinate;
> -}
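
On the clamp-limit computation above: since the state tracker provides
1/width and 1/height, the GL_CLAMP upper bound is recovered with a
reciprocal rather than by adding a new parameter type:

    #include <cassert>

    int main()
    {
       float scale_x = 1.0f / 512.0f;  /* what STATE_TEXRECT_SCALE provides */
       float limit = 1.0f / scale_x;   /* the SHADER_OPCODE_RCP above */
       assert(limit == 512.0f);        /* clamp range becomes [0, width] */
       return 0;
    }
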
> -
> -/* Sample from the MCS surface attached to this multisample texture. */
> -fs_reg
> -fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
> -{
> -   int reg_width = dispatch_width / 8;
> -   fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
> -                           BRW_REGISTER_TYPE_F);
> -   fs_reg dest = vgrf(glsl_type::uvec4_type);
> -   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
> -
> -   /* parameters are: u, v, r; missing parameters are treated as zero */
> -   for (int i = 0; i < components; i++) {
> -      sources[i] = vgrf(glsl_type::float_type);
> -      emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
> -      coordinate = offset(coordinate, 1);
> -   }
> -
> -   emit(LOAD_PAYLOAD(payload, sources, components));
> -
> -   fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
> -   inst->base_mrf = -1;
> -   inst->mlen = components * reg_width;
> -   inst->header_present = false;
> -   inst->regs_written = 4 * reg_width; /* we only care about one reg of
> -                                        * response, but the sampler always
> -                                        * writes 4/8
> -                                        */
> -
> -   return dest;
> -}
> -
> -void
> -fs_visitor::emit_texture(ir_texture_opcode op,
> -                         const glsl_type *dest_type,
> -                         fs_reg coordinate, int coord_components,
> -                         fs_reg shadow_c,
> -                         fs_reg lod, fs_reg lod2, int grad_components,
> -                         fs_reg sample_index,
> -                         fs_reg offset_value,
> -                         fs_reg mcs,
> -                         int gather_component,
> -                         bool is_cube_array,
> -                         bool is_rect,
> -                         uint32_t sampler,
> -                         fs_reg sampler_reg, int texunit)
> -{
> -   fs_inst *inst = NULL;
> -
> -   if (op == ir_tg4) {
> -      /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
> -       * emitting anything other than setting up the constant result.
> -       */
> -      int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
> -      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
> -
> -         fs_reg res = vgrf(glsl_type::vec4_type);
> -         this->result = res;
> -
> -         for (int i = 0; i < 4; i++) {
> -            emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
> -            res = offset(res, 1);
> -         }
> -         return;
> -      }
> -   }
> -
> -   if (coordinate.file != BAD_FILE) {
> -      /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
> -       * samplers.  This should only be a problem with GL_CLAMP on Gen7.
> -       */
> -      coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
> -                                    sampler, texunit);
> -   }
> -
> -   /* Writemasking doesn't eliminate channels on SIMD8 texture
> -    * samples, so don't worry about them.
> -    */
> -   fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
> -
> -   if (brw->gen >= 7) {
> -      inst = emit_texture_gen7(op, dst, coordinate, coord_components,
> -                               shadow_c, lod, lod2, grad_components,
> -                               sample_index, mcs, sampler_reg,
> -                               offset_value);
> -   } else if (brw->gen >= 5) {
> -      inst = emit_texture_gen5(op, dst, coordinate, coord_components,
> -                               shadow_c, lod, lod2, grad_components,
> -                               sample_index, sampler,
> -                               offset_value.file != BAD_FILE);
> -   } else {
> -      inst = emit_texture_gen4(op, dst, coordinate, coord_components,
> -                               shadow_c, lod, lod2, grad_components,
> -                               sampler);
> -   }
> -
> -   if (shadow_c.file != BAD_FILE)
> -      inst->shadow_compare = true;
> -
> -   if (offset_value.file == IMM)
> -      inst->offset = offset_value.fixed_hw_reg.dw1.ud;
> -
> -   if (op == ir_tg4) {
> -      inst->offset |=
> -         gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
> -
> -      if (brw->gen == 6)
> -         emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
> -   }
> -
> -   /* fixup #layers for cube map arrays */
> -   if (op == ir_txs && is_cube_array) {
> -      fs_reg depth = offset(dst, 2);
> -      fs_reg fixed_depth = vgrf(glsl_type::int_type);
> -      emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
> -
> -      fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
> -      int components = inst->regs_written / (dst.width / 8);
> -      for (int i = 0; i < components; i++) {
> -         if (i == 2) {
> -            fixed_payload[i] = fixed_depth;
> -         } else {
> -            fixed_payload[i] = offset(dst, i);
> -         }
> -      }
> -      emit(LOAD_PAYLOAD(dst, fixed_payload, components));
> -   }
> -
> -   swizzle_result(op, dest_type->vector_elements, dst, sampler);
> -}
> -
> -void
> -fs_visitor::visit(ir_texture *ir)
> -{
> -   uint32_t sampler =
> -      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
> -
> -   ir_rvalue *nonconst_sampler_index =
> -      _mesa_get_sampler_array_nonconst_index(ir->sampler);
> -
> -   /* Handle non-constant sampler array indexing */
> -   fs_reg sampler_reg;
> -   if (nonconst_sampler_index) {
> -      /* The highest sampler which may be used by this operation is
> -       * the last element of the array. Mark it here, because the generator
> -       * doesn't have enough information to determine the bound.
> -       */
> -      uint32_t array_size = ir->sampler->as_dereference_array()
> -         ->array->type->array_size();
> -
> -      uint32_t max_used = sampler + array_size - 1;
> -      if (ir->op == ir_tg4 && brw->gen < 8) {
> -         max_used += stage_prog_data->binding_table.gather_texture_start;
> -      } else {
> -         max_used += stage_prog_data->binding_table.texture_start;
> -      }
> -
> -      brw_mark_surface_used(prog_data, max_used);
> -
> -      /* Emit code to evaluate the actual indexing expression */
> -      nonconst_sampler_index->accept(this);
> -      fs_reg temp = vgrf(glsl_type::uint_type);
> -      emit(ADD(temp, this->result, fs_reg(sampler)))
> -            ->force_writemask_all = true;
> -      sampler_reg = temp;
> -   } else {
> -      /* Single sampler, or constant array index; the indexing expression
> -       * is just an immediate.
> -       */
> -      sampler_reg = fs_reg(sampler);
> -   }
> -
> -   /* FINISHME: We're failing to recompile our programs when the sampler is
> -    * updated.  This only matters for the texture rectangle scale parameters
> -    * (pre-gen6, or gen6+ with GL_CLAMP).
> -    */
> -   int texunit = prog->SamplerUnits[sampler];
> -
> -   /* Should be lowered by do_lower_texture_projection */
> -   assert(!ir->projector);
> -
> -   /* Should be lowered */
> -   assert(!ir->offset || !ir->offset->type->is_array());
> -
> -   /* Generate code to compute all the subexpression trees.  This has to be
> -    * done before loading any values into MRFs for the sampler message since
> -    * generating these values may involve SEND messages that need the MRFs.
> -    */
> -   fs_reg coordinate;
> -   int coord_components = 0;
> -   if (ir->coordinate) {
> -      coord_components = ir->coordinate->type->vector_elements;
> -      ir->coordinate->accept(this);
> -      coordinate = this->result;
> -   }
> -
> -   fs_reg shadow_comparitor;
> -   if (ir->shadow_comparitor) {
> -      ir->shadow_comparitor->accept(this);
> -      shadow_comparitor = this->result;
> -   }
> -
> -   fs_reg offset_value;
> -   if (ir->offset) {
> -      ir_constant *const_offset = ir->offset->as_constant();
> -      if (const_offset) {
> -         /* Store the header bitfield in an IMM register.  This allows us to
> -          * use offset_value.file to distinguish between no offset, a constant
> -          * offset, and a non-constant offset.
> -          */
> -         offset_value =
> -            fs_reg(brw_texture_offset(ctx, const_offset->value.i,
> -                                      const_offset->type->vector_elements));
> -      } else {
> -         ir->offset->accept(this);
> -         offset_value = this->result;
> -      }
> -   }
> -
> -   fs_reg lod, lod2, sample_index, mcs;
> -   int grad_components = 0;
> -   switch (ir->op) {
> -   case ir_tex:
> -   case ir_lod:
> -   case ir_tg4:
> -   case ir_query_levels:
> -      break;
> -   case ir_txb:
> -      ir->lod_info.bias->accept(this);
> -      lod = this->result;
> -      break;
> -   case ir_txd:
> -      ir->lod_info.grad.dPdx->accept(this);
> -      lod = this->result;
> -
> -      ir->lod_info.grad.dPdy->accept(this);
> -      lod2 = this->result;
> -
> -      grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
> -      break;
> -   case ir_txf:
> -   case ir_txl:
> -   case ir_txs:
> -      ir->lod_info.lod->accept(this);
> -      lod = this->result;
> -      break;
> -   case ir_txf_ms:
> -      ir->lod_info.sample_index->accept(this);
> -      sample_index = this->result;
> -
> -      if (brw->gen >= 7 &&
> -          key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
> -         mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
> -                              sampler_reg);
> -      } else {
> -         mcs = fs_reg(0u);
> -      }
> -      break;
> -   default:
> -      unreachable("Unrecognized texture opcode");
> -   }
> -
> -   int gather_component = 0;
> -   if (ir->op == ir_tg4)
> -      gather_component = ir->lod_info.component->as_constant()->value.i[0];
> -
> -   bool is_rect =
> -      ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
> -
> -   bool is_cube_array =
> -      ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
> -      ir->sampler->type->sampler_array;
> -
> -   emit_texture(ir->op, ir->type, coordinate, coord_components,
> -                shadow_comparitor, lod, lod2, grad_components,
> -                sample_index, offset_value, mcs,
> -                gather_component, is_cube_array, is_rect, sampler,
> -                sampler_reg, texunit);
> -}
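
For reference, the surface-bound bookkeeping above reduces to one line of
arithmetic.  A minimal standalone sketch (the *_start values below are
hypothetical stand-ins for the stage_prog_data->binding_table fields, not the
real driver layout):

    #include <cstdint>

    /* Hypothetical binding-table base offsets, for illustration only. */
    static const uint32_t texture_start = 8;
    static const uint32_t gather_texture_start = 24;

    /* Highest binding-table entry a dynamically indexed sampler array of
     * array_size elements starting at 'sampler' may touch.  Pre-gen8 tg4
     * goes through the separate gather binding table.
     */
    static uint32_t max_surface_used(uint32_t sampler, uint32_t array_size,
                                     bool is_tg4, int gen)
    {
       const uint32_t base = (is_tg4 && gen < 8) ? gather_texture_start
                                                 : texture_start;
       return sampler + array_size - 1 + base;
    }
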
> -
> -/**
> - * Apply workarounds for Gen6 gather with UINT/SINT
> - */
> -void
> -fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
> -{
> -   if (!wa)
> -      return;
> -
> -   int width = (wa & WA_8BIT) ? 8 : 16;
> -
> -   for (int i = 0; i < 4; i++) {
> -      fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
> -      /* Convert from UNORM to UINT */
> -      emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
> -      emit(MOV(dst, dst_f));
> -
> -      if (wa & WA_SIGN) {
> -         /* Reinterpret the UINT value as a signed INT value by
> -          * shifting the sign bit into place, then shifting back
> -          * preserving sign.
> -          */
> -         emit(SHL(dst, dst, fs_reg(32 - width)));
> -         emit(ASR(dst, dst, fs_reg(32 - width)));
> -      }
> -
> -      dst = offset(dst, 1);
> -   }
> -}
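
The WA_SIGN half of this workaround is the classic shift-based sign
extension.  A sketch of just the arithmetic in plain C++ (assuming an
arithmetic right shift, which is what the EU's ASR provides):

    #include <cstdint>
    #include <cassert>

    /* Reinterpret the low 'width' bits of v as signed by shifting the sign
     * bit up to bit 31 (SHL) and arithmetic-shifting it back down (ASR).
     */
    static int32_t sign_extend(uint32_t v, int width)
    {
       int32_t d = (int32_t)(v << (32 - width));
       return d >> (32 - width);
    }

    int main()
    {
       assert(sign_extend(0xff, 8) == -1);
       assert(sign_extend(0x7f, 8) == 127);
       assert(sign_extend(0x8000, 16) == -32768);
       return 0;
    }
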
> -
> -/**
> - * Set up the gather channel based on the swizzle, for gather4.
> - */
> -uint32_t
> -fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
> -{
> -   int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
> -   switch (swiz) {
> -      case SWIZZLE_X: return 0;
> -      case SWIZZLE_Y:
> -         /* gather4 sampler is broken for green channel on RG32F --
> -          * we must ask for blue instead.
> -          */
> -         if (key_tex->gather_channel_quirk_mask & (1 << sampler))
> -            return 2;
> -         return 1;
> -      case SWIZZLE_Z: return 2;
> -      case SWIZZLE_W: return 3;
> -      default:
> -         unreachable("Not reached"); /* zero, one swizzles handled already */
> -   }
> -}
> -
> -/**
> - * Swizzle the result of a texture lookup.  This is necessary for
> - * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
> - */
> -void
> -fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
> -                           fs_reg orig_val, uint32_t sampler)
> -{
> -   if (op == ir_query_levels) {
> -      /* # levels is in .w */
> -      this->result = offset(orig_val, 3);
> -      return;
> -   }
> -
> -   this->result = orig_val;
> -
> -   /* txs, lod don't actually sample the texture, so swizzling the result
> -    * makes no sense.
> -    */
> -   if (op == ir_txs || op == ir_lod || op == ir_tg4)
> -      return;
> -
> -   if (dest_components == 1) {
> -      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
> -   } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
> -      fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
> -      swizzled_result.type = orig_val.type;
> -
> -      for (int i = 0; i < 4; i++) {
> -        int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
> -        fs_reg l = swizzled_result;
> -        l = offset(l, i);
> -
> -        if (swiz == SWIZZLE_ZERO) {
> -           emit(MOV(l, fs_reg(0.0f)));
> -        } else if (swiz == SWIZZLE_ONE) {
> -           emit(MOV(l, fs_reg(1.0f)));
> -        } else {
> -            emit(MOV(l, offset(orig_val,
> -                               GET_SWZ(key_tex->swizzles[sampler], i))));
> -        }
> -      }
> -      this->result = swizzled_result;
> -   }
> -}
> -
> -void
> -fs_visitor::visit(ir_swizzle *ir)
> -{
> -   ir->val->accept(this);
> -   fs_reg val = this->result;
> -
> -   if (ir->type->vector_elements == 1) {
> -      this->result = offset(this->result, ir->mask.x);
> -      return;
> -   }
> -
> -   fs_reg result = vgrf(ir->type);
> -   this->result = result;
> -
> -   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
> -      fs_reg channel = val;
> -      int swiz = 0;
> -
> -      switch (i) {
> -      case 0:
> -        swiz = ir->mask.x;
> -        break;
> -      case 1:
> -        swiz = ir->mask.y;
> -        break;
> -      case 2:
> -        swiz = ir->mask.z;
> -        break;
> -      case 3:
> -        swiz = ir->mask.w;
> -        break;
> -      }
> -
> -      emit(MOV(result, offset(channel, swiz)));
> -      result = offset(result, 1);
> -   }
> -}
> -
> -void
> -fs_visitor::visit(ir_discard *ir)
> -{
> -   /* We track our discarded pixels in f0.1.  By predicating on it, we can
> -    * update just the flag bits that aren't yet discarded.  If there's no
> -    * condition, we emit a CMP of g0 != g0, so all currently executing
> -    * channels will get turned off.
> -    */
> -   fs_inst *cmp;
> -   if (ir->condition) {
> -      emit_bool_to_cond_code(ir->condition);
> -      cmp = (fs_inst *) this->instructions.get_tail();
> -      cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
> -   } else {
> -      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
> -                                      BRW_REGISTER_TYPE_UW));
> -      cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
> -   }
> -   cmp->predicate = BRW_PREDICATE_NORMAL;
> -   cmp->flag_subreg = 1;
> -
> -   if (brw->gen >= 6) {
> -      emit_discard_jump();
> -   }
> -}
> -
> -void
> -fs_visitor::visit(ir_constant *ir)
> -{
> -   /* Set this->result to reg at the bottom of the function because some code
> -    * paths will cause this visitor to be applied to other fields.  This will
> -    * cause the value stored in this->result to be modified.
> -    *
> -    * Make reg constant so that it doesn't get accidentally modified along the
> -    * way.  Yes, I actually had this problem. :(
> -    */
> -   const fs_reg reg = vgrf(ir->type);
> -   fs_reg dst_reg = reg;
> -
> -   if (ir->type->is_array()) {
> -      const unsigned size = type_size(ir->type->fields.array);
> -
> -      for (unsigned i = 0; i < ir->type->length; i++) {
> -        ir->array_elements[i]->accept(this);
> -        fs_reg src_reg = this->result;
> -
> -        dst_reg.type = src_reg.type;
> -        for (unsigned j = 0; j < size; j++) {
> -           emit(MOV(dst_reg, src_reg));
> -           src_reg = offset(src_reg, 1);
> -           dst_reg = offset(dst_reg, 1);
> -        }
> -      }
> -   } else if (ir->type->is_record()) {
> -      foreach_in_list(ir_constant, field, &ir->components) {
> -        const unsigned size = type_size(field->type);
> -
> -        field->accept(this);
> -        fs_reg src_reg = this->result;
> -
> -        dst_reg.type = src_reg.type;
> -        for (unsigned j = 0; j < size; j++) {
> -           emit(MOV(dst_reg, src_reg));
> -           src_reg = offset(src_reg, 1);
> -           dst_reg = offset(dst_reg, 1);
> -        }
> -      }
> -   } else {
> -      const unsigned size = type_size(ir->type);
> -
> -      for (unsigned i = 0; i < size; i++) {
> -        switch (ir->type->base_type) {
> -        case GLSL_TYPE_FLOAT:
> -           emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
> -           break;
> -        case GLSL_TYPE_UINT:
> -           emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
> -           break;
> -        case GLSL_TYPE_INT:
> -           emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
> -           break;
> -        case GLSL_TYPE_BOOL:
> -            emit(MOV(dst_reg,
> -                     fs_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
> -                                                : 0)));
> -           break;
> -        default:
> -           unreachable("Non-float/uint/int/bool constant");
> -        }
> -        dst_reg = offset(dst_reg, 1);
> -      }
> -   }
> -
> -   this->result = reg;
> -}
> -
> -void
> -fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
> -{
> -   ir_expression *expr = ir->as_expression();
> -
> -   if (!expr || expr->operation == ir_binop_ubo_load) {
> -      ir->accept(this);
> -
> -      fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
> -      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -      return;
> -   }
> -
> -   fs_reg op[3];
> -
> -   assert(expr->get_num_operands() <= 3);
> -   for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> -      assert(expr->operands[i]->type->is_scalar());
> -
> -      expr->operands[i]->accept(this);
> -      op[i] = this->result;
> -
> -      resolve_ud_negate(&op[i]);
> -   }
> -
> -   emit_bool_to_cond_code_of_reg(expr, op);
> -}
> -
> -void
> -fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
> -{
> -   fs_inst *inst;
> -
> -   switch (expr->operation) {
> -   case ir_unop_logic_not:
> -      inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
> -      inst->conditional_mod = BRW_CONDITIONAL_Z;
> -      break;
> -
> -   case ir_binop_logic_xor:
> -      if (brw->gen <= 5) {
> -         fs_reg temp = vgrf(expr->type);
> -         emit(XOR(temp, op[0], op[1]));
> -         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> -      } else {
> -         inst = emit(XOR(reg_null_d, op[0], op[1]));
> -      }
> -      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -      break;
> -
> -   case ir_binop_logic_or:
> -      if (brw->gen <= 5) {
> -         fs_reg temp = vgrf(expr->type);
> -         emit(OR(temp, op[0], op[1]));
> -         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> -      } else {
> -         inst = emit(OR(reg_null_d, op[0], op[1]));
> -      }
> -      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -      break;
> -
> -   case ir_binop_logic_and:
> -      if (brw->gen <= 5) {
> -         fs_reg temp = vgrf(expr->type);
> -         emit(AND(temp, op[0], op[1]));
> -         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> -      } else {
> -         inst = emit(AND(reg_null_d, op[0], op[1]));
> -      }
> -      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -      break;
> -
> -   case ir_unop_f2b:
> -      if (brw->gen >= 6) {
> -         emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
> -      } else {
> -         inst = emit(MOV(reg_null_f, op[0]));
> -         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -      }
> -      break;
> -
> -   case ir_unop_i2b:
> -      if (brw->gen >= 6) {
> -         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> -      } else {
> -         inst = emit(MOV(reg_null_d, op[0]));
> -         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -      }
> -      break;
> -
> -   case ir_binop_greater:
> -   case ir_binop_gequal:
> -   case ir_binop_less:
> -   case ir_binop_lequal:
> -   case ir_binop_equal:
> -   case ir_binop_all_equal:
> -   case ir_binop_nequal:
> -   case ir_binop_any_nequal:
> -      if (brw->gen <= 5) {
> -         resolve_bool_comparison(expr->operands[0], &op[0]);
> -         resolve_bool_comparison(expr->operands[1], &op[1]);
> -      }
> -
> -      emit(CMP(reg_null_d, op[0], op[1],
> -               brw_conditional_for_comparison(expr->operation)));
> -      break;
> -
> -   case ir_triop_csel: {
> -      /* Expand the boolean condition into the flag register. */
> -      inst = emit(MOV(reg_null_d, op[0]));
> -      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -
> -      /* Select which boolean to return. */
> -      fs_reg temp = vgrf(expr->operands[1]->type);
> -      inst = emit(SEL(temp, op[1], op[2]));
> -      inst->predicate = BRW_PREDICATE_NORMAL;
> -
> -      /* Expand the result to a condition code. */
> -      inst = emit(MOV(reg_null_d, temp));
> -      inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -      break;
> -   }
> -
> -   default:
> -      unreachable("not reached");
> -   }
> -}
> -
> -/**
> - * Emit a gen6 IF statement with the comparison folded into the IF
> - * instruction.
> - */
> -void
> -fs_visitor::emit_if_gen6(ir_if *ir)
> -{
> -   ir_expression *expr = ir->condition->as_expression();
> -
> -   if (expr && expr->operation != ir_binop_ubo_load) {
> -      fs_reg op[3];
> -      fs_inst *inst;
> -      fs_reg temp;
> -
> -      assert(expr->get_num_operands() <= 3);
> -      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> -        assert(expr->operands[i]->type->is_scalar());
> -
> -        expr->operands[i]->accept(this);
> -        op[i] = this->result;
> -      }
> -
> -      switch (expr->operation) {
> -      case ir_unop_logic_not:
> -         emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
> -         return;
> -
> -      case ir_binop_logic_xor:
> -         emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
> -         return;
> -
> -      case ir_binop_logic_or:
> -         temp = vgrf(glsl_type::bool_type);
> -         emit(OR(temp, op[0], op[1]));
> -         emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> -         return;
> -
> -      case ir_binop_logic_and:
> -         temp = vgrf(glsl_type::bool_type);
> -         emit(AND(temp, op[0], op[1]));
> -         emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> -         return;
> -
> -      case ir_unop_f2b:
> -        inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
> -        inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -        return;
> -
> -      case ir_unop_i2b:
> -        emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
> -        return;
> -
> -      case ir_binop_greater:
> -      case ir_binop_gequal:
> -      case ir_binop_less:
> -      case ir_binop_lequal:
> -      case ir_binop_equal:
> -      case ir_binop_all_equal:
> -      case ir_binop_nequal:
> -      case ir_binop_any_nequal:
> -         if (brw->gen <= 5) {
> -            resolve_bool_comparison(expr->operands[0], &op[0]);
> -            resolve_bool_comparison(expr->operands[1], &op[1]);
> -         }
> -
> -        emit(IF(op[0], op[1],
> -                 brw_conditional_for_comparison(expr->operation)));
> -        return;
> -
> -      case ir_triop_csel: {
> -         /* Expand the boolean condition into the flag register. */
> -         fs_inst *inst = emit(MOV(reg_null_d, op[0]));
> -         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -
> -         /* Select which boolean to use as the result. */
> -         fs_reg temp = vgrf(expr->operands[1]->type);
> -         inst = emit(SEL(temp, op[1], op[2]));
> -         inst->predicate = BRW_PREDICATE_NORMAL;
> -
> -        emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
> -        return;
> -      }
> -
> -      default:
> -        unreachable("not reached");
> -      }
> -   }
> -
> -   ir->condition->accept(this);
> -   emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
> -}
> -
> -bool
> -fs_visitor::try_opt_frontfacing_ternary(ir_if *ir)
> -{
> -   ir_dereference_variable *deref = ir->condition->as_dereference_variable();
> -   if (!deref || strcmp(deref->var->name, "gl_FrontFacing") != 0)
> -      return false;
> -
> -   if (ir->then_instructions.length() != 1 ||
> -       ir->else_instructions.length() != 1)
> -      return false;
> -
> -   ir_assignment *then_assign =
> -         ((ir_instruction *)ir->then_instructions.head)->as_assignment();
> -   ir_assignment *else_assign =
> -         ((ir_instruction *)ir->else_instructions.head)->as_assignment();
> -
> -   if (!then_assign || then_assign->condition ||
> -       !else_assign || else_assign->condition ||
> -       then_assign->write_mask != else_assign->write_mask ||
> -       !then_assign->lhs->equals(else_assign->lhs))
> -      return false;
> -
> -   ir_constant *then_rhs = then_assign->rhs->as_constant();
> -   ir_constant *else_rhs = else_assign->rhs->as_constant();
> -
> -   if (!then_rhs || !else_rhs)
> -      return false;
> -
> -   if (then_rhs->type->base_type != GLSL_TYPE_FLOAT)
> -      return false;
> -
> -   if ((then_rhs->is_one() && else_rhs->is_negative_one()) ||
> -       (else_rhs->is_one() && then_rhs->is_negative_one())) {
> -      then_assign->lhs->accept(this);
> -      fs_reg dst = this->result;
> -      dst.type = BRW_REGISTER_TYPE_D;
> -      fs_reg tmp = vgrf(glsl_type::int_type);
> -
> -      if (brw->gen >= 6) {
> -         /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
> -         fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
> -
> -         /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
> -          *
> -          *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
> -          *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
> -          *
> -          * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
> -          */
> -
> -         if (then_rhs->is_negative_one()) {
> -            assert(else_rhs->is_one());
> -            g0.negate = true;
> -         }
> -
> -         tmp.type = BRW_REGISTER_TYPE_W;
> -         tmp.subreg_offset = 2;
> -         tmp.stride = 2;
> -
> -         fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
> -         or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
> -
> -         tmp.type = BRW_REGISTER_TYPE_D;
> -         tmp.subreg_offset = 0;
> -         tmp.stride = 1;
> -      } else {
> -         /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
> -         fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
> -
> -         /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
> -          *
> -          *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
> -          *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
> -          *
> -          * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
> -          */
> -
> -         if (then_rhs->is_negative_one()) {
> -            assert(else_rhs->is_one());
> -            g1_6.negate = true;
> -         }
> -
> -         emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
> -      }
> -      emit(AND(dst, tmp, fs_reg(0xbf800000)));
> -      return true;
> -   }
> -
> -   return false;
> -}
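
The gen6+ bit trick above is easy to sanity-check on the CPU: OR the
front-facing bit (bit 15 of the payload word, which ends up in bit 31 of the
destination dword) into the half-word 0x3f80, then mask with 0xbf800000, and
the only possible results are the IEEE-754 encodings of +1.0f and -1.0f.  A
standalone sketch of the arithmetic, not driver code:

    #include <cstdint>
    #include <cassert>

    /* Bit pattern of (gl_FrontFacing ? 1.0f : -1.0f), given a gen6-style
     * payload word with bit 15 set for back-facing polygons.
     */
    static uint32_t frontfacing_select(uint16_t g0)
    {
       uint32_t tmp = (uint32_t)(g0 | 0x3f80) << 16;  /* or(8)  tmp.1<2>W */
       return tmp & 0xbf800000;                       /* and(8) dst<1>D   */
    }

    int main()
    {
       assert(frontfacing_select(0x0000) == 0x3f800000u);  /* +1.0f */
       assert(frontfacing_select(0x8000) == 0xbf800000u);  /* -1.0f */
       assert(frontfacing_select(0x1234) == 0x3f800000u);  /* junk bits masked */
       return 0;
    }
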
> -
> -/**
> - * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
> - *
> - * Many GLSL shaders contain the following pattern:
> - *
> - *    x = condition ? foo : bar
> - *
> - * The compiler emits an ir_if tree for this, since each subexpression might be
> - * a complex tree that could have side-effects or short-circuit logic.
> - *
> - * However, the common case is to simply select one of two constants or
> - * variable values---which is exactly what SEL is for.  In this case, the
> - * assembly looks like:
> - *
> - *    (+f0) IF
> - *    MOV dst src0
> - *    ELSE
> - *    MOV dst src1
> - *    ENDIF
> - *
> - * which can be easily translated into:
> - *
> - *    (+f0) SEL dst src0 src1
> - *
> - * If src0 is an immediate value, we promote it to a temporary GRF.
> - */
> -bool
> -fs_visitor::try_replace_with_sel()
> -{
> -   fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
> -   assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
> -
> -   /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
> -   int opcodes[] = {
> -      BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
> -   };
> -
> -   fs_inst *match = (fs_inst *) endif_inst->prev;
> -   for (int i = 0; i < 4; i++) {
> -      if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
> -         return false;
> -      match = (fs_inst *) match->prev;
> -   }
> -
> -   /* The opcodes match; it looks like the right sequence of instructions. */
> -   fs_inst *else_mov = (fs_inst *) endif_inst->prev;
> -   fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
> -   fs_inst *if_inst = (fs_inst *) then_mov->prev;
> -
> -   /* Check that the MOVs are the right form. */
> -   if (then_mov->dst.equals(else_mov->dst) &&
> -       !then_mov->is_partial_write() &&
> -       !else_mov->is_partial_write()) {
> -
> -      /* Remove the matched instructions; we'll emit a SEL to replace them. */
> -      while (!if_inst->next->is_tail_sentinel())
> -         if_inst->next->exec_node::remove();
> -      if_inst->exec_node::remove();
> -
> -      /* Only the last source register can be a constant, so if the MOV in
> -       * the "then" clause uses a constant, we need to put it in a temporary.
> -       */
> -      fs_reg src0(then_mov->src[0]);
> -      if (src0.file == IMM) {
> -         src0 = vgrf(glsl_type::float_type);
> -         src0.type = then_mov->src[0].type;
> -         emit(MOV(src0, then_mov->src[0]));
> -      }
> -
> -      fs_inst *sel;
> -      if (if_inst->conditional_mod) {
> -         /* Sandybridge-specific IF with embedded comparison */
> -         emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
> -                  if_inst->conditional_mod));
> -         sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
> -         sel->predicate = BRW_PREDICATE_NORMAL;
> -      } else {
> -         /* Separate CMP and IF instructions */
> -         sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
> -         sel->predicate = if_inst->predicate;
> -         sel->predicate_inverse = if_inst->predicate_inverse;
> -      }
> -
> -      return true;
> -   }
> -
> -   return false;
> -}
> -
> -void
> -fs_visitor::visit(ir_if *ir)
> -{
> -   if (try_opt_frontfacing_ternary(ir))
> -      return;
> -
> -   /* Don't point the annotation at the if statement, because then it plus
> -    * the then and else blocks get printed.
> -    */
> -   this->base_ir = ir->condition;
> -
> -   if (brw->gen == 6) {
> -      emit_if_gen6(ir);
> -   } else {
> -      emit_bool_to_cond_code(ir->condition);
> -
> -      emit(IF(BRW_PREDICATE_NORMAL));
> -   }
> -
> -   foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
> -      this->base_ir = ir_;
> -      ir_->accept(this);
> -   }
> -
> -   if (!ir->else_instructions.is_empty()) {
> -      emit(BRW_OPCODE_ELSE);
> -
> -      foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
> -        this->base_ir = ir_;
> -        ir_->accept(this);
> -      }
> -   }
> -
> -   emit(BRW_OPCODE_ENDIF);
> -
> -   if (!try_replace_with_sel() && brw->gen < 6) {
> -      no16("Can't support (non-uniform) control flow on SIMD16\n");
> -   }
> -}
> -
> -void
> -fs_visitor::visit(ir_loop *ir)
> -{
> -   if (brw->gen < 6) {
> -      no16("Can't support (non-uniform) control flow on SIMD16\n");
> -   }
> -
> -   this->base_ir = NULL;
> -   emit(BRW_OPCODE_DO);
> -
> -   foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
> -      this->base_ir = ir_;
> -      ir_->accept(this);
> -   }
> -
> -   this->base_ir = NULL;
> -   emit(BRW_OPCODE_WHILE);
> -}
> -
> -void
> -fs_visitor::visit(ir_loop_jump *ir)
> -{
> -   switch (ir->mode) {
> -   case ir_loop_jump::jump_break:
> -      emit(BRW_OPCODE_BREAK);
> -      break;
> -   case ir_loop_jump::jump_continue:
> -      emit(BRW_OPCODE_CONTINUE);
> -      break;
> -   }
> -}
> -
> -void
> -fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
> -{
> -   ir_dereference *deref = static_cast<ir_dereference *>(
> -      ir->actual_parameters.get_head());
> -   ir_variable *location = deref->variable_referenced();
> -   unsigned surf_index = (stage_prog_data->binding_table.abo_start +
> -                          location->data.binding);
> -
> -   /* Calculate the surface offset */
> -   fs_reg offset = vgrf(glsl_type::uint_type);
> -   ir_dereference_array *deref_array = deref->as_dereference_array();
> -
> -   if (deref_array) {
> -      deref_array->array_index->accept(this);
> -
> -      fs_reg tmp = vgrf(glsl_type::uint_type);
> -      emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
> -      emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
> -   } else {
> -      offset = fs_reg(location->data.atomic.offset);
> -   }
> -
> -   /* Emit the appropriate machine instruction */
> -   const char *callee = ir->callee->function_name();
> -   ir->return_deref->accept(this);
> -   fs_reg dst = this->result;
> -
> -   if (!strcmp("__intrinsic_atomic_read", callee)) {
> -      emit_untyped_surface_read(surf_index, dst, offset);
> -
> -   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
> -      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
> -                          fs_reg(), fs_reg());
> -
> -   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
> -      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
> -                          fs_reg(), fs_reg());
> -   }
> -}
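
The MUL+ADD pair above computes a plain byte offset into the atomic counter
buffer.  A sketch of the same calculation, assuming ATOMIC_COUNTER_SIZE is 4
bytes (one GLuint per counter):

    /* Byte offset of counter 'array_index' within its buffer, mirroring the
     * MUL + ADD emitted above for the dereference-array case.
     */
    static unsigned atomic_counter_offset(unsigned array_index,
                                          unsigned atomic_offset)
    {
       return array_index * 4 + atomic_offset;  /* index * size + base offset */
    }
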
> -
> -void
> -fs_visitor::visit(ir_call *ir)
> -{
> -   const char *callee = ir->callee->function_name();
> -
> -   if (!strcmp("__intrinsic_atomic_read", callee) ||
> -       !strcmp("__intrinsic_atomic_increment", callee) ||
> -       !strcmp("__intrinsic_atomic_predecrement", callee)) {
> -      visit_atomic_counter_intrinsic(ir);
> -   } else {
> -      unreachable("Unsupported intrinsic.");
> -   }
> -}
> -
> -void
> -fs_visitor::visit(ir_return *)
> -{
> -   unreachable("FINISHME");
> -}
> -
> -void
> -fs_visitor::visit(ir_function *ir)
> -{
> -   /* Ignore function bodies other than main() -- we shouldn't see calls to
> -    * them since they should all be inlined before we get to ir_to_mesa.
> -    */
> -   if (strcmp(ir->name, "main") == 0) {
> -      const ir_function_signature *sig;
> -      exec_list empty;
> -
> -      sig = ir->matching_signature(NULL, &empty, false);
> -
> -      assert(sig);
> -
> -      foreach_in_list(ir_instruction, ir_, &sig->body) {
> -        this->base_ir = ir_;
> -        ir_->accept(this);
> -      }
> -   }
> -}
> -
> -void
> -fs_visitor::visit(ir_function_signature *)
> -{
> -   unreachable("not reached");
> -}
> -
> -void
> -fs_visitor::visit(ir_emit_vertex *)
> -{
> -   unreachable("not reached");
> -}
> -
> -void
> -fs_visitor::visit(ir_end_primitive *)
> -{
> -   unreachable("not reached");
> -}
> -
> -void
> -fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
> -                                fs_reg dst, fs_reg offset, fs_reg src0,
> -                                fs_reg src1)
> -{
> -   int reg_width = dispatch_width / 8;
> -   int length = 0;
> -
> -   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
> -
> -   sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> -   /* Initialize the sample mask in the message header. */
> -   emit(MOV(sources[0], fs_reg(0u)))
> -      ->force_writemask_all = true;
> -
> -   if (stage == MESA_SHADER_FRAGMENT) {
> -      if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
> -         emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
> -            ->force_writemask_all = true;
> -      } else {
> -         emit(MOV(component(sources[0], 7),
> -                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
> -            ->force_writemask_all = true;
> -      }
> -   } else {
> -      /* The execution mask is part of the side-band information sent together with
> -       * the message payload to the data port. It's implicitly ANDed with the sample
> -       * mask sent in the header to compute the actual set of channels that execute
> -       * the atomic operation.
> -       */
> -      assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
> -      emit(MOV(component(sources[0], 7),
> -               fs_reg(0xffffu)))->force_writemask_all = true;
> -   }
> -   length++;
> -
> -   /* Set the atomic operation offset. */
> -   sources[1] = vgrf(glsl_type::uint_type);
> -   emit(MOV(sources[1], offset));
> -   length++;
> -
> -   /* Set the atomic operation arguments. */
> -   if (src0.file != BAD_FILE) {
> -      sources[length] = vgrf(glsl_type::uint_type);
> -      emit(MOV(sources[length], src0));
> -      length++;
> -   }
> -
> -   if (src1.file != BAD_FILE) {
> -      sources[length] = vgrf(glsl_type::uint_type);
> -      emit(MOV(sources[length], src1));
> -      length++;
> -   }
> -
> -   int mlen = 1 + (length - 1) * reg_width;
> -   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> -                               BRW_REGISTER_TYPE_UD);
> -   emit(LOAD_PAYLOAD(src_payload, sources, length));
> -
> -   /* Emit the instruction. */
> -   fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
> -                        fs_reg(atomic_op), fs_reg(surf_index));
> -   inst->mlen = mlen;
> -}
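
The message-length arithmetic deserves a spelled-out form: the header always
occupies exactly one register, while each of the remaining payload sources
takes dispatch_width/8 registers.  A sketch of just that calculation:

    /* Registers occupied by an untyped-atomic message with 'length' payload
     * sources (header included) at the given dispatch width.
     */
    static int untyped_atomic_mlen(int length, int dispatch_width)
    {
       const int reg_width = dispatch_width / 8;
       return 1 + (length - 1) * reg_width;
    }

    /* SIMD8 atomic increment (header + offset):      1 + 1*1 = 2 registers.
     * SIMD16 three-source atomic (header + 3 srcs):  1 + 3*2 = 7 registers.
     */
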
> -
> -void
> -fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
> -                                      fs_reg offset)
> -{
> -   int reg_width = dispatch_width / 8;
> -
> -   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
> -
> -   sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> -   /* Initialize the sample mask in the message header. */
> -   emit(MOV(sources[0], fs_reg(0u)))
> -      ->force_writemask_all = true;
> -
> -   if (stage == MESA_SHADER_FRAGMENT) {
> -      if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
> -         emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
> -            ->force_writemask_all = true;
> -      } else {
> -         emit(MOV(component(sources[0], 7),
> -                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
> -            ->force_writemask_all = true;
> -      }
> -   } else {
> -      /* The execution mask is part of the side-band information sent together with
> -       * the message payload to the data port. It's implicitly ANDed with the sample
> -       * mask sent in the header to compute the actual set of channels that execute
> -       * the surface operation.
> -       */
> -      assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
> -      emit(MOV(component(sources[0], 7),
> -               fs_reg(0xffffu)))->force_writemask_all = true;
> -   }
> -
> -   /* Set the surface read offset. */
> -   sources[1] = vgrf(glsl_type::uint_type);
> -   emit(MOV(sources[1], offset));
> -
> -   int mlen = 1 + reg_width;
> -   fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
> -                               BRW_REGISTER_TYPE_UD);
> -   fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2));
> -
> -   /* Emit the instruction. */
> -   inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
> -               fs_reg(surf_index));
> -   inst->mlen = mlen;
> -}
> -
> -fs_inst *
> -fs_visitor::emit(fs_inst *inst)
> -{
> -   if (dispatch_width == 16 && inst->exec_size == 8)
> -      inst->force_uncompressed = true;
> -
> -   inst->annotation = this->current_annotation;
> -   inst->ir = this->base_ir;
> -
> -   this->instructions.push_tail(inst);
> -
> -   return inst;
> -}
> -
> -void
> -fs_visitor::emit(exec_list list)
> -{
> -   foreach_in_list_safe(fs_inst, inst, &list) {
> -      inst->exec_node::remove();
> -      emit(inst);
> -   }
> -}
> -
> -/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
> -void
> -fs_visitor::emit_dummy_fs()
> -{
> -   int reg_width = dispatch_width / 8;
> -
> -   /* Everyone's favorite color. */
> -   const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
> -   for (int i = 0; i < 4; i++) {
> -      emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
> -                      dispatch_width), fs_reg(color[i])));
> -   }
> -
> -   fs_inst *write;
> -   write = emit(FS_OPCODE_FB_WRITE);
> -   write->eot = true;
> -   if (brw->gen >= 6) {
> -      write->base_mrf = 2;
> -      write->mlen = 4 * reg_width;
> -   } else {
> -      write->header_present = true;
> -      write->base_mrf = 0;
> -      write->mlen = 2 + 4 * reg_width;
> -   }
> -
> -   /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
> -    * varying to avoid GPU hangs, so set that.
> -    */
> -   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
> -   wm_prog_data->num_varying_inputs = brw->gen < 6 ? 1 : 0;
> -   memset(wm_prog_data->urb_setup, -1,
> -          sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
> -
> -   /* We don't have any uniforms. */
> -   stage_prog_data->nr_params = 0;
> -   stage_prog_data->nr_pull_params = 0;
> -   stage_prog_data->curb_read_length = 0;
> -   stage_prog_data->dispatch_grf_start_reg = 2;
> -   wm_prog_data->dispatch_grf_start_reg_16 = 2;
> -   grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
> -
> -   calculate_cfg();
> -}
> -
> -/* The register location here is relative to the start of the URB
> - * data.  It will get adjusted to be a real location before
> - * generate_code() time.
> - */
> -struct brw_reg
> -fs_visitor::interp_reg(int location, int channel)
> -{
> -   assert(stage == MESA_SHADER_FRAGMENT);
> -   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> -   int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
> -   int stride = (channel & 1) * 4;
> -
> -   assert(prog_data->urb_setup[location] != -1);
> -
> -   return brw_vec1_grf(regnr, stride);
> -}
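
A worked example of the addressing above may help: every varying slot takes
two setup registers, with channels x/y in the first and z/w in the second,
each at sub-register offset 0 or 4 floats.  A hypothetical helper mirroring
the same arithmetic:

    /* Relative register number and sub-register offset (in floats) of one
     * channel's setup data; 'slot' stands for prog_data->urb_setup[location].
     */
    static void interp_location(int slot, int channel, int *regnr, int *subnr)
    {
       *regnr = slot * 2 + channel / 2;  /* two GRFs per varying slot */
       *subnr = (channel & 1) * 4;       /* x/z at 0, y/w at 4 floats */
    }

    /* Example: slot 3, channel 3 (w) -> regnr = 3*2 + 1 = 7, subnr = 4. */
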
> -
> -/** Emits the interpolation for the varying inputs. */
> -void
> -fs_visitor::emit_interpolation_setup_gen4()
> -{
> -   this->current_annotation = "compute pixel centers";
> -   this->pixel_x = vgrf(glsl_type::uint_type);
> -   this->pixel_y = vgrf(glsl_type::uint_type);
> -   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
> -   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
> -
> -   emit(FS_OPCODE_PIXEL_X, this->pixel_x);
> -   emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
> -
> -   this->current_annotation = "compute pixel deltas from v0";
> -   if (brw->has_pln) {
> -      this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> -         vgrf(glsl_type::vec2_type);
> -      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> -         offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1);
> -   } else {
> -      this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> -         vgrf(glsl_type::float_type);
> -      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
> -         vgrf(glsl_type::float_type);
> -   }
> -   emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> -            this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
> -   emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> -            this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
> -
> -   this->current_annotation = "compute pos.w and 1/pos.w";
> -   /* Compute wpos.w.  It's always in our setup, since it's needed to
> -    * interpolate the other attributes.
> -    */
> -   this->wpos_w = vgrf(glsl_type::float_type);
> -   emit(FS_OPCODE_LINTERP, wpos_w,
> -        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> -        this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> -       interp_reg(VARYING_SLOT_POS, 3));
> -   /* Compute the pixel 1/W value from wpos.w. */
> -   this->pixel_w = vgrf(glsl_type::float_type);
> -   emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
> -   this->current_annotation = NULL;
> -}
> -
> -/** Emits the interpolation for the varying inputs. */
> -void
> -fs_visitor::emit_interpolation_setup_gen6()
> -{
> -   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
> -
> -   /* If the pixel centers end up used, the setup is the same as for gen4. */
> -   this->current_annotation = "compute pixel centers";
> -   fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
> -   fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
> -   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
> -   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
> -   emit(ADD(int_pixel_x,
> -            fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
> -            fs_reg(brw_imm_v(0x10101010))));
> -   emit(ADD(int_pixel_y,
> -            fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
> -            fs_reg(brw_imm_v(0x11001100))));
> -
> -   /* As of gen6, we can no longer mix float and int sources.  We have
> -    * to turn the integer pixel centers into floats for their actual
> -    * use.
> -    */
> -   this->pixel_x = vgrf(glsl_type::float_type);
> -   this->pixel_y = vgrf(glsl_type::float_type);
> -   emit(MOV(this->pixel_x, int_pixel_x));
> -   emit(MOV(this->pixel_y, int_pixel_y));
> -
> -   this->current_annotation = "compute pos.w";
> -   this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
> -   this->wpos_w = vgrf(glsl_type::float_type);
> -   emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
> -
> -   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
> -      uint8_t reg = payload.barycentric_coord_reg[i];
> -      this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
> -      this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
> -   }
> -
> -   this->current_annotation = NULL;
> -}
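
The brw_imm_v() arguments above are packed vectors of eight 4-bit immediates,
least-significant nibble first; decoded, they are exactly the per-pixel X/Y
offsets within the two 2x2 subspans.  A sketch of the decoding (assuming the
nibble packing described in the PRM for vector immediates):

    #include <cstdio>

    /* Unpack an 8-element vector immediate, 4 bits per element, low nibble
     * first, the way the EU expands brw_imm_v().
     */
    static void unpack_imm_v(unsigned v, int out[8])
    {
       for (int i = 0; i < 8; i++)
          out[i] = (v >> (i * 4)) & 0xf;  /* all elements here are >= 0 */
    }

    int main()
    {
       int x[8], y[8];
       unpack_imm_v(0x10101010, x);  /* X offsets: 0 1 0 1 0 1 0 1 */
       unpack_imm_v(0x11001100, y);  /* Y offsets: 0 0 1 1 0 0 1 1 */
       for (int i = 0; i < 8; i++)
          printf("pixel %d: (+%d, +%d)\n", i, x[i], y[i]);
       return 0;
    }
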
> -
> -int
> -fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
> -                                bool use_2nd_half)
> -{
> -   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> -   fs_inst *inst;
> -
> -   if (color.file == BAD_FILE) {
> -      return 4 * (dispatch_width / 8);
> -   }
> -
> -   uint8_t colors_enabled;
> -   if (components == 0) {
> -      /* We want to write one component to the alpha channel */
> -      colors_enabled = 0x8;
> -   } else {
> -      /* Enable the first components-many channels */
> -      colors_enabled = (1 << components) - 1;
> -   }
> -
> -   if (dispatch_width == 8 || (brw->gen >= 6 && !do_dual_src)) {
> -      /* SIMD8 write looks like:
> -       * m + 0: r0
> -       * m + 1: r1
> -       * m + 2: g0
> -       * m + 3: g1
> -       *
> -       * gen6 SIMD16 DP write looks like:
> -       * m + 0: r0
> -       * m + 1: r1
> -       * m + 2: g0
> -       * m + 3: g1
> -       * m + 4: b0
> -       * m + 5: b1
> -       * m + 6: a0
> -       * m + 7: a1
> -       */
> -      int len = 0;
> -      for (unsigned i = 0; i < 4; ++i) {
> -         if (colors_enabled & (1 << i)) {
> -            dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8),
> -                              color.type, color.width);
> -            inst = emit(MOV(dst[len], offset(color, i)));
> -            inst->saturate = key->clamp_fragment_color;
> -         } else if (color.width == 16) {
> -            /* We need two BAD_FILE slots for a 16-wide color */
> -            len++;
> -         }
> -         len++;
> -      }
> -      return len;
> -   } else if (brw->gen >= 6 && do_dual_src) {
> -      /* SIMD16 dual source blending for gen6+.
> -       *
> -       * From the SNB PRM, volume 4, part 1, page 193:
> -       *
> -       * "The dual source render target messages only have SIMD8 forms due to
> -       *  maximum message length limitations. SIMD16 pixel shaders must send two
> -       *  of these messages to cover all of the pixels. Each message contains
> -       *  two colors (4 channels each) for each pixel in the message payload."
> -       *
> -       * So in SIMD16 dual source blending we will send 2 SIMD8 messages,
> -       * each one will call this function twice (one for each color involved),
> -       * so in each pass we only write 4 registers. Notice that the second
> -       * SIMD8 message needs to read color data from the 2nd half of the color
> -       * registers, so it needs to call this with use_2nd_half = true.
> -       */
> -      for (unsigned i = 0; i < 4; ++i) {
> -         if (colors_enabled & (1 << i)) {
> -            dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
> -            inst = emit(MOV(dst[i], half(offset(color, i),
> -                                         use_2nd_half ? 1 : 0)));
> -            inst->saturate = key->clamp_fragment_color;
> -            if (use_2nd_half)
> -               inst->force_sechalf = true;
> -         }
> -      }
> -      return 4;
> -   } else {
> -      /* pre-gen6 SIMD16 single source DP write looks like:
> -       * m + 0: r0
> -       * m + 1: g0
> -       * m + 2: b0
> -       * m + 3: a0
> -       * m + 4: r1
> -       * m + 5: g1
> -       * m + 6: b1
> -       * m + 7: a1
> -       */
> -      for (unsigned i = 0; i < 4; ++i) {
> -         if (colors_enabled & (1 << i)) {
> -            dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
> -            inst = emit(MOV(dst[i], half(offset(color, i), 0)));
> -            inst->saturate = key->clamp_fragment_color;
> -
> -            dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type);
> -            inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
> -            inst->saturate = key->clamp_fragment_color;
> -            inst->force_sechalf = true;
> -         }
> -      }
> -      return 8;
> -   }
> -}
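
The length bookkeeping in the SIMD8/gen6+ branch is subtle: a disabled
channel of a 16-wide color still consumes two payload slots.  A hypothetical
helper that counts slots the same way as the loop above:

    /* Payload slots consumed by the SIMD8 / gen6+ SIMD16 color write for a
     * given channel-enable mask and color width.
     */
    static int color_payload_len(unsigned colors_enabled, int color_width)
    {
       int len = 0;
       for (unsigned i = 0; i < 4; i++) {
          if (!(colors_enabled & (1u << i)) && color_width == 16)
             len++;   /* second BAD_FILE slot for a 16-wide color */
          len++;
       }
       return len;
    }
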
> -
> -static enum brw_conditional_mod
> -cond_for_alpha_func(GLenum func)
> -{
> -   switch(func) {
> -      case GL_GREATER:
> -         return BRW_CONDITIONAL_G;
> -      case GL_GEQUAL:
> -         return BRW_CONDITIONAL_GE;
> -      case GL_LESS:
> -         return BRW_CONDITIONAL_L;
> -      case GL_LEQUAL:
> -         return BRW_CONDITIONAL_LE;
> -      case GL_EQUAL:
> -         return BRW_CONDITIONAL_EQ;
> -      case GL_NOTEQUAL:
> -         return BRW_CONDITIONAL_NEQ;
> -      default:
> -         unreachable("Not reached");
> -   }
> -}
> -
> -/**
> - * Alpha test support for when we compile it into the shader instead
> - * of using the normal fixed-function alpha test.
> - */
> -void
> -fs_visitor::emit_alpha_test()
> -{
> -   assert(stage == MESA_SHADER_FRAGMENT);
> -   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> -   this->current_annotation = "Alpha test";
> -
> -   fs_inst *cmp;
> -   if (key->alpha_test_func == GL_ALWAYS)
> -      return;
> -
> -   if (key->alpha_test_func == GL_NEVER) {
> -      /* f0.1 = 0 */
> -      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
> -                                      BRW_REGISTER_TYPE_UW));
> -      cmp = emit(CMP(reg_null_f, some_reg, some_reg,
> -                     BRW_CONDITIONAL_NEQ));
> -   } else {
> -      /* RT0 alpha */
> -      fs_reg color = offset(outputs[0], 3);
> -
> -      /* f0.1 &= func(color, ref) */
> -      cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
> -                     cond_for_alpha_func(key->alpha_test_func)));
> -   }
> -   cmp->predicate = BRW_PREDICATE_NORMAL;
> -   cmp->flag_subreg = 1;
> -}
> -
> -fs_inst *
> -fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
> -                                 fs_reg src0_alpha, unsigned components,
> -                                 bool use_2nd_half)
> -{
> -   assert(stage == MESA_SHADER_FRAGMENT);
> -   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> -   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> -
> -   this->current_annotation = "FB write header";
> -   bool header_present = true;
> -   int reg_size = dispatch_width / 8;
> -
> -   /* We can potentially have a message length of up to 15, so we have to set
> -    * base_mrf to either 0 or 1 in order to fit in m0..m15.
> -    */
> -   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
> -   int length = 0;
> -
> -   /* From the Sandy Bridge PRM, volume 4, page 198:
> -    *
> -    *     "Dispatched Pixel Enables. One bit per pixel indicating
> -    *      which pixels were originally enabled when the thread was
> -    *      dispatched. This field is only required for the end-of-
> -    *      thread message and on all dual-source messages."
> -    */
> -   if (brw->gen >= 6 &&
> -       (brw->is_haswell || brw->gen >= 8 || !prog_data->uses_kill) &&
> -       color1.file == BAD_FILE &&
> -       key->nr_color_regions == 1) {
> -      header_present = false;
> -   }
> -
> -   if (header_present)
> -      /* Allocate 2 registers for a header */
> -      length += 2;
> -
> -   if (payload.aa_dest_stencil_reg) {
> -      sources[length] = fs_reg(GRF, alloc.allocate(1));
> -      emit(MOV(sources[length],
> -               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
> -      length++;
> -   }
> -
> -   prog_data->uses_omask =
> -      prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
> -   if (prog_data->uses_omask) {
> -      this->current_annotation = "FB write oMask";
> -      assert(this->sample_mask.file != BAD_FILE);
> -      /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
> -       * it's unsigned single words, one vgrf is always 16-wide.
> -       */
> -      sources[length] = fs_reg(GRF, alloc.allocate(1),
> -                               BRW_REGISTER_TYPE_UW, 16);
> -      emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
> -      length++;
> -   }
> -
> -   if (color0.file == BAD_FILE) {
> -      /* Even if there's no color buffers enabled, we still need to send
> -       * alpha out the pipeline to our null renderbuffer to support
> -       * alpha-testing, alpha-to-coverage, and so on.
> -       */
> -      length += setup_color_payload(sources + length, this->outputs[0], 0,
> -                                    false);
> -   } else if (color1.file == BAD_FILE) {
> -      if (src0_alpha.file != BAD_FILE) {
> -         sources[length] = fs_reg(GRF, alloc.allocate(reg_size),
> -                                  src0_alpha.type, src0_alpha.width);
> -         fs_inst *inst = emit(MOV(sources[length], src0_alpha));
> -         inst->saturate = key->clamp_fragment_color;
> -         length++;
> -      }
> -
> -      length += setup_color_payload(sources + length, color0, components,
> -                                    false);
> -   } else {
> -      length += setup_color_payload(sources + length, color0, components,
> -                                    use_2nd_half);
> -      length += setup_color_payload(sources + length, color1, components,
> -                                    use_2nd_half);
> -   }
> -
> -   if (source_depth_to_render_target) {
> -      if (brw->gen == 6) {
> -        /* For outputting oDepth on gen6, SIMD8 writes have to be
> -         * used.  This would require SIMD8 moves of each half to
> -         * message regs, kind of like pre-gen5 SIMD16 FB writes.
> -         * Just bail on doing so for now.
> -         */
> -        no16("Missing support for simd16 depth writes on gen6\n");
> -      }
> -
> -      sources[length] = vgrf(glsl_type::float_type);
> -      if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
> -        /* Hand over gl_FragDepth. */
> -        assert(this->frag_depth.file != BAD_FILE);
> -        emit(MOV(sources[length], this->frag_depth));
> -      } else {
> -        /* Pass through the payload depth. */
> -        emit(MOV(sources[length],
> -                  fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
> -      }
> -      length++;
> -   }
> -
> -   if (payload.dest_depth_reg) {
> -      sources[length] = vgrf(glsl_type::float_type);
> -      emit(MOV(sources[length],
> -               fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
> -      length++;
> -   }
> -
> -   fs_inst *load;
> -   fs_inst *write;
> -   if (brw->gen >= 7) {
> -      /* Send from the GRF */
> -      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
> -      load = emit(LOAD_PAYLOAD(payload, sources, length));
> -      payload.reg = alloc.allocate(load->regs_written);
> -      payload.width = dispatch_width;
> -      load->dst = payload;
> -      write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
> -      write->base_mrf = -1;
> -   } else {
> -      /* Send from the MRF */
> -      load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
> -                               sources, length));
> -      write = emit(FS_OPCODE_FB_WRITE);
> -      write->exec_size = dispatch_width;
> -      write->base_mrf = 1;
> -   }
> -
> -   write->mlen = load->regs_written;
> -   write->header_present = header_present;
> -   if (prog_data->uses_kill) {
> -      write->predicate = BRW_PREDICATE_NORMAL;
> -      write->flag_subreg = 1;
> -   }
> -   return write;
> -}
> -
> -void
> -fs_visitor::emit_fb_writes()
> -{
> -   assert(stage == MESA_SHADER_FRAGMENT);
> -   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
> -   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> -
> -   fs_inst *inst = NULL;
> -   if (do_dual_src) {
> -      this->current_annotation = ralloc_asprintf(this->mem_ctx,
> -                                                "FB dual-source write");
> -      inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
> -                                  reg_undef, 4);
> -      inst->target = 0;
> -
> -      /* SIMD16 dual source blending requires sending two SIMD8 dual source
> -       * messages, where each message contains color data for 8 pixels. Color
> -       * data for the first group of pixels is stored in the "lower" half of
> -       * the color registers, so in SIMD16, the previous message did:
> -       * m + 0: r0
> -       * m + 1: g0
> -       * m + 2: b0
> -       * m + 3: a0
> -       *
> -       * Here goes the second message, which packs color data for the
> -       * remaining 8 pixels. Color data for these pixels is stored in the
> -       * "upper" half of the color registers, so we need to do:
> -       * m + 0: r1
> -       * m + 1: g1
> -       * m + 2: b1
> -       * m + 3: a1
> -       */
> -      if (dispatch_width == 16) {
> -         inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
> -                                     reg_undef, 4, true);
> -         inst->target = 0;
> -      }
> -
> -      prog_data->dual_src_blend = true;
> -   } else {
> -      for (int target = 0; target < key->nr_color_regions; target++) {
> -         /* Skip over outputs that weren't written. */
> -         if (this->outputs[target].file == BAD_FILE)
> -            continue;
> -
> -         this->current_annotation = ralloc_asprintf(this->mem_ctx,
> -                                                    "FB write target %d",
> -                                                    target);
> -         fs_reg src0_alpha;
> -         if (brw->gen >= 6 && key->replicate_alpha && target != 0)
> -            src0_alpha = offset(outputs[0], 3);
> -
> -         inst = emit_single_fb_write(this->outputs[target], reg_undef,
> -                                     src0_alpha,
> -                                     this->output_components[target]);
> -         inst->target = target;
> -      }
> -   }
> -
> -   if (inst == NULL) {
> -      /* Even if there's no color buffers enabled, we still need to send
> -       * alpha out the pipeline to our null renderbuffer to support
> -       * alpha-testing, alpha-to-coverage, and so on.
> -       */
> -      inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0);
> -      inst->target = 0;
> -   }
> -
> -   inst->eot = true;
> -   this->current_annotation = NULL;
> -}
> -
> -void
> -fs_visitor::setup_uniform_clipplane_values()
> -{
> -   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
> -   const struct brw_vue_prog_key *key =
> -      (const struct brw_vue_prog_key *) this->key;
> -
> -   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
> -      this->userplane[i] = fs_reg(UNIFORM, uniforms);
> -      for (int j = 0; j < 4; ++j) {
> -         stage_prog_data->param[uniforms + j] =
> -            (gl_constant_value *) &clip_planes[i][j];
> -      }
> -      uniforms += 4;
> -   }
> -}
> -
> -void fs_visitor::compute_clip_distance()
> -{
> -   struct brw_vue_prog_data *vue_prog_data =
> -      (struct brw_vue_prog_data *) prog_data;
> -   const struct brw_vue_prog_key *key =
> -      (const struct brw_vue_prog_key *) this->key;
> -
> -   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
> -    *
> -    *     "If a linked set of shaders forming the vertex stage contains no
> -    *     static write to gl_ClipVertex or gl_ClipDistance, but the
> -    *     application has requested clipping against user clip planes through
> -    *     the API, then the coordinate written to gl_Position is used for
> -    *     comparison against the user clip planes."
> -    *
> -    * This function is only called if the shader didn't write to
> -    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
> -    * if the user wrote to it; otherwise we use gl_Position.
> -    */
> -
> -   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
> -   if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
> -      clip_vertex = VARYING_SLOT_POS;
> -
> -   /* If the clip vertex isn't written, skip this.  Typically this means
> -    * the GS will set up clipping. */
> -   if (outputs[clip_vertex].file == BAD_FILE)
> -      return;
> -
> -   setup_uniform_clipplane_values();
> -
> -   current_annotation = "user clip distances";
> -
> -   this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
> -   this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
> -
> -   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
> -      fs_reg u = userplane[i];
> -      fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
> -      output.reg_offset = i & 3;
> -
> -      emit(MUL(output, outputs[clip_vertex], u));
> -      for (int j = 1; j < 4; j++) {
> -         u.reg = userplane[i].reg + j;
> -         emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
> -      }
> -   }
> -}
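
The MUL plus three MADs amount to an unrolled four-component dot product per
clip plane.  A sketch of the math being emitted, not driver code:

    /* dist = dot(clip_vertex, plane), unrolled the same way as the
     * MUL + 3x MAD sequence above.
     */
    static float clip_distance(const float clip_vertex[4], const float plane[4])
    {
       float d = clip_vertex[0] * plane[0];       /* MUL */
       for (int j = 1; j < 4; j++)
          d += clip_vertex[j] * plane[j];         /* MAD */
       return d;
    }
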
> -
> -void
> -fs_visitor::emit_urb_writes()
> -{
> -   int slot, urb_offset, length;
> -   struct brw_vs_prog_data *vs_prog_data =
> -      (struct brw_vs_prog_data *) prog_data;
> -   const struct brw_vs_prog_key *key =
> -      (const struct brw_vs_prog_key *) this->key;
> -   const GLbitfield64 psiz_mask =
> -      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
> -   const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
> -   bool flush;
> -   fs_reg sources[8];
> -
> -   /* Lower legacy ff and ClipVertex clipping to clip distances */
> -   if (key->base.userclip_active && !prog->UsesClipDistanceOut)
> -      compute_clip_distance();
> -
> -   /* If we don't have any valid slots to write, just do a minimal urb write
> -    * send to terminate the shader. */
> -   if (vue_map->slots_valid == 0) {
> -
> -      fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> -      fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
> -                                                      BRW_REGISTER_TYPE_UD))));
> -      inst->force_writemask_all = true;
> -
> -      inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> -      inst->eot = true;
> -      inst->mlen = 1;
> -      inst->offset = 1;
> -      return;
> -   }
> -
> -   length = 0;
> -   urb_offset = 0;
> -   flush = false;
> -   for (slot = 0; slot < vue_map->num_slots; slot++) {
> -      fs_reg reg, src, zero;
> -
> -      int varying = vue_map->slot_to_varying[slot];
> -      switch (varying) {
> -      case VARYING_SLOT_PSIZ:
> -
> -         /* The point size varying slot is the vue header and is always in the
> -          * vue map.  But often none of the special varyings that live there
> -          * are written and in that case we can skip writing to the vue
> -          * header, provided the corresponding state properly clamps the
> -          * values further down the pipeline. */
> -         if ((vue_map->slots_valid & psiz_mask) == 0) {
> -            assert(length == 0);
> -            urb_offset++;
> -            break;
> -         }
> -
> -         zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
> -         emit(MOV(zero, fs_reg(0u)));
> -
> -         sources[length++] = zero;
> -         if (vue_map->slots_valid & VARYING_BIT_LAYER)
> -            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
> -         else
> -            sources[length++] = zero;
> -
> -         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
> -            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
> -         else
> -            sources[length++] = zero;
> -
> -         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
> -            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
> -         else
> -            sources[length++] = zero;
> -         break;
> -
> -      case BRW_VARYING_SLOT_NDC:
> -      case VARYING_SLOT_EDGE:
> -         unreachable("unexpected scalar vs output");
> -         break;
> -
> -      case BRW_VARYING_SLOT_PAD:
> -         break;
> -
> -      default:
> -         /* gl_Position is always in the vue map, but isn't always written by
> -          * the shader.  Other varyings (clip distances) get added to the vue
> -          * map but don't always get written.  In those cases, the
> -          * corresponding this->outputs[] slot will be invalid and we can skip
> -          * the urb write for the varying.  If we've already queued up a vue
> -          * slot for writing, we flush a mlen 5 urb write, otherwise we just
> -          * advance the urb_offset.
> -          */
> -         if (this->outputs[varying].file == BAD_FILE) {
> -            if (length > 0)
> -               flush = true;
> -            else
> -               urb_offset++;
> -            break;
> -         }
> -
> -         if ((varying == VARYING_SLOT_COL0 ||
> -              varying == VARYING_SLOT_COL1 ||
> -              varying == VARYING_SLOT_BFC0 ||
> -              varying == VARYING_SLOT_BFC1) &&
> -             key->clamp_vertex_color) {
> -            /* We need to clamp these guys, so do a saturating MOV into a
> -             * temp register and use that for the payload.
> -             */
> -            for (int i = 0; i < 4; i++) {
> -               reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
> -               src = offset(this->outputs[varying], i);
> -               fs_inst *inst = emit(MOV(reg, src));
> -               inst->saturate = true;
> -               sources[length++] = reg;
> -            }
> -         } else {
> -            for (int i = 0; i < 4; i++)
> -               sources[length++] = offset(this->outputs[varying], i);
> -         }
> -         break;
> -      }
> -
> -      current_annotation = "URB write";
> -
> -      /* If we've queued up 8 registers of payload (2 VUE slots), this is
> -       * the last slot, or we need to flush (see the BAD_FILE varying case
> -       * above), emit a URB write send now to flush out the data.
> -       */
> -      int last = slot == vue_map->num_slots - 1;
> -      if (length == 8 || last)
> -         flush = true;
> -      if (flush) {
> -         fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
> -         fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
> -                                 BRW_REGISTER_TYPE_F);
> -
> -         /* We need WE_all on the MOV for the message header (the URB handles)
> -          * so do a MOV to a dummy register and set force_writemask_all on the
> -          * MOV.  LOAD_PAYLOAD will preserve that.
> -          */
> -         fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
> -                               BRW_REGISTER_TYPE_UD);
> -         fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
> -                                                       BRW_REGISTER_TYPE_UD))));
> -         inst->force_writemask_all = true;
> -         payload_sources[0] = dummy;
> -
> -         memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
> -         emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
> -
> -         inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
> -         inst->eot = last;
> -         inst->mlen = length + 1;
> -         inst->offset = urb_offset;
> -         urb_offset = slot + 1;
> -         length = 0;
> -         flush = false;
> -      }
> -   }
> -}
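
To make the flush logic concrete, a rough trace (hypothetical VUE map,
not from the patch) for three slots [PSIZ, POS, VAR0] where none of
the header varyings are written:

   slot 0 (PSIZ):  psiz_mask unset -> write nothing, urb_offset = 1
   slot 1 (POS):   queue 4 payload regs (length = 4), no flush yet
   slot 2 (VAR0):  queue 4 more (length = 8) and it's the last slot ->
                   one URB write: mlen = 9 (header + 8), offset = 1, eot
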
> -
> -void
> -fs_visitor::resolve_ud_negate(fs_reg *reg)
> -{
> -   if (reg->type != BRW_REGISTER_TYPE_UD ||
> -       !reg->negate)
> -      return;
> -
> -   fs_reg temp = vgrf(glsl_type::uint_type);
> -   emit(MOV(temp, *reg));
> -   *reg = temp;
> -}
> -
> -/**
> - * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
> - *
> - * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
> - * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
> - */
> -void
> -fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
> -{
> -   assert(brw->gen <= 5);
> -
> -   if (rvalue->type != glsl_type::bool_type)
> -      return;
> -
> -   fs_reg and_result = vgrf(glsl_type::bool_type);
> -   fs_reg neg_result = vgrf(glsl_type::bool_type);
> -   emit(AND(and_result, *reg, fs_reg(1)));
> -   emit(MOV(neg_result, negate(and_result)));
> -   *reg = neg_result;
> -}
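
For reference, here is what the fixup above computes, as a plain-C
sketch (not part of the patch; the input values are made up):

   #include <stdint.h>
   /* x is a gen4-5 CMP result: only bit 0 is defined. */
   static int32_t resolve_bool(int32_t x) { return -(x & 1); }
   /* resolve_bool(0xdeadbee1) == (int32_t)0xffffffff  (~0, "true")
    * resolve_bool(0xdeadbee0) == 0                    ("false")   */
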
> -
> -fs_visitor::fs_visitor(struct brw_context *brw,
> -                       void *mem_ctx,
> -                       const struct brw_wm_prog_key *key,
> -                       struct brw_wm_prog_data *prog_data,
> -                       struct gl_shader_program *shader_prog,
> -                       struct gl_fragment_program *fp,
> -                       unsigned dispatch_width)
> -   : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base,
> -                     MESA_SHADER_FRAGMENT),
> -     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
> -     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
> -     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
> -     key(key), prog_data(&prog_data->base),
> -     dispatch_width(dispatch_width), promoted_constants(0)
> -{
> -   this->mem_ctx = mem_ctx;
> -   init();
> -}
> -
> -fs_visitor::fs_visitor(struct brw_context *brw,
> -                       void *mem_ctx,
> -                       const struct brw_vs_prog_key *key,
> -                       struct brw_vs_prog_data *prog_data,
> -                       struct gl_shader_program *shader_prog,
> -                       struct gl_vertex_program *cp,
> -                       unsigned dispatch_width)
> -   : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
> -                     MESA_SHADER_VERTEX),
> -     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
> -     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
> -     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
> -     key(key), prog_data(&prog_data->base.base),
> -     dispatch_width(dispatch_width), promoted_constants(0)
> -{
> -   this->mem_ctx = mem_ctx;
> -   init();
> -}
> -
> -void
> -fs_visitor::init()
> -{
> -   switch (stage) {
> -   case MESA_SHADER_FRAGMENT:
> -      key_tex = &((const brw_wm_prog_key *) key)->tex;
> -      break;
> -   case MESA_SHADER_VERTEX:
> -   case MESA_SHADER_GEOMETRY:
> -      key_tex = &((const brw_vue_prog_key *) key)->tex;
> -      break;
> -   default:
> -      unreachable("unhandled shader stage");
> -   }
> -
> -   this->failed = false;
> -   this->simd16_unsupported = false;
> -   this->no16_msg = NULL;
> -   this->variable_ht = hash_table_ctor(0,
> -                                       hash_table_pointer_hash,
> -                                       hash_table_pointer_compare);
> -
> -   this->nir_locals = NULL;
> -   this->nir_globals = NULL;
> -
> -   memset(&this->payload, 0, sizeof(this->payload));
> -   memset(this->outputs, 0, sizeof(this->outputs));
> -   memset(this->output_components, 0, sizeof(this->output_components));
> -   this->source_depth_to_render_target = false;
> -   this->runtime_check_aads_emit = false;
> -   this->first_non_payload_grf = 0;
> -   this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
> -
> -   this->current_annotation = NULL;
> -   this->base_ir = NULL;
> -
> -   this->virtual_grf_start = NULL;
> -   this->virtual_grf_end = NULL;
> -   this->live_intervals = NULL;
> -   this->regs_live_at_ip = NULL;
> -
> -   this->uniforms = 0;
> -   this->last_scratch = 0;
> -   this->pull_constant_loc = NULL;
> -   this->push_constant_loc = NULL;
> -
> -   this->spilled_any_registers = false;
> -   this->do_dual_src = false;
> -
> -   if (dispatch_width == 8)
> -      this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
> -}
> -
> -fs_visitor::~fs_visitor()
> -{
> -   hash_table_dtor(this->variable_ht);
> -}
> diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
> index 45c157a..3694811 100644
> --- a/src/mesa/drivers/dri/i965/brw_gs.c
> +++ b/src/mesa/drivers/dri/i965/brw_gs.c
> @@ -29,7 +29,7 @@
>
>  #include "brw_gs.h"
>  #include "brw_context.h"
> -#include "brw_vec4_gs_visitor.h"
> +#include "brw_vec4_gs_god.h"
>  #include "brw_state.h"
>  #include "brw_ff_gs.h"
>
> diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
> index d3bd64d..9f3473e 100644
> --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
> @@ -50,8 +50,8 @@ public:
>
>     bool equals(const src_reg &r) const;
>
> -   src_reg(class vec4_visitor *v, const struct glsl_type *type);
> -   src_reg(class vec4_visitor *v, const struct glsl_type *type, int size);
> +   src_reg(class vec4_god *v, const struct glsl_type *type);
> +   src_reg(class vec4_god *v, const struct glsl_type *type, int size);
>
>     explicit src_reg(const dst_reg &reg);
>
> @@ -107,7 +107,7 @@ public:
>     dst_reg(register_file file, int reg, const glsl_type *type,
>             unsigned writemask);
>     dst_reg(struct brw_reg reg);
> -   dst_reg(class vec4_visitor *v, const struct glsl_type *type);
> +   dst_reg(class vec4_god *v, const struct glsl_type *type);
>
>     explicit dst_reg(const src_reg &reg);
>
> diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> index 56f69ea..120a13d 100644
> --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> @@ -395,7 +395,7 @@ schedule_node::set_latency_gen7(bool is_haswell)
>
>  class instruction_scheduler {
>  public:
> -   instruction_scheduler(backend_visitor *v, int grf_count,
> +   instruction_scheduler(backend_god *v, int grf_count,
>                           instruction_scheduler_mode mode)
>     {
>        this->bv = v;
> @@ -451,7 +451,7 @@ public:
>     int grf_count;
>     int time;
>     exec_list instructions;
> -   backend_visitor *bv;
> +   backend_god *bv;
>
>     instruction_scheduler_mode mode;
>
> @@ -475,20 +475,20 @@ public:
>  class fs_instruction_scheduler : public instruction_scheduler
>  {
>  public:
> -   fs_instruction_scheduler(fs_visitor *v, int grf_count,
> +   fs_instruction_scheduler(fs_god *v, int grf_count,
>                              instruction_scheduler_mode mode);
>     void calculate_deps();
>     bool is_compressed(fs_inst *inst);
>     schedule_node *choose_instruction_to_schedule();
>     int issue_time(backend_instruction *inst);
> -   fs_visitor *v;
> +   fs_god *v;
>
>     void count_remaining_grf_uses(backend_instruction *inst);
>     void update_register_pressure(backend_instruction *inst);
>     int get_register_pressure_benefit(backend_instruction *inst);
>  };
>
> -fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
> +fs_instruction_scheduler::fs_instruction_scheduler(fs_god *v,
>                                                     int grf_count,
>                                                     instruction_scheduler_mode mode)
>     : instruction_scheduler(v, grf_count, mode),
> @@ -565,18 +565,18 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
>  class vec4_instruction_scheduler : public instruction_scheduler
>  {
>  public:
> -   vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
> +   vec4_instruction_scheduler(vec4_god *v, int grf_count);
>     void calculate_deps();
>     schedule_node *choose_instruction_to_schedule();
>     int issue_time(backend_instruction *inst);
> -   vec4_visitor *v;
> +   vec4_god *v;
>
>     void count_remaining_grf_uses(backend_instruction *inst);
>     void update_register_pressure(backend_instruction *inst);
>     int get_register_pressure_benefit(backend_instruction *inst);
>  };
>
> -vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
> +vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_god *v,
>                                                         int grf_count)
>     : instruction_scheduler(v, grf_count, SCHEDULE_POST),
>       v(v)
> @@ -1506,7 +1506,7 @@ instruction_scheduler::run(cfg_t *cfg)
>  }
>
>  void
> -fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
> +fs_god::schedule_instructions(instruction_scheduler_mode mode)
>  {
>     int grf_count;
>     if (mode == SCHEDULE_POST)
> @@ -1526,7 +1526,7 @@ fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
>  }
>
>  void
> -vec4_visitor::opt_schedule_instructions()
> +vec4_god::opt_schedule_instructions()
>  {
>     vec4_instruction_scheduler sched(this, prog_data->total_grf);
>     sched.run(cfg);
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index 0dda9bb..24c86a0 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -695,7 +695,7 @@ brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg)
>     return false;
>  }
>
> -backend_visitor::backend_visitor(struct brw_context *brw,
> +backend_god::backend_god(struct brw_context *brw,
>                                   struct gl_shader_program *shader_prog,
>                                   struct gl_program *prog,
>                                   struct brw_stage_prog_data *stage_prog_data,
> @@ -1083,13 +1083,13 @@ backend_instruction::remove(bblock_t *block)
>  }
>
>  void
> -backend_visitor::dump_instructions()
> +backend_god::dump_instructions()
>  {
>     dump_instructions(NULL);
>  }
>
>  void
> -backend_visitor::dump_instructions(const char *name)
> +backend_god::dump_instructions(const char *name)
>  {
>     FILE *file = stderr;
>     if (name && geteuid() != 0) {
> @@ -1118,7 +1118,7 @@ backend_visitor::dump_instructions(const char *name)
>  }
>
>  void
> -backend_visitor::calculate_cfg()
> +backend_god::calculate_cfg()
>  {
>     if (this->cfg)
>        return;
> @@ -1126,7 +1126,7 @@ backend_visitor::calculate_cfg()
>  }
>
>  void
> -backend_visitor::invalidate_cfg()
> +backend_god::invalidate_cfg()
>  {
>     ralloc_free(this->cfg);
>     this->cfg = NULL;
> @@ -1141,7 +1141,7 @@ backend_visitor::invalidate_cfg()
>   * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
>   */
>  void
> -backend_visitor::assign_common_binding_table_offsets(uint32_t next_binding_table_offset)
> +backend_god::assign_common_binding_table_offsets(uint32_t next_binding_table_offset)
>  {
>     int num_textures = _mesa_fls(prog->SamplersUsed);
>
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
> index 8a3263e..4479002 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.h
> +++ b/src/mesa/drivers/dri/i965/brw_shader.h
> @@ -157,10 +157,10 @@ enum instruction_scheduler_mode {
>     SCHEDULE_POST,
>  };
>
> -class backend_visitor : public ir_visitor {
> +class backend_god : public ir_visitor {
>  protected:
>
> -   backend_visitor(struct brw_context *brw,
> +   backend_god(struct brw_context *brw,
>                     struct gl_shader_program *shader_prog,
>                     struct gl_program *prog,
>                     struct brw_stage_prog_data *stage_prog_data,
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> index 480e50c..cc85790 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> @@ -260,7 +260,7 @@ vec4_instruction::can_do_source_mods(struct brw_context *brw)
>   * for setup.
>   */
>  int
> -vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
> +vec4_god::implied_mrf_writes(vec4_instruction *inst)
>  {
>     if (inst->mlen == 0 || inst->is_send_from_grf())
>        return 0;
> @@ -328,7 +328,7 @@ src_reg::equals(const src_reg &r) const
>  }
>
>  bool
> -vec4_visitor::opt_vector_float()
> +vec4_god::opt_vector_float()
>  {
>     bool progress = false;
>
> @@ -407,7 +407,7 @@ vec4_visitor::opt_vector_float()
>   * remove the instructions that wrote them.
>   */
>  bool
> -vec4_visitor::opt_reduce_swizzle()
> +vec4_god::opt_reduce_swizzle()
>  {
>     bool progress = false;
>
> @@ -461,7 +461,7 @@ vec4_visitor::opt_reduce_swizzle()
>  }
>
>  void
> -vec4_visitor::split_uniform_registers()
> +vec4_god::split_uniform_registers()
>  {
>     /* Prior to this, uniforms have been in an array sized according to
>      * the number of vector uniforms present, sparsely filled (so an
> @@ -489,7 +489,7 @@ vec4_visitor::split_uniform_registers()
>  }
>
>  void
> -vec4_visitor::pack_uniform_registers()
> +vec4_god::pack_uniform_registers()
>  {
>     bool uniform_used[this->uniforms];
>     int new_loc[this->uniforms];
> @@ -583,7 +583,7 @@ vec4_visitor::pack_uniform_registers()
>   * instructions involving 0.
>   */
>  bool
> -vec4_visitor::opt_algebraic()
> +vec4_god::opt_algebraic()
>  {
>     bool progress = false;
>
> @@ -689,7 +689,7 @@ vec4_visitor::opt_algebraic()
>   * pull constants.
>   */
>  void
> -vec4_visitor::move_push_constants_to_pull_constants()
> +vec4_god::move_push_constants_to_pull_constants()
>  {
>     int pull_constant_loc[this->uniforms];
>
> @@ -772,7 +772,7 @@ vec4_visitor::move_push_constants_to_pull_constants()
>
>  /* Conditions for which we want to avoid setting the dependency control bits */
>  bool
> -vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
> +vec4_god::is_dep_ctrl_unsafe(const vec4_instruction *inst)
>  {
>  #define IS_DWORD(reg) \
>     (reg.type == BRW_REGISTER_TYPE_UD || \
> @@ -833,7 +833,7 @@ vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
>   * manual fields we can set in the instructions that let it do so.
>   */
>  void
> -vec4_visitor::opt_set_dependency_control()
> +vec4_god::opt_set_dependency_control()
>  {
>     vec4_instruction *last_grf_write[BRW_MAX_GRF];
>     uint8_t grf_channels_written[BRW_MAX_GRF];
> @@ -958,7 +958,7 @@ vec4_instruction::reswizzle(int dst_writemask, int swizzle)
>   * of the GRF write directly to the final destination instead.
>   */
>  bool
> -vec4_visitor::opt_register_coalesce()
> +vec4_god::opt_register_coalesce()
>  {
>     bool progress = false;
>     int next_ip = 0;
> @@ -1124,7 +1124,7 @@ vec4_visitor::opt_register_coalesce()
>   * a GRF on IVB.
>   */
>  void
> -vec4_visitor::split_virtual_grfs()
> +vec4_god::split_virtual_grfs()
>  {
>     int num_vars = this->alloc.count;
>     int new_virtual_grf[num_vars];
> @@ -1186,13 +1186,13 @@ vec4_visitor::split_virtual_grfs()
>  }
>
>  void
> -vec4_visitor::dump_instruction(backend_instruction *be_inst)
> +vec4_god::dump_instruction(backend_instruction *be_inst)
>  {
>     dump_instruction(be_inst, stderr);
>  }
>
>  void
> -vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
> +vec4_god::dump_instruction(backend_instruction *be_inst, FILE *file)
>  {
>     vec4_instruction *inst = (vec4_instruction *)be_inst;
>
> @@ -1404,7 +1404,7 @@ attribute_to_hw_reg(int attr, bool interleaved)
>   * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
>   */
>  void
> -vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
> +vec4_god::lower_attributes_to_hw_regs(const int *attribute_map,
>                                            bool interleaved)
>  {
>     foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
> @@ -1451,7 +1451,7 @@ vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
>  }
>
>  int
> -vec4_vs_visitor::setup_attributes(int payload_reg)
> +vec4_vs_god::setup_attributes(int payload_reg)
>  {
>     int nr_attributes;
>     int attribute_map[VERT_ATTRIB_MAX + 1];
> @@ -1496,7 +1496,7 @@ vec4_vs_visitor::setup_attributes(int payload_reg)
>  }
>
>  int
> -vec4_visitor::setup_uniforms(int reg)
> +vec4_god::setup_uniforms(int reg)
>  {
>     prog_data->base.dispatch_grf_start_reg = reg;
>
> @@ -1530,7 +1530,7 @@ vec4_visitor::setup_uniforms(int reg)
>  }
>
>  void
> -vec4_vs_visitor::setup_payload(void)
> +vec4_vs_god::setup_payload(void)
>  {
>     int reg = 0;
>
> @@ -1548,13 +1548,13 @@ vec4_vs_visitor::setup_payload(void)
>  }
>
>  void
> -vec4_visitor::assign_binding_table_offsets()
> +vec4_god::assign_binding_table_offsets()
>  {
>     assign_common_binding_table_offsets(0);
>  }
>
>  src_reg
> -vec4_visitor::get_timestamp()
> +vec4_god::get_timestamp()
>  {
>     assert(brw->gen >= 7);
>
> @@ -1582,14 +1582,14 @@ vec4_visitor::get_timestamp()
>  }
>
>  void
> -vec4_visitor::emit_shader_time_begin()
> +vec4_god::emit_shader_time_begin()
>  {
>     current_annotation = "shader time start";
>     shader_start_time = get_timestamp();
>  }
>
>  void
> -vec4_visitor::emit_shader_time_end()
> +vec4_god::emit_shader_time_end()
>  {
>     current_annotation = "shader time end";
>     src_reg shader_end_time = get_timestamp();
> @@ -1624,7 +1624,7 @@ vec4_visitor::emit_shader_time_end()
>  }
>
>  void
> -vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
> +vec4_god::emit_shader_time_write(enum shader_time_shader_type type,
>                                       src_reg value)
>  {
>     int shader_time_index =
> @@ -1649,7 +1649,7 @@ vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
>  }
>
>  bool
> -vec4_visitor::run()
> +vec4_god::run()
>  {
>     sanity_param_count = prog->Parameters->NumParameters;
>
> @@ -1710,7 +1710,7 @@ vec4_visitor::run()
>           snprintf(filename, 64, "%s-%04d-%02d-%02d-" #pass,            \
>                    stage_name, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
>                                                                         \
> -         backend_visitor::dump_instructions(filename);                 \
> +         backend_god::dump_instructions(filename);                 \
>        }                                                                \
>                                                                         \
>        progress = progress || this_progress;                            \
> @@ -1723,7 +1723,7 @@ vec4_visitor::run()
>        snprintf(filename, 64, "%s-%04d-00-start",
>                 stage_name, shader_prog ? shader_prog->Name : 0);
>
> -      backend_visitor::dump_instructions(filename);
> +      backend_god::dump_instructions(filename);
>     }
>
>     bool progress;
> @@ -1824,7 +1824,7 @@ brw_vs_emit(struct brw_context *brw,
>        brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);
>
>     if (brw->scalar_vs && (prog || brw_env_var_as_boolean("INTEL_USE_NIR", false))) {
> -      fs_visitor v(brw, mem_ctx, &c->key, prog_data, prog, &c->vp->program, 8);
> +      fs_god v(brw, mem_ctx, &c->key, prog_data, prog, &c->vp->program, 8);
>        if (!v.run_vs()) {
>           if (prog) {
>              prog->LinkStatus = false;
> @@ -1861,7 +1861,7 @@ brw_vs_emit(struct brw_context *brw,
>     }
>
>     if (!assembly) {
> -      vec4_vs_visitor v(brw, c, prog_data, prog, mem_ctx);
> +      vec4_vs_god v(brw, c, prog_data, prog, mem_ctx);
>        if (!v.run()) {
>           if (prog) {
>              prog->LinkStatus = false;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
> index 33297ae..b8418b1 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.h
> @@ -73,10 +73,10 @@ class vec4_live_variables;
>   * Translates either GLSL IR or Mesa IR (for ARB_vertex_program and
>   * fixed-function) into VS IR.
>   */
> -class vec4_visitor : public backend_visitor
> +class vec4_god : public backend_god
>  {
>  public:
> -   vec4_visitor(struct brw_context *brw,
> +   vec4_god(struct brw_context *brw,
>                  struct brw_vec4_compile *c,
>                  struct gl_program *prog,
>                  const struct brw_vue_prog_key *key,
> @@ -88,7 +88,7 @@ public:
>                  shader_time_shader_type st_base,
>                  shader_time_shader_type st_written,
>                  shader_time_shader_type st_reset);
> -   ~vec4_visitor();
> +   ~vec4_god();
>
>     dst_reg dst_null_f()
>     {
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
> index e897be2..5b40ab4 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
> @@ -364,7 +364,7 @@ try_copy_propagate(struct brw_context *brw, vec4_instruction *inst,
>  }
>
>  bool
> -vec4_visitor::opt_copy_propagation(bool do_constant_prop)
> +vec4_god::opt_copy_propagation(bool do_constant_prop)
>  {
>     bool progress = false;
>     struct copy_entry entries[alloc.total_size];
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> index 100e511..fe6d0bd 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
> @@ -121,7 +121,7 @@ instructions_match(vec4_instruction *a, vec4_instruction *b)
>  }
>
>  bool
> -vec4_visitor::opt_cse_local(bblock_t *block)
> +vec4_god::opt_cse_local(bblock_t *block)
>  {
>     bool progress = false;
>     exec_list aeb;
> @@ -250,7 +250,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
>  }
>
>  bool
> -vec4_visitor::opt_cse()
> +vec4_god::opt_cse()
>  {
>     bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> index 980e266..3a8e0b7 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
> @@ -60,7 +60,7 @@ can_do_writemask(const struct brw_context *brw,
>  }
>
>  bool
> -vec4_visitor::dead_code_eliminate()
> +vec4_god::dead_code_eliminate()
>  {
>     bool progress = false;
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_god.cpp b/src/mesa/drivers/dri/i965/brw_vec4_god.cpp
> new file mode 100644
> index 0000000..3483143
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_god.cpp
> @@ -0,0 +1,3658 @@
> +/*
> + * Copyright © 2011 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include "brw_vec4.h"
> +#include "brw_cfg.h"
> +#include "glsl/ir_uniform.h"
> +#include "program/sampler.h"
> +
> +namespace brw {
> +
> +vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
> +                                   const src_reg &src0, const src_reg &src1,
> +                                   const src_reg &src2)
> +{
> +   this->opcode = opcode;
> +   this->dst = dst;
> +   this->src[0] = src0;
> +   this->src[1] = src1;
> +   this->src[2] = src2;
> +   this->saturate = false;
> +   this->force_writemask_all = false;
> +   this->no_dd_clear = false;
> +   this->no_dd_check = false;
> +   this->writes_accumulator = false;
> +   this->conditional_mod = BRW_CONDITIONAL_NONE;
> +   this->predicate = BRW_PREDICATE_NONE;
> +   this->predicate_inverse = false;
> +   this->target = 0;
> +   this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
> +   this->shadow_compare = false;
> +   this->ir = NULL;
> +   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
> +   this->header_present = false;
> +   this->flag_subreg = 0;
> +   this->mlen = 0;
> +   this->base_mrf = 0;
> +   this->offset = 0;
> +   this->annotation = NULL;
> +}
> +
> +vec4_instruction *
> +vec4_god::emit(vec4_instruction *inst)
> +{
> +   inst->ir = this->base_ir;
> +   inst->annotation = this->current_annotation;
> +
> +   this->instructions.push_tail(inst);
> +
> +   return inst;
> +}
> +
> +vec4_instruction *
> +vec4_god::emit_before(bblock_t *block, vec4_instruction *inst,
> +                          vec4_instruction *new_inst)
> +{
> +   new_inst->ir = inst->ir;
> +   new_inst->annotation = inst->annotation;
> +
> +   inst->insert_before(block, new_inst);
> +
> +   return inst;
> +}
> +
> +vec4_instruction *
> +vec4_god::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
> +                   const src_reg &src1, const src_reg &src2)
> +{
> +   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
> +}
> +
> +
> +vec4_instruction *
> +vec4_god::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
> +                   const src_reg &src1)
> +{
> +   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
> +}
> +
> +vec4_instruction *
> +vec4_god::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
> +{
> +   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
> +}
> +
> +vec4_instruction *
> +vec4_god::emit(enum opcode opcode, const dst_reg &dst)
> +{
> +   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
> +}
> +
> +vec4_instruction *
> +vec4_god::emit(enum opcode opcode)
> +{
> +   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
> +}
> +
> +#define ALU1(op)                                                       \
> +   vec4_instruction *                                                  \
> +   vec4_god::op(const dst_reg &dst, const src_reg &src0)               \
> +   {                                                                   \
> +      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
> +   }
> +
> +#define ALU2(op)                                                       \
> +   vec4_instruction *                                                  \
> +   vec4_god::op(const dst_reg &dst, const src_reg &src0,               \
> +                    const src_reg &src1)                               \
> +   {                                                                   \
> +      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
> +                                           src0, src1);                 \
> +   }
> +
> +#define ALU2_ACC(op)                                                   \
> +   vec4_instruction *                                                  \
> +   vec4_god::op(const dst_reg &dst, const src_reg &src0,               \
> +                    const src_reg &src1)                               \
> +   {                                                                   \
> +      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
> +                       BRW_OPCODE_##op, dst, src0, src1);              \
> +      inst->writes_accumulator = true;                                  \
> +      return inst;                                                      \
> +   }
> +
> +#define ALU3(op)                                                       \
> +   vec4_instruction *                                                  \
> +   vec4_god::op(const dst_reg &dst, const src_reg &src0,               \
> +                    const src_reg &src1, const src_reg &src2)          \
> +   {                                                                   \
> +      assert(brw->gen >= 6);                                           \
> +      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,       \
> +                                          src0, src1, src2);           \
> +   }
> +
> +ALU1(NOT)
> +ALU1(MOV)
> +ALU1(FRC)
> +ALU1(RNDD)
> +ALU1(RNDE)
> +ALU1(RNDZ)
> +ALU1(F32TO16)
> +ALU1(F16TO32)
> +ALU2(ADD)
> +ALU2(MUL)
> +ALU2_ACC(MACH)
> +ALU2(AND)
> +ALU2(OR)
> +ALU2(XOR)
> +ALU2(DP3)
> +ALU2(DP4)
> +ALU2(DPH)
> +ALU2(SHL)
> +ALU2(SHR)
> +ALU2(ASR)
> +ALU3(LRP)
> +ALU1(BFREV)
> +ALU3(BFE)
> +ALU2(BFI1)
> +ALU3(BFI2)
> +ALU1(FBH)
> +ALU1(FBL)
> +ALU1(CBIT)
> +ALU3(MAD)
> +ALU2_ACC(ADDC)
> +ALU2_ACC(SUBB)
> +ALU2(MAC)
> +
> +/** Gen4 predicated IF. */
> +vec4_instruction *
> +vec4_god::IF(enum brw_predicate predicate)
> +{
> +   vec4_instruction *inst;
> +
> +   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
> +   inst->predicate = predicate;
> +
> +   return inst;
> +}
> +
> +/** Gen6 IF with embedded comparison. */
> +vec4_instruction *
> +vec4_god::IF(src_reg src0, src_reg src1,
> +                 enum brw_conditional_mod condition)
> +{
> +   assert(brw->gen == 6);
> +
> +   vec4_instruction *inst;
> +
> +   resolve_ud_negate(&src0);
> +   resolve_ud_negate(&src1);
> +
> +   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
> +                                       src0, src1);
> +   inst->conditional_mod = condition;
> +
> +   return inst;
> +}
> +
> +/**
> + * CMP: Sets the low bit of the destination channels with the result
> + * of the comparison, while the upper bits are undefined, and updates
> + * the flag register with the packed 16 bits of the result.
> + */
> +vec4_instruction *
> +vec4_god::CMP(dst_reg dst, src_reg src0, src_reg src1,
> +                  enum brw_conditional_mod condition)
> +{
> +   vec4_instruction *inst;
> +
> +   /* Take the instruction:
> +    *
> +    * CMP null<d> src0<f> src1<f>
> +    *
> +    * Original gen4 does type conversion to the destination type before
> +    * comparison, producing garbage results for floating point comparisons.
> +    *
> +    * The destination type doesn't matter on newer generations, so we set the
> +    * type to match src0 so we can compact the instruction.
> +    */
> +   dst.type = src0.type;
> +   if (dst.file == HW_REG)
> +      dst.fixed_hw_reg.type = dst.type;
> +
> +   resolve_ud_negate(&src0);
> +   resolve_ud_negate(&src1);
> +
> +   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
> +   inst->conditional_mod = condition;
> +
> +   return inst;
> +}
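
Concretely (a made-up example, not from the patch): with a D-typed
null destination on original gen4, CMP.L null<d> 0.5f 0.7f would
truncate both floats to integer 0 before comparing, so "0.5 < 0.7"
reads back false.  Retyping the destination to src0.type as above
sidesteps the conversion.
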
> +
> +vec4_instruction *
> +vec4_god::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
> +{
> +   vec4_instruction *inst;
> +
> +   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
> +                                       dst, index);
> +   inst->base_mrf = 14;
> +   inst->mlen = 2;
> +
> +   return inst;
> +}
> +
> +vec4_instruction *
> +vec4_god::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
> +                            const src_reg &index)
> +{
> +   vec4_instruction *inst;
> +
> +   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
> +                                       dst, src, index);
> +   inst->base_mrf = 13;
> +   inst->mlen = 3;
> +
> +   return inst;
> +}
> +
> +void
> +vec4_god::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
> +{
> +   static enum opcode dot_opcodes[] = {
> +      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
> +   };
> +
> +   emit(dot_opcodes[elements - 2], dst, src0, src1);
> +}
> +
> +src_reg
> +vec4_god::fix_3src_operand(src_reg src)
> +{
> +   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
> +    * able to use vertical stride of zero to replicate the vec4 uniform, like
> +    *
> +    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
> +    *
> +    * But you can't, since vertical stride is always four in three-source
> +    * instructions. Instead, insert a MOV instruction to do the replication so
> +    * that the three-source instruction can consume it.
> +    */
> +
> +   /* The MOV is only needed if the source is a uniform or immediate. */
> +   if (src.file != UNIFORM && src.file != IMM)
> +      return src;
> +
> +   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
> +      return src;
> +
> +   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
> +   expanded.type = src.type;
> +   emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
> +   return src_reg(expanded);
> +}
> +
> +src_reg
> +vec4_god::fix_math_operand(src_reg src)
> +{
> +   if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
> +      return src;
> +
> +   /* The gen6 math instruction ignores the source modifiers --
> +    * swizzle, abs, negate, and at least some parts of the register
> +    * region description.
> +    *
> +    * Rather than trying to enumerate all these cases, *always* expand the
> +    * operand to a temp GRF for gen6.
> +    *
> +    * For gen7, keep the operand as-is, except if immediate, which gen7 still
> +    * can't use.
> +    */
> +
> +   if (brw->gen == 7 && src.file != IMM)
> +      return src;
> +
> +   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
> +   expanded.type = src.type;
> +   emit(MOV(expanded, src));
> +   return src_reg(expanded);
> +}
> +
> +void
> +vec4_god::emit_math(enum opcode opcode,
> +                        const dst_reg &dst,
> +                        const src_reg &src0, const src_reg &src1)
> +{
> +   vec4_instruction *math =
> +      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
> +
> +   if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
> +      /* MATH on Gen6 must be align1, so we can't do writemasks. */
> +      math->dst = dst_reg(this, glsl_type::vec4_type);
> +      math->dst.type = dst.type;
> +      emit(MOV(dst, src_reg(math->dst)));
> +   } else if (brw->gen < 6) {
> +      math->base_mrf = 1;
> +      math->mlen = src1.file == BAD_FILE ? 1 : 2;
> +   }
> +}
> +
> +void
> +vec4_god::emit_pack_half_2x16(dst_reg dst, src_reg src0)
> +{
> +   if (brw->gen < 7) {
> +      unreachable("ir_unop_pack_half_2x16 should be lowered");
> +   }
> +
> +   assert(dst.type == BRW_REGISTER_TYPE_UD);
> +   assert(src0.type == BRW_REGISTER_TYPE_F);
> +
> +   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
> +    *
> +    *   Because this instruction does not have a 16-bit floating-point type,
> +    *   the destination data type must be Word (W).
> +    *
> +    *   The destination must be DWord-aligned and specify a horizontal stride
> +    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
> +    *   each destination channel and the upper word is not modified.
> +    *
> +    * The above restriction implies that the f32to16 instruction must use
> +    * align1 mode, because only in align1 mode is it possible to specify
> +    * horizontal stride.  We choose here to defy the hardware docs and emit
> +    * align16 instructions.
> +    *
> +    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
> +    * instructions. I was partially successful in that the code passed all
> +    * tests.  However, the code was dubiously correct and fragile, and the
> +    * tests were not harsh enough to probe that frailty. Not trusting the
> +    * code, I chose instead to remain in align16 mode in defiance of the hw
> +    * docs).
> +    *
> +    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
> +    * simulator, emitting a f32to16 in align16 mode with UD as destination
> +    * data type is safe. The behavior differs from that specified in the PRM
> +    * in that the upper word of each destination channel is cleared to 0.
> +    */
> +
> +   dst_reg tmp_dst(this, glsl_type::uvec2_type);
> +   src_reg tmp_src(tmp_dst);
> +
> +#if 0
> +   /* Verify the undocumented behavior on which the following instructions
> +    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
> +    * then the result of the bit-or instruction below will be incorrect.
> +    *
> +    * You should inspect the disasm output in order to verify that the MOV is
> +    * not optimized away.
> +    */
> +   emit(MOV(tmp_dst, src_reg(0x12345678u)));
> +#endif
> +
> +   /* Give tmp the form below, where "." means untouched.
> +    *
> +    *     w z          y          x w z          y          x
> +    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
> +    *
> +    * That the upper word of each write-channel be 0 is required for the
> +    * following bit-shift and bit-or instructions to work. Note that this
> +    * relies on the undocumented hardware behavior mentioned above.
> +    */
> +   tmp_dst.writemask = WRITEMASK_XY;
> +   emit(F32TO16(tmp_dst, src0));
> +
> +   /* Give the write-channels of dst the form:
> +    *   0xhhhh0000
> +    */
> +   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
> +   emit(SHL(dst, tmp_src, src_reg(16u)));
> +
> +   /* Finally, give the write-channels of dst the form of packHalf2x16's
> +    * output:
> +    *   0xhhhhllll
> +    */
> +   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
> +   emit(OR(dst, src_reg(dst), tmp_src));
> +}
> +
> +void
> +vec4_god::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
> +{
> +   if (brw->gen < 7) {
> +      unreachable("ir_unop_unpack_half_2x16 should be lowered");
> +   }
> +
> +   assert(dst.type == BRW_REGISTER_TYPE_F);
> +   assert(src0.type == BRW_REGISTER_TYPE_UD);
> +
> +   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
> +    *
> +    *   Because this instruction does not have a 16-bit floating-point type,
> +    *   the source data type must be Word (W). The destination type must be
> +    *   F (Float).
> +    *
> +    * To use W as the source data type, we must adjust horizontal strides,
> +    * which is only possible in align1 mode. All my [chadv] attempts at
> +    * emitting align1 instructions for unpackHalf2x16 failed to pass the
> +    * Piglit tests, so I gave up.
> +    *
> +    * I've verified that, on gen7 hardware and the simulator, it is safe to
> +    * emit f16to32 in align16 mode with UD as source data type.
> +    */
> +
> +   dst_reg tmp_dst(this, glsl_type::uvec2_type);
> +   src_reg tmp_src(tmp_dst);
> +
> +   tmp_dst.writemask = WRITEMASK_X;
> +   emit(AND(tmp_dst, src0, src_reg(0xffffu)));
> +
> +   tmp_dst.writemask = WRITEMASK_Y;
> +   emit(SHR(tmp_dst, src0, src_reg(16u)));
> +
> +   dst.writemask = WRITEMASK_XY;
> +   emit(F16TO32(dst, tmp_src));
> +}
> +
> +void
> +vec4_god::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
> +{
> +   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
> +    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
> +    * is not suitable to generate the shift values, but we can use the packed
> +    * vector float and a type-converting MOV.
> +    */
> +   dst_reg shift(this, glsl_type::uvec4_type);
> +   emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
> +
> +   dst_reg shifted(this, glsl_type::uvec4_type);
> +   src0.swizzle = BRW_SWIZZLE_XXXX;
> +   emit(SHR(shifted, src0, src_reg(shift)));
> +
> +   shifted.type = BRW_REGISTER_TYPE_UB;
> +   dst_reg f(this, glsl_type::vec4_type);
> +   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
> +
> +   emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
> +}
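
(For readers wondering about the magic bytes in the MOV above: as far
as I can tell they are the 8-bit vector-float (VF) encodings of the
shift amounts -- seee.mmmm with an exponent bias of 3 -- so
0x00 -> 0.0, 0x60 -> 8.0 (1.0 * 2^3), 0x70 -> 16.0, 0x78 -> 24.0.
The packed *integer* immediate only holds 4-bit values, which is why
it can't produce the 16 and 24 shifts, while a type-converting MOV
from VF can.  The snorm variant below uses the same trick.)
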
> +
> +void
> +vec4_god::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
> +{
> +   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
> +    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
> +    * is not suitable to generate the shift values, but we can use the packed
> +    * vector float and a type-converting MOV.
> +    */
> +   dst_reg shift(this, glsl_type::uvec4_type);
> +   emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
> +
> +   dst_reg shifted(this, glsl_type::uvec4_type);
> +   src0.swizzle = BRW_SWIZZLE_XXXX;
> +   emit(SHR(shifted, src0, src_reg(shift)));
> +
> +   shifted.type = BRW_REGISTER_TYPE_B;
> +   dst_reg f(this, glsl_type::vec4_type);
> +   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
> +
> +   dst_reg scaled(this, glsl_type::vec4_type);
> +   emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
> +
> +   dst_reg max(this, glsl_type::vec4_type);
> +   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
> +   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
> +}
> +
> +void
> +vec4_god::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
> +{
> +   dst_reg saturated(this, glsl_type::vec4_type);
> +   vec4_instruction *inst = emit(MOV(saturated, src0));
> +   inst->saturate = true;
> +
> +   dst_reg scaled(this, glsl_type::vec4_type);
> +   emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
> +
> +   dst_reg rounded(this, glsl_type::vec4_type);
> +   emit(RNDE(rounded, src_reg(scaled)));
> +
> +   dst_reg u(this, glsl_type::uvec4_type);
> +   emit(MOV(u, src_reg(rounded)));
> +
> +   src_reg bytes(u);
> +   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
> +}
> +
> +void
> +vec4_god::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
> +{
> +   dst_reg max(this, glsl_type::vec4_type);
> +   emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
> +
> +   dst_reg min(this, glsl_type::vec4_type);
> +   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
> +
> +   dst_reg scaled(this, glsl_type::vec4_type);
> +   emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
> +
> +   dst_reg rounded(this, glsl_type::vec4_type);
> +   emit(RNDE(rounded, src_reg(scaled)));
> +
> +   dst_reg i(this, glsl_type::ivec4_type);
> +   emit(MOV(i, src_reg(rounded)));
> +
> +   src_reg bytes(i);
> +   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
> +}
> +
> +void
> +vec4_god::visit_instructions(const exec_list *list)
> +{
> +   foreach_in_list(ir_instruction, ir, list) {
> +      base_ir = ir;
> +      ir->accept(this);
> +   }
> +}
> +
> +
> +static int
> +type_size(const struct glsl_type *type)
> +{
> +   unsigned int i;
> +   int size;
> +
> +   switch (type->base_type) {
> +   case GLSL_TYPE_UINT:
> +   case GLSL_TYPE_INT:
> +   case GLSL_TYPE_FLOAT:
> +   case GLSL_TYPE_BOOL:
> +      if (type->is_matrix()) {
> +        return type->matrix_columns;
> +      } else {
> +        /* Regardless of the size of the vector, it gets a vec4. This is bad
> +         * packing for things like floats, but otherwise arrays become a
> +         * mess.  Hopefully a later pass over the code can pack scalars
> +         * down if appropriate.
> +         */
> +        return 1;
> +      }
> +   case GLSL_TYPE_ARRAY:
> +      assert(type->length > 0);
> +      return type_size(type->fields.array) * type->length;
> +   case GLSL_TYPE_STRUCT:
> +      size = 0;
> +      for (i = 0; i < type->length; i++) {
> +        size += type_size(type->fields.structure[i].type);
> +      }
> +      return size;
> +   case GLSL_TYPE_SAMPLER:
> +      /* Samplers take up no register space, since they're baked in at
> +       * link time.
> +       */
> +      return 0;
> +   case GLSL_TYPE_ATOMIC_UINT:
> +      return 0;
> +   case GLSL_TYPE_IMAGE:
> +   case GLSL_TYPE_VOID:
> +   case GLSL_TYPE_DOUBLE:
> +   case GLSL_TYPE_ERROR:
> +   case GLSL_TYPE_INTERFACE:
> +      unreachable("not reached");
> +   }
> +
> +   return 0;
> +}
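
A few sample values under these rules (types chosen for illustration,
sizes in vec4 slots):

   float, vec2, vec4            -> 1
   mat3                         -> 3   (one slot per column)
   vec2[8]                      -> 8
   struct { vec3 n; mat2 m; }   -> 3   (1 + 2)
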
> +
> +src_reg::src_reg(class vec4_god *v, const struct glsl_type *type)
> +{
> +   init();
> +
> +   this->file = GRF;
> +   this->reg = v->alloc.allocate(type_size(type));
> +
> +   if (type->is_array() || type->is_record()) {
> +      this->swizzle = BRW_SWIZZLE_NOOP;
> +   } else {
> +      this->swizzle = brw_swizzle_for_size(type->vector_elements);
> +   }
> +
> +   this->type = brw_type_for_base_type(type);
> +}
> +
> +src_reg::src_reg(class vec4_god *v, const struct glsl_type *type, int size)
> +{
> +   assert(size > 0);
> +
> +   init();
> +
> +   this->file = GRF;
> +   this->reg = v->alloc.allocate(type_size(type) * size);
> +
> +   this->swizzle = BRW_SWIZZLE_NOOP;
> +
> +   this->type = brw_type_for_base_type(type);
> +}
> +
> +dst_reg::dst_reg(class vec4_god *v, const struct glsl_type *type)
> +{
> +   init();
> +
> +   this->file = GRF;
> +   this->reg = v->alloc.allocate(type_size(type));
> +
> +   if (type->is_array() || type->is_record()) {
> +      this->writemask = WRITEMASK_XYZW;
> +   } else {
> +      this->writemask = (1 << type->vector_elements) - 1;
> +   }
> +
> +   this->type = brw_type_for_base_type(type);
> +}
> +
> +/* Our support for uniforms is piggy-backed on the struct
> + * gl_fragment_program, because that's where the values actually
> + * get stored, rather than in some global gl_shader_program uniform
> + * store.
> + */
> +void
> +vec4_god::setup_uniform_values(ir_variable *ir)
> +{
> +   int namelen = strlen(ir->name);
> +
> +   /* The data for our (non-builtin) uniforms is stored in a series of
> +    * gl_uniform_driver_storage structs for each subcomponent that
> +    * glGetUniformLocation() could name.  We know it's been set up in the same
> +    * order we'd walk the type, so walk the list of storage and find anything
> +    * with our name, or the prefix of a component that starts with our name.
> +    */
> +   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
> +      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
> +
> +      if (strncmp(ir->name, storage->name, namelen) != 0 ||
> +          (storage->name[namelen] != 0 &&
> +           storage->name[namelen] != '.' &&
> +           storage->name[namelen] != '[')) {
> +         continue;
> +      }
> +
> +      gl_constant_value *components = storage->storage;
> +      unsigned vector_count = (MAX2(storage->array_elements, 1) *
> +                               storage->type->matrix_columns);
> +
> +      for (unsigned s = 0; s < vector_count; s++) {
> +         assert(uniforms < uniform_array_size);
> +         uniform_vector_size[uniforms] = storage->type->vector_elements;
> +
> +         int i;
> +         for (i = 0; i < uniform_vector_size[uniforms]; i++) {
> +            stage_prog_data->param[uniforms * 4 + i] = components;
> +            components++;
> +         }
> +         for (; i < 4; i++) {
> +            static gl_constant_value zero = { 0.0 };
> +            stage_prog_data->param[uniforms * 4 + i] = &zero;
> +         }
> +
> +         uniforms++;
> +      }
> +   }
> +}
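
For example (names hypothetical), with ir->name == "light" the
strncmp-plus-separator check above gives:

   "light"      -> match  (next char is '\0')
   "light.pos"  -> match  ('.')
   "light[2]"   -> match  ('[')
   "lightmap"   -> skip   ('m' is none of '\0', '.', '[')
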
> +
> +void
> +vec4_god::setup_uniform_clipplane_values()
> +{
> +   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
> +
> +   for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
> +      assert(this->uniforms < uniform_array_size);
> +      this->uniform_vector_size[this->uniforms] = 4;
> +      this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
> +      this->userplane[i].type = BRW_REGISTER_TYPE_F;
> +      for (int j = 0; j < 4; ++j) {
> +         stage_prog_data->param[this->uniforms * 4 + j] =
> +            (gl_constant_value *) &clip_planes[i][j];
> +      }
> +      ++this->uniforms;
> +   }
> +}
> +
> +/* Our support for builtin uniforms is even scarier than non-builtin.
> + * It sits on top of the PROG_STATE_VAR parameters that are
> + * automatically updated from GL context state.
> + */
> +void
> +vec4_god::setup_builtin_uniform_values(ir_variable *ir)
> +{
> +   const ir_state_slot *const slots = ir->get_state_slots();
> +   assert(slots != NULL);
> +
> +   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
> +      /* This state reference has already been set up by ir_to_mesa,
> +       * but we'll get the same index back here.  We can reference
> +       * ParameterValues directly, since unlike brw_fs.cpp, we never
> +       * add new state references during compile.
> +       */
> +      int index = _mesa_add_state_reference(this->prog->Parameters,
> +                                           (gl_state_index *)slots[i].tokens);
> +      gl_constant_value *values =
> +         &this->prog->Parameters->ParameterValues[index][0];
> +
> +      assert(this->uniforms < uniform_array_size);
> +
> +      for (unsigned j = 0; j < 4; j++)
> +        stage_prog_data->param[this->uniforms * 4 + j] =
> +            &values[GET_SWZ(slots[i].swizzle, j)];
> +
> +      this->uniform_vector_size[this->uniforms] =
> +         (ir->type->is_scalar() || ir->type->is_vector() ||
> +          ir->type->is_matrix() ? ir->type->vector_elements : 4);
> +
> +      this->uniforms++;
> +   }
> +}
> +
> +dst_reg *
> +vec4_god::variable_storage(ir_variable *var)
> +{
> +   return (dst_reg *)hash_table_find(this->variable_ht, var);
> +}
> +
> +void
> +vec4_god::emit_bool_to_cond_code(ir_rvalue *ir,
> +                                     enum brw_predicate *predicate)
> +{
> +   ir_expression *expr = ir->as_expression();
> +
> +   *predicate = BRW_PREDICATE_NORMAL;
> +
> +   if (expr && expr->operation != ir_binop_ubo_load) {
> +      src_reg op[3];
> +      vec4_instruction *inst;
> +
> +      assert(expr->get_num_operands() <= 3);
> +      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> +        expr->operands[i]->accept(this);
> +        op[i] = this->result;
> +
> +        resolve_ud_negate(&op[i]);
> +      }
> +
> +      switch (expr->operation) {
> +      case ir_unop_logic_not:
> +        inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
> +        inst->conditional_mod = BRW_CONDITIONAL_Z;
> +        break;
> +
> +      case ir_binop_logic_xor:
> +         if (brw->gen <= 5) {
> +            src_reg temp = src_reg(this, ir->type);
> +            emit(XOR(dst_reg(temp), op[0], op[1]));
> +            inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> +         } else {
> +            inst = emit(XOR(dst_null_d(), op[0], op[1]));
> +         }
> +         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +        break;
> +
> +      case ir_binop_logic_or:
> +         if (brw->gen <= 5) {
> +            src_reg temp = src_reg(this, ir->type);
> +            emit(OR(dst_reg(temp), op[0], op[1]));
> +            inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> +         } else {
> +            inst = emit(OR(dst_null_d(), op[0], op[1]));
> +         }
> +         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +        break;
> +
> +      case ir_binop_logic_and:
> +         if (brw->gen <= 5) {
> +            src_reg temp = src_reg(this, ir->type);
> +            emit(AND(dst_reg(temp), op[0], op[1]));
> +            inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> +         } else {
> +            inst = emit(AND(dst_null_d(), op[0], op[1]));
> +         }
> +         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +        break;
> +
> +      case ir_unop_f2b:
> +        if (brw->gen >= 6) {
> +           emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> +        } else {
> +           inst = emit(MOV(dst_null_f(), op[0]));
> +           inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +        }
> +        break;
> +
> +      case ir_unop_i2b:
> +        if (brw->gen >= 6) {
> +           emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> +        } else {
> +           inst = emit(MOV(dst_null_d(), op[0]));
> +           inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +        }
> +        break;
> +
> +      case ir_binop_all_equal:
> +         if (brw->gen <= 5) {
> +            resolve_bool_comparison(expr->operands[0], &op[0]);
> +            resolve_bool_comparison(expr->operands[1], &op[1]);
> +         }
> +        inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> +        *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
> +        break;
> +
> +      case ir_binop_any_nequal:
> +         if (brw->gen <= 5) {
> +            resolve_bool_comparison(expr->operands[0], &op[0]);
> +            resolve_bool_comparison(expr->operands[1], &op[1]);
> +         }
> +        inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> +        *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> +        break;
> +
> +      case ir_unop_any:
> +         if (brw->gen <= 5) {
> +            resolve_bool_comparison(expr->operands[0], &op[0]);
> +         }
> +        inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> +        *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> +        break;
> +
> +      case ir_binop_greater:
> +      case ir_binop_gequal:
> +      case ir_binop_less:
> +      case ir_binop_lequal:
> +      case ir_binop_equal:
> +      case ir_binop_nequal:
> +         if (brw->gen <= 5) {
> +            resolve_bool_comparison(expr->operands[0], &op[0]);
> +            resolve_bool_comparison(expr->operands[1], &op[1]);
> +         }
> +        emit(CMP(dst_null_d(), op[0], op[1],
> +                 brw_conditional_for_comparison(expr->operation)));
> +        break;
> +
> +      case ir_triop_csel: {
> +         /* Expand the boolean condition into the flag register. */
> +         inst = emit(MOV(dst_null_d(), op[0]));
> +         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +
> +         /* Select which boolean to return. */
> +         dst_reg temp(this, expr->operands[1]->type);
> +         inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
> +         inst->predicate = BRW_PREDICATE_NORMAL;
> +
> +         /* Expand the result to a condition code. */
> +         inst = emit(MOV(dst_null_d(), src_reg(temp)));
> +         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +         break;
> +      }
> +
> +      default:
> +        unreachable("not reached");
> +      }
> +      return;
> +   }
> +
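> +   /* Fallback for conditions that aren't foldable expressions: evaluate
> +    * the boolean normally and set the flag from its low bit.
> +    */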
> +   ir->accept(this);
> +
> +   resolve_ud_negate(&this->result);
> +
> +   vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
> +   inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +}
> +
> +/**
> + * Emit a gen6 IF statement with the comparison folded into the IF
> + * instruction.
> + */
> +void
> +vec4_god::emit_if_gen6(ir_if *ir)
> +{
> +   ir_expression *expr = ir->condition->as_expression();
> +
> +   if (expr && expr->operation != ir_binop_ubo_load) {
> +      src_reg op[3];
> +      dst_reg temp;
> +
> +      assert(expr->get_num_operands() <= 3);
> +      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> +        expr->operands[i]->accept(this);
> +        op[i] = this->result;
> +      }
> +
> +      switch (expr->operation) {
> +      case ir_unop_logic_not:
> +        emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
> +        return;
> +
> +      case ir_binop_logic_xor:
> +        emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
> +        return;
> +
> +      case ir_binop_logic_or:
> +        temp = dst_reg(this, glsl_type::bool_type);
> +        emit(OR(temp, op[0], op[1]));
> +        emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> +        return;
> +
> +      case ir_binop_logic_and:
> +        temp = dst_reg(this, glsl_type::bool_type);
> +        emit(AND(temp, op[0], op[1]));
> +        emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> +        return;
> +
> +      case ir_unop_f2b:
> +        emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> +        return;
> +
> +      case ir_unop_i2b:
> +        emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> +        return;
> +
> +      case ir_binop_greater:
> +      case ir_binop_gequal:
> +      case ir_binop_less:
> +      case ir_binop_lequal:
> +      case ir_binop_equal:
> +      case ir_binop_nequal:
> +        emit(IF(op[0], op[1],
> +                brw_conditional_for_comparison(expr->operation)));
> +        return;
> +
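> +      /* For the vector comparisons below, CMP writes one flag bit per
> +       * channel; the ALL4H/ANY4H predicates on the IF then test whether
> +       * all (or any) of the four channels of the vec4 passed.
> +       */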
> +      case ir_binop_all_equal:
> +        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> +        emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
> +        return;
> +
> +      case ir_binop_any_nequal:
> +        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> +        emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
> +        return;
> +
> +      case ir_unop_any:
> +        emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> +        emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
> +        return;
> +
> +      case ir_triop_csel: {
> +         /* Expand the boolean condition into the flag register. */
> +         vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
> +         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> +
> +         /* Select which boolean to return. */
> +         dst_reg temp(this, expr->operands[1]->type);
> +         inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
> +         inst->predicate = BRW_PREDICATE_NORMAL;
> +
> +         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> +         return;
> +      }
> +
> +      default:
> +        unreachable("not reached");
> +      }
> +      return;
> +   }
> +
> +   ir->condition->accept(this);
> +
> +   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
> +}
> +
> +void
> +vec4_god::visit(ir_variable *ir)
> +{
> +   dst_reg *reg = NULL;
> +
> +   if (variable_storage(ir))
> +      return;
> +
> +   switch (ir->data.mode) {
> +   case ir_var_shader_in:
> +      assert(ir->data.location != -1);
> +      reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
> +      break;
> +
> +   case ir_var_shader_out:
> +      assert(ir->data.location != -1);
> +      reg = new(mem_ctx) dst_reg(this, ir->type);
> +
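> +      /* An output larger than a vec4 (e.g. a matrix or array) occupies
> +       * consecutive varying slots; record one output_reg entry per slot
> +       * so the code emitting the URB writes can find each piece later.
> +       */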
> +      for (int i = 0; i < type_size(ir->type); i++) {
> +        output_reg[ir->data.location + i] = *reg;
> +        output_reg[ir->data.location + i].reg_offset = i;
> +        output_reg[ir->data.location + i].type =
> +            brw_type_for_base_type(ir->type->get_scalar_type());
> +        output_reg_annotation[ir->data.location + i] = ir->name;
> +      }
> +      break;
> +
> +   case ir_var_auto:
> +   case ir_var_temporary:
> +      reg = new(mem_ctx) dst_reg(this, ir->type);
> +      break;
> +
> +   case ir_var_uniform:
> +      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
> +
> +      /* Thanks to the lower_ubo_reference pass, we will see only
> +       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
> +       * variables, so no need for them to be in variable_ht.
> +       *
> +       * Some uniforms, such as samplers and atomic counters, have no actual
> +       * storage, so we should ignore them.
> +       */
> +      if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
> +         return;
> +
> +      /* Track how big the whole uniform variable is, in case we need to put a
> +       * copy of its data into pull constants for array access.
> +       */
> +      assert(this->uniforms < uniform_array_size);
> +      this->uniform_size[this->uniforms] = type_size(ir->type);
> +
> +      if (!strncmp(ir->name, "gl_", 3)) {
> +        setup_builtin_uniform_values(ir);
> +      } else {
> +        setup_uniform_values(ir);
> +      }
> +      break;
> +
> +   case ir_var_system_value:
> +      reg = make_reg_for_system_value(ir);
> +      break;
> +
> +   default:
> +      unreachable("not reached");
> +   }
> +
> +   reg->type = brw_type_for_base_type(ir->type);
> +   hash_table_insert(this->variable_ht, reg, ir);
> +}
> +
> +void
> +vec4_god::visit(ir_loop *ir)
> +{
> +   /* We don't want debugging output to print the whole body of the
> +    * loop as the annotation.
> +    */
> +   this->base_ir = NULL;
> +
> +   emit(BRW_OPCODE_DO);
> +
> +   visit_instructions(&ir->body_instructions);
> +
> +   emit(BRW_OPCODE_WHILE);
> +}
> +
> +void
> +vec4_god::visit(ir_loop_jump *ir)
> +{
> +   switch (ir->mode) {
> +   case ir_loop_jump::jump_break:
> +      emit(BRW_OPCODE_BREAK);
> +      break;
> +   case ir_loop_jump::jump_continue:
> +      emit(BRW_OPCODE_CONTINUE);
> +      break;
> +   }
> +}
> +
> +
> +void
> +vec4_god::visit(ir_function_signature *)
> +{
> +   unreachable("not reached");
> +}
> +
> +void
> +vec4_god::visit(ir_function *ir)
> +{
> +   /* Ignore function bodies other than main() -- we shouldn't see calls to
> +    * them since they should all be inlined.
> +    */
> +   if (strcmp(ir->name, "main") == 0) {
> +      const ir_function_signature *sig;
> +      exec_list empty;
> +
> +      sig = ir->matching_signature(NULL, &empty, false);
> +
> +      assert(sig);
> +
> +      visit_instructions(&sig->body);
> +   }
> +}
> +
> +bool
> +vec4_god::try_emit_mad(ir_expression *ir)
> +{
> +   /* 3-src instructions were introduced in gen6. */
> +   if (brw->gen < 6)
> +      return false;
> +
> +   /* MAD can only handle floating-point data. */
> +   if (ir->type->base_type != GLSL_TYPE_FLOAT)
> +      return false;
> +
> +   ir_rvalue *nonmul;
> +   ir_expression *mul;
> +   bool mul_negate, mul_abs;
> +
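> +   /* Look at both operands of the add for a multiply, possibly wrapped
> +    * in a single negate or abs; whichever operand isn't the multiply
> +    * becomes the MAD's addend.
> +    */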
> +   for (int i = 0; i < 2; i++) {
> +      mul_negate = false;
> +      mul_abs = false;
> +
> +      mul = ir->operands[i]->as_expression();
> +      nonmul = ir->operands[1 - i];
> +
> +      if (mul && mul->operation == ir_unop_abs) {
> +         mul = mul->operands[0]->as_expression();
> +         mul_abs = true;
> +      } else if (mul && mul->operation == ir_unop_neg) {
> +         mul = mul->operands[0]->as_expression();
> +         mul_negate = true;
> +      }
> +
> +      if (mul && mul->operation == ir_binop_mul)
> +         break;
> +   }
> +
> +   if (!mul || mul->operation != ir_binop_mul)
> +      return false;
> +
> +   nonmul->accept(this);
> +   src_reg src0 = fix_3src_operand(this->result);
> +
> +   mul->operands[0]->accept(this);
> +   src_reg src1 = fix_3src_operand(this->result);
> +   src1.negate ^= mul_negate;
> +   src1.abs = mul_abs;
> +   if (mul_abs)
> +      src1.negate = false;
> +
> +   mul->operands[1]->accept(this);
> +   src_reg src2 = fix_3src_operand(this->result);
> +   src2.abs = mul_abs;
> +   if (mul_abs)
> +      src2.negate = false;
> +
> +   this->result = src_reg(this, ir->type);
> +   emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
> +
> +   return true;
> +}
> +
> +bool
> +vec4_god::try_emit_b2f_of_compare(ir_expression *ir)
> +{
> +   /* This optimization relies on CMP setting the destination to 0 when
> +    * false.  Early hardware only sets the least significant bit, and
> +    * leaves the other bits undefined.  So we can't use it.
> +    */
> +   if (brw->gen < 6)
> +      return false;
> +
> +   ir_expression *const cmp = ir->operands[0]->as_expression();
> +
> +   if (cmp == NULL)
> +      return false;
> +
> +   switch (cmp->operation) {
> +   case ir_binop_less:
> +   case ir_binop_greater:
> +   case ir_binop_lequal:
> +   case ir_binop_gequal:
> +   case ir_binop_equal:
> +   case ir_binop_nequal:
> +      break;
> +
> +   default:
> +      return false;
> +   }
> +
> +   cmp->operands[0]->accept(this);
> +   const src_reg cmp_src0 = this->result;
> +
> +   cmp->operands[1]->accept(this);
> +   const src_reg cmp_src1 = this->result;
> +
> +   this->result = src_reg(this, ir->type);
> +
> +   emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
> +            brw_conditional_for_comparison(cmp->operation)));
> +
> +   /* If the comparison is false, this->result will just happen to be zero.
> +    */
> +   vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
> +                                       this->result, src_reg(1.0f));
> +   inst->predicate = BRW_PREDICATE_NORMAL;
> +   inst->predicate_inverse = true;
> +
> +   return true;
> +}
> +
> +void
> +vec4_god::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
> +                          src_reg src0, src_reg src1)
> +{
> +   vec4_instruction *inst;
> +
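> +   /* Gen6+ can do min/max as a single SEL with a conditional mod; older
> +    * hardware needs an explicit CMP followed by a predicated SEL.
> +    */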
> +   if (brw->gen >= 6) {
> +      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> +      inst->conditional_mod = conditionalmod;
> +   } else {
> +      emit(CMP(dst, src0, src1, conditionalmod));
> +
> +      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> +      inst->predicate = BRW_PREDICATE_NORMAL;
> +   }
> +}
> +
> +void
> +vec4_god::emit_lrp(const dst_reg &dst,
> +                       const src_reg &x, const src_reg &y, const src_reg &a)
> +{
> +   if (brw->gen >= 6) {
> +      /* Note that the instruction's argument order is reversed from GLSL
> +       * and the IR.
> +       */
> +      emit(LRP(dst,
> +               fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
> +   } else {
> +      /* Earlier generations don't support three source operations, so we
> +       * need to emit x*(1-a) + y*a.
> +       */
> +      dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
> +      dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
> +      dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
> +      y_times_a.writemask           = dst.writemask;
> +      one_minus_a.writemask         = dst.writemask;
> +      x_times_one_minus_a.writemask = dst.writemask;
> +
> +      emit(MUL(y_times_a, y, a));
> +      emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
> +      emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
> +      emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
> +   }
> +}
> +
> +void
> +vec4_god::visit(ir_expression *ir)
> +{
> +   unsigned int operand;
> +   src_reg op[ARRAY_SIZE(ir->operands)];
> +   vec4_instruction *inst;
> +
> +   if (ir->operation == ir_binop_add) {
> +      if (try_emit_mad(ir))
> +        return;
> +   }
> +
> +   if (ir->operation == ir_unop_b2f) {
> +      if (try_emit_b2f_of_compare(ir))
> +        return;
> +   }
> +
> +   /* Storage for our result.  Ideally, for an assignment we'd use the
> +    * actual storage for the result here instead.
> +    */
> +   dst_reg result_dst(this, ir->type);
> +   src_reg result_src(result_dst);
> +
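> +   /* csel gets special treatment before the generic operand walk: its
> +    * condition needs to end up in the flag register as a predicate
> +    * rather than being evaluated into a GRF like an ordinary operand.
> +    */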
> +   if (ir->operation == ir_triop_csel) {
> +      ir->operands[1]->accept(this);
> +      op[1] = this->result;
> +      ir->operands[2]->accept(this);
> +      op[2] = this->result;
> +
> +      enum brw_predicate predicate;
> +      emit_bool_to_cond_code(ir->operands[0], &predicate);
> +      inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
> +      inst->predicate = predicate;
> +      this->result = result_src;
> +      return;
> +   }
> +
> +   for (operand = 0; operand < ir->get_num_operands(); operand++) {
> +      this->result.file = BAD_FILE;
> +      ir->operands[operand]->accept(this);
> +      if (this->result.file == BAD_FILE) {
> +        fprintf(stderr, "Failed to get tree for expression operand:\n");
> +        ir->operands[operand]->fprint(stderr);
> +        exit(1);
> +      }
> +      op[operand] = this->result;
> +
> +      /* Matrix expression operands should have been broken down to vector
> +       * operations already.
> +       */
> +      assert(!ir->operands[operand]->type->is_matrix());
> +   }
> +
> +   /* If nothing special happens, this is the result. */
> +   this->result = result_src;
> +
> +   switch (ir->operation) {
> +   case ir_unop_logic_not:
> +      emit(NOT(result_dst, op[0]));
> +      break;
> +   case ir_unop_neg:
> +      op[0].negate = !op[0].negate;
> +      emit(MOV(result_dst, op[0]));
> +      break;
> +   case ir_unop_abs:
> +      op[0].abs = true;
> +      op[0].negate = false;
> +      emit(MOV(result_dst, op[0]));
> +      break;
> +
> +   case ir_unop_sign:
> +      if (ir->type->is_float()) {
> +         /* AND(val, 0x80000000) gives the sign bit.
> +          *
> +          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
> +          * zero.
> +          */
> +         emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> +
> +         op[0].type = BRW_REGISTER_TYPE_UD;
> +         result_dst.type = BRW_REGISTER_TYPE_UD;
> +         emit(AND(result_dst, op[0], src_reg(0x80000000u)));
> +
> +         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
> +         inst->predicate = BRW_PREDICATE_NORMAL;
> +
> +         this->result.type = BRW_REGISTER_TYPE_F;
> +      } else {
> +         /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
> +          *               -> non-negative val generates 0x00000000.
> +          *  Predicated OR sets 1 if val is positive.
> +          */
> +         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
> +
> +         emit(ASR(result_dst, op[0], src_reg(31)));
> +
> +         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
> +         inst->predicate = BRW_PREDICATE_NORMAL;
> +      }
> +      break;
> +
> +   case ir_unop_rcp:
> +      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
> +      break;
> +
> +   case ir_unop_exp2:
> +      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
> +      break;
> +   case ir_unop_log2:
> +      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
> +      break;
> +   case ir_unop_exp:
> +   case ir_unop_log:
> +      unreachable("not reached: should be handled by ir_explog_to_explog2");
> +   case ir_unop_sin:
> +   case ir_unop_sin_reduced:
> +      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
> +      break;
> +   case ir_unop_cos:
> +   case ir_unop_cos_reduced:
> +      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
> +      break;
> +
> +   case ir_unop_dFdx:
> +   case ir_unop_dFdx_coarse:
> +   case ir_unop_dFdx_fine:
> +   case ir_unop_dFdy:
> +   case ir_unop_dFdy_coarse:
> +   case ir_unop_dFdy_fine:
> +      unreachable("derivatives not valid in vertex shader");
> +
> +   case ir_unop_bitfield_reverse:
> +      emit(BFREV(result_dst, op[0]));
> +      break;
> +   case ir_unop_bit_count:
> +      emit(CBIT(result_dst, op[0]));
> +      break;
> +   case ir_unop_find_msb: {
> +      src_reg temp = src_reg(this, glsl_type::uint_type);
> +
> +      inst = emit(FBH(dst_reg(temp), op[0]));
> +      inst->dst.writemask = WRITEMASK_XYZW;
> +
> +      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
> +       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
> +       * subtract the result from 31 to convert the MSB count into an LSB count.
> +       */
> +
> +      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
> +      temp.swizzle = BRW_SWIZZLE_NOOP;
> +      emit(MOV(result_dst, temp));
> +
> +      src_reg src_tmp = src_reg(result_dst);
> +      emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
> +
> +      src_tmp.negate = true;
> +      inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
> +      inst->predicate = BRW_PREDICATE_NORMAL;
> +      break;
> +   }
> +   case ir_unop_find_lsb:
> +      emit(FBL(result_dst, op[0]));
> +      break;
> +   case ir_unop_saturate:
> +      inst = emit(MOV(result_dst, op[0]));
> +      inst->saturate = true;
> +      break;
> +
> +   case ir_unop_noise:
> +      unreachable("not reached: should be handled by lower_noise");
> +
> +   case ir_binop_add:
> +      emit(ADD(result_dst, op[0], op[1]));
> +      break;
> +   case ir_binop_sub:
> +      unreachable("not reached: should be handled by ir_sub_to_add_neg");
> +
> +   case ir_binop_mul:
> +      if (brw->gen < 8 && ir->type->is_integer()) {
> +        /* For integer multiplication, the MUL uses the low 16 bits of one of
> +         * the operands (src0 on SNB and earlier, src1 on IVB and later).  The
> +         * MACH accumulates the contribution of the upper 16 bits of that
> +         * operand.  If we can determine that one of the args fits in the low
> +         * 16 bits, though, we can just emit a single MUL.
> +         */
> +         if (ir->operands[0]->is_uint16_constant()) {
> +            if (brw->gen < 7)
> +               emit(MUL(result_dst, op[0], op[1]));
> +            else
> +               emit(MUL(result_dst, op[1], op[0]));
> +         } else if (ir->operands[1]->is_uint16_constant()) {
> +            if (brw->gen < 7)
> +               emit(MUL(result_dst, op[1], op[0]));
> +            else
> +               emit(MUL(result_dst, op[0], op[1]));
> +         } else {
> +            struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> +
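> +            /* Full 32-bit multiply: MUL and MACH together compute the
> +             * 64-bit product, leaving the low 32 bits in the accumulator,
> +             * which the MOV then copies into the result.
> +             */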
> +            emit(MUL(acc, op[0], op[1]));
> +            emit(MACH(dst_null_d(), op[0], op[1]));
> +            emit(MOV(result_dst, src_reg(acc)));
> +         }
> +      } else {
> +        emit(MUL(result_dst, op[0], op[1]));
> +      }
> +      break;
> +   case ir_binop_imul_high: {
> +      struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> +
> +      emit(MUL(acc, op[0], op[1]));
> +      emit(MACH(result_dst, op[0], op[1]));
> +      break;
> +   }
> +   case ir_binop_div:
> +      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
> +      assert(ir->type->is_integer());
> +      emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
> +      break;
> +   case ir_binop_carry: {
> +      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
> +
> +      emit(ADDC(dst_null_ud(), op[0], op[1]));
> +      emit(MOV(result_dst, src_reg(acc)));
> +      break;
> +   }
> +   case ir_binop_borrow: {
> +      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
> +
> +      emit(SUBB(dst_null_ud(), op[0], op[1]));
> +      emit(MOV(result_dst, src_reg(acc)));
> +      break;
> +   }
> +   case ir_binop_mod:
> +      /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
> +      assert(ir->type->is_integer());
> +      emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
> +      break;
> +
> +   case ir_binop_less:
> +   case ir_binop_greater:
> +   case ir_binop_lequal:
> +   case ir_binop_gequal:
> +   case ir_binop_equal:
> +   case ir_binop_nequal: {
> +      if (brw->gen <= 5) {
> +         resolve_bool_comparison(ir->operands[0], &op[0]);
> +         resolve_bool_comparison(ir->operands[1], &op[1]);
> +      }
> +      emit(CMP(result_dst, op[0], op[1],
> +              brw_conditional_for_comparison(ir->operation)));
> +      break;
> +   }
> +
> +   case ir_binop_all_equal:
> +      if (brw->gen <= 5) {
> +         resolve_bool_comparison(ir->operands[0], &op[0]);
> +         resolve_bool_comparison(ir->operands[1], &op[1]);
> +      }
> +
> +      /* "==" operator producing a scalar boolean. */
> +      if (ir->operands[0]->type->is_vector() ||
> +         ir->operands[1]->type->is_vector()) {
> +        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> +        emit(MOV(result_dst, src_reg(0)));
> +         inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> +        inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
> +      } else {
> +        emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
> +      }
> +      break;
> +   case ir_binop_any_nequal:
> +      if (brw->gen <= 5) {
> +         resolve_bool_comparison(ir->operands[0], &op[0]);
> +         resolve_bool_comparison(ir->operands[1], &op[1]);
> +      }
> +
> +      /* "!=" operator producing a scalar boolean. */
> +      if (ir->operands[0]->type->is_vector() ||
> +         ir->operands[1]->type->is_vector()) {
> +        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> +
> +        emit(MOV(result_dst, src_reg(0)));
> +         inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> +        inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> +      } else {
> +        emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
> +      }
> +      break;
> +
> +   case ir_unop_any:
> +      if (brw->gen <= 5) {
> +         resolve_bool_comparison(ir->operands[0], &op[0]);
> +      }
> +      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> +      emit(MOV(result_dst, src_reg(0)));
> +
> +      inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> +      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> +      break;
> +
> +   case ir_binop_logic_xor:
> +      emit(XOR(result_dst, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_logic_or:
> +      emit(OR(result_dst, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_logic_and:
> +      emit(AND(result_dst, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_dot:
> +      assert(ir->operands[0]->type->is_vector());
> +      assert(ir->operands[0]->type == ir->operands[1]->type);
> +      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
> +      break;
> +
> +   case ir_unop_sqrt:
> +      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
> +      break;
> +   case ir_unop_rsq:
> +      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
> +      break;
> +
> +   case ir_unop_bitcast_i2f:
> +   case ir_unop_bitcast_u2f:
> +      this->result = op[0];
> +      this->result.type = BRW_REGISTER_TYPE_F;
> +      break;
> +
> +   case ir_unop_bitcast_f2i:
> +      this->result = op[0];
> +      this->result.type = BRW_REGISTER_TYPE_D;
> +      break;
> +
> +   case ir_unop_bitcast_f2u:
> +      this->result = op[0];
> +      this->result.type = BRW_REGISTER_TYPE_UD;
> +      break;
> +
> +   case ir_unop_i2f:
> +   case ir_unop_i2u:
> +   case ir_unop_u2i:
> +   case ir_unop_u2f:
> +   case ir_unop_f2i:
> +   case ir_unop_f2u:
> +      emit(MOV(result_dst, op[0]));
> +      break;
> +   case ir_unop_b2i:
> +      emit(AND(result_dst, op[0], src_reg(1)));
> +      break;
> +   case ir_unop_b2f:
> +      if (brw->gen <= 5) {
> +         resolve_bool_comparison(ir->operands[0], &op[0]);
> +      }
> +      op[0].type = BRW_REGISTER_TYPE_D;
> +      result_dst.type = BRW_REGISTER_TYPE_D;
> +      emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
> +      result_dst.type = BRW_REGISTER_TYPE_F;
> +      break;
> +   case ir_unop_f2b:
> +      emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> +      break;
> +   case ir_unop_i2b:
> +      emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> +      break;
> +
> +   case ir_unop_trunc:
> +      emit(RNDZ(result_dst, op[0]));
> +      break;
> +   case ir_unop_ceil: {
> +         src_reg tmp = src_reg(this, ir->type);
> +         op[0].negate = !op[0].negate;
> +         emit(RNDD(dst_reg(tmp), op[0]));
> +         tmp.negate = true;
> +         emit(MOV(result_dst, tmp));
> +      }
> +      break;
> +   case ir_unop_floor:
> +      inst = emit(RNDD(result_dst, op[0]));
> +      break;
> +   case ir_unop_fract:
> +      inst = emit(FRC(result_dst, op[0]));
> +      break;
> +   case ir_unop_round_even:
> +      emit(RNDE(result_dst, op[0]));
> +      break;
> +
> +   case ir_binop_min:
> +      emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
> +      break;
> +   case ir_binop_max:
> +      emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
> +      break;
> +
> +   case ir_binop_pow:
> +      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
> +      break;
> +
> +   case ir_unop_bit_not:
> +      inst = emit(NOT(result_dst, op[0]));
> +      break;
> +   case ir_binop_bit_and:
> +      inst = emit(AND(result_dst, op[0], op[1]));
> +      break;
> +   case ir_binop_bit_xor:
> +      inst = emit(XOR(result_dst, op[0], op[1]));
> +      break;
> +   case ir_binop_bit_or:
> +      inst = emit(OR(result_dst, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_lshift:
> +      inst = emit(SHL(result_dst, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_rshift:
> +      if (ir->type->base_type == GLSL_TYPE_INT)
> +         inst = emit(ASR(result_dst, op[0], op[1]));
> +      else
> +         inst = emit(SHR(result_dst, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_bfm:
> +      emit(BFI1(result_dst, op[0], op[1]));
> +      break;
> +
> +   case ir_binop_ubo_load: {
> +      ir_constant *const_uniform_block = ir->operands[0]->as_constant();
> +      ir_constant *const_offset_ir = ir->operands[1]->as_constant();
> +      unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
> +      src_reg offset;
> +
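> +      /* Overall plan: build a surface index for the UBO's binding table
> +       * entry, compute the offset in vec4 (16-byte) units, issue a pull
> +       * constant load, and finally swizzle out the requested components.
> +       */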
> +      /* Now, load the vector from that offset. */
> +      assert(ir->type->is_vector() || ir->type->is_scalar());
> +
> +      src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
> +      packed_consts.type = result.type;
> +      src_reg surf_index;
> +
> +      if (const_uniform_block) {
> +         /* The block index is a constant, so just emit the binding table entry
> +          * as an immediate.
> +          */
> +         surf_index = src_reg(prog_data->base.binding_table.ubo_start +
> +                              const_uniform_block->value.u[0]);
> +      } else {
> +         /* The block index is not a constant. Evaluate the index expression
> +          * per-channel and add the base UBO index; the generator will select
> +          * a value from any live channel.
> +          */
> +         surf_index = src_reg(this, glsl_type::uint_type);
> +         emit(ADD(dst_reg(surf_index), op[0],
> +                  src_reg(prog_data->base.binding_table.ubo_start)));
> +
> +         /* Assume this may touch any UBO. It would be nice to provide
> +          * a tighter bound, but the array information is already lowered away.
> +          */
> +         brw_mark_surface_used(&prog_data->base,
> +                               prog_data->base.binding_table.ubo_start +
> +                               shader_prog->NumUniformBlocks - 1);
> +      }
> +
> +      if (const_offset_ir) {
> +         if (brw->gen >= 8) {
> +            /* Store the offset in a GRF so we can send-from-GRF. */
> +            offset = src_reg(this, glsl_type::int_type);
> +            emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
> +         } else {
> +            /* Immediates are fine on older generations since they'll be moved
> +             * to a (potentially fake) MRF at the generator level.
> +             */
> +            offset = src_reg(const_offset / 16);
> +         }
> +      } else {
> +         offset = src_reg(this, glsl_type::uint_type);
> +         emit(SHR(dst_reg(offset), op[1], src_reg(4)));
> +      }
> +
> +      if (brw->gen >= 7) {
> +         dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
> +
> +         /* We have to use a message header on Skylake to get SIMD4x2 mode.
> +          * Reserve space for the register.
> +          */
> +         if (brw->gen >= 9) {
> +            grf_offset.reg_offset++;
> +            alloc.sizes[grf_offset.reg] = 2;
> +         }
> +
> +         grf_offset.type = offset.type;
> +
> +         emit(MOV(grf_offset, offset));
> +
> +         vec4_instruction *pull =
> +            emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
> +                                               dst_reg(packed_consts),
> +                                               surf_index,
> +                                               src_reg(grf_offset)));
> +         pull->mlen = 1;
> +      } else {
> +         vec4_instruction *pull =
> +            emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
> +                                               dst_reg(packed_consts),
> +                                               surf_index,
> +                                               offset));
> +         pull->base_mrf = 14;
> +         pull->mlen = 1;
> +      }
> +
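> +      /* The load returned a whole vec4; offsets within those 16 bytes
> +       * are handled by picking the starting component with a swizzle
> +       * (const_offset % 16 / 4).
> +       */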
> +      packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> +      packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
> +                                            const_offset % 16 / 4,
> +                                            const_offset % 16 / 4,
> +                                            const_offset % 16 / 4);
> +
> +      /* UBO bools are any nonzero int.  We need to convert them to use the
> +       * value of true stored in ctx->Const.UniformBooleanTrue.
> +       */
> +      if (ir->type->base_type == GLSL_TYPE_BOOL) {
> +         emit(CMP(result_dst, packed_consts, src_reg(0u),
> +                  BRW_CONDITIONAL_NZ));
> +      } else {
> +         emit(MOV(result_dst, packed_consts));
> +      }
> +      break;
> +   }
> +
> +   case ir_binop_vector_extract:
> +      unreachable("should have been lowered by vec_index_to_cond_assign");
> +
> +   case ir_triop_fma:
> +      op[0] = fix_3src_operand(op[0]);
> +      op[1] = fix_3src_operand(op[1]);
> +      op[2] = fix_3src_operand(op[2]);
> +      /* Note that the instruction's argument order is reversed from GLSL
> +       * and the IR.
> +       */
> +      emit(MAD(result_dst, op[2], op[1], op[0]));
> +      break;
> +
> +   case ir_triop_lrp:
> +      emit_lrp(result_dst, op[0], op[1], op[2]);
> +      break;
> +
> +   case ir_triop_csel:
> +      unreachable("already handled above");
> +      break;
> +
> +   case ir_triop_bfi:
> +      op[0] = fix_3src_operand(op[0]);
> +      op[1] = fix_3src_operand(op[1]);
> +      op[2] = fix_3src_operand(op[2]);
> +      emit(BFI2(result_dst, op[0], op[1], op[2]));
> +      break;
> +
> +   case ir_triop_bitfield_extract:
> +      op[0] = fix_3src_operand(op[0]);
> +      op[1] = fix_3src_operand(op[1]);
> +      op[2] = fix_3src_operand(op[2]);
> +      /* Note that the instruction's argument order is reversed from GLSL
> +       * and the IR.
> +       */
> +      emit(BFE(result_dst, op[2], op[1], op[0]));
> +      break;
> +
> +   case ir_triop_vector_insert:
> +      unreachable("should have been lowered by lower_vector_insert");
> +
> +   case ir_quadop_bitfield_insert:
> +      unreachable("not reached: should be handled by "
> +              "bitfield_insert_to_bfm_bfi\n");
> +
> +   case ir_quadop_vector:
> +      unreachable("not reached: should be handled by lower_quadop_vector");
> +
> +   case ir_unop_pack_half_2x16:
> +      emit_pack_half_2x16(result_dst, op[0]);
> +      break;
> +   case ir_unop_unpack_half_2x16:
> +      emit_unpack_half_2x16(result_dst, op[0]);
> +      break;
> +   case ir_unop_unpack_unorm_4x8:
> +      emit_unpack_unorm_4x8(result_dst, op[0]);
> +      break;
> +   case ir_unop_unpack_snorm_4x8:
> +      emit_unpack_snorm_4x8(result_dst, op[0]);
> +      break;
> +   case ir_unop_pack_unorm_4x8:
> +      emit_pack_unorm_4x8(result_dst, op[0]);
> +      break;
> +   case ir_unop_pack_snorm_4x8:
> +      emit_pack_snorm_4x8(result_dst, op[0]);
> +      break;
> +   case ir_unop_pack_snorm_2x16:
> +   case ir_unop_pack_unorm_2x16:
> +   case ir_unop_unpack_snorm_2x16:
> +   case ir_unop_unpack_unorm_2x16:
> +      unreachable("not reached: should be handled by lower_packing_builtins");
> +   case ir_unop_unpack_half_2x16_split_x:
> +   case ir_unop_unpack_half_2x16_split_y:
> +   case ir_binop_pack_half_2x16_split:
> +   case ir_unop_interpolate_at_centroid:
> +   case ir_binop_interpolate_at_sample:
> +   case ir_binop_interpolate_at_offset:
> +      unreachable("not reached: should not occur in vertex shader");
> +   case ir_binop_ldexp:
> +      unreachable("not reached: should be handled by ldexp_to_arith()");
> +   case ir_unop_d2f:
> +   case ir_unop_f2d:
> +   case ir_unop_d2i:
> +   case ir_unop_i2d:
> +   case ir_unop_d2u:
> +   case ir_unop_u2d:
> +   case ir_unop_d2b:
> +   case ir_unop_pack_double_2x32:
> +   case ir_unop_unpack_double_2x32:
> +   case ir_unop_frexp_sig:
> +   case ir_unop_frexp_exp:
> +      unreachable("fp64 todo");
> +   }
> +}
> +
> +
> +void
> +vec4_god::visit(ir_swizzle *ir)
> +{
> +   /* Note that this is only swizzles in expressions, not those on the left
> +    * hand side of an assignment, which do write masking.  See ir_assignment
> +    * for that.
> +    */
> +   const unsigned swz = brw_compose_swizzle(
> +      brw_swizzle_for_size(ir->type->vector_elements),
> +      BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
> +
> +   ir->val->accept(this);
> +   this->result = swizzle(this->result, swz);
> +}
> +
> +void
> +vec4_god::visit(ir_dereference_variable *ir)
> +{
> +   const struct glsl_type *type = ir->type;
> +   dst_reg *reg = variable_storage(ir->var);
> +
> +   if (!reg) {
> +      fail("Failed to find variable storage for %s\n", ir->var->name);
> +      this->result = src_reg(brw_null_reg());
> +      return;
> +   }
> +
> +   this->result = src_reg(*reg);
> +
> +   /* System values get their swizzle from the dst_reg writemask */
> +   if (ir->var->data.mode == ir_var_system_value)
> +      return;
> +
> +   if (type->is_scalar() || type->is_vector() || type->is_matrix())
> +      this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
> +}
> +
> +
> +int
> +vec4_god::compute_array_stride(ir_dereference_array *ir)
> +{
> +   /* Under normal circumstances array elements are stored consecutively, so
> +    * the stride is equal to the size of the array element.
> +    */
> +   return type_size(ir->type);
> +}
> +
> +
> +void
> +vec4_god::visit(ir_dereference_array *ir)
> +{
> +   ir_constant *constant_index;
> +   src_reg src;
> +   int array_stride = compute_array_stride(ir);
> +
> +   constant_index = ir->array_index->constant_expression_value();
> +
> +   ir->array->accept(this);
> +   src = this->result;
> +
> +   if (constant_index) {
> +      src.reg_offset += constant_index->value.i[0] * array_stride;
> +   } else {
> +      /* Variable index array dereference.  It consumes the "vec4" base of
> +       * the array plus an index register that offsets the Mesa register
> +       * index.
> +       */
> +      ir->array_index->accept(this);
> +
> +      src_reg index_reg;
> +
> +      if (array_stride == 1) {
> +        index_reg = this->result;
> +      } else {
> +        index_reg = src_reg(this, glsl_type::int_type);
> +
> +        emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
> +      }
> +
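> +      /* If the array base was itself variably indexed, there is already
> +       * a reladdr on the source; fold the two indices together with an
> +       * ADD.
> +       */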
> +      if (src.reladdr) {
> +        src_reg temp = src_reg(this, glsl_type::int_type);
> +
> +        emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
> +
> +        index_reg = temp;
> +      }
> +
> +      src.reladdr = ralloc(mem_ctx, src_reg);
> +      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
> +   }
> +
> +   /* If the type is smaller than a vec4, replicate the last channel out. */
> +   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
> +      src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> +   else
> +      src.swizzle = BRW_SWIZZLE_NOOP;
> +   src.type = brw_type_for_base_type(ir->type);
> +
> +   this->result = src;
> +}
> +
> +void
> +vec4_god::visit(ir_dereference_record *ir)
> +{
> +   unsigned int i;
> +   const glsl_type *struct_type = ir->record->type;
> +   int offset = 0;
> +
> +   ir->record->accept(this);
> +
> +   for (i = 0; i < struct_type->length; i++) {
> +      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
> +        break;
> +      offset += type_size(struct_type->fields.structure[i].type);
> +   }
> +
> +   /* If the type is smaller than a vec4, replicate the last channel out. */
> +   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
> +      this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> +   else
> +      this->result.swizzle = BRW_SWIZZLE_NOOP;
> +   this->result.type = brw_type_for_base_type(ir->type);
> +
> +   this->result.reg_offset += offset;
> +}
> +
> +/**
> + * We want to be careful in assignment setup to hit the actual storage
> + * instead of potentially using a temporary like we might with the
> + * ir_dereference handler.
> + */
> +static dst_reg
> +get_assignment_lhs(ir_dereference *ir, vec4_god *v)
> +{
> +   /* The LHS must be a dereference.  If the LHS is a variable indexed array
> +    * access of a vector, it must be separated into a series of conditional
> +    * moves before reaching this point (see ir_vec_index_to_cond_assign).
> +    */
> +   assert(ir->as_dereference());
> +   ir_dereference_array *deref_array = ir->as_dereference_array();
> +   if (deref_array) {
> +      assert(!deref_array->array->type->is_vector());
> +   }
> +
> +   /* Use the rvalue deref handler for the most part.  We'll ignore
> +    * swizzles in it and write swizzles using writemask, though.
> +    */
> +   ir->accept(v);
> +   return dst_reg(v->result);
> +}
> +
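> +/* Copy a compound value by recursing through structs, arrays and matrix
> + * columns until only scalar/vector pieces remain, emitting one predicated
> + * MOV per vec4-sized piece and stepping reg_offset as it goes.
> + */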
> +void
> +vec4_god::emit_block_move(dst_reg *dst, src_reg *src,
> +                              const struct glsl_type *type,
> +                              enum brw_predicate predicate)
> +{
> +   if (type->base_type == GLSL_TYPE_STRUCT) {
> +      for (unsigned int i = 0; i < type->length; i++) {
> +        emit_block_move(dst, src, type->fields.structure[i].type, predicate);
> +      }
> +      return;
> +   }
> +
> +   if (type->is_array()) {
> +      for (unsigned int i = 0; i < type->length; i++) {
> +        emit_block_move(dst, src, type->fields.array, predicate);
> +      }
> +      return;
> +   }
> +
> +   if (type->is_matrix()) {
> +      const struct glsl_type *vec_type;
> +
> +      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
> +                                        type->vector_elements, 1);
> +
> +      for (int i = 0; i < type->matrix_columns; i++) {
> +        emit_block_move(dst, src, vec_type, predicate);
> +      }
> +      return;
> +   }
> +
> +   assert(type->is_scalar() || type->is_vector());
> +
> +   dst->type = brw_type_for_base_type(type);
> +   src->type = dst->type;
> +
> +   dst->writemask = (1 << type->vector_elements) - 1;
> +
> +   src->swizzle = brw_swizzle_for_size(type->vector_elements);
> +
> +   vec4_instruction *inst = emit(MOV(*dst, *src));
> +   inst->predicate = predicate;
> +
> +   dst->reg_offset++;
> +   src->reg_offset++;
> +}
> +
> +
> +/* If the RHS processing resulted in an instruction generating a
> + * temporary value, and it would be easy to rewrite the instruction to
> + * generate its result right into the LHS instead, do so.  This ends
> + * up reliably removing instructions where it can be tricky to do so
> + * later without real use/def (UD) chain information.
> + */
> +bool
> +vec4_god::try_rewrite_rhs_to_dst(ir_assignment *ir,
> +                                    dst_reg dst,
> +                                    src_reg src,
> +                                    vec4_instruction *pre_rhs_inst,
> +                                    vec4_instruction *last_rhs_inst)
> +{
> +   /* This could be supported, but it would take more smarts. */
> +   if (ir->condition)
> +      return false;
> +
> +   if (pre_rhs_inst == last_rhs_inst)
> +      return false; /* No instructions generated to work with. */
> +
> +   /* Make sure the last instruction generated our source reg. */
> +   if (src.file != GRF ||
> +       src.file != last_rhs_inst->dst.file ||
> +       src.reg != last_rhs_inst->dst.reg ||
> +       src.reg_offset != last_rhs_inst->dst.reg_offset ||
> +       src.reladdr ||
> +       src.abs ||
> +       src.negate ||
> +       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
> +      return false;
> +
> +   /* Check that the last instruction fully initialized the channels
> +    * we want to use, in the order we want to use them.  We could
> +    * potentially reswizzle the operands of many instructions so that
> +    * we could handle out-of-order channels, but don't yet.
> +    */
> +
> +   for (unsigned i = 0; i < 4; i++) {
> +      if (dst.writemask & (1 << i)) {
> +        if (!(last_rhs_inst->dst.writemask & (1 << i)))
> +           return false;
> +
> +        if (BRW_GET_SWZ(src.swizzle, i) != i)
> +           return false;
> +      }
> +   }
> +
> +   /* Success!  Rewrite the instruction. */
> +   last_rhs_inst->dst.file = dst.file;
> +   last_rhs_inst->dst.reg = dst.reg;
> +   last_rhs_inst->dst.reg_offset = dst.reg_offset;
> +   last_rhs_inst->dst.reladdr = dst.reladdr;
> +   last_rhs_inst->dst.writemask &= dst.writemask;
> +
> +   return true;
> +}
> +
> +void
> +vec4_god::visit(ir_assignment *ir)
> +{
> +   dst_reg dst = get_assignment_lhs(ir->lhs, this);
> +   enum brw_predicate predicate = BRW_PREDICATE_NONE;
> +
> +   if (!ir->lhs->type->is_scalar() &&
> +       !ir->lhs->type->is_vector()) {
> +      ir->rhs->accept(this);
> +      src_reg src = this->result;
> +
> +      if (ir->condition) {
> +        emit_bool_to_cond_code(ir->condition, &predicate);
> +      }
> +
> +      /* emit_block_move doesn't account for swizzles in the source register.
> +       * This should be ok, since the source register is a structure or an
> +       * array, and those can't be swizzled.  But double-check to be sure.
> +       */
> +      assert(src.swizzle ==
> +             (ir->rhs->type->is_matrix()
> +              ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
> +              : BRW_SWIZZLE_NOOP));
> +
> +      emit_block_move(&dst, &src, ir->rhs->type, predicate);
> +      return;
> +   }
> +
> +   /* Now we're down to just a scalar/vector with writemasks. */
> +   int i;
> +
> +   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
> +   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
> +
> +   ir->rhs->accept(this);
> +
> +   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
> +
> +   int swizzles[4];
> +   int src_chan = 0;
> +
> +   assert(ir->lhs->type->is_vector() ||
> +         ir->lhs->type->is_scalar());
> +   dst.writemask = ir->write_mask;
> +
> +   /* Swizzle a small RHS vector into the channels being written.
> +    *
> +    * GLSL IR treats write_mask as dictating how many channels are
> +    * present on the RHS while in our instructions we need to make
> +    * those channels appear in the slots of the vec4 they're written to.
> +    */
> +   for (int i = 0; i < 4; i++)
> +      swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
> +
> +   src_reg src = swizzle(this->result,
> +                         BRW_SWIZZLE4(swizzles[0], swizzles[1],
> +                                      swizzles[2], swizzles[3]));
> +
> +   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
> +      return;
> +   }
> +
> +   if (ir->condition) {
> +      emit_bool_to_cond_code(ir->condition, &predicate);
> +   }
> +
> +   for (i = 0; i < type_size(ir->lhs->type); i++) {
> +      vec4_instruction *inst = emit(MOV(dst, src));
> +      inst->predicate = predicate;
> +
> +      dst.reg_offset++;
> +      src.reg_offset++;
> +   }
> +}
> +
> +void
> +vec4_god::emit_constant_values(dst_reg *dst, ir_constant *ir)
> +{
> +   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
> +      foreach_in_list(ir_constant, field_value, &ir->components) {
> +        emit_constant_values(dst, field_value);
> +      }
> +      return;
> +   }
> +
> +   if (ir->type->is_array()) {
> +      for (unsigned int i = 0; i < ir->type->length; i++) {
> +        emit_constant_values(dst, ir->array_elements[i]);
> +      }
> +      return;
> +   }
> +
> +   if (ir->type->is_matrix()) {
> +      for (int i = 0; i < ir->type->matrix_columns; i++) {
> +        float *vec = &ir->value.f[i * ir->type->vector_elements];
> +
> +        for (int j = 0; j < ir->type->vector_elements; j++) {
> +           dst->writemask = 1 << j;
> +           dst->type = BRW_REGISTER_TYPE_F;
> +
> +           emit(MOV(*dst, src_reg(vec[j])));
> +        }
> +        dst->reg_offset++;
> +      }
> +      return;
> +   }
> +
> +   int remaining_writemask = (1 << ir->type->vector_elements) - 1;
> +
> +   for (int i = 0; i < ir->type->vector_elements; i++) {
> +      if (!(remaining_writemask & (1 << i)))
> +        continue;
> +
> +      dst->writemask = 1 << i;
> +      dst->type = brw_type_for_base_type(ir->type);
> +
> +      /* Find other components that match the one we're about to
> +       * write.  Emits fewer instructions for things like vec4(0.5,
> +       * 1.5, 1.5, 1.5).
> +       */
> +      for (int j = i + 1; j < ir->type->vector_elements; j++) {
> +        if (ir->type->base_type == GLSL_TYPE_BOOL) {
> +           if (ir->value.b[i] == ir->value.b[j])
> +              dst->writemask |= (1 << j);
> +        } else {
> +           /* u, i, and f storage all line up, so no need for a
> +            * switch case for comparing each type.
> +            */
> +           if (ir->value.u[i] == ir->value.u[j])
> +              dst->writemask |= (1 << j);
> +        }
> +      }
> +
> +      switch (ir->type->base_type) {
> +      case GLSL_TYPE_FLOAT:
> +        emit(MOV(*dst, src_reg(ir->value.f[i])));
> +        break;
> +      case GLSL_TYPE_INT:
> +        emit(MOV(*dst, src_reg(ir->value.i[i])));
> +        break;
> +      case GLSL_TYPE_UINT:
> +        emit(MOV(*dst, src_reg(ir->value.u[i])));
> +        break;
> +      case GLSL_TYPE_BOOL:
> +         emit(MOV(*dst,
> +                  src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
> +                                              : 0)));
> +        break;
> +      default:
> +        unreachable("Non-float/uint/int/bool constant");
> +      }
> +
> +      remaining_writemask &= ~dst->writemask;
> +   }
> +   dst->reg_offset++;
> +}
> +
> +void
> +vec4_god::visit(ir_constant *ir)
> +{
> +   dst_reg dst = dst_reg(this, ir->type);
> +   this->result = src_reg(dst);
> +
> +   emit_constant_values(&dst, ir);
> +}
> +
> +void
> +vec4_god::visit_atomic_counter_intrinsic(ir_call *ir)
> +{
> +   ir_dereference *deref = static_cast<ir_dereference *>(
> +      ir->actual_parameters.get_head());
> +   ir_variable *location = deref->variable_referenced();
> +   unsigned surf_index = (prog_data->base.binding_table.abo_start +
> +                          location->data.binding);
> +
> +   /* Calculate the surface offset */
> +   src_reg offset(this, glsl_type::uint_type);
> +   ir_dereference_array *deref_array = deref->as_dereference_array();
> +   if (deref_array) {
> +      deref_array->array_index->accept(this);
> +
> +      src_reg tmp(this, glsl_type::uint_type);
> +      emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
> +      emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
> +   } else {
> +      offset = location->data.atomic.offset;
> +   }
> +
> +   /* Emit the appropriate machine instruction */
> +   const char *callee = ir->callee->function_name();
> +   dst_reg dst = get_assignment_lhs(ir->return_deref, this);
> +
> +   if (!strcmp("__intrinsic_atomic_read", callee)) {
> +      emit_untyped_surface_read(surf_index, dst, offset);
> +
> +   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
> +      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
> +                          src_reg(), src_reg());
> +
> +   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
> +      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
> +                          src_reg(), src_reg());
> +   }
> +}
> +
> +void
> +vec4_god::visit(ir_call *ir)
> +{
> +   const char *callee = ir->callee->function_name();
> +
> +   if (!strcmp("__intrinsic_atomic_read", callee) ||
> +       !strcmp("__intrinsic_atomic_increment", callee) ||
> +       !strcmp("__intrinsic_atomic_predecrement", callee)) {
> +      visit_atomic_counter_intrinsic(ir);
> +   } else {
> +      unreachable("Unsupported intrinsic.");
> +   }
> +}
> +
> +src_reg
> +vec4_god::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
> +{
> +   vec4_instruction *inst =
> +      new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
> +                                    dst_reg(this, glsl_type::uvec4_type));
> +   inst->base_mrf = 2;
> +   inst->mlen = 1;
> +   inst->src[1] = sampler;
> +
> +   /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
> +   int param_base = inst->base_mrf;
> +   int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
> +   int zero_mask = 0xf & ~coord_mask;
> +
> +   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
> +            coordinate));
> +
> +   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
> +            src_reg(0)));
> +
> +   emit(inst);
> +   return src_reg(inst->dst);
> +}
> +
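> +/* A "high" sampler is one whose index can't be encoded in the message
> + * descriptor's 4-bit sampler field: either the index is dynamic or it is
> + * 16 or more.  Only Haswell and Gen8+ take this path.
> + */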
> +static bool
> +is_high_sampler(struct brw_context *brw, src_reg sampler)
> +{
> +   if (brw->gen < 8 && !brw->is_haswell)
> +      return false;
> +
> +   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
> +}
> +
> +void
> +vec4_god::visit(ir_texture *ir)
> +{
> +   uint32_t sampler =
> +      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
> +
> +   ir_rvalue *nonconst_sampler_index =
> +      _mesa_get_sampler_array_nonconst_index(ir->sampler);
> +
> +   /* Handle non-constant sampler array indexing */
> +   src_reg sampler_reg;
> +   if (nonconst_sampler_index) {
> +      /* The highest sampler which may be used by this operation is
> +       * the last element of the array. Mark it here, because the generator
> +       * doesn't have enough information to determine the bound.
> +       */
> +      uint32_t array_size = ir->sampler->as_dereference_array()
> +         ->array->type->array_size();
> +
> +      uint32_t max_used = sampler + array_size - 1;
> +      if (ir->op == ir_tg4 && brw->gen < 8) {
> +         max_used += prog_data->base.binding_table.gather_texture_start;
> +      } else {
> +         max_used += prog_data->base.binding_table.texture_start;
> +      }
> +
> +      brw_mark_surface_used(&prog_data->base, max_used);
> +
> +      /* Emit code to evaluate the actual indexing expression */
> +      nonconst_sampler_index->accept(this);
> +      dst_reg temp(this, glsl_type::uint_type);
> +      emit(ADD(temp, this->result, src_reg(sampler)))
> +         ->force_writemask_all = true;
> +      sampler_reg = src_reg(temp);
> +   } else {
> +      /* Single sampler, or constant array index; the indexing expression
> +       * is just an immediate.
> +       */
> +      sampler_reg = src_reg(sampler);
> +   }
> +
> +   /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
> +    * emitting anything other than setting up the constant result.
> +    */
> +   if (ir->op == ir_tg4) {
> +      ir_constant *chan = ir->lod_info.component->as_constant();
> +      int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
> +      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
> +         dst_reg result(this, ir->type);
> +         this->result = src_reg(result);
> +         emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
> +         return;
> +      }
> +   }
> +
> +   /* Should be lowered by do_lower_texture_projection */
> +   assert(!ir->projector);
> +
> +   /* Should be lowered */
> +   assert(!ir->offset || !ir->offset->type->is_array());
> +
> +   /* Generate code to compute all the subexpression trees.  This has to be
> +    * done before loading any values into MRFs for the sampler message since
> +    * generating these values may involve SEND messages that need the MRFs.
> +    */
> +   src_reg coordinate;
> +   if (ir->coordinate) {
> +      ir->coordinate->accept(this);
> +      coordinate = this->result;
> +   }
> +
> +   src_reg shadow_comparitor;
> +   if (ir->shadow_comparitor) {
> +      ir->shadow_comparitor->accept(this);
> +      shadow_comparitor = this->result;
> +   }
> +
> +   bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
> +   src_reg offset_value;
> +   if (has_nonconstant_offset) {
> +      ir->offset->accept(this);
> +      offset_value = src_reg(this->result);
> +   }
> +
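> +   /* Gather the operation-specific arguments: LOD, gradients, sample
> +    * index and (on Gen7+) MCS data, depending on the texture opcode.
> +    */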
> +   const glsl_type *lod_type = NULL, *sample_index_type = NULL;
> +   src_reg lod, dPdx, dPdy, sample_index, mcs;
> +   switch (ir->op) {
> +   case ir_tex:
> +      lod = src_reg(0.0f);
> +      lod_type = glsl_type::float_type;
> +      break;
> +   case ir_txf:
> +   case ir_txl:
> +   case ir_txs:
> +      ir->lod_info.lod->accept(this);
> +      lod = this->result;
> +      lod_type = ir->lod_info.lod->type;
> +      break;
> +   case ir_query_levels:
> +      lod = src_reg(0);
> +      lod_type = glsl_type::int_type;
> +      break;
> +   case ir_txf_ms:
> +      ir->lod_info.sample_index->accept(this);
> +      sample_index = this->result;
> +      sample_index_type = ir->lod_info.sample_index->type;
> +
> +      if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
> +         mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
> +      else
> +         mcs = src_reg(0u);
> +      break;
> +   case ir_txd:
> +      ir->lod_info.grad.dPdx->accept(this);
> +      dPdx = this->result;
> +
> +      ir->lod_info.grad.dPdy->accept(this);
> +      dPdy = this->result;
> +
> +      lod_type = ir->lod_info.grad.dPdx->type;
> +      break;
> +   case ir_txb:
> +   case ir_lod:
> +   case ir_tg4:
> +      break;
> +   }
> +
> +   enum opcode opcode;
> +   switch (ir->op) {
> +   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
> +   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> +   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> +   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> +   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
> +   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> +   case ir_tg4: opcode = has_nonconstant_offset
> +                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
> +   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
> +   case ir_txb:
> +      unreachable("TXB is not valid for vertex shaders.");
> +   case ir_lod:
> +      unreachable("LOD is not valid for vertex shaders.");
> +   default:
> +      unreachable("Unrecognized tex op");
> +   }
> +
> +   vec4_instruction *inst = new(mem_ctx) vec4_instruction(
> +      opcode, dst_reg(this, ir->type));
> +
> +   if (ir->offset != NULL && !has_nonconstant_offset) {
> +      inst->offset =
> +         brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
> +                            ir->offset->type->vector_elements);
> +   }
> +
> +   /* Stuff the channel select bits in the top of the texture offset */
> +   if (ir->op == ir_tg4)
> +      inst->offset |= gather_channel(ir, sampler) << 16;
> +
> +   /* The message header is necessary for:
> +    * - Gen4 (always)
> +    * - Gen9+ for selecting SIMD4x2
> +    * - Texel offsets
> +    * - Gather channel selection
> +    * - Sampler indices too large to fit in a 4-bit value.
> +    */
> +   inst->header_present =
> +      brw->gen < 5 || brw->gen >= 9 ||
> +      inst->offset != 0 || ir->op == ir_tg4 ||
> +      is_high_sampler(brw, sampler_reg);
> +   inst->base_mrf = 2;
> +   inst->mlen = inst->header_present + 1; /* always at least one */
> +   inst->dst.writemask = WRITEMASK_XYZW;
> +   inst->shadow_compare = ir->shadow_comparitor != NULL;
> +
> +   inst->src[1] = sampler_reg;
> +
> +   /* MRF for the first parameter */
> +   int param_base = inst->base_mrf + inst->header_present;
> +
> +   if (ir->op == ir_txs || ir->op == ir_query_levels) {
> +      int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
> +      emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
> +   } else {
> +      /* Load the coordinate */
> +      /* FINISHME: gl_clamp_mask and saturate */
> +      int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
> +      int zero_mask = 0xf & ~coord_mask;
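> +      /* zero_mask covers the coordinate channels the message expects but
> +       * the coordinate type doesn't supply; they are written as 0 below.
> +       */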
> +
> +      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
> +               coordinate));
> +
> +      if (zero_mask != 0) {
> +         emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
> +                  src_reg(0)));
> +      }
> +      /* Load the shadow comparitor */
> +      if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
> +        emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
> +                         WRITEMASK_X),
> +                 shadow_comparitor));
> +        inst->mlen++;
> +      }
> +
> +      /* Load the LOD info */
> +      if (ir->op == ir_tex || ir->op == ir_txl) {
> +        int mrf, writemask;
> +        if (brw->gen >= 5) {
> +           mrf = param_base + 1;
> +           if (ir->shadow_comparitor) {
> +              writemask = WRITEMASK_Y;
> +              /* mlen already incremented */
> +           } else {
> +              writemask = WRITEMASK_X;
> +              inst->mlen++;
> +           }
> +        } else /* brw->gen == 4 */ {
> +           mrf = param_base;
> +           writemask = WRITEMASK_W;
> +        }
> +        emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
> +      } else if (ir->op == ir_txf) {
> +         emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
> +      } else if (ir->op == ir_txf_ms) {
> +         emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
> +                  sample_index));
> +         if (brw->gen >= 7) {
> +            /* MCS data is in the first channel of `mcs`, but we need to get it into
> +             * the .y channel of the second vec4 of params, so replicate .x across
> +             * the whole vec4 and then mask off everything except .y
> +             */
> +            mcs.swizzle = BRW_SWIZZLE_XXXX;
> +            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
> +                     mcs));
> +         }
> +         inst->mlen++;
> +      } else if (ir->op == ir_txd) {
> +        const glsl_type *type = lod_type;
> +
> +        if (brw->gen >= 5) {
> +           dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
> +           dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
> +           emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
> +           emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
> +           inst->mlen++;
> +
> +           if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
> +              dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
> +              dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
> +              emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
> +              emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
> +              inst->mlen++;
> +
> +               if (ir->shadow_comparitor) {
> +                  emit(MOV(dst_reg(MRF, param_base + 2,
> +                                   ir->shadow_comparitor->type, WRITEMASK_Z),
> +                           shadow_comparitor));
> +               }
> +           }
> +        } else /* brw->gen == 4 */ {
> +           emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
> +           emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
> +           inst->mlen += 2;
> +        }
> +      } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
> +         if (ir->shadow_comparitor) {
> +            emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
> +                     shadow_comparitor));
> +         }
> +
> +         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
> +                  offset_value));
> +         inst->mlen++;
> +      }
> +   }
> +
> +   emit(inst);
> +
> +   /* Fix up the number of layers (z) for cube arrays: the hardware returns
> +    * faces * layers, but the spec requires just the layer count, so divide
> +    * by 6 (the number of cube faces).
> +    */
> +   if (ir->op == ir_txs) {
> +      glsl_type const *type = ir->sampler->type;
> +      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
> +          type->sampler_array) {
> +         emit_math(SHADER_OPCODE_INT_QUOTIENT,
> +                   writemask(inst->dst, WRITEMASK_Z),
> +                   src_reg(inst->dst), src_reg(6));
> +      }
> +   }
> +
> +   if (brw->gen == 6 && ir->op == ir_tg4) {
> +      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
> +   }
> +
> +   swizzle_result(ir, src_reg(inst->dst), sampler);
> +}
> +
> +/**
> + * Apply workarounds for Gen6 gather with UINT/SINT
> + */
> +void
> +vec4_god::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
> +{
> +   if (!wa)
> +      return;
> +
> +   int width = (wa & WA_8BIT) ? 8 : 16;
> +   dst_reg dst_f = dst;
> +   dst_f.type = BRW_REGISTER_TYPE_F;
> +
> +   /* Convert from UNORM to UINT */
> +   emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
> +   emit(MOV(dst, src_reg(dst_f)));
> +
> +   if (wa & WA_SIGN) {
> +      /* Reinterpret the UINT value as a signed INT value by
> +       * shifting the sign bit into place, then shifting back
> +       * preserving sign.
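> +       * E.g. with width == 8, the value 254 becomes 0xFE000000 after the
> +       * SHL and 0xFFFFFFFE (-2) after the ASR.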
> +       */
> +      emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
> +      emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
> +   }
> +}
> +
> +/**
> + * Set up the gather channel based on the swizzle, for gather4.
> + */
> +uint32_t
> +vec4_god::gather_channel(ir_texture *ir, uint32_t sampler)
> +{
> +   ir_constant *chan = ir->lod_info.component->as_constant();
> +   int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
> +   switch (swiz) {
> +      case SWIZZLE_X: return 0;
> +      case SWIZZLE_Y:
> +         /* gather4 sampler is broken for green channel on RG32F --
> +          * we must ask for blue instead.
> +          */
> +         if (key->tex.gather_channel_quirk_mask & (1<<sampler))
> +            return 2;
> +         return 1;
> +      case SWIZZLE_Z: return 2;
> +      case SWIZZLE_W: return 3;
> +      default:
> +         unreachable("Not reached"); /* zero, one swizzles handled already */
> +   }
> +}
> +
> +void
> +vec4_god::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
> +{
> +   int s = key->tex.swizzles[sampler];
> +
> +   this->result = src_reg(this, ir->type);
> +   dst_reg swizzled_result(this->result);
> +
> +   if (ir->op == ir_query_levels) {
> +      /* # levels is in .w */
> +      orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
> +      emit(MOV(swizzled_result, orig_val));
> +      return;
> +   }
> +
> +   if (ir->op == ir_txs || ir->type == glsl_type::float_type
> +                       || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
> +      emit(MOV(swizzled_result, orig_val));
> +      return;
> +   }
> +
> +
> +   int zero_mask = 0, one_mask = 0, copy_mask = 0;
> +   int swizzle[4] = {0};
> +
> +   for (int i = 0; i < 4; i++) {
> +      switch (GET_SWZ(s, i)) {
> +      case SWIZZLE_ZERO:
> +        zero_mask |= (1 << i);
> +        break;
> +      case SWIZZLE_ONE:
> +        one_mask |= (1 << i);
> +        break;
> +      default:
> +        copy_mask |= (1 << i);
> +        swizzle[i] = GET_SWZ(s, i);
> +        break;
> +      }
> +   }
> +
> +   if (copy_mask) {
> +      orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
> +      swizzled_result.writemask = copy_mask;
> +      emit(MOV(swizzled_result, orig_val));
> +   }
> +
> +   if (zero_mask) {
> +      swizzled_result.writemask = zero_mask;
> +      emit(MOV(swizzled_result, src_reg(0.0f)));
> +   }
> +
> +   if (one_mask) {
> +      swizzled_result.writemask = one_mask;
> +      emit(MOV(swizzled_result, src_reg(1.0f)));
> +   }
> +}
> +
> +void
> +vec4_god::visit(ir_return *)
> +{
> +   unreachable("not reached");
> +}
> +
> +void
> +vec4_god::visit(ir_discard *)
> +{
> +   unreachable("not reached");
> +}
> +
> +void
> +vec4_god::visit(ir_if *ir)
> +{
> +   /* Don't point the annotation at the if statement, because then it plus
> +    * the then and else blocks get printed.
> +    */
> +   this->base_ir = ir->condition;
> +
> +   if (brw->gen == 6) {
> +      emit_if_gen6(ir);
> +   } else {
> +      enum brw_predicate predicate;
> +      emit_bool_to_cond_code(ir->condition, &predicate);
> +      emit(IF(predicate));
> +   }
> +
> +   visit_instructions(&ir->then_instructions);
> +
> +   if (!ir->else_instructions.is_empty()) {
> +      this->base_ir = ir->condition;
> +      emit(BRW_OPCODE_ELSE);
> +
> +      visit_instructions(&ir->else_instructions);
> +   }
> +
> +   this->base_ir = ir->condition;
> +   emit(BRW_OPCODE_ENDIF);
> +}
> +
> +void
> +vec4_god::visit(ir_emit_vertex *)
> +{
> +   unreachable("not reached");
> +}
> +
> +void
> +vec4_god::visit(ir_end_primitive *)
> +{
> +   unreachable("not reached");
> +}
> +
> +void
> +vec4_god::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
> +                                  dst_reg dst, src_reg offset,
> +                                  src_reg src0, src_reg src1)
> +{
> +   unsigned mlen = 0;
> +
> +   /* Set the atomic operation offset. */
> +   emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
> +   mlen++;
> +
> +   /* Set the atomic operation arguments. */
> +   if (src0.file != BAD_FILE) {
> +      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
> +      mlen++;
> +   }
> +
> +   if (src1.file != BAD_FILE) {
> +      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
> +      mlen++;
> +   }
> +
> +   /* Emit the instruction.  Note that this maps to the normal SIMD8
> +    * untyped atomic message on Ivy Bridge, but that's OK because
> +    * unused channels will be masked out.
> +    */
> +   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
> +                                 src_reg(atomic_op), src_reg(surf_index));
> +   inst->base_mrf = 0;
> +   inst->mlen = mlen;
> +}
> +
> +void
> +vec4_god::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
> +                                        src_reg offset)
> +{
> +   /* Set the surface read offset. */
> +   emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
> +
> +   /* Emit the instruction.  Note that this maps to the normal SIMD8
> +    * untyped surface read message, but that's OK because unused
> +    * channels will be masked out.
> +    */
> +   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
> +                                 dst, src_reg(surf_index));
> +   inst->base_mrf = 0;
> +   inst->mlen = 1;
> +}
> +
> +void
> +vec4_god::emit_ndc_computation()
> +{
> +   /* Get the position */
> +   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
> +
> +   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
> +   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
> +   output_reg[BRW_VARYING_SLOT_NDC] = ndc;
> +
> +   current_annotation = "NDC";
> +   dst_reg ndc_w = ndc;
> +   ndc_w.writemask = WRITEMASK_W;
> +   src_reg pos_w = pos;
> +   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
> +   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
> +
> +   dst_reg ndc_xyz = ndc;
> +   ndc_xyz.writemask = WRITEMASK_XYZ;
> +
> +   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
> +}
> +
> +void
> +vec4_god::emit_psiz_and_flags(dst_reg reg)
> +{
> +   if (brw->gen < 6 &&
> +       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
> +        key->userclip_active || brw->has_negative_rhw_bug)) {
> +      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
> +      dst_reg header1_w = header1;
> +      header1_w.writemask = WRITEMASK_W;
> +
> +      emit(MOV(header1, 0u));
> +
> +      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
> +        src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
> +
> +        current_annotation = "Point size";
> +        emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
> +        emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
> +      }
> +
> +      if (key->userclip_active) {
> +         current_annotation = "Clipping flags";
> +         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
> +         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
> +
> +         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
> +         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
> +         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
> +
> +         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
> +         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
> +         emit(SHL(flags1, src_reg(flags1), src_reg(4)));
> +         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
> +      }
> +
> +      /* i965 clipping workaround:
> +       * 1) Test for negative rhw
> +       * 2) If set,
> +       *      set ndc = (0,0,0,0)
> +       *      set ucp[6] = 1
> +       *
> +       * Later, clipping will detect ucp[6] and ensure the primitive is
> +       * clipped against all fixed planes.
> +       */
> +      if (brw->has_negative_rhw_bug) {
> +         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
> +         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
> +         emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
> +         vec4_instruction *inst;
> +         inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
> +         inst->predicate = BRW_PREDICATE_NORMAL;
> +         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
> +         inst->predicate = BRW_PREDICATE_NORMAL;
> +      }
> +
> +      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
> +   } else if (brw->gen < 6) {
> +      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
> +   } else {
> +      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
> +      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
> +         dst_reg reg_w = reg;
> +         reg_w.writemask = WRITEMASK_W;
> +         emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
> +      }
> +      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
> +         dst_reg reg_y = reg;
> +         reg_y.writemask = WRITEMASK_Y;
> +         reg_y.type = BRW_REGISTER_TYPE_D;
> +         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
> +      }
> +      if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
> +         dst_reg reg_z = reg;
> +         reg_z.writemask = WRITEMASK_Z;
> +         reg_z.type = BRW_REGISTER_TYPE_D;
> +         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
> +      }
> +   }
> +}
> +
> +void
> +vec4_god::emit_clip_distances(dst_reg reg, int offset)
> +{
> +   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
> +    *
> +    *     "If a linked set of shaders forming the vertex stage contains no
> +    *     static write to gl_ClipVertex or gl_ClipDistance, but the
> +    *     application has requested clipping against user clip planes through
> +    *     the API, then the coordinate written to gl_Position is used for
> +    *     comparison against the user clip planes."
> +    *
> +    * This function is only called if the shader didn't write to
> +    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
> +    * if the user wrote to it; otherwise we use gl_Position.
> +    */
> +   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
> +   if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
> +      clip_vertex = VARYING_SLOT_POS;
> +   }
> +
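> +   /* Each clip distance is the DP4 of the selected position with one user
> +    * clip plane; the writemask routes each result to its own channel of the
> +    * output vec4.
> +    */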
> +   for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
> +        ++i) {
> +      reg.writemask = 1 << i;
> +      emit(DP4(reg,
> +               src_reg(output_reg[clip_vertex]),
> +               src_reg(this->userplane[i + offset])));
> +   }
> +}
> +
> +vec4_instruction *
> +vec4_god::emit_generic_urb_slot(dst_reg reg, int varying)
> +{
> +   assert (varying < VARYING_SLOT_MAX);
> +   reg.type = output_reg[varying].type;
> +   current_annotation = output_reg_annotation[varying];
> +   /* Copy the register; the caller may set saturate on the returned MOV. */
> +   return emit(MOV(reg, src_reg(output_reg[varying])));
> +}
> +
> +void
> +vec4_god::emit_urb_slot(dst_reg reg, int varying)
> +{
> +   reg.type = BRW_REGISTER_TYPE_F;
> +
> +   switch (varying) {
> +   case VARYING_SLOT_PSIZ:
> +   {
> +      /* PSIZ is always in slot 0, and is coupled with other flags. */
> +      current_annotation = "indices, point width, clip flags";
> +      emit_psiz_and_flags(reg);
> +      break;
> +   }
> +   case BRW_VARYING_SLOT_NDC:
> +      current_annotation = "NDC";
> +      emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
> +      break;
> +   case VARYING_SLOT_POS:
> +      current_annotation = "gl_Position";
> +      emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
> +      break;
> +   case VARYING_SLOT_EDGE:
> +      /* This is present when doing unfilled polygons.  We're supposed to copy
> +       * the edge flag from the user-provided vertex array
> +       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
> +       * of that attribute (starts as 1.0f).  This is then used in clipping to
> +       * determine which edges should be drawn as wireframe.
> +       */
> +      current_annotation = "edge flag";
> +      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
> +                                    glsl_type::float_type, WRITEMASK_XYZW))));
> +      break;
> +   case BRW_VARYING_SLOT_PAD:
> +      /* No need to write to this slot */
> +      break;
> +   case VARYING_SLOT_COL0:
> +   case VARYING_SLOT_COL1:
> +   case VARYING_SLOT_BFC0:
> +   case VARYING_SLOT_BFC1: {
> +      /* These built-in varyings are only supported in compatibility mode,
> +       * and we only support GS in core profile.  So, this must be a vertex
> +       * shader.
> +       */
> +      assert(stage == MESA_SHADER_VERTEX);
> +      vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
> +      if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
> +         inst->saturate = true;
> +      break;
> +   }
> +
> +   default:
> +      emit_generic_urb_slot(reg, varying);
> +      break;
> +   }
> +}
> +
> +static int
> +align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
> +{
> +   if (brw->gen >= 6) {
> +      /* URB data written (does not include the message header reg) must
> +       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
> +       * section 5.4.3.2.2: URB_INTERLEAVED.
> +       *
> +       * URB entries are allocated on a multiple of 1024 bits, so an
> +       * extra 128 bits written here to make the end align to 256 is
> +       * no problem.
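> +       *
> +       * mlen includes the single message header register, so the data
> +       * length is mlen - 1; forcing mlen to be odd keeps that even.  E.g. a
> +       * header plus 3 data registers (mlen == 4) gets padded to mlen == 5.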
> +       */
> +      if ((mlen % 2) != 1)
> +        mlen++;
> +   }
> +
> +   return mlen;
> +}
> +
> +
> +/**
> + * Generates the VUE payload plus the necessary URB write instructions to
> + * output it.
> + *
> + * The VUE layout is documented in Volume 2a.
> + */
> +void
> +vec4_god::emit_vertex()
> +{
> +   /* MRF 0 is reserved for the debugger, so start with message header
> +    * in MRF 1.
> +    */
> +   int base_mrf = 1;
> +   int mrf = base_mrf;
> +   /* In the process of generating our URB write message contents, we
> +    * may need to unspill a register or load from an array.  Those
> +    * reads would use MRFs 14-15.
> +    */
> +   int max_usable_mrf = 13;
> +
> +   /* The following assertion verifies that max_usable_mrf causes an
> +    * even-numbered amount of URB write data, which will meet gen6's
> +    * requirements for length alignment.
> +    */
> +   assert ((max_usable_mrf - base_mrf) % 2 == 0);
> +
> +   /* First mrf is the g0-based message header containing URB handles and
> +    * such.
> +    */
> +   emit_urb_write_header(mrf++);
> +
> +   if (brw->gen < 6) {
> +      emit_ndc_computation();
> +   }
> +
> +   /* Lower legacy ff and ClipVertex clipping to clip distances */
> +   if (key->userclip_active && !prog->UsesClipDistanceOut) {
> +      current_annotation = "user clip distances";
> +
> +      output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
> +      output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
> +
> +      emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
> +      emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
> +   }
> +
> +   /* We may need to split this up into several URB writes, so do them in a
> +    * loop.
> +    */
> +   int slot = 0;
> +   bool complete = false;
> +   do {
> +      /* URB offset is in URB row increments, and each of our MRFs is half of
> +       * one of those, since we're doing interleaved writes.
> +       */
> +      int offset = slot / 2;
> +
> +      mrf = base_mrf + 1;
> +      for (; slot < prog_data->vue_map.num_slots; ++slot) {
> +         emit_urb_slot(dst_reg(MRF, mrf++),
> +                       prog_data->vue_map.slot_to_varying[slot]);
> +
> +         /* If this was max_usable_mrf, we can't fit anything more into this
> +          * URB WRITE.
> +          */
> +         if (mrf > max_usable_mrf) {
> +            slot++;
> +            break;
> +         }
> +      }
> +
> +      complete = slot >= prog_data->vue_map.num_slots;
> +      current_annotation = "URB write";
> +      vec4_instruction *inst = emit_urb_write_opcode(complete);
> +      inst->base_mrf = base_mrf;
> +      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
> +      inst->offset += offset;
> +   } while (!complete);
> +}
> +
> +
> +src_reg
> +vec4_god::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
> +                                src_reg *reladdr, int reg_offset)
> +{
> +   /* Because we store the values to scratch interleaved like our
> +    * vertex data, we need to scale the vec4 index by 2.
> +    */
> +   int message_header_scale = 2;
> +
> +   /* Pre-gen6, the message header uses byte offsets instead of vec4
> +    * (16-byte) offset units.
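> +    * Combined with the interleaving scale above, that is 32 bytes per vec4
> +    * slot, e.g. reg_offset 3 becomes a byte offset of 96.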
> +    */
> +   if (brw->gen < 6)
> +      message_header_scale *= 16;
> +
> +   if (reladdr) {
> +      src_reg index = src_reg(this, glsl_type::int_type);
> +
> +      emit_before(block, inst, ADD(dst_reg(index), *reladdr,
> +                                   src_reg(reg_offset)));
> +      emit_before(block, inst, MUL(dst_reg(index), index,
> +                                   src_reg(message_header_scale)));
> +
> +      return index;
> +   } else {
> +      return src_reg(reg_offset * message_header_scale);
> +   }
> +}
> +
> +src_reg
> +vec4_god::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
> +                                      src_reg *reladdr, int reg_offset)
> +{
> +   if (reladdr) {
> +      src_reg index = src_reg(this, glsl_type::int_type);
> +
> +      emit_before(block, inst, ADD(dst_reg(index), *reladdr,
> +                                   src_reg(reg_offset)));
> +
> +      /* Pre-gen6, the message header uses byte offsets instead of vec4
> +       * (16-byte) offset units.
> +       */
> +      if (brw->gen < 6) {
> +         emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
> +      }
> +
> +      return index;
> +   } else if (brw->gen >= 8) {
> +      /* Store the offset in a GRF so we can send-from-GRF. */
> +      src_reg offset = src_reg(this, glsl_type::int_type);
> +      emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
> +      return offset;
> +   } else {
> +      int message_header_scale = brw->gen < 6 ? 16 : 1;
> +      return src_reg(reg_offset * message_header_scale);
> +   }
> +}
> +
> +/**
> + * Emits an instruction before @inst to load the value named by @orig_src
> + * from scratch space at @base_offset to @temp.
> + *
> + * @base_offset is measured in 32-byte units (the size of a register).
> + */
> +void
> +vec4_god::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
> +                               dst_reg temp, src_reg orig_src,
> +                               int base_offset)
> +{
> +   int reg_offset = base_offset + orig_src.reg_offset;
> +   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
> +                                      reg_offset);
> +
> +   emit_before(block, inst, SCRATCH_READ(temp, index));
> +}
> +
> +/**
> + * Emits an instruction after @inst to store the value to be written
> + * to @orig_dst to scratch space at @base_offset, from @temp.
> + *
> + * @base_offset is measured in 32-byte units (the size of a register).
> + */
> +void
> +vec4_god::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
> +                                 int base_offset)
> +{
> +   int reg_offset = base_offset + inst->dst.reg_offset;
> +   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
> +                                      reg_offset);
> +
> +   /* Create a temporary register to store *inst's result in.
> +    *
> +    * We have to be careful in MOVing from our temporary result register in
> +    * the scratch write.  If we swizzle from channels of the temporary that
> +    * weren't initialized, it will confuse live interval analysis, which will
> +    * make spilling fail to make progress.
> +    */
> +   const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
> +                                       inst->dst.type),
> +                                brw_swizzle_for_mask(inst->dst.writemask));
> +   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
> +                                      inst->dst.writemask));
> +   vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
> +   write->predicate = inst->predicate;
> +   write->ir = inst->ir;
> +   write->annotation = inst->annotation;
> +   inst->insert_after(block, write);
> +
> +   inst->dst.file = temp.file;
> +   inst->dst.reg = temp.reg;
> +   inst->dst.reg_offset = temp.reg_offset;
> +   inst->dst.reladdr = NULL;
> +}
> +
> +/**
> + * We can't generally support array access in GRF space, because a
> + * single instruction's destination can only span 2 contiguous
> + * registers.  So, we send all GRF arrays that get variable index
> + * access to scratch space.
> + */
> +void
> +vec4_god::move_grf_array_access_to_scratch()
> +{
> +   int scratch_loc[this->alloc.count];
> +   memset(scratch_loc, -1, sizeof(scratch_loc));
> +
> +   /* First, calculate the set of virtual GRFs that need to be punted
> +    * to scratch due to having any array access on them, and where in
> +    * scratch.
> +    */
> +   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
> +      if (inst->dst.file == GRF && inst->dst.reladdr &&
> +         scratch_loc[inst->dst.reg] == -1) {
> +        scratch_loc[inst->dst.reg] = c->last_scratch;
> +        c->last_scratch += this->alloc.sizes[inst->dst.reg];
> +      }
> +
> +      for (int i = 0 ; i < 3; i++) {
> +        src_reg *src = &inst->src[i];
> +
> +        if (src->file == GRF && src->reladdr &&
> +            scratch_loc[src->reg] == -1) {
> +           scratch_loc[src->reg] = c->last_scratch;
> +           c->last_scratch += this->alloc.sizes[src->reg];
> +        }
> +      }
> +   }
> +
> +   /* Now, for anything that will be accessed through scratch, rewrite
> +    * it to load/store.  Note that this is a _safe list walk, because
> +    * we may generate a new scratch_write instruction after the one
> +    * we're processing.
> +    */
> +   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
> +      /* Set up the annotation tracking for new generated instructions. */
> +      base_ir = inst->ir;
> +      current_annotation = inst->annotation;
> +
> +      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
> +        emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
> +      }
> +
> +      for (int i = 0 ; i < 3; i++) {
> +        if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
> +           continue;
> +
> +        dst_reg temp = dst_reg(this, glsl_type::vec4_type);
> +
> +        emit_scratch_read(block, inst, temp, inst->src[i],
> +                          scratch_loc[inst->src[i].reg]);
> +
> +        inst->src[i].file = temp.file;
> +        inst->src[i].reg = temp.reg;
> +        inst->src[i].reg_offset = temp.reg_offset;
> +        inst->src[i].reladdr = NULL;
> +      }
> +   }
> +}
> +
> +/**
> + * Emits an instruction before @inst to load the value named by @orig_src
> + * from the pull constant buffer (surface) at @base_offset to @temp.
> + */
> +void
> +vec4_god::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
> +                                     dst_reg temp, src_reg orig_src,
> +                                     int base_offset)
> +{
> +   int reg_offset = base_offset + orig_src.reg_offset;
> +   src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
> +   src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
> +                                             reg_offset);
> +   vec4_instruction *load;
> +
> +   if (brw->gen >= 7) {
> +      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
> +
> +      /* We have to use a message header on Skylake to get SIMD4x2 mode.
> +       * Reserve space for the register.
> +       */
> +      if (brw->gen >= 9) {
> +         grf_offset.reg_offset++;
> +         alloc.sizes[grf_offset.reg] = 2;
> +      }
> +
> +      grf_offset.type = offset.type;
> +      emit_before(block, inst, MOV(grf_offset, offset));
> +
> +      load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
> +                                           temp, index, src_reg(grf_offset));
> +      load->mlen = 1;
> +   } else {
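> +      /* Pre-gen7, the load goes through MRF space; MRF 14 lies in the
> +       * MRF 14-15 range reserved for generated loads (see the note in
> +       * emit_vertex()).
> +       */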
> +      load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
> +                                           temp, index, offset);
> +      load->base_mrf = 14;
> +      load->mlen = 1;
> +   }
> +   emit_before(block, inst, load);
> +}
> +
> +/**
> + * Implements array access of uniforms by inserting a
> + * PULL_CONSTANT_LOAD instruction.
> + *
> + * Unlike temporary GRF array access (where we don't support it due to
> + * the difficulty of doing relative addressing on instruction
> + * destinations), we could potentially do array access of uniforms
> + * that were loaded in GRF space as push constants.  In real-world
> + * usage we've seen, though, the arrays being used are always larger
> + * than we could load as push constants, so just always move all
> + * uniform array access out to a pull constant buffer.
> + */
> +void
> +vec4_god::move_uniform_array_access_to_pull_constants()
> +{
> +   int pull_constant_loc[this->uniforms];
> +   memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
> +   bool nested_reladdr;
> +
> +   /* Walk through and find array access of uniforms.  Put a copy of that
> +    * uniform in the pull constant buffer.
> +    *
> +    * Note that we don't move constant-indexed accesses to arrays.  No
> +    * testing has been done of the performance impact of this choice.
> +    */
> +   do {
> +      nested_reladdr = false;
> +
> +      foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
> +         for (int i = 0 ; i < 3; i++) {
> +            if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
> +               continue;
> +
> +            int uniform = inst->src[i].reg;
> +
> +            if (inst->src[i].reladdr->reladdr)
> +               nested_reladdr = true;  /* will need another pass */
> +
> +            /* If this array isn't already present in the pull constant buffer,
> +             * add it.
> +             */
> +            if (pull_constant_loc[uniform] == -1) {
> +               const gl_constant_value **values =
> +                  &stage_prog_data->param[uniform * 4];
> +
> +               pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
> +
> +               assert(uniform < uniform_array_size);
> +               for (int j = 0; j < uniform_size[uniform] * 4; j++) {
> +                  stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
> +                     = values[j];
> +               }
> +            }
> +
> +            /* Set up the annotation tracking for new generated instructions. */
> +            base_ir = inst->ir;
> +            current_annotation = inst->annotation;
> +
> +            dst_reg temp = dst_reg(this, glsl_type::vec4_type);
> +
> +            emit_pull_constant_load(block, inst, temp, inst->src[i],
> +                                    pull_constant_loc[uniform]);
> +
> +            inst->src[i].file = temp.file;
> +            inst->src[i].reg = temp.reg;
> +            inst->src[i].reg_offset = temp.reg_offset;
> +            inst->src[i].reladdr = NULL;
> +         }
> +      }
> +   } while (nested_reladdr);
> +
> +   /* Now there are no accesses of the UNIFORM file with a reladdr, so
> +    * no need to track them as larger-than-vec4 objects.  This will be
> +    * relied on in cutting out unused uniform vectors from push
> +    * constants.
> +    */
> +   split_uniform_registers();
> +}
> +
> +void
> +vec4_god::resolve_ud_negate(src_reg *reg)
> +{
> +   if (reg->type != BRW_REGISTER_TYPE_UD ||
> +       !reg->negate)
> +      return;
> +
> +   src_reg temp = src_reg(this, glsl_type::uvec4_type);
> +   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
> +   *reg = temp;
> +}
> +
> +/**
> + * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
> + *
> + * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
> + * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
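> + * The fixup below ANDs with 1 to isolate that LSB and then negates it, so
> + * true becomes ~0 (-1) and false stays 0.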
> + */
> +void
> +vec4_god::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
> +{
> +   assert(brw->gen <= 5);
> +
> +   if (!rvalue->type->is_boolean())
> +      return;
> +
> +   src_reg and_result = src_reg(this, rvalue->type);
> +   src_reg neg_result = src_reg(this, rvalue->type);
> +   emit(AND(dst_reg(and_result), *reg, src_reg(1)));
> +   emit(MOV(dst_reg(neg_result), negate(and_result)));
> +   *reg = neg_result;
> +}
> +
> +vec4_god::vec4_god(struct brw_context *brw,
> +                           struct brw_vec4_compile *c,
> +                           struct gl_program *prog,
> +                           const struct brw_vue_prog_key *key,
> +                           struct brw_vue_prog_data *prog_data,
> +                          struct gl_shader_program *shader_prog,
> +                           gl_shader_stage stage,
> +                          void *mem_ctx,
> +                           bool no_spills,
> +                           shader_time_shader_type st_base,
> +                           shader_time_shader_type st_written,
> +                           shader_time_shader_type st_reset)
> +   : backend_god(brw, shader_prog, prog, &prog_data->base, stage),
> +     c(c),
> +     key(key),
> +     prog_data(prog_data),
> +     sanity_param_count(0),
> +     fail_msg(NULL),
> +     first_non_payload_grf(0),
> +     need_all_constants_in_pull_buffer(false),
> +     no_spills(no_spills),
> +     st_base(st_base),
> +     st_written(st_written),
> +     st_reset(st_reset)
> +{
> +   this->mem_ctx = mem_ctx;
> +   this->failed = false;
> +
> +   this->base_ir = NULL;
> +   this->current_annotation = NULL;
> +   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
> +
> +   this->variable_ht = hash_table_ctor(0,
> +                                      hash_table_pointer_hash,
> +                                      hash_table_pointer_compare);
> +
> +   this->virtual_grf_start = NULL;
> +   this->virtual_grf_end = NULL;
> +   this->live_intervals = NULL;
> +
> +   this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
> +
> +   this->uniforms = 0;
> +
> +   /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
> +    * at least one. See setup_uniforms() in brw_vec4.cpp.
> +    */
> +   this->uniform_array_size = 1;
> +   if (prog_data) {
> +      this->uniform_array_size =
> +         MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
> +   }
> +
> +   this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
> +   this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
> +}
> +
> +vec4_god::~vec4_god()
> +{
> +   hash_table_dtor(this->variable_ht);
> +}
> +
> +
> +void
> +vec4_god::fail(const char *format, ...)
> +{
> +   va_list va;
> +   char *msg;
> +
> +   if (failed)
> +      return;
> +
> +   failed = true;
> +
> +   va_start(va, format);
> +   msg = ralloc_vasprintf(mem_ctx, format, va);
> +   va_end(va);
> +   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
> +
> +   this->fail_msg = msg;
> +
> +   if (debug_enabled) {
> +      fprintf(stderr, "%s",  msg);
> +   }
> +}
> +
> +} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_god.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_god.cpp
> new file mode 100644
> index 0000000..cbb83e3
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_god.cpp
> @@ -0,0 +1,706 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file brw_vec4_gs_god.cpp
> + *
> + * Geometry-shader-specific code derived from the vec4_god class.
> + */
> +
> +#include "brw_vec4_gs_god.h"
> +#include "gen6_gs_god.h"
> +
> +const unsigned MAX_GS_INPUT_VERTICES = 6;
> +
> +namespace brw {
> +
> +vec4_gs_god::vec4_gs_god(struct brw_context *brw,
> +                                 struct brw_gs_compile *c,
> +                                 struct gl_shader_program *prog,
> +                                 void *mem_ctx,
> +                                 bool no_spills)
> +   : vec4_god(brw, &c->base, &c->gp->program.Base, &c->key.base,
> +                  &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
> +                  no_spills,
> +                  ST_GS, ST_GS_WRITTEN, ST_GS_RESET),
> +     c(c)
> +{
> +}
> +
> +
> +dst_reg *
> +vec4_gs_god::make_reg_for_system_value(ir_variable *ir)
> +{
> +   dst_reg *reg = new(mem_ctx) dst_reg(this, ir->type);
> +
> +   switch (ir->data.location) {
> +   case SYSTEM_VALUE_INVOCATION_ID:
> +      this->current_annotation = "initialize gl_InvocationID";
> +      emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
> +      break;
> +   default:
> +      unreachable("not reached");
> +   }
> +
> +   return reg;
> +}
> +
> +
> +int
> +vec4_gs_god::setup_varying_inputs(int payload_reg, int *attribute_map,
> +                                      int attributes_per_reg)
> +{
> +   /* For geometry shaders there are N copies of the input attributes, where N
> +    * is the number of input vertices.  attribute_map[BRW_VARYING_SLOT_COUNT *
> +    * i + j] represents attribute j for vertex i.
> +    *
> +    * Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time,
> +    * so the total number of input slots that will be delivered to the GS (and
> +    * thus the stride of the input arrays) is urb_read_length * 2.
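> +    *
> +    * For example, with urb_read_length == 2 the stride is 4 slots, so
> +    * vertex 1's attribute in slot 3 maps to payload slot 4 * 1 + 3 = 7
> +    * (relative to the scaled payload_reg base).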
> +    */
> +   const unsigned num_input_vertices = c->gp->program.VerticesIn;
> +   assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
> +   unsigned input_array_stride = c->prog_data.base.urb_read_length * 2;
> +
> +   for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
> +      int varying = c->input_vue_map.slot_to_varying[slot];
> +      for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) {
> +         attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
> +            attributes_per_reg * payload_reg + input_array_stride * vertex +
> +            slot;
> +      }
> +   }
> +
> +   int regs_used = ALIGN(input_array_stride * num_input_vertices,
> +                         attributes_per_reg) / attributes_per_reg;
> +   return payload_reg + regs_used;
> +}
> +
> +
> +void
> +vec4_gs_god::setup_payload()
> +{
> +   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
> +
> +   /* If we are in dual instanced or single mode, then attributes are going
> +    * to be interleaved, so one register contains two attribute slots.
> +    */
> +   int attributes_per_reg =
> +      c->prog_data.dispatch_mode == GEN7_GS_DISPATCH_MODE_DUAL_OBJECT ? 1 : 2;
> +
> +   /* If a geometry shader tries to read from an input that wasn't written by
> +    * the vertex shader, that produces undefined results, but it shouldn't
> +    * crash anything.  So initialize attribute_map to zeros--that ensures that
> +    * these undefined results are read from r0.
> +    */
> +   memset(attribute_map, 0, sizeof(attribute_map));
> +
> +   int reg = 0;
> +
> +   /* The payload always contains important data in r0, which contains
> +    * the URB handles that are passed on to the URB write at the end
> +    * of the thread.
> +    */
> +   reg++;
> +
> +   /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
> +   if (c->prog_data.include_primitive_id)
> +      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
> +
> +   reg = setup_uniforms(reg);
> +
> +   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
> +
> +   lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1);
> +
> +   this->first_non_payload_grf = reg;
> +}
> +
> +
> +void
> +vec4_gs_god::emit_prolog()
> +{
> +   /* In vertex shaders, r0.2 is guaranteed to be initialized to zero.  In
> +    * geometry shaders, it isn't (it contains a bunch of information we don't
> +    * need, like the input primitive type).  We need r0.2 to be zero in order
> +    * to build scratch read/write messages correctly (otherwise this value
> +    * will be interpreted as a global offset, causing us to do our scratch
> +    * reads/writes to garbage memory).  So just set it to zero at the top of
> +    * the shader.
> +    */
> +   this->current_annotation = "clear r0.2";
> +   dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
> +   vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, 0u);
> +   inst->force_writemask_all = true;
> +
> +   /* Create a virtual register to hold the vertex count */
> +   this->vertex_count = src_reg(this, glsl_type::uint_type);
> +
> +   /* Initialize the vertex_count register to 0 */
> +   this->current_annotation = "initialize vertex_count";
> +   inst = emit(MOV(dst_reg(this->vertex_count), 0u));
> +   inst->force_writemask_all = true;
> +
> +   if (c->control_data_header_size_bits > 0) {
> +      /* Create a virtual register to hold the current set of control data
> +       * bits.
> +       */
> +      this->control_data_bits = src_reg(this, glsl_type::uint_type);
> +
> +      /* If we're outputting more than 32 control data bits, then EmitVertex()
> +       * will set control_data_bits to 0 after emitting the first vertex.
> +       * Otherwise, we need to initialize it to 0 here.
> +       */
> +      if (c->control_data_header_size_bits <= 32) {
> +         this->current_annotation = "initialize control data bits";
> +         inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
> +         inst->force_writemask_all = true;
> +      }
> +   }
> +
> +   /* If the geometry shader uses the gl_PointSize input, we need to fix it up
> +    * to account for the fact that the vertex shader stored it in the w
> +    * component of VARYING_SLOT_PSIZ.
> +    */
> +   if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) {
> +      this->current_annotation = "swizzle gl_PointSize input";
> +      for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) {
> +         dst_reg dst(ATTR,
> +                     BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ);
> +         dst.type = BRW_REGISTER_TYPE_F;
> +         src_reg src(dst);
> +         dst.writemask = WRITEMASK_X;
> +         src.swizzle = BRW_SWIZZLE_WWWW;
> +         inst = emit(MOV(dst, src));
> +
> +         /* In dual instanced dispatch mode, dst has a width of 4, so we need
> +          * to make sure the MOV happens regardless of which channels are
> +          * enabled.
> +          */
> +         inst->force_writemask_all = true;
> +      }
> +   }
> +
> +   this->current_annotation = NULL;
> +}
> +
> +
> +void
> +vec4_gs_god::emit_program_code()
> +{
> +   /* We don't support NV_geometry_program4. */
> +   unreachable("Unreached");
> +}
> +
> +
> +void
> +vec4_gs_god::emit_thread_end()
> +{
> +   if (c->control_data_header_size_bits > 0) {
> +      /* During shader execution, we only ever call emit_control_data_bits()
> +       * just prior to outputting a vertex.  Therefore, the control data bits
> +       * corresponding to the most recently output vertex still need to be
> +       * emitted.
> +       */
> +      current_annotation = "thread end: emit control data bits";
> +      emit_control_data_bits();
> +   }
> +
> +   /* MRF 0 is reserved for the debugger, so start with message header
> +    * in MRF 1.
> +    */
> +   int base_mrf = 1;
> +
> +   current_annotation = "thread end";
> +   dst_reg mrf_reg(MRF, base_mrf);
> +   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> +   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> +   inst->force_writemask_all = true;
> +   emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count);
> +   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> +      emit_shader_time_end();
> +   inst = emit(GS_OPCODE_THREAD_END);
> +   inst->base_mrf = base_mrf;
> +   inst->mlen = 1;
> +}
> +
> +
> +void
> +vec4_gs_god::emit_urb_write_header(int mrf)
> +{
> +   /* The SEND instruction that writes the vertex data to the VUE will use
> +    * per_slot_offset=true, which means that DWORDs 3 and 4 of the message
> +    * header specify an offset (in multiples of 256 bits) into the URB entry
> +    * at which the write should take place.
> +    *
> +    * So we have to prepare a message header with the appropriate offset
> +    * values.
> +    */
> +   dst_reg mrf_reg(MRF, mrf);
> +   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> +   this->current_annotation = "URB write header";
> +   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> +   inst->force_writemask_all = true;
> +   emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
> +        (uint32_t) c->prog_data.output_vertex_size_hwords);
> +}
> +
> +
> +vec4_instruction *
> +vec4_gs_god::emit_urb_write_opcode(bool complete)
> +{
> +   /* We don't care whether the vertex is complete, because in general
> +    * geometry shaders output multiple vertices, and we don't terminate the
> +    * thread until all vertices are complete.
> +    */
> +   (void) complete;
> +
> +   vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
> +   inst->offset = c->prog_data.control_data_header_size_hwords;
> +
> +   /* We need to increment Global Offset by 1 to make room for Broadwell's
> +    * extra "Vertex Count" payload at the beginning of the URB entry.
> +    */
> +   if (brw->gen >= 8)
> +      inst->offset++;
> +
> +   inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
> +   return inst;
> +}
> +
> +
> +int
> +vec4_gs_god::compute_array_stride(ir_dereference_array *ir)
> +{
> +   /* Geometry shader inputs are arrays, but they use an unusual array layout:
> +    * instead of all array elements for a given geometry shader input being
> +    * stored consecutively, all geometry shader inputs are interleaved into
> +    * one giant array.  At this stage of compilation, we assume that the
> +    * stride of the array is BRW_VARYING_SLOT_COUNT.  Later,
> +    * setup_attributes() will remap our accesses to the actual input array.
> +    */
> +   ir_dereference_variable *deref_var = ir->array->as_dereference_variable();
> +   if (deref_var && deref_var->var->data.mode == ir_var_shader_in)
> +      return BRW_VARYING_SLOT_COUNT;
> +   else
> +      return vec4_god::compute_array_stride(ir);
> +}
> +
> +
> +/**
> + * Write out a batch of 32 control data bits from the control_data_bits
> + * register to the URB.
> + *
> + * The current value of the vertex_count register determines which DWORD in
> + * the URB receives the control data bits.  The control_data_bits register is
> + * assumed to contain the correct data for the vertex that was most recently
> + * output, and all previous vertices that share the same DWORD.
> + *
> + * This function takes care of ensuring that if no vertices have been output
> + * yet, no control bits are emitted.
> + */
> +void
> +vec4_gs_god::emit_control_data_bits()
> +{
> +   assert(c->control_data_bits_per_vertex != 0);
> +
> +   /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
> +    * granularity, we need to use two tricks to ensure that the batch of 32
> +    * control data bits is written to the appropriate DWORD in the URB.  To
> +    * select which vec4 we are writing to, we use the "slot {0,1} offset"
> +    * fields of the message header.  To select which DWORD in the vec4 we are
> +    * writing to, we use the channel mask fields of the message header.  To
> +    * avoid penalizing geometry shaders that emit a small number of vertices
> +    * with extra bookkeeping, we only do each of these tricks when
> +    * c->control_data_header_size_bits is large enough to make it
> +    * necessary.
> +    *
> +    * Note: this means that if we're outputting just a single DWORD of control
> +    * data bits, we'll actually replicate it four times since we won't do any
> +    * channel masking.  But that's not a problem since in this case the
> +    * hardware only pays attention to the first DWORD.
> +    */
> +   enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
> +   if (c->control_data_header_size_bits > 32)
> +      urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
> +   if (c->control_data_header_size_bits > 128)
> +      urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
> +
> +   /* If vertex_count is 0, then no control data bits have been accumulated
> +    * yet, so we should do nothing.
> +    */
> +   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
> +   emit(IF(BRW_PREDICATE_NORMAL));
> +   {
> +      /* If we are using either channel masks or a per-slot offset, then we
> +       * need to figure out which DWORD we are trying to write to, using the
> +       * formula:
> +       *
> +       *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
> +       *
> +       * Since bits_per_vertex is a power of two, and is known at compile
> +       * time, this can be optimized to:
> +       *
> +       *     dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
> +       *
> +       * The code below computes the shift count as 6 - _mesa_fls(), and
> +       * _mesa_fls() returns log2(x) + 1 for a power-of-two x, so the two
> +       * expressions agree.  E.g. bits_per_vertex == 2 and vertex_count == 33
> +       * gives dword_index = 32 >> 4 = 2.
> +       */
> +      src_reg dword_index(this, glsl_type::uint_type);
> +      if (urb_write_flags) {
> +         src_reg prev_count(this, glsl_type::uint_type);
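> +         /* prev_count = vertex_count - 1: adding 0xffffffffu subtracts one
> +          * modulo 2^32.
> +          */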
> +         emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
> +         unsigned log2_bits_per_vertex =
> +            _mesa_fls(c->control_data_bits_per_vertex);
> +         emit(SHR(dst_reg(dword_index), prev_count,
> +                  (uint32_t) (6 - log2_bits_per_vertex)));
> +      }
> +
> +      /* Start building the URB write message.  The first MRF gets a copy of
> +       * R0.
> +       */
> +      int base_mrf = 1;
> +      dst_reg mrf_reg(MRF, base_mrf);
> +      src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> +      vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> +      inst->force_writemask_all = true;
> +
> +      if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
> +         /* Set the per-slot offset to dword_index / 4, so that we'll write to
> +          * the appropriate OWORD within the control data header.
> +          */
> +         src_reg per_slot_offset(this, glsl_type::uint_type);
> +         emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
> +         emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
> +      }
> +
> +      if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
> +         /* Set the channel masks to 1 << (dword_index % 4), so that we'll
> +          * write to the appropriate DWORD within the OWORD.  We need to do
> +          * this computation with force_writemask_all, otherwise garbage data
> +          * from invocation 0 might clobber the mask for invocation 1 when
> +          * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
> +          * together.
> +          */
> +         src_reg channel(this, glsl_type::uint_type);
> +         inst = emit(AND(dst_reg(channel), dword_index, 3u));
> +         inst->force_writemask_all = true;
> +         src_reg one(this, glsl_type::uint_type);
> +         inst = emit(MOV(dst_reg(one), 1u));
> +         inst->force_writemask_all = true;
> +         src_reg channel_mask(this, glsl_type::uint_type);
> +         inst = emit(SHL(dst_reg(channel_mask), one, channel));
> +         inst->force_writemask_all = true;
> +         emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
> +                                               channel_mask);
> +         emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
> +      }
> +
> +      /* Store the control data bits in the message payload and send it. */
> +      dst_reg mrf_reg2(MRF, base_mrf + 1);
> +      inst = emit(MOV(mrf_reg2, this->control_data_bits));
> +      inst->force_writemask_all = true;
> +      inst = emit(GS_OPCODE_URB_WRITE);
> +      inst->urb_write_flags = urb_write_flags;
> +      /* We need to increment Global Offset by 256-bits to make room for
> +       * Broadwell's extra "Vertex Count" payload at the beginning of the
> +       * URB entry.  Since this is an OWord message, Global Offset is counted
> +       * in 128-bit units, so we must set it to 2.
> +       */
> +      if (brw->gen >= 8)
> +         inst->offset = 2;
> +      inst->base_mrf = base_mrf;
> +      inst->mlen = 2;
> +   }
> +   emit(BRW_OPCODE_ENDIF);
> +}
> +
> +void
> +vec4_gs_god::set_stream_control_data_bits(unsigned stream_id)
> +{
> +   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
> +
> +   /* Note: we are calling this *before* increasing vertex_count, so
> +    * this->vertex_count == vertex_count - 1 in the formula above.
> +    */
> +
> +   /* Stream mode uses 2 bits per vertex */
> +   assert(c->control_data_bits_per_vertex == 2);
> +
> +   /* Must be a valid stream */
> +   assert(stream_id < MAX_VERTEX_STREAMS);
> +
> +   /* Control data bits are initialized to 0 so we don't have to set any
> +    * bits when sending vertices to stream 0.
> +    */
> +   if (stream_id == 0)
> +      return;
> +
> +   /* reg:sid = stream_id */
> +   src_reg sid(this, glsl_type::uint_type);
> +   emit(MOV(dst_reg(sid), stream_id));
> +
> +   /* reg:shift_count = 2 * (vertex_count - 1) */
> +   src_reg shift_count(this, glsl_type::uint_type);
> +   emit(SHL(dst_reg(shift_count), this->vertex_count, 1u));
> +
> +   /* Note: we're relying on the fact that the GEN SHL instruction only pays
> +    * attention to the lower 5 bits of its second source argument, so on this
> +    * architecture, stream_id << (2 * (vertex_count - 1)) is equivalent to
> +    * stream_id << ((2 * (vertex_count - 1)) % 32).
> +    */
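> +   /* For example, when this->vertex_count is 17 the shift count is 34,
> +    * which the hardware reduces to 34 & 31 == 2, exactly the modulo we
> +    * want.
> +    */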
> +   src_reg mask(this, glsl_type::uint_type);
> +   emit(SHL(dst_reg(mask), sid, shift_count));
> +   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
> +}
> +
> +void
> +vec4_gs_god::visit(ir_emit_vertex *ir)
> +{
> +   this->current_annotation = "emit vertex: safety check";
> +
> +   /* To ensure that we don't output more vertices than the shader specified
> +    * using max_vertices, do the logic inside a conditional of the form "if
> +    * (vertex_count < MAX)"
> +    */
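> +   /* Note that c->gp->program.VerticesOut holds the vertex limit the
> +    * shader declared with layout(max_vertices = N).
> +    */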
> +   unsigned num_output_vertices = c->gp->program.VerticesOut;
> +   emit(CMP(dst_null_d(), this->vertex_count,
> +            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
> +   emit(IF(BRW_PREDICATE_NORMAL));
> +   {
> +      /* If we're outputting 32 control data bits or less, then we can wait
> +       * until the shader is over to output them all.  Otherwise we need to
> +       * output them as we go.  Now is the time to do it, since we're about to
> +       * output the vertex_count'th vertex, so it's guaranteed that the
> +       * control data bits associated with the (vertex_count - 1)th vertex are
> +       * correct.
> +       */
> +      if (c->control_data_header_size_bits > 32) {
> +         this->current_annotation = "emit vertex: emit control data bits";
> +         /* Only emit control data bits if we've finished accumulating a batch
> +          * of 32 bits.  This is the case when:
> +          *
> +          *     (vertex_count * bits_per_vertex) % 32 == 0
> +          *
> +          * (in other words, when the last 5 bits of vertex_count *
> +          * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
> +          * integer n (which is always the case, since bits_per_vertex is
> +          * always 1 or 2), this is equivalent to requiring that the last 5-n
> +          * bits of vertex_count are 0:
> +          *
> +          *     vertex_count & (2^(5-n) - 1) == 0
> +          *
> +          * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
> +          * equivalent to:
> +          *
> +          *     vertex_count & (32 / bits_per_vertex - 1) == 0
> +          */
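> +         /* Concretely, with bits_per_vertex == 2 the mask below is 15, so
> +          * the AND produces zero (and we flush a batch) on every 16th
> +          * vertex.
> +          */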
> +         vec4_instruction *inst =
> +            emit(AND(dst_null_d(), this->vertex_count,
> +                     (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
> +         inst->conditional_mod = BRW_CONDITIONAL_Z;
> +         emit(IF(BRW_PREDICATE_NORMAL));
> +         {
> +            emit_control_data_bits();
> +
> +            /* Reset control_data_bits to 0 so we can start accumulating a new
> +             * batch.
> +             *
> +             * Note: in the case where vertex_count == 0, this neutralizes the
> +             * effect of any call to EndPrimitive() that the shader may have
> +             * made before outputting its first vertex.
> +             */
> +            inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
> +            inst->force_writemask_all = true;
> +         }
> +         emit(BRW_OPCODE_ENDIF);
> +      }
> +
> +      this->current_annotation = "emit vertex: vertex data";
> +      emit_vertex();
> +
> +      /* In stream mode we have to set control data bits for all vertices
> +       * unless we have disabled control data bits completely (which we
> +       * do for GL_POINTS outputs that don't use streams).
> +       */
> +      if (c->control_data_header_size_bits > 0 &&
> +          c->prog_data.control_data_format ==
> +             GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
> +         this->current_annotation = "emit vertex: stream control data bits";
> +         set_stream_control_data_bits(ir->stream_id());
> +      }
> +
> +      this->current_annotation = "emit vertex: increment vertex count";
> +      emit(ADD(dst_reg(this->vertex_count), this->vertex_count,
> +               src_reg(1u)));
> +   }
> +   emit(BRW_OPCODE_ENDIF);
> +
> +   this->current_annotation = NULL;
> +}
> +
> +void
> +vec4_gs_god::visit(ir_end_primitive *)
> +{
> +   /* We can only do EndPrimitive() functionality when the control data
> +    * consists of cut bits.  Fortunately, the only time it isn't is when the
> +    * output type is points, in which case EndPrimitive() is a no-op.
> +    */
> +   if (c->prog_data.control_data_format !=
> +       GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
> +      return;
> +   }
> +
> +   /* Cut bits use one bit per vertex. */
> +   assert(c->control_data_bits_per_vertex == 1);
> +
> +   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
> +    * vertex n, 0 otherwise.  So all we need to do here is mark bit
> +    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
> +    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
> +    * vec4_gs_god::emit_control_data_bits() will take care of the rest.
> +    *
> +    * Note that if EndPrimitive() is called before emitting any vertices, this
> +    * will cause us to set bit 31 of the control_data_bits register to 1.
> +    * That's fine because:
> +    *
> +    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
> +    *   output, so the hardware will ignore cut bit 31.
> +    *
> +    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
> +    *   last vertex, so setting cut bit 31 has no effect (since the primitive
> +    *   is automatically ended when the GS terminates).
> +    *
> +    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
> +    *   control_data_bits register to 0 when the first vertex is emitted.
> +    */
> +
> +   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
> +   src_reg one(this, glsl_type::uint_type);
> +   emit(MOV(dst_reg(one), 1u));
> +   src_reg prev_count(this, glsl_type::uint_type);
> +   emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
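> +   /* Adding 0xffffffff is an unsigned subtract-by-one: prev_count holds
> +    * vertex_count - 1, wrapping to 0xffffffff when vertex_count == 0 and
> +    * thereby selecting bit 31, as discussed above.
> +    */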
> +   src_reg mask(this, glsl_type::uint_type);
> +   /* Note: we're relying on the fact that the GEN SHL instruction only pays
> +    * attention to the lower 5 bits of its second source argument, so on this
> +    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
> +    * ((vertex_count - 1) % 32).
> +    */
> +   emit(SHL(dst_reg(mask), one, prev_count));
> +   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
> +}
> +
> +static const unsigned *
> +generate_assembly(struct brw_context *brw,
> +                  struct gl_shader_program *shader_prog,
> +                  struct gl_program *prog,
> +                  struct brw_vue_prog_data *prog_data,
> +                  void *mem_ctx,
> +                  const cfg_t *cfg,
> +                  unsigned *final_assembly_size)
> +{
> +   vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx,
> +                    INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
> +   return g.generate_assembly(cfg, final_assembly_size);
> +}
> +
> +extern "C" const unsigned *
> +brw_gs_emit(struct brw_context *brw,
> +            struct gl_shader_program *prog,
> +            struct brw_gs_compile *c,
> +            void *mem_ctx,
> +            unsigned *final_assembly_size)
> +{
> +   if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
> +      struct brw_shader *shader =
> +         (brw_shader *) prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
> +
> +      brw_dump_ir("geometry", prog, &shader->base, NULL);
> +   }
> +
> +   if (brw->gen >= 7) {
> +      /* Compile the geometry shader in DUAL_OBJECT dispatch mode if we can
> +       * do so without spilling.  If the GS invocation count is greater than
> +       * 1, then DUAL_OBJECT mode can't be used.
> +       */
> +      if (c->prog_data.invocations <= 1 &&
> +          likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
> +         c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_OBJECT;
> +
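> +         /* Since no_spills is set, run() reports failure rather than
> +          * spilling, which is how we detect that DUAL_OBJECT mode isn't
> +          * viable for this shader.
> +          */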
> +         vec4_gs_god v(brw, c, prog, mem_ctx, true /* no_spills */);
> +         if (v.run()) {
> +            return generate_assembly(brw, prog, &c->gp->program.Base,
> +                                     &c->prog_data.base, mem_ctx, v.cfg,
> +                                     final_assembly_size);
> +         }
> +      }
> +   }
> +
> +   /* Either we failed to compile in DUAL_OBJECT mode (probably because it
> +    * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
> +    * back to DUAL_INSTANCE or SINGLE mode, both of which consume fewer
> +    * registers.
> +    *
> +    * FIXME: Single dispatch mode requires that the driver can handle
> +    * interleaving of input registers, but this is already supported (dual
> +    * instance mode has the same requirement). However, to take full advantage
> +    * of single dispatch mode to reduce register pressure we would also need to
> +    * do interleaved outputs, but currently the vec4 god and generator
> +    * classes do not support this, so at the moment register pressure in
> +    * single and dual instance modes is the same.
> +    *
> +    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
> +    * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
> +    * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
> +    * is also supported. When InstanceCount=1 (one instance per object) software
> +    * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
> +    * the best choice for performance, followed by SINGLE mode."
> +    *
> +    * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
> +    * mode is more performant when invocations > 1. Gen6 only supports
> +    * SINGLE mode.
> +    */
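> +   /* In short: gen < 7 or a single invocation means SINGLE mode; multiple
> +    * invocations on gen7+ mean DUAL_INSTANCE mode.
> +    */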
> +   if (c->prog_data.invocations <= 1 || brw->gen < 7)
> +      c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_SINGLE;
> +   else
> +      c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE;
> +
> +   vec4_gs_god *gs = NULL;
> +   const unsigned *ret = NULL;
> +
> +   if (brw->gen >= 7)
> +      gs = new vec4_gs_god(brw, c, prog, mem_ctx, false /* no_spills */);
> +   else
> +      gs = new gen6_gs_god(brw, c, prog, mem_ctx, false /* no_spills */);
> +
> +   if (!gs->run()) {
> +      prog->LinkStatus = false;
> +      ralloc_strcat(&prog->InfoLog, gs->fail_msg);
> +   } else {
> +      ret = generate_assembly(brw, prog, &c->gp->program.Base,
> +                              &c->prog_data.base, mem_ctx, gs->cfg,
> +                              final_assembly_size);
> +   }
> +
> +   delete gs;
> +   return ret;
> +}
> +
> +
> +} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_god.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_god.h
> new file mode 100644
> index 0000000..18d849e
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_god.h
> @@ -0,0 +1,103 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file brw_vec4_gs_god.h
> + *
> + * Geometry-shader-specific code derived from the vec4_god class.
> + */
> +
> +#ifndef BRW_VEC4_GS_GOD_H
> +#define BRW_VEC4_GS_GOD_H
> +
> +#include "brw_vec4.h"
> +
> +/**
> + * Scratch data used when compiling a GLSL geometry shader.
> + */
> +struct brw_gs_compile
> +{
> +   struct brw_vec4_compile base;
> +   struct brw_gs_prog_key key;
> +   struct brw_gs_prog_data prog_data;
> +   struct brw_vue_map input_vue_map;
> +
> +   struct brw_geometry_program *gp;
> +
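> +   /* Cut bits use 1 control data bit per vertex and stream IDs use 2; a
> +    * value of 0 disables the control data header entirely (e.g. for
> +    * single-stream GL_POINTS output).
> +    */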
> +   unsigned control_data_bits_per_vertex;
> +   unsigned control_data_header_size_bits;
> +};
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +const unsigned *brw_gs_emit(struct brw_context *brw,
> +                            struct gl_shader_program *prog,
> +                            struct brw_gs_compile *c,
> +                            void *mem_ctx,
> +                            unsigned *final_assembly_size);
> +
> +#ifdef __cplusplus
> +} /* extern "C" */
> +#endif
> +
> +#ifdef __cplusplus
> +namespace brw {
> +
> +class vec4_gs_god : public vec4_god
> +{
> +public:
> +   vec4_gs_god(struct brw_context *brw,
> +               struct brw_gs_compile *c,
> +               struct gl_shader_program *prog,
> +               void *mem_ctx,
> +               bool no_spills);
> +
> +protected:
> +   virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
> +   virtual void setup_payload();
> +   virtual void emit_prolog();
> +   virtual void emit_program_code();
> +   virtual void emit_thread_end();
> +   virtual void emit_urb_write_header(int mrf);
> +   virtual vec4_instruction *emit_urb_write_opcode(bool complete);
> +   virtual int compute_array_stride(ir_dereference_array *ir);
> +   virtual void visit(ir_emit_vertex *);
> +   virtual void visit(ir_end_primitive *);
> +
> +protected:
> +   int setup_varying_inputs(int payload_reg, int *attribute_map,
> +                            int attributes_per_reg);
> +   void emit_control_data_bits();
> +   void set_stream_control_data_bits(unsigned stream_id);
> +
> +   src_reg vertex_count;
> +   src_reg control_data_bits;
> +   const struct brw_gs_compile * const c;
> +};
> +
> +} /* namespace brw */
> +#endif /* __cplusplus */
> +
> +#endif /* BRW_VEC4_GS_GOD_H */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> deleted file mode 100644
> index 2002ffd..0000000
> --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
> +++ /dev/null
> @@ -1,706 +0,0 @@
> -/*
> - * Copyright © 2013 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - */
> -
> -/**
> - * \file brw_vec4_gs_visitor.cpp
> - *
> - * Geometry-shader-specific code derived from the vec4_visitor class.
> - */
> -
> -#include "brw_vec4_gs_visitor.h"
> -#include "gen6_gs_visitor.h"
> -
> -const unsigned MAX_GS_INPUT_VERTICES = 6;
> -
> -namespace brw {
> -
> -vec4_gs_visitor::vec4_gs_visitor(struct brw_context *brw,
> -                                 struct brw_gs_compile *c,
> -                                 struct gl_shader_program *prog,
> -                                 void *mem_ctx,
> -                                 bool no_spills)
> -   : vec4_visitor(brw, &c->base, &c->gp->program.Base, &c->key.base,
> -                  &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
> -                  no_spills,
> -                  ST_GS, ST_GS_WRITTEN, ST_GS_RESET),
> -     c(c)
> -{
> -}
> -
> -
> -dst_reg *
> -vec4_gs_visitor::make_reg_for_system_value(ir_variable *ir)
> -{
> -   dst_reg *reg = new(mem_ctx) dst_reg(this, ir->type);
> -
> -   switch (ir->data.location) {
> -   case SYSTEM_VALUE_INVOCATION_ID:
> -      this->current_annotation = "initialize gl_InvocationID";
> -      emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
> -      break;
> -   default:
> -      unreachable("not reached");
> -   }
> -
> -   return reg;
> -}
> -
> -
> -int
> -vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
> -                                      int attributes_per_reg)
> -{
> -   /* For geometry shaders there are N copies of the input attributes, where N
> -    * is the number of input vertices.  attribute_map[BRW_VARYING_SLOT_COUNT *
> -    * i + j] represents attribute j for vertex i.
> -    *
> -    * Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time,
> -    * so the total number of input slots that will be delivered to the GS (and
> -    * thus the stride of the input arrays) is urb_read_length * 2.
> -    */
> -   const unsigned num_input_vertices = c->gp->program.VerticesIn;
> -   assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
> -   unsigned input_array_stride = c->prog_data.base.urb_read_length * 2;
> -
> -   for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
> -      int varying = c->input_vue_map.slot_to_varying[slot];
> -      for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) {
> -         attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
> -            attributes_per_reg * payload_reg + input_array_stride * vertex +
> -            slot;
> -      }
> -   }
> -
> -   int regs_used = ALIGN(input_array_stride * num_input_vertices,
> -                         attributes_per_reg) / attributes_per_reg;
> -   return payload_reg + regs_used;
> -}
> -
> -
> -void
> -vec4_gs_visitor::setup_payload()
> -{
> -   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
> -
> -   /* If we are in dual instanced or single mode, then attributes are going
> -    * to be interleaved, so one register contains two attribute slots.
> -    */
> -   int attributes_per_reg =
> -      c->prog_data.dispatch_mode == GEN7_GS_DISPATCH_MODE_DUAL_OBJECT ? 1 : 2;
> -
> -   /* If a geometry shader tries to read from an input that wasn't written by
> -    * the vertex shader, that produces undefined results, but it shouldn't
> -    * crash anything.  So initialize attribute_map to zeros--that ensures that
> -    * these undefined results are read from r0.
> -    */
> -   memset(attribute_map, 0, sizeof(attribute_map));
> -
> -   int reg = 0;
> -
> -   /* The payload always contains important data in r0, which contains
> -    * the URB handles that are passed on to the URB write at the end
> -    * of the thread.
> -    */
> -   reg++;
> -
> -   /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
> -   if (c->prog_data.include_primitive_id)
> -      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
> -
> -   reg = setup_uniforms(reg);
> -
> -   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
> -
> -   lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1);
> -
> -   this->first_non_payload_grf = reg;
> -}
> -
> -
> -void
> -vec4_gs_visitor::emit_prolog()
> -{
> -   /* In vertex shaders, r0.2 is guaranteed to be initialized to zero.  In
> -    * geometry shaders, it isn't (it contains a bunch of information we don't
> -    * need, like the input primitive type).  We need r0.2 to be zero in order
> -    * to build scratch read/write messages correctly (otherwise this value
> -    * will be interpreted as a global offset, causing us to do our scratch
> -    * reads/writes to garbage memory).  So just set it to zero at the top of
> -    * the shader.
> -    */
> -   this->current_annotation = "clear r0.2";
> -   dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
> -   vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, 0u);
> -   inst->force_writemask_all = true;
> -
> -   /* Create a virtual register to hold the vertex count */
> -   this->vertex_count = src_reg(this, glsl_type::uint_type);
> -
> -   /* Initialize the vertex_count register to 0 */
> -   this->current_annotation = "initialize vertex_count";
> -   inst = emit(MOV(dst_reg(this->vertex_count), 0u));
> -   inst->force_writemask_all = true;
> -
> -   if (c->control_data_header_size_bits > 0) {
> -      /* Create a virtual register to hold the current set of control data
> -       * bits.
> -       */
> -      this->control_data_bits = src_reg(this, glsl_type::uint_type);
> -
> -      /* If we're outputting more than 32 control data bits, then EmitVertex()
> -       * will set control_data_bits to 0 after emitting the first vertex.
> -       * Otherwise, we need to initialize it to 0 here.
> -       */
> -      if (c->control_data_header_size_bits <= 32) {
> -         this->current_annotation = "initialize control data bits";
> -         inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
> -         inst->force_writemask_all = true;
> -      }
> -   }
> -
> -   /* If the geometry shader uses the gl_PointSize input, we need to fix it up
> -    * to account for the fact that the vertex shader stored it in the w
> -    * component of VARYING_SLOT_PSIZ.
> -    */
> -   if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) {
> -      this->current_annotation = "swizzle gl_PointSize input";
> -      for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) {
> -         dst_reg dst(ATTR,
> -                     BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ);
> -         dst.type = BRW_REGISTER_TYPE_F;
> -         src_reg src(dst);
> -         dst.writemask = WRITEMASK_X;
> -         src.swizzle = BRW_SWIZZLE_WWWW;
> -         inst = emit(MOV(dst, src));
> -
> -         /* In dual instanced dispatch mode, dst has a width of 4, so we need
> -          * to make sure the MOV happens regardless of which channels are
> -          * enabled.
> -          */
> -         inst->force_writemask_all = true;
> -      }
> -   }
> -
> -   this->current_annotation = NULL;
> -}
> -
> -
> -void
> -vec4_gs_visitor::emit_program_code()
> -{
> -   /* We don't support NV_geometry_program4. */
> -   unreachable("Unreached");
> -}
> -
> -
> -void
> -vec4_gs_visitor::emit_thread_end()
> -{
> -   if (c->control_data_header_size_bits > 0) {
> -      /* During shader execution, we only ever call emit_control_data_bits()
> -       * just prior to outputting a vertex.  Therefore, the control data bits
> -       * corresponding to the most recently output vertex still need to be
> -       * emitted.
> -       */
> -      current_annotation = "thread end: emit control data bits";
> -      emit_control_data_bits();
> -   }
> -
> -   /* MRF 0 is reserved for the debugger, so start with message header
> -    * in MRF 1.
> -    */
> -   int base_mrf = 1;
> -
> -   current_annotation = "thread end";
> -   dst_reg mrf_reg(MRF, base_mrf);
> -   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> -   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> -   inst->force_writemask_all = true;
> -   emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count);
> -   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> -      emit_shader_time_end();
> -   inst = emit(GS_OPCODE_THREAD_END);
> -   inst->base_mrf = base_mrf;
> -   inst->mlen = 1;
> -}
> -
> -
> -void
> -vec4_gs_visitor::emit_urb_write_header(int mrf)
> -{
> -   /* The SEND instruction that writes the vertex data to the VUE will use
> -    * per_slot_offset=true, which means that DWORDs 3 and 4 of the message
> -    * header specify an offset (in multiples of 256 bits) into the URB entry
> -    * at which the write should take place.
> -    *
> -    * So we have to prepare a message header with the appropriate offset
> -    * values.
> -    */
> -   dst_reg mrf_reg(MRF, mrf);
> -   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> -   this->current_annotation = "URB write header";
> -   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> -   inst->force_writemask_all = true;
> -   emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
> -        (uint32_t) c->prog_data.output_vertex_size_hwords);
> -}
> -
> -
> -vec4_instruction *
> -vec4_gs_visitor::emit_urb_write_opcode(bool complete)
> -{
> -   /* We don't care whether the vertex is complete, because in general
> -    * geometry shaders output multiple vertices, and we don't terminate the
> -    * thread until all vertices are complete.
> -    */
> -   (void) complete;
> -
> -   vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
> -   inst->offset = c->prog_data.control_data_header_size_hwords;
> -
> -   /* We need to increment Global Offset by 1 to make room for Broadwell's
> -    * extra "Vertex Count" payload at the beginning of the URB entry.
> -    */
> -   if (brw->gen >= 8)
> -      inst->offset++;
> -
> -   inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
> -   return inst;
> -}
> -
> -
> -int
> -vec4_gs_visitor::compute_array_stride(ir_dereference_array *ir)
> -{
> -   /* Geometry shader inputs are arrays, but they use an unusual array layout:
> -    * instead of all array elements for a given geometry shader input being
> -    * stored consecutively, all geometry shader inputs are interleaved into
> -    * one giant array.  At this stage of compilation, we assume that the
> -    * stride of the array is BRW_VARYING_SLOT_COUNT.  Later,
> -    * setup_attributes() will remap our accesses to the actual input array.
> -    */
> -   ir_dereference_variable *deref_var = ir->array->as_dereference_variable();
> -   if (deref_var && deref_var->var->data.mode == ir_var_shader_in)
> -      return BRW_VARYING_SLOT_COUNT;
> -   else
> -      return vec4_visitor::compute_array_stride(ir);
> -}
> -
> -
> -/**
> - * Write out a batch of 32 control data bits from the control_data_bits
> - * register to the URB.
> - *
> - * The current value of the vertex_count register determines which DWORD in
> - * the URB receives the control data bits.  The control_data_bits register is
> - * assumed to contain the correct data for the vertex that was most recently
> - * output, and all previous vertices that share the same DWORD.
> - *
> - * This function takes care of ensuring that if no vertices have been output
> - * yet, no control bits are emitted.
> - */
> -void
> -vec4_gs_visitor::emit_control_data_bits()
> -{
> -   assert(c->control_data_bits_per_vertex != 0);
> -
> -   /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
> -    * granularity, we need to use two tricks to ensure that the batch of 32
> -    * control data bits is written to the appropriate DWORD in the URB.  To
> -    * select which vec4 we are writing to, we use the "slot {0,1} offset"
> -    * fields of the message header.  To select which DWORD in the vec4 we are
> -    * writing to, we use the channel mask fields of the message header.  To
> -    * avoid penalizing geometry shaders that emit a small number of vertices
> -    * with extra bookkeeping, we only do each of these tricks when
> -    * c->prog_data.control_data_header_size_bits is large enough to make it
> -    * necessary.
> -    *
> -    * Note: this means that if we're outputting just a single DWORD of control
> -    * data bits, we'll actually replicate it four times since we won't do any
> -    * channel masking.  But that's not a problem since in this case the
> -    * hardware only pays attention to the first DWORD.
> -    */
> -   enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
> -   if (c->control_data_header_size_bits > 32)
> -      urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
> -   if (c->control_data_header_size_bits > 128)
> -      urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
> -
> -   /* If vertex_count is 0, then no control data bits have been accumulated
> -    * yet, so we should do nothing.
> -    */
> -   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
> -   emit(IF(BRW_PREDICATE_NORMAL));
> -   {
> -      /* If we are using either channel masks or a per-slot offset, then we
> -       * need to figure out which DWORD we are trying to write to, using the
> -       * formula:
> -       *
> -       *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
> -       *
> -       * Since bits_per_vertex is a power of two, and is known at compile
> -       * time, this can be optimized to:
> -       *
> -       *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
> -       */
> -      src_reg dword_index(this, glsl_type::uint_type);
> -      if (urb_write_flags) {
> -         src_reg prev_count(this, glsl_type::uint_type);
> -         emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
> -         unsigned log2_bits_per_vertex =
> -            _mesa_fls(c->control_data_bits_per_vertex);
> -         emit(SHR(dst_reg(dword_index), prev_count,
> -                  (uint32_t) (6 - log2_bits_per_vertex)));
> -      }
> -
> -      /* Start building the URB write message.  The first MRF gets a copy of
> -       * R0.
> -       */
> -      int base_mrf = 1;
> -      dst_reg mrf_reg(MRF, base_mrf);
> -      src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
> -      vec4_instruction *inst = emit(MOV(mrf_reg, r0));
> -      inst->force_writemask_all = true;
> -
> -      if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
> -         /* Set the per-slot offset to dword_index / 4, to that we'll write to
> -          * the appropriate OWORD within the control data header.
> -          */
> -         src_reg per_slot_offset(this, glsl_type::uint_type);
> -         emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
> -         emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
> -      }
> -
> -      if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
> -         /* Set the channel masks to 1 << (dword_index % 4), so that we'll
> -          * write to the appropriate DWORD within the OWORD.  We need to do
> -          * this computation with force_writemask_all, otherwise garbage data
> -          * from invocation 0 might clobber the mask for invocation 1 when
> -          * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
> -          * together.
> -          */
> -         src_reg channel(this, glsl_type::uint_type);
> -         inst = emit(AND(dst_reg(channel), dword_index, 3u));
> -         inst->force_writemask_all = true;
> -         src_reg one(this, glsl_type::uint_type);
> -         inst = emit(MOV(dst_reg(one), 1u));
> -         inst->force_writemask_all = true;
> -         src_reg channel_mask(this, glsl_type::uint_type);
> -         inst = emit(SHL(dst_reg(channel_mask), one, channel));
> -         inst->force_writemask_all = true;
> -         emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
> -                                               channel_mask);
> -         emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
> -      }
> -
> -      /* Store the control data bits in the message payload and send it. */
> -      dst_reg mrf_reg2(MRF, base_mrf + 1);
> -      inst = emit(MOV(mrf_reg2, this->control_data_bits));
> -      inst->force_writemask_all = true;
> -      inst = emit(GS_OPCODE_URB_WRITE);
> -      inst->urb_write_flags = urb_write_flags;
> -      /* We need to increment Global Offset by 256-bits to make room for
> -       * Broadwell's extra "Vertex Count" payload at the beginning of the
> -       * URB entry.  Since this is an OWord message, Global Offset is counted
> -       * in 128-bit units, so we must set it to 2.
> -       */
> -      if (brw->gen >= 8)
> -         inst->offset = 2;
> -      inst->base_mrf = base_mrf;
> -      inst->mlen = 2;
> -   }
> -   emit(BRW_OPCODE_ENDIF);
> -}
> -
> -void
> -vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
> -{
> -   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
> -
> -   /* Note: we are calling this *before* increasing vertex_count, so
> -    * this->vertex_count == vertex_count - 1 in the formula above.
> -    */
> -
> -   /* Stream mode uses 2 bits per vertex */
> -   assert(c->control_data_bits_per_vertex == 2);
> -
> -   /* Must be a valid stream */
> -   assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
> -
> -   /* Control data bits are initialized to 0 so we don't have to set any
> -    * bits when sending vertices to stream 0.
> -    */
> -   if (stream_id == 0)
> -      return;
> -
> -   /* reg::sid = stream_id */
> -   src_reg sid(this, glsl_type::uint_type);
> -   emit(MOV(dst_reg(sid), stream_id));
> -
> -   /* reg:shift_count = 2 * (vertex_count - 1) */
> -   src_reg shift_count(this, glsl_type::uint_type);
> -   emit(SHL(dst_reg(shift_count), this->vertex_count, 1u));
> -
> -   /* Note: we're relying on the fact that the GEN SHL instruction only pays
> -    * attention to the lower 5 bits of its second source argument, so on this
> -    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
> -    * stream_id << ((2 * (vertex_count - 1)) % 32).
> -    */
> -   src_reg mask(this, glsl_type::uint_type);
> -   emit(SHL(dst_reg(mask), sid, shift_count));
> -   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
> -}
> -
> -void
> -vec4_gs_visitor::visit(ir_emit_vertex *ir)
> -{
> -   this->current_annotation = "emit vertex: safety check";
> -
> -   /* To ensure that we don't output more vertices than the shader specified
> -    * using max_vertices, do the logic inside a conditional of the form "if
> -    * (vertex_count < MAX)"
> -    */
> -   unsigned num_output_vertices = c->gp->program.VerticesOut;
> -   emit(CMP(dst_null_d(), this->vertex_count,
> -            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
> -   emit(IF(BRW_PREDICATE_NORMAL));
> -   {
> -      /* If we're outputting 32 control data bits or less, then we can wait
> -       * until the shader is over to output them all.  Otherwise we need to
> -       * output them as we go.  Now is the time to do it, since we're about to
> -       * output the vertex_count'th vertex, so it's guaranteed that the
> -       * control data bits associated with the (vertex_count - 1)th vertex are
> -       * correct.
> -       */
> -      if (c->control_data_header_size_bits > 32) {
> -         this->current_annotation = "emit vertex: emit control data bits";
> -         /* Only emit control data bits if we've finished accumulating a batch
> -          * of 32 bits.  This is the case when:
> -          *
> -          *     (vertex_count * bits_per_vertex) % 32 == 0
> -          *
> -          * (in other words, when the last 5 bits of vertex_count *
> -          * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
> -          * integer n (which is always the case, since bits_per_vertex is
> -          * always 1 or 2), this is equivalent to requiring that the last 5-n
> -          * bits of vertex_count are 0:
> -          *
> -          *     vertex_count & (2^(5-n) - 1) == 0
> -          *
> -          * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
> -          * equivalent to:
> -          *
> -          *     vertex_count & (32 / bits_per_vertex - 1) == 0
> -          */
> -         vec4_instruction *inst =
> -            emit(AND(dst_null_d(), this->vertex_count,
> -                     (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
> -         inst->conditional_mod = BRW_CONDITIONAL_Z;
> -         emit(IF(BRW_PREDICATE_NORMAL));
> -         {
> -            emit_control_data_bits();
> -
> -            /* Reset control_data_bits to 0 so we can start accumulating a new
> -             * batch.
> -             *
> -             * Note: in the case where vertex_count == 0, this neutralizes the
> -             * effect of any call to EndPrimitive() that the shader may have
> -             * made before outputting its first vertex.
> -             */
> -            inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
> -            inst->force_writemask_all = true;
> -         }
> -         emit(BRW_OPCODE_ENDIF);
> -      }
> -
> -      this->current_annotation = "emit vertex: vertex data";
> -      emit_vertex();
> -
> -      /* In stream mode we have to set control data bits for all vertices
> -       * unless we have disabled control data bits completely (which we do
> -       * do for GL_POINTS outputs that don't use streams).
> -       */
> -      if (c->control_data_header_size_bits > 0 &&
> -          c->prog_data.control_data_format ==
> -             GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
> -          this->current_annotation = "emit vertex: Stream control data bits";
> -          set_stream_control_data_bits(ir->stream_id());
> -      }
> -
> -      this->current_annotation = "emit vertex: increment vertex count";
> -      emit(ADD(dst_reg(this->vertex_count), this->vertex_count,
> -               src_reg(1u)));
> -   }
> -   emit(BRW_OPCODE_ENDIF);
> -
> -   this->current_annotation = NULL;
> -}
> -
> -void
> -vec4_gs_visitor::visit(ir_end_primitive *)
> -{
> -   /* We can only do EndPrimitive() functionality when the control data
> -    * consists of cut bits.  Fortunately, the only time it isn't is when the
> -    * output type is points, in which case EndPrimitive() is a no-op.
> -    */
> -   if (c->prog_data.control_data_format !=
> -       GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
> -      return;
> -   }
> -
> -   /* Cut bits use one bit per vertex. */
> -   assert(c->control_data_bits_per_vertex == 1);
> -
> -   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
> -    * vertex n, 0 otherwise.  So all we need to do here is mark bit
> -    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
> -    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
> -    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
> -    *
> -    * Note that if EndPrimitve() is called before emitting any vertices, this
> -    * will cause us to set bit 31 of the control_data_bits register to 1.
> -    * That's fine because:
> -    *
> -    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
> -    *   output, so the hardware will ignore cut bit 31.
> -    *
> -    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
> -    *   last vertex, so setting cut bit 31 has no effect (since the primitive
> -    *   is automatically ended when the GS terminates).
> -    *
> -    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
> -    *   control_data_bits register to 0 when the first vertex is emitted.
> -    */
> -
> -   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
> -   src_reg one(this, glsl_type::uint_type);
> -   emit(MOV(dst_reg(one), 1u));
> -   src_reg prev_count(this, glsl_type::uint_type);
> -   emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
> -   src_reg mask(this, glsl_type::uint_type);
> -   /* Note: we're relying on the fact that the GEN SHL instruction only pays
> -    * attention to the lower 5 bits of its second source argument, so on this
> -    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
> -    * ((vertex_count - 1) % 32).
> -    */
> -   emit(SHL(dst_reg(mask), one, prev_count));
> -   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
> -}
> -
> -static const unsigned *
> -generate_assembly(struct brw_context *brw,
> -                  struct gl_shader_program *shader_prog,
> -                  struct gl_program *prog,
> -                  struct brw_vue_prog_data *prog_data,
> -                  void *mem_ctx,
> -                  const cfg_t *cfg,
> -                  unsigned *final_assembly_size)
> -{
> -   vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx,
> -                    INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
> -   return g.generate_assembly(cfg, final_assembly_size);
> -}
> -
> -extern "C" const unsigned *
> -brw_gs_emit(struct brw_context *brw,
> -            struct gl_shader_program *prog,
> -            struct brw_gs_compile *c,
> -            void *mem_ctx,
> -            unsigned *final_assembly_size)
> -{
> -   if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
> -      struct brw_shader *shader =
> -         (brw_shader *) prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
> -
> -      brw_dump_ir("geometry", prog, &shader->base, NULL);
> -   }
> -
> -   if (brw->gen >= 7) {
> -      /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
> -       * so without spilling. If the GS invocations count > 1, then we can't use
> -       * dual object mode.
> -       */
> -      if (c->prog_data.invocations <= 1 &&
> -          likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
> -         c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_OBJECT;
> -
> -         vec4_gs_visitor v(brw, c, prog, mem_ctx, true /* no_spills */);
> -         if (v.run()) {
> -            return generate_assembly(brw, prog, &c->gp->program.Base,
> -                                     &c->prog_data.base, mem_ctx, v.cfg,
> -                                     final_assembly_size);
> -         }
> -      }
> -   }
> -
> -   /* Either we failed to compile in DUAL_OBJECT mode (probably because it
> -    * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
> -    * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
> -    *
> -    * FIXME: Single dispatch mode requires that the driver can handle
> -    * interleaving of input registers, but this is already supported (dual
> -    * instance mode has the same requirement). However, to take full advantage
> -    * of single dispatch mode to reduce register pressure we would also need to
> -    * do interleaved outputs, but currently, the vec4 visitor and generator
> -    * classes do not support this, so at the moment register pressure in
> -    * single and dual instance modes is the same.
> -    *
> -    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
> -    * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
> -    * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
> -    * is also supported. When InstanceCount=1 (one instance per object) software
> -    * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
> -    * the best choice for performance, followed by SINGLE mode."
> -    *
> -    * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
> -    * mode is more performant when invocations > 1. Gen6 only supports
> -    * SINGLE mode.
> -    */
> -   if (c->prog_data.invocations <= 1 || brw->gen < 7)
> -      c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_SINGLE;
> -   else
> -      c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE;
> -
> -   vec4_gs_visitor *gs = NULL;
> -   const unsigned *ret = NULL;
> -
> -   if (brw->gen >= 7)
> -      gs = new vec4_gs_visitor(brw, c, prog, mem_ctx, false /* no_spills */);
> -   else
> -      gs = new gen6_gs_visitor(brw, c, prog, mem_ctx, false /* no_spills */);
> -
> -   if (!gs->run()) {
> -      prog->LinkStatus = false;
> -      ralloc_strcat(&prog->InfoLog, gs->fail_msg);
> -   } else {
> -      ret = generate_assembly(brw, prog, &c->gp->program.Base,
> -                              &c->prog_data.base, mem_ctx, gs->cfg,
> -                              final_assembly_size);
> -   }
> -
> -   delete gs;
> -   return ret;
> -}
> -
> -
> -} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
> deleted file mode 100644
> index bcb5a2b..0000000
> --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
> +++ /dev/null
> @@ -1,103 +0,0 @@
> -/*
> - * Copyright © 2013 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - */
> -
> -/**
> - * \file brw_vec4_gs_visitor.h
> - *
> - * Geometry-shader-specific code derived from the vec4_visitor class.
> - */
> -
> -#ifndef BRW_VEC4_GS_VISITOR_H
> -#define BRW_VEC4_GS_VISITOR_H
> -
> -#include "brw_vec4.h"
> -
> -/**
> - * Scratch data used when compiling a GLSL geometry shader.
> - */
> -struct brw_gs_compile
> -{
> -   struct brw_vec4_compile base;
> -   struct brw_gs_prog_key key;
> -   struct brw_gs_prog_data prog_data;
> -   struct brw_vue_map input_vue_map;
> -
> -   struct brw_geometry_program *gp;
> -
> -   unsigned control_data_bits_per_vertex;
> -   unsigned control_data_header_size_bits;
> -};
> -
> -#ifdef __cplusplus
> -extern "C" {
> -#endif
> -
> -const unsigned *brw_gs_emit(struct brw_context *brw,
> -                            struct gl_shader_program *prog,
> -                            struct brw_gs_compile *c,
> -                            void *mem_ctx,
> -                            unsigned *final_assembly_size);
> -
> -#ifdef __cplusplus
> -} /* extern "C" */
> -#endif
> -
> -#ifdef __cplusplus
> -namespace brw {
> -
> -class vec4_gs_visitor : public vec4_visitor
> -{
> -public:
> -   vec4_gs_visitor(struct brw_context *brw,
> -                   struct brw_gs_compile *c,
> -                   struct gl_shader_program *prog,
> -                   void *mem_ctx,
> -                   bool no_spills);
> -
> -protected:
> -   virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
> -   virtual void setup_payload();
> -   virtual void emit_prolog();
> -   virtual void emit_program_code();
> -   virtual void emit_thread_end();
> -   virtual void emit_urb_write_header(int mrf);
> -   virtual vec4_instruction *emit_urb_write_opcode(bool complete);
> -   virtual int compute_array_stride(ir_dereference_array *ir);
> -   virtual void visit(ir_emit_vertex *);
> -   virtual void visit(ir_end_primitive *);
> -
> -protected:
> -   int setup_varying_inputs(int payload_reg, int *attribute_map,
> -                            int attributes_per_reg);
> -   void emit_control_data_bits();
> -   void set_stream_control_data_bits(unsigned stream_id);
> -
> -   src_reg vertex_count;
> -   src_reg control_data_bits;
> -   const struct brw_gs_compile * const c;
> -};
> -
> -} /* namespace brw */
> -#endif /* __cplusplus */
> -
> -#endif /* BRW_VEC4_GS_VISITOR_H */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
> index 95b9d90..8ef0acb 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
> @@ -228,7 +228,7 @@ vec4_live_variables::~vec4_live_variables()
>   * for register allocation performance.
>   */
>  void
> -vec4_visitor::calculate_live_intervals()
> +vec4_god::calculate_live_intervals()
>  {
>     if (this->live_intervals)
>        return;
> @@ -304,14 +304,14 @@ vec4_visitor::calculate_live_intervals()
>  }
>
>  void
> -vec4_visitor::invalidate_live_intervals()
> +vec4_god::invalidate_live_intervals()
>  {
>     ralloc_free(live_intervals);
>     live_intervals = NULL;
>  }
>
>  int
> -vec4_visitor::var_range_start(unsigned v, unsigned n) const
> +vec4_god::var_range_start(unsigned v, unsigned n) const
>  {
>     int start = INT_MAX;
>
> @@ -322,7 +322,7 @@ vec4_visitor::var_range_start(unsigned v, unsigned n) const
>  }
>
>  int
> -vec4_visitor::var_range_end(unsigned v, unsigned n) const
> +vec4_god::var_range_end(unsigned v, unsigned n) const
>  {
>     int end = INT_MIN;
>
> @@ -333,7 +333,7 @@ vec4_visitor::var_range_end(unsigned v, unsigned n) const
>  }
>
>  bool
> -vec4_visitor::virtual_grf_interferes(int a, int b)
> +vec4_god::virtual_grf_interferes(int a, int b)
>  {
>     return !((var_range_end(4 * alloc.offsets[a], 4 * alloc.sizes[a]) <=
>               var_range_start(4 * alloc.offsets[b], 4 * alloc.sizes[b])) ||
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
> index 3186824..5016a7c 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
> @@ -42,7 +42,7 @@ assign(unsigned int *reg_hw_locations, backend_reg *reg)
>  }
>
>  bool
> -vec4_visitor::reg_allocate_trivial()
> +vec4_god::reg_allocate_trivial()
>  {
>     unsigned int hw_reg_mapping[this->alloc.count];
>     bool virtual_grf_used[this->alloc.count];
> @@ -166,7 +166,7 @@ brw_vec4_alloc_reg_set(struct intel_screen *screen)
>  }
>
>  void
> -vec4_visitor::setup_payload_interference(struct ra_graph *g,
> +vec4_god::setup_payload_interference(struct ra_graph *g,
>                                           int first_payload_node,
>                                           int reg_node_count)
>  {
> @@ -190,7 +190,7 @@ vec4_visitor::setup_payload_interference(struct ra_graph *g,
>  }
>
>  bool
> -vec4_visitor::reg_allocate()
> +vec4_god::reg_allocate()
>  {
>     struct intel_screen *screen = brw->intelScreen;
>     unsigned int hw_reg_mapping[alloc.count];
> @@ -267,7 +267,7 @@ vec4_visitor::reg_allocate()
>  }
>
>  void
> -vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
> +vec4_god::evaluate_spill_costs(float *spill_costs, bool *no_spill)
>  {
>     float loop_scale = 1.0;
>
> @@ -322,7 +322,7 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
>  }
>
>  int
> -vec4_visitor::choose_spill_reg(struct ra_graph *g)
> +vec4_god::choose_spill_reg(struct ra_graph *g)
>  {
>     float spill_costs[this->alloc.count];
>     bool no_spill[this->alloc.count];
> @@ -338,7 +338,7 @@ vec4_visitor::choose_spill_reg(struct ra_graph *g)
>  }
>
>  void
> -vec4_visitor::spill_reg(int spill_reg_nr)
> +vec4_god::spill_reg(int spill_reg_nr)
>  {
>     assert(alloc.sizes[spill_reg_nr] == 1);
>     unsigned int spill_offset = c->last_scratch++;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> deleted file mode 100644
> index 26a3b9f..0000000
> --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> +++ /dev/null
> @@ -1,3658 +0,0 @@
> -/*
> - * Copyright © 2011 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> - * IN THE SOFTWARE.
> - */
> -
> -#include "brw_vec4.h"
> -#include "brw_cfg.h"
> -#include "glsl/ir_uniform.h"
> -#include "program/sampler.h"
> -
> -namespace brw {
> -
> -vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
> -                                   const src_reg &src0, const src_reg &src1,
> -                                   const src_reg &src2)
> -{
> -   this->opcode = opcode;
> -   this->dst = dst;
> -   this->src[0] = src0;
> -   this->src[1] = src1;
> -   this->src[2] = src2;
> -   this->saturate = false;
> -   this->force_writemask_all = false;
> -   this->no_dd_clear = false;
> -   this->no_dd_check = false;
> -   this->writes_accumulator = false;
> -   this->conditional_mod = BRW_CONDITIONAL_NONE;
> -   this->predicate = BRW_PREDICATE_NONE;
> -   this->predicate_inverse = false;
> -   this->target = 0;
> -   this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
> -   this->shadow_compare = false;
> -   this->ir = NULL;
> -   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
> -   this->header_present = false;
> -   this->flag_subreg = 0;
> -   this->mlen = 0;
> -   this->base_mrf = 0;
> -   this->offset = 0;
> -   this->annotation = NULL;
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit(vec4_instruction *inst)
> -{
> -   inst->ir = this->base_ir;
> -   inst->annotation = this->current_annotation;
> -
> -   this->instructions.push_tail(inst);
> -
> -   return inst;
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
> -                          vec4_instruction *new_inst)
> -{
> -   new_inst->ir = inst->ir;
> -   new_inst->annotation = inst->annotation;
> -
> -   inst->insert_before(block, new_inst);
> -
> -   return inst;
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
> -                   const src_reg &src1, const src_reg &src2)
> -{
> -   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
> -}
> -
> -
> -vec4_instruction *
> -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
> -                   const src_reg &src1)
> -{
> -   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
> -{
> -   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
> -{
> -   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit(enum opcode opcode)
> -{
> -   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
> -}
> -
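> -/* Convenience builders for ALU instructions: ALUn defines a factory taking
> - * n sources, the _ACC variants additionally mark the instruction as writing
> - * the accumulator, and the three-source forms require gen6+.
> - */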
> -#define ALU1(op)                                                       \
> -   vec4_instruction *                                                  \
> -   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)           \
> -   {                                                                   \
> -      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
> -   }
> -
> -#define ALU2(op)                                                       \
> -   vec4_instruction *                                                  \
> -   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,           \
> -                    const src_reg &src1)                               \
> -   {                                                                   \
> -      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
> -                                           src0, src1);                 \
> -   }
> -
> -#define ALU2_ACC(op)                                                   \
> -   vec4_instruction *                                                  \
> -   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,           \
> -                    const src_reg &src1)                               \
> -   {                                                                   \
> -      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
> -                       BRW_OPCODE_##op, dst, src0, src1);              \
> -      inst->writes_accumulator = true;                                  \
> -      return inst;                                                      \
> -   }
> -
> -#define ALU3(op)                                                       \
> -   vec4_instruction *                                                  \
> -   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,           \
> -                    const src_reg &src1, const src_reg &src2)          \
> -   {                                                                   \
> -      assert(brw->gen >= 6);                                           \
> -      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,       \
> -                                          src0, src1, src2);           \
> -   }
> -
> -ALU1(NOT)
> -ALU1(MOV)
> -ALU1(FRC)
> -ALU1(RNDD)
> -ALU1(RNDE)
> -ALU1(RNDZ)
> -ALU1(F32TO16)
> -ALU1(F16TO32)
> -ALU2(ADD)
> -ALU2(MUL)
> -ALU2_ACC(MACH)
> -ALU2(AND)
> -ALU2(OR)
> -ALU2(XOR)
> -ALU2(DP3)
> -ALU2(DP4)
> -ALU2(DPH)
> -ALU2(SHL)
> -ALU2(SHR)
> -ALU2(ASR)
> -ALU3(LRP)
> -ALU1(BFREV)
> -ALU3(BFE)
> -ALU2(BFI1)
> -ALU3(BFI2)
> -ALU1(FBH)
> -ALU1(FBL)
> -ALU1(CBIT)
> -ALU3(MAD)
> -ALU2_ACC(ADDC)
> -ALU2_ACC(SUBB)
> -ALU2(MAC)
> -
> -/** Gen4 predicated IF. */
> -vec4_instruction *
> -vec4_visitor::IF(enum brw_predicate predicate)
> -{
> -   vec4_instruction *inst;
> -
> -   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
> -   inst->predicate = predicate;
> -
> -   return inst;
> -}
> -
> -/** Gen6 IF with embedded comparison. */
> -vec4_instruction *
> -vec4_visitor::IF(src_reg src0, src_reg src1,
> -                 enum brw_conditional_mod condition)
> -{
> -   assert(brw->gen == 6);
> -
> -   vec4_instruction *inst;
> -
> -   resolve_ud_negate(&src0);
> -   resolve_ud_negate(&src1);
> -
> -   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
> -                                       src0, src1);
> -   inst->conditional_mod = condition;
> -
> -   return inst;
> -}
> -
> -/**
> - * CMP: Sets the low bit of the destination channels with the result
> - * of the comparison, while the upper bits are undefined, and updates
> - * the flag register with the packed 16 bits of the result.
> - */
> -vec4_instruction *
> -vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
> -                  enum brw_conditional_mod condition)
> -{
> -   vec4_instruction *inst;
> -
> -   /* Take the instruction:
> -    *
> -    * CMP null<d> src0<f> src1<f>
> -    *
> -    * Original gen4 does type conversion to the destination type before
> -    * comparison, producing garbage results for floating point comparisons.
> -    *
> -    * The destination type doesn't matter on newer generations, so we set the
> -    * type to match src0 so we can compact the instruction.
> -    */
> -   dst.type = src0.type;
> -   if (dst.file == HW_REG)
> -      dst.fixed_hw_reg.type = dst.type;
> -
> -   resolve_ud_negate(&src0);
> -   resolve_ud_negate(&src1);
> -
> -   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
> -   inst->conditional_mod = condition;
> -
> -   return inst;
> -}
> -
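> -/* Gen4-style scratch access: the message is built in fixed MRFs starting at
> - * base_mrf, with mlen covering the header and payload registers.
> - */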
> -vec4_instruction *
> -vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
> -{
> -   vec4_instruction *inst;
> -
> -   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
> -                                       dst, index);
> -   inst->base_mrf = 14;
> -   inst->mlen = 2;
> -
> -   return inst;
> -}
> -
> -vec4_instruction *
> -vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
> -                            const src_reg &index)
> -{
> -   vec4_instruction *inst;
> -
> -   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
> -                                       dst, src, index);
> -   inst->base_mrf = 13;
> -   inst->mlen = 3;
> -
> -   return inst;
> -}
> -
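> -/** Emit a 2-, 3- or 4-component dot product, selecting DP2/DP3/DP4. */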
> -void
> -vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
> -{
> -   static enum opcode dot_opcodes[] = {
> -      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
> -   };
> -
> -   emit(dot_opcodes[elements - 2], dst, src0, src1);
> -}
> -
> -src_reg
> -vec4_visitor::fix_3src_operand(src_reg src)
> -{
> -   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
> -    * able to use vertical stride of zero to replicate the vec4 uniform, like
> -    *
> -    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
> -    *
> -    * But you can't, since vertical stride is always four in three-source
> -    * instructions. Instead, insert a MOV instruction to do the replication so
> -    * that the three-source instruction can consume it.
> -    */
> -
> -   /* The MOV is only needed if the source is a uniform or immediate. */
> -   if (src.file != UNIFORM && src.file != IMM)
> -      return src;
> -
> -   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
> -      return src;
> -
> -   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
> -   expanded.type = src.type;
> -   emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
> -   return src_reg(expanded);
> -}
> -
> -src_reg
> -vec4_visitor::fix_math_operand(src_reg src)
> -{
> -   if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
> -      return src;
> -
> -   /* The gen6 math instruction ignores the source modifiers --
> -    * swizzle, abs, negate, and at least some parts of the register
> -    * region description.
> -    *
> -    * Rather than trying to enumerate all these cases, *always* expand the
> -    * operand to a temp GRF for gen6.
> -    *
> -    * For gen7, keep the operand as-is, except if immediate, which gen7 still
> -    * can't use.
> -    */
> -
> -   if (brw->gen == 7 && src.file != IMM)
> -      return src;
> -
> -   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
> -   expanded.type = src.type;
> -   emit(MOV(expanded, src));
> -   return src_reg(expanded);
> -}
> -
> -void
> -vec4_visitor::emit_math(enum opcode opcode,
> -                        const dst_reg &dst,
> -                        const src_reg &src0, const src_reg &src1)
> -{
> -   vec4_instruction *math =
> -      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
> -
> -   if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
> -      /* MATH on Gen6 must be align1, so we can't do writemasks. */
> -      math->dst = dst_reg(this, glsl_type::vec4_type);
> -      math->dst.type = dst.type;
> -      emit(MOV(dst, src_reg(math->dst)));
> -   } else if (brw->gen < 6) {
> -      math->base_mrf = 1;
> -      math->mlen = src1.file == BAD_FILE ? 1 : 2;
> -   }
> -}
> -
> -void
> -vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
> -{
> -   if (brw->gen < 7) {
> -      unreachable("ir_unop_pack_half_2x16 should be lowered");
> -   }
> -
> -   assert(dst.type == BRW_REGISTER_TYPE_UD);
> -   assert(src0.type == BRW_REGISTER_TYPE_F);
> -
> -   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
> -    *
> -    *   Because this instruction does not have a 16-bit floating-point type,
> -    *   the destination data type must be Word (W).
> -    *
> -    *   The destination must be DWord-aligned and specify a horizontal stride
> -    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
> -    *   each destination channel and the upper word is not modified.
> -    *
> -    * The above restriction implies that the f32to16 instruction must use
> -    * align1 mode, because only in align1 mode is it possible to specify
> -    * horizontal stride.  We choose here to defy the hardware docs and emit
> -    * align16 instructions.
> -    *
> -    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
> -    * instructions. I was partially successful in that the code passed all
> -    * tests.  However, the code was dubiously correct and fragile, and the
> -    * tests were not harsh enough to probe that frailty. Not trusting the
> -    * code, I chose instead to remain in align16 mode in defiance of the hw
> -    * docs).
> -    *
> -    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
> -    * simulator, emitting a f32to16 in align16 mode with UD as destination
> -    * data type is safe. The behavior differs from that specified in the PRM
> -    * in that the upper word of each destination channel is cleared to 0.
> -    */
> -
> -   dst_reg tmp_dst(this, glsl_type::uvec2_type);
> -   src_reg tmp_src(tmp_dst);
> -
> -#if 0
> -   /* Verify the undocumented behavior on which the following instructions
> -    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
> -    * then the result of the bit-or instruction below will be incorrect.
> -    *
> -    * You should inspect the disasm output in order to verify that the MOV is
> -    * not optimized away.
> -    */
> -   emit(MOV(tmp_dst, src_reg(0x12345678u)));
> -#endif
> -
> -   /* Give tmp the form below, where "." means untouched.
> -    *
> -    *     w z          y          x w z          y          x
> -    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
> -    *
> -    * That the upper word of each write-channel be 0 is required for the
> -    * following bit-shift and bit-or instructions to work. Note that this
> -    * relies on the undocumented hardware behavior mentioned above.
> -    */
> -   tmp_dst.writemask = WRITEMASK_XY;
> -   emit(F32TO16(tmp_dst, src0));
> -
> -   /* Give the write-channels of dst the form:
> -    *   0xhhhh0000
> -    */
> -   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
> -   emit(SHL(dst, tmp_src, src_reg(16u)));
> -
> -   /* Finally, give the write-channels of dst the form of packHalf2x16's
> -    * output:
> -    *   0xhhhhllll
> -    */
> -   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
> -   emit(OR(dst, src_reg(dst), tmp_src));
> -}
> -
> -void
> -vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
> -{
> -   if (brw->gen < 7) {
> -      unreachable("ir_unop_unpack_half_2x16 should be lowered");
> -   }
> -
> -   assert(dst.type == BRW_REGISTER_TYPE_F);
> -   assert(src0.type == BRW_REGISTER_TYPE_UD);
> -
> -   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
> -    *
> -    *   Because this instruction does not have a 16-bit floating-point type,
> -    *   the source data type must be Word (W). The destination type must be
> -    *   F (Float).
> -    *
> -    * To use W as the source data type, we must adjust horizontal strides,
> -    * which is only possible in align1 mode. All my [chadv] attempts at
> -    * emitting align1 instructions for unpackHalf2x16 failed to pass the
> -    * Piglit tests, so I gave up.
> -    *
> -    * I've verified that, on gen7 hardware and the simulator, it is safe to
> -    * emit f16to32 in align16 mode with UD as source data type.
> -    */
> -
> -   dst_reg tmp_dst(this, glsl_type::uvec2_type);
> -   src_reg tmp_src(tmp_dst);
> -
> -   tmp_dst.writemask = WRITEMASK_X;
> -   emit(AND(tmp_dst, src0, src_reg(0xffffu)));
> -
> -   tmp_dst.writemask = WRITEMASK_Y;
> -   emit(SHR(tmp_dst, src0, src_reg(16u)));
> -
> -   dst.writemask = WRITEMASK_XY;
> -   emit(F16TO32(dst, tmp_src));
> -}
> -
> -void
> -vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
> -{
> -   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
> -    * together, we can shift each byte into place with a single <0, 8, 16, 24>
> -    * shift. A packed integer immediate can't encode those shift values, but a
> -    * packed vector-float immediate plus a type-converting MOV can.
> -    */
> -   dst_reg shift(this, glsl_type::uvec4_type);
> -   emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
> -
> -   dst_reg shifted(this, glsl_type::uvec4_type);
> -   src0.swizzle = BRW_SWIZZLE_XXXX;
> -   emit(SHR(shifted, src0, src_reg(shift)));
> -
> -   shifted.type = BRW_REGISTER_TYPE_UB;
> -   dst_reg f(this, glsl_type::vec4_type);
> -   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
> -
> -   emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
> -}
> -
> -void
> -vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
> -{
> -   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
> -    * together, we can shift each byte into place with a single <0, 8, 16, 24>
> -    * shift. A packed integer immediate can't encode those shift values, but a
> -    * packed vector-float immediate plus a type-converting MOV can.
> -    */
> -   dst_reg shift(this, glsl_type::uvec4_type);
> -   emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
> -
> -   dst_reg shifted(this, glsl_type::uvec4_type);
> -   src0.swizzle = BRW_SWIZZLE_XXXX;
> -   emit(SHR(shifted, src0, src_reg(shift)));
> -
> -   shifted.type = BRW_REGISTER_TYPE_B;
> -   dst_reg f(this, glsl_type::vec4_type);
> -   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
> -
> -   dst_reg scaled(this, glsl_type::vec4_type);
> -   emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
> -
> -   dst_reg max(this, glsl_type::vec4_type);
> -   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
> -   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
> -}
> -
> -void
> -vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
> -{
> -   dst_reg saturated(this, glsl_type::vec4_type);
> -   vec4_instruction *inst = emit(MOV(saturated, src0));
> -   inst->saturate = true;
> -
> -   dst_reg scaled(this, glsl_type::vec4_type);
> -   emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
> -
> -   dst_reg rounded(this, glsl_type::vec4_type);
> -   emit(RNDE(rounded, src_reg(scaled)));
> -
> -   dst_reg u(this, glsl_type::uvec4_type);
> -   emit(MOV(u, src_reg(rounded)));
> -
> -   src_reg bytes(u);
> -   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
> -}
> -
> -void
> -vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
> -{
> -   dst_reg max(this, glsl_type::vec4_type);
> -   emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
> -
> -   dst_reg min(this, glsl_type::vec4_type);
> -   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
> -
> -   dst_reg scaled(this, glsl_type::vec4_type);
> -   emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
> -
> -   dst_reg rounded(this, glsl_type::vec4_type);
> -   emit(RNDE(rounded, src_reg(scaled)));
> -
> -   dst_reg i(this, glsl_type::ivec4_type);
> -   emit(MOV(i, src_reg(rounded)));
> -
> -   src_reg bytes(i);
> -   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
> -}
> -
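> -/**
> - * Visit each IR instruction in the list, updating base_ir so instructions
> - * emitted along the way are annotated with the statement they came from.
> - */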
> -void
> -vec4_visitor::visit_instructions(const exec_list *list)
> -{
> -   foreach_in_list(ir_instruction, ir, list) {
> -      base_ir = ir;
> -      ir->accept(this);
> -   }
> -}
> -
> -
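> -/**
> - * Return the number of vec4 registers a value of the given GLSL type
> - * occupies.
> - */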
> -static int
> -type_size(const struct glsl_type *type)
> -{
> -   unsigned int i;
> -   int size;
> -
> -   switch (type->base_type) {
> -   case GLSL_TYPE_UINT:
> -   case GLSL_TYPE_INT:
> -   case GLSL_TYPE_FLOAT:
> -   case GLSL_TYPE_BOOL:
> -      if (type->is_matrix()) {
> -        return type->matrix_columns;
> -      } else {
> -        /* Regardless of the size of the vector, it gets a vec4. This is bad
> -         * packing for things like floats, but otherwise arrays become a
> -         * mess.  Hopefully a later pass over the code can pack scalars
> -         * down if appropriate.
> -         */
> -        return 1;
> -      }
> -   case GLSL_TYPE_ARRAY:
> -      assert(type->length > 0);
> -      return type_size(type->fields.array) * type->length;
> -   case GLSL_TYPE_STRUCT:
> -      size = 0;
> -      for (i = 0; i < type->length; i++) {
> -        size += type_size(type->fields.structure[i].type);
> -      }
> -      return size;
> -   case GLSL_TYPE_SAMPLER:
> -      /* Samplers take up no register space, since they're baked in at
> -       * link time.
> -       */
> -      return 0;
> -   case GLSL_TYPE_ATOMIC_UINT:
> -      return 0;
> -   case GLSL_TYPE_IMAGE:
> -   case GLSL_TYPE_VOID:
> -   case GLSL_TYPE_DOUBLE:
> -   case GLSL_TYPE_ERROR:
> -   case GLSL_TYPE_INTERFACE:
> -      unreachable("not reached");
> -   }
> -
> -   return 0;
> -}
> -
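> -/**
> - * Allocate a fresh GRF big enough for a value of the given type and set up
> - * the default swizzle for its width.
> - */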
> -src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
> -{
> -   init();
> -
> -   this->file = GRF;
> -   this->reg = v->alloc.allocate(type_size(type));
> -
> -   if (type->is_array() || type->is_record()) {
> -      this->swizzle = BRW_SWIZZLE_NOOP;
> -   } else {
> -      this->swizzle = brw_swizzle_for_size(type->vector_elements);
> -   }
> -
> -   this->type = brw_type_for_base_type(type);
> -}
> -
> -src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
> -{
> -   assert(size > 0);
> -
> -   init();
> -
> -   this->file = GRF;
> -   this->reg = v->alloc.allocate(type_size(type) * size);
> -
> -   this->swizzle = BRW_SWIZZLE_NOOP;
> -
> -   this->type = brw_type_for_base_type(type);
> -}
> -
> -dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
> -{
> -   init();
> -
> -   this->file = GRF;
> -   this->reg = v->alloc.allocate(type_size(type));
> -
> -   if (type->is_array() || type->is_record()) {
> -      this->writemask = WRITEMASK_XYZW;
> -   } else {
> -      this->writemask = (1 << type->vector_elements) - 1;
> -   }
> -
> -   this->type = brw_type_for_base_type(type);
> -}
> -
> -/* Our support for uniforms is piggy-backed on the struct
> - * gl_fragment_program, because that's where the values actually
> - * get stored, rather than in some global gl_shader_program uniform
> - * store.
> - */
> -void
> -vec4_visitor::setup_uniform_values(ir_variable *ir)
> -{
> -   int namelen = strlen(ir->name);
> -
> -   /* The data for our (non-builtin) uniforms is stored in a series of
> -    * gl_uniform_driver_storage structs for each subcomponent that
> -    * glGetUniformLocation() could name.  We know it's been set up in the same
> -    * order we'd walk the type, so walk the list of storage and find anything
> -    * whose name matches ours exactly or extends it with a struct member
> -    * ('.') or array element ('[') suffix.
> -    */
> -   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
> -      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
> -
> -      if (strncmp(ir->name, storage->name, namelen) != 0 ||
> -          (storage->name[namelen] != 0 &&
> -           storage->name[namelen] != '.' &&
> -           storage->name[namelen] != '[')) {
> -         continue;
> -      }
> -
> -      gl_constant_value *components = storage->storage;
> -      unsigned vector_count = (MAX2(storage->array_elements, 1) *
> -                               storage->type->matrix_columns);
> -
> -      for (unsigned s = 0; s < vector_count; s++) {
> -         assert(uniforms < uniform_array_size);
> -         uniform_vector_size[uniforms] = storage->type->vector_elements;
> -
> -         int i;
> -         for (i = 0; i < uniform_vector_size[uniforms]; i++) {
> -            stage_prog_data->param[uniforms * 4 + i] = components;
> -            components++;
> -         }
> -         for (; i < 4; i++) {
> -            static gl_constant_value zero = { 0.0 };
> -            stage_prog_data->param[uniforms * 4 + i] = &zero;
> -         }
> -
> -         uniforms++;
> -      }
> -   }
> -}
> -
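> -/* Upload one vec4 push constant per enabled user clip plane, recording the
> - * uniform register in userplane[] for later use.
> - */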
> -void
> -vec4_visitor::setup_uniform_clipplane_values()
> -{
> -   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
> -
> -   for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
> -      assert(this->uniforms < uniform_array_size);
> -      this->uniform_vector_size[this->uniforms] = 4;
> -      this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
> -      this->userplane[i].type = BRW_REGISTER_TYPE_F;
> -      for (int j = 0; j < 4; ++j) {
> -         stage_prog_data->param[this->uniforms * 4 + j] =
> -            (gl_constant_value *) &clip_planes[i][j];
> -      }
> -      ++this->uniforms;
> -   }
> -}
> -
> -/* Our support for builtin uniforms is even scarier than non-builtin.
> - * It sits on top of the PROG_STATE_VAR parameters that are
> - * automatically updated from GL context state.
> - */
> -void
> -vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
> -{
> -   const ir_state_slot *const slots = ir->get_state_slots();
> -   assert(slots != NULL);
> -
> -   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
> -      /* This state reference has already been set up by ir_to_mesa,
> -       * but we'll get the same index back here.  We can reference
> -       * ParameterValues directly, since unlike brw_fs.cpp, we never
> -       * add new state references during compile.
> -       */
> -      int index = _mesa_add_state_reference(this->prog->Parameters,
> -                                           (gl_state_index *)slots[i].tokens);
> -      gl_constant_value *values =
> -         &this->prog->Parameters->ParameterValues[index][0];
> -
> -      assert(this->uniforms < uniform_array_size);
> -
> -      for (unsigned j = 0; j < 4; j++)
> -        stage_prog_data->param[this->uniforms * 4 + j] =
> -            &values[GET_SWZ(slots[i].swizzle, j)];
> -
> -      this->uniform_vector_size[this->uniforms] =
> -         (ir->type->is_scalar() || ir->type->is_vector() ||
> -          ir->type->is_matrix() ? ir->type->vector_elements : 4);
> -
> -      this->uniforms++;
> -   }
> -}
> -
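> -/** Look up the register previously allocated for the given variable. */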
> -dst_reg *
> -vec4_visitor::variable_storage(ir_variable *var)
> -{
> -   return (dst_reg *)hash_table_find(this->variable_ht, var);
> -}
> -
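> -/**
> - * Evaluate a boolean rvalue into the flag register, returning in *predicate
> - * the predicate the caller should use to test the result.
> - */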
> -void
> -vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
> -                                     enum brw_predicate *predicate)
> -{
> -   ir_expression *expr = ir->as_expression();
> -
> -   *predicate = BRW_PREDICATE_NORMAL;
> -
> -   if (expr && expr->operation != ir_binop_ubo_load) {
> -      src_reg op[3];
> -      vec4_instruction *inst;
> -
> -      assert(expr->get_num_operands() <= 3);
> -      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> -        expr->operands[i]->accept(this);
> -        op[i] = this->result;
> -
> -        resolve_ud_negate(&op[i]);
> -      }
> -
> -      switch (expr->operation) {
> -      case ir_unop_logic_not:
> -        inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
> -        inst->conditional_mod = BRW_CONDITIONAL_Z;
> -        break;
> -
> -      case ir_binop_logic_xor:
> -         if (brw->gen <= 5) {
> -            src_reg temp = src_reg(this, ir->type);
> -            emit(XOR(dst_reg(temp), op[0], op[1]));
> -            inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> -         } else {
> -            inst = emit(XOR(dst_null_d(), op[0], op[1]));
> -         }
> -         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -        break;
> -
> -      case ir_binop_logic_or:
> -         if (brw->gen <= 5) {
> -            src_reg temp = src_reg(this, ir->type);
> -            emit(OR(dst_reg(temp), op[0], op[1]));
> -            inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> -         } else {
> -            inst = emit(OR(dst_null_d(), op[0], op[1]));
> -         }
> -         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -        break;
> -
> -      case ir_binop_logic_and:
> -         if (brw->gen <= 5) {
> -            src_reg temp = src_reg(this, ir->type);
> -            emit(AND(dst_reg(temp), op[0], op[1]));
> -            inst = emit(AND(dst_null_d(), temp, src_reg(1)));
> -         } else {
> -            inst = emit(AND(dst_null_d(), op[0], op[1]));
> -         }
> -         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -        break;
> -
> -      case ir_unop_f2b:
> -        if (brw->gen >= 6) {
> -           emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> -        } else {
> -           inst = emit(MOV(dst_null_f(), op[0]));
> -           inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -        }
> -        break;
> -
> -      case ir_unop_i2b:
> -        if (brw->gen >= 6) {
> -           emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> -        } else {
> -           inst = emit(MOV(dst_null_d(), op[0]));
> -           inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -        }
> -        break;
> -
> -      case ir_binop_all_equal:
> -         if (brw->gen <= 5) {
> -            resolve_bool_comparison(expr->operands[0], &op[0]);
> -            resolve_bool_comparison(expr->operands[1], &op[1]);
> -         }
> -        inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> -        *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
> -        break;
> -
> -      case ir_binop_any_nequal:
> -         if (brw->gen <= 5) {
> -            resolve_bool_comparison(expr->operands[0], &op[0]);
> -            resolve_bool_comparison(expr->operands[1], &op[1]);
> -         }
> -        inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> -        *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> -        break;
> -
> -      case ir_unop_any:
> -         if (brw->gen <= 5) {
> -            resolve_bool_comparison(expr->operands[0], &op[0]);
> -         }
> -        inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> -        *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> -        break;
> -
> -      case ir_binop_greater:
> -      case ir_binop_gequal:
> -      case ir_binop_less:
> -      case ir_binop_lequal:
> -      case ir_binop_equal:
> -      case ir_binop_nequal:
> -         if (brw->gen <= 5) {
> -            resolve_bool_comparison(expr->operands[0], &op[0]);
> -            resolve_bool_comparison(expr->operands[1], &op[1]);
> -         }
> -        emit(CMP(dst_null_d(), op[0], op[1],
> -                 brw_conditional_for_comparison(expr->operation)));
> -        break;
> -
> -      case ir_triop_csel: {
> -         /* Expand the boolean condition into the flag register. */
> -         inst = emit(MOV(dst_null_d(), op[0]));
> -         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -
> -         /* Select which boolean to return. */
> -         dst_reg temp(this, expr->operands[1]->type);
> -         inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
> -         inst->predicate = BRW_PREDICATE_NORMAL;
> -
> -         /* Expand the result to a condition code. */
> -         inst = emit(MOV(dst_null_d(), src_reg(temp)));
> -         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -         break;
> -      }
> -
> -      default:
> -        unreachable("not reached");
> -      }
> -      return;
> -   }
> -
> -   ir->accept(this);
> -
> -   resolve_ud_negate(&this->result);
> -
> -   vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
> -   inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -}
> -
> -/**
> - * Emit a gen6 IF statement with the comparison folded into the IF
> - * instruction.
> - */
> -void
> -vec4_visitor::emit_if_gen6(ir_if *ir)
> -{
> -   ir_expression *expr = ir->condition->as_expression();
> -
> -   if (expr && expr->operation != ir_binop_ubo_load) {
> -      src_reg op[3];
> -      dst_reg temp;
> -
> -      assert(expr->get_num_operands() <= 3);
> -      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> -        expr->operands[i]->accept(this);
> -        op[i] = this->result;
> -      }
> -
> -      switch (expr->operation) {
> -      case ir_unop_logic_not:
> -        emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
> -        return;
> -
> -      case ir_binop_logic_xor:
> -        emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
> -        return;
> -
> -      case ir_binop_logic_or:
> -        temp = dst_reg(this, glsl_type::bool_type);
> -        emit(OR(temp, op[0], op[1]));
> -        emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> -        return;
> -
> -      case ir_binop_logic_and:
> -        temp = dst_reg(this, glsl_type::bool_type);
> -        emit(AND(temp, op[0], op[1]));
> -        emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> -        return;
> -
> -      case ir_unop_f2b:
> -        emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> -        return;
> -
> -      case ir_unop_i2b:
> -        emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> -        return;
> -
> -      case ir_binop_greater:
> -      case ir_binop_gequal:
> -      case ir_binop_less:
> -      case ir_binop_lequal:
> -      case ir_binop_equal:
> -      case ir_binop_nequal:
> -        emit(IF(op[0], op[1],
> -                brw_conditional_for_comparison(expr->operation)));
> -        return;
> -
> -      case ir_binop_all_equal:
> -        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> -        emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
> -        return;
> -
> -      case ir_binop_any_nequal:
> -        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> -        emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
> -        return;
> -
> -      case ir_unop_any:
> -        emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> -        emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
> -        return;
> -
> -      case ir_triop_csel: {
> -         /* Expand the boolean condition into the flag register. */
> -         vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
> -         inst->conditional_mod = BRW_CONDITIONAL_NZ;
> -
> -         /* Select which boolean to return. */
> -         dst_reg temp(this, expr->operands[1]->type);
> -         inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
> -         inst->predicate = BRW_PREDICATE_NORMAL;
> -
> -         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
> -         return;
> -      }
> -
> -      default:
> -        unreachable("not reached");
> -      }
> -      return;
> -   }
> -
> -   ir->condition->accept(this);
> -
> -   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
> -}
> -
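> -/**
> - * Allocate storage for a variable on its first reference, keyed on the
> - * variable's mode, and record it in variable_ht.
> - */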
> -void
> -vec4_visitor::visit(ir_variable *ir)
> -{
> -   dst_reg *reg = NULL;
> -
> -   if (variable_storage(ir))
> -      return;
> -
> -   switch (ir->data.mode) {
> -   case ir_var_shader_in:
> -      assert(ir->data.location != -1);
> -      reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
> -      break;
> -
> -   case ir_var_shader_out:
> -      assert(ir->data.location != -1);
> -      reg = new(mem_ctx) dst_reg(this, ir->type);
> -
> -      for (int i = 0; i < type_size(ir->type); i++) {
> -        output_reg[ir->data.location + i] = *reg;
> -        output_reg[ir->data.location + i].reg_offset = i;
> -        output_reg[ir->data.location + i].type =
> -            brw_type_for_base_type(ir->type->get_scalar_type());
> -        output_reg_annotation[ir->data.location + i] = ir->name;
> -      }
> -      break;
> -
> -   case ir_var_auto:
> -   case ir_var_temporary:
> -      reg = new(mem_ctx) dst_reg(this, ir->type);
> -      break;
> -
> -   case ir_var_uniform:
> -      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
> -
> -      /* Thanks to the lower_ubo_reference pass, we will see only
> -       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
> -       * variables, so no need for them to be in variable_ht.
> -       *
> -       * Some uniforms, such as samplers and atomic counters, have no actual
> -       * storage, so we should ignore them.
> -       */
> -      if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
> -         return;
> -
> -      /* Track how big the whole uniform variable is, in case we need to put a
> -       * copy of its data into pull constants for array access.
> -       */
> -      assert(this->uniforms < uniform_array_size);
> -      this->uniform_size[this->uniforms] = type_size(ir->type);
> -
> -      if (!strncmp(ir->name, "gl_", 3)) {
> -        setup_builtin_uniform_values(ir);
> -      } else {
> -        setup_uniform_values(ir);
> -      }
> -      break;
> -
> -   case ir_var_system_value:
> -      reg = make_reg_for_system_value(ir);
> -      break;
> -
> -   default:
> -      unreachable("not reached");
> -   }
> -
> -   reg->type = brw_type_for_base_type(ir->type);
> -   hash_table_insert(this->variable_ht, reg, ir);
> -}
> -
> -void
> -vec4_visitor::visit(ir_loop *ir)
> -{
> -   /* We don't want debugging output to print the whole body of the
> -    * loop as the annotation.
> -    */
> -   this->base_ir = NULL;
> -
> -   emit(BRW_OPCODE_DO);
> -
> -   visit_instructions(&ir->body_instructions);
> -
> -   emit(BRW_OPCODE_WHILE);
> -}
> -
> -void
> -vec4_visitor::visit(ir_loop_jump *ir)
> -{
> -   switch (ir->mode) {
> -   case ir_loop_jump::jump_break:
> -      emit(BRW_OPCODE_BREAK);
> -      break;
> -   case ir_loop_jump::jump_continue:
> -      emit(BRW_OPCODE_CONTINUE);
> -      break;
> -   }
> -}
> -
> -
> -void
> -vec4_visitor::visit(ir_function_signature *)
> -{
> -   unreachable("not reached");
> -}
> -
> -void
> -vec4_visitor::visit(ir_function *ir)
> -{
> -   /* Ignore function bodies other than main() -- we shouldn't see calls to
> -    * them since they should all be inlined.
> -    */
> -   if (strcmp(ir->name, "main") == 0) {
> -      const ir_function_signature *sig;
> -      exec_list empty;
> -
> -      sig = ir->matching_signature(NULL, &empty, false);
> -
> -      assert(sig);
> -
> -      visit_instructions(&sig->body);
> -   }
> -}
> -
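> -/**
> - * Try to fold a floating-point add with a multiply operand into a single
> - * MAD, propagating any negate/abs wrapped around the multiply.
> - */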
> -bool
> -vec4_visitor::try_emit_mad(ir_expression *ir)
> -{
> -   /* 3-src instructions were introduced in gen6. */
> -   if (brw->gen < 6)
> -      return false;
> -
> -   /* MAD can only handle floating-point data. */
> -   if (ir->type->base_type != GLSL_TYPE_FLOAT)
> -      return false;
> -
> -   ir_rvalue *nonmul;
> -   ir_expression *mul;
> -   bool mul_negate, mul_abs;
> -
> -   for (int i = 0; i < 2; i++) {
> -      mul_negate = false;
> -      mul_abs = false;
> -
> -      mul = ir->operands[i]->as_expression();
> -      nonmul = ir->operands[1 - i];
> -
> -      if (mul && mul->operation == ir_unop_abs) {
> -         mul = mul->operands[0]->as_expression();
> -         mul_abs = true;
> -      } else if (mul && mul->operation == ir_unop_neg) {
> -         mul = mul->operands[0]->as_expression();
> -         mul_negate = true;
> -      }
> -
> -      if (mul && mul->operation == ir_binop_mul)
> -         break;
> -   }
> -
> -   if (!mul || mul->operation != ir_binop_mul)
> -      return false;
> -
> -   nonmul->accept(this);
> -   src_reg src0 = fix_3src_operand(this->result);
> -
> -   mul->operands[0]->accept(this);
> -   src_reg src1 = fix_3src_operand(this->result);
> -   src1.negate ^= mul_negate;
> -   src1.abs = mul_abs;
> -   if (mul_abs)
> -      src1.negate = false;
> -
> -   mul->operands[1]->accept(this);
> -   src_reg src2 = fix_3src_operand(this->result);
> -   src2.abs = mul_abs;
> -   if (mul_abs)
> -      src2.negate = false;
> -
> -   this->result = src_reg(this, ir->type);
> -   emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
> -
> -   return true;
> -}
> -
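> -/**
> - * b2f of a comparison can skip the usual boolean resolve: emit the CMP
> - * straight into the result, then use an inverse-predicated SEL to replace
> - * the all-ones "true" pattern with 1.0f while keeping the 0.0 produced on
> - * false.
> - */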
> -bool
> -vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
> -{
> -   /* This optimization relies on CMP setting the destination to 0 when
> -    * false.  Early hardware only sets the least significant bit, and
> -    * leaves the other bits undefined.  So we can't use it.
> -    */
> -   if (brw->gen < 6)
> -      return false;
> -
> -   ir_expression *const cmp = ir->operands[0]->as_expression();
> -
> -   if (cmp == NULL)
> -      return false;
> -
> -   switch (cmp->operation) {
> -   case ir_binop_less:
> -   case ir_binop_greater:
> -   case ir_binop_lequal:
> -   case ir_binop_gequal:
> -   case ir_binop_equal:
> -   case ir_binop_nequal:
> -      break;
> -
> -   default:
> -      return false;
> -   }
> -
> -   cmp->operands[0]->accept(this);
> -   const src_reg cmp_src0 = this->result;
> -
> -   cmp->operands[1]->accept(this);
> -   const src_reg cmp_src1 = this->result;
> -
> -   this->result = src_reg(this, ir->type);
> -
> -   emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
> -            brw_conditional_for_comparison(cmp->operation)));
> -
> -   /* If the comparison is false, this->result will just happen to be zero.
> -    */
> -   vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
> -                                       this->result, src_reg(1.0f));
> -   inst->predicate = BRW_PREDICATE_NORMAL;
> -   inst->predicate_inverse = true;
> -
> -   return true;
> -}
> -
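> -/**
> - * MIN/MAX: gen6+ can use a single SEL with a conditional mod; earlier
> - * hardware needs a CMP to set the flag followed by a predicated SEL.
> - */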
> -void
> -vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
> -                          src_reg src0, src_reg src1)
> -{
> -   vec4_instruction *inst;
> -
> -   if (brw->gen >= 6) {
> -      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> -      inst->conditional_mod = conditionalmod;
> -   } else {
> -      emit(CMP(dst, src0, src1, conditionalmod));
> -
> -      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
> -      inst->predicate = BRW_PREDICATE_NORMAL;
> -   }
> -}
> -
> -void
> -vec4_visitor::emit_lrp(const dst_reg &dst,
> -                       const src_reg &x, const src_reg &y, const src_reg &a)
> -{
> -   if (brw->gen >= 6) {
> -      /* Note that the instruction's argument order is reversed from GLSL
> -       * and the IR.
> -       */
> -      emit(LRP(dst,
> -               fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
> -   } else {
> -      /* Earlier generations don't support three source operations, so we
> -       * need to emit x*(1-a) + y*a.
> -       */
> -      dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
> -      dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
> -      dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
> -      y_times_a.writemask           = dst.writemask;
> -      one_minus_a.writemask         = dst.writemask;
> -      x_times_one_minus_a.writemask = dst.writemask;
> -
> -      emit(MUL(y_times_a, y, a));
> -      emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
> -      emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
> -      emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
> -   }
> -}
> -
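> -/**
> - * Translate a GLSL IR expression tree into vec4 instructions, leaving the
> - * value in this->result.
> - */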
> -void
> -vec4_visitor::visit(ir_expression *ir)
> -{
> -   unsigned int operand;
> -   src_reg op[ARRAY_SIZE(ir->operands)];
> -   vec4_instruction *inst;
> -
> -   if (ir->operation == ir_binop_add) {
> -      if (try_emit_mad(ir))
> -        return;
> -   }
> -
> -   if (ir->operation == ir_unop_b2f) {
> -      if (try_emit_b2f_of_compare(ir))
> -        return;
> -   }
> -
> -   /* Storage for our result.  Ideally for an assignment we'd be using
> -    * the actual storage for the result here, instead.
> -    */
> -   dst_reg result_dst(this, ir->type);
> -   src_reg result_src(result_dst);
> -
> -   if (ir->operation == ir_triop_csel) {
> -      ir->operands[1]->accept(this);
> -      op[1] = this->result;
> -      ir->operands[2]->accept(this);
> -      op[2] = this->result;
> -
> -      enum brw_predicate predicate;
> -      emit_bool_to_cond_code(ir->operands[0], &predicate);
> -      inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
> -      inst->predicate = predicate;
> -      this->result = result_src;
> -      return;
> -   }
> -
> -   for (operand = 0; operand < ir->get_num_operands(); operand++) {
> -      this->result.file = BAD_FILE;
> -      ir->operands[operand]->accept(this);
> -      if (this->result.file == BAD_FILE) {
> -        fprintf(stderr, "Failed to get tree for expression operand:\n");
> -        ir->operands[operand]->fprint(stderr);
> -        exit(1);
> -      }
> -      op[operand] = this->result;
> -
> -      /* Matrix expression operands should have been broken down to vector
> -       * operations already.
> -       */
> -      assert(!ir->operands[operand]->type->is_matrix());
> -   }
> -
> -   /* If nothing special happens, this is the result. */
> -   this->result = result_src;
> -
> -   switch (ir->operation) {
> -   case ir_unop_logic_not:
> -      emit(NOT(result_dst, op[0]));
> -      break;
> -   case ir_unop_neg:
> -      op[0].negate = !op[0].negate;
> -      emit(MOV(result_dst, op[0]));
> -      break;
> -   case ir_unop_abs:
> -      op[0].abs = true;
> -      op[0].negate = false;
> -      emit(MOV(result_dst, op[0]));
> -      break;
> -
> -   case ir_unop_sign:
> -      if (ir->type->is_float()) {
> -         /* AND(val, 0x80000000) gives the sign bit.
> -          *
> -          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
> -          * zero.
> -          */
> -         emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> -
> -         op[0].type = BRW_REGISTER_TYPE_UD;
> -         result_dst.type = BRW_REGISTER_TYPE_UD;
> -         emit(AND(result_dst, op[0], src_reg(0x80000000u)));
> -
> -         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
> -         inst->predicate = BRW_PREDICATE_NORMAL;
> -
> -         this->result.type = BRW_REGISTER_TYPE_F;
> -      } else {
> -         /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
> -          *               -> non-negative val generates 0x00000000.
> -          *  Predicated OR sets 1 if val is positive.
> -          */
> -         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
> -
> -         emit(ASR(result_dst, op[0], src_reg(31)));
> -
> -         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
> -         inst->predicate = BRW_PREDICATE_NORMAL;
> -      }
> -      break;
> -
> -   case ir_unop_rcp:
> -      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
> -      break;
> -
> -   case ir_unop_exp2:
> -      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
> -      break;
> -   case ir_unop_log2:
> -      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
> -      break;
> -   case ir_unop_exp:
> -   case ir_unop_log:
> -      unreachable("not reached: should be handled by ir_explog_to_explog2");
> -   case ir_unop_sin:
> -   case ir_unop_sin_reduced:
> -      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
> -      break;
> -   case ir_unop_cos:
> -   case ir_unop_cos_reduced:
> -      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
> -      break;
> -
> -   case ir_unop_dFdx:
> -   case ir_unop_dFdx_coarse:
> -   case ir_unop_dFdx_fine:
> -   case ir_unop_dFdy:
> -   case ir_unop_dFdy_coarse:
> -   case ir_unop_dFdy_fine:
> -      unreachable("derivatives not valid in vertex shader");
> -
> -   case ir_unop_bitfield_reverse:
> -      emit(BFREV(result_dst, op[0]));
> -      break;
> -   case ir_unop_bit_count:
> -      emit(CBIT(result_dst, op[0]));
> -      break;
> -   case ir_unop_find_msb: {
> -      src_reg temp = src_reg(this, glsl_type::uint_type);
> -
> -      inst = emit(FBH(dst_reg(temp), op[0]));
> -      inst->dst.writemask = WRITEMASK_XYZW;
> -
> -      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
> -       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
> -       * subtract the result from 31 to convert the MSB count into an LSB count.
> -       */
> -
> -      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
> -      temp.swizzle = BRW_SWIZZLE_NOOP;
> -      emit(MOV(result_dst, temp));
> -
> -      src_reg src_tmp = src_reg(result_dst);
> -      emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
> -
> -      src_tmp.negate = true;
> -      inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
> -      inst->predicate = BRW_PREDICATE_NORMAL;
> -      break;
> -   }
> -   case ir_unop_find_lsb:
> -      emit(FBL(result_dst, op[0]));
> -      break;
> -   case ir_unop_saturate:
> -      inst = emit(MOV(result_dst, op[0]));
> -      inst->saturate = true;
> -      break;
> -
> -   case ir_unop_noise:
> -      unreachable("not reached: should be handled by lower_noise");
> -
> -   case ir_binop_add:
> -      emit(ADD(result_dst, op[0], op[1]));
> -      break;
> -   case ir_binop_sub:
> -      unreachable("not reached: should be handled by ir_sub_to_add_neg");
> -
> -   case ir_binop_mul:
> -      if (brw->gen < 8 && ir->type->is_integer()) {
> -        /* For integer multiplication, the MUL uses the low 16 bits of one of
> -         * the operands (src0 through SNB, src1 on IVB and later).  The MACH
> -         * accumulates the contribution of the upper 16 bits of that
> -         * operand.  If we can determine that one of the args is in the low
> -         * 16 bits, though, we can just emit a single MUL.
> -         */
> -         if (ir->operands[0]->is_uint16_constant()) {
> -            if (brw->gen < 7)
> -               emit(MUL(result_dst, op[0], op[1]));
> -            else
> -               emit(MUL(result_dst, op[1], op[0]));
> -         } else if (ir->operands[1]->is_uint16_constant()) {
> -            if (brw->gen < 7)
> -               emit(MUL(result_dst, op[1], op[0]));
> -            else
> -               emit(MUL(result_dst, op[0], op[1]));
> -         } else {
> -            struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> -
> -            emit(MUL(acc, op[0], op[1]));
> -            emit(MACH(dst_null_d(), op[0], op[1]));
> -            emit(MOV(result_dst, src_reg(acc)));
> -         }
> -      } else {
> -        emit(MUL(result_dst, op[0], op[1]));
> -      }
> -      break;
> -   case ir_binop_imul_high: {
> -      struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
> -
> -      emit(MUL(acc, op[0], op[1]));
> -      emit(MACH(result_dst, op[0], op[1]));
> -      break;
> -   }
> -   case ir_binop_div:
> -      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
> -      assert(ir->type->is_integer());
> -      emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
> -      break;
> -   case ir_binop_carry: {
> -      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
> -
> -      emit(ADDC(dst_null_ud(), op[0], op[1]));
> -      emit(MOV(result_dst, src_reg(acc)));
> -      break;
> -   }
> -   case ir_binop_borrow: {
> -      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
> -
> -      emit(SUBB(dst_null_ud(), op[0], op[1]));
> -      emit(MOV(result_dst, src_reg(acc)));
> -      break;
> -   }
> -   case ir_binop_mod:
> -      /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
> -      assert(ir->type->is_integer());
> -      emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
> -      break;
> -
> -   case ir_binop_less:
> -   case ir_binop_greater:
> -   case ir_binop_lequal:
> -   case ir_binop_gequal:
> -   case ir_binop_equal:
> -   case ir_binop_nequal: {
> -      if (brw->gen <= 5) {
> -         resolve_bool_comparison(ir->operands[0], &op[0]);
> -         resolve_bool_comparison(ir->operands[1], &op[1]);
> -      }
> -      emit(CMP(result_dst, op[0], op[1],
> -              brw_conditional_for_comparison(ir->operation)));
> -      break;
> -   }
> -
> -   case ir_binop_all_equal:
> -      if (brw->gen <= 5) {
> -         resolve_bool_comparison(ir->operands[0], &op[0]);
> -         resolve_bool_comparison(ir->operands[1], &op[1]);
> -      }
> -
> -      /* "==" operator producing a scalar boolean. */
> -      if (ir->operands[0]->type->is_vector() ||
> -         ir->operands[1]->type->is_vector()) {
> -        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
> -        emit(MOV(result_dst, src_reg(0)));
> -         inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> -        inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
> -      } else {
> -        emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
> -      }
> -      break;
> -   case ir_binop_any_nequal:
> -      if (brw->gen <= 5) {
> -         resolve_bool_comparison(ir->operands[0], &op[0]);
> -         resolve_bool_comparison(ir->operands[1], &op[1]);
> -      }
> -
> -      /* "!=" operator producing a scalar boolean. */
> -      if (ir->operands[0]->type->is_vector() ||
> -         ir->operands[1]->type->is_vector()) {
> -        emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
> -
> -        emit(MOV(result_dst, src_reg(0)));
> -         inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> -        inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> -      } else {
> -        emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
> -      }
> -      break;
> -
> -   case ir_unop_any:
> -      if (brw->gen <= 5) {
> -         resolve_bool_comparison(ir->operands[0], &op[0]);
> -      }
> -      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> -      emit(MOV(result_dst, src_reg(0)));
> -
> -      inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
> -      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
> -      break;
> -
> -   case ir_binop_logic_xor:
> -      emit(XOR(result_dst, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_logic_or:
> -      emit(OR(result_dst, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_logic_and:
> -      emit(AND(result_dst, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_dot:
> -      assert(ir->operands[0]->type->is_vector());
> -      assert(ir->operands[0]->type == ir->operands[1]->type);
> -      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
> -      break;
> -
> -   case ir_unop_sqrt:
> -      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
> -      break;
> -   case ir_unop_rsq:
> -      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
> -      break;
> -
> -   case ir_unop_bitcast_i2f:
> -   case ir_unop_bitcast_u2f:
> -      this->result = op[0];
> -      this->result.type = BRW_REGISTER_TYPE_F;
> -      break;
> -
> -   case ir_unop_bitcast_f2i:
> -      this->result = op[0];
> -      this->result.type = BRW_REGISTER_TYPE_D;
> -      break;
> -
> -   case ir_unop_bitcast_f2u:
> -      this->result = op[0];
> -      this->result.type = BRW_REGISTER_TYPE_UD;
> -      break;
> -
> -   case ir_unop_i2f:
> -   case ir_unop_i2u:
> -   case ir_unop_u2i:
> -   case ir_unop_u2f:
> -   case ir_unop_f2i:
> -   case ir_unop_f2u:
> -      emit(MOV(result_dst, op[0]));
> -      break;
> -   case ir_unop_b2i:
> -      emit(AND(result_dst, op[0], src_reg(1)));
> -      break;
> -   case ir_unop_b2f:
> -      if (brw->gen <= 5) {
> -         resolve_bool_comparison(ir->operands[0], &op[0]);
> -      }
> -      op[0].type = BRW_REGISTER_TYPE_D;
> -      result_dst.type = BRW_REGISTER_TYPE_D;
> -      emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
> -      result_dst.type = BRW_REGISTER_TYPE_F;
> -      break;
> -   case ir_unop_f2b:
> -      emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
> -      break;
> -   case ir_unop_i2b:
> -      emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
> -      break;
> -
> -   case ir_unop_trunc:
> -      emit(RNDZ(result_dst, op[0]));
> -      break;
> -   case ir_unop_ceil: {
> -         src_reg tmp = src_reg(this, ir->type);
> -         op[0].negate = !op[0].negate;
> -         emit(RNDD(dst_reg(tmp), op[0]));
> -         tmp.negate = true;
> -         emit(MOV(result_dst, tmp));
> -      }
> -      break;
> -   case ir_unop_floor:
> -      inst = emit(RNDD(result_dst, op[0]));
> -      break;
> -   case ir_unop_fract:
> -      inst = emit(FRC(result_dst, op[0]));
> -      break;
> -   case ir_unop_round_even:
> -      emit(RNDE(result_dst, op[0]));
> -      break;
> -
> -   case ir_binop_min:
> -      emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
> -      break;
> -   case ir_binop_max:
> -      emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
> -      break;
> -
> -   case ir_binop_pow:
> -      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
> -      break;
> -
> -   case ir_unop_bit_not:
> -      inst = emit(NOT(result_dst, op[0]));
> -      break;
> -   case ir_binop_bit_and:
> -      inst = emit(AND(result_dst, op[0], op[1]));
> -      break;
> -   case ir_binop_bit_xor:
> -      inst = emit(XOR(result_dst, op[0], op[1]));
> -      break;
> -   case ir_binop_bit_or:
> -      inst = emit(OR(result_dst, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_lshift:
> -      inst = emit(SHL(result_dst, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_rshift:
> -      if (ir->type->base_type == GLSL_TYPE_INT)
> -         inst = emit(ASR(result_dst, op[0], op[1]));
> -      else
> -         inst = emit(SHR(result_dst, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_bfm:
> -      emit(BFI1(result_dst, op[0], op[1]));
> -      break;
> -
> -   case ir_binop_ubo_load: {
> -      ir_constant *const_uniform_block = ir->operands[0]->as_constant();
> -      ir_constant *const_offset_ir = ir->operands[1]->as_constant();
> -      unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
> -      src_reg offset;
> -
> -      /* Now, load the vector from that offset. */
> -      assert(ir->type->is_vector() || ir->type->is_scalar());
> -
> -      src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
> -      packed_consts.type = result.type;
> -      src_reg surf_index;
> -
> -      if (const_uniform_block) {
> -         /* The block index is a constant, so just emit the binding table entry
> -          * as an immediate.
> -          */
> -         surf_index = src_reg(prog_data->base.binding_table.ubo_start +
> -                              const_uniform_block->value.u[0]);
> -      } else {
> -         /* The block index is not a constant. Evaluate the index expression
> -          * per-channel and add the base UBO index; the generator will select
> -          * a value from any live channel.
> -          */
> -         surf_index = src_reg(this, glsl_type::uint_type);
> -         emit(ADD(dst_reg(surf_index), op[0],
> -                  src_reg(prog_data->base.binding_table.ubo_start)));
> -
> -         /* Assume this may touch any UBO. It would be nice to provide
> -          * a tighter bound, but the array information is already lowered away.
> -          */
> -         brw_mark_surface_used(&prog_data->base,
> -                               prog_data->base.binding_table.ubo_start +
> -                               shader_prog->NumUniformBlocks - 1);
> -      }
> -
> -      if (const_offset_ir) {
> -         if (brw->gen >= 8) {
> -            /* Store the offset in a GRF so we can send-from-GRF. */
> -            offset = src_reg(this, glsl_type::int_type);
> -            emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
> -         } else {
> -            /* Immediates are fine on older generations since they'll be moved
> -             * to a (potentially fake) MRF at the generator level.
> -             */
> -            offset = src_reg(const_offset / 16);
> -         }
> -      } else {
> -         offset = src_reg(this, glsl_type::uint_type);
> -         emit(SHR(dst_reg(offset), op[1], src_reg(4)));
> -      }
> -
> -      if (brw->gen >= 7) {
> -         dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
> -
> -         /* We have to use a message header on Skylake to get SIMD4x2 mode.
> -          * Reserve space for the register.
> -          */
> -         if (brw->gen >= 9) {
> -            grf_offset.reg_offset++;
> -            alloc.sizes[grf_offset.reg] = 2;
> -         }
> -
> -         grf_offset.type = offset.type;
> -
> -         emit(MOV(grf_offset, offset));
> -
> -         vec4_instruction *pull =
> -            emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
> -                                               dst_reg(packed_consts),
> -                                               surf_index,
> -                                               src_reg(grf_offset)));
> -         pull->mlen = 1;
> -      } else {
> -         vec4_instruction *pull =
> -            emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
> -                                               dst_reg(packed_consts),
> -                                               surf_index,
> -                                               offset));
> -         pull->base_mrf = 14;
> -         pull->mlen = 1;
> -      }
> -
> -      packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> -      packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
> -                                            const_offset % 16 / 4,
> -                                            const_offset % 16 / 4,
> -                                            const_offset % 16 / 4);
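> -
> -      /* For example, a float loaded from byte offset 20 lives in vec4
> -       * slot 1 (20 / 16) at channel (20 % 16) / 4 == 1, so the scalar
> -       * base swizzle XXXX is shifted to YYYY and the move below reads
> -       * .y of the fetched vec4.
> -       */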
> -
> -      /* UBO bools are any nonzero int.  We need to convert them to use the
> -       * value of true stored in ctx->Const.UniformBooleanTrue.
> -       */
> -      if (ir->type->base_type == GLSL_TYPE_BOOL) {
> -         emit(CMP(result_dst, packed_consts, src_reg(0u),
> -                  BRW_CONDITIONAL_NZ));
> -      } else {
> -         emit(MOV(result_dst, packed_consts));
> -      }
> -      break;
> -   }
> -
> -   case ir_binop_vector_extract:
> -      unreachable("should have been lowered by vec_index_to_cond_assign");
> -
> -   case ir_triop_fma:
> -      op[0] = fix_3src_operand(op[0]);
> -      op[1] = fix_3src_operand(op[1]);
> -      op[2] = fix_3src_operand(op[2]);
> -      /* Note that the instruction's argument order is reversed from GLSL
> -       * and the IR.
> -       */
> -      emit(MAD(result_dst, op[2], op[1], op[0]));
> -      break;
> -
> -   case ir_triop_lrp:
> -      emit_lrp(result_dst, op[0], op[1], op[2]);
> -      break;
> -
> -   case ir_triop_csel:
> -      unreachable("already handled above");
> -      break;
> -
> -   case ir_triop_bfi:
> -      op[0] = fix_3src_operand(op[0]);
> -      op[1] = fix_3src_operand(op[1]);
> -      op[2] = fix_3src_operand(op[2]);
> -      emit(BFI2(result_dst, op[0], op[1], op[2]));
> -      break;
> -
> -   case ir_triop_bitfield_extract:
> -      op[0] = fix_3src_operand(op[0]);
> -      op[1] = fix_3src_operand(op[1]);
> -      op[2] = fix_3src_operand(op[2]);
> -      /* Note that the instruction's argument order is reversed from GLSL
> -       * and the IR.
> -       */
> -      emit(BFE(result_dst, op[2], op[1], op[0]));
> -      break;
> -
> -   case ir_triop_vector_insert:
> -      unreachable("should have been lowered by lower_vector_insert");
> -
> -   case ir_quadop_bitfield_insert:
> -      unreachable("not reached: should be handled by "
> -              "bitfield_insert_to_bfm_bfi\n");
> -
> -   case ir_quadop_vector:
> -      unreachable("not reached: should be handled by lower_quadop_vector");
> -
> -   case ir_unop_pack_half_2x16:
> -      emit_pack_half_2x16(result_dst, op[0]);
> -      break;
> -   case ir_unop_unpack_half_2x16:
> -      emit_unpack_half_2x16(result_dst, op[0]);
> -      break;
> -   case ir_unop_unpack_unorm_4x8:
> -      emit_unpack_unorm_4x8(result_dst, op[0]);
> -      break;
> -   case ir_unop_unpack_snorm_4x8:
> -      emit_unpack_snorm_4x8(result_dst, op[0]);
> -      break;
> -   case ir_unop_pack_unorm_4x8:
> -      emit_pack_unorm_4x8(result_dst, op[0]);
> -      break;
> -   case ir_unop_pack_snorm_4x8:
> -      emit_pack_snorm_4x8(result_dst, op[0]);
> -      break;
> -   case ir_unop_pack_snorm_2x16:
> -   case ir_unop_pack_unorm_2x16:
> -   case ir_unop_unpack_snorm_2x16:
> -   case ir_unop_unpack_unorm_2x16:
> -      unreachable("not reached: should be handled by lower_packing_builtins");
> -   case ir_unop_unpack_half_2x16_split_x:
> -   case ir_unop_unpack_half_2x16_split_y:
> -   case ir_binop_pack_half_2x16_split:
> -   case ir_unop_interpolate_at_centroid:
> -   case ir_binop_interpolate_at_sample:
> -   case ir_binop_interpolate_at_offset:
> -      unreachable("not reached: should not occur in vertex shader");
> -   case ir_binop_ldexp:
> -      unreachable("not reached: should be handled by ldexp_to_arith()");
> -   case ir_unop_d2f:
> -   case ir_unop_f2d:
> -   case ir_unop_d2i:
> -   case ir_unop_i2d:
> -   case ir_unop_d2u:
> -   case ir_unop_u2d:
> -   case ir_unop_d2b:
> -   case ir_unop_pack_double_2x32:
> -   case ir_unop_unpack_double_2x32:
> -   case ir_unop_frexp_sig:
> -   case ir_unop_frexp_exp:
> -      unreachable("fp64 todo");
> -   }
> -}
> -
> -
> -void
> -vec4_visitor::visit(ir_swizzle *ir)
> -{
> -   /* Note that this handles only swizzles in expressions, not those on
> -    * the left-hand side of an assignment, which use write masking.  See
> -    * ir_assignment for that.
> -    */
> -   const unsigned swz = brw_compose_swizzle(
> -      brw_swizzle_for_size(ir->type->vector_elements),
> -      BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
> -
> -   ir->val->accept(this);
> -   this->result = swizzle(this->result, swz);
> -}
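> -
> -/* The composition above keeps all four hardware channels defined: e.g.,
> - * reading .yx from a vec2 should yield roughly the swizzle YXXX, since
> - * the size-based swizzle replicates the last meaningful channel.
> - */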
> -
> -void
> -vec4_visitor::visit(ir_dereference_variable *ir)
> -{
> -   const struct glsl_type *type = ir->type;
> -   dst_reg *reg = variable_storage(ir->var);
> -
> -   if (!reg) {
> -      fail("Failed to find variable storage for %s\n", ir->var->name);
> -      this->result = src_reg(brw_null_reg());
> -      return;
> -   }
> -
> -   this->result = src_reg(*reg);
> -
> -   /* System values get their swizzle from the dst_reg writemask */
> -   if (ir->var->data.mode == ir_var_system_value)
> -      return;
> -
> -   if (type->is_scalar() || type->is_vector() || type->is_matrix())
> -      this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
> -}
> -
> -
> -int
> -vec4_visitor::compute_array_stride(ir_dereference_array *ir)
> -{
> -   /* Under normal circumstances array elements are stored consecutively, so
> -    * the stride is equal to the size of the array element.
> -    */
> -   return type_size(ir->type);
> -}
> -
> -
> -void
> -vec4_visitor::visit(ir_dereference_array *ir)
> -{
> -   ir_constant *constant_index;
> -   src_reg src;
> -   int array_stride = compute_array_stride(ir);
> -
> -   constant_index = ir->array_index->constant_expression_value();
> -
> -   ir->array->accept(this);
> -   src = this->result;
> -
> -   if (constant_index) {
> -      src.reg_offset += constant_index->value.i[0] * array_stride;
> -   } else {
> -      /* Variable index array dereference.  It eats the "vec4" of the
> -       * base of the array and an index that offsets the Mesa register
> -       * index.
> -       */
> -      ir->array_index->accept(this);
> -
> -      src_reg index_reg;
> -
> -      if (array_stride == 1) {
> -        index_reg = this->result;
> -      } else {
> -        index_reg = src_reg(this, glsl_type::int_type);
> -
> -        emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
> -      }
> -
> -      if (src.reladdr) {
> -        src_reg temp = src_reg(this, glsl_type::int_type);
> -
> -        emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
> -
> -        index_reg = temp;
> -      }
> -
> -      src.reladdr = ralloc(mem_ctx, src_reg);
> -      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
> -   }
> -
> -   /* If the type is smaller than a vec4, replicate the last channel out. */
> -   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
> -      src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> -   else
> -      src.swizzle = BRW_SWIZZLE_NOOP;
> -   src.type = brw_type_for_base_type(ir->type);
> -
> -   this->result = src;
> -}
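> -
> -/* In the function above, a constant index simply folds into reg_offset,
> - * while a variable index is scaled by the array stride and chained onto
> - * any reladdr already present; nested dereferences like a[i].f[j] thus
> - * accumulate i * stride_a + j * stride_f in a single address register.
> - */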
> -
> -void
> -vec4_visitor::visit(ir_dereference_record *ir)
> -{
> -   unsigned int i;
> -   const glsl_type *struct_type = ir->record->type;
> -   int offset = 0;
> -
> -   ir->record->accept(this);
> -
> -   for (i = 0; i < struct_type->length; i++) {
> -      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
> -        break;
> -      offset += type_size(struct_type->fields.structure[i].type);
> -   }
> -
> -   /* If the type is smaller than a vec4, replicate the last channel out. */
> -   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
> -      this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
> -   else
> -      this->result.swizzle = BRW_SWIZZLE_NOOP;
> -   this->result.type = brw_type_for_base_type(ir->type);
> -
> -   this->result.reg_offset += offset;
> -}
> -
> -/**
> - * We want to be careful in assignment setup to hit the actual storage
> - * instead of potentially using a temporary like we might with the
> - * ir_dereference handler.
> - */
> -static dst_reg
> -get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
> -{
> -   /* The LHS must be a dereference.  If the LHS is a variable indexed array
> -    * access of a vector, it must be separated into a series of conditional
> -    * moves before reaching this point (see ir_vec_index_to_cond_assign).
> -    */
> -   assert(ir->as_dereference());
> -   ir_dereference_array *deref_array = ir->as_dereference_array();
> -   if (deref_array) {
> -      assert(!deref_array->array->type->is_vector());
> -   }
> -
> -   /* Use the rvalue deref handler for the most part.  We'll ignore
> -    * swizzles in it and write swizzles using writemask, though.
> -    */
> -   ir->accept(v);
> -   return dst_reg(v->result);
> -}
> -
> -void
> -vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
> -                              const struct glsl_type *type,
> -                              enum brw_predicate predicate)
> -{
> -   if (type->base_type == GLSL_TYPE_STRUCT) {
> -      for (unsigned int i = 0; i < type->length; i++) {
> -        emit_block_move(dst, src, type->fields.structure[i].type, predicate);
> -      }
> -      return;
> -   }
> -
> -   if (type->is_array()) {
> -      for (unsigned int i = 0; i < type->length; i++) {
> -        emit_block_move(dst, src, type->fields.array, predicate);
> -      }
> -      return;
> -   }
> -
> -   if (type->is_matrix()) {
> -      const struct glsl_type *vec_type;
> -
> -      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
> -                                        type->vector_elements, 1);
> -
> -      for (int i = 0; i < type->matrix_columns; i++) {
> -        emit_block_move(dst, src, vec_type, predicate);
> -      }
> -      return;
> -   }
> -
> -   assert(type->is_scalar() || type->is_vector());
> -
> -   dst->type = brw_type_for_base_type(type);
> -   src->type = dst->type;
> -
> -   dst->writemask = (1 << type->vector_elements) - 1;
> -
> -   src->swizzle = brw_swizzle_for_size(type->vector_elements);
> -
> -   vec4_instruction *inst = emit(MOV(*dst, *src));
> -   inst->predicate = predicate;
> -
> -   dst->reg_offset++;
> -   src->reg_offset++;
> -}
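> -
> -/* The recursion above flattens aggregates into consecutive vec4 moves;
> - * a mat3, for instance, becomes three MOVs with writemask XYZ, with
> - * reg_offset advancing by one vec4 per column.
> - */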
> -
> -
> -/* If the RHS processing resulted in an instruction generating a
> - * temporary value, and it would be easy to rewrite the instruction to
> - * generate its result right into the LHS instead, do so.  This ends
> - * up reliably removing instructions where it can be tricky to do so
> - * later without real UD chain information.
> - */
> -bool
> -vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
> -                                    dst_reg dst,
> -                                    src_reg src,
> -                                    vec4_instruction *pre_rhs_inst,
> -                                    vec4_instruction *last_rhs_inst)
> -{
> -   /* This could be supported, but it would take more smarts. */
> -   if (ir->condition)
> -      return false;
> -
> -   if (pre_rhs_inst == last_rhs_inst)
> -      return false; /* No instructions generated to work with. */
> -
> -   /* Make sure the last instruction generated our source reg. */
> -   if (src.file != GRF ||
> -       src.file != last_rhs_inst->dst.file ||
> -       src.reg != last_rhs_inst->dst.reg ||
> -       src.reg_offset != last_rhs_inst->dst.reg_offset ||
> -       src.reladdr ||
> -       src.abs ||
> -       src.negate ||
> -       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
> -      return false;
> -
> -   /* Check that the last instruction fully initialized the channels
> -    * we want to use, in the order we want to use them.  We could
> -    * potentially reswizzle the operands of many instructions so that
> -    * we could handle out of order channels, but don't yet.
> -    */
> -
> -   for (unsigned i = 0; i < 4; i++) {
> -      if (dst.writemask & (1 << i)) {
> -        if (!(last_rhs_inst->dst.writemask & (1 << i)))
> -           return false;
> -
> -        if (BRW_GET_SWZ(src.swizzle, i) != i)
> -           return false;
> -      }
> -   }
> -
> -   /* Success!  Rewrite the instruction. */
> -   last_rhs_inst->dst.file = dst.file;
> -   last_rhs_inst->dst.reg = dst.reg;
> -   last_rhs_inst->dst.reg_offset = dst.reg_offset;
> -   last_rhs_inst->dst.reladdr = dst.reladdr;
> -   last_rhs_inst->dst.writemask &= dst.writemask;
> -
> -   return true;
> -}
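> -
> -/* For example, in "v.xy = a + b" the ADD that produced the temporary has
> - * its destination rewritten to v with writemask .xy, saving the MOV that
> - * would otherwise copy the temporary into place.
> - */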
> -
> -void
> -vec4_visitor::visit(ir_assignment *ir)
> -{
> -   dst_reg dst = get_assignment_lhs(ir->lhs, this);
> -   enum brw_predicate predicate = BRW_PREDICATE_NONE;
> -
> -   if (!ir->lhs->type->is_scalar() &&
> -       !ir->lhs->type->is_vector()) {
> -      ir->rhs->accept(this);
> -      src_reg src = this->result;
> -
> -      if (ir->condition) {
> -        emit_bool_to_cond_code(ir->condition, &predicate);
> -      }
> -
> -      /* emit_block_move doesn't account for swizzles in the source register.
> -       * This should be ok, since the source register is a structure or an
> -       * array, and those can't be swizzled.  But double-check to be sure.
> -       */
> -      assert(src.swizzle ==
> -             (ir->rhs->type->is_matrix()
> -              ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
> -              : BRW_SWIZZLE_NOOP));
> -
> -      emit_block_move(&dst, &src, ir->rhs->type, predicate);
> -      return;
> -   }
> -
> -   /* Now we're down to just a scalar/vector with writemasks. */
> -   int i;
> -
> -   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
> -   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
> -
> -   ir->rhs->accept(this);
> -
> -   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
> -
> -   int swizzles[4];
> -   int src_chan = 0;
> -
> -   assert(ir->lhs->type->is_vector() ||
> -         ir->lhs->type->is_scalar());
> -   dst.writemask = ir->write_mask;
> -
> -   /* Swizzle a small RHS vector into the channels being written.
> -    *
> -    * GLSL IR treats write_mask as dictating how many channels are
> -    * present on the RHS, while in our instructions we need to make
> -    * those channels appear in the slots of the vec4 they're written to.
> -    */
> -   for (int i = 0; i < 4; i++)
> -      swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
> -
> -   src_reg src = swizzle(this->result,
> -                         BRW_SWIZZLE4(swizzles[0], swizzles[1],
> -                                      swizzles[2], swizzles[3]));
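> -
> -   /* For example, with write_mask .xz the loop above produces the swizzle
> -    * XXYX: RHS channel x lands in dst.x and RHS channel y in dst.z, while
> -    * the unwritten channels are don't-cares.
> -    */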
> -
> -   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
> -      return;
> -   }
> -
> -   if (ir->condition) {
> -      emit_bool_to_cond_code(ir->condition, &predicate);
> -   }
> -
> -   for (i = 0; i < type_size(ir->lhs->type); i++) {
> -      vec4_instruction *inst = emit(MOV(dst, src));
> -      inst->predicate = predicate;
> -
> -      dst.reg_offset++;
> -      src.reg_offset++;
> -   }
> -}
> -
> -void
> -vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
> -{
> -   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
> -      foreach_in_list(ir_constant, field_value, &ir->components) {
> -        emit_constant_values(dst, field_value);
> -      }
> -      return;
> -   }
> -
> -   if (ir->type->is_array()) {
> -      for (unsigned int i = 0; i < ir->type->length; i++) {
> -        emit_constant_values(dst, ir->array_elements[i]);
> -      }
> -      return;
> -   }
> -
> -   if (ir->type->is_matrix()) {
> -      for (int i = 0; i < ir->type->matrix_columns; i++) {
> -        float *vec = &ir->value.f[i * ir->type->vector_elements];
> -
> -        for (int j = 0; j < ir->type->vector_elements; j++) {
> -           dst->writemask = 1 << j;
> -           dst->type = BRW_REGISTER_TYPE_F;
> -
> -           emit(MOV(*dst, src_reg(vec[j])));
> -        }
> -        dst->reg_offset++;
> -      }
> -      return;
> -   }
> -
> -   int remaining_writemask = (1 << ir->type->vector_elements) - 1;
> -
> -   for (int i = 0; i < ir->type->vector_elements; i++) {
> -      if (!(remaining_writemask & (1 << i)))
> -        continue;
> -
> -      dst->writemask = 1 << i;
> -      dst->type = brw_type_for_base_type(ir->type);
> -
> -      /* Find other components that match the one we're about to
> -       * write.  Emits fewer instructions for things like vec4(0.5,
> -       * 1.5, 1.5, 1.5).
> -       */
> -      for (int j = i + 1; j < ir->type->vector_elements; j++) {
> -        if (ir->type->base_type == GLSL_TYPE_BOOL) {
> -           if (ir->value.b[i] == ir->value.b[j])
> -              dst->writemask |= (1 << j);
> -        } else {
> -           /* u, i, and f storage all line up, so no need for a
> -            * switch case for comparing each type.
> -            */
> -           if (ir->value.u[i] == ir->value.u[j])
> -              dst->writemask |= (1 << j);
> -        }
> -      }
> -
> -      switch (ir->type->base_type) {
> -      case GLSL_TYPE_FLOAT:
> -        emit(MOV(*dst, src_reg(ir->value.f[i])));
> -        break;
> -      case GLSL_TYPE_INT:
> -        emit(MOV(*dst, src_reg(ir->value.i[i])));
> -        break;
> -      case GLSL_TYPE_UINT:
> -        emit(MOV(*dst, src_reg(ir->value.u[i])));
> -        break;
> -      case GLSL_TYPE_BOOL:
> -         emit(MOV(*dst,
> -                  src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
> -                                              : 0)));
> -        break;
> -      default:
> -        unreachable("Non-float/uint/int/bool constant");
> -      }
> -
> -      remaining_writemask &= ~dst->writemask;
> -   }
> -   dst->reg_offset++;
> -}
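> -
> -/* E.g. vec4(0.5, 1.5, 1.5, 1.5) compiles to just two MOVs: one with
> - * writemask .x for 0.5 and one with writemask .yzw for 1.5.
> - */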
> -
> -void
> -vec4_visitor::visit(ir_constant *ir)
> -{
> -   dst_reg dst = dst_reg(this, ir->type);
> -   this->result = src_reg(dst);
> -
> -   emit_constant_values(&dst, ir);
> -}
> -
> -void
> -vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
> -{
> -   ir_dereference *deref = static_cast<ir_dereference *>(
> -      ir->actual_parameters.get_head());
> -   ir_variable *location = deref->variable_referenced();
> -   unsigned surf_index = (prog_data->base.binding_table.abo_start +
> -                          location->data.binding);
> -
> -   /* Calculate the surface offset */
> -   src_reg offset(this, glsl_type::uint_type);
> -   ir_dereference_array *deref_array = deref->as_dereference_array();
> -   if (deref_array) {
> -      deref_array->array_index->accept(this);
> -
> -      src_reg tmp(this, glsl_type::uint_type);
> -      emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
> -      emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
> -   } else {
> -      offset = location->data.atomic.offset;
> -   }
> -
> -   /* Emit the appropriate machine instruction */
> -   const char *callee = ir->callee->function_name();
> -   dst_reg dst = get_assignment_lhs(ir->return_deref, this);
> -
> -   if (!strcmp("__intrinsic_atomic_read", callee)) {
> -      emit_untyped_surface_read(surf_index, dst, offset);
> -
> -   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
> -      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
> -                          src_reg(), src_reg());
> -
> -   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
> -      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
> -                          src_reg(), src_reg());
> -   }
> -}
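> -
> -/* The offset computed above is index * ATOMIC_COUNTER_SIZE (one 32-bit
> - * counter) plus the counter's byte offset within its buffer binding,
> - * i.e. the byte address the untyped surface message operates on.
> - */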
> -
> -void
> -vec4_visitor::visit(ir_call *ir)
> -{
> -   const char *callee = ir->callee->function_name();
> -
> -   if (!strcmp("__intrinsic_atomic_read", callee) ||
> -       !strcmp("__intrinsic_atomic_increment", callee) ||
> -       !strcmp("__intrinsic_atomic_predecrement", callee)) {
> -      visit_atomic_counter_intrinsic(ir);
> -   } else {
> -      unreachable("Unsupported intrinsic.");
> -   }
> -}
> -
> -src_reg
> -vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
> -{
> -   vec4_instruction *inst =
> -      new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
> -                                    dst_reg(this, glsl_type::uvec4_type));
> -   inst->base_mrf = 2;
> -   inst->mlen = 1;
> -   inst->src[1] = sampler;
> -
> -   /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
> -   int param_base = inst->base_mrf;
> -   int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
> -   int zero_mask = 0xf & ~coord_mask;
> -
> -   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
> -            coordinate));
> -
> -   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
> -            src_reg(0)));
> -
> -   emit(inst);
> -   return src_reg(inst->dst);
> -}
> -
> -static bool
> -is_high_sampler(struct brw_context *brw, src_reg sampler)
> -{
> -   if (brw->gen < 8 && !brw->is_haswell)
> -      return false;
> -
> -   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
> -}
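> -
> -/* "High" samplers (indices of 16 or more, or any non-constant index)
> - * don't fit in the 4-bit sampler field and can only be selected through
> - * the message header; the texture code below uses this to decide whether
> - * a header is required.
> - */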
> -
> -void
> -vec4_visitor::visit(ir_texture *ir)
> -{
> -   uint32_t sampler =
> -      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
> -
> -   ir_rvalue *nonconst_sampler_index =
> -      _mesa_get_sampler_array_nonconst_index(ir->sampler);
> -
> -   /* Handle non-constant sampler array indexing */
> -   src_reg sampler_reg;
> -   if (nonconst_sampler_index) {
> -      /* The highest sampler which may be used by this operation is
> -       * the last element of the array. Mark it here, because the generator
> -       * doesn't have enough information to determine the bound.
> -       */
> -      uint32_t array_size = ir->sampler->as_dereference_array()
> -         ->array->type->array_size();
> -
> -      uint32_t max_used = sampler + array_size - 1;
> -      if (ir->op == ir_tg4 && brw->gen < 8) {
> -         max_used += prog_data->base.binding_table.gather_texture_start;
> -      } else {
> -         max_used += prog_data->base.binding_table.texture_start;
> -      }
> -
> -      brw_mark_surface_used(&prog_data->base, max_used);
> -
> -      /* Emit code to evaluate the actual indexing expression */
> -      nonconst_sampler_index->accept(this);
> -      dst_reg temp(this, glsl_type::uint_type);
> -      emit(ADD(temp, this->result, src_reg(sampler)))
> -         ->force_writemask_all = true;
> -      sampler_reg = src_reg(temp);
> -   } else {
> -      /* Single sampler, or constant array index; the indexing expression
> -       * is just an immediate.
> -       */
> -      sampler_reg = src_reg(sampler);
> -   }
> -
> -   /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
> -    * emitting anything other than setting up the constant result.
> -    */
> -   if (ir->op == ir_tg4) {
> -      ir_constant *chan = ir->lod_info.component->as_constant();
> -      int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
> -      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
> -         dst_reg result(this, ir->type);
> -         this->result = src_reg(result);
> -         emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
> -         return;
> -      }
> -   }
> -
> -   /* Should be lowered by do_lower_texture_projection */
> -   assert(!ir->projector);
> -
> -   /* Should be lowered */
> -   assert(!ir->offset || !ir->offset->type->is_array());
> -
> -   /* Generate code to compute all the subexpression trees.  This has to be
> -    * done before loading any values into MRFs for the sampler message since
> -    * generating these values may involve SEND messages that need the MRFs.
> -    */
> -   src_reg coordinate;
> -   if (ir->coordinate) {
> -      ir->coordinate->accept(this);
> -      coordinate = this->result;
> -   }
> -
> -   src_reg shadow_comparitor;
> -   if (ir->shadow_comparitor) {
> -      ir->shadow_comparitor->accept(this);
> -      shadow_comparitor = this->result;
> -   }
> -
> -   bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
> -   src_reg offset_value;
> -   if (has_nonconstant_offset) {
> -      ir->offset->accept(this);
> -      offset_value = src_reg(this->result);
> -   }
> -
> -   const glsl_type *lod_type = NULL, *sample_index_type = NULL;
> -   src_reg lod, dPdx, dPdy, sample_index, mcs;
> -   switch (ir->op) {
> -   case ir_tex:
> -      lod = src_reg(0.0f);
> -      lod_type = glsl_type::float_type;
> -      break;
> -   case ir_txf:
> -   case ir_txl:
> -   case ir_txs:
> -      ir->lod_info.lod->accept(this);
> -      lod = this->result;
> -      lod_type = ir->lod_info.lod->type;
> -      break;
> -   case ir_query_levels:
> -      lod = src_reg(0);
> -      lod_type = glsl_type::int_type;
> -      break;
> -   case ir_txf_ms:
> -      ir->lod_info.sample_index->accept(this);
> -      sample_index = this->result;
> -      sample_index_type = ir->lod_info.sample_index->type;
> -
> -      if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
> -         mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
> -      else
> -         mcs = src_reg(0u);
> -      break;
> -   case ir_txd:
> -      ir->lod_info.grad.dPdx->accept(this);
> -      dPdx = this->result;
> -
> -      ir->lod_info.grad.dPdy->accept(this);
> -      dPdy = this->result;
> -
> -      lod_type = ir->lod_info.grad.dPdx->type;
> -      break;
> -   case ir_txb:
> -   case ir_lod:
> -   case ir_tg4:
> -      break;
> -   }
> -
> -   enum opcode opcode;
> -   switch (ir->op) {
> -   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
> -   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
> -   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
> -   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
> -   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
> -   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
> -   case ir_tg4: opcode = has_nonconstant_offset
> -                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
> -   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
> -   case ir_txb:
> -      unreachable("TXB is not valid for vertex shaders.");
> -   case ir_lod:
> -      unreachable("LOD is not valid for vertex shaders.");
> -   default:
> -      unreachable("Unrecognized tex op");
> -   }
> -
> -   vec4_instruction *inst = new(mem_ctx) vec4_instruction(
> -      opcode, dst_reg(this, ir->type));
> -
> -   if (ir->offset != NULL && !has_nonconstant_offset) {
> -      inst->offset =
> -         brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
> -                            ir->offset->type->vector_elements);
> -   }
> -
> -   /* Stuff the channel select bits in the top of the texture offset */
> -   if (ir->op == ir_tg4)
> -      inst->offset |= gather_channel(ir, sampler) << 16;
> -
> -   /* The message header is necessary for:
> -    * - Gen4 (always)
> -    * - Gen9+ for selecting SIMD4x2
> -    * - Texel offsets
> -    * - Gather channel selection
> -    * - Sampler indices too large to fit in a 4-bit value.
> -    */
> -   inst->header_present =
> -      brw->gen < 5 || brw->gen >= 9 ||
> -      inst->offset != 0 || ir->op == ir_tg4 ||
> -      is_high_sampler(brw, sampler_reg);
> -   inst->base_mrf = 2;
> -   inst->mlen = inst->header_present + 1; /* always at least one */
> -   inst->dst.writemask = WRITEMASK_XYZW;
> -   inst->shadow_compare = ir->shadow_comparitor != NULL;
> -
> -   inst->src[1] = sampler_reg;
> -
> -   /* MRF for the first parameter */
> -   int param_base = inst->base_mrf + inst->header_present;
> -
> -   if (ir->op == ir_txs || ir->op == ir_query_levels) {
> -      int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
> -      emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
> -   } else {
> -      /* Load the coordinate */
> -      /* FINISHME: gl_clamp_mask and saturate */
> -      int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
> -      int zero_mask = 0xf & ~coord_mask;
> -
> -      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
> -               coordinate));
> -
> -      if (zero_mask != 0) {
> -         emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
> -                  src_reg(0)));
> -      }
> -      /* Load the shadow comparitor */
> -      if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
> -        emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
> -                         WRITEMASK_X),
> -                 shadow_comparitor));
> -        inst->mlen++;
> -      }
> -
> -      /* Load the LOD info */
> -      if (ir->op == ir_tex || ir->op == ir_txl) {
> -        int mrf, writemask;
> -        if (brw->gen >= 5) {
> -           mrf = param_base + 1;
> -           if (ir->shadow_comparitor) {
> -              writemask = WRITEMASK_Y;
> -              /* mlen already incremented */
> -           } else {
> -              writemask = WRITEMASK_X;
> -              inst->mlen++;
> -           }
> -        } else /* brw->gen == 4 */ {
> -           mrf = param_base;
> -           writemask = WRITEMASK_W;
> -        }
> -        emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
> -      } else if (ir->op == ir_txf) {
> -         emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
> -      } else if (ir->op == ir_txf_ms) {
> -         emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
> -                  sample_index));
> -         if (brw->gen >= 7) {
> -            /* MCS data is in the first channel of `mcs`, but we need to get it into
> -             * the .y channel of the second vec4 of params, so replicate .x across
> -             * the whole vec4 and then mask off everything except .y
> -             */
> -            mcs.swizzle = BRW_SWIZZLE_XXXX;
> -            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
> -                     mcs));
> -         }
> -         inst->mlen++;
> -      } else if (ir->op == ir_txd) {
> -        const glsl_type *type = lod_type;
> -
> -        if (brw->gen >= 5) {
> -           dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
> -           dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
> -           emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
> -           emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
> -           inst->mlen++;
> -
> -           if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
> -              dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
> -              dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
> -              emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
> -              emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
> -              inst->mlen++;
> -
> -               if (ir->shadow_comparitor) {
> -                  emit(MOV(dst_reg(MRF, param_base + 2,
> -                                   ir->shadow_comparitor->type, WRITEMASK_Z),
> -                           shadow_comparitor));
> -               }
> -           }
> -        } else /* brw->gen == 4 */ {
> -           emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
> -           emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
> -           inst->mlen += 2;
> -        }
> -      } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
> -         if (ir->shadow_comparitor) {
> -            emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
> -                     shadow_comparitor));
> -         }
> -
> -         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
> -                  offset_value));
> -         inst->mlen++;
> -      }
> -   }
> -
> -   emit(inst);
> -
> -   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
> -    * spec requires layers.
> -    */
> -   if (ir->op == ir_txs) {
> -      glsl_type const *type = ir->sampler->type;
> -      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
> -          type->sampler_array) {
> -         emit_math(SHADER_OPCODE_INT_QUOTIENT,
> -                   writemask(inst->dst, WRITEMASK_Z),
> -                   src_reg(inst->dst), src_reg(6));
> -      }
> -   }
> -
> -   if (brw->gen == 6 && ir->op == ir_tg4) {
> -      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
> -   }
> -
> -   swizzle_result(ir, src_reg(inst->dst), sampler);
> -}
> -
> -/**
> - * Apply workarounds for Gen6 gather with UINT/SINT
> - */
> -void
> -vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
> -{
> -   if (!wa)
> -      return;
> -
> -   int width = (wa & WA_8BIT) ? 8 : 16;
> -   dst_reg dst_f = dst;
> -   dst_f.type = BRW_REGISTER_TYPE_F;
> -
> -   /* Convert from UNORM to UINT */
> -   emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
> -   emit(MOV(dst, src_reg(dst_f)));
> -
> -   if (wa & WA_SIGN) {
> -      /* Reinterpret the UINT value as a signed INT value by
> -       * shifting the sign bit into place, then shifting back
> -       * preserving sign.
> -       */
> -      emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
> -      emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
> -   }
> -}
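> -
> -/* E.g. for an 8-bit signed format, a gathered byte of 0xff comes back as
> - * UNORM 1.0; the MUL recovers 255, and the SHL/ASR pair by 24 bits
> - * sign-extends it to the intended value -1.
> - */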
> -
> -/**
> - * Set up the gather channel based on the swizzle, for gather4.
> - */
> -uint32_t
> -vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
> -{
> -   ir_constant *chan = ir->lod_info.component->as_constant();
> -   int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
> -   switch (swiz) {
> -      case SWIZZLE_X: return 0;
> -      case SWIZZLE_Y:
> -         /* gather4 sampler is broken for green channel on RG32F --
> -          * we must ask for blue instead.
> -          */
> -         if (key->tex.gather_channel_quirk_mask & (1<<sampler))
> -            return 2;
> -         return 1;
> -      case SWIZZLE_Z: return 2;
> -      case SWIZZLE_W: return 3;
> -      default:
> -         unreachable("Not reached"); /* zero, one swizzles handled already */
> -   }
> -}
> -
> -void
> -vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
> -{
> -   int s = key->tex.swizzles[sampler];
> -
> -   this->result = src_reg(this, ir->type);
> -   dst_reg swizzled_result(this->result);
> -
> -   if (ir->op == ir_query_levels) {
> -      /* # levels is in .w */
> -      orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
> -      emit(MOV(swizzled_result, orig_val));
> -      return;
> -   }
> -
> -   if (ir->op == ir_txs || ir->type == glsl_type::float_type
> -                       || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
> -      emit(MOV(swizzled_result, orig_val));
> -      return;
> -   }
> -
> -
> -   int zero_mask = 0, one_mask = 0, copy_mask = 0;
> -   int swizzle[4] = {0};
> -
> -   for (int i = 0; i < 4; i++) {
> -      switch (GET_SWZ(s, i)) {
> -      case SWIZZLE_ZERO:
> -        zero_mask |= (1 << i);
> -        break;
> -      case SWIZZLE_ONE:
> -        one_mask |= (1 << i);
> -        break;
> -      default:
> -        copy_mask |= (1 << i);
> -        swizzle[i] = GET_SWZ(s, i);
> -        break;
> -      }
> -   }
> -
> -   if (copy_mask) {
> -      orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
> -      swizzled_result.writemask = copy_mask;
> -      emit(MOV(swizzled_result, orig_val));
> -   }
> -
> -   if (zero_mask) {
> -      swizzled_result.writemask = zero_mask;
> -      emit(MOV(swizzled_result, src_reg(0.0f)));
> -   }
> -
> -   if (one_mask) {
> -      swizzled_result.writemask = one_mask;
> -      emit(MOV(swizzled_result, src_reg(1.0f)));
> -   }
> -}
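> -
> -/* A GL texture swizzle of (R, ZERO, ZERO, ONE), for instance, splits into
> - * copy_mask .x, zero_mask .yz and one_mask .w, so the swizzled result is
> - * assembled with at most three MOVs.
> - */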
> -
> -void
> -vec4_visitor::visit(ir_return *)
> -{
> -   unreachable("not reached");
> -}
> -
> -void
> -vec4_visitor::visit(ir_discard *)
> -{
> -   unreachable("not reached");
> -}
> -
> -void
> -vec4_visitor::visit(ir_if *ir)
> -{
> -   /* Don't point the annotation at the if statement, because then it plus
> -    * the then and else blocks get printed.
> -    */
> -   this->base_ir = ir->condition;
> -
> -   if (brw->gen == 6) {
> -      emit_if_gen6(ir);
> -   } else {
> -      enum brw_predicate predicate;
> -      emit_bool_to_cond_code(ir->condition, &predicate);
> -      emit(IF(predicate));
> -   }
> -
> -   visit_instructions(&ir->then_instructions);
> -
> -   if (!ir->else_instructions.is_empty()) {
> -      this->base_ir = ir->condition;
> -      emit(BRW_OPCODE_ELSE);
> -
> -      visit_instructions(&ir->else_instructions);
> -   }
> -
> -   this->base_ir = ir->condition;
> -   emit(BRW_OPCODE_ENDIF);
> -}
> -
> -void
> -vec4_visitor::visit(ir_emit_vertex *)
> -{
> -   unreachable("not reached");
> -}
> -
> -void
> -vec4_visitor::visit(ir_end_primitive *)
> -{
> -   unreachable("not reached");
> -}
> -
> -void
> -vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
> -                                  dst_reg dst, src_reg offset,
> -                                  src_reg src0, src_reg src1)
> -{
> -   unsigned mlen = 0;
> -
> -   /* Set the atomic operation offset. */
> -   emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
> -   mlen++;
> -
> -   /* Set the atomic operation arguments. */
> -   if (src0.file != BAD_FILE) {
> -      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
> -      mlen++;
> -   }
> -
> -   if (src1.file != BAD_FILE) {
> -      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
> -      mlen++;
> -   }
> -
> -   /* Emit the instruction.  Note that this maps to the normal SIMD8
> -    * untyped atomic message on Ivy Bridge, but that's OK because
> -    * unused channels will be masked out.
> -    */
> -   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
> -                                 src_reg(atomic_op), src_reg(surf_index));
> -   inst->base_mrf = 0;
> -   inst->mlen = mlen;
> -}
> -
> -void
> -vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
> -                                        src_reg offset)
> -{
> -   /* Set the surface read offset. */
> -   emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
> -
> -   /* Emit the instruction.  Note that this maps to the normal SIMD8
> -    * untyped surface read message, but that's OK because unused
> -    * channels will be masked out.
> -    */
> -   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
> -                                 dst, src_reg(surf_index));
> -   inst->base_mrf = 0;
> -   inst->mlen = 1;
> -}
> -
> -void
> -vec4_visitor::emit_ndc_computation()
> -{
> -   /* Get the position */
> -   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
> -
> -   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
> -   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
> -   output_reg[BRW_VARYING_SLOT_NDC] = ndc;
> -
> -   current_annotation = "NDC";
> -   dst_reg ndc_w = ndc;
> -   ndc_w.writemask = WRITEMASK_W;
> -   src_reg pos_w = pos;
> -   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
> -   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
> -
> -   dst_reg ndc_xyz = ndc;
> -   ndc_xyz.writemask = WRITEMASK_XYZ;
> -
> -   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
> -}
> -
> -void
> -vec4_visitor::emit_psiz_and_flags(dst_reg reg)
> -{
> -   if (brw->gen < 6 &&
> -       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
> -        key->userclip_active || brw->has_negative_rhw_bug)) {
> -      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
> -      dst_reg header1_w = header1;
> -      header1_w.writemask = WRITEMASK_W;
> -
> -      emit(MOV(header1, 0u));
> -
> -      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
> -        src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
> -
> -        current_annotation = "Point size";
> -        emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
> -        emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
> -      }
> -
> -      if (key->userclip_active) {
> -         current_annotation = "Clipping flags";
> -         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
> -         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
> -
> -         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
> -         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
> -         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
> -
> -         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
> -         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
> -         emit(SHL(flags1, src_reg(flags1), src_reg(4)));
> -         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
> -      }
> -
> -      /* i965 clipping workaround:
> -       * 1) Test for -ve rhw
> -       * 2) If set,
> -       *      set ndc = (0,0,0,0)
> -       *      set ucp[6] = 1
> -       *
> -       * Later, clipping will detect ucp[6] and ensure the primitive is
> -       * clipped against all fixed planes.
> -       */
> -      if (brw->has_negative_rhw_bug) {
> -         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
> -         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
> -         emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
> -         vec4_instruction *inst;
> -         inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
> -         inst->predicate = BRW_PREDICATE_NORMAL;
> -         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
> -         inst->predicate = BRW_PREDICATE_NORMAL;
> -      }
> -
> -      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
> -   } else if (brw->gen < 6) {
> -      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
> -   } else {
> -      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
> -      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
> -         dst_reg reg_w = reg;
> -         reg_w.writemask = WRITEMASK_W;
> -         emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
> -      }
> -      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
> -         dst_reg reg_y = reg;
> -         reg_y.writemask = WRITEMASK_Y;
> -         reg_y.type = BRW_REGISTER_TYPE_D;
> -         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
> -      }
> -      if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
> -         dst_reg reg_z = reg;
> -         reg_z.writemask = WRITEMASK_Z;
> -         reg_z.type = BRW_REGISTER_TYPE_D;
> -         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
> -      }
> -   }
> -}
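> -
> -/* On pre-gen6 the point size ends up in the header word as fixed point:
> - * the MUL by 2^11 scales the float and the AND keeps an 11-bit field at
> - * bits 8..18, sharing the word with the clip and negative-rhw flags.
> - */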
> -
> -void
> -vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
> -{
> -   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
> -    *
> -    *     "If a linked set of shaders forming the vertex stage contains no
> -    *     static write to gl_ClipVertex or gl_ClipDistance, but the
> -    *     application has requested clipping against user clip planes through
> -    *     the API, then the coordinate written to gl_Position is used for
> -    *     comparison against the user clip planes."
> -    *
> -    * This function is only called if the shader didn't write to
> -    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
> -    * if the user wrote to it; otherwise we use gl_Position.
> -    */
> -   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
> -   if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
> -      clip_vertex = VARYING_SLOT_POS;
> -   }
> -
> -   for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
> -        ++i) {
> -      reg.writemask = 1 << i;
> -      emit(DP4(reg,
> -               src_reg(output_reg[clip_vertex]),
> -               src_reg(this->userplane[i + offset])));
> -   }
> -}
> -
> -vec4_instruction *
> -vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
> -{
> -   assert (varying < VARYING_SLOT_MAX);
> -   reg.type = output_reg[varying].type;
> -   current_annotation = output_reg_annotation[varying];
> -   /* Copy the register, saturating if necessary */
> -   return emit(MOV(reg, src_reg(output_reg[varying])));
> -}
> -
> -void
> -vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
> -{
> -   reg.type = BRW_REGISTER_TYPE_F;
> -
> -   switch (varying) {
> -   case VARYING_SLOT_PSIZ:
> -   {
> -      /* PSIZ is always in slot 0, and is coupled with other flags. */
> -      current_annotation = "indices, point width, clip flags";
> -      emit_psiz_and_flags(reg);
> -      break;
> -   }
> -   case BRW_VARYING_SLOT_NDC:
> -      current_annotation = "NDC";
> -      emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
> -      break;
> -   case VARYING_SLOT_POS:
> -      current_annotation = "gl_Position";
> -      emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
> -      break;
> -   case VARYING_SLOT_EDGE:
> -      /* This is present when doing unfilled polygons.  We're supposed to copy
> -       * the edge flag from the user-provided vertex array
> -       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
> -       * of that attribute (starts as 1.0f).  This is then used in clipping to
> -       * determine which edges should be drawn as wireframe.
> -       */
> -      current_annotation = "edge flag";
> -      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
> -                                    glsl_type::float_type, WRITEMASK_XYZW))));
> -      break;
> -   case BRW_VARYING_SLOT_PAD:
> -      /* No need to write to this slot */
> -      break;
> -   case VARYING_SLOT_COL0:
> -   case VARYING_SLOT_COL1:
> -   case VARYING_SLOT_BFC0:
> -   case VARYING_SLOT_BFC1: {
> -      /* These built-in varyings are only supported in compatibility mode,
> -       * and we only support GS in core profile.  So, this must be a vertex
> -       * shader.
> -       */
> -      assert(stage == MESA_SHADER_VERTEX);
> -      vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
> -      if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
> -         inst->saturate = true;
> -      break;
> -   }
> -
> -   default:
> -      emit_generic_urb_slot(reg, varying);
> -      break;
> -   }
> -}
> -
> -static int
> -align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
> -{
> -   if (brw->gen >= 6) {
> -      /* URB data written (does not include the message header reg) must
> -       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
> -       * section 5.4.3.2.2: URB_INTERLEAVED.
> -       *
> -       * URB entries are allocated on a multiple of 1024 bits, so an
> -       * extra 128 bits written here to make the end align to 256 is
> -       * no problem.
> -       */
> -      if ((mlen % 2) != 1)
> -        mlen++;
> -   }
> -
> -   return mlen;
> -}
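> -
> -/* Since mlen includes the one-register message header, gen6+ pads it to
> - * an odd value: e.g. a header plus three data registers (mlen 4) becomes
> - * mlen 5, keeping the data portion a multiple of two registers.
> - */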
> -
> -
> -/**
> - * Generates the VUE payload plus the necessary URB write instructions to
> - * output it.
> - *
> - * The VUE layout is documented in Volume 2a.
> - */
> -void
> -vec4_visitor::emit_vertex()
> -{
> -   /* MRF 0 is reserved for the debugger, so start with message header
> -    * in MRF 1.
> -    */
> -   int base_mrf = 1;
> -   int mrf = base_mrf;
> -   /* In the process of generating our URB write message contents, we
> -    * may need to unspill a register or load from an array.  Those
> -    * reads would use MRFs 14-15.
> -    */
> -   int max_usable_mrf = 13;
> -
> -   /* The following assertion verifies that max_usable_mrf causes an
> -    * even-numbered amount of URB write data, which will meet gen6's
> -    * requirements for length alignment.
> -    */
> -   assert ((max_usable_mrf - base_mrf) % 2 == 0);
> -
> -   /* First mrf is the g0-based message header containing URB handles and
> -    * such.
> -    */
> -   emit_urb_write_header(mrf++);
> -
> -   if (brw->gen < 6) {
> -      emit_ndc_computation();
> -   }
> -
> -   /* Lower legacy ff and ClipVertex clipping to clip distances */
> -   if (key->userclip_active && !prog->UsesClipDistanceOut) {
> -      current_annotation = "user clip distances";
> -
> -      output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
> -      output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
> -
> -      emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
> -      emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
> -   }
> -
> -   /* We may need to split this up into several URB writes, so do them in a
> -    * loop.
> -    */
> -   int slot = 0;
> -   bool complete = false;
> -   do {
> -      /* URB offset is in URB row increments, and each of our MRFs is half of
> -       * one of those, since we're doing interleaved writes.
> -       */
> -      int offset = slot / 2;
> -
> -      mrf = base_mrf + 1;
> -      for (; slot < prog_data->vue_map.num_slots; ++slot) {
> -         emit_urb_slot(dst_reg(MRF, mrf++),
> -                       prog_data->vue_map.slot_to_varying[slot]);
> -
> -         /* If this was max_usable_mrf, we can't fit anything more into this
> -          * URB WRITE.
> -          */
> -         if (mrf > max_usable_mrf) {
> -            slot++;
> -            break;
> -         }
> -      }
> -
> -      complete = slot >= prog_data->vue_map.num_slots;
> -      current_annotation = "URB write";
> -      vec4_instruction *inst = emit_urb_write_opcode(complete);
> -      inst->base_mrf = base_mrf;
> -      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
> -      inst->offset += offset;
> -   } while(!complete);
> -}
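> -
> -/* Each iteration of the loop above fills MRFs 2..13 with one VUE slot
> - * each and issues a URB write; if the VUE doesn't fit, the next write
> - * resumes at URB row slot / 2, since interleaved rows hold two slots.
> - */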
> -
> -
> -src_reg
> -vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
> -                                src_reg *reladdr, int reg_offset)
> -{
> -   /* Because we store the values to scratch interleaved like our
> -    * vertex data, we need to scale the vec4 index by 2.
> -    */
> -   int message_header_scale = 2;
> -
> -   /* Pre-gen6, the message header uses byte offsets instead of vec4
> -    * (16-byte) offset units.
> -    */
> -   if (brw->gen < 6)
> -      message_header_scale *= 16;
> -
> -   if (reladdr) {
> -      src_reg index = src_reg(this, glsl_type::int_type);
> -
> -      emit_before(block, inst, ADD(dst_reg(index), *reladdr,
> -                                   src_reg(reg_offset)));
> -      emit_before(block, inst, MUL(dst_reg(index), index,
> -                                   src_reg(message_header_scale)));
> -
> -      return index;
> -   } else {
> -      return src_reg(reg_offset * message_header_scale);
> -   }
> -}
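> -
> -/* E.g. vec4 index 3 becomes offset 6 on gen6+ (two interleaved vec4s per
> - * scratch row) and byte offset 96 (3 * 32) on earlier generations.
> - */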
> -
> -src_reg
> -vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
> -                                      src_reg *reladdr, int reg_offset)
> -{
> -   if (reladdr) {
> -      src_reg index = src_reg(this, glsl_type::int_type);
> -
> -      emit_before(block, inst, ADD(dst_reg(index), *reladdr,
> -                                   src_reg(reg_offset)));
> -
> -      /* Pre-gen6, the message header uses byte offsets instead of vec4
> -       * (16-byte) offset units.
> -       */
> -      if (brw->gen < 6) {
> -         emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
> -      }
> -
> -      return index;
> -   } else if (brw->gen >= 8) {
> -      /* Store the offset in a GRF so we can send-from-GRF. */
> -      src_reg offset = src_reg(this, glsl_type::int_type);
> -      emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
> -      return offset;
> -   } else {
> -      int message_header_scale = brw->gen < 6 ? 16 : 1;
> -      return src_reg(reg_offset * message_header_scale);
> -   }
> -}
> -
> -/**
> - * Emits an instruction before @inst to load the value named by @orig_src
> - * from scratch space at @base_offset to @temp.
> - *
> - * @base_offset is measured in 32-byte units (the size of a register).
> - */
> -void
> -vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
> -                               dst_reg temp, src_reg orig_src,
> -                               int base_offset)
> -{
> -   int reg_offset = base_offset + orig_src.reg_offset;
> -   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
> -                                      reg_offset);
> -
> -   emit_before(block, inst, SCRATCH_READ(temp, index));
> -}
> -
> -/**
> - * Emits an instruction after @inst to store the value to be written
> - * to @orig_dst to scratch space at @base_offset, from @temp.
> - *
> - * @base_offset is measured in 32-byte units (the size of a register).
> - */
> -void
> -vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
> -                                 int base_offset)
> -{
> -   int reg_offset = base_offset + inst->dst.reg_offset;
> -   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
> -                                      reg_offset);
> -
> -   /* Create a temporary register to store *inst's result in.
> -    *
> -    * We have to be careful in MOVing from our temporary result register in
> -    * the scratch write.  If we swizzle from channels of the temporary that
> -    * weren't initialized, it will confuse live interval analysis, which will
> -    * make spilling fail to make progress.
> -    */
> -   const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
> -                                       inst->dst.type),
> -                                brw_swizzle_for_mask(inst->dst.writemask));
> -   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
> -                                      inst->dst.writemask));
> -   vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
> -   write->predicate = inst->predicate;
> -   write->ir = inst->ir;
> -   write->annotation = inst->annotation;
> -   inst->insert_after(block, write);
> -
> -   inst->dst.file = temp.file;
> -   inst->dst.reg = temp.reg;
> -   inst->dst.reg_offset = temp.reg_offset;
> -   inst->dst.reladdr = NULL;
> -}
> -
> -/**
> - * We can't generally support array access in GRF space, because a
> - * single instruction's destination can only span 2 contiguous
> - * registers.  So, we send all GRF arrays that get variable index
> - * access to scratch space.
> - */
> -void
> -vec4_visitor::move_grf_array_access_to_scratch()
> -{
> -   int scratch_loc[this->alloc.count];
> -   memset(scratch_loc, -1, sizeof(scratch_loc));
> -
> -   /* First, calculate the set of virtual GRFs that need to be punted
> -    * to scratch due to having any array access on them, and where in
> -    * scratch.
> -    */
> -   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
> -      if (inst->dst.file == GRF && inst->dst.reladdr &&
> -         scratch_loc[inst->dst.reg] == -1) {
> -        scratch_loc[inst->dst.reg] = c->last_scratch;
> -        c->last_scratch += this->alloc.sizes[inst->dst.reg];
> -      }
> -
> -      for (int i = 0 ; i < 3; i++) {
> -        src_reg *src = &inst->src[i];
> -
> -        if (src->file == GRF && src->reladdr &&
> -            scratch_loc[src->reg] == -1) {
> -           scratch_loc[src->reg] = c->last_scratch;
> -           c->last_scratch += this->alloc.sizes[src->reg];
> -        }
> -      }
> -   }
> -
> -   /* Now, for anything that will be accessed through scratch, rewrite
> -    * it to load/store.  Note that this is a _safe list walk, because
> -    * we may generate a new scratch_write instruction after the one
> -    * we're processing.
> -    */
> -   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
> -      /* Set up the annotation tracking for new generated instructions. */
> -      base_ir = inst->ir;
> -      current_annotation = inst->annotation;
> -
> -      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
> -        emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
> -      }
> -
> -      for (int i = 0 ; i < 3; i++) {
> -        if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
> -           continue;
> -
> -        dst_reg temp = dst_reg(this, glsl_type::vec4_type);
> -
> -        emit_scratch_read(block, inst, temp, inst->src[i],
> -                          scratch_loc[inst->src[i].reg]);
> -
> -        inst->src[i].file = temp.file;
> -        inst->src[i].reg = temp.reg;
> -        inst->src[i].reg_offset = temp.reg_offset;
> -        inst->src[i].reladdr = NULL;
> -      }
> -   }
> -}
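> -
> -/* After this pass no reladdr access touches the GRF directly: writes get
> - * a SCRATCH_WRITE appended after the defining instruction, and reads get
> - * a SCRATCH_READ into a fresh temporary inserted before the use.
> - */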
> -
> -/**
> - * Emits an instruction before @inst to load the value named by @orig_src
> - * from the pull constant buffer (surface) at @base_offset to @temp.
> - */
> -void
> -vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
> -                                     dst_reg temp, src_reg orig_src,
> -                                     int base_offset)
> -{
> -   int reg_offset = base_offset + orig_src.reg_offset;
> -   src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
> -   src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
> -                                             reg_offset);
> -   vec4_instruction *load;
> -
> -   if (brw->gen >= 7) {
> -      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
> -
> -      /* We have to use a message header on Skylake to get SIMD4x2 mode.
> -       * Reserve space for the register.
> -       */
> -      if (brw->gen >= 9) {
> -         grf_offset.reg_offset++;
> -         alloc.sizes[grf_offset.reg] = 2;
> -      }
> -
> -      grf_offset.type = offset.type;
> -      emit_before(block, inst, MOV(grf_offset, offset));
> -
> -      load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
> -                                           temp, index, src_reg(grf_offset));
> -      load->mlen = 1;
> -   } else {
> -      load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
> -                                           temp, index, offset);
> -      load->base_mrf = 14;
> -      load->mlen = 1;
> -   }
> -   emit_before(block, inst, load);
> -}
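
Since the gen split above is easy to misread: Gen7+ passes the surface
offset in a GRF source of the SIMD4x2 message (with Gen9 additionally
reserving a second register on that temporary for a message header), while
pre-Gen7 parts route the offset through a fixed MRF.  A hedged summary in
code, with an illustrative struct rather than the driver's types:

    /* Hypothetical summary of the cases handled above; the base MRF of 14
     * mirrors the hunk, everything else is an assumption for clarity. */
    struct pull_load_plan {
       bool offset_in_grf;   /* Gen7+: VS_OPCODE_PULL_CONSTANT_LOAD_GEN7 */
       bool needs_header;    /* Gen9+: SIMD4x2 needs a message header    */
       int  base_mrf;        /* pre-Gen7 only: fixed message register    */
    };

    static pull_load_plan plan_pull_load(int gen)
    {
       pull_load_plan p{};
       p.offset_in_grf = gen >= 7;
       p.needs_header  = gen >= 9;
       p.base_mrf      = p.offset_in_grf ? -1 : 14;
       return p;
    }
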
> -
> -/**
> - * Implements array access of uniforms by inserting a
> - * PULL_CONSTANT_LOAD instruction.
> - *
> - * Unlike temporary GRF array access (where we don't support it due to
> - * the difficulty of doing relative addressing on instruction
> - * destinations), we could potentially do array access of uniforms
> - * that were loaded in GRF space as push constants.  In real-world
> - * usage we've seen, though, the arrays being used are always larger
> - * than we could load as push constants, so just always move all
> - * uniform array access out to a pull constant buffer.
> - */
> -void
> -vec4_visitor::move_uniform_array_access_to_pull_constants()
> -{
> -   int pull_constant_loc[this->uniforms];
> -   memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
> -   bool nested_reladdr;
> -
> -   /* Walk through and find array access of uniforms.  Put a copy of that
> -    * uniform in the pull constant buffer.
> -    *
> -    * Note that we don't move constant-indexed accesses to arrays.  No
> -    * testing has been done of the performance impact of this choice.
> -    */
> -   do {
> -      nested_reladdr = false;
> -
> -      foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
> -         for (int i = 0 ; i < 3; i++) {
> -            if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
> -               continue;
> -
> -            int uniform = inst->src[i].reg;
> -
> -            if (inst->src[i].reladdr->reladdr)
> -               nested_reladdr = true;  /* will need another pass */
> -
> -            /* If this array isn't already present in the pull constant buffer,
> -             * add it.
> -             */
> -            if (pull_constant_loc[uniform] == -1) {
> -               const gl_constant_value **values =
> -                  &stage_prog_data->param[uniform * 4];
> -
> -               pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
> -
> -               assert(uniform < uniform_array_size);
> -               for (int j = 0; j < uniform_size[uniform] * 4; j++) {
> -                  stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
> -                     = values[j];
> -               }
> -            }
> -
> -            /* Set up the annotation tracking for new generated instructions. */
> -            base_ir = inst->ir;
> -            current_annotation = inst->annotation;
> -
> -            dst_reg temp = dst_reg(this, glsl_type::vec4_type);
> -
> -            emit_pull_constant_load(block, inst, temp, inst->src[i],
> -                                    pull_constant_loc[uniform]);
> -
> -            inst->src[i].file = temp.file;
> -            inst->src[i].reg = temp.reg;
> -            inst->src[i].reg_offset = temp.reg_offset;
> -            inst->src[i].reladdr = NULL;
> -         }
> -      }
> -   } while (nested_reladdr);
> -
> -   /* Now there are no accesses of the UNIFORM file with a reladdr, so
> -    * no need to track them as larger-than-vec4 objects.  This will be
> -    * relied on in cutting out unused uniform vectors from push
> -    * constants.
> -    */
> -   split_uniform_registers();
> -}
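
Worth calling out: the do/while above is a fixed-point iteration.  A
reladdr may itself be indexed by another reladdr, and rewriting the outer
access can expose the inner one, so the walk repeats until a pass completes
without seeing nested indirection.  A simplified standalone sketch of just
that control flow (hypothetical types; the real pass also uploads the array
to the pull buffer and emits the load):

    #include <vector>

    struct Reladdr { Reladdr *reladdr; };
    struct Use     { bool is_uniform; Reladdr *reladdr; };

    void lower_uniform_indirection(std::vector<Use> &uses)
    {
       bool nested_reladdr;
       do {
          nested_reladdr = false;
          for (Use &u : uses) {
             if (!u.is_uniform || !u.reladdr)
                continue;
             if (u.reladdr->reladdr)
                nested_reladdr = true;   /* will need another pass */
             /* ...emit the pull-constant load, then drop the indirection: */
             u.reladdr = nullptr;
          }
       } while (nested_reladdr);
    }
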
> -
> -void
> -vec4_visitor::resolve_ud_negate(src_reg *reg)
> -{
> -   if (reg->type != BRW_REGISTER_TYPE_UD ||
> -       !reg->negate)
> -      return;
> -
> -   src_reg temp = src_reg(this, glsl_type::uvec4_type);
> -   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
> -   *reg = temp;
> -}
> -
> -/**
> - * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
> - *
> - * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
> - * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
> - */
> -void
> -vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
> -{
> -   assert(brw->gen <= 5);
> -
> -   if (!rvalue->type->is_boolean())
> -      return;
> -
> -   src_reg and_result = src_reg(this, rvalue->type);
> -   src_reg neg_result = src_reg(this, rvalue->type);
> -   emit(AND(dst_reg(and_result), *reg, src_reg(1)));
> -   emit(MOV(dst_reg(neg_result), negate(and_result)));
> -   *reg = neg_result;
> -}
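
The AND/MOV pair above is compact enough to deserve a worked example.  On
one channel, in plain C++ (unsigned negation is well defined and spreads
the bit across the word):

    #include <cassert>
    #include <cstdint>

    /* CMP on Gen4-5 only guarantees the LSB, so mask it off and negate:
     * 0 stays 0, 1 becomes 0xffffffff (~0), giving a proper boolean. */
    static uint32_t resolve_bool(uint32_t cmp_result)
    {
       uint32_t and_result = cmp_result & 1;  /* AND(and_result, reg, 1)     */
       return -and_result;                    /* MOV(neg_result, -and_result) */
    }

    int main()
    {
       assert(resolve_bool(0xdeadbee0u) == 0u);          /* LSB clear */
       assert(resolve_bool(0xdeadbee1u) == 0xffffffffu); /* LSB set   */
    }
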
> -
> -vec4_visitor::vec4_visitor(struct brw_context *brw,
> -                           struct brw_vec4_compile *c,
> -                           struct gl_program *prog,
> -                           const struct brw_vue_prog_key *key,
> -                           struct brw_vue_prog_data *prog_data,
> -                          struct gl_shader_program *shader_prog,
> -                           gl_shader_stage stage,
> -                          void *mem_ctx,
> -                           bool no_spills,
> -                           shader_time_shader_type st_base,
> -                           shader_time_shader_type st_written,
> -                           shader_time_shader_type st_reset)
> -   : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
> -     c(c),
> -     key(key),
> -     prog_data(prog_data),
> -     sanity_param_count(0),
> -     fail_msg(NULL),
> -     first_non_payload_grf(0),
> -     need_all_constants_in_pull_buffer(false),
> -     no_spills(no_spills),
> -     st_base(st_base),
> -     st_written(st_written),
> -     st_reset(st_reset)
> -{
> -   this->mem_ctx = mem_ctx;
> -   this->failed = false;
> -
> -   this->base_ir = NULL;
> -   this->current_annotation = NULL;
> -   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
> -
> -   this->variable_ht = hash_table_ctor(0,
> -                                      hash_table_pointer_hash,
> -                                      hash_table_pointer_compare);
> -
> -   this->virtual_grf_start = NULL;
> -   this->virtual_grf_end = NULL;
> -   this->live_intervals = NULL;
> -
> -   this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
> -
> -   this->uniforms = 0;
> -
> -   /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
> -    * at least one. See setup_uniforms() in brw_vec4.cpp.
> -    */
> -   this->uniform_array_size = 1;
> -   if (prog_data) {
> -      this->uniform_array_size =
> -         MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
> -   }
> -
> -   this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
> -   this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
> -}
> -
> -vec4_visitor::~vec4_visitor()
> -{
> -   hash_table_dtor(this->variable_ht);
> -}
> -
> -
> -void
> -vec4_visitor::fail(const char *format, ...)
> -{
> -   va_list va;
> -   char *msg;
> -
> -   if (failed)
> -      return;
> -
> -   failed = true;
> -
> -   va_start(va, format);
> -   msg = ralloc_vasprintf(mem_ctx, format, va);
> -   va_end(va);
> -   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
> -
> -   this->fail_msg = msg;
> -
> -   if (debug_enabled) {
> -      fprintf(stderr, "%s",  msg);
> -   }
> -}
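
The fail() idiom above is worth noting: only the first failure is latched,
and compilation continues in a no-op state so the caller can report one
coherent error.  A generic reimplementation of the pattern (plain std::string
instead of the driver's ralloc, so it is a sketch, not the real thing):

    #include <cstdarg>
    #include <cstdio>
    #include <string>

    struct compiler {
       bool failed = false;
       std::string fail_msg;

       void fail(const char *fmt, ...)
       {
          if (failed)          /* keep the first message, ignore the rest */
             return;
          failed = true;

          char buf[256];
          va_list va;
          va_start(va, fmt);
          vsnprintf(buf, sizeof(buf), fmt, va);
          va_end(va);
          fail_msg = std::string("compile failed: ") + buf;
       }
    };
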
> -
> -} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
> index c3b0233..c60e532 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
> @@ -37,7 +37,7 @@ extern "C" {
>  using namespace brw;
>
>  void
> -vec4_visitor::emit_vp_sop(enum brw_conditional_mod conditional_mod,
> +vec4_god::emit_vp_sop(enum brw_conditional_mod conditional_mod,
>                            dst_reg dst, src_reg src0, src_reg src1,
>                            src_reg one)
>  {
> @@ -50,7 +50,7 @@ vec4_visitor::emit_vp_sop(enum brw_conditional_mod conditional_mod,
>  }
>
>  void
> -vec4_vs_visitor::emit_program_code()
> +vec4_vs_god::emit_program_code()
>  {
>     this->need_all_constants_in_pull_buffer = false;
>
> @@ -407,7 +407,7 @@ vec4_vs_visitor::emit_program_code()
>  }
>
>  void
> -vec4_vs_visitor::setup_vp_regs()
> +vec4_vs_god::setup_vp_regs()
>  {
>     /* PROGRAM_TEMPORARY */
>     int num_temp = prog->NumTemporaries;
> @@ -452,7 +452,7 @@ vec4_vs_visitor::setup_vp_regs()
>  }
>
>  dst_reg
> -vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register &dst)
> +vec4_vs_god::get_vp_dst_reg(const prog_dst_register &dst)
>  {
>     dst_reg result;
>
> @@ -485,7 +485,7 @@ vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register &dst)
>  }
>
>  src_reg
> -vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src)
> +vec4_vs_god::get_vp_src_reg(const prog_src_register &src)
>  {
>     struct gl_program_parameter_list *plist =
>        vs_compile->vp->program.Base.Parameters;
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_god.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_god.cpp
> new file mode 100644
> index 0000000..0b69409
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_god.cpp
> @@ -0,0 +1,231 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +
> +#include "brw_vs.h"
> +#include "main/context.h"
> +
> +
> +namespace brw {
> +
> +void
> +vec4_vs_god::emit_prolog()
> +{
> +   dst_reg sign_recovery_shift;
> +   dst_reg normalize_factor;
> +   dst_reg es3_normalize_factor;
> +
> +   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
> +      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
> +         uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
> +         dst_reg reg(ATTR, i);
> +         dst_reg reg_d = reg;
> +         reg_d.type = BRW_REGISTER_TYPE_D;
> +         dst_reg reg_ud = reg;
> +         reg_ud.type = BRW_REGISTER_TYPE_UD;
> +
> +         /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
> +          * come in as floating point conversions of the integer values.
> +          */
> +         if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
> +            dst_reg dst = reg;
> +            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> +            dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
> +            emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
> +         }
> +
> +         /* Do sign recovery for 2101010 formats if required. */
> +         if (wa_flags & BRW_ATTRIB_WA_SIGN) {
> +            if (sign_recovery_shift.file == BAD_FILE) {
> +               /* shift constant: <22,22,22,30> */
> +               sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
> +               emit(MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
> +               emit(MOV(writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
> +            }
> +
> +            emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
> +            emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
> +         }
> +
> +         /* Apply BGRA swizzle if required. */
> +         if (wa_flags & BRW_ATTRIB_WA_BGRA) {
> +            src_reg temp = src_reg(reg);
> +            temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
> +            emit(MOV(reg, temp));
> +         }
> +
> +         if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
> +            /* ES 3.0 has different rules for converting signed normalized
> +             * fixed-point numbers than desktop GL.
> +             */
> +            if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
> +               /* According to equation 2.2 of the ES 3.0 specification,
> +                * signed normalization conversion is done by:
> +                *
> +                * f = c / (2^(b-1)-1)
> +                */
> +               if (es3_normalize_factor.file == BAD_FILE) {
> +                  /* mul constant: 1 / (2^(b-1) - 1) */
> +                  es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
> +                  emit(MOV(writemask(es3_normalize_factor, WRITEMASK_XYZ),
> +                           src_reg(1.0f / ((1<<9) - 1))));
> +                  emit(MOV(writemask(es3_normalize_factor, WRITEMASK_W),
> +                           src_reg(1.0f / ((1<<1) - 1))));
> +               }
> +
> +               dst_reg dst = reg;
> +               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> +               emit(MOV(dst, src_reg(reg_d)));
> +               emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
> +               emit_minmax(BRW_CONDITIONAL_GE, dst, src_reg(dst), src_reg(-1.0f));
> +            } else {
> +               /* The following equations are from the OpenGL 3.2 specification:
> +                *
> +                * 2.1 unsigned normalization
> +                * f = c/(2^n-1)
> +                *
> +                * 2.2 signed normalization
> +                * f = (2c+1)/(2^n-1)
> +                *
> +                * Both of these share a common divisor, which is represented by
> +                * "normalize_factor" in the code below.
> +                */
> +               if (normalize_factor.file == BAD_FILE) {
> +                  /* 1 / (2^b - 1) for b=<10,10,10,2> */
> +                  normalize_factor = dst_reg(this, glsl_type::vec4_type);
> +                  emit(MOV(writemask(normalize_factor, WRITEMASK_XYZ),
> +                           src_reg(1.0f / ((1<<10) - 1))));
> +                  emit(MOV(writemask(normalize_factor, WRITEMASK_W),
> +                           src_reg(1.0f / ((1<<2) - 1))));
> +               }
> +
> +               dst_reg dst = reg;
> +               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> +               emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
> +
> +               /* For signed normalization, we want the numerator to be 2c+1. */
> +               if (wa_flags & BRW_ATTRIB_WA_SIGN) {
> +                  emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
> +                  emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
> +               }
> +
> +               emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
> +            }
> +         }
> +
> +         if (wa_flags & BRW_ATTRIB_WA_SCALE) {
> +            dst_reg dst = reg;
> +            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> +            emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
> +         }
> +      }
> +   }
> +}
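
The SHL/ASR trick in the sign-recovery block above is the classic
left-shift-then-arithmetic-right-shift sign extension.  On the CPU, for the
10-bit X channel of a 2_10_10_10 attribute (shift 22; the 2-bit W channel
uses 30, matching the <22,22,22,30> constant), assuming the usual
arithmetic right shift on signed values:

    #include <cassert>
    #include <cstdint>

    static int32_t sign_extend_10bit(uint32_t raw10)
    {
       uint32_t shifted = raw10 << 22;      /* SHL: field to the top   */
       return (int32_t) shifted >> 22;      /* ASR: drag the sign down */
    }

    int main()
    {
       assert(sign_extend_10bit(0x1ffu) == 511);   /* max positive value */
       assert(sign_extend_10bit(0x200u) == -512);  /* sign bit set       */
       assert(sign_extend_10bit(0x3ffu) == -1);
    }
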
> +
> +
> +dst_reg *
> +vec4_vs_god::make_reg_for_system_value(ir_variable *ir)
> +{
> +   /* VertexID is stored by the VF as the last vertex element, but
> +    * we don't represent it with a flag in inputs_read, so we call
> +    * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
> +    */
> +   dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
> +
> +   switch (ir->data.location) {
> +   case SYSTEM_VALUE_BASE_VERTEX:
> +      reg->writemask = WRITEMASK_X;
> +      vs_prog_data->uses_vertexid = true;
> +      break;
> +   case SYSTEM_VALUE_VERTEX_ID:
> +   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> +      reg->writemask = WRITEMASK_Z;
> +      vs_prog_data->uses_vertexid = true;
> +      break;
> +   case SYSTEM_VALUE_INSTANCE_ID:
> +      reg->writemask = WRITEMASK_W;
> +      vs_prog_data->uses_instanceid = true;
> +      break;
> +   default:
> +      unreachable("not reached");
> +   }
> +
> +   return reg;
> +}
> +
> +
> +void
> +vec4_vs_god::emit_urb_write_header(int mrf)
> +{
> +   /* No need to do anything for VS; an implied write to this MRF will be
> +    * performed by VS_OPCODE_URB_WRITE.
> +    */
> +   (void) mrf;
> +}
> +
> +
> +vec4_instruction *
> +vec4_vs_god::emit_urb_write_opcode(bool complete)
> +{
> +   /* For VS, the URB writes end the thread. */
> +   if (complete) {
> +      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> +         emit_shader_time_end();
> +   }
> +
> +   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
> +   inst->urb_write_flags = complete ?
> +      BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
> +
> +   return inst;
> +}
> +
> +
> +void
> +vec4_vs_god::emit_thread_end()
> +{
> +   /* For VS, we always end the thread by emitting a single vertex.
> +    * emit_urb_write_opcode() will take care of setting the eot flag on the
> +    * SEND instruction.
> +    */
> +   emit_vertex();
> +}
> +
> +
> +vec4_vs_god::vec4_vs_god(struct brw_context *brw,
> +                                 struct brw_vs_compile *vs_compile,
> +                                 struct brw_vs_prog_data *vs_prog_data,
> +                                 struct gl_shader_program *prog,
> +                                 void *mem_ctx)
> +   : vec4_god(brw, &vs_compile->base, &vs_compile->vp->program.Base,
> +                  &vs_compile->key.base, &vs_prog_data->base, prog,
> +                  MESA_SHADER_VERTEX,
> +                  mem_ctx, false /* no_spills */,
> +                  ST_VS, ST_VS_WRITTEN, ST_VS_RESET),
> +     vs_compile(vs_compile),
> +     vs_prog_data(vs_prog_data)
> +{
> +}
> +
> +
> +} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
> deleted file mode 100644
> index 4baf73e..0000000
> --- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
> +++ /dev/null
> @@ -1,231 +0,0 @@
> -/*
> - * Copyright © 2013 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> - * DEALINGS IN THE SOFTWARE.
> - */
> -
> -
> -#include "brw_vs.h"
> -#include "main/context.h"
> -
> -
> -namespace brw {
> -
> -void
> -vec4_vs_visitor::emit_prolog()
> -{
> -   dst_reg sign_recovery_shift;
> -   dst_reg normalize_factor;
> -   dst_reg es3_normalize_factor;
> -
> -   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
> -      if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
> -         uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
> -         dst_reg reg(ATTR, i);
> -         dst_reg reg_d = reg;
> -         reg_d.type = BRW_REGISTER_TYPE_D;
> -         dst_reg reg_ud = reg;
> -         reg_ud.type = BRW_REGISTER_TYPE_UD;
> -
> -         /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
> -          * come in as floating point conversions of the integer values.
> -          */
> -         if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
> -            dst_reg dst = reg;
> -            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> -            dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
> -            emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
> -         }
> -
> -         /* Do sign recovery for 2101010 formats if required. */
> -         if (wa_flags & BRW_ATTRIB_WA_SIGN) {
> -            if (sign_recovery_shift.file == BAD_FILE) {
> -               /* shift constant: <22,22,22,30> */
> -               sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
> -               emit(MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
> -               emit(MOV(writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
> -            }
> -
> -            emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
> -            emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
> -         }
> -
> -         /* Apply BGRA swizzle if required. */
> -         if (wa_flags & BRW_ATTRIB_WA_BGRA) {
> -            src_reg temp = src_reg(reg);
> -            temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
> -            emit(MOV(reg, temp));
> -         }
> -
> -         if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
> -            /* ES 3.0 has different rules for converting signed normalized
> -             * fixed-point numbers than desktop GL.
> -             */
> -            if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
> -               /* According to equation 2.2 of the ES 3.0 specification,
> -                * signed normalization conversion is done by:
> -                *
> -                * f = c / (2^(b-1)-1)
> -                */
> -               if (es3_normalize_factor.file == BAD_FILE) {
> -                  /* mul constant: 1 / (2^(b-1) - 1) */
> -                  es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
> -                  emit(MOV(writemask(es3_normalize_factor, WRITEMASK_XYZ),
> -                           src_reg(1.0f / ((1<<9) - 1))));
> -                  emit(MOV(writemask(es3_normalize_factor, WRITEMASK_W),
> -                           src_reg(1.0f / ((1<<1) - 1))));
> -               }
> -
> -               dst_reg dst = reg;
> -               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> -               emit(MOV(dst, src_reg(reg_d)));
> -               emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
> -               emit_minmax(BRW_CONDITIONAL_GE, dst, src_reg(dst), src_reg(-1.0f));
> -            } else {
> -               /* The following equations are from the OpenGL 3.2 specification:
> -                *
> -                * 2.1 unsigned normalization
> -                * f = c/(2^n-1)
> -                *
> -                * 2.2 signed normalization
> -                * f = (2c+1)/(2^n-1)
> -                *
> -                * Both of these share a common divisor, which is represented by
> -                * "normalize_factor" in the code below.
> -                */
> -               if (normalize_factor.file == BAD_FILE) {
> -                  /* 1 / (2^b - 1) for b=<10,10,10,2> */
> -                  normalize_factor = dst_reg(this, glsl_type::vec4_type);
> -                  emit(MOV(writemask(normalize_factor, WRITEMASK_XYZ),
> -                           src_reg(1.0f / ((1<<10) - 1))));
> -                  emit(MOV(writemask(normalize_factor, WRITEMASK_W),
> -                           src_reg(1.0f / ((1<<2) - 1))));
> -               }
> -
> -               dst_reg dst = reg;
> -               dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> -               emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
> -
> -               /* For signed normalization, we want the numerator to be 2c+1. */
> -               if (wa_flags & BRW_ATTRIB_WA_SIGN) {
> -                  emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
> -                  emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
> -               }
> -
> -               emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
> -            }
> -         }
> -
> -         if (wa_flags & BRW_ATTRIB_WA_SCALE) {
> -            dst_reg dst = reg;
> -            dst.type = brw_type_for_base_type(glsl_type::vec4_type);
> -            emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
> -         }
> -      }
> -   }
> -}
> -
> -
> -dst_reg *
> -vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
> -{
> -   /* VertexID is stored by the VF as the last vertex element, but
> -    * we don't represent it with a flag in inputs_read, so we call
> -    * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
> -    */
> -   dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
> -
> -   switch (ir->data.location) {
> -   case SYSTEM_VALUE_BASE_VERTEX:
> -      reg->writemask = WRITEMASK_X;
> -      vs_prog_data->uses_vertexid = true;
> -      break;
> -   case SYSTEM_VALUE_VERTEX_ID:
> -   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
> -      reg->writemask = WRITEMASK_Z;
> -      vs_prog_data->uses_vertexid = true;
> -      break;
> -   case SYSTEM_VALUE_INSTANCE_ID:
> -      reg->writemask = WRITEMASK_W;
> -      vs_prog_data->uses_instanceid = true;
> -      break;
> -   default:
> -      unreachable("not reached");
> -   }
> -
> -   return reg;
> -}
> -
> -
> -void
> -vec4_vs_visitor::emit_urb_write_header(int mrf)
> -{
> -   /* No need to do anything for VS; an implied write to this MRF will be
> -    * performed by VS_OPCODE_URB_WRITE.
> -    */
> -   (void) mrf;
> -}
> -
> -
> -vec4_instruction *
> -vec4_vs_visitor::emit_urb_write_opcode(bool complete)
> -{
> -   /* For VS, the URB writes end the thread. */
> -   if (complete) {
> -      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> -         emit_shader_time_end();
> -   }
> -
> -   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
> -   inst->urb_write_flags = complete ?
> -      BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
> -
> -   return inst;
> -}
> -
> -
> -void
> -vec4_vs_visitor::emit_thread_end()
> -{
> -   /* For VS, we always end the thread by emitting a single vertex.
> -    * emit_urb_write_opcode() will take care of setting the eot flag on the
> -    * SEND instruction.
> -    */
> -   emit_vertex();
> -}
> -
> -
> -vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
> -                                 struct brw_vs_compile *vs_compile,
> -                                 struct brw_vs_prog_data *vs_prog_data,
> -                                 struct gl_shader_program *prog,
> -                                 void *mem_ctx)
> -   : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
> -                  &vs_compile->key.base, &vs_prog_data->base, prog,
> -                  MESA_SHADER_VERTEX,
> -                  mem_ctx, false /* no_spills */,
> -                  ST_VS, ST_VS_WRITTEN, ST_VS_RESET),
> -     vs_compile(vs_compile),
> -     vs_prog_data(vs_prog_data)
> -{
> -}
> -
> -
> -} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
> index ba2c23d..137990c 100644
> --- a/src/mesa/drivers/dri/i965/brw_vs.c
> +++ b/src/mesa/drivers/dri/i965/brw_vs.c
> @@ -232,7 +232,7 @@ do_vs_prog(struct brw_context *brw,
>     } else {
>        param_count = vp->program.Base.Parameters->NumParameters * 4;
>     }
> -   /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip
> +   /* vec4_god::setup_uniform_clipplane_values() also uploads user clip
>      * planes as uniforms.
>      */
>     param_count += c.key.base.nr_userclip_plane_consts * 4;
> diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
> index bad0f07..96ed4ce 100644
> --- a/src/mesa/drivers/dri/i965/brw_vs.h
> +++ b/src/mesa/drivers/dri/i965/brw_vs.h
> @@ -81,10 +81,10 @@ brw_upload_vs_prog(struct brw_context *brw);
>
>  namespace brw {
>
> -class vec4_vs_visitor : public vec4_visitor
> +class vec4_vs_god : public vec4_god
>  {
>  public:
> -   vec4_vs_visitor(struct brw_context *brw,
> +   vec4_vs_god(struct brw_context *brw,
>                     struct brw_vs_compile *vs_compile,
>                     struct brw_vs_prog_data *vs_prog_data,
>                     struct gl_shader_program *prog,
> diff --git a/src/mesa/drivers/dri/i965/brw_wm_iz.cpp b/src/mesa/drivers/dri/i965/brw_wm_iz.cpp
> index 14930eb..8b0efda 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm_iz.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_wm_iz.cpp
> @@ -120,7 +120,7 @@ static const struct {
>   * \param line_aa  AA_NEVER, AA_ALWAYS or AA_SOMETIMES
>   * \param lookup  bitmask of IZ_* flags
>   */
> -void fs_visitor::setup_payload_gen4()
> +void fs_god::setup_payload_gen4()
>  {
>     assert(stage == MESA_SHADER_FRAGMENT);
>     brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_god.cpp b/src/mesa/drivers/dri/i965/gen6_gs_god.cpp
> new file mode 100644
> index 0000000..2ea3e6f
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/gen6_gs_god.cpp
> @@ -0,0 +1,776 @@
> +/*
> + * Copyright © 2014 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + * This code is based on original work by Ilia Mirkin.
> + */
> +
> +/**
> + * \file gen6_gs_god.cpp
> + *
> + * Gen6 geometry shader implementation
> + */
> +
> +#include "gen6_gs_god.h"
> +
> +const unsigned MAX_GS_INPUT_VERTICES = 6;
> +
> +namespace brw {
> +
> +void
> +gen6_gs_god::assign_binding_table_offsets()
> +{
> +   /* In gen6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform
> +    * feedback surfaces.
> +    */
> +   assign_common_binding_table_offsets(BRW_MAX_SOL_BINDINGS);
> +}
> +
> +void
> +gen6_gs_god::emit_prolog()
> +{
> +   vec4_gs_god::emit_prolog();
> +
> +   /* Gen6 geometry shaders require allocating an initial VUE handle via an
> +    * FF_SYNC message. However, the documentation remarks that only one thread
> +    * can write to the URB at a time and that the FF_SYNC message provides the
> +    * synchronization mechanism for this, so using this message effectively
> +    * stalls the thread until it is its turn to write to the URB. Because of
> +    * this, the best way to implement geometry shader algorithms on gen6 is to
> +    * execute the algorithm before the FF_SYNC message to maximize parallelism.
> +    *
> +    * To achieve this we buffer the geometry shader outputs for each emitted
> +    * vertex in vertex_output during operation. Then, when we have processed
> +    * the last vertex (that is, at thread end time), we send the FF_SYNC
> +    * message to allocate the initial VUE handle and write all buffered vertex
> +    * data to the URB in one go.
> +    *
> +    * For each emitted vertex, vertex_output will hold vue_map.num_slots
> +    * data items plus one additional item to hold required flags
> +    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
> +    * which come right after the data items for that vertex. Vertex data and
> +    * flags for the next vertex come right after the data items and flags for
> +    * the previous vertex.
> +    */
> +   this->current_annotation = "gen6 prolog";
> +   this->vertex_output = src_reg(this,
> +                                 glsl_type::uint_type,
> +                                 (prog_data->vue_map.num_slots + 1) *
> +                                 c->gp->program.VerticesOut);
> +   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
> +   emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
> +
> +   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
> +    * so initialize it once to R0.
> +    */
> +   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
> +                                     retype(brw_vec8_grf(0, 0),
> +                                            BRW_REGISTER_TYPE_UD)));
> +   inst->force_writemask_all = true;
> +
> +   /* This will be used as a temporary to store writeback data of FF_SYNC
> +    * and URB_WRITE messages.
> +    */
> +   this->temp = src_reg(this, glsl_type::uint_type);
> +
> +   /* This will be used to know when we are processing the first vertex of
> +    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
> +    * that we are processing the first vertex in the primitive and to zero
> +    * otherwise. This way we can use its value directly in the URB write
> +    * headers.
> +    */
> +   this->first_vertex = src_reg(this, glsl_type::uint_type);
> +   emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
> +
> +   /* The FF_SYNC message requires knowing the number of primitives generated,
> +    * so keep a counter for this.
> +    */
> +   this->prim_count = src_reg(this, glsl_type::uint_type);
> +   emit(MOV(dst_reg(this->prim_count), 0u));
> +
> +   if (c->prog_data.gen6_xfb_enabled) {
> +      /* Create a virtual register to hold destination indices in SOL */
> +      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
> +      /* Create a virtual register to hold number of written primitives */
> +      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
> +      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
> +      this->svbi = src_reg(this, glsl_type::uvec4_type);
> +      /* Create a virtual register to hold max values of SVBI */
> +      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
> +      emit(MOV(dst_reg(this->max_svbi),
> +               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
> +
> +      xfb_setup();
> +   }
> +
> +   /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
> +    * needs it we have to move it to a separate register where we can map
> +    * the attribute.
> +    *
> +    * Notice that we cannot use a virtual register for this, because we need to
> +    * map all input attributes to hardware registers in setup_payload(),
> +    * which happens before virtual registers are mapped to hardware registers.
> +    * We could work around that issue if we were able to compute the first
> +    * non-payload register here and move the PrimitiveID information to that
> +    * register, but we can't because at this point we don't know the final
> +    * number of uniforms that will be included in the payload.
> +    *
> +    * So what we do is place the PrimitiveID information in r1, which is always
> +    * delivered as part of the payload, but it's only populated with data
> +    * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
> +    * in the 3DSTATE_GS state packet. That information can be obtained by other
> +    * means though, so we can safely use r1 for this purpose.
> +    */
> +   if (c->prog_data.include_primitive_id) {
> +      this->primitive_id =
> +         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
> +      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
> +   }
> +}
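
To make the prolog comment concrete: vertex_output is laid out as
(num_slots + 1) uints per emitted vertex, data items first and the flags
dword last.  Index arithmetic only, as hypothetical helpers (the real code
walks the buffer with a running vertex_output_offset instead of computing
indices):

    /* Data item `slot` of vertex `v`, and the trailing flags dword. */
    static unsigned data_index(unsigned v, unsigned num_slots, unsigned slot)
    {
       return v * (num_slots + 1) + slot;
    }

    static unsigned flags_index(unsigned v, unsigned num_slots)
    {
       return v * (num_slots + 1) + num_slots;
    }
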
> +
> +void
> +gen6_gs_god::visit(ir_emit_vertex *)
> +{
> +   this->current_annotation = "gen6 emit vertex";
> +   /* Honor the max_vertices layout qualifier in the geometry shader by ignoring
> +    * vertices coming after c->gp->program.VerticesOut.
> +    */
> +   unsigned num_output_vertices = c->gp->program.VerticesOut;
> +   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
> +            BRW_CONDITIONAL_L));
> +   emit(IF(BRW_PREDICATE_NORMAL));
> +   {
> +      /* Buffer all output slots for this vertex in vertex_output */
> +      for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
> +         int varying = prog_data->vue_map.slot_to_varying[slot];
> +         if (varying != VARYING_SLOT_PSIZ) {
> +            dst_reg dst(this->vertex_output);
> +            dst.reladdr = ralloc(mem_ctx, src_reg);
> +            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> +            emit_urb_slot(dst, varying);
> +         } else {
> +            /* The PSIZ slot can pack multiple varyings in different channels
> +             * and emit_urb_slot() will produce a MOV instruction for each of
> +             * them. Since we are writing to an array, that will translate to
> +             * possibly multiple MOV instructions with an array destination and
> +             * each will generate a scratch write with the same offset into
> +             * scratch space (thus, each one overwriting the previous). This is
> +             * not what we want. What we will do instead is emit PSIZ to a
> +             * regular temporary register, then move that register into the
> +             * array. This way we only have one instruction with an array
> +             * destination and we only produce a single scratch write.
> +             */
> +            dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
> +            emit_urb_slot(tmp, varying);
> +            dst_reg dst(this->vertex_output);
> +            dst.reladdr = ralloc(mem_ctx, src_reg);
> +            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> +            vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
> +            inst->force_writemask_all = true;
> +         }
> +
> +         emit(ADD(dst_reg(this->vertex_output_offset),
> +                  this->vertex_output_offset, 1u));
> +      }
> +
> +      /* Now buffer flags for this vertex */
> +      dst_reg dst(this->vertex_output);
> +      dst.reladdr = ralloc(mem_ctx, src_reg);
> +      memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> +      if (c->gp->program.OutputType == GL_POINTS) {
> +         /* If we are outputting points, then every vertex has PrimStart and
> +          * PrimEnd set.
> +          */
> +         emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
> +                  URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
> +         emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
> +      } else {
> +         /* Otherwise, we can only set the PrimStart flag, which we have stored
> +          * in the first_vertex register. We will have to wait until we execute
> +          * EndPrimitive() or we end the thread to set the PrimEnd flag on a
> +          * vertex.
> +          */
> +         emit(OR(dst, this->first_vertex,
> +                 (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
> +         emit(MOV(dst_reg(this->first_vertex), 0u));
> +      }
> +      emit(ADD(dst_reg(this->vertex_output_offset),
> +               this->vertex_output_offset, 1u));
> +
> +      /* Update vertex count */
> +      emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
> +   }
> +   emit(BRW_OPCODE_ENDIF);
> +}
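
For the point-list fast path above, the flags dword packs the primitive
type and both start/end markers in one OR.  Shown with stand-in constants
(the real values live in the driver's headers; treat the numbers below as
assumptions, not the hardware encoding):

    #include <cstdint>

    static const uint32_t URB_WRITE_PRIM_TYPE_SHIFT = 2;   /* assumed */
    static const uint32_t URB_WRITE_PRIM_START      = 0x1; /* assumed */
    static const uint32_t URB_WRITE_PRIM_END        = 0x2; /* assumed */
    static const uint32_t PRIM_POINTLIST            = 0x1; /* assumed */

    /* Every point both starts and ends its own primitive, so its flags
     * can be written unconditionally, with no first_vertex bookkeeping. */
    static uint32_t point_flags()
    {
       return (PRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
              URB_WRITE_PRIM_START | URB_WRITE_PRIM_END;
    }
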
> +
> +void
> +gen6_gs_god::visit(ir_end_primitive *)
> +{
> +   this->current_annotation = "gen6 end primitive";
> +   /* Calling EndPrimitive() is optional for point output. In this case we set
> +    * the PrimEnd flag when we process EmitVertex().
> +    */
> +   if (c->gp->program.OutputType == GL_POINTS)
> +      return;
> +
> +   /* Otherwise we know that the last vertex we have processed was the last
> +    * vertex in the primitive and we need to set its PrimEnd flag, so do this
> +    * unless we haven't emitted any vertex at all (that is, only when
> +    * vertex_count != 0).
> +    *
> +    * Notice that we have already incremented vertex_count when we processed
> +    * the last emit_vertex, so we need to take that into account in the
> +    * comparison below (hence the num_output_vertices + 1).
> +    */
> +   unsigned num_output_vertices = c->gp->program.VerticesOut;
> +   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
> +            BRW_CONDITIONAL_L));
> +   vec4_instruction *inst = emit(CMP(dst_null_d(),
> +                                     this->vertex_count, 0u,
> +                                     BRW_CONDITIONAL_NEQ));
> +   inst->predicate = BRW_PREDICATE_NORMAL;
> +   emit(IF(BRW_PREDICATE_NORMAL));
> +   {
> +      /* vertex_output_offset is already pointing at the first entry of the
> +       * next vertex. So subtract 1 to modify the flags for the previous
> +       * vertex.
> +       */
> +      src_reg offset(this, glsl_type::uint_type);
> +      emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));
> +
> +      src_reg dst(this->vertex_output);
> +      dst.reladdr = ralloc(mem_ctx, src_reg);
> +      memcpy(dst.reladdr, &offset, sizeof(src_reg));
> +
> +      emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
> +      emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
> +
> +      /* Set the first vertex flag to indicate that the next vertex will start
> +       * a primitive.
> +       */
> +      emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
> +   }
> +   emit(BRW_OPCODE_ENDIF);
> +}
> +
> +void
> +gen6_gs_god::emit_urb_write_header(int mrf)
> +{
> +   this->current_annotation = "gen6 urb header";
> +   /* Compute offset of the flags for the current vertex in vertex_output and
> +    * write them in dw2 of the message header.
> +    *
> +    * Notice that by the time that emit_thread_end() calls here
> +    * vertex_output_offset should point to the first data item of the current
> +    * vertex in vertex_output, thus we only need to add the number of output
> +    * slots per vertex to that offset to obtain the flags data offset.
> +    */
> +   src_reg flags_offset(this, glsl_type::uint_type);
> +   emit(ADD(dst_reg(flags_offset),
> +            this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
> +
> +   src_reg flags_data(this->vertex_output);
> +   flags_data.reladdr = ralloc(mem_ctx, src_reg);
> +   memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
> +
> +   emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
> +}
> +
> +void
> +gen6_gs_god::emit_urb_write_opcode(bool complete, int base_mrf,
> +                                       int last_mrf, int urb_offset)
> +{
> +   vec4_instruction *inst = NULL;
> +
> +   if (!complete) {
> +      /* If the vertex is not complete we don't have to do anything special */
> +      inst = emit(GS_OPCODE_URB_WRITE);
> +      inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
> +   } else {
> +      /* Otherwise we always request to allocate a new VUE handle. If this is
> +       * the last write before the EOT message and the new handle never gets
> +       * used it will be dereferenced when we send the EOT message. This is
> +       * necessary to avoid different setups for the EOT message (one for the
> +       * case when there is no output and another for the case when there is)
> +       * which would require ending the program with an IF/ELSE/ENDIF block,
> +       * something we do not want.
> +       */
> +      inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
> +      inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
> +      inst->dst = dst_reg(MRF, base_mrf);
> +      inst->src[0] = this->temp;
> +   }
> +
> +   inst->base_mrf = base_mrf;
> +   /* URB data written (does not include the message header reg) must
> +    * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
> +    * section 5.4.3.2.2: URB_INTERLEAVED.
> +    */
> +   int mlen = last_mrf - base_mrf;
> +   if ((mlen % 2) != 1)
> +      mlen++;
> +   inst->mlen = mlen;
> +   inst->offset = urb_offset;
> +}
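
The parity fixup on mlen trips people up, so here it is in isolation: mlen
counts the header plus the data MRFs, and the data portion (mlen - 1
registers) must stay a multiple of two registers (256 bits) for interleaved
URB writes, hence rounding mlen up to the next odd value:

    #include <cassert>

    static int round_urb_mlen(int last_mrf, int base_mrf)
    {
       int mlen = last_mrf - base_mrf;  /* includes the header register    */
       if ((mlen % 2) != 1)
          mlen++;                       /* make data count (mlen - 1) even */
       return mlen;
    }

    int main()
    {
       assert(round_urb_mlen(4, 1) == 3);   /* 2 data regs: aligned  */
       assert(round_urb_mlen(5, 1) == 5);   /* 3 data regs: pad to 4 */
    }
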
> +
> +void
> +gen6_gs_god::emit_thread_end()
> +{
> +   /* Make sure the current primitive is ended: we know it is not ended when
> +    * first_vertex is not zero. This is only relevant for outputs other than
> +    * points because in the point case we set PrimEnd on all vertices.
> +    */
> +   if (c->gp->program.OutputType != GL_POINTS) {
> +      emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
> +      emit(IF(BRW_PREDICATE_NORMAL));
> +      {
> +         visit((ir_end_primitive *) NULL);
> +      }
> +      emit(BRW_OPCODE_ENDIF);
> +   }
> +
> +   /* Here we have to:
> +    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
> +    * 2) Loop over all buffered vertex data and write it to corresponding
> +    *    URB entries.
> +    * 3) Allocate new VUE handles for all vertices other than the first.
> +    * 4) Send a final EOT message.
> +    */
> +
> +   /* MRF 0 is reserved for the debugger, so start with message header
> +    * in MRF 1.
> +    */
> +   int base_mrf = 1;
> +
> +   /* In the process of generating our URB write message contents, we
> +    * may need to unspill a register or load from an array.  Those
> +    * reads would use MRFs 14-15.
> +    */
> +   int max_usable_mrf = 13;
> +
> +   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
> +   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
> +   emit(IF(BRW_PREDICATE_NORMAL));
> +   {
> +      this->current_annotation = "gen6 thread end: ff_sync";
> +
> +      vec4_instruction *inst;
> +      if (c->prog_data.gen6_xfb_enabled) {
> +         src_reg sol_temp(this, glsl_type::uvec4_type);
> +         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
> +              dst_reg(this->svbi),
> +              this->vertex_count,
> +              this->prim_count,
> +              sol_temp);
> +         inst = emit(GS_OPCODE_FF_SYNC,
> +                     dst_reg(this->temp), this->prim_count, this->svbi);
> +      } else {
> +         inst = emit(GS_OPCODE_FF_SYNC,
> +                     dst_reg(this->temp), this->prim_count, src_reg(0u));
> +      }
> +      inst->base_mrf = base_mrf;
> +
> +      /* Loop over all buffered vertices and emit URB write messages */
> +      this->current_annotation = "gen6 thread end: urb writes init";
> +      src_reg vertex(this, glsl_type::uint_type);
> +      emit(MOV(dst_reg(vertex), 0u));
> +      emit(MOV(dst_reg(this->vertex_output_offset), 0u));
> +
> +      this->current_annotation = "gen6 thread end: urb writes";
> +      emit(BRW_OPCODE_DO);
> +      {
> +         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
> +         inst = emit(BRW_OPCODE_BREAK);
> +         inst->predicate = BRW_PREDICATE_NORMAL;
> +
> +         /* First we prepare the message header */
> +         emit_urb_write_header(base_mrf);
> +
> +         /* Then add vertex data to the message in interleaved fashion */
> +         int slot = 0;
> +         bool complete = false;
> +         do {
> +            int mrf = base_mrf + 1;
> +
> +            /* URB offset is in URB row increments, and each of our MRFs is half
> +             * of one of those, since we're doing interleaved writes.
> +             */
> +            int urb_offset = slot / 2;
> +
> +            for (; slot < prog_data->vue_map.num_slots; ++slot) {
> +               int varying = prog_data->vue_map.slot_to_varying[slot];
> +               current_annotation = output_reg_annotation[varying];
> +
> +               /* Compute offset of this slot for the current vertex
> +                * in vertex_output
> +                */
> +               src_reg data(this->vertex_output);
> +               data.reladdr = ralloc(mem_ctx, src_reg);
> +               memcpy(data.reladdr, &this->vertex_output_offset,
> +                      sizeof(src_reg));
> +
> +               /* Copy this slot to the appropriate message register */
> +               dst_reg reg = dst_reg(MRF, mrf);
> +               reg.type = output_reg[varying].type;
> +               data.type = reg.type;
> +               vec4_instruction *inst = emit(MOV(reg, data));
> +               inst->force_writemask_all = true;
> +
> +               mrf++;
> +               emit(ADD(dst_reg(this->vertex_output_offset),
> +                        this->vertex_output_offset, 1u));
> +
> +               /* If this was max_usable_mrf, we can't fit anything more into
> +                * this URB WRITE.
> +                */
> +               if (mrf > max_usable_mrf) {
> +                  slot++;
> +                  break;
> +               }
> +            }
> +
> +            complete = slot >= prog_data->vue_map.num_slots;
> +            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
> +         } while (!complete);
> +
> +         /* Skip over the flags data item so that vertex_output_offset points
> +          * to the first data item of the next vertex, so that we can start
> +          * writing the next vertex.
> +          */
> +         emit(ADD(dst_reg(this->vertex_output_offset),
> +                  this->vertex_output_offset, 1u));
> +
> +         emit(ADD(dst_reg(vertex), vertex, 1u));
> +      }
> +      emit(BRW_OPCODE_WHILE);
> +
> +      if (c->prog_data.gen6_xfb_enabled)
> +         xfb_write();
> +   }
> +   emit(BRW_OPCODE_ENDIF);
> +
> +   /* Finally, emit EOT message.
> +    *
> +    * In gen6 we need to end the thread differently depending on whether we have
> +    * emitted at least one vertex or not. In case we did, the EOT message must
> +    * always include the COMPLETE flag or else the GPU hangs. If we have not
> +    * produced any output we can't use the COMPLETE flag.
> +    *
> +    * However, this would lead us to end the program with an ENDIF opcode,
> +    * which we want to avoid, so what we do instead is always request a new
> +    * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
> +    * With this we ensure that, whether we have emitted at least one vertex
> +    * or none at all, the thread always finishes without writing to the URB,
> +    * which works for both cases by setting the COMPLETE and UNUSED flags in
> +    */
> +   this->current_annotation = "gen6 thread end: EOT";
> +
> +   if (c->prog_data.gen6_xfb_enabled) {
> +      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
> +      src_reg data(this, glsl_type::uint_type);
> +      emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
> +      emit(SHL(dst_reg(data), data, src_reg(16u)));
> +      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
> +   }
> +
> +   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
> +   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
> +   inst->base_mrf = base_mrf;
> +   inst->mlen = 1;
> +}
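
The four numbered steps above, as straight-line pseudocode, since the real
emission interleaves them with predication (an outline of the generated
program, under my reading of the hunk, not compiler code):

    /* Outline of the thread-end sequence emitted above (assumed flow).
     *
     *   if (vertex_count > 0) {
     *      FF_SYNC                       // get initial VUE handle, stalls
     *      for (v = 0; v < vertex_count; v++) {
     *         write URB header (flags from vertex_output)
     *         URB_WRITE slots, allocating a fresh handle when complete
     *      }
     *      if (xfb enabled) xfb_write()
     *   }
     *   THREAD_END with COMPLETE | UNUSED  // frees the spare handle
     */
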
> +
> +void
> +gen6_gs_god::setup_payload()
> +{
> +   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
> +
> +   /* Attributes are going to be interleaved, so one register contains two
> +    * attribute slots.
> +    */
> +   int attributes_per_reg = 2;
> +
> +   /* If a geometry shader tries to read from an input that wasn't written by
> +    * the vertex shader, that produces undefined results, but it shouldn't
> +    * crash anything.  So initialize attribute_map to zeros--that ensures that
> +    * these undefined results are read from r0.
> +    */
> +   memset(attribute_map, 0, sizeof(attribute_map));
> +
> +   int reg = 0;
> +
> +   /* The payload always contains important data in r0. */
> +   reg++;
> +
> +   /* r1 is always part of the payload and it holds information relevant
> +    * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
> +    * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
> +    * information (and move the original value to a virtual register if
> +    * necessary).
> +    */
> +   if (c->prog_data.include_primitive_id)
> +      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
> +   reg++;
> +
> +   reg = setup_uniforms(reg);
> +
> +   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
> +
> +   lower_attributes_to_hw_regs(attribute_map, true);
> +
> +   this->first_non_payload_grf = reg;
> +}
> +
> +void
> +gen6_gs_god::xfb_setup()
> +{
> +   static const unsigned swizzle_for_offset[4] = {
> +      BRW_SWIZZLE4(0, 1, 2, 3),
> +      BRW_SWIZZLE4(1, 2, 3, 3),
> +      BRW_SWIZZLE4(2, 3, 3, 3),
> +      BRW_SWIZZLE4(3, 3, 3, 3)
> +   };
> +
> +   struct brw_gs_prog_data *prog_data =
> +      (struct brw_gs_prog_data *) &c->prog_data;
> +
> +   const struct gl_transform_feedback_info *linked_xfb_info =
> +      &this->shader_prog->LinkedTransformFeedback;
> +   int i;
> +
> +   /* Make sure that the VUE slots won't overflow the unsigned chars in
> +    * prog_data->transform_feedback_bindings[].
> +    */
> +   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
> +
> +   /* Make sure that we don't need more binding table entries than we've
> +    * set aside for use in transform feedback.  (We shouldn't, since we
> +    * set aside enough binding table entries to have one per component).
> +    */
> +   assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
> +
> +   prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
> +   for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) {
> +      prog_data->transform_feedback_bindings[i] =
> +         linked_xfb_info->Outputs[i].OutputRegister;
> +      prog_data->transform_feedback_swizzles[i] =
> +         swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
> +   }
> +}
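
The swizzle_for_offset table above implements a simple rule: a binding
whose output starts at component k reads channels <k, k+1, ...>, clamped to
W once the vec4 runs out.  A plain-array equivalent, assuming only that
BRW_SWIZZLE4 packs four channel selectors:

    /* Channel selection for one ComponentOffset, clamped like the table:
     * offset 1 -> (1, 2, 3, 3), offset 3 -> (3, 3, 3, 3), etc. */
    static void channels_for_offset(unsigned offset, unsigned out[4])
    {
       for (unsigned i = 0; i < 4; i++) {
          unsigned c = offset + i;
          out[i] = c > 3 ? 3 : c;
       }
    }
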
> +
> +void
> +gen6_gs_god::xfb_write()
> +{
> +   unsigned num_verts;
> +   struct brw_gs_prog_data *prog_data =
> +      (struct brw_gs_prog_data *) &c->prog_data;
> +
> +   if (!prog_data->num_transform_feedback_bindings)
> +      return;
> +
> +   switch (c->prog_data.output_topology) {
> +   case _3DPRIM_POINTLIST:
> +      num_verts = 1;
> +      break;
> +   case _3DPRIM_LINELIST:
> +   case _3DPRIM_LINESTRIP:
> +   case _3DPRIM_LINELOOP:
> +      num_verts = 2;
> +      break;
> +   case _3DPRIM_TRILIST:
> +   case _3DPRIM_TRIFAN:
> +   case _3DPRIM_TRISTRIP:
> +   case _3DPRIM_RECTLIST:
> +      num_verts = 3;
> +      break;
> +   case _3DPRIM_QUADLIST:
> +   case _3DPRIM_QUADSTRIP:
> +   case _3DPRIM_POLYGON:
> +      num_verts = 3;
> +      break;
> +   default:
> +      unreachable("Unexpected primitive type in Gen6 SOL program.");
> +   }
> +
> +   this->current_annotation = "gen6 thread end: svb writes init";
> +
> +   emit(MOV(dst_reg(this->vertex_output_offset), 0u));
> +   emit(MOV(dst_reg(this->sol_prim_written), 0u));
> +
> +   /* Check that at least one primitive can be written
> +    *
> +    * Note: since we use the binding table to keep track of buffer offsets
> +    * and stride, the GS doesn't need to keep track of a separate pointer
> +    * into each buffer; it uses a single pointer which increments by 1 for
> +    * each vertex.  So we use SVBI0 for this pointer, regardless of whether
> +    * transform feedback is in interleaved or separate attribs mode.
> +    */
> +   src_reg sol_temp(this, glsl_type::uvec4_type);
> +   emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));
> +
> +   /* Compare the calculated SVBI number with the maximum value, which on
> +    * gen6 is delivered in R1.4 (previously saved in this->max_svbi).
> +    */
> +   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
> +   emit(IF(BRW_PREDICATE_NORMAL));
> +   {
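> +      /* destination_indices = svbi + (0, 1, 2): one SVB destination index
> +       * for each vertex of the primitive we are about to write.
> +       */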
> +      src_reg destination_indices_uw =
> +         retype(destination_indices, BRW_REGISTER_TYPE_UW);
> +
> +      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
> +                                        brw_imm_v(0x00020100))); /* (0, 1, 2) */
> +      inst->force_writemask_all = true;
> +
> +      emit(ADD(dst_reg(this->destination_indices),
> +               this->destination_indices,
> +               this->svbi));
> +   }
> +   emit(BRW_OPCODE_ENDIF);
> +
> +   /* Write transform feedback data for all processed vertices. */
> +   for (int i = 0; i < c->gp->program.VerticesOut; i++) {
> +      emit(MOV(dst_reg(sol_temp), i));
> +      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
> +               BRW_CONDITIONAL_L));
> +      emit(IF(BRW_PREDICATE_NORMAL));
> +      {
> +         xfb_program(i, num_verts);
> +      }
> +      emit(BRW_OPCODE_ENDIF);
> +   }
> +}
> +
> +void
> +gen6_gs_god::xfb_program(unsigned vertex, unsigned num_verts)
> +{
> +   struct brw_gs_prog_data *prog_data =
> +      (struct brw_gs_prog_data *) &c->prog_data;
> +   unsigned binding;
> +   unsigned num_bindings = prog_data->num_transform_feedback_bindings;
> +   src_reg sol_temp(this, glsl_type::uvec4_type);
> +
> +   /* Check for buffer overflow: we need room to write the complete primitive
> +    * (all vertices); otherwise we skip writing any vertex for it.
> +    */
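> +   /* That is, the primitive fits only if
> +    * svbi + (sol_prim_written + 1) * num_verts <= max_svbi.
> +    */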
> +   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
> +   emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
> +   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
> +   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
> +   emit(IF(BRW_PREDICATE_NORMAL));
> +   {
> +      /* Avoid overwriting MRF 1 as it is used as the URB write message header */
> +      dst_reg mrf_reg(MRF, 2);
> +
> +      this->current_annotation = "gen6: emit SOL vertex data";
> +      /* For each vertex, generate code to output each varying using the
> +       * appropriate binding table entry.
> +       */
> +      for (binding = 0; binding < num_bindings; ++binding) {
> +         unsigned char varying =
> +            prog_data->transform_feedback_bindings[binding];
> +
> +         /* Set up the correct destination index for this vertex */
> +         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
> +                                       mrf_reg,
> +                                       this->destination_indices);
> +         inst->sol_vertex = vertex % num_verts;
> +
> +         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
> +          *
> +          *   "Prior to End of Thread with a URB_WRITE, the kernel must
> +          *   ensure that all writes are complete by sending the final
> +          *   write as a committed write."
> +          */
> +         bool final_write = binding == (unsigned) num_bindings - 1 &&
> +                            inst->sol_vertex == num_verts - 1;
> +
> +         /* Compute offset of this varying for the current vertex
> +          * in vertex_output
> +          */
> +         this->current_annotation = output_reg_annotation[varying];
> +         src_reg data(this->vertex_output);
> +         data.reladdr = ralloc(mem_ctx, src_reg);
> +         int offset = get_vertex_output_offset_for_varying(vertex, varying);
> +         emit(MOV(dst_reg(this->vertex_output_offset), offset));
> +         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> +         data.type = output_reg[varying].type;
> +
> +         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
> +          * same slot, so make sure we write the appropriate channel
> +          */
> +         if (varying == VARYING_SLOT_PSIZ)
> +            data.swizzle = BRW_SWIZZLE_WWWW;
> +         else if (varying == VARYING_SLOT_LAYER)
> +            data.swizzle = BRW_SWIZZLE_YYYY;
> +         else if (varying == VARYING_SLOT_VIEWPORT)
> +            data.swizzle = BRW_SWIZZLE_ZZZZ;
> +         else
> +            data.swizzle = prog_data->transform_feedback_swizzles[binding];
> +
> +         /* Write data */
> +         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
> +         inst->sol_binding = binding;
> +         inst->sol_final_write = final_write;
> +
> +         if (final_write) {
> +            /* This is the last vertex of the primitive, so increment the
> +             * SO primitive counter and the destination indices.
> +             */
> +            emit(ADD(dst_reg(this->destination_indices),
> +                     this->destination_indices,
> +                     src_reg(num_verts)));
> +            emit(ADD(dst_reg(this->sol_prim_written),
> +                     this->sol_prim_written, 1u));
> +         }
> +
> +      }
> +      this->current_annotation = NULL;
> +   }
> +   emit(BRW_OPCODE_ENDIF);
> +}
> +
> +int
> +gen6_gs_god::get_vertex_output_offset_for_varying(int vertex, int varying)
> +{
> +   /* Find the output slot assigned to this varying.
> +    *
> +    * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
> +    * as VARYING_SLOT_PSIZ.
> +    */
> +   if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
> +      varying = VARYING_SLOT_PSIZ;
> +   int slot = prog_data->vue_map.varying_to_slot[varying];
> +
> +   if (slot < 0) {
> +      /* This varying does not exist in the VUE so we are not writing to it
> +       * and its value is undefined. We still want to return a valid offset
> +       * into vertex_output though, to prevent any out-of-bounds accesses into
> +       * the vertex_output array. Since the value for this varying is undefined
> +       * we don't really care which value we assign, so any offset within the
> +       * limits of vertex_output will do.
> +       */
> +      slot = 0;
> +   }
> +
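> +   /* Each vertex occupies vue_map.num_slots data items plus one flags item
> +    * in vertex_output, so e.g. with num_slots = 8, vertex 2 and slot 3 map
> +    * to offset 2 * (8 + 1) + 3 = 21.
> +    */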
> +   return vertex * (prog_data->vue_map.num_slots + 1) + slot;
> +}
> +
> +} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_god.h b/src/mesa/drivers/dri/i965/gen6_gs_god.h
> new file mode 100644
> index 0000000..f99f2cc
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/gen6_gs_god.h
> @@ -0,0 +1,86 @@
> +/*
> + * Copyright © 2014 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef GEN6_GS_GOD_H
> +#define GEN6_GS_GOD_H
> +
> +#include "brw_vec4.h"
> +#include "brw_vec4_gs_god.h"
> +
> +#ifdef __cplusplus
> +
> +namespace brw {
> +
> +class gen6_gs_god : public vec4_gs_god
> +{
> +public:
> +   gen6_gs_god(struct brw_context *brw,
> +               struct brw_gs_compile *c,
> +               struct gl_shader_program *prog,
> +               void *mem_ctx,
> +               bool no_spills) :
> +      vec4_gs_god(brw, c, prog, mem_ctx, no_spills) {}
> +
> +protected:
> +   virtual void assign_binding_table_offsets();
> +   virtual void emit_prolog();
> +   virtual void emit_thread_end();
> +   virtual void visit(ir_emit_vertex *);
> +   virtual void visit(ir_end_primitive *);
> +   virtual void emit_urb_write_header(int mrf);
> +   virtual void emit_urb_write_opcode(bool complete,
> +                                      int base_mrf,
> +                                      int last_mrf,
> +                                      int urb_offset);
> +   virtual void setup_payload();
> +
> +private:
> +   void xfb_write();
> +   void xfb_program(unsigned vertex, unsigned num_verts);
> +   void xfb_setup();
> +   int get_vertex_output_offset_for_varying(int vertex, int varying);
> +
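> +   /* Emitted vertices are staged in vertex_output (vue_map.num_slots data
> +    * items plus one flags item per vertex, indexed through
> +    * vertex_output_offset) and only written to the URB at thread end; see
> +    * emit_prolog() in gen6_gs_god.cpp for the rationale.
> +    */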
> +   src_reg vertex_output;
> +   src_reg vertex_output_offset;
> +   src_reg temp;
> +   src_reg first_vertex;
> +   src_reg prim_count;
> +   src_reg primitive_id;
> +
> +   /* Transform Feedback members */
> +   src_reg sol_prim_written;
> +   src_reg svbi;
> +   src_reg max_svbi;
> +   src_reg destination_indices;
> +};
> +
> +} /* namespace brw */
> +
> +#endif /* __cplusplus */
> +
> +#endif /* GEN6_GS_GOD_H */
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
> deleted file mode 100644
> index 782687a..0000000
> --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
> +++ /dev/null
> @@ -1,776 +0,0 @@
> -/*
> - * Copyright © 2014 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> - * IN THE SOFTWARE.
> - *
> - * This code is based on original work by Ilia Mirkin.
> - */
> -
> -/**
> - * \file gen6_gs_visitor.cpp
> - *
> - * Gen6 geometry shader implementation
> - */
> -
> -#include "gen6_gs_visitor.h"
> -
> -const unsigned MAX_GS_INPUT_VERTICES = 6;
> -
> -namespace brw {
> -
> -void
> -gen6_gs_visitor::assign_binding_table_offsets()
> -{
> -   /* In gen6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform
> -    * feedback surfaces.
> -    */
> -   assign_common_binding_table_offsets(BRW_MAX_SOL_BINDINGS);
> -}
> -
> -void
> -gen6_gs_visitor::emit_prolog()
> -{
> -   vec4_gs_visitor::emit_prolog();
> -
> -   /* Gen6 geometry shaders require to allocate an initial VUE handle via
> -    * FF_SYNC message, however the documentation remarks that only one thread
> -    * can write to the URB simultaneously and the FF_SYNC message provides the
> -    * synchronization mechanism for this, so using this message effectively
> -    * stalls the thread until it is its turn to write to the URB. Because of
> -    * this, the best way to implement geometry shader algorithms in gen6 is to
> -    * execute the algorithm before the FF_SYNC message to maximize parallelism.
> -    *
> -    * To achieve this we buffer the geometry shader outputs for each emitted
> -    * vertex in vertex_output during operation. Then, when we have processed
> -    * the last vertex (that is, at thread end time), we send the FF_SYNC
> -    * message to allocate the initial VUE handle and write all buffered vertex
> -    * data to the URB in one go.
> -    *
> -    * For each emitted vertex, vertex_output will hold vue_map.num_slots
> -    * data items plus one additional item to hold required flags
> -    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
> -    * which come right after the data items for that vertex. Vertex data and
> -    * flags for the next vertex come right after the data items and flags for
> -    * the previous vertex.
> -    */
> -   this->current_annotation = "gen6 prolog";
> -   this->vertex_output = src_reg(this,
> -                                 glsl_type::uint_type,
> -                                 (prog_data->vue_map.num_slots + 1) *
> -                                 c->gp->program.VerticesOut);
> -   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
> -   emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
> -
> -   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
> -    * so initialize it once to R0.
> -    */
> -   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
> -                                     retype(brw_vec8_grf(0, 0),
> -                                            BRW_REGISTER_TYPE_UD)));
> -   inst->force_writemask_all = true;
> -
> -   /* This will be used as a temporary to store writeback data of FF_SYNC
> -    * and URB_WRITE messages.
> -    */
> -   this->temp = src_reg(this, glsl_type::uint_type);
> -
> -   /* This will be used to know when we are processing the first vertex of
> -    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
> -    * that we are processing the first vertex in the primitive and to zero
> -    * otherwise. This way we can use its value directly in the URB write
> -    * headers.
> -    */
> -   this->first_vertex = src_reg(this, glsl_type::uint_type);
> -   emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
> -
> -   /* The FF_SYNC message requires to know the number of primitives generated,
> -    * so keep a counter for this.
> -    */
> -   this->prim_count = src_reg(this, glsl_type::uint_type);
> -   emit(MOV(dst_reg(this->prim_count), 0u));
> -
> -   if (c->prog_data.gen6_xfb_enabled) {
> -      /* Create a virtual register to hold destination indices in SOL */
> -      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
> -      /* Create a virtual register to hold number of written primitives */
> -      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
> -      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
> -      this->svbi = src_reg(this, glsl_type::uvec4_type);
> -      /* Create a virtual register to hold max values of SVBI */
> -      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
> -      emit(MOV(dst_reg(this->max_svbi),
> -               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
> -
> -      xfb_setup();
> -   }
> -
> -   /* PrimitveID is delivered in r0.1 of the thread payload. If the program
> -    * needs it we have to move it to a separate register where we can map
> -    * the atttribute.
> -    *
> -    * Notice that we cannot use a virtual register for this, because we need to
> -    * map all input attributes to hardware registers in setup_payload(),
> -    * which happens before virtual registers are mapped to hardware registers.
> -    * We could work around that issue if we were able to compute the first
> -    * non-payload register here and move the PrimitiveID information to that
> -    * register, but we can't because at this point we don't know the final
> -    * number uniforms that will be included in the payload.
> -    *
> -    * So, what we do is to place PrimitiveID information in r1, which is always
> -    * delivered as part of the payload, but its only populated with data
> -    * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
> -    * in the 3DSTATE_GS state packet. That information can be obtained by other
> -    * means though, so we can safely use r1 for this purpose.
> -    */
> -   if (c->prog_data.include_primitive_id) {
> -      this->primitive_id =
> -         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
> -      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
> -   }
> -}
> -
> -void
> -gen6_gs_visitor::visit(ir_emit_vertex *)
> -{
> -   this->current_annotation = "gen6 emit vertex";
> -   /* Honor max_vertex layout indication in geometry shader by ignoring any
> -    * vertices coming after c->gp->program.VerticesOut.
> -    */
> -   unsigned num_output_vertices = c->gp->program.VerticesOut;
> -   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
> -            BRW_CONDITIONAL_L));
> -   emit(IF(BRW_PREDICATE_NORMAL));
> -   {
> -      /* Buffer all output slots for this vertex in vertex_output */
> -      for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
> -         int varying = prog_data->vue_map.slot_to_varying[slot];
> -         if (varying != VARYING_SLOT_PSIZ) {
> -            dst_reg dst(this->vertex_output);
> -            dst.reladdr = ralloc(mem_ctx, src_reg);
> -            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> -            emit_urb_slot(dst, varying);
> -         } else {
> -            /* The PSIZ slot can pack multiple varyings in different channels
> -             * and emit_urb_slot() will produce a MOV instruction for each of
> -             * them. Since we are writing to an array, that will translate to
> -             * possibly multiple MOV instructions with an array destination and
> -             * each will generate a scratch write with the same offset into
> -             * scratch space (thus, each one overwriting the previous). This is
> -             * not what we want. What we will do instead is emit PSIZ to a
> -             * a regular temporary register, then move that resgister into the
> -             * array. This way we only have one instruction with an array
> -             * destination and we only produce a single scratch write.
> -             */
> -            dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
> -            emit_urb_slot(tmp, varying);
> -            dst_reg dst(this->vertex_output);
> -            dst.reladdr = ralloc(mem_ctx, src_reg);
> -            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> -            vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
> -            inst->force_writemask_all = true;
> -         }
> -
> -         emit(ADD(dst_reg(this->vertex_output_offset),
> -                  this->vertex_output_offset, 1u));
> -      }
> -
> -      /* Now buffer flags for this vertex */
> -      dst_reg dst(this->vertex_output);
> -      dst.reladdr = ralloc(mem_ctx, src_reg);
> -      memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> -      if (c->gp->program.OutputType == GL_POINTS) {
> -         /* If we are outputting points, then every vertex has PrimStart and
> -          * PrimEnd set.
> -          */
> -         emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
> -                  URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
> -         emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
> -      } else {
> -         /* Otherwise, we can only set the PrimStart flag, which we have stored
> -          * in the first_vertex register. We will have to wait until we execute
> -          * EndPrimitive() or we end the thread to set the PrimEnd flag on a
> -          * vertex.
> -          */
> -         emit(OR(dst, this->first_vertex,
> -                 (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
> -         emit(MOV(dst_reg(this->first_vertex), 0u));
> -      }
> -      emit(ADD(dst_reg(this->vertex_output_offset),
> -               this->vertex_output_offset, 1u));
> -
> -      /* Update vertex count */
> -      emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
> -   }
> -   emit(BRW_OPCODE_ENDIF);
> -}
> -
> -void
> -gen6_gs_visitor::visit(ir_end_primitive *)
> -{
> -   this->current_annotation = "gen6 end primitive";
> -   /* Calling EndPrimitive() is optional for point output. In this case we set
> -    * the PrimEnd flag when we process EmitVertex().
> -    */
> -   if (c->gp->program.OutputType == GL_POINTS)
> -      return;
> -
> -   /* Otherwise we know that the last vertex we have processed was the last
> -    * vertex in the primitive and we need to set its PrimEnd flag, so do this
> -    * unless we haven't emitted that vertex at all (vertex_count != 0).
> -    *
> -    * Notice that we have already incremented vertex_count when we processed
> -    * the last emit_vertex, so we need to take that into account in the
> -    * comparison below (hence the num_output_vertices + 1 in the comparison
> -    * below).
> -    */
> -   unsigned num_output_vertices = c->gp->program.VerticesOut;
> -   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
> -            BRW_CONDITIONAL_L));
> -   vec4_instruction *inst = emit(CMP(dst_null_d(),
> -                                     this->vertex_count, 0u,
> -                                     BRW_CONDITIONAL_NEQ));
> -   inst->predicate = BRW_PREDICATE_NORMAL;
> -   emit(IF(BRW_PREDICATE_NORMAL));
> -   {
> -      /* vertex_output_offset is already pointing at the first entry of the
> -       * next vertex. So subtract 1 to modify the flags for the previous
> -       * vertex.
> -       */
> -      src_reg offset(this, glsl_type::uint_type);
> -      emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));
> -
> -      src_reg dst(this->vertex_output);
> -      dst.reladdr = ralloc(mem_ctx, src_reg);
> -      memcpy(dst.reladdr, &offset, sizeof(src_reg));
> -
> -      emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
> -      emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
> -
> -      /* Set the first vertex flag to indicate that the next vertex will start
> -       * a primitive.
> -       */
> -      emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
> -   }
> -   emit(BRW_OPCODE_ENDIF);
> -}
> -
> -void
> -gen6_gs_visitor::emit_urb_write_header(int mrf)
> -{
> -   this->current_annotation = "gen6 urb header";
> -   /* Compute offset of the flags for the current vertex in vertex_output and
> -    * write them in dw2 of the message header.
> -    *
> -    * Notice that by the time that emit_thread_end() calls here
> -    * vertex_output_offset should point to the first data item of the current
> -    * vertex in vertex_output, thus we only need to add the number of output
> -    * slots per vertex to that offset to obtain the flags data offset.
> -    */
> -   src_reg flags_offset(this, glsl_type::uint_type);
> -   emit(ADD(dst_reg(flags_offset),
> -            this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
> -
> -   src_reg flags_data(this->vertex_output);
> -   flags_data.reladdr = ralloc(mem_ctx, src_reg);
> -   memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
> -
> -   emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
> -}
> -
> -void
> -gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
> -                                       int last_mrf, int urb_offset)
> -{
> -   vec4_instruction *inst = NULL;
> -
> -   if (!complete) {
> -      /* If the vertex is not complete we don't have to do anything special */
> -      inst = emit(GS_OPCODE_URB_WRITE);
> -      inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
> -   } else {
> -      /* Otherwise we always request to allocate a new VUE handle. If this is
> -       * the last write before the EOT message and the new handle never gets
> -       * used it will be dereferenced when we send the EOT message. This is
> -       * necessary to avoid different setups for the EOT message (one for the
> -       * case when there is no output and another for the case when there is)
> -       * which would require to end the program with an IF/ELSE/ENDIF block,
> -       * something we do not want.
> -       */
> -      inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
> -      inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
> -      inst->dst = dst_reg(MRF, base_mrf);
> -      inst->src[0] = this->temp;
> -   }
> -
> -   inst->base_mrf = base_mrf;
> -   /* URB data written (does not include the message header reg) must
> -    * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
> -    * section 5.4.3.2.2: URB_INTERLEAVED.
> -    */
> -   int mlen = last_mrf - base_mrf;
> -   if ((mlen % 2) != 1)
> -      mlen++;
> -   inst->mlen = mlen;
> -   inst->offset = urb_offset;
> -}
> -
> -void
> -gen6_gs_visitor::emit_thread_end()
> -{
> -   /* Make sure the current primitive is ended: we know it is not ended when
> -    * first_vertex is not zero. This is only relevant for outputs other than
> -    * points because in the point case we set PrimEnd on all vertices.
> -    */
> -   if (c->gp->program.OutputType != GL_POINTS) {
> -      emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
> -      emit(IF(BRW_PREDICATE_NORMAL));
> -      {
> -         visit((ir_end_primitive *) NULL);
> -      }
> -      emit(BRW_OPCODE_ENDIF);
> -   }
> -
> -   /* Here we have to:
> -    * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
> -    * 2) Loop over all buffered vertex data and write it to corresponding
> -    *    URB entries.
> -    * 3) Allocate new VUE handles for all vertices other than the first.
> -    * 4) Send a final EOT message.
> -    */
> -
> -   /* MRF 0 is reserved for the debugger, so start with message header
> -    * in MRF 1.
> -    */
> -   int base_mrf = 1;
> -
> -   /* In the process of generating our URB write message contents, we
> -    * may need to unspill a register or load from an array.  Those
> -    * reads would use MRFs 14-15.
> -    */
> -   int max_usable_mrf = 13;
> -
> -   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
> -   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
> -   emit(IF(BRW_PREDICATE_NORMAL));
> -   {
> -      this->current_annotation = "gen6 thread end: ff_sync";
> -
> -      vec4_instruction *inst;
> -      if (c->prog_data.gen6_xfb_enabled) {
> -         src_reg sol_temp(this, glsl_type::uvec4_type);
> -         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
> -              dst_reg(this->svbi),
> -              this->vertex_count,
> -              this->prim_count,
> -              sol_temp);
> -         inst = emit(GS_OPCODE_FF_SYNC,
> -                     dst_reg(this->temp), this->prim_count, this->svbi);
> -      } else {
> -         inst = emit(GS_OPCODE_FF_SYNC,
> -                     dst_reg(this->temp), this->prim_count, src_reg(0u));
> -      }
> -      inst->base_mrf = base_mrf;
> -
> -      /* Loop over all buffered vertices and emit URB write messages */
> -      this->current_annotation = "gen6 thread end: urb writes init";
> -      src_reg vertex(this, glsl_type::uint_type);
> -      emit(MOV(dst_reg(vertex), 0u));
> -      emit(MOV(dst_reg(this->vertex_output_offset), 0u));
> -
> -      this->current_annotation = "gen6 thread end: urb writes";
> -      emit(BRW_OPCODE_DO);
> -      {
> -         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
> -         inst = emit(BRW_OPCODE_BREAK);
> -         inst->predicate = BRW_PREDICATE_NORMAL;
> -
> -         /* First we prepare the message header */
> -         emit_urb_write_header(base_mrf);
> -
> -         /* Then add vertex data to the message in interleaved fashion */
> -         int slot = 0;
> -         bool complete = false;
> -         do {
> -            int mrf = base_mrf + 1;
> -
> -            /* URB offset is in URB row increments, and each of our MRFs is half
> -             * of one of those, since we're doing interleaved writes.
> -             */
> -            int urb_offset = slot / 2;
> -
> -            for (; slot < prog_data->vue_map.num_slots; ++slot) {
> -               int varying = prog_data->vue_map.slot_to_varying[slot];
> -               current_annotation = output_reg_annotation[varying];
> -
> -               /* Compute offset of this slot for the current vertex
> -                * in vertex_output
> -                */
> -               src_reg data(this->vertex_output);
> -               data.reladdr = ralloc(mem_ctx, src_reg);
> -               memcpy(data.reladdr, &this->vertex_output_offset,
> -                      sizeof(src_reg));
> -
> -               /* Copy this slot to the appropriate message register */
> -               dst_reg reg = dst_reg(MRF, mrf);
> -               reg.type = output_reg[varying].type;
> -               data.type = reg.type;
> -               vec4_instruction *inst = emit(MOV(reg, data));
> -               inst->force_writemask_all = true;
> -
> -               mrf++;
> -               emit(ADD(dst_reg(this->vertex_output_offset),
> -                        this->vertex_output_offset, 1u));
> -
> -               /* If this was max_usable_mrf, we can't fit anything more into
> -                * this URB WRITE.
> -                */
> -               if (mrf > max_usable_mrf) {
> -                  slot++;
> -                  break;
> -               }
> -            }
> -
> -            complete = slot >= prog_data->vue_map.num_slots;
> -            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
> -         } while (!complete);
> -
> -         /* Skip over the flags data item so that vertex_output_offset points
> -          * to the first data item of the next vertex, so that we can start
> -          * writing the next vertex.
> -          */
> -         emit(ADD(dst_reg(this->vertex_output_offset),
> -                  this->vertex_output_offset, 1u));
> -
> -         emit(ADD(dst_reg(vertex), vertex, 1u));
> -      }
> -      emit(BRW_OPCODE_WHILE);
> -
> -      if (c->prog_data.gen6_xfb_enabled)
> -         xfb_write();
> -   }
> -   emit(BRW_OPCODE_ENDIF);
> -
> -   /* Finally, emit EOT message.
> -    *
> -    * In gen6 we need to end the thread differently depending on whether we have
> -    * emitted at least one vertex or not. In case we did, the EOT message must
> -    * always include the COMPLETE flag or else the GPU hangs. If we have not
> -    * produced any output we can't use the COMPLETE flag.
> -    *
> -    * However, this would lead us to end the program with an ENDIF opcode,
> -    * which we want to avoid, so what we do is that we always request a new
> -    * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
> -    * With this we make sure that whether we have emitted at least one vertex
> -    * or none at all, we have to finish the thread without writing to the URB,
> -    * which works for both cases by setting the COMPLETE and UNUSED flags in
> -    * the EOT message.
> -    */
> -   this->current_annotation = "gen6 thread end: EOT";
> -
> -   if (c->prog_data.gen6_xfb_enabled) {
> -      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
> -      src_reg data(this, glsl_type::uint_type);
> -      emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
> -      emit(SHL(dst_reg(data), data, src_reg(16u)));
> -      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
> -   }
> -
> -   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
> -   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
> -   inst->base_mrf = base_mrf;
> -   inst->mlen = 1;
> -}
> -
> -void
> -gen6_gs_visitor::setup_payload()
> -{
> -   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
> -
> -   /* Attributes are going to be interleaved, so one register contains two
> -    * attribute slots.
> -    */
> -   int attributes_per_reg = 2;
> -
> -   /* If a geometry shader tries to read from an input that wasn't written by
> -    * the vertex shader, that produces undefined results, but it shouldn't
> -    * crash anything.  So initialize attribute_map to zeros--that ensures that
> -    * these undefined results are read from r0.
> -    */
> -   memset(attribute_map, 0, sizeof(attribute_map));
> -
> -   int reg = 0;
> -
> -   /* The payload always contains important data in r0. */
> -   reg++;
> -
> -   /* r1 is always part of the payload and it holds information relevant
> -    * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
> -    * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
> -    * information (and move the original value to a virtual register if
> -    * necessary).
> -    */
> -   if (c->prog_data.include_primitive_id)
> -      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
> -   reg++;
> -
> -   reg = setup_uniforms(reg);
> -
> -   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
> -
> -   lower_attributes_to_hw_regs(attribute_map, true);
> -
> -   this->first_non_payload_grf = reg;
> -}
> -
> -void
> -gen6_gs_visitor::xfb_setup()
> -{
> -   static const unsigned swizzle_for_offset[4] = {
> -      BRW_SWIZZLE4(0, 1, 2, 3),
> -      BRW_SWIZZLE4(1, 2, 3, 3),
> -      BRW_SWIZZLE4(2, 3, 3, 3),
> -      BRW_SWIZZLE4(3, 3, 3, 3)
> -   };
> -
> -   struct brw_gs_prog_data *prog_data =
> -      (struct brw_gs_prog_data *) &c->prog_data;
> -
> -   const struct gl_transform_feedback_info *linked_xfb_info =
> -      &this->shader_prog->LinkedTransformFeedback;
> -   int i;
> -
> -   /* Make sure that the VUE slots won't overflow the unsigned chars in
> -    * prog_data->transform_feedback_bindings[].
> -    */
> -   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
> -
> -   /* Make sure that we don't need more binding table entries than we've
> -    * set aside for use in transform feedback.  (We shouldn't, since we
> -    * set aside enough binding table entries to have one per component).
> -    */
> -   assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
> -
> -   prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
> -   for (i = 0; i < prog_data->num_transform_feedback_bindings; i++) {
> -      prog_data->transform_feedback_bindings[i] =
> -         linked_xfb_info->Outputs[i].OutputRegister;
> -      prog_data->transform_feedback_swizzles[i] =
> -         swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
> -   }
> -}
> -
> -void
> -gen6_gs_visitor::xfb_write()
> -{
> -   unsigned num_verts;
> -   struct brw_gs_prog_data *prog_data =
> -      (struct brw_gs_prog_data *) &c->prog_data;
> -
> -   if (!prog_data->num_transform_feedback_bindings)
> -      return;
> -
> -   switch (c->prog_data.output_topology) {
> -   case _3DPRIM_POINTLIST:
> -      num_verts = 1;
> -      break;
> -   case _3DPRIM_LINELIST:
> -   case _3DPRIM_LINESTRIP:
> -   case _3DPRIM_LINELOOP:
> -      num_verts = 2;
> -      break;
> -   case _3DPRIM_TRILIST:
> -   case _3DPRIM_TRIFAN:
> -   case _3DPRIM_TRISTRIP:
> -   case _3DPRIM_RECTLIST:
> -      num_verts = 3;
> -      break;
> -   case _3DPRIM_QUADLIST:
> -   case _3DPRIM_QUADSTRIP:
> -   case _3DPRIM_POLYGON:
> -      num_verts = 3;
> -      break;
> -   default:
> -      unreachable("Unexpected primitive type in Gen6 SOL program.");
> -   }
> -
> -   this->current_annotation = "gen6 thread end: svb writes init";
> -
> -   emit(MOV(dst_reg(this->vertex_output_offset), 0u));
> -   emit(MOV(dst_reg(this->sol_prim_written), 0u));
> -
> -   /* Check that at least one primitive can be written
> -    *
> -    * Note: since we use the binding table to keep track of buffer offsets
> -    * and stride, the GS doesn't need to keep track of a separate pointer
> -    * into each buffer; it uses a single pointer which increments by 1 for
> -    * each vertex.  So we use SVBI0 for this pointer, regardless of whether
> -    * transform feedback is in interleaved or separate attribs mode.
> -    */
> -   src_reg sol_temp(this, glsl_type::uvec4_type);
> -   emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));
> -
> -   /* Compare SVBI calculated number with the maximum value, which is
> -    * in R1.4 (previously saved in this->max_svbi) for gen6.
> -    */
> -   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
> -   emit(IF(BRW_PREDICATE_NORMAL));
> -   {
> -      src_reg destination_indices_uw =
> -         retype(destination_indices, BRW_REGISTER_TYPE_UW);
> -
> -      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
> -                                        brw_imm_v(0x00020100))); /* (0, 1, 2) */
> -      inst->force_writemask_all = true;
> -
> -      emit(ADD(dst_reg(this->destination_indices),
> -               this->destination_indices,
> -               this->svbi));
> -   }
> -   emit(BRW_OPCODE_ENDIF);
> -
> -   /* Write transform feedback data for all processed vertices. */
> -   for (int i = 0; i < c->gp->program.VerticesOut; i++) {
> -      emit(MOV(dst_reg(sol_temp), i));
> -      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
> -               BRW_CONDITIONAL_L));
> -      emit(IF(BRW_PREDICATE_NORMAL));
> -      {
> -         xfb_program(i, num_verts);
> -      }
> -      emit(BRW_OPCODE_ENDIF);
> -   }
> -}
> -
> -void
> -gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
> -{
> -   struct brw_gs_prog_data *prog_data =
> -      (struct brw_gs_prog_data *) &c->prog_data;
> -   unsigned binding;
> -   unsigned num_bindings = prog_data->num_transform_feedback_bindings;
> -   src_reg sol_temp(this, glsl_type::uvec4_type);
> -
> -   /* Check for buffer overflow: we need room to write the complete primitive
> -    * (all vertices). Otherwise, avoid writing any vertices for it
> -    */
> -   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
> -   emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
> -   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
> -   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
> -   emit(IF(BRW_PREDICATE_NORMAL));
> -   {
> -      /* Avoid overwriting MRF 1 as it is used as URB write message header */
> -      dst_reg mrf_reg(MRF, 2);
> -
> -      this->current_annotation = "gen6: emit SOL vertex data";
> -      /* For each vertex, generate code to output each varying using the
> -       * appropriate binding table entry.
> -       */
> -      for (binding = 0; binding < num_bindings; ++binding) {
> -         unsigned char varying =
> -            prog_data->transform_feedback_bindings[binding];
> -
> -         /* Set up the correct destination index for this vertex */
> -         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
> -                                       mrf_reg,
> -                                       this->destination_indices);
> -         inst->sol_vertex = vertex % num_verts;
> -
> -         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
> -          *
> -          *   "Prior to End of Thread with a URB_WRITE, the kernel must
> -          *   ensure that all writes are complete by sending the final
> -          *   write as a committed write."
> -          */
> -         bool final_write = binding == (unsigned) num_bindings - 1 &&
> -                            inst->sol_vertex == num_verts - 1;
> -
> -         /* Compute offset of this varying for the current vertex
> -          * in vertex_output
> -          */
> -         this->current_annotation = output_reg_annotation[varying];
> -         src_reg data(this->vertex_output);
> -         data.reladdr = ralloc(mem_ctx, src_reg);
> -         int offset = get_vertex_output_offset_for_varying(vertex, varying);
> -         emit(MOV(dst_reg(this->vertex_output_offset), offset));
> -         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
> -         data.type = output_reg[varying].type;
> -
> -         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
> -          * same slot, so make sure we write the appropriate channel
> -          */
> -         if (varying == VARYING_SLOT_PSIZ)
> -            data.swizzle = BRW_SWIZZLE_WWWW;
> -         else if (varying == VARYING_SLOT_LAYER)
> -            data.swizzle = BRW_SWIZZLE_YYYY;
> -         else if (varying == VARYING_SLOT_VIEWPORT)
> -            data.swizzle = BRW_SWIZZLE_ZZZZ;
> -         else
> -            data.swizzle = prog_data->transform_feedback_swizzles[binding];
> -
> -         /* Write data */
> -         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
> -         inst->sol_binding = binding;
> -         inst->sol_final_write = final_write;
> -
> -         if (final_write) {
> -            /* This is the last vertex of the primitive, then increment
> -             * SO num primitive counter and destination indices.
> -             */
> -            emit(ADD(dst_reg(this->destination_indices),
> -                     this->destination_indices,
> -                     src_reg(num_verts)));
> -            emit(ADD(dst_reg(this->sol_prim_written),
> -                     this->sol_prim_written, 1u));
> -         }
> -
> -      }
> -      this->current_annotation = NULL;
> -   }
> -   emit(BRW_OPCODE_ENDIF);
> -}
> -
> -int
> -gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
> -{
> -   /* Find the output slot assigned to this varying.
> -    *
> -    * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
> -    * as VARYING_SLOT_PSIZ.
> -    */
> -   if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
> -      varying = VARYING_SLOT_PSIZ;
> -   int slot = prog_data->vue_map.varying_to_slot[varying];
> -
> -   if (slot < 0) {
> -      /* This varying does not exist in the VUE so we are not writing to it
> -       * and its value is undefined. We still want to return a valid offset
> -       * into vertex_output though, to prevent any out-of-bound accesses into
> -       * the vertex_output array. Since the value for this varying is undefined
> -       * we don't really care for the value we assign to it, so any offset
> -       * within the limits of vertex_output will do.
> -       */
> -      slot = 0;
> -   }
> -
> -   return vertex * (prog_data->vue_map.num_slots + 1) + slot;
> -}
> -
> -} /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
> deleted file mode 100644
> index 28f23c9..0000000
> --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
> +++ /dev/null
> @@ -1,82 +0,0 @@
> -/*
> - * Copyright © 2014 Intel Corporation
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the "Software"),
> - * to deal in the Software without restriction, including without limitation
> - * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> - * and/or sell copies of the Software, and to permit persons to whom the
> - * Software is furnished to do so, subject to the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the next
> - * paragraph) shall be included in all copies or substantial portions of the
> - * Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> - * IN THE SOFTWARE.
> - *
> - */
> -
> -#ifndef GEN6_GS_VISITOR_H
> -#define GEN6_GS_VISITOR_H
> -
> -#include "brw_vec4.h"
> -#include "brw_vec4_gs_visitor.h"
> -
> -#ifdef __cplusplus
> -
> -namespace brw {
> -
> -class gen6_gs_visitor : public vec4_gs_visitor
> -{
> -public:
> -   gen6_gs_visitor(struct brw_context *brw,
> -                   struct brw_gs_compile *c,
> -                   struct gl_shader_program *prog,
> -                   void *mem_ctx,
> -                   bool no_spills) :
> -      vec4_gs_visitor(brw, c, prog, mem_ctx, no_spills) {}
> -
> -protected:
> -   virtual void assign_binding_table_offsets();
> -   virtual void emit_prolog();
> -   virtual void emit_thread_end();
> -   virtual void visit(ir_emit_vertex *);
> -   virtual void visit(ir_end_primitive *);
> -   virtual void emit_urb_write_header(int mrf);
> -   virtual void emit_urb_write_opcode(bool complete,
> -                                      int base_mrf,
> -                                      int last_mrf,
> -                                      int urb_offset);
> -   virtual void setup_payload();
> -
> -private:
> -   void xfb_write();
> -   void xfb_program(unsigned vertex, unsigned num_verts);
> -   void xfb_setup();
> -   int get_vertex_output_offset_for_varying(int vertex, int varying);
> -
> -   src_reg vertex_output;
> -   src_reg vertex_output_offset;
> -   src_reg temp;
> -   src_reg first_vertex;
> -   src_reg prim_count;
> -   src_reg primitive_id;
> -
> -   /* Transform Feedback members */
> -   src_reg sol_prim_written;
> -   src_reg svbi;
> -   src_reg max_svbi;
> -   src_reg destination_indices;
> -};
> -
> -} /* namespace brw */
> -
> -#endif /* __cplusplus */
> -
> -#endif /* GEN6_GS_VISITOR_H */
> diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
> index ed8744d..239d225 100644
> --- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
> @@ -35,16 +35,16 @@ public:
>     struct brw_wm_prog_data *prog_data;
>     struct gl_shader_program *shader_prog;
>     struct brw_fragment_program *fp;
> -   fs_visitor *v;
> +   fs_god *v;
>  };
>
> -class cmod_propagation_fs_visitor : public fs_visitor
> +class cmod_propagation_fs_god : public fs_god
>  {
>  public:
> -   cmod_propagation_fs_visitor(struct brw_context *brw,
> -                               struct brw_wm_prog_data *prog_data,
> -                               struct gl_shader_program *shader_prog)
> -      : fs_visitor(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
> +   cmod_propagation_fs_god(struct brw_context *brw,
> +                           struct brw_wm_prog_data *prog_data,
> +                           struct gl_shader_program *shader_prog)
> +      : fs_god(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
>  };
>
>
> @@ -57,7 +57,7 @@ void cmod_propagation_test::SetUp()
>     prog_data = ralloc(NULL, struct brw_wm_prog_data);
>     shader_prog = ralloc(NULL, struct gl_shader_program);
>
> -   v = new cmod_propagation_fs_visitor(brw, prog_data, shader_prog);
> +   v = new cmod_propagation_fs_god(brw, prog_data, shader_prog);
>
>     _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0);
>
> @@ -75,7 +75,7 @@ instruction(bblock_t *block, int num)
>  }
>
>  static bool
> -cmod_propagation(fs_visitor *v)
> +cmod_propagation(fs_god *v)
>  {
>     const bool print = false;
>
> diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
> index 6f762bc..7ad0bd2 100644
> --- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
> @@ -35,16 +35,16 @@ public:
>     struct brw_wm_prog_data *prog_data;
>     struct gl_shader_program *shader_prog;
>     struct brw_fragment_program *fp;
> -   fs_visitor *v;
> +   fs_god *v;
>  };
>
> -class saturate_propagation_fs_visitor : public fs_visitor
> +class saturate_propagation_fs_god : public fs_god
>  {
>  public:
> -   saturate_propagation_fs_visitor(struct brw_context *brw,
> -                                   struct brw_wm_prog_data *prog_data,
> -                                   struct gl_shader_program *shader_prog)
> -      : fs_visitor(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
> +   saturate_propagation_fs_god(struct brw_context *brw,
> +                               struct brw_wm_prog_data *prog_data,
> +                               struct gl_shader_program *shader_prog)
> +      : fs_god(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
>  };
>
>
> @@ -57,7 +57,7 @@ void saturate_propagation_test::SetUp()
>     prog_data = ralloc(NULL, struct brw_wm_prog_data);
>     shader_prog = ralloc(NULL, struct gl_shader_program);
>
> -   v = new saturate_propagation_fs_visitor(brw, prog_data, shader_prog);
> +   v = new saturate_propagation_fs_god(brw, prog_data, shader_prog);
>
>     _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0);
>
> @@ -75,7 +75,7 @@ instruction(bblock_t *block, int num)
>  }
>
>  static bool
> -saturate_propagation(fs_visitor *v)
> +saturate_propagation(fs_god *v)
>  {
>     const bool print = false;
>
> diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
> index f9e4ce1..4913c30 100644
> --- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
> +++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
> @@ -37,15 +37,15 @@ public:
>     struct gl_context *ctx;
>     struct gl_shader_program *shader_prog;
>     struct brw_vertex_program *vp;
> -   vec4_visitor *v;
> +   vec4_god *v;
>  };
>
> -class copy_propagation_vec4_visitor : public vec4_visitor
> +class copy_propagation_vec4_god : public vec4_god
>  {
>  public:
> -   copy_propagation_vec4_visitor(struct brw_context *brw,
> -                                 struct gl_shader_program *shader_prog)
> -      : vec4_visitor(brw, NULL, NULL, NULL, NULL, shader_prog,
> -                     MESA_SHADER_VERTEX, NULL,
> -                     false /* no_spills */,
> -                     ST_NONE, ST_NONE, ST_NONE)
> +   copy_propagation_vec4_god(struct brw_context *brw,
> +                             struct gl_shader_program *shader_prog)
> +      : vec4_god(brw, NULL, NULL, NULL, NULL, shader_prog,
> +                 MESA_SHADER_VERTEX, NULL,
> +                 false /* no_spills */,
> +                 ST_NONE, ST_NONE, ST_NONE)
> @@ -99,7 +99,7 @@ void copy_propagation_test::SetUp()
>
>     shader_prog = ralloc(NULL, struct gl_shader_program);
>
> -   v = new copy_propagation_vec4_visitor(brw, shader_prog);
> +   v = new copy_propagation_vec4_god(brw, shader_prog);
>
>     _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0);
>
> @@ -107,7 +107,7 @@ void copy_propagation_test::SetUp()
>  }
>
>  static void
> -copy_propagation(vec4_visitor *v)
> +copy_propagation(vec4_god *v)
>  {
>     bool print = false;
>
> diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
> index 0c27162..bab3532 100644
> --- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
> +++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
> @@ -39,16 +39,16 @@ public:
>     struct gl_context *ctx;
>     struct gl_shader_program *shader_prog;
>     struct brw_vertex_program *vp;
> -   vec4_visitor *v;
> +   vec4_god *v;
>  };
>
>
> -class register_coalesce_vec4_visitor : public vec4_visitor
> +class register_coalesce_vec4_god : public vec4_god
>  {
>  public:
> -   register_coalesce_vec4_visitor(struct brw_context *brw,
> -                                  struct gl_shader_program *shader_prog)
> -      : vec4_visitor(brw, NULL, NULL, NULL, NULL, shader_prog,
> -                     MESA_SHADER_VERTEX, NULL,
> -                     false /* no_spills */,
> -                     ST_NONE, ST_NONE, ST_NONE)
> +   register_coalesce_vec4_god(struct brw_context *brw,
> +                              struct gl_shader_program *shader_prog)
> +      : vec4_god(brw, NULL, NULL, NULL, NULL, shader_prog,
> +                 MESA_SHADER_VERTEX, NULL,
> +                 false /* no_spills */,
> +                 ST_NONE, ST_NONE, ST_NONE)
> @@ -102,7 +102,7 @@ void register_coalesce_test::SetUp()
>
>     shader_prog = ralloc(NULL, struct gl_shader_program);
>
> -   v = new register_coalesce_vec4_visitor(brw, shader_prog);
> +   v = new register_coalesce_vec4_god(brw, shader_prog);
>
>     _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0);
>
> @@ -110,7 +110,7 @@ void register_coalesce_test::SetUp()
>  }
>
>  static void
> -_register_coalesce(vec4_visitor *v, const char *func)
> +_register_coalesce(vec4_god *v, const char *func)
>  {
>     bool print = false;
>
> --
> 2.1.3