[Mesa-dev] [PATCH 02/14] i965: Rename fs_generator to scalar_generator
Jordan Justen
jordan.l.justen at intel.com
Mon Sep 1 09:44:26 PDT 2014
Signed-off-by: Jordan Justen <jordan.l.justen at intel.com>
---
 src/mesa/drivers/dri/i965/Makefile.sources         |    2 +-
 src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h      |    2 +-
 src/mesa/drivers/dri/i965/brw_fs.cpp               |    4 +-
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp     | 2009 -------------------
 src/mesa/drivers/dri/i965/brw_scalar.h             |   20 +-
 src/mesa/drivers/dri/i965/brw_scalar_generator.cpp | 2011 ++++++++++++++++++++
6 files changed, 2025 insertions(+), 2023 deletions(-)
delete mode 100644 src/mesa/drivers/dri/i965/brw_fs_generator.cpp
create mode 100644 src/mesa/drivers/dri/i965/brw_scalar_generator.cpp
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index b91b813..52aaeaf 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -56,7 +56,6 @@ i965_FILES = \
brw_fs_cse.cpp \
brw_fs_dead_code_eliminate.cpp \
brw_fs_fp.cpp \
- brw_fs_generator.cpp \
brw_fs_live_variables.cpp \
brw_fs_peephole_predicated_break.cpp \
brw_fs_reg_allocate.cpp \
@@ -84,6 +83,7 @@ i965_FILES = \
brw_queryobj.c \
brw_reset.c \
brw_sampler_state.c \
+ brw_scalar_generator.cpp \
brw_schedule_instructions.cpp \
brw_sf.c \
brw_sf_emit.c \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h
index c8f1bbc..8e8cef4 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.h
@@ -184,7 +184,7 @@ private:
void *mem_ctx;
exec_list insts;
- fs_generator generator;
+ scalar_generator generator;
};
#endif /* BRW_BLORP_BLIT_EU_H */
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 409c828..d6cade6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3438,8 +3438,8 @@ brw_wm_fs_emit(struct brw_context *brw,
}
const unsigned *assembly = NULL;
- fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
- v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
+ scalar_generator g(brw, mem_ctx, key, prog_data, prog, fp,
+ v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
assembly = g.generate_assembly(simd8_cfg, simd16_cfg,
final_assembly_size);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
deleted file mode 100644
index 9145db5..0000000
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ /dev/null
@@ -1,2009 +0,0 @@
-/*
- * Copyright © 2010 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-/** @file brw_fs_generator.cpp
- *
- * This file supports generating code from the FS LIR to the actual
- * native instructions.
- */
-
-extern "C" {
-#include "main/macros.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-} /* extern "C" */
-
-#include "brw_scalar.h"
-#include "brw_cfg.h"
-
-fs_generator::fs_generator(struct brw_context *brw,
- void *mem_ctx,
- const struct brw_wm_prog_key *key,
- struct brw_wm_prog_data *prog_data,
- struct gl_shader_program *prog,
- struct gl_fragment_program *fp,
- bool runtime_check_aads_emit,
- bool debug_flag)
-
- : brw(brw), key(key), prog_data(prog_data), prog(prog), fp(fp),
- runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(debug_flag),
- mem_ctx(mem_ctx)
-{
- ctx = &brw->ctx;
-
- p = rzalloc(mem_ctx, struct brw_compile);
- brw_init_compile(brw, p, mem_ctx);
-}
-
-fs_generator::~fs_generator()
-{
-}
-
-bool
-fs_generator::patch_discard_jumps_to_fb_writes()
-{
- if (brw->gen < 6 || this->discard_halt_patches.is_empty())
- return false;
-
- int scale = brw_jump_scale(brw);
-
- /* There is a somewhat strange undocumented requirement of using
- * HALT, according to the simulator. If some channel has HALTed to
- * a particular UIP, then by the end of the program, every channel
- * must have HALTed to that UIP. Furthermore, the tracking is a
- * stack, so you can't do the final halt of a UIP after starting
- * halting to a new UIP.
- *
- * Symptoms of not emitting this instruction on actual hardware
- * included GPU hangs and sparkly rendering on the piglit discard
- * tests.
- */
- brw_inst *last_halt = gen6_HALT(p);
- brw_inst_set_uip(brw, last_halt, 1 * scale);
- brw_inst_set_jip(brw, last_halt, 1 * scale);
-
- int ip = p->nr_insn;
-
- foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
- brw_inst *patch = &p->store[patch_ip->ip];
-
- assert(brw_inst_opcode(brw, patch) == BRW_OPCODE_HALT);
- /* HALT takes a half-instruction distance from the pre-incremented IP. */
- brw_inst_set_uip(brw, patch, (ip - patch_ip->ip) * scale);
- }
-
- this->discard_halt_patches.make_empty();
- return true;
-}
-
-void
-fs_generator::fire_fb_write(fs_inst *inst,
- GLuint base_reg,
- struct brw_reg implied_header,
- GLuint nr)
-{
- uint32_t msg_control;
-
- if (brw->gen < 6) {
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_MOV(p,
- brw_message_reg(base_reg + 1),
- brw_vec8_grf(1, 0));
- brw_pop_insn_state(p);
- }
-
- if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
- msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
- else if (prog_data->dual_src_blend)
- msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
- else if (dispatch_width == 16)
- msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
- else
- msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
-
- uint32_t surf_index =
- prog_data->binding_table.render_target_start + inst->target;
-
- brw_fb_WRITE(p,
- dispatch_width,
- base_reg,
- implied_header,
- msg_control,
- surf_index,
- nr,
- 0,
- inst->eot,
- inst->header_present);
-
- brw_mark_surface_used(&prog_data->base, surf_index);
-}
-
-void
-fs_generator::generate_fb_write(fs_inst *inst)
-{
- struct brw_reg implied_header;
-
- /* The header is 2 regs, with g0 and g1 as the contents. g0 is handled by
- * the implied move; here we set up g1.
- */
- if (inst->header_present) {
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_set_default_flag_reg(p, 0, 0);
-
- /* On HSW, the GPU will use the predicate on SENDC, unless the header is
- * present.
- */
- if ((fp && fp->UsesKill) || key->alpha_test_func) {
- struct brw_reg pixel_mask;
-
- if (brw->gen >= 6)
- pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
- else
- pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-
- brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
- }
-
- if (brw->gen >= 6) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- brw_MOV(p,
- retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
- retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-
- if (inst->target > 0 && key->replicate_alpha) {
- /* Set "Source0 Alpha Present to RenderTarget" bit in message
- * header.
- */
- brw_OR(p,
- vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
- vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
- brw_imm_ud(0x1 << 11));
- }
-
- if (inst->target > 0) {
- /* Set the render target index for choosing BLEND_STATE. */
- brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
- inst->base_mrf, 2),
- BRW_REGISTER_TYPE_UD),
- brw_imm_ud(inst->target));
- }
-
- implied_header = brw_null_reg();
- } else {
- implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
- }
-
- brw_pop_insn_state(p);
- } else {
- implied_header = brw_null_reg();
- }
-
- if (!runtime_check_aads_emit) {
- fire_fb_write(inst, inst->base_mrf, implied_header, inst->mlen);
- } else {
- /* This can only happen in gen < 6 */
- assert(brw->gen < 6);
-
- struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
-
- /* Check runtime bit to detect if we have to send AA data or not */
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_AND(p,
- v1_null_ud,
- retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
- brw_imm_ud(1<<26));
- brw_inst_set_cond_modifier(brw, brw_last_inst, BRW_CONDITIONAL_NZ);
-
- int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
- brw_inst_set_exec_size(brw, brw_last_inst, BRW_EXECUTE_1);
- {
- /* Don't send AA data */
- fire_fb_write(inst, inst->base_mrf+1, implied_header, inst->mlen-1);
- }
- brw_land_fwd_jump(p, jmp);
- fire_fb_write(inst, inst->base_mrf, implied_header, inst->mlen);
- }
-}
-
-void
-fs_generator::generate_blorp_fb_write(fs_inst *inst)
-{
- brw_fb_WRITE(p,
- 16 /* dispatch_width */,
- inst->base_mrf,
- brw_reg_from_fs_reg(&inst->src[0]),
- BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
- inst->target,
- inst->mlen,
- 0,
- true,
- inst->header_present);
-}
-
-/* Computes the integer pixel x,y values from the origin.
- *
- * This is the basis of gl_FragCoord computation, but pre-gen6 it is also
- * used to compute the deltas from v0 that drive interpolation.
- */
-void
-fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
-{
- struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
- struct brw_reg src;
- struct brw_reg deltas;
-
- if (is_x) {
- src = stride(suboffset(g1_uw, 4), 2, 4, 0);
- deltas = brw_imm_v(0x10101010);
- } else {
- src = stride(suboffset(g1_uw, 5), 2, 4, 0);
- deltas = brw_imm_v(0x11001100);
- }
-
- if (dispatch_width == 16) {
- dst = vec16(dst);
- }
-
- /* We do this SIMD8 or SIMD16, but since the destination is UW we
- * don't do compression in the SIMD16 case.
- */
- brw_push_insn_state(p);
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_ADD(p, dst, src, deltas);
- brw_pop_insn_state(p);
-}
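
As an aside for readers of the archive: brw_imm_v() packs eight signed 4-bit
elements into one dword, which is how 0x10101010 and 0x11001100 above encode
the per-pixel X and Y deltas of a subspan. A minimal standalone sketch of that
decoding (decode_v_imm is an illustrative name, not a driver function):

    #include <stdint.h>

    /* Expand a packed V immediate into its eight signed 4-bit elements,
     * e.g. 0x11001100 -> {0, 0, 1, 1, 0, 0, 1, 1}.
     */
    static void decode_v_imm(uint32_t imm, int out[8])
    {
       for (int i = 0; i < 8; i++) {
          int nibble = (imm >> (4 * i)) & 0xf;
          out[i] = nibble < 8 ? nibble : nibble - 16; /* sign-extend */
       }
    }
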
-
-void
-fs_generator::generate_linterp(fs_inst *inst,
- struct brw_reg dst, struct brw_reg *src)
-{
- struct brw_reg delta_x = src[0];
- struct brw_reg delta_y = src[1];
- struct brw_reg interp = src[2];
-
- if (brw->has_pln &&
- delta_y.nr == delta_x.nr + 1 &&
- (brw->gen >= 6 || (delta_x.nr & 1) == 0)) {
- brw_PLN(p, dst, interp, delta_x);
- } else {
- brw_LINE(p, brw_null_reg(), interp, delta_x);
- brw_MAC(p, dst, suboffset(interp, 1), delta_y);
- }
-}
-
-void
-fs_generator::generate_math_gen6(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg src0,
- struct brw_reg src1)
-{
- int op = brw_math_function(inst->opcode);
- bool binop = src1.file != BRW_ARCHITECTURE_REGISTER_FILE;
-
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- gen6_math(p, dst, op, src0, src1);
-
- if (dispatch_width == 16) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
- gen6_math(p, sechalf(dst), op, sechalf(src0),
- binop ? sechalf(src1) : brw_null_reg());
- brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- }
-}
-
-void
-fs_generator::generate_math_gen4(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg src)
-{
- int op = brw_math_function(inst->opcode);
-
- assert(inst->mlen >= 1);
-
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- gen4_math(p, dst,
- op,
- inst->base_mrf, src,
- BRW_MATH_DATA_VECTOR,
- BRW_MATH_PRECISION_FULL);
-
- if (dispatch_width == 16) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
- gen4_math(p, sechalf(dst),
- op,
- inst->base_mrf + 1, sechalf(src),
- BRW_MATH_DATA_VECTOR,
- BRW_MATH_PRECISION_FULL);
-
- brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- }
-}
-
-void
-fs_generator::generate_math_g45(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg src)
-{
- if (inst->opcode == SHADER_OPCODE_POW ||
- inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
- inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
- generate_math_gen4(inst, dst, src);
- return;
- }
-
- int op = brw_math_function(inst->opcode);
-
- assert(inst->mlen >= 1);
-
- gen4_math(p, dst,
- op,
- inst->base_mrf, src,
- BRW_MATH_DATA_VECTOR,
- BRW_MATH_PRECISION_FULL);
-}
-
-void
-fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
- struct brw_reg sampler_index)
-{
- int msg_type = -1;
- int rlen = 4;
- uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
- uint32_t return_format;
-
- switch (dst.type) {
- case BRW_REGISTER_TYPE_D:
- return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
- break;
- case BRW_REGISTER_TYPE_UD:
- return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
- break;
- default:
- return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
- break;
- }
-
- if (dispatch_width == 16 &&
- !inst->force_uncompressed && !inst->force_sechalf)
- simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
-
- if (brw->gen >= 5) {
- switch (inst->opcode) {
- case SHADER_OPCODE_TEX:
- if (inst->shadow_compare) {
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
- } else {
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
- }
- break;
- case FS_OPCODE_TXB:
- if (inst->shadow_compare) {
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
- } else {
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
- }
- break;
- case SHADER_OPCODE_TXL:
- if (inst->shadow_compare) {
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
- } else {
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
- }
- break;
- case SHADER_OPCODE_TXS:
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
- break;
- case SHADER_OPCODE_TXD:
- if (inst->shadow_compare) {
- /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */
- assert(brw->gen >= 8 || brw->is_haswell);
- msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
- } else {
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
- }
- break;
- case SHADER_OPCODE_TXF:
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
- break;
- case SHADER_OPCODE_TXF_CMS:
- if (brw->gen >= 7)
- msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
- else
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
- break;
- case SHADER_OPCODE_TXF_UMS:
- assert(brw->gen >= 7);
- msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
- break;
- case SHADER_OPCODE_TXF_MCS:
- assert(brw->gen >= 7);
- msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
- break;
- case SHADER_OPCODE_LOD:
- msg_type = GEN5_SAMPLER_MESSAGE_LOD;
- break;
- case SHADER_OPCODE_TG4:
- if (inst->shadow_compare) {
- assert(brw->gen >= 7);
- msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
- } else {
- assert(brw->gen >= 6);
- msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
- }
- break;
- case SHADER_OPCODE_TG4_OFFSET:
- assert(brw->gen >= 7);
- if (inst->shadow_compare) {
- msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
- } else {
- msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
- }
- break;
- default:
- unreachable("not reached");
- }
- } else {
- switch (inst->opcode) {
- case SHADER_OPCODE_TEX:
- /* Note that G45 and older determine shadow compare and dispatch width
- * from message length for most messages.
- */
- assert(dispatch_width == 8);
- msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
- if (inst->shadow_compare) {
- assert(inst->mlen == 6);
- } else {
- assert(inst->mlen <= 4);
- }
- break;
- case FS_OPCODE_TXB:
- if (inst->shadow_compare) {
- assert(inst->mlen == 6);
- msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
- } else {
- assert(inst->mlen == 9);
- msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
- simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
- }
- break;
- case SHADER_OPCODE_TXL:
- if (inst->shadow_compare) {
- assert(inst->mlen == 6);
- msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
- } else {
- assert(inst->mlen == 9);
- msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
- simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
- }
- break;
- case SHADER_OPCODE_TXD:
- /* There is no sample_d_c message; comparisons are done manually */
- assert(inst->mlen == 7 || inst->mlen == 10);
- msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
- break;
- case SHADER_OPCODE_TXF:
- assert(inst->mlen == 9);
- msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
- simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
- break;
- case SHADER_OPCODE_TXS:
- assert(inst->mlen == 3);
- msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
- simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
- break;
- default:
- unreachable("not reached");
- }
- }
- assert(msg_type != -1);
-
- if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
- rlen = 8;
- dst = vec16(dst);
- }
-
- if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) {
- /* The send-from-GRF for SIMD16 texturing with a header has an extra
- * hardware register allocated to it, which we need to skip over (since
- * our coordinates in the payload are in the even-numbered registers,
- * and the header comes right before the first one).
- */
- assert(src.file == BRW_GENERAL_REGISTER_FILE);
- src.nr++;
- }
-
- assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
-
- /* Load the message header if present. If there's a texture offset,
- * we need to set it up explicitly and load the offset bitfield.
- * Otherwise, we can use an implied move from g0 to the first message reg.
- */
- if (inst->header_present) {
- if (brw->gen < 6 && !inst->texture_offset) {
- /* Set up an implied move from g0 to the MRF. */
- src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
- } else {
- struct brw_reg header_reg;
-
- if (brw->gen >= 7) {
- header_reg = src;
- } else {
- assert(inst->base_mrf != -1);
- header_reg = brw_message_reg(inst->base_mrf);
- }
-
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- /* Explicitly set up the message header by copying g0 to the MRF. */
- brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
-
- if (inst->texture_offset) {
- /* Set the offset bits in DWord 2. */
- brw_MOV(p, get_element_ud(header_reg, 2),
- brw_imm_ud(inst->texture_offset));
- }
-
- brw_adjust_sampler_state_pointer(p, header_reg, sampler_index, dst);
- brw_pop_insn_state(p);
- }
- }
-
- uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
- inst->opcode == SHADER_OPCODE_TG4_OFFSET)
- ? prog_data->base.binding_table.gather_texture_start
- : prog_data->base.binding_table.texture_start;
-
- if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
- uint32_t sampler = sampler_index.dw1.ud;
-
- brw_SAMPLE(p,
- retype(dst, BRW_REGISTER_TYPE_UW),
- inst->base_mrf,
- src,
- sampler + base_binding_table_index,
- sampler % 16,
- msg_type,
- rlen,
- inst->mlen,
- inst->header_present,
- simd_mode,
- return_format);
-
- brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
- } else {
- /* Non-const sampler index */
- /* Note: this clobbers `dst` as a temporary before emitting the send */
-
- struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
- struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));
-
- struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
-
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_set_default_access_mode(p, BRW_ALIGN_1);
-
- /* Some care required: `sampler` and `temp` may alias:
- * addr = sampler & 0xff
- * temp = (sampler << 8) & 0xf00
- * addr = addr | temp
- */
- brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
- brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
- brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
- brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
- brw_OR(p, addr, addr, temp);
-
- /* a0.0 |= <descriptor> */
- brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
- brw_set_sampler_message(p, insn_or,
- 0 /* surface */,
- 0 /* sampler */,
- msg_type,
- rlen,
- inst->mlen /* mlen */,
- inst->header_present /* header */,
- simd_mode,
- return_format);
- brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
- brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
- brw_set_src0(p, insn_or, addr);
- brw_set_dest(p, insn_or, addr);
-
-
- /* dst = send(offset, a0.0) */
- brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
- brw_set_dest(p, insn_send, dst);
- brw_set_src0(p, insn_send, src);
- brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
-
- brw_pop_insn_state(p);
-
- /* visitor knows more than we do about the surface limit required,
- * so has already done marking.
- */
- }
-}
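
The ADD/SHL/AND/OR sequence in the non-const sampler path above builds the
dynamic part of the send descriptor, assuming the binding table index lives in
bits 7:0 and the sampler index in bits 11:8. The same arithmetic in plain C
(the helper name is illustrative):

    #include <stdint.h>

    /* Combine surface and sampler indices the way the generated
     * ADD/SHL/AND/OR sequence does.
     */
    static uint32_t sampler_descriptor_bits(uint32_t sampler, uint32_t bt_start)
    {
       uint32_t addr = (sampler + bt_start) & 0x0ff; /* binding table index */
       uint32_t temp = (sampler << 8) & 0xf00;       /* sampler % 16 */
       return addr | temp;
    }
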
-
-
-/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
- * looking like:
- *
- * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
- *
- * Ideally, we want to produce:
- *
- * DDX DDY
- * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
- * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
- * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
- * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
- * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
- * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
- * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
- * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
- *
- * and add another set of two more subspans if in 16-pixel dispatch mode.
- *
- * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
- * for each pair, and vertstride = 2 jumps us 2 elements after processing a
- * pair. But the ideal approximation may impose a huge performance cost on
- sample_d. On at least Haswell, the sample_d instruction performs some
- optimizations if the same LOD is used for all pixels in the subspan.
- *
- * For DDY, we need to use ALIGN16 mode since it's capable of doing the
- * appropriate swizzling.
- */
-void
-fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
- struct brw_reg quality)
-{
- unsigned vstride, width;
- assert(quality.file == BRW_IMMEDIATE_VALUE);
- assert(quality.type == BRW_REGISTER_TYPE_D);
-
- int quality_value = quality.dw1.d;
-
- if (quality_value == BRW_DERIVATIVE_FINE ||
- (key->high_quality_derivatives && quality_value != BRW_DERIVATIVE_COARSE)) {
- /* produce accurate derivatives */
- vstride = BRW_VERTICAL_STRIDE_2;
- width = BRW_WIDTH_2;
- }
- else {
- /* replicate the derivative at the top-left pixel to other pixels */
- vstride = BRW_VERTICAL_STRIDE_4;
- width = BRW_WIDTH_4;
- }
-
- struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
- BRW_REGISTER_TYPE_F,
- vstride,
- width,
- BRW_HORIZONTAL_STRIDE_0,
- BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
- struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
- BRW_REGISTER_TYPE_F,
- vstride,
- width,
- BRW_HORIZONTAL_STRIDE_0,
- BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
- brw_ADD(p, dst, src0, negate(src1));
-}
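
A rough scalar model of what the two quality settings compute for one 2x2
subspan stored in {tl, tr, bl, br} order, matching the table above
(illustrative code, not part of the driver):

    /* Scalar model of DDX over one subspan. "Fine" differences each row
     * against its horizontal neighbor; "coarse" replicates the top-left
     * difference across the whole subspan.
     */
    static void ddx_subspan(const float in[4], float out[4], int fine)
    {
       if (fine) {
          out[0] = out[1] = in[1] - in[0]; /* top row:    tr - tl */
          out[2] = out[3] = in[3] - in[2]; /* bottom row: br - bl */
       } else {
          out[0] = out[1] = out[2] = out[3] = in[1] - in[0];
       }
    }
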
-
-/* The negate_value boolean is used to negate the derivative computation for
- * FBOs, since they place the origin at the upper left instead of the lower
- * left.
- */
-void
-fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
- struct brw_reg quality, bool negate_value)
-{
- assert(quality.file == BRW_IMMEDIATE_VALUE);
- assert(quality.type == BRW_REGISTER_TYPE_D);
-
- int quality_value = quality.dw1.d;
-
- if (quality_value == BRW_DERIVATIVE_FINE ||
- (key->high_quality_derivatives && quality_value != BRW_DERIVATIVE_COARSE)) {
- /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
- * Region Restrictions):
- *
- * In Align16 access mode, SIMD16 is not allowed for DW operations
- * and SIMD8 is not allowed for DF operations.
- *
- * In this context, "DW operations" means "operations acting on 32-bit
- * values", so it includes operations on floats.
- *
- * Gen4 has a similar restriction. From the i965 PRM, section 11.5.3
- * (Instruction Compression -> Rules and Restrictions):
- *
- * A compressed instruction must be in Align1 access mode. Align16
- * mode instructions cannot be compressed.
- *
- * Similar text exists in the g45 PRM.
- *
- * On these platforms, if we're building a SIMD16 shader, we need to
- * manually unroll to a pair of SIMD8 instructions.
- */
- bool unroll_to_simd8 =
- (dispatch_width == 16 &&
- (brw->gen == 4 || (brw->gen == 7 && !brw->is_haswell)));
-
- /* produce accurate derivatives */
- struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
- BRW_REGISTER_TYPE_F,
- BRW_VERTICAL_STRIDE_4,
- BRW_WIDTH_4,
- BRW_HORIZONTAL_STRIDE_1,
- BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
- struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
- BRW_REGISTER_TYPE_F,
- BRW_VERTICAL_STRIDE_4,
- BRW_WIDTH_4,
- BRW_HORIZONTAL_STRIDE_1,
- BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
- brw_push_insn_state(p);
- brw_set_default_access_mode(p, BRW_ALIGN_16);
- if (unroll_to_simd8)
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- if (negate_value)
- brw_ADD(p, dst, src1, negate(src0));
- else
- brw_ADD(p, dst, src0, negate(src1));
- if (unroll_to_simd8) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
- src0 = sechalf(src0);
- src1 = sechalf(src1);
- dst = sechalf(dst);
- if (negate_value)
- brw_ADD(p, dst, src1, negate(src0));
- else
- brw_ADD(p, dst, src0, negate(src1));
- }
- brw_pop_insn_state(p);
- } else {
- /* replicate the derivative at the top-left pixel to other pixels */
- struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
- BRW_REGISTER_TYPE_F,
- BRW_VERTICAL_STRIDE_4,
- BRW_WIDTH_4,
- BRW_HORIZONTAL_STRIDE_0,
- BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
- struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
- BRW_REGISTER_TYPE_F,
- BRW_VERTICAL_STRIDE_4,
- BRW_WIDTH_4,
- BRW_HORIZONTAL_STRIDE_0,
- BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
- if (negate_value)
- brw_ADD(p, dst, src1, negate(src0));
- else
- brw_ADD(p, dst, src0, negate(src1));
- }
-}
-
-void
-fs_generator::generate_discard_jump(fs_inst *inst)
-{
- assert(brw->gen >= 6);
-
- /* This HALT will be patched up at FB write time to point UIP at the end of
- * the program, and at brw_uip_jip() JIP will be set to the end of the
- * current block (or the program).
- */
- this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
-
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- gen6_HALT(p);
- brw_pop_insn_state(p);
-}
-
-void
-fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
-{
- assert(inst->mlen != 0);
-
- brw_MOV(p,
- retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
- retype(src, BRW_REGISTER_TYPE_UD));
- brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
- dispatch_width / 8, inst->offset);
-}
-
-void
-fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
-{
- assert(inst->mlen != 0);
-
- brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
- dispatch_width / 8, inst->offset);
-}
-
-void
-fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
-{
- gen7_block_read_scratch(p, dst, dispatch_width / 8, inst->offset);
-}
-
-void
-fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg index,
- struct brw_reg offset)
-{
- assert(inst->mlen != 0);
-
- assert(index.file == BRW_IMMEDIATE_VALUE &&
- index.type == BRW_REGISTER_TYPE_UD);
- uint32_t surf_index = index.dw1.ud;
-
- assert(offset.file == BRW_IMMEDIATE_VALUE &&
- offset.type == BRW_REGISTER_TYPE_UD);
- uint32_t read_offset = offset.dw1.ud;
-
- brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
- read_offset, surf_index);
-
- brw_mark_surface_used(&prog_data->base, surf_index);
-}
-
-void
-fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg index,
- struct brw_reg offset)
-{
- assert(inst->mlen == 0);
- assert(index.type == BRW_REGISTER_TYPE_UD);
-
- assert(offset.file == BRW_GENERAL_REGISTER_FILE);
- /* Reference just the dword we need, to avoid angering validate_reg(). */
- offset = brw_vec1_grf(offset.nr, 0);
-
- /* We use the SIMD4x2 mode because we want to end up with 4 components in
- * the destination loaded consecutively from the same offset (which appears
- * in the first component, and the rest are ignored).
- */
- dst.width = BRW_WIDTH_4;
-
- if (index.file == BRW_IMMEDIATE_VALUE) {
-
- uint32_t surf_index = index.dw1.ud;
-
- brw_push_insn_state(p);
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
- brw_pop_insn_state(p);
-
- brw_set_dest(p, send, dst);
- brw_set_src0(p, send, offset);
- brw_set_sampler_message(p, send,
- surf_index,
- 0, /* LD message ignores sampler unit */
- GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
- 1, /* rlen */
- 1, /* mlen */
- false, /* no header */
- BRW_SAMPLER_SIMD_MODE_SIMD4X2,
- 0);
-
- brw_mark_surface_used(&prog_data->base, surf_index);
-
- } else {
-
- struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
-
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_set_default_access_mode(p, BRW_ALIGN_1);
-
- /* a0.0 = surf_index & 0xff */
- brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
- brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
- brw_set_dest(p, insn_and, addr);
- brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
- brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
-
-
- /* a0.0 |= <descriptor> */
- brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
- brw_set_sampler_message(p, insn_or,
- 0 /* surface */,
- 0 /* sampler */,
- GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
- 1 /* rlen */,
- 1 /* mlen */,
- false /* header */,
- BRW_SAMPLER_SIMD_MODE_SIMD4X2,
- 0);
- brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
- brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
- brw_set_src0(p, insn_or, addr);
- brw_set_dest(p, insn_or, addr);
-
-
- /* dst = send(offset, a0.0) */
- brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
- brw_set_dest(p, insn_send, dst);
- brw_set_src0(p, insn_send, offset);
- brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
-
- brw_pop_insn_state(p);
-
- /* visitor knows more than we do about the surface limit required,
- * so has already done marking.
- */
-
- }
-}
-
-void
-fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg index,
- struct brw_reg offset)
-{
- assert(brw->gen < 7); /* Should use the gen7 variant. */
- assert(inst->header_present);
- assert(inst->mlen);
-
- assert(index.file == BRW_IMMEDIATE_VALUE &&
- index.type == BRW_REGISTER_TYPE_UD);
- uint32_t surf_index = index.dw1.ud;
-
- uint32_t simd_mode, rlen, msg_type;
- if (dispatch_width == 16) {
- simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
- rlen = 8;
- } else {
- simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
- rlen = 4;
- }
-
- if (brw->gen >= 5)
- msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
- else {
- /* We always use the SIMD16 message so that we only have to load U, and
- * not V or R.
- */
- msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
- assert(inst->mlen == 3);
- assert(inst->regs_written == 8);
- rlen = 8;
- simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
- }
-
- struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
- BRW_REGISTER_TYPE_D);
- brw_MOV(p, offset_mrf, offset);
-
- struct brw_reg header = brw_vec8_grf(0, 0);
- gen6_resolve_implied_move(p, &header, inst->base_mrf);
-
- brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
- brw_inst_set_qtr_control(brw, send, BRW_COMPRESSION_NONE);
- brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
- brw_set_src0(p, send, header);
- if (brw->gen < 6)
- brw_inst_set_base_mrf(brw, send, inst->base_mrf);
-
- /* Our surface is set up as floats, regardless of what actual data is
- * stored in it.
- */
- uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
- brw_set_sampler_message(p, send,
- surf_index,
- 0, /* sampler (unused) */
- msg_type,
- rlen,
- inst->mlen,
- inst->header_present,
- simd_mode,
- return_format);
-
- brw_mark_surface_used(&prog_data->base, surf_index);
-}
-
-void
-fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg index,
- struct brw_reg offset)
-{
- assert(brw->gen >= 7);
- /* Varying-offset pull constant loads are treated as a normal expression on
- * gen7, so the fact that it's a send message is hidden at the IR level.
- */
- assert(!inst->header_present);
- assert(!inst->mlen);
- assert(index.type == BRW_REGISTER_TYPE_UD);
-
- uint32_t simd_mode, rlen, mlen;
- if (dispatch_width == 16) {
- mlen = 2;
- rlen = 8;
- simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
- } else {
- mlen = 1;
- rlen = 4;
- simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
- }
-
- if (index.file == BRW_IMMEDIATE_VALUE) {
-
- uint32_t surf_index = index.dw1.ud;
-
- brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
- brw_set_dest(p, send, dst);
- brw_set_src0(p, send, offset);
- brw_set_sampler_message(p, send,
- surf_index,
- 0, /* LD message ignores sampler unit */
- GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
- rlen,
- mlen,
- false, /* no header */
- simd_mode,
- 0);
-
- brw_mark_surface_used(&prog_data->base, surf_index);
-
- } else {
-
- struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
-
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_set_default_access_mode(p, BRW_ALIGN_1);
-
- /* a0.0 = surf_index & 0xff */
- brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
- brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
- brw_set_dest(p, insn_and, addr);
- brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
- brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
-
-
- /* a0.0 |= <descriptor> */
- brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
- brw_set_sampler_message(p, insn_or,
- 0 /* surface */,
- 0 /* sampler */,
- GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
- rlen /* rlen */,
- mlen /* mlen */,
- false /* header */,
- simd_mode,
- 0);
- brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
- brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
- brw_set_src0(p, insn_or, addr);
- brw_set_dest(p, insn_or, addr);
-
-
- /* dst = send(offset, a0.0) */
- brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
- brw_set_dest(p, insn_send, dst);
- brw_set_src0(p, insn_send, offset);
- brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
-
- brw_pop_insn_state(p);
-
- /* visitor knows more than we do about the surface limit required,
- * so has already done marking.
- */
- }
-}
-
-/**
- * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
- * into the flags register (f0.0).
- *
- * Used only on Gen6 and above.
- */
-void
-fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
-{
- struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
- struct brw_reg dispatch_mask;
-
- if (brw->gen >= 6)
- dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
- else
- dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_MOV(p, flags, dispatch_mask);
- brw_pop_insn_state(p);
-}
-
-void
-fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg src,
- struct brw_reg msg_data,
- unsigned msg_type)
-{
- assert(msg_data.file == BRW_IMMEDIATE_VALUE &&
- msg_data.type == BRW_REGISTER_TYPE_UD);
-
- brw_pixel_interpolator_query(p,
- retype(dst, BRW_REGISTER_TYPE_UW),
- src,
- inst->pi_noperspective,
- msg_type,
- msg_data.dw1.ud,
- inst->mlen,
- inst->regs_written);
-}
-
-
-static uint32_t brw_file_from_reg(fs_reg *reg)
-{
- switch (reg->file) {
- case GRF:
- return BRW_GENERAL_REGISTER_FILE;
- case MRF:
- return BRW_MESSAGE_REGISTER_FILE;
- case IMM:
- return BRW_IMMEDIATE_VALUE;
- default:
- unreachable("not reached");
- }
-}
-
-struct brw_reg
-brw_reg_from_fs_reg(fs_reg *reg)
-{
- struct brw_reg brw_reg;
-
- switch (reg->file) {
- case GRF:
- case MRF:
- if (reg->stride == 0) {
- brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
- } else {
- brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
- brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
- }
-
- brw_reg = retype(brw_reg, reg->type);
- brw_reg = byte_offset(brw_reg, reg->subreg_offset);
- break;
- case IMM:
- switch (reg->type) {
- case BRW_REGISTER_TYPE_F:
- brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
- break;
- case BRW_REGISTER_TYPE_D:
- brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d);
- break;
- case BRW_REGISTER_TYPE_UD:
- brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud);
- break;
- default:
- unreachable("not reached");
- }
- break;
- case HW_REG:
- assert(reg->type == reg->fixed_hw_reg.type);
- brw_reg = reg->fixed_hw_reg;
- break;
- case BAD_FILE:
- /* Probably unused. */
- brw_reg = brw_null_reg();
- break;
- case UNIFORM:
- unreachable("not reached");
- default:
- unreachable("not reached");
- }
- if (reg->abs)
- brw_reg = brw_abs(brw_reg);
- if (reg->negate)
- brw_reg = negate(brw_reg);
-
- return brw_reg;
-}
-
-/**
- * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
- * sampler LD messages.
- *
- * We don't want to bake it into the send message's code generation because
- * that means we don't get a chance to schedule the instructions.
- */
-void
-fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg value)
-{
- assert(value.file == BRW_IMMEDIATE_VALUE);
-
- brw_push_insn_state(p);
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
- brw_pop_insn_state(p);
-}
-
-/* Sets the region of register mask to vstride=16, width=8, hstride=2, or to
- * vstride=0, width=1, hstride=0 when the mask is passed as a uniform, before
- * moving it to register dst.
- */
-void
-fs_generator::generate_set_omask(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg mask)
-{
- bool stride_8_8_1 =
- (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
- mask.width == BRW_WIDTH_8 &&
- mask.hstride == BRW_HORIZONTAL_STRIDE_1);
-
- bool stride_0_1_0 =
- (mask.vstride == BRW_VERTICAL_STRIDE_0 &&
- mask.width == BRW_WIDTH_1 &&
- mask.hstride == BRW_HORIZONTAL_STRIDE_0);
-
- assert(stride_8_8_1 || stride_0_1_0);
- assert(dst.type == BRW_REGISTER_TYPE_UW);
-
- if (dispatch_width == 16)
- dst = vec16(dst);
- brw_push_insn_state(p);
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-
- if (stride_8_8_1) {
- brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
- } else if (stride_0_1_0) {
- brw_MOV(p, dst, retype(mask, dst.type));
- }
- brw_pop_insn_state(p);
-}
-
-/* Sets vstride=1, width=4, hstride=0 of register src1 during
- * the ADD instruction.
- */
-void
-fs_generator::generate_set_sample_id(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg src0,
- struct brw_reg src1)
-{
- assert(dst.type == BRW_REGISTER_TYPE_D ||
- dst.type == BRW_REGISTER_TYPE_UD);
- assert(src0.type == BRW_REGISTER_TYPE_D ||
- src0.type == BRW_REGISTER_TYPE_UD);
-
- brw_push_insn_state(p);
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_set_default_mask_control(p, BRW_MASK_DISABLE);
- struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
- brw_ADD(p, dst, src0, reg);
- if (dispatch_width == 16)
- brw_ADD(p, offset(dst, 1), offset(src0, 1), suboffset(reg, 2));
- brw_pop_insn_state(p);
-}
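
The <1;4,0> region applied to src1 here follows the general Gen region rule:
element i of a <vstride; width, hstride> region reads subregister
base + (i / width) * vstride + (i % width) * hstride, which for <1;4,0>
yields s0,s0,s0,s0,s1,s1,s1,s1 across eight channels. A sketch of that rule,
for reference:

    /* Which subregister element i of a <vstride; width, hstride> region
     * reads; e.g. <1;4,0> gives 0,0,0,0,1,1,1,1 for i = 0..7.
     */
    static int region_element(int base, int vstride, int width,
                              int hstride, int i)
    {
       return base + (i / width) * vstride + (i % width) * hstride;
    }
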
-
-/**
- * Change the register's data type from UD to W, doubling the strides in order
- * to compensate for halving the data type width.
- */
-static struct brw_reg
-ud_reg_to_w(struct brw_reg r)
-{
- assert(r.type == BRW_REGISTER_TYPE_UD);
- r.type = BRW_REGISTER_TYPE_W;
-
- /* The BRW_*_STRIDE enums are defined so that incrementing the field
- * doubles the real stride.
- */
- if (r.hstride != 0)
- ++r.hstride;
- if (r.vstride != 0)
- ++r.vstride;
-
- return r;
-}
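
The "incrementing doubles the stride" property holds because the stride enums
encode powers of two; assuming the usual BRW encoding (enum 0 means stride 0,
enum n means stride 1 << (n - 1)), a tiny sketch of the mapping:

    /* Assumed BRW_*_STRIDE encoding: 0 -> 0, n -> 1 << (n - 1), so
     * incrementing the field doubles any nonzero stride.
     */
    static int real_stride(int stride_enum)
    {
       return stride_enum == 0 ? 0 : 1 << (stride_enum - 1);
    }
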
-
-void
-fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg x,
- struct brw_reg y)
-{
- assert(brw->gen >= 7);
- assert(dst.type == BRW_REGISTER_TYPE_UD);
- assert(x.type == BRW_REGISTER_TYPE_F);
- assert(y.type == BRW_REGISTER_TYPE_F);
-
- /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
- *
- * Because this instruction does not have a 16-bit floating-point type,
- * the destination data type must be Word (W).
- *
- * The destination must be DWord-aligned and specify a horizontal stride
- * (HorzStride) of 2. The 16-bit result is stored in the lower word of
- * each destination channel and the upper word is not modified.
- */
- struct brw_reg dst_w = ud_reg_to_w(dst);
-
- /* Give each 32-bit channel of dst the form below, where "." means
- * unchanged.
- * 0x....hhhh
- */
- brw_F32TO16(p, dst_w, y);
-
- /* Now the form:
- * 0xhhhh0000
- */
- brw_SHL(p, dst, dst, brw_imm_ud(16u));
-
- /* And, finally, the form of packHalf2x16's output:
- * 0xhhhhllll
- */
- brw_F32TO16(p, dst_w, x);
-}
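
The net effect of the three instructions, written as integer arithmetic on
already-converted half values (hx and hy stand for the 16-bit results of
f32to16), is the standard packHalf2x16 layout:

    #include <stdint.h>

    /* y's half lands in the high word and x's in the low word, matching
     * GLSL packHalf2x16(vec2(x, y)).
     */
    static uint32_t pack_half_2x16_bits(uint16_t hx, uint16_t hy)
    {
       return ((uint32_t)hy << 16) | hx;
    }
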
-
-void
-fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg src)
-{
- assert(brw->gen >= 7);
- assert(dst.type == BRW_REGISTER_TYPE_F);
- assert(src.type == BRW_REGISTER_TYPE_UD);
-
- /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
- *
- * Because this instruction does not have a 16-bit floating-point type,
- * the source data type must be Word (W). The destination type must be
- * F (Float).
- */
- struct brw_reg src_w = ud_reg_to_w(src);
-
- /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
- * For the Y case, we wish to access only the upper word; therefore
- * a 16-bit subregister offset is needed.
- */
- assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
- inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
- if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
- src_w.subnr += 2;
-
- brw_F16TO32(p, dst, src_w);
-}
-
-void
-fs_generator::generate_shader_time_add(fs_inst *inst,
- struct brw_reg payload,
- struct brw_reg offset,
- struct brw_reg value)
-{
- assert(brw->gen >= 7);
- brw_push_insn_state(p);
- brw_set_default_mask_control(p, true);
-
- assert(payload.file == BRW_GENERAL_REGISTER_FILE);
- struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
- offset.type);
- struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
- value.type);
-
- assert(offset.file == BRW_IMMEDIATE_VALUE);
- if (value.file == BRW_GENERAL_REGISTER_FILE) {
- value.width = BRW_WIDTH_1;
- value.hstride = BRW_HORIZONTAL_STRIDE_0;
- value.vstride = BRW_VERTICAL_STRIDE_0;
- } else {
- assert(value.file == BRW_IMMEDIATE_VALUE);
- }
-
- /* Trying to deal with setup of the params from the IR is crazy in the FS8
- * case, and we don't really care about squeezing every bit of performance
- * out of this path, so we just emit the MOVs from here.
- */
- brw_MOV(p, payload_offset, offset);
- brw_MOV(p, payload_value, value);
- brw_shader_time_add(p, payload,
- prog_data->base.binding_table.shader_time_start);
- brw_pop_insn_state(p);
-
- brw_mark_surface_used(&prog_data->base,
- prog_data->base.binding_table.shader_time_start);
-}
-
-void
-fs_generator::generate_untyped_atomic(fs_inst *inst, struct brw_reg dst,
- struct brw_reg atomic_op,
- struct brw_reg surf_index)
-{
- assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
- atomic_op.type == BRW_REGISTER_TYPE_UD &&
- surf_index.file == BRW_IMMEDIATE_VALUE &&
- surf_index.type == BRW_REGISTER_TYPE_UD);
-
- brw_untyped_atomic(p, dst, brw_message_reg(inst->base_mrf),
- atomic_op.dw1.ud, surf_index.dw1.ud,
- inst->mlen, dispatch_width / 8);
-
- brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
-}
-
-void
-fs_generator::generate_untyped_surface_read(fs_inst *inst, struct brw_reg dst,
- struct brw_reg surf_index)
-{
- assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
- surf_index.type == BRW_REGISTER_TYPE_UD);
-
- brw_untyped_surface_read(p, dst, brw_message_reg(inst->base_mrf),
- surf_index.dw1.ud,
- inst->mlen, dispatch_width / 8);
-
- brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
-}
-
-void
-fs_generator::generate_code(const cfg_t *cfg)
-{
- int start_offset = p->next_insn_offset;
-
- struct annotation_info annotation;
- memset(&annotation, 0, sizeof(annotation));
-
- foreach_block_and_inst (block, fs_inst, inst, cfg) {
- struct brw_reg src[3], dst;
- unsigned int last_insn_offset = p->next_insn_offset;
-
- if (unlikely(debug_flag))
- annotate(brw, &annotation, cfg, inst, p->next_insn_offset);
-
- for (unsigned int i = 0; i < inst->sources; i++) {
- src[i] = brw_reg_from_fs_reg(&inst->src[i]);
-
- /* The accumulator result appears to get used for the
- * conditional modifier generation. When negating a UD
- * value, there is a 33rd bit generated for the sign in the
- * accumulator value, so now you can't check, for example,
- * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
- */
- assert(!inst->conditional_mod ||
- inst->src[i].type != BRW_REGISTER_TYPE_UD ||
- !inst->src[i].negate);
- }
- dst = brw_reg_from_fs_reg(&inst->dst);
-
- brw_set_default_predicate_control(p, inst->predicate);
- brw_set_default_predicate_inverse(p, inst->predicate_inverse);
- brw_set_default_flag_reg(p, 0, inst->flag_subreg);
- brw_set_default_saturate(p, inst->saturate);
- brw_set_default_mask_control(p, inst->force_writemask_all);
- brw_set_default_acc_write_control(p, inst->writes_accumulator);
-
- if (inst->force_uncompressed || dispatch_width == 8) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- } else if (inst->force_sechalf) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
- } else {
- brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- }
-
- switch (inst->opcode) {
- case BRW_OPCODE_MOV:
- brw_MOV(p, dst, src[0]);
- break;
- case BRW_OPCODE_ADD:
- brw_ADD(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_MUL:
- brw_MUL(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_AVG:
- brw_AVG(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_MACH:
- brw_MACH(p, dst, src[0], src[1]);
- break;
-
- case BRW_OPCODE_MAD:
- assert(brw->gen >= 6);
- brw_set_default_access_mode(p, BRW_ALIGN_16);
- if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_MAD(p, dst, src[0], src[1], src[2]);
- brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
- brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
- brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- } else {
- brw_MAD(p, dst, src[0], src[1], src[2]);
- }
- brw_set_default_access_mode(p, BRW_ALIGN_1);
- break;
-
- case BRW_OPCODE_LRP:
- assert(brw->gen >= 6);
- brw_set_default_access_mode(p, BRW_ALIGN_16);
- if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_LRP(p, dst, src[0], src[1], src[2]);
- brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
- brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
- brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- } else {
- brw_LRP(p, dst, src[0], src[1], src[2]);
- }
- brw_set_default_access_mode(p, BRW_ALIGN_1);
- break;
-
- case BRW_OPCODE_FRC:
- brw_FRC(p, dst, src[0]);
- break;
- case BRW_OPCODE_RNDD:
- brw_RNDD(p, dst, src[0]);
- break;
- case BRW_OPCODE_RNDE:
- brw_RNDE(p, dst, src[0]);
- break;
- case BRW_OPCODE_RNDZ:
- brw_RNDZ(p, dst, src[0]);
- break;
-
- case BRW_OPCODE_AND:
- brw_AND(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_OR:
- brw_OR(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_XOR:
- brw_XOR(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_NOT:
- brw_NOT(p, dst, src[0]);
- break;
- case BRW_OPCODE_ASR:
- brw_ASR(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_SHR:
- brw_SHR(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_SHL:
- brw_SHL(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_F32TO16:
- assert(brw->gen >= 7);
- brw_F32TO16(p, dst, src[0]);
- break;
- case BRW_OPCODE_F16TO32:
- assert(brw->gen >= 7);
- brw_F16TO32(p, dst, src[0]);
- break;
- case BRW_OPCODE_CMP:
- brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
- break;
- case BRW_OPCODE_SEL:
- brw_SEL(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_BFREV:
- assert(brw->gen >= 7);
- /* BFREV only supports UD type for src and dst. */
- brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
- retype(src[0], BRW_REGISTER_TYPE_UD));
- break;
- case BRW_OPCODE_FBH:
- assert(brw->gen >= 7);
- /* FBH only supports UD type for dst. */
- brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
- break;
- case BRW_OPCODE_FBL:
- assert(brw->gen >= 7);
- /* FBL only supports UD type for dst. */
- brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
- break;
- case BRW_OPCODE_CBIT:
- assert(brw->gen >= 7);
- /* CBIT only supports UD type for dst. */
- brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
- break;
- case BRW_OPCODE_ADDC:
- assert(brw->gen >= 7);
- brw_ADDC(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_SUBB:
- assert(brw->gen >= 7);
- brw_SUBB(p, dst, src[0], src[1]);
- break;
- case BRW_OPCODE_MAC:
- brw_MAC(p, dst, src[0], src[1]);
- break;
-
- case BRW_OPCODE_BFE:
- assert(brw->gen >= 7);
- brw_set_default_access_mode(p, BRW_ALIGN_16);
- if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_BFE(p, dst, src[0], src[1], src[2]);
- brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
- brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
- brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- } else {
- brw_BFE(p, dst, src[0], src[1], src[2]);
- }
- brw_set_default_access_mode(p, BRW_ALIGN_1);
- break;
-
- case BRW_OPCODE_BFI1:
- assert(brw->gen >= 7);
- /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
- * should
- *
- * "Force BFI instructions to be executed always in SIMD8."
- */
- if (dispatch_width == 16 && brw->is_haswell) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_BFI1(p, dst, src[0], src[1]);
- brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
- brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]));
- brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- } else {
- brw_BFI1(p, dst, src[0], src[1]);
- }
- break;
- case BRW_OPCODE_BFI2:
- assert(brw->gen >= 7);
- brw_set_default_access_mode(p, BRW_ALIGN_16);
- /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
- * should
- *
- * "Force BFI instructions to be executed always in SIMD8."
- *
- * Otherwise we would be able to emit compressed instructions like we
- * do for the other three-source instructions.
- */
- if (dispatch_width == 16) {
- brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
- brw_BFI2(p, dst, src[0], src[1], src[2]);
- brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
- brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
- brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- } else {
- brw_BFI2(p, dst, src[0], src[1], src[2]);
- }
- brw_set_default_access_mode(p, BRW_ALIGN_1);
- break;
-
- case BRW_OPCODE_IF:
- if (inst->src[0].file != BAD_FILE) {
- /* The instruction has an embedded compare (only allowed on gen6) */
- assert(brw->gen == 6);
- gen6_IF(p, inst->conditional_mod, src[0], src[1]);
- } else {
- brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
- }
- break;
-
- case BRW_OPCODE_ELSE:
- brw_ELSE(p);
- break;
- case BRW_OPCODE_ENDIF:
- brw_ENDIF(p);
- break;
-
- case BRW_OPCODE_DO:
- brw_DO(p, BRW_EXECUTE_8);
- break;
-
- case BRW_OPCODE_BREAK:
- brw_BREAK(p);
- brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
- break;
- case BRW_OPCODE_CONTINUE:
- brw_CONT(p);
- brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
- break;
-
- case BRW_OPCODE_WHILE:
- brw_WHILE(p);
- break;
-
- case SHADER_OPCODE_RCP:
- case SHADER_OPCODE_RSQ:
- case SHADER_OPCODE_SQRT:
- case SHADER_OPCODE_EXP2:
- case SHADER_OPCODE_LOG2:
- case SHADER_OPCODE_SIN:
- case SHADER_OPCODE_COS:
- assert(brw->gen < 6 || inst->mlen == 0);
- if (brw->gen >= 7) {
- gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
- brw_null_reg());
- } else if (brw->gen == 6) {
- generate_math_gen6(inst, dst, src[0], brw_null_reg());
- } else if (brw->gen == 5 || brw->is_g4x) {
- generate_math_g45(inst, dst, src[0]);
- } else {
- generate_math_gen4(inst, dst, src[0]);
- }
- break;
- case SHADER_OPCODE_INT_QUOTIENT:
- case SHADER_OPCODE_INT_REMAINDER:
- case SHADER_OPCODE_POW:
- assert(brw->gen < 6 || inst->mlen == 0);
- if (brw->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) {
- gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
- } else if (brw->gen >= 6) {
- generate_math_gen6(inst, dst, src[0], src[1]);
- } else {
- generate_math_gen4(inst, dst, src[0]);
- }
- break;
- case FS_OPCODE_PIXEL_X:
- generate_pixel_xy(dst, true);
- break;
- case FS_OPCODE_PIXEL_Y:
- generate_pixel_xy(dst, false);
- break;
- case FS_OPCODE_CINTERP:
- brw_MOV(p, dst, src[0]);
- break;
- case FS_OPCODE_LINTERP:
- generate_linterp(inst, dst, src);
- break;
- case SHADER_OPCODE_TEX:
- case FS_OPCODE_TXB:
- case SHADER_OPCODE_TXD:
- case SHADER_OPCODE_TXF:
- case SHADER_OPCODE_TXF_CMS:
- case SHADER_OPCODE_TXF_UMS:
- case SHADER_OPCODE_TXF_MCS:
- case SHADER_OPCODE_TXL:
- case SHADER_OPCODE_TXS:
- case SHADER_OPCODE_LOD:
- case SHADER_OPCODE_TG4:
- case SHADER_OPCODE_TG4_OFFSET:
- generate_tex(inst, dst, src[0], src[1]);
- break;
- case FS_OPCODE_DDX:
- generate_ddx(inst, dst, src[0], src[1]);
- break;
- case FS_OPCODE_DDY:
- /* Make sure the fp->UsesDFdy flag got set (otherwise there's no
- * guarantee that key->render_to_fbo is set).
- */
- assert(fp->UsesDFdy);
- generate_ddy(inst, dst, src[0], src[1], key->render_to_fbo);
- break;
-
- case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
- generate_scratch_write(inst, src[0]);
- break;
-
- case SHADER_OPCODE_GEN4_SCRATCH_READ:
- generate_scratch_read(inst, dst);
- break;
-
- case SHADER_OPCODE_GEN7_SCRATCH_READ:
- generate_scratch_read_gen7(inst, dst);
- break;
-
- case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
- generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
- break;
-
- case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
- generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
- break;
-
- case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
- generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
- break;
-
- case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
- generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
- break;
-
- case FS_OPCODE_REP_FB_WRITE:
- case FS_OPCODE_FB_WRITE:
- generate_fb_write(inst);
- break;
-
- case FS_OPCODE_BLORP_FB_WRITE:
- generate_blorp_fb_write(inst);
- break;
-
- case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
- generate_mov_dispatch_to_flags(inst);
- break;
-
- case FS_OPCODE_DISCARD_JUMP:
- generate_discard_jump(inst);
- break;
-
- case SHADER_OPCODE_SHADER_TIME_ADD:
- generate_shader_time_add(inst, src[0], src[1], src[2]);
- break;
-
- case SHADER_OPCODE_UNTYPED_ATOMIC:
- generate_untyped_atomic(inst, dst, src[0], src[1]);
- break;
-
- case SHADER_OPCODE_UNTYPED_SURFACE_READ:
- generate_untyped_surface_read(inst, dst, src[0]);
- break;
-
- case FS_OPCODE_SET_SIMD4X2_OFFSET:
- generate_set_simd4x2_offset(inst, dst, src[0]);
- break;
-
- case FS_OPCODE_SET_OMASK:
- generate_set_omask(inst, dst, src[0]);
- break;
-
- case FS_OPCODE_SET_SAMPLE_ID:
- generate_set_sample_id(inst, dst, src[0], src[1]);
- break;
-
- case FS_OPCODE_PACK_HALF_2x16_SPLIT:
- generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
- break;
-
- case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
- case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
- generate_unpack_half_2x16_split(inst, dst, src[0]);
- break;
-
- case FS_OPCODE_PLACEHOLDER_HALT:
- /* This is the place where the final HALT needs to be inserted if
- * we've emitted any discards. If not, this will emit no code.
- */
- if (!patch_discard_jumps_to_fb_writes()) {
- if (unlikely(debug_flag)) {
- annotation.ann_count--;
- }
- }
- break;
-
- case FS_OPCODE_INTERPOLATE_AT_CENTROID:
- generate_pixel_interpolator_query(inst, dst, src[0], src[1],
- GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID);
- break;
-
- case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
- generate_pixel_interpolator_query(inst, dst, src[0], src[1],
- GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
- break;
-
- case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
- generate_pixel_interpolator_query(inst, dst, src[0], src[1],
- GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
- break;
-
- case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
- generate_pixel_interpolator_query(inst, dst, src[0], src[1],
- GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
- break;
-
- default:
- if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
- _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
- opcode_descs[inst->opcode].name);
- } else {
- _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
- }
- abort();
-
- case SHADER_OPCODE_LOAD_PAYLOAD:
- unreachable("Should be lowered by lower_load_payload()");
- }
-
- if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
- assert(p->next_insn_offset == last_insn_offset + 16 ||
- !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
- "emitting more than 1 instruction");
-
- brw_inst *last = &p->store[last_insn_offset / 16];
-
- brw_inst_set_cond_modifier(brw, last, inst->conditional_mod);
- brw_inst_set_no_dd_clear(brw, last, inst->no_dd_clear);
- brw_inst_set_no_dd_check(brw, last, inst->no_dd_check);
- }
- }
-
- brw_set_uip_jip(p);
- annotation_finalize(&annotation, p->next_insn_offset);
-
- int before_size = p->next_insn_offset - start_offset;
- brw_compact_instructions(p, start_offset, annotation.ann_count,
- annotation.ann);
- int after_size = p->next_insn_offset - start_offset;
-
- if (unlikely(debug_flag)) {
- if (prog) {
- fprintf(stderr,
- "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
- prog->Label ? prog->Label : "unnamed",
- prog->Name, dispatch_width);
- } else if (fp) {
- fprintf(stderr,
- "Native code for fragment program %d (SIMD%d dispatch):\n",
- fp->Base.Id, dispatch_width);
- } else {
- fprintf(stderr, "Native code for blorp program (SIMD%d dispatch):\n",
- dispatch_width);
- }
- fprintf(stderr, "SIMD%d shader: %d instructions. Compacted %d to %d"
- " bytes (%.0f%%)\n",
- dispatch_width, before_size / 16, before_size, after_size,
- 100.0f * (before_size - after_size) / before_size);
-
- const struct gl_program *prog = fp ? &fp->Base : NULL;
-
- dump_assembly(p->store, annotation.ann_count, annotation.ann, brw, prog);
- ralloc_free(annotation.ann);
- }
-}
-
-const unsigned *
-fs_generator::generate_assembly(const cfg_t *simd8_cfg,
- const cfg_t *simd16_cfg,
- unsigned *assembly_size)
-{
- assert(simd8_cfg || simd16_cfg);
-
- if (simd8_cfg) {
- dispatch_width = 8;
- generate_code(simd8_cfg);
- }
-
- if (simd16_cfg) {
- /* align to 64 byte boundary. */
- while (p->next_insn_offset % 64) {
- brw_NOP(p);
- }
-
- /* Save off the start of this SIMD16 program */
- prog_data->prog_offset_16 = p->next_insn_offset;
-
- brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
-
- dispatch_width = 16;
- generate_code(simd16_cfg);
- }
-
- return brw_get_program(p, assembly_size);
-}
diff --git a/src/mesa/drivers/dri/i965/brw_scalar.h b/src/mesa/drivers/dri/i965/brw_scalar.h
index c2bc1f0..cb70c94 100644
--- a/src/mesa/drivers/dri/i965/brw_scalar.h
+++ b/src/mesa/drivers/dri/i965/brw_scalar.h
@@ -567,18 +567,18 @@ public:
*
* Translates FS IR to actual i965 assembly code.
*/
-class fs_generator
+class scalar_generator
{
public:
- fs_generator(struct brw_context *brw,
- void *mem_ctx,
- const struct brw_wm_prog_key *key,
- struct brw_wm_prog_data *prog_data,
- struct gl_shader_program *prog,
- struct gl_fragment_program *fp,
- bool runtime_check_aads_emit,
- bool debug_flag);
- ~fs_generator();
+ scalar_generator(struct brw_context *brw,
+ void *mem_ctx,
+ const struct brw_wm_prog_key *key,
+ struct brw_wm_prog_data *prog_data,
+ struct gl_shader_program *prog,
+ struct gl_fragment_program *fp,
+ bool runtime_check_aads_emit,
+ bool debug_flag);
+ ~scalar_generator();
const unsigned *generate_assembly(const cfg_t *simd8_cfg,
const cfg_t *simd16_cfg,
diff --git a/src/mesa/drivers/dri/i965/brw_scalar_generator.cpp b/src/mesa/drivers/dri/i965/brw_scalar_generator.cpp
new file mode 100644
index 0000000..a5a5cad
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_scalar_generator.cpp
@@ -0,0 +1,2011 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_scalar_generator.cpp
+ *
+ * This file supports generating code from the FS LIR to the actual
+ * native instructions.
+ */
+
+extern "C" {
+#include "main/macros.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+} /* extern "C" */
+
+#include "brw_scalar.h"
+#include "brw_cfg.h"
+
+scalar_generator::scalar_generator(struct brw_context *brw,
+ void *mem_ctx,
+ const struct brw_wm_prog_key *key,
+ struct brw_wm_prog_data *prog_data,
+ struct gl_shader_program *prog,
+ struct gl_fragment_program *fp,
+ bool runtime_check_aads_emit,
+ bool debug_flag)
+
+ : brw(brw), key(key), prog_data(prog_data), prog(prog), fp(fp),
+ runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(debug_flag),
+ mem_ctx(mem_ctx)
+{
+ ctx = &brw->ctx;
+
+ p = rzalloc(mem_ctx, struct brw_compile);
+ brw_init_compile(brw, p, mem_ctx);
+}
+
+scalar_generator::~scalar_generator()
+{
+}
+
+bool
+scalar_generator::patch_discard_jumps_to_fb_writes()
+{
+ if (brw->gen < 6 || this->discard_halt_patches.is_empty())
+ return false;
+
+ int scale = brw_jump_scale(brw);
+
+ /* There is a somewhat strange undocumented requirement of using
+ * HALT, according to the simulator. If some channel has HALTed to
+ * a particular UIP, then by the end of the program, every channel
+ * must have HALTed to that UIP. Furthermore, the tracking is a
+ * stack, so you can't do the final halt of a UIP after starting
+ * halting to a new UIP.
+ *
+ * Symptoms of not emitting this instruction on actual hardware
+ * included GPU hangs and sparkly rendering on the piglit discard
+ * tests.
+ */
+ brw_inst *last_halt = gen6_HALT(p);
+ brw_inst_set_uip(brw, last_halt, 1 * scale);
+ brw_inst_set_jip(brw, last_halt, 1 * scale);
+
+ int ip = p->nr_insn;
+
+ foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
+ brw_inst *patch = &p->store[patch_ip->ip];
+
+ assert(brw_inst_opcode(brw, patch) == BRW_OPCODE_HALT);
+ /* HALT takes a half-instruction distance from the pre-incremented IP. */
+ brw_inst_set_uip(brw, patch, (ip - patch_ip->ip) * scale);
+ }
+
+ this->discard_halt_patches.make_empty();
+ return true;
+}
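
The record-then-patch scheme above is easy to model on the host. A minimal,
self-contained sketch (plain C++, not the Mesa API; the instruction layout
and the scale of 2 are illustrative assumptions): each discard records its
HALT's index, and once the final HALT is emitted every recorded HALT gets
its UIP rewritten to the distance to the end of the program, mirroring the
loop over discard_halt_patches.

#include <cstdio>
#include <vector>

/* Hypothetical compact instruction: an opcode name and a UIP field. */
struct Inst { const char *op; int uip; };

int main() {
   std::vector<Inst> prog;          /* the store, like p->store */
   std::vector<int> halt_patches;   /* like discard_halt_patches */

   /* Emitting a discard records the HALT's index for later patching. */
   auto emit_discard = [&]() {
      halt_patches.push_back((int)prog.size());
      prog.push_back({"HALT", 0});  /* UIP unknown until the end */
   };

   prog.push_back({"MOV", 0});
   emit_discard();
   prog.push_back({"ADD", 0});
   emit_discard();

   /* Final HALT; every earlier HALT must converge past it. */
   const int scale = 2;             /* assumed: distances in half-insns */
   prog.push_back({"HALT", 1 * scale});
   int ip = (int)prog.size();       /* like p->nr_insn after gen6_HALT() */

   for (int patch_ip : halt_patches)
      prog[patch_ip].uip = (ip - patch_ip) * scale;

   for (int i = 0; i < (int)prog.size(); i++)
      printf("%d: %s uip=%d\n", i, prog[i].op, prog[i].uip);
   return 0;
}
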
+
+void
+scalar_generator::fire_fb_write(fs_inst *inst,
+ GLuint base_reg,
+ struct brw_reg implied_header,
+ GLuint nr)
+{
+ uint32_t msg_control;
+
+ if (brw->gen < 6) {
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_MOV(p,
+ brw_message_reg(base_reg + 1),
+ brw_vec8_grf(1, 0));
+ brw_pop_insn_state(p);
+ }
+
+ if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
+ msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
+ else if (prog_data->dual_src_blend)
+ msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
+ else if (dispatch_width == 16)
+ msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
+ else
+ msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
+
+ uint32_t surf_index =
+ prog_data->binding_table.render_target_start + inst->target;
+
+ brw_fb_WRITE(p,
+ dispatch_width,
+ base_reg,
+ implied_header,
+ msg_control,
+ surf_index,
+ nr,
+ 0,
+ inst->eot,
+ inst->header_present);
+
+ brw_mark_surface_used(&prog_data->base, surf_index);
+}
+
+void
+scalar_generator::generate_fb_write(fs_inst *inst)
+{
+ struct brw_reg implied_header;
+
+   /* The header is 2 regs, with g0 and g1 as the contents. g0 is sent
+    * via the implied move, so only g1 needs to be set up here.
+    */
+ if (inst->header_present) {
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_default_flag_reg(p, 0, 0);
+
+ /* On HSW, the GPU will use the predicate on SENDC, unless the header is
+ * present.
+ */
+ if ((fp && fp->UsesKill) || key->alpha_test_func) {
+ struct brw_reg pixel_mask;
+
+ if (brw->gen >= 6)
+ pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
+ else
+ pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+ brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
+ }
+
+ if (brw->gen >= 6) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ brw_MOV(p,
+ retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
+ retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+ if (inst->target > 0 && key->replicate_alpha) {
+ /* Set "Source0 Alpha Present to RenderTarget" bit in message
+ * header.
+ */
+ brw_OR(p,
+ vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
+ vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+ brw_imm_ud(0x1 << 11));
+ }
+
+ if (inst->target > 0) {
+ /* Set the render target index for choosing BLEND_STATE. */
+ brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+ inst->base_mrf, 2),
+ BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(inst->target));
+ }
+
+ implied_header = brw_null_reg();
+ } else {
+ implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
+ }
+
+ brw_pop_insn_state(p);
+ } else {
+ implied_header = brw_null_reg();
+ }
+
+ if (!runtime_check_aads_emit) {
+ fire_fb_write(inst, inst->base_mrf, implied_header, inst->mlen);
+ } else {
+ /* This can only happen in gen < 6 */
+ assert(brw->gen < 6);
+
+ struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+
+ /* Check runtime bit to detect if we have to send AA data or not */
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_AND(p,
+ v1_null_ud,
+ retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
+ brw_imm_ud(1<<26));
+ brw_inst_set_cond_modifier(brw, brw_last_inst, BRW_CONDITIONAL_NZ);
+
+ int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
+ brw_inst_set_exec_size(brw, brw_last_inst, BRW_EXECUTE_1);
+ {
+ /* Don't send AA data */
+ fire_fb_write(inst, inst->base_mrf+1, implied_header, inst->mlen-1);
+ }
+ brw_land_fwd_jump(p, jmp);
+ fire_fb_write(inst, inst->base_mrf, implied_header, inst->mlen);
+ }
+}
+
+void
+scalar_generator::generate_blorp_fb_write(fs_inst *inst)
+{
+ brw_fb_WRITE(p,
+ 16 /* dispatch_width */,
+ inst->base_mrf,
+ brw_reg_from_fs_reg(&inst->src[0]),
+ BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
+ inst->target,
+ inst->mlen,
+ 0,
+ true,
+ inst->header_present);
+}
+
+/* Computes the integer pixel x,y values from the origin.
+ *
+ * This is the basis of the gl_FragCoord computation, but pre-gen6 it
+ * is also used to compute the deltas from v0 for interpolation.
+ */
+void
+scalar_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
+{
+ struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+ struct brw_reg src;
+ struct brw_reg deltas;
+
+ if (is_x) {
+ src = stride(suboffset(g1_uw, 4), 2, 4, 0);
+ deltas = brw_imm_v(0x10101010);
+ } else {
+ src = stride(suboffset(g1_uw, 5), 2, 4, 0);
+ deltas = brw_imm_v(0x11001100);
+ }
+
+ if (dispatch_width == 16) {
+ dst = vec16(dst);
+ }
+
+ /* We do this SIMD8 or SIMD16, but since the destination is UW we
+ * don't do compression in the SIMD16 case.
+ */
+ brw_push_insn_state(p);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_ADD(p, dst, src, deltas);
+ brw_pop_insn_state(p);
+}
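
The two magic constants are packed immediate vectors: BRW_REGISTER_TYPE_V
holds eight signed 4-bit elements in one dword, element 0 in the lowest
nibble. A host-side decode (plain C++; the two subspan origins are
hypothetical) showing how origin + deltas yields per-pixel coordinates in
tl, tr, bl, br order:

#include <cstdio>
#include <cstdint>

/* Element i of a packed immediate vector (8 x signed 4-bit). */
static int imm_v_element(uint32_t v, int i) {
   int nib = (v >> (4 * i)) & 0xf;
   return nib >= 8 ? nib - 16 : nib;   /* sign-extend the nibble */
}

int main() {
   const uint32_t x_deltas = 0x10101010;   /* 0,1,0,1, 0,1,0,1 */
   const uint32_t y_deltas = 0x11001100;   /* 0,0,1,1, 0,0,1,1 */

   /* Hypothetical top-left origins of two 2x2 subspans, as the
    * <2;4,0> region replicates them: 4 channels each. */
   int ss_x[2] = {4, 6}, ss_y[2] = {8, 8};

   for (int ch = 0; ch < 8; ch++) {
      int px = ss_x[ch / 4] + imm_v_element(x_deltas, ch);
      int py = ss_y[ch / 4] + imm_v_element(y_deltas, ch);
      printf("channel %d -> pixel (%d, %d)\n", ch, px, py);
   }
   return 0;
}
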
+
+void
+scalar_generator::generate_linterp(fs_inst *inst,
+ struct brw_reg dst, struct brw_reg *src)
+{
+ struct brw_reg delta_x = src[0];
+ struct brw_reg delta_y = src[1];
+ struct brw_reg interp = src[2];
+
+ if (brw->has_pln &&
+ delta_y.nr == delta_x.nr + 1 &&
+ (brw->gen >= 6 || (delta_x.nr & 1) == 0)) {
+ brw_PLN(p, dst, interp, delta_x);
+ } else {
+ brw_LINE(p, brw_null_reg(), interp, delta_x);
+ brw_MAC(p, dst, suboffset(interp, 1), delta_y);
+ }
+}
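
Both paths evaluate the attribute's plane equation; PLN fuses what LINE+MAC
do in two steps. A scalar model (plain C++; coefficients a, b, c correspond
to the interp channels 0, 1 and 3 used above):

#include <cstdio>

/* value(dx, dy) = a*dx + b*dy + c for one attribute component. */
struct Plane { float a, b, c; };

static float pln(Plane pl, float dx, float dy) {
   return pl.a * dx + pl.b * dy + pl.c;   /* what PLN computes */
}

static float line_then_mac(Plane pl, float dx, float dy) {
   float acc = pl.a * dx + pl.c;   /* LINE: into the accumulator */
   return pl.b * dy + acc;         /* MAC: accumulate the second term */
}

int main() {
   Plane pl = {0.25f, -0.5f, 3.0f};    /* hypothetical coefficients */
   printf("PLN %g, LINE+MAC %g\n",
          pln(pl, 2.0f, 1.0f), line_then_mac(pl, 2.0f, 1.0f));
   return 0;
}
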
+
+void
+scalar_generator::generate_math_gen6(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{
+ int op = brw_math_function(inst->opcode);
+ bool binop = src1.file != BRW_ARCHITECTURE_REGISTER_FILE;
+
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ gen6_math(p, dst, op, src0, src1);
+
+ if (dispatch_width == 16) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ gen6_math(p, sechalf(dst), op, sechalf(src0),
+ binop ? sechalf(src1) : brw_null_reg());
+ brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ }
+}
+
+void
+scalar_generator::generate_math_gen4(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src)
+{
+ int op = brw_math_function(inst->opcode);
+
+ assert(inst->mlen >= 1);
+
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ gen4_math(p, dst,
+ op,
+ inst->base_mrf, src,
+ BRW_MATH_DATA_VECTOR,
+ BRW_MATH_PRECISION_FULL);
+
+ if (dispatch_width == 16) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ gen4_math(p, sechalf(dst),
+ op,
+ inst->base_mrf + 1, sechalf(src),
+ BRW_MATH_DATA_VECTOR,
+ BRW_MATH_PRECISION_FULL);
+
+ brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ }
+}
+
+void
+scalar_generator::generate_math_g45(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src)
+{
+ if (inst->opcode == SHADER_OPCODE_POW ||
+ inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
+ inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
+ generate_math_gen4(inst, dst, src);
+ return;
+ }
+
+ int op = brw_math_function(inst->opcode);
+
+ assert(inst->mlen >= 1);
+
+ gen4_math(p, dst,
+ op,
+ inst->base_mrf, src,
+ BRW_MATH_DATA_VECTOR,
+ BRW_MATH_PRECISION_FULL);
+}
+
+void
+scalar_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+ struct brw_reg sampler_index)
+{
+ int msg_type = -1;
+ int rlen = 4;
+ uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+ uint32_t return_format;
+
+ switch (dst.type) {
+ case BRW_REGISTER_TYPE_D:
+ return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
+ break;
+ case BRW_REGISTER_TYPE_UD:
+ return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+ break;
+ default:
+ return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+ break;
+ }
+
+ if (dispatch_width == 16 &&
+ !inst->force_uncompressed && !inst->force_sechalf)
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+
+ if (brw->gen >= 5) {
+ switch (inst->opcode) {
+ case SHADER_OPCODE_TEX:
+ if (inst->shadow_compare) {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
+ } else {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
+ }
+ break;
+ case FS_OPCODE_TXB:
+ if (inst->shadow_compare) {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
+ } else {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
+ }
+ break;
+ case SHADER_OPCODE_TXL:
+ if (inst->shadow_compare) {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
+ } else {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
+ }
+ break;
+ case SHADER_OPCODE_TXS:
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
+ break;
+ case SHADER_OPCODE_TXD:
+ if (inst->shadow_compare) {
+ /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */
+ assert(brw->gen >= 8 || brw->is_haswell);
+ msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
+ } else {
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
+ }
+ break;
+ case SHADER_OPCODE_TXF:
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+ break;
+ case SHADER_OPCODE_TXF_CMS:
+ if (brw->gen >= 7)
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
+ else
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+ break;
+ case SHADER_OPCODE_TXF_UMS:
+ assert(brw->gen >= 7);
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
+ break;
+ case SHADER_OPCODE_TXF_MCS:
+ assert(brw->gen >= 7);
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
+ break;
+ case SHADER_OPCODE_LOD:
+ msg_type = GEN5_SAMPLER_MESSAGE_LOD;
+ break;
+ case SHADER_OPCODE_TG4:
+ if (inst->shadow_compare) {
+ assert(brw->gen >= 7);
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
+ } else {
+ assert(brw->gen >= 6);
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
+ }
+ break;
+ case SHADER_OPCODE_TG4_OFFSET:
+ assert(brw->gen >= 7);
+ if (inst->shadow_compare) {
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
+ } else {
+ msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
+ }
+ break;
+ default:
+ unreachable("not reached");
+ }
+ } else {
+ switch (inst->opcode) {
+ case SHADER_OPCODE_TEX:
+         /* Note that G45 and older determine shadow compare and dispatch width
+ * from message length for most messages.
+ */
+ assert(dispatch_width == 8);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+ if (inst->shadow_compare) {
+ assert(inst->mlen == 6);
+ } else {
+ assert(inst->mlen <= 4);
+ }
+ break;
+ case FS_OPCODE_TXB:
+ if (inst->shadow_compare) {
+ assert(inst->mlen == 6);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
+ } else {
+ assert(inst->mlen == 9);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ }
+ break;
+ case SHADER_OPCODE_TXL:
+ if (inst->shadow_compare) {
+ assert(inst->mlen == 6);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
+ } else {
+ assert(inst->mlen == 9);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ }
+ break;
+ case SHADER_OPCODE_TXD:
+ /* There is no sample_d_c message; comparisons are done manually */
+ assert(inst->mlen == 7 || inst->mlen == 10);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
+ break;
+ case SHADER_OPCODE_TXF:
+ assert(inst->mlen == 9);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ break;
+ case SHADER_OPCODE_TXS:
+ assert(inst->mlen == 3);
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ break;
+ default:
+ unreachable("not reached");
+ }
+ }
+ assert(msg_type != -1);
+
+ if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
+ rlen = 8;
+ dst = vec16(dst);
+ }
+
+ if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) {
+ /* The send-from-GRF for SIMD16 texturing with a header has an extra
+ * hardware register allocated to it, which we need to skip over (since
+ * our coordinates in the payload are in the even-numbered registers,
+ * and the header comes right before the first one).
+ */
+ assert(src.file == BRW_GENERAL_REGISTER_FILE);
+ src.nr++;
+ }
+
+ assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
+
+ /* Load the message header if present. If there's a texture offset,
+ * we need to set it up explicitly and load the offset bitfield.
+ * Otherwise, we can use an implied move from g0 to the first message reg.
+ */
+ if (inst->header_present) {
+ if (brw->gen < 6 && !inst->texture_offset) {
+ /* Set up an implied move from g0 to the MRF. */
+ src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
+ } else {
+ struct brw_reg header_reg;
+
+ if (brw->gen >= 7) {
+ header_reg = src;
+ } else {
+ assert(inst->base_mrf != -1);
+ header_reg = brw_message_reg(inst->base_mrf);
+ }
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ /* Explicitly set up the message header by copying g0 to the MRF. */
+ brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
+
+ if (inst->texture_offset) {
+ /* Set the offset bits in DWord 2. */
+ brw_MOV(p, get_element_ud(header_reg, 2),
+ brw_imm_ud(inst->texture_offset));
+ }
+
+ brw_adjust_sampler_state_pointer(p, header_reg, sampler_index, dst);
+ brw_pop_insn_state(p);
+ }
+ }
+
+ uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
+ inst->opcode == SHADER_OPCODE_TG4_OFFSET)
+ ? prog_data->base.binding_table.gather_texture_start
+ : prog_data->base.binding_table.texture_start;
+
+ if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
+ uint32_t sampler = sampler_index.dw1.ud;
+
+ brw_SAMPLE(p,
+ retype(dst, BRW_REGISTER_TYPE_UW),
+ inst->base_mrf,
+ src,
+ sampler + base_binding_table_index,
+ sampler % 16,
+ msg_type,
+ rlen,
+ inst->mlen,
+ inst->header_present,
+ simd_mode,
+ return_format);
+
+ brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
+ } else {
+ /* Non-const sampler index */
+ /* Note: this clobbers `dst` as a temporary before emitting the send */
+
+ struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+ struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));
+
+ struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ /* Some care required: `sampler` and `temp` may alias:
+ * addr = sampler & 0xff
+ * temp = (sampler << 8) & 0xf00
+ * addr = addr | temp
+ */
+ brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
+ brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
+ brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
+ brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
+ brw_OR(p, addr, addr, temp);
+
+ /* a0.0 |= <descriptor> */
+ brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
+ brw_set_sampler_message(p, insn_or,
+ 0 /* surface */,
+ 0 /* sampler */,
+ msg_type,
+ rlen,
+ inst->mlen /* mlen */,
+ inst->header_present /* header */,
+ simd_mode,
+ return_format);
+ brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
+ brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
+ brw_set_src0(p, insn_or, addr);
+ brw_set_dest(p, insn_or, addr);
+
+
+ /* dst = send(offset, a0.0) */
+ brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, insn_send, dst);
+ brw_set_src0(p, insn_send, src);
+ brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
+
+ brw_pop_insn_state(p);
+
+ /* visitor knows more than we do about the surface limit required,
+ * so has already done marking.
+ */
+ }
+}
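
For the non-constant sampler case, the ALU sequence above packs two
descriptor fields into the address register. The same packing on the host
(plain C++; only the two fields built by that sequence, not a full
descriptor):

#include <cstdio>
#include <cstdint>

/* Surface index in bits 7:0, sampler index (mod 16) in bits 11:8. */
static uint32_t pack_sampler_fields(uint32_t sampler, uint32_t base_bti) {
   uint32_t addr = (sampler + base_bti) & 0x0ff;   /* ADD, then AND */
   uint32_t temp = (sampler << 8) & 0x0f00;        /* SHL, then AND */
   return addr | temp;                             /* OR */
}

int main() {
   /* Hypothetical: sampler 3 with a binding table starting at 8. */
   printf("descriptor low bits: 0x%03x\n", pack_sampler_fields(3, 8));
   return 0;
}
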
+
+
+/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
+ * looking like:
+ *
+ * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
+ *
+ * Ideally, we want to produce:
+ *
+ * DDX DDY
+ * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
+ * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
+ * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
+ * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
+ * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
+ * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
+ * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
+ * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
+ *
+ * and add another set of two more subspans if in 16-pixel dispatch mode.
+ *
+ * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
+ * for each pair, and vertstride = 2 jumps us 2 elements after processing a
+ * pair. But the ideal approximation may impose a huge performance cost on
+ * sample_d. On at least Haswell, the sample_d instruction does some
+ * optimizations if the same LOD is used for all pixels in the subspan.
+ *
+ * For DDY, we need to use ALIGN16 mode since it's capable of doing the
+ * appropriate swizzling.
+ */
+void
+scalar_generator::generate_ddx(fs_inst *inst,
+ struct brw_reg dst, struct brw_reg src,
+ struct brw_reg quality)
+{
+ unsigned vstride, width;
+ assert(quality.file == BRW_IMMEDIATE_VALUE);
+ assert(quality.type == BRW_REGISTER_TYPE_D);
+
+ int quality_value = quality.dw1.d;
+
+ if (quality_value == BRW_DERIVATIVE_FINE ||
+ (key->high_quality_derivatives && quality_value != BRW_DERIVATIVE_COARSE)) {
+ /* produce accurate derivatives */
+ vstride = BRW_VERTICAL_STRIDE_2;
+ width = BRW_WIDTH_2;
+ }
+ else {
+ /* replicate the derivative at the top-left pixel to other pixels */
+ vstride = BRW_VERTICAL_STRIDE_4;
+ width = BRW_WIDTH_4;
+ }
+
+ struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
+ BRW_REGISTER_TYPE_F,
+ vstride,
+ width,
+ BRW_HORIZONTAL_STRIDE_0,
+ BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+ struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
+ BRW_REGISTER_TYPE_F,
+ vstride,
+ width,
+ BRW_HORIZONTAL_STRIDE_0,
+ BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+ brw_ADD(p, dst, src0, negate(src1));
+}
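
Written out with channel indices, the region choices above reduce to the
following (plain C++ model; s[] holds one 2x2 subspan in tl, tr, bl, br
order):

#include <cstdio>

/* Fine DDX:   <2;2,0> regions pair each pixel with its row neighbor.
 * Coarse DDX: <4;4,0> regions replicate the top pair's derivative. */
static void ddx(const float s[4], float out[4], bool fine) {
   if (fine) {
      out[0] = out[1] = s[1] - s[0];   /* top row:    tr - tl */
      out[2] = out[3] = s[3] - s[2];   /* bottom row: br - bl */
   } else {
      for (int i = 0; i < 4; i++)
         out[i] = s[1] - s[0];         /* one value per subspan */
   }
}

int main() {
   float s[4] = {1, 4, 2, 8}, out[4];
   ddx(s, out, true);
   printf("fine:   %g %g %g %g\n", out[0], out[1], out[2], out[3]);
   ddx(s, out, false);
   printf("coarse: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
   return 0;
}
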
+
+/* The negate_value boolean is used to negate the derivative computation for
+ * FBOs, since they place the origin at the upper left instead of the lower
+ * left.
+ */
+void
+scalar_generator::generate_ddy(fs_inst *inst,
+ struct brw_reg dst, struct brw_reg src,
+ struct brw_reg quality, bool negate_value)
+{
+ assert(quality.file == BRW_IMMEDIATE_VALUE);
+ assert(quality.type == BRW_REGISTER_TYPE_D);
+
+ int quality_value = quality.dw1.d;
+
+ if (quality_value == BRW_DERIVATIVE_FINE ||
+ (key->high_quality_derivatives && quality_value != BRW_DERIVATIVE_COARSE)) {
+ /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
+ * Region Restrictions):
+ *
+ * In Align16 access mode, SIMD16 is not allowed for DW operations
+ * and SIMD8 is not allowed for DF operations.
+ *
+ * In this context, "DW operations" means "operations acting on 32-bit
+ * values", so it includes operations on floats.
+ *
+ * Gen4 has a similar restriction. From the i965 PRM, section 11.5.3
+ * (Instruction Compression -> Rules and Restrictions):
+ *
+ * A compressed instruction must be in Align1 access mode. Align16
+ * mode instructions cannot be compressed.
+ *
+ * Similar text exists in the g45 PRM.
+ *
+ * On these platforms, if we're building a SIMD16 shader, we need to
+ * manually unroll to a pair of SIMD8 instructions.
+ */
+ bool unroll_to_simd8 =
+ (dispatch_width == 16 &&
+ (brw->gen == 4 || (brw->gen == 7 && !brw->is_haswell)));
+
+ /* produce accurate derivatives */
+ struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_4,
+ BRW_WIDTH_4,
+ BRW_HORIZONTAL_STRIDE_1,
+ BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
+ struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_4,
+ BRW_WIDTH_4,
+ BRW_HORIZONTAL_STRIDE_1,
+ BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ if (unroll_to_simd8)
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ if (negate_value)
+ brw_ADD(p, dst, src1, negate(src0));
+ else
+ brw_ADD(p, dst, src0, negate(src1));
+ if (unroll_to_simd8) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ src0 = sechalf(src0);
+ src1 = sechalf(src1);
+ dst = sechalf(dst);
+ if (negate_value)
+ brw_ADD(p, dst, src1, negate(src0));
+ else
+ brw_ADD(p, dst, src0, negate(src1));
+ }
+ brw_pop_insn_state(p);
+ } else {
+ /* replicate the derivative at the top-left pixel to other pixels */
+ struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_4,
+ BRW_WIDTH_4,
+ BRW_HORIZONTAL_STRIDE_0,
+ BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+ struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_4,
+ BRW_WIDTH_4,
+ BRW_HORIZONTAL_STRIDE_0,
+ BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+ if (negate_value)
+ brw_ADD(p, dst, src1, negate(src0));
+ else
+ brw_ADD(p, dst, src0, negate(src1));
+ }
+}
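
The DDY counterpart, including the negate_value flip: with an FBO bound the
Y axis points down, so the derivative's sign has to be inverted (plain C++
model, same subspan layout as the DDX sketch):

#include <cstdio>

/* Fine DDY: the XYXY/ZWZW swizzles subtract the bottom row from the
 * top row per column. Coarse DDY replicates tl - bl. */
static void ddy(const float s[4], float out[4], bool fine, bool negate) {
   float d0 = s[0] - s[2];                 /* tl - bl */
   float d1 = fine ? s[1] - s[3] : d0;     /* tr - br, or replicated */
   if (negate) { d0 = -d0; d1 = -d1; }     /* render_to_fbo */
   out[0] = out[2] = d0;
   out[1] = out[3] = d1;
}

int main() {
   float s[4] = {1, 4, 2, 8}, out[4];
   ddy(s, out, true, false);
   printf("window system: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
   ddy(s, out, true, true);
   printf("FBO:           %g %g %g %g\n", out[0], out[1], out[2], out[3]);
   return 0;
}
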
+
+void
+scalar_generator::generate_discard_jump(fs_inst *inst)
+{
+ assert(brw->gen >= 6);
+
+ /* This HALT will be patched up at FB write time to point UIP at the end of
+ * the program, and at brw_uip_jip() JIP will be set to the end of the
+ * current block (or the program).
+ */
+ this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ gen6_HALT(p);
+ brw_pop_insn_state(p);
+}
+
+void
+scalar_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
+{
+ assert(inst->mlen != 0);
+
+ brw_MOV(p,
+ retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
+ retype(src, BRW_REGISTER_TYPE_UD));
+ brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
+ dispatch_width / 8, inst->offset);
+}
+
+void
+scalar_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
+{
+ assert(inst->mlen != 0);
+
+ brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
+ dispatch_width / 8, inst->offset);
+}
+
+void
+scalar_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
+{
+ gen7_block_read_scratch(p, dst, dispatch_width / 8, inst->offset);
+}
+
+void
+scalar_generator::generate_uniform_pull_constant_load(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg index,
+ struct brw_reg offset)
+{
+ assert(inst->mlen != 0);
+
+ assert(index.file == BRW_IMMEDIATE_VALUE &&
+ index.type == BRW_REGISTER_TYPE_UD);
+ uint32_t surf_index = index.dw1.ud;
+
+ assert(offset.file == BRW_IMMEDIATE_VALUE &&
+ offset.type == BRW_REGISTER_TYPE_UD);
+ uint32_t read_offset = offset.dw1.ud;
+
+ brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
+ read_offset, surf_index);
+
+ brw_mark_surface_used(&prog_data->base, surf_index);
+}
+
+void
+scalar_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg index,
+ struct brw_reg offset)
+{
+ assert(inst->mlen == 0);
+ assert(index.type == BRW_REGISTER_TYPE_UD);
+
+ assert(offset.file == BRW_GENERAL_REGISTER_FILE);
+ /* Reference just the dword we need, to avoid angering validate_reg(). */
+ offset = brw_vec1_grf(offset.nr, 0);
+
+ /* We use the SIMD4x2 mode because we want to end up with 4 components in
+ * the destination loaded consecutively from the same offset (which appears
+ * in the first component, and the rest are ignored).
+ */
+ dst.width = BRW_WIDTH_4;
+
+ if (index.file == BRW_IMMEDIATE_VALUE) {
+
+ uint32_t surf_index = index.dw1.ud;
+
+ brw_push_insn_state(p);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_pop_insn_state(p);
+
+ brw_set_dest(p, send, dst);
+ brw_set_src0(p, send, offset);
+ brw_set_sampler_message(p, send,
+ surf_index,
+ 0, /* LD message ignores sampler unit */
+ GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+ 1, /* rlen */
+ 1, /* mlen */
+ false, /* no header */
+ BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+ 0);
+
+ brw_mark_surface_used(&prog_data->base, surf_index);
+
+ } else {
+
+ struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ /* a0.0 = surf_index & 0xff */
+ brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
+ brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
+ brw_set_dest(p, insn_and, addr);
+ brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
+ brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
+
+
+ /* a0.0 |= <descriptor> */
+ brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
+ brw_set_sampler_message(p, insn_or,
+ 0 /* surface */,
+ 0 /* sampler */,
+ GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+ 1 /* rlen */,
+ 1 /* mlen */,
+ false /* header */,
+ BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+ 0);
+ brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
+ brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
+ brw_set_src0(p, insn_or, addr);
+ brw_set_dest(p, insn_or, addr);
+
+
+ /* dst = send(offset, a0.0) */
+ brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, insn_send, dst);
+ brw_set_src0(p, insn_send, offset);
+ brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
+
+ brw_pop_insn_state(p);
+
+ /* visitor knows more than we do about the surface limit required,
+ * so has already done marking.
+ */
+
+ }
+}
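
Functionally, the SIMD4x2 LD message behaves like a vec4 texel fetch from
the constant surface: the offset in the payload's first component selects
one texel, and its four components land in dst.xyzw. A CPU model of that
contract (plain C++; the vec4-texel view matches how the surface is set up,
the contents are hypothetical):

#include <cstdio>

int main() {
   /* Constant surface as an array of vec4 texels (hypothetical data). */
   float buffer[4][4];
   for (int t = 0; t < 4; t++)
      for (int c = 0; c < 4; c++)
         buffer[t][c] = t + c * 0.1f;

   /* SIMD4x2 LD: one offset, four consecutive components returned. */
   int offset = 2;
   const float *dst = buffer[offset];
   printf("dst = {%g, %g, %g, %g}\n", dst[0], dst[1], dst[2], dst[3]);
   return 0;
}
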
+
+void
+scalar_generator::generate_varying_pull_constant_load(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg index,
+ struct brw_reg offset)
+{
+ assert(brw->gen < 7); /* Should use the gen7 variant. */
+ assert(inst->header_present);
+ assert(inst->mlen);
+
+ assert(index.file == BRW_IMMEDIATE_VALUE &&
+ index.type == BRW_REGISTER_TYPE_UD);
+ uint32_t surf_index = index.dw1.ud;
+
+ uint32_t simd_mode, rlen, msg_type;
+ if (dispatch_width == 16) {
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ rlen = 8;
+ } else {
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+ rlen = 4;
+ }
+
+ if (brw->gen >= 5)
+ msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+ else {
+ /* We always use the SIMD16 message so that we only have to load U, and
+ * not V or R.
+ */
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
+ assert(inst->mlen == 3);
+ assert(inst->regs_written == 8);
+ rlen = 8;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ }
+
+ struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
+ BRW_REGISTER_TYPE_D);
+ brw_MOV(p, offset_mrf, offset);
+
+ struct brw_reg header = brw_vec8_grf(0, 0);
+ gen6_resolve_implied_move(p, &header, inst->base_mrf);
+
+ brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_inst_set_qtr_control(brw, send, BRW_COMPRESSION_NONE);
+ brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
+ brw_set_src0(p, send, header);
+ if (brw->gen < 6)
+ brw_inst_set_base_mrf(brw, send, inst->base_mrf);
+
+ /* Our surface is set up as floats, regardless of what actual data is
+ * stored in it.
+ */
+ uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+ brw_set_sampler_message(p, send,
+ surf_index,
+ 0, /* sampler (unused) */
+ msg_type,
+ rlen,
+ inst->mlen,
+ inst->header_present,
+ simd_mode,
+ return_format);
+
+ brw_mark_surface_used(&prog_data->base, surf_index);
+}
+
+void
+scalar_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg index,
+ struct brw_reg offset)
+{
+ assert(brw->gen >= 7);
+ /* Varying-offset pull constant loads are treated as a normal expression on
+ * gen7, so the fact that it's a send message is hidden at the IR level.
+ */
+ assert(!inst->header_present);
+ assert(!inst->mlen);
+ assert(index.type == BRW_REGISTER_TYPE_UD);
+
+ uint32_t simd_mode, rlen, mlen;
+ if (dispatch_width == 16) {
+ mlen = 2;
+ rlen = 8;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+ } else {
+ mlen = 1;
+ rlen = 4;
+ simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+ }
+
+ if (index.file == BRW_IMMEDIATE_VALUE) {
+
+ uint32_t surf_index = index.dw1.ud;
+
+ brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, send, dst);
+ brw_set_src0(p, send, offset);
+ brw_set_sampler_message(p, send,
+ surf_index,
+ 0, /* LD message ignores sampler unit */
+ GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+ rlen,
+ mlen,
+ false, /* no header */
+ simd_mode,
+ 0);
+
+ brw_mark_surface_used(&prog_data->base, surf_index);
+
+ } else {
+
+ struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+ /* a0.0 = surf_index & 0xff */
+ brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
+ brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
+ brw_set_dest(p, insn_and, addr);
+ brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
+ brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
+
+
+ /* a0.0 |= <descriptor> */
+ brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
+ brw_set_sampler_message(p, insn_or,
+ 0 /* surface */,
+ 0 /* sampler */,
+ GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+ rlen /* rlen */,
+ mlen /* mlen */,
+ false /* header */,
+ simd_mode,
+ 0);
+ brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
+ brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
+ brw_set_src0(p, insn_or, addr);
+ brw_set_dest(p, insn_or, addr);
+
+
+ /* dst = send(offset, a0.0) */
+ brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
+ brw_set_dest(p, insn_send, dst);
+ brw_set_src0(p, insn_send, offset);
+ brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
+
+ brw_pop_insn_state(p);
+
+ /* visitor knows more than we do about the surface limit required,
+ * so has already done marking.
+ */
+ }
+}
+
+/**
+ * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
+ * into the flags register (f0.0).
+ *
+ * Used only on Gen6 and above.
+ */
+void
+scalar_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
+{
+ struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
+ struct brw_reg dispatch_mask;
+
+ if (brw->gen >= 6)
+ dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
+ else
+ dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, flags, dispatch_mask);
+ brw_pop_insn_state(p);
+}
+
+void
+scalar_generator::generate_pixel_interpolator_query(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src,
+ struct brw_reg msg_data,
+ unsigned msg_type)
+{
+ assert(msg_data.file == BRW_IMMEDIATE_VALUE &&
+ msg_data.type == BRW_REGISTER_TYPE_UD);
+
+ brw_pixel_interpolator_query(p,
+ retype(dst, BRW_REGISTER_TYPE_UW),
+ src,
+ inst->pi_noperspective,
+ msg_type,
+ msg_data.dw1.ud,
+ inst->mlen,
+ inst->regs_written);
+}
+
+
+static uint32_t brw_file_from_reg(fs_reg *reg)
+{
+ switch (reg->file) {
+ case GRF:
+ return BRW_GENERAL_REGISTER_FILE;
+ case MRF:
+ return BRW_MESSAGE_REGISTER_FILE;
+ case IMM:
+ return BRW_IMMEDIATE_VALUE;
+ default:
+ unreachable("not reached");
+ }
+}
+
+struct brw_reg
+brw_reg_from_fs_reg(fs_reg *reg)
+{
+ struct brw_reg brw_reg;
+
+ switch (reg->file) {
+ case GRF:
+ case MRF:
+ if (reg->stride == 0) {
+ brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
+ } else {
+ brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
+ brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
+ }
+
+ brw_reg = retype(brw_reg, reg->type);
+ brw_reg = byte_offset(brw_reg, reg->subreg_offset);
+ break;
+ case IMM:
+ switch (reg->type) {
+ case BRW_REGISTER_TYPE_F:
+ brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
+ break;
+ case BRW_REGISTER_TYPE_D:
+ brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d);
+ break;
+ case BRW_REGISTER_TYPE_UD:
+ brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud);
+ break;
+ default:
+ unreachable("not reached");
+ }
+ break;
+ case HW_REG:
+ assert(reg->type == reg->fixed_hw_reg.type);
+ brw_reg = reg->fixed_hw_reg;
+ break;
+ case BAD_FILE:
+ /* Probably unused. */
+ brw_reg = brw_null_reg();
+ break;
+ case UNIFORM:
+ unreachable("not reached");
+ default:
+ unreachable("not reached");
+ }
+ if (reg->abs)
+ brw_reg = brw_abs(brw_reg);
+ if (reg->negate)
+ brw_reg = negate(brw_reg);
+
+ return brw_reg;
+}
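
The stride handling above is the subtle part: a stride-0 fs_reg becomes a
<0;1,0> scalar region, anything else a <8*stride;8,stride> region.
Evaluating the standard region formula shows why the latter makes channel i
read element i*stride (plain C++):

#include <cstdio>

/* Element read by channel i of a <vstride; width, hstride> region. */
static int region_element(int i, int vstride, int width, int hstride) {
   return (i / width) * vstride + (i % width) * hstride;
}

int main() {
   for (int stride = 1; stride <= 2; stride++) {
      printf("stride %d:", stride);
      for (int i = 0; i < 8; i++)
         printf(" %d", region_element(i, 8 * stride, 8, stride));
      printf("\n");   /* channel i reads element i*stride */
   }
   return 0;
}
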
+
+/**
+ * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
+ * sampler LD messages.
+ *
+ * We don't want to bake it into the send message's code generation because
+ * that means we don't get a chance to schedule the instructions.
+ */
+void
+scalar_generator::generate_set_simd4x2_offset(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg value)
+{
+ assert(value.file == BRW_IMMEDIATE_VALUE);
+
+ brw_push_insn_state(p);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
+ brw_pop_insn_state(p);
+}
+
+/* Sets the region of register mask to vstride=16, width=8, hstride=2
+ * (or vstride=0, width=1, hstride=0 when the mask is passed as a
+ * uniform) before moving it to register dst.
+ */
+void
+scalar_generator::generate_set_omask(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg mask)
+{
+ bool stride_8_8_1 =
+ (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
+ mask.width == BRW_WIDTH_8 &&
+ mask.hstride == BRW_HORIZONTAL_STRIDE_1);
+
+ bool stride_0_1_0 =
+ (mask.vstride == BRW_VERTICAL_STRIDE_0 &&
+ mask.width == BRW_WIDTH_1 &&
+ mask.hstride == BRW_HORIZONTAL_STRIDE_0);
+
+ assert(stride_8_8_1 || stride_0_1_0);
+ assert(dst.type == BRW_REGISTER_TYPE_UW);
+
+ if (dispatch_width == 16)
+ dst = vec16(dst);
+ brw_push_insn_state(p);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+ if (stride_8_8_1) {
+ brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
+ } else if (stride_0_1_0) {
+ brw_MOV(p, dst, retype(mask, dst.type));
+ }
+ brw_pop_insn_state(p);
+}
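
The <16;8,2> word region in the stride_8_8_1 path can be checked with the
same region-walk formula: it visits words 0, 2, 4, ..., i.e. the low word
of each dword of the mask (plain C++; little-endian host and hypothetical
mask values assumed):

#include <cstdio>
#include <cstdint>
#include <cstring>

static int region_element(int i, int vstride, int width, int hstride) {
   return (i / width) * vstride + (i % width) * hstride;
}

int main() {
   /* Mask as dwords; the region views it as words. */
   uint32_t mask_dwords[8] = {0xaaaa0001, 0xbbbb0002, 0xcccc0003,
                              0xdddd0004, 5, 6, 7, 8};
   uint16_t words[16];
   memcpy(words, mask_dwords, sizeof(words));   /* little-endian view */

   for (int ch = 0; ch < 8; ch++) {
      int e = region_element(ch, 16, 8, 2);     /* 0, 2, 4, ... */
      printf("channel %d reads word %2d = 0x%04x\n", ch, e, words[e]);
   }
   return 0;
}
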
+
+/* Sets the region of register src1 to vstride=1, width=4, hstride=0
+ * for the ADD instruction.
+ */
+void
+scalar_generator::generate_set_sample_id(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{
+ assert(dst.type == BRW_REGISTER_TYPE_D ||
+ dst.type == BRW_REGISTER_TYPE_UD);
+ assert(src0.type == BRW_REGISTER_TYPE_D ||
+ src0.type == BRW_REGISTER_TYPE_UD);
+
+ brw_push_insn_state(p);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+ struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
+ brw_ADD(p, dst, src0, reg);
+ if (dispatch_width == 16)
+ brw_ADD(p, offset(dst, 1), offset(src0, 1), suboffset(reg, 2));
+ brw_pop_insn_state(p);
+}
+
+/**
+ * Change the register's data type from UD to W, doubling the strides in order
+ * to compensate for halving the data type width.
+ */
+static struct brw_reg
+ud_reg_to_w(struct brw_reg r)
+{
+ assert(r.type == BRW_REGISTER_TYPE_UD);
+ r.type = BRW_REGISTER_TYPE_W;
+
+ /* The BRW_*_STRIDE enums are defined so that incrementing the field
+ * doubles the real stride.
+ */
+ if (r.hstride != 0)
+ ++r.hstride;
+ if (r.vstride != 0)
+ ++r.vstride;
+
+ return r;
+}
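
The increment-to-double trick relies on the stride fields being
log2-encoded: field 0 means stride 0, otherwise the element stride is
1 << (field - 1). A quick check (plain C++):

#include <cstdio>

/* HW stride encoding: 0 -> stride 0, n -> stride 1 << (n - 1). */
static int real_stride(int field) {
   return field == 0 ? 0 : 1 << (field - 1);
}

int main() {
   /* Incrementing a nonzero field doubles the stride -- exactly what
    * retyping 4-byte UD elements as 2-byte W elements requires. */
   for (int f = 1; f <= 3; f++)
      printf("field %d: stride %d -> incremented: %d\n",
             f, real_stride(f), real_stride(f + 1));
   return 0;
}
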
+
+void
+scalar_generator::generate_pack_half_2x16_split(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg x,
+ struct brw_reg y)
+{
+ assert(brw->gen >= 7);
+ assert(dst.type == BRW_REGISTER_TYPE_UD);
+ assert(x.type == BRW_REGISTER_TYPE_F);
+ assert(y.type == BRW_REGISTER_TYPE_F);
+
+ /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+ *
+ * Because this instruction does not have a 16-bit floating-point type,
+ * the destination data type must be Word (W).
+ *
+ * The destination must be DWord-aligned and specify a horizontal stride
+ * (HorzStride) of 2. The 16-bit result is stored in the lower word of
+ * each destination channel and the upper word is not modified.
+ */
+ struct brw_reg dst_w = ud_reg_to_w(dst);
+
+   /* Give each 32-bit channel of dst the form below, where "." means
+ * unchanged.
+ * 0x....hhhh
+ */
+ brw_F32TO16(p, dst_w, y);
+
+ /* Now the form:
+ * 0xhhhh0000
+ */
+ brw_SHL(p, dst, dst, brw_imm_ud(16u));
+
+ /* And, finally the form of packHalf2x16's output:
+ * 0xhhhhllll
+ */
+ brw_F32TO16(p, dst_w, x);
+}
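
The three-step dance above is easy to verify on one 32-bit channel. A
scalar model (plain C++; the float-to-half helper is a simplified stand-in
for F32TO16 that ignores NaN, denormal and rounding subtleties):

#include <cstdio>
#include <cstdint>
#include <cstring>

/* Simplified float -> half: normals only, truncating the mantissa. */
static uint16_t f32to16(float f) {
   uint32_t x; memcpy(&x, &f, sizeof(x));
   uint16_t sign = (x >> 16) & 0x8000;
   int exp = ((x >> 23) & 0xff) - 127 + 15;
   if (exp <= 0) return sign;                  /* flush to zero */
   if (exp >= 31) return sign | 0x7c00;        /* overflow to inf */
   return sign | (uint16_t)(exp << 10) | ((x >> 13) & 0x3ff);
}

int main() {
   float x = 1.5f, y = -2.0f;
   uint32_t dst = 0xdeadbeef;                  /* pre-existing garbage */

   dst = (dst & 0xffff0000u) | f32to16(y);     /* F32TO16: 0x....hhhh */
   dst <<= 16;                                 /* SHL 16:  0xhhhh0000 */
   dst = (dst & 0xffff0000u) | f32to16(x);     /* F32TO16: 0xhhhhllll */

   printf("packHalf2x16(%g, %g) = 0x%08x\n", x, y, dst); /* 0xc0003e00 */
   return 0;
}
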
+
+void
+scalar_generator::generate_unpack_half_2x16_split(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src)
+{
+ assert(brw->gen >= 7);
+ assert(dst.type == BRW_REGISTER_TYPE_F);
+ assert(src.type == BRW_REGISTER_TYPE_UD);
+
+ /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+ *
+ * Because this instruction does not have a 16-bit floating-point type,
+ * the source data type must be Word (W). The destination type must be
+ * F (Float).
+ */
+ struct brw_reg src_w = ud_reg_to_w(src);
+
+ /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
+ * For the Y case, we wish to access only the upper word; therefore
+ * a 16-bit subregister offset is needed.
+ */
+ assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
+ inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
+ if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
+ src_w.subnr += 2;
+
+ brw_F16TO32(p, dst, src_w);
+}
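
And the inverse: the subnr += 2 for the Y case is a 2-byte hop to the high
word. A matching model (plain C++; the half-to-float helper again covers
normals only, and a little-endian host is assumed for the word view):

#include <cstdio>
#include <cstdint>
#include <cstring>

/* Simplified half -> float: normals only (zero for exp == 0). */
static float f16to32(uint16_t h) {
   uint32_t sign = (uint32_t)(h & 0x8000) << 16;
   int exp = (h >> 10) & 0x1f;
   uint32_t bits = sign;
   if (exp != 0)
      bits |= (uint32_t)(exp - 15 + 127) << 23 | (uint32_t)(h & 0x3ff) << 13;
   float f; memcpy(&f, &bits, sizeof(f));
   return f;
}

int main() {
   uint32_t src = 0xc0003e00;   /* packHalf2x16(1.5, -2.0) from above */
   uint16_t words[2];
   memcpy(words, &src, sizeof(words));
   /* SPLIT_X reads word 0; SPLIT_Y's subnr += 2 selects word 1. */
   printf("x = %g, y = %g\n", f16to32(words[0]), f16to32(words[1]));
   return 0;
}
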
+
+void
+scalar_generator::generate_shader_time_add(fs_inst *inst,
+ struct brw_reg payload,
+ struct brw_reg offset,
+ struct brw_reg value)
+{
+ assert(brw->gen >= 7);
+ brw_push_insn_state(p);
+ brw_set_default_mask_control(p, true);
+
+ assert(payload.file == BRW_GENERAL_REGISTER_FILE);
+ struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
+ offset.type);
+ struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
+ value.type);
+
+ assert(offset.file == BRW_IMMEDIATE_VALUE);
+ if (value.file == BRW_GENERAL_REGISTER_FILE) {
+ value.width = BRW_WIDTH_1;
+ value.hstride = BRW_HORIZONTAL_STRIDE_0;
+ value.vstride = BRW_VERTICAL_STRIDE_0;
+ } else {
+ assert(value.file == BRW_IMMEDIATE_VALUE);
+ }
+
+ /* Trying to deal with setup of the params from the IR is crazy in the FS8
+ * case, and we don't really care about squeezing every bit of performance
+ * out of this path, so we just emit the MOVs from here.
+ */
+ brw_MOV(p, payload_offset, offset);
+ brw_MOV(p, payload_value, value);
+ brw_shader_time_add(p, payload,
+ prog_data->base.binding_table.shader_time_start);
+ brw_pop_insn_state(p);
+
+ brw_mark_surface_used(&prog_data->base,
+ prog_data->base.binding_table.shader_time_start);
+}
+
+void
+scalar_generator::generate_untyped_atomic(fs_inst *inst, struct brw_reg dst,
+ struct brw_reg atomic_op,
+ struct brw_reg surf_index)
+{
+ assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
+ atomic_op.type == BRW_REGISTER_TYPE_UD &&
+ surf_index.file == BRW_IMMEDIATE_VALUE &&
+ surf_index.type == BRW_REGISTER_TYPE_UD);
+
+ brw_untyped_atomic(p, dst, brw_message_reg(inst->base_mrf),
+ atomic_op.dw1.ud, surf_index.dw1.ud,
+ inst->mlen, dispatch_width / 8);
+
+ brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
+}
+
+void
+scalar_generator::generate_untyped_surface_read(fs_inst *inst, struct brw_reg dst,
+ struct brw_reg surf_index)
+{
+ assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
+ surf_index.type == BRW_REGISTER_TYPE_UD);
+
+ brw_untyped_surface_read(p, dst, brw_message_reg(inst->base_mrf),
+ surf_index.dw1.ud,
+ inst->mlen, dispatch_width / 8);
+
+ brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
+}
+
+void
+scalar_generator::generate_code(const cfg_t *cfg)
+{
+ int start_offset = p->next_insn_offset;
+
+ struct annotation_info annotation;
+ memset(&annotation, 0, sizeof(annotation));
+
+ foreach_block_and_inst (block, fs_inst, inst, cfg) {
+ struct brw_reg src[3], dst;
+ unsigned int last_insn_offset = p->next_insn_offset;
+
+ if (unlikely(debug_flag))
+ annotate(brw, &annotation, cfg, inst, p->next_insn_offset);
+
+ for (unsigned int i = 0; i < inst->sources; i++) {
+ src[i] = brw_reg_from_fs_reg(&inst->src[i]);
+
+ /* The accumulator result appears to get used for the
+ * conditional modifier generation. When negating a UD
+ * value, there is a 33rd bit generated for the sign in the
+ * accumulator value, so now you can't check, for example,
+ * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
+ */
+ assert(!inst->conditional_mod ||
+ inst->src[i].type != BRW_REGISTER_TYPE_UD ||
+ !inst->src[i].negate);
+ }
+ dst = brw_reg_from_fs_reg(&inst->dst);
+
+ brw_set_default_predicate_control(p, inst->predicate);
+ brw_set_default_predicate_inverse(p, inst->predicate_inverse);
+ brw_set_default_flag_reg(p, 0, inst->flag_subreg);
+ brw_set_default_saturate(p, inst->saturate);
+ brw_set_default_mask_control(p, inst->force_writemask_all);
+ brw_set_default_acc_write_control(p, inst->writes_accumulator);
+
+ if (inst->force_uncompressed || dispatch_width == 8) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ } else if (inst->force_sechalf) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ } else {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ }
+
+ switch (inst->opcode) {
+ case BRW_OPCODE_MOV:
+ brw_MOV(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_ADD:
+ brw_ADD(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_MUL:
+ brw_MUL(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_AVG:
+ brw_AVG(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_MACH:
+ brw_MACH(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_MAD:
+ assert(brw->gen >= 6);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_MAD(p, dst, src[0], src[1], src[2]);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
+ brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ } else {
+ brw_MAD(p, dst, src[0], src[1], src[2]);
+ }
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ break;
+
+ case BRW_OPCODE_LRP:
+ assert(brw->gen >= 6);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_LRP(p, dst, src[0], src[1], src[2]);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
+ brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ } else {
+ brw_LRP(p, dst, src[0], src[1], src[2]);
+ }
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ break;
+
+ case BRW_OPCODE_FRC:
+ brw_FRC(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_RNDD:
+ brw_RNDD(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_RNDE:
+ brw_RNDE(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_RNDZ:
+ brw_RNDZ(p, dst, src[0]);
+ break;
+
+ case BRW_OPCODE_AND:
+ brw_AND(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_OR:
+ brw_OR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_XOR:
+ brw_XOR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_NOT:
+ brw_NOT(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_ASR:
+ brw_ASR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SHR:
+ brw_SHR(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SHL:
+ brw_SHL(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_F32TO16:
+ assert(brw->gen >= 7);
+ brw_F32TO16(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_F16TO32:
+ assert(brw->gen >= 7);
+ brw_F16TO32(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_CMP:
+ brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SEL:
+ brw_SEL(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_BFREV:
+ assert(brw->gen >= 7);
+ /* BFREV only supports UD type for src and dst. */
+ brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
+ retype(src[0], BRW_REGISTER_TYPE_UD));
+ break;
+ case BRW_OPCODE_FBH:
+ assert(brw->gen >= 7);
+ /* FBH only supports UD type for dst. */
+ brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+ break;
+ case BRW_OPCODE_FBL:
+ assert(brw->gen >= 7);
+ /* FBL only supports UD type for dst. */
+ brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+ break;
+ case BRW_OPCODE_CBIT:
+ assert(brw->gen >= 7);
+ /* CBIT only supports UD type for dst. */
+ brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+ break;
+ case BRW_OPCODE_ADDC:
+ assert(brw->gen >= 7);
+ brw_ADDC(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_SUBB:
+ assert(brw->gen >= 7);
+ brw_SUBB(p, dst, src[0], src[1]);
+ break;
+ case BRW_OPCODE_MAC:
+ brw_MAC(p, dst, src[0], src[1]);
+ break;
+
+ case BRW_OPCODE_BFE:
+ assert(brw->gen >= 7);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_BFE(p, dst, src[0], src[1], src[2]);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
+ brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ } else {
+ brw_BFE(p, dst, src[0], src[1], src[2]);
+ }
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ break;
+
+ case BRW_OPCODE_BFI1:
+ assert(brw->gen >= 7);
+ /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
+ * should
+ *
+ * "Force BFI instructions to be executed always in SIMD8."
+ */
+ if (dispatch_width == 16 && brw->is_haswell) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_BFI1(p, dst, src[0], src[1]);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]));
+ brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ } else {
+ brw_BFI1(p, dst, src[0], src[1]);
+ }
+ break;
+ case BRW_OPCODE_BFI2:
+ assert(brw->gen >= 7);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
+ * should
+ *
+ * "Force BFI instructions to be executed always in SIMD8."
+ *
+ * Otherwise we would be able to emit compressed instructions like we
+ * do for the other three-source instructions.
+ */
+ if (dispatch_width == 16) {
+ brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_BFI2(p, dst, src[0], src[1], src[2]);
+ brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
+ brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+ } else {
+ brw_BFI2(p, dst, src[0], src[1], src[2]);
+ }
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ break;
+
+ case BRW_OPCODE_IF:
+ if (inst->src[0].file != BAD_FILE) {
+ /* The instruction has an embedded compare (only allowed on gen6) */
+ assert(brw->gen == 6);
+ gen6_IF(p, inst->conditional_mod, src[0], src[1]);
+ } else {
+ brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
+ }
+ break;
+
+ case BRW_OPCODE_ELSE:
+ brw_ELSE(p);
+ break;
+ case BRW_OPCODE_ENDIF:
+ brw_ENDIF(p);
+ break;
+
+ case BRW_OPCODE_DO:
+ brw_DO(p, BRW_EXECUTE_8);
+ break;
+
+ case BRW_OPCODE_BREAK:
+ brw_BREAK(p);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case BRW_OPCODE_CONTINUE:
+ brw_CONT(p);
+ brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+
+ case BRW_OPCODE_WHILE:
+ brw_WHILE(p);
+ break;
+
+ case SHADER_OPCODE_RCP:
+ case SHADER_OPCODE_RSQ:
+ case SHADER_OPCODE_SQRT:
+ case SHADER_OPCODE_EXP2:
+ case SHADER_OPCODE_LOG2:
+ case SHADER_OPCODE_SIN:
+ case SHADER_OPCODE_COS:
+ assert(brw->gen < 6 || inst->mlen == 0);
+ if (brw->gen >= 7) {
+ gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
+ brw_null_reg());
+ } else if (brw->gen == 6) {
+ generate_math_gen6(inst, dst, src[0], brw_null_reg());
+ } else if (brw->gen == 5 || brw->is_g4x) {
+ generate_math_g45(inst, dst, src[0]);
+ } else {
+ generate_math_gen4(inst, dst, src[0]);
+ }
+ break;
+ case SHADER_OPCODE_INT_QUOTIENT:
+ case SHADER_OPCODE_INT_REMAINDER:
+ case SHADER_OPCODE_POW:
+ assert(brw->gen < 6 || inst->mlen == 0);
+ if (brw->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) {
+ gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
+ } else if (brw->gen >= 6) {
+ generate_math_gen6(inst, dst, src[0], src[1]);
+ } else {
+ generate_math_gen4(inst, dst, src[0]);
+ }
+ break;
+ case FS_OPCODE_PIXEL_X:
+ generate_pixel_xy(dst, true);
+ break;
+ case FS_OPCODE_PIXEL_Y:
+ generate_pixel_xy(dst, false);
+ break;
+ case FS_OPCODE_CINTERP:
+ brw_MOV(p, dst, src[0]);
+ break;
+ case FS_OPCODE_LINTERP:
+ generate_linterp(inst, dst, src);
+ break;
+ case SHADER_OPCODE_TEX:
+ case FS_OPCODE_TXB:
+ case SHADER_OPCODE_TXD:
+ case SHADER_OPCODE_TXF:
+ case SHADER_OPCODE_TXF_CMS:
+ case SHADER_OPCODE_TXF_UMS:
+ case SHADER_OPCODE_TXF_MCS:
+ case SHADER_OPCODE_TXL:
+ case SHADER_OPCODE_TXS:
+ case SHADER_OPCODE_LOD:
+ case SHADER_OPCODE_TG4:
+ case SHADER_OPCODE_TG4_OFFSET:
+ generate_tex(inst, dst, src[0], src[1]);
+ break;
+ case FS_OPCODE_DDX:
+ generate_ddx(inst, dst, src[0], src[1]);
+ break;
+ case FS_OPCODE_DDY:
+ /* Make sure the fp->UsesDFdy flag was set (otherwise there is no
+ * guarantee that key->render_to_fbo is set).
+ */
+ assert(fp->UsesDFdy);
+ generate_ddy(inst, dst, src[0], src[1], key->render_to_fbo);
+ break;
+
+ case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+ generate_scratch_write(inst, src[0]);
+ break;
+
+ case SHADER_OPCODE_GEN4_SCRATCH_READ:
+ generate_scratch_read(inst, dst);
+ break;
+
+ case SHADER_OPCODE_GEN7_SCRATCH_READ:
+ generate_scratch_read_gen7(inst, dst);
+ break;
+
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+ generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+ generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
+ generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+ generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_REP_FB_WRITE:
+ case FS_OPCODE_FB_WRITE:
+ generate_fb_write(inst);
+ break;
+
+ case FS_OPCODE_BLORP_FB_WRITE:
+ generate_blorp_fb_write(inst);
+ break;
+
+ case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
+ generate_mov_dispatch_to_flags(inst);
+ break;
+
+ case FS_OPCODE_DISCARD_JUMP:
+ generate_discard_jump(inst);
+ break;
+
+ case SHADER_OPCODE_SHADER_TIME_ADD:
+ generate_shader_time_add(inst, src[0], src[1], src[2]);
+ break;
+
+ case SHADER_OPCODE_UNTYPED_ATOMIC:
+ generate_untyped_atomic(inst, dst, src[0], src[1]);
+ break;
+
+ case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+ generate_untyped_surface_read(inst, dst, src[0]);
+ break;
+
+ case FS_OPCODE_SET_SIMD4X2_OFFSET:
+ generate_set_simd4x2_offset(inst, dst, src[0]);
+ break;
+
+ case FS_OPCODE_SET_OMASK:
+ generate_set_omask(inst, dst, src[0]);
+ break;
+
+ case FS_OPCODE_SET_SAMPLE_ID:
+ generate_set_sample_id(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+ generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+ case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+ generate_unpack_half_2x16_split(inst, dst, src[0]);
+ break;
+
+ case FS_OPCODE_PLACEHOLDER_HALT:
+ /* This is the place where the final HALT needs to be inserted if
+ * we've emitted any discards. If not, this will emit no code.
+ */
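+ /* If no HALT was emitted, drop the annotation that was reserved for
+ * this IR instruction so the disassembly annotations stay in sync.
+ */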
+ if (!patch_discard_jumps_to_fb_writes()) {
+ if (unlikely(debug_flag)) {
+ annotation.ann_count--;
+ }
+ }
+ break;
+
+ case FS_OPCODE_INTERPOLATE_AT_CENTROID:
+ generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+ GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID);
+ break;
+
+ case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+ generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+ GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
+ break;
+
+ case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+ generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+ GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
+ break;
+
+ case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+ generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+ GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
+ break;
+
+ default:
+ if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
+ _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
+ opcode_descs[inst->opcode].name);
+ } else {
+ _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
+ }
+ abort();
+
+ case SHADER_OPCODE_LOAD_PAYLOAD:
+ unreachable("Should be lowered by lower_load_payload()");
+ }
+
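+ /* At this point every instruction is still in its uncompacted,
+ * 16-byte encoding (compaction runs below), so a single emitted
+ * instruction advances next_insn_offset by exactly 16 and
+ * store[last_insn_offset / 16] addresses it.
+ */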
+ if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
+ assert(p->next_insn_offset == last_insn_offset + 16 ||
+ !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
+ "emitting more than 1 instruction");
+
+ brw_inst *last = &p->store[last_insn_offset / 16];
+
+ brw_inst_set_cond_modifier(brw, last, inst->conditional_mod);
+ brw_inst_set_no_dd_clear(brw, last, inst->no_dd_clear);
+ brw_inst_set_no_dd_check(brw, last, inst->no_dd_check);
+ }
+ }
+
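+ /* All offsets are final now; fill in the UIP/JIP jump targets of
+ * the flow-control instructions that couldn't be patched as they
+ * were emitted.
+ */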
+ brw_set_uip_jip(p);
+ annotation_finalize(&annotation, p->next_insn_offset);
+
+ int before_size = p->next_insn_offset - start_offset;
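+ /* Compaction rewrites eligible instructions into their 8-byte
+ * compact encodings and shifts everything after them, updating the
+ * annotations to match the new offsets.
+ */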
+ brw_compact_instructions(p, start_offset, annotation.ann_count,
+ annotation.ann);
+ int after_size = p->next_insn_offset - start_offset;
+
+ if (unlikely(debug_flag)) {
+ if (prog) {
+ fprintf(stderr,
+ "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
+ prog->Label ? prog->Label : "unnamed",
+ prog->Name, dispatch_width);
+ } else if (fp) {
+ fprintf(stderr,
+ "Native code for fragment program %d (SIMD%d dispatch):\n",
+ fp->Base.Id, dispatch_width);
+ } else {
+ fprintf(stderr, "Native code for blorp program (SIMD%d dispatch):\n",
+ dispatch_width);
+ }
+ fprintf(stderr, "SIMD%d shader: %d instructions. Compacted %d to %d"
+ " bytes (%.0f%%)\n",
+ dispatch_width, before_size / 16, before_size, after_size,
+ 100.0f * (before_size - after_size) / before_size);
+
+ const struct gl_program *prog = fp ? &fp->Base : NULL;
+
+ dump_assembly(p->store, annotation.ann_count, annotation.ann, brw, prog);
+ ralloc_free(annotation.ann);
+ }
+}
+
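+ /* Emit the SIMD8 and SIMD16 variants back to back into one program
+ * store; the SIMD16 code starts at prog_offset_16, and later state
+ * setup points the hardware at the right offset for each dispatch
+ * width.
+ */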
+const unsigned *
+scalar_generator::generate_assembly(const cfg_t *simd8_cfg,
+ const cfg_t *simd16_cfg,
+ unsigned *assembly_size)
+{
+ assert(simd8_cfg || simd16_cfg);
+
+ if (simd8_cfg) {
+ dispatch_width = 8;
+ generate_code(simd8_cfg);
+ }
+
+ if (simd16_cfg) {
+ /* Align to a 64-byte boundary: the kernel start pointers the
+ * hardware is given are expressed in 64-byte units.
+ */
+ while (p->next_insn_offset % 64) {
+ brw_NOP(p);
+ }
+
+ /* Save off the start of this SIMD16 program */
+ prog_data->prog_offset_16 = p->next_insn_offset;
+
+ brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+
+ dispatch_width = 16;
+ generate_code(simd16_cfg);
+ }
+
+ return brw_get_program(p, assembly_size);
+}
--
2.1.0