[Mesa-dev] [PATCH 5/5] i965: Write a scalar TCS backend that runs in SINGLE_PATCH mode.
Jordan Justen
jordan.l.justen at intel.com
Thu Apr 28 23:21:19 UTC 2016
On 2016-04-21 22:32:09, Kenneth Graunke wrote:
> Unlike most shader stages, the Hull Shader hardware makes us explicitly
> tell it how many threads to dispatch and manually configure the channel
> mask. One perk of this is that we have a lot of flexibility - we can
> run it in either SIMD4x2 or SIMD8 mode.
>
> Treating it as SIMD8 means that shaders with 8 or fewer output vertices
> (which is overwhelmingly the common case) can be handled by a single
> thread. This has several intriguing properties:
>
> - Accessing input arrays with gl_InvocationID as the index is a simple
> SIMD8 URB read with g1 as the header. No indirect addressing required.
> - Barriers are no-ops.
> - We could potentially do output shadowing to combine writes, as the
> concurrency concerns are gone. (We don't do this yet, though.)
>
> Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
> src/mesa/drivers/dri/i965/brw_compiler.c | 4 +-
> src/mesa/drivers/dri/i965/brw_fs.cpp | 97 ++++++++
> src/mesa/drivers/dri/i965/brw_fs.h | 5 +
> src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 356 +++++++++++++++++++++++++++
> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 +
> src/mesa/drivers/dri/i965/brw_tcs.c | 3 +-
> src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp | 59 ++++-
> 7 files changed, 512 insertions(+), 15 deletions(-)
>
> Shockingly, this appears to cut instruction counts in Unigine Heaven
> (-2.5 to 5.5%), Synmark (-31%), and Tessmark (-37%). It increases
> instruction counts in Shadow of Mordor (up to +57%) - but again, this
> is running in scalar mode, so larger instruction counts are expected :)
> I also have a bunch of optimizations in progress that will help those.
>
> Cycle counts look pretty good too.
>
> This patch leaves it off by default because I haven't properly benchmarked
> it yet. I fully expect we'll turn it on by default.
>
> diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
> index 4496699..93a30a5 100644
> --- a/src/mesa/drivers/dri/i965/brw_compiler.c
> +++ b/src/mesa/drivers/dri/i965/brw_compiler.c
> @@ -152,7 +152,8 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
>
> compiler->scalar_stage[MESA_SHADER_VERTEX] =
> devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
> - compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
> + compiler->scalar_stage[MESA_SHADER_TESS_CTRL] =
> + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TCS", false);
> compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
> devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
> compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
> @@ -194,6 +195,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
>
> compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
> compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
> + compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectOutput = false;
>
> if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
> compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 5d6a107..be5edb8 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -1758,6 +1758,21 @@ fs_visitor::assign_vs_urb_setup()
> }
>
> void
> +fs_visitor::assign_tcs_single_patch_urb_setup()
> +{
> + assert(stage == MESA_SHADER_TESS_CTRL);
> +
> + brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
> +
> + first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
We talked about this bit of code offline. This is just a reminder that
you were going to check to see if it is used currently.
Patches 3-5 Reviewed-by: Jordan Justen <jordan.l.justen at intel.com>
> +
> + /* Rewrite all ATTR file references to HW_REGs. */
> + foreach_block_and_inst(block, fs_inst, inst, cfg) {
> + convert_attr_sources_to_hw_regs(inst);
> + }
> +}
> +
> +void
> fs_visitor::assign_tes_urb_setup()
> {
> assert(stage == MESA_SHADER_TESS_EVAL);
> @@ -5463,6 +5478,88 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
> }
>
> bool
> +fs_visitor::run_tcs_single_patch()
> +{
> + assert(stage == MESA_SHADER_TESS_CTRL);
> +
> + struct brw_tcs_prog_data *tcs_prog_data =
> + (struct brw_tcs_prog_data *) prog_data;
> +
> + /* r1-r4 contain the ICP handles. */
> + payload.num_regs = 5;
> +
> + if (shader_time_index >= 0)
> + emit_shader_time_begin();
> +
> + /* Initialize gl_InvocationID */
> + fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
> + fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
> + bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
> + bld.MOV(channels_ud, channels_uw);
> +
> + if (tcs_prog_data->instances == 1) {
> + invocation_id = channels_ud;
> + } else {
> + invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
> +
> + /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
> + fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
> + fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
> + bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
> + brw_imm_ud(INTEL_MASK(23, 17)));
> + bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3));
> +
> + bld.ADD(invocation_id, instance_times_8, channels_ud);
> + }
> +
> + /* Fix the dispatch mask */
> + if (nir->info.tcs.vertices_out % 8) {
> + bld.CMP(bld.null_reg_ud(), invocation_id,
> + brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L);
> + bld.IF(BRW_PREDICATE_NORMAL);
> + }
> +
> + emit_nir_code();
> +
> + if (nir->info.tcs.vertices_out % 8) {
> + bld.emit(BRW_OPCODE_ENDIF);
> + }
> +
> + /* Emit EOT write; set TR DS Cache bit */
> + fs_reg srcs[3] = {
> + fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
> + fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
> + fs_reg(brw_imm_ud(0)),
> + };
> + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
> + bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
> +
> + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
> + bld.null_reg_ud(), payload);
> + inst->mlen = 3;
> + inst->base_mrf = -1;
> + inst->eot = true;
> +
> + if (shader_time_index >= 0)
> + emit_shader_time_end();
> +
> + if (failed)
> + return false;
> +
> + calculate_cfg();
> +
> + optimize();
> +
> + assign_curb_setup();
> + assign_tcs_single_patch_urb_setup();
> +
> + fixup_3src_null_dest();
> + allocate_registers();
> +
> + return !failed;
> +}
> +
> +bool
> fs_visitor::run_tes()
> {
> assert(stage == MESA_SHADER_TESS_EVAL);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index bcd2e3e..f24c78a 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -110,6 +110,7 @@ public:
>
> bool run_fs(bool do_rep_send);
> bool run_vs(gl_clip_plane *clip_planes);
> + bool run_tcs_single_patch();
> bool run_tes();
> bool run_gs();
> bool run_cs();
> @@ -126,6 +127,7 @@ public:
> void assign_urb_setup();
> void convert_attr_sources_to_hw_regs(fs_inst *inst);
> void assign_vs_urb_setup();
> + void assign_tcs_single_patch_urb_setup();
> void assign_tes_urb_setup();
> void assign_gs_urb_setup();
> bool assign_regs(bool allow_spilling);
> @@ -249,6 +251,8 @@ public:
> nir_ssa_undef_instr *instr);
> void nir_emit_vs_intrinsic(const brw::fs_builder &bld,
> nir_intrinsic_instr *instr);
> + void nir_emit_tcs_intrinsic(const brw::fs_builder &bld,
> + nir_intrinsic_instr *instr);
> void nir_emit_gs_intrinsic(const brw::fs_builder &bld,
> nir_intrinsic_instr *instr);
> void nir_emit_fs_intrinsic(const brw::fs_builder &bld,
> @@ -404,6 +408,7 @@ public:
> fs_reg userplane[MAX_CLIP_PLANES];
> fs_reg final_gs_vertex_count;
> fs_reg control_data_bits;
> + fs_reg invocation_id;
>
> unsigned grf_used;
> bool spilled_any_registers;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index cf4f782..e617083 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -114,6 +114,9 @@ fs_visitor::nir_setup_single_output_varying(fs_reg *reg,
> void
> fs_visitor::nir_setup_outputs()
> {
> + if (stage == MESA_SHADER_TESS_CTRL)
> + return;
> +
> brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
>
> nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
> @@ -232,6 +235,8 @@ emit_system_values_block(nir_block *block, void *void_visitor)
> break;
>
> case nir_intrinsic_load_invocation_id:
> + if (v->stage == MESA_SHADER_TESS_CTRL)
> + break;
> assert(v->stage == MESA_SHADER_GEOMETRY);
> reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
> if (reg->file == BAD_FILE) {
> @@ -452,6 +457,9 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
> case MESA_SHADER_VERTEX:
> nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
> break;
> + case MESA_SHADER_TESS_CTRL:
> + nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
> + break;
> case MESA_SHADER_TESS_EVAL:
> nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
> break;
> @@ -1901,6 +1909,354 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
> }
>
> void
> +fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
> + nir_intrinsic_instr *instr)
> +{
> + assert(stage == MESA_SHADER_TESS_CTRL);
> + struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
> + struct brw_tcs_prog_data *tcs_prog_data =
> + (struct brw_tcs_prog_data *) prog_data;
> +
> + fs_reg dst;
> + if (nir_intrinsic_infos[instr->intrinsic].has_dest)
> + dst = get_nir_dest(instr->dest);
> +
> + switch (instr->intrinsic) {
> + case nir_intrinsic_load_primitive_id:
> + bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
> + break;
> + case nir_intrinsic_load_invocation_id:
> + bld.MOV(retype(dst, invocation_id.type), invocation_id);
> + break;
> + case nir_intrinsic_load_patch_vertices_in:
> + bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
> + brw_imm_d(tcs_key->input_vertices));
> + break;
> +
> + case nir_intrinsic_barrier: {
> + if (tcs_prog_data->instances == 1)
> + break;
> +
> + fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> + fs_reg m0_2 = byte_offset(m0, 2 * sizeof(uint32_t));
> +
> + const fs_builder fwa_bld = bld.exec_all();
> +
> + /* Zero the message header */
> + fwa_bld.MOV(m0, brw_imm_ud(0u));
> +
> + /* Copy "Barrier ID" from r0.2, bits 16:13 */
> + fwa_bld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
> + brw_imm_ud(INTEL_MASK(16, 13)));
> +
> + /* Shift it up to bits 27:24. */
> + fwa_bld.SHL(m0_2, m0_2, brw_imm_ud(11));
> +
> + /* Set the Barrier Count and the enable bit */
> + fwa_bld.OR(m0_2, m0_2,
> + brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
> +
> + bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
> + break;
> + }
> +
> + case nir_intrinsic_load_input:
> + unreachable("nir_lower_io should never give us these.");
> + break;
> +
> + case nir_intrinsic_load_per_vertex_input: {
> + fs_reg indirect_offset = get_indirect_offset(instr);
> + unsigned imm_offset = instr->const_index[0];
> +
> + const nir_src &vertex_src = instr->src[0];
> + nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
> +
> + fs_inst *inst;
> +
> + fs_reg icp_handle;
> +
> + if (vertex_const) {
> + /* Emit a MOV to resolve <0,1,0> regioning. */
> + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> + bld.MOV(icp_handle,
> + retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
> + vertex_const->i32[0] & 7),
> + BRW_REGISTER_TYPE_UD));
> + } else if (tcs_prog_data->instances == 1 &&
> + vertex_src.is_ssa &&
> + vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
> + nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
> + /* For the common case of only 1 instance, an array index of
> + * gl_InvocationID means reading g1. Skip all the indirect work.
> + */
> + icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
> + } else {
> + /* The vertex index is non-constant. We need to use indirect
> + * addressing to fetch the proper URB handle.
> + */
> + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> +
> + /* Each ICP handle is a single DWord (4 bytes) */
> + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> + bld.SHL(vertex_offset_bytes,
> + retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
> + brw_imm_ud(2u));
> +
> + /* Start at g1. We might read up to 4 registers. */
> + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
> + fs_reg(brw_vec8_grf(1, 0)), vertex_offset_bytes,
> + brw_imm_ud(4 * REG_SIZE));
> + }
> +
> + if (indirect_offset.file == BAD_FILE) {
> + /* Constant indexing - use global offset. */
> + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
> + inst->offset = imm_offset;
> + inst->mlen = 1;
> + inst->base_mrf = -1;
> + inst->regs_written = instr->num_components;
> + } else {
> + /* Indirect indexing - use per-slot offsets as well. */
> + const fs_reg srcs[] = { icp_handle, indirect_offset };
> + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
> + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
> +
> + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
> + inst->offset = imm_offset;
> + inst->base_mrf = -1;
> + inst->mlen = 2;
> + inst->regs_written = instr->num_components;
> + }
> +
> + /* Copy the temporary to the destination to deal with writemasking.
> + *
> + * Also attempt to deal with gl_PointSize being in the .w component.
> + */
> + if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
> + inst->dst = bld.vgrf(dst.type, 4);
> + inst->regs_written = 4;
> + bld.MOV(dst, offset(inst->dst, bld, 3));
> + }
> + break;
> + }
> +
> + case nir_intrinsic_load_output:
> + case nir_intrinsic_load_per_vertex_output: {
> + fs_reg indirect_offset = get_indirect_offset(instr);
> + unsigned imm_offset = instr->const_index[0];
> +
> + fs_inst *inst;
> + if (indirect_offset.file == BAD_FILE) {
> + /* Replicate the patch handle to all enabled channels */
> + fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
> + bld.MOV(patch_handle,
> + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
> +
> + if (imm_offset == 0) {
> + /* This is a read of gl_TessLevelInner[], which lives in the
> + * Patch URB header. The layout depends on the domain.
> + */
> + dst.type = BRW_REGISTER_TYPE_F;
> + switch (tcs_key->tes_primitive_mode) {
> + case GL_QUADS: {
> + /* DWords 3-2 (reversed) */
> + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> +
> + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
> + inst->offset = 0;
> + inst->mlen = 1;
> + inst->base_mrf = -1;
> + inst->regs_written = 4;
> +
> + /* dst.xy = tmp.wz */
> + bld.MOV(dst, offset(tmp, bld, 3));
> + bld.MOV(offset(dst, bld, 1), offset(tmp, bld, 2));
> + break;
> + }
> + case GL_TRIANGLES:
> + /* DWord 4; hardcode offset = 1 and regs_written = 1 */
> + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
> + inst->offset = 1;
> + inst->mlen = 1;
> + inst->base_mrf = -1;
> + inst->regs_written = 1;
> + break;
> + case GL_ISOLINES:
> + /* All channels are undefined. */
> + break;
> + default:
> + unreachable("Bogus tessellation domain");
> + }
> + } else if (imm_offset == 1) {
> + /* This is a read of gl_TessLevelOuter[], which lives in the
> + * Patch URB header. The layout depends on the domain.
> + */
> + dst.type = BRW_REGISTER_TYPE_F;
> +
> + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
> + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle);
> + inst->offset = 1;
> + inst->mlen = 1;
> + inst->base_mrf = -1;
> + inst->regs_written = 4;
> +
> + /* Reswizzle: WZYX */
> + fs_reg srcs[4] = {
> + offset(tmp, bld, 3),
> + offset(tmp, bld, 2),
> + offset(tmp, bld, 1),
> + offset(tmp, bld, 0),
> + };
> +
> + unsigned num_components;
> + switch (tcs_key->tes_primitive_mode) {
> + case GL_QUADS:
> + num_components = 4;
> + break;
> + case GL_TRIANGLES:
> + num_components = 3;
> + break;
> + case GL_ISOLINES:
> + /* Isolines are not reversed; swizzle .zw -> .xy */
> + srcs[0] = offset(tmp, bld, 2);
> + srcs[1] = offset(tmp, bld, 3);
> + num_components = 2;
> + break;
> + default:
> + unreachable("Bogus tessellation domain");
> + }
> + bld.LOAD_PAYLOAD(dst, srcs, num_components, 0);
> + } else {
> + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle);
> + inst->offset = imm_offset;
> + inst->mlen = 1;
> + inst->base_mrf = -1;
> + inst->regs_written = instr->num_components;
> + }
> + } else {
> + /* Indirect indexing - use per-slot offsets as well. */
> + const fs_reg srcs[] = {
> + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
> + indirect_offset
> + };
> + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
> + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
> +
> + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
> + inst->offset = imm_offset;
> + inst->mlen = 2;
> + inst->base_mrf = -1;
> + inst->regs_written = instr->num_components;
> + }
> + break;
> + }
> +
> + case nir_intrinsic_store_output:
> + case nir_intrinsic_store_per_vertex_output: {
> + fs_reg value = get_nir_src(instr->src[0]);
> + fs_reg indirect_offset = get_indirect_offset(instr);
> + unsigned imm_offset = instr->const_index[0];
> + unsigned swiz = BRW_SWIZZLE_XYZW;
> + unsigned mask = instr->const_index[1];
> + unsigned header_regs = 0;
> + fs_reg srcs[7];
> + srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
> +
> + if (indirect_offset.file != BAD_FILE) {
> + srcs[header_regs++] = indirect_offset;
> + } else if (tcs_key->program_string_id != 0) {
> + if (imm_offset == 0) {
> + value.type = BRW_REGISTER_TYPE_F;
> +
> + mask &= (1 << tesslevel_inner_components(tcs_key->tes_primitive_mode)) - 1;
> +
> + /* This is a write to gl_TessLevelInner[], which lives in the
> + * Patch URB header. The layout depends on the domain.
> + */
> + switch (tcs_key->tes_primitive_mode) {
> + case GL_QUADS:
> + /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
> + * We use an XXYX swizzle to reverse put .xy in the .wz
> + * channels, and use a .zw writemask.
> + */
> + mask = writemask_for_backwards_vector(mask);
> + swiz = BRW_SWIZZLE4(0, 0, 1, 0);
> + break;
> + case GL_TRIANGLES:
> + /* gl_TessLevelInner[].x lives at DWord 4, so we set the
> + * writemask to X and bump the URB offset by 1.
> + */
> + imm_offset = 1;
> + break;
> + case GL_ISOLINES:
> + /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
> + return;
> + default:
> + unreachable("Bogus tessellation domain");
> + }
> + } else if (imm_offset == 1) {
> + /* This is a write to gl_TessLevelOuter[] which lives in the
> + * Patch URB Header at DWords 4-7. However, it's reversed, so
> + * instead of .xyzw we have .wzyx.
> + */
> + value.type = BRW_REGISTER_TYPE_F;
> +
> + mask &= (1 << tesslevel_outer_components(tcs_key->tes_primitive_mode)) - 1;
> +
> + if (tcs_key->tes_primitive_mode == GL_ISOLINES) {
> + /* Isolines .xy should be stored in .zw, in order. */
> + swiz = BRW_SWIZZLE4(0, 0, 0, 1);
> + mask <<= 2;
> + } else {
> + /* Other domains are reversed; store .wzyx instead of .xyzw */
> + swiz = BRW_SWIZZLE_WZYX;
> + mask = writemask_for_backwards_vector(mask);
> + }
> + }
> + }
> +
> + if (mask == 0)
> + break;
> +
> + unsigned num_components = _mesa_fls(mask);
> + enum opcode opcode;
> +
> + if (mask != WRITEMASK_XYZW) {
> + srcs[header_regs++] = brw_imm_ud(mask << 16);
> + opcode = indirect_offset.file != BAD_FILE ?
> + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
> + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
> + } else {
> + opcode = indirect_offset.file != BAD_FILE ?
> + SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
> + SHADER_OPCODE_URB_WRITE_SIMD8;
> + }
> +
> + for (unsigned i = 0; i < num_components; i++) {
> + if (mask & (1 << i))
> + srcs[header_regs + i] = offset(value, bld, BRW_GET_SWZ(swiz, i));
> + }
> +
> + unsigned mlen = header_regs + num_components;
> +
> + fs_reg payload =
> + bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
> + bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
> +
> + fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
> + inst->offset = imm_offset;
> + inst->mlen = mlen;
> + inst->base_mrf = -1;
> + break;
> + }
> +
> + default:
> + nir_emit_intrinsic(bld, instr);
> + break;
> + }
> +}
> +
> +void
> fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
> nir_intrinsic_instr *instr)
> {
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index daabf70..41a9b12 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -1014,6 +1014,9 @@ fs_visitor::init()
> case MESA_SHADER_VERTEX:
> key_tex = &((const brw_vs_prog_key *) key)->tex;
> break;
> + case MESA_SHADER_TESS_CTRL:
> + key_tex = &((const brw_tcs_prog_key *) key)->tex;
> + break;
> case MESA_SHADER_TESS_EVAL:
> key_tex = &((const brw_tes_prog_key *) key)->tex;
> break;
> diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
> index 0117ffe..98ed2b2 100644
> --- a/src/mesa/drivers/dri/i965/brw_tcs.c
> +++ b/src/mesa/drivers/dri/i965/brw_tcs.c
> @@ -214,7 +214,8 @@ brw_codegen_tcs_prog(struct brw_context *brw,
> prog_data.base.base.nr_image_params = tcs->NumImages;
>
> brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base,
> - &prog_data.base.base, false);
> + &prog_data.base.base,
> + compiler->scalar_stage[MESA_SHADER_TESS_CTRL]);
> } else {
> /* Upload the Patch URB Header as the first two uniforms.
> * Do the annoying scrambling so the shader doesn't have to.
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
> index 17e3448..79cf93e 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
> @@ -29,6 +29,7 @@
>
> #include "brw_nir.h"
> #include "brw_vec4_tcs.h"
> +#include "brw_fs.h"
>
> namespace brw {
>
> @@ -452,7 +453,10 @@ brw_compile_tcs(const struct brw_compiler *compiler,
> brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map);
> nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
>
> - prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
> + if (is_scalar)
> + prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 8);
> + else
> + prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
>
> /* Compute URB entry size. The maximum allowed URB entry size is 32k.
> * That divides up as follows:
> @@ -493,20 +497,49 @@ brw_compile_tcs(const struct brw_compiler *compiler,
> brw_print_vue_map(stderr, &vue_prog_data->vue_map);
> }
>
> - vec4_tcs_visitor v(compiler, log_data, key, prog_data,
> - nir, mem_ctx, shader_time_index, &input_vue_map);
> - if (!v.run()) {
> - if (error_str)
> - *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
> - return NULL;
> - }
> + if (is_scalar) {
> + fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
> + &prog_data->base.base, NULL, nir, 8,
> + shader_time_index, &input_vue_map);
> + if (!v.run_tcs_single_patch()) {
> + if (error_str)
> + *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
> + return NULL;
> + }
>
> - if (unlikely(INTEL_DEBUG & DEBUG_TCS))
> - v.dump_instructions();
> + prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
> +
> + fs_generator g(compiler, log_data, mem_ctx, (void *) key,
> + &prog_data->base.base, v.promoted_constants, false,
> + MESA_SHADER_TESS_CTRL);
> + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
> + g.enable_debug(ralloc_asprintf(mem_ctx,
> + "%s tessellation control shader %s",
> + nir->info.label ? nir->info.label
> + : "unnamed",
> + nir->info.name));
> + }
> +
> + g.generate_code(v.cfg, 8);
>
> - return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
> - &prog_data->base, v.cfg,
> - final_assembly_size);
> + return g.get_assembly(final_assembly_size);
> + } else {
> + vec4_tcs_visitor v(compiler, log_data, key, prog_data,
> + nir, mem_ctx, shader_time_index, &input_vue_map);
> + if (!v.run()) {
> + if (error_str)
> + *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
> + return NULL;
> + }
> +
> + if (unlikely(INTEL_DEBUG & DEBUG_TCS))
> + v.dump_instructions();
> +
> +
> + return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
> + &prog_data->base, v.cfg,
> + final_assembly_size);
> + }
> }
>
>
> --
> 2.8.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list