[Mesa-dev] [PATCH 06/10] i965/vs: Add support for pull constant loads for uniform arrays.
Kenneth Graunke
kenneth at whitecape.org
Wed Aug 24 11:26:54 PDT 2011
On 08/23/2011 06:25 PM, Eric Anholt wrote:
> ---
> src/mesa/drivers/dri/i965/brw_defines.h | 1 +
> src/mesa/drivers/dri/i965/brw_vec4.h | 11 ++
> src/mesa/drivers/dri/i965/brw_vec4_emit.cpp | 51 +++++++++-
> src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 133 +++++++++++++++++++++++-
> 4 files changed, 192 insertions(+), 4 deletions(-)
Some suggestions for tidying up the patch below. Otherwise looks good.
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index d1799c0..5f34939 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -643,6 +643,7 @@ enum opcode {
> VS_OPCODE_URB_WRITE,
> VS_OPCODE_SCRATCH_READ,
> VS_OPCODE_SCRATCH_WRITE,
> + VS_OPCODE_PULL_CONSTANT_LOAD,
> };
>
> #define BRW_PREDICATE_NONE 0
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
> index 1db910e..5a4ba0f 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.h
> @@ -364,6 +364,7 @@ public:
> */
> dst_reg output_reg[VERT_RESULT_MAX];
> int uniform_size[MAX_UNIFORMS];
> + int uniform_vector_size[MAX_UNIFORMS];
> int uniforms;
>
> struct hash_table *variable_ht;
> @@ -380,6 +381,7 @@ public:
> void reg_allocate_trivial();
> void reg_allocate();
> void move_grf_array_access_to_scratch();
> + void move_uniform_array_access_to_pull_constants();
> void calculate_live_intervals();
> bool dead_code_eliminate();
> bool virtual_grf_interferes(int a, int b);
> @@ -439,6 +441,8 @@ public:
>
> src_reg get_scratch_offset(vec4_instruction *inst,
> src_reg *reladdr, int reg_offset);
> + src_reg get_pull_constant_offset(vec4_instruction *inst,
> + src_reg *reladdr, int reg_offset);
> void emit_scratch_read(vec4_instruction *inst,
> dst_reg dst,
> src_reg orig_src,
> @@ -447,6 +451,10 @@ public:
> src_reg temp,
> dst_reg orig_dst,
> int base_offset);
> + void emit_pull_constant_load(vec4_instruction *inst,
> + dst_reg dst,
> + src_reg orig_src,
> + int base_offset);
>
> GLboolean try_emit_sat(ir_expression *ir);
>
> @@ -482,6 +490,9 @@ public:
> void generate_scratch_read(vec4_instruction *inst,
> struct brw_reg dst,
> struct brw_reg index);
> + void generate_pull_constant_load(vec4_instruction *inst,
> + struct brw_reg dst,
> + struct brw_reg index);
> };
>
> } /* namespace brw */
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
> index 794f499..d77b2c6 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
> @@ -114,7 +114,7 @@ vec4_visitor::setup_uniforms(int reg)
> * matter what, or the GPU would hang.
> */
> if (intel->gen< 6&& this->uniforms == 0) {
> - this->uniform_size[this->uniforms] = 1;
> + this->uniform_vector_size[this->uniforms] = 1;
>
> for (unsigned int i = 0; i< 4; i++) {
> unsigned int slot = this->uniforms * 4 + i;
> @@ -229,6 +229,9 @@ vec4_instruction::get_src(int i)
> brw_reg = brw_abs(brw_reg);
> if (src[i].negate)
> brw_reg = negate(brw_reg);
> +
> + /* This should have been moved to pull constants. */
> + assert(!src[i].reladdr);
> break;
>
> case HW_REG:
> @@ -488,6 +491,47 @@ vec4_visitor::generate_scratch_write(vec4_instruction *inst,
> }
>
> void
> +vec4_visitor::generate_pull_constant_load(vec4_instruction *inst,
> + struct brw_reg dst,
> + struct brw_reg index)
> +{
> + if (intel->gen>= 6) {
> + brw_push_insn_state(p);
> + brw_set_mask_control(p, BRW_MASK_DISABLE);
> + brw_MOV(p,
> + retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_D),
> + retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_D));
> + brw_pop_insn_state(p);
> + }
I think you mean:
gen6_resolve_implied_move(p, brw_vec8_grf(0, 0), inst->base_mrf);
> + brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_D),
> + index);
> +
> + uint32_t msg_type;
> +
> + if (intel->gen>= 6)
> + msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
> + else if (intel->gen == 5 || intel->is_g4x)
> + msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
> + else
> + msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
> +
> + /* Each of the 8 channel enables is considered for whether each
> + * dword is written.
> + */
> + struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
> + brw_set_dest(p, send, dst);
> + brw_set_src0(p, send, brw_message_reg(inst->base_mrf));
> + brw_set_dp_read_message(p, send,
> + SURF_INDEX_VERT_CONST_BUFFER,
> + BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
> + msg_type,
> + BRW_DATAPORT_READ_TARGET_DATA_CACHE,
> + 2, /* mlen */
> + 1 /* rlen */);
> +}
> +
> +void
> vec4_visitor::generate_vs_instruction(vec4_instruction *instruction,
> struct brw_reg dst,
> struct brw_reg *src)
> @@ -529,6 +573,10 @@ vec4_visitor::generate_vs_instruction(vec4_instruction *instruction,
> generate_scratch_write(inst, dst, src[0], src[1]);
> break;
>
> + case VS_OPCODE_PULL_CONSTANT_LOAD:
> + generate_pull_constant_load(inst, dst, src[0]);
> + break;
> +
> default:
> if (inst->opcode< (int)ARRAY_SIZE(brw_opcodes)) {
> fail("unsupported opcode in `%s' in VS\n",
> @@ -556,6 +604,7 @@ vec4_visitor::run()
> * often do repeated subexpressions for those.
> */
> move_grf_array_access_to_scratch();
> + move_uniform_array_access_to_pull_constants();
>
> bool progress;
> do {
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> index 46f826c..b1266c1 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> @@ -372,7 +372,10 @@ vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
> c->prog_data.param[this->uniforms * 4 + i] =&zero;
> }
>
> - this->uniform_size[this->uniforms] = type->vector_elements;
> + /* Track the size of this uniform vector, for future packing of
> + * uniforms.
> + */
> + this->uniform_vector_size[this->uniforms] = type->vector_elements;
> this->uniforms++;
>
> return 1;
> @@ -420,7 +423,7 @@ vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
> (gl_state_index *)slots[i].tokens);
> float *values =&this->vp->Base.Parameters->ParameterValues[index][0].f;
>
> - this->uniform_size[this->uniforms] = 0;
> + this->uniform_vector_size[this->uniforms] = 0;
> /* Add each of the unique swizzled channels of the element.
> * This will end up matching the size of the glsl_type of this field.
> */
> @@ -431,7 +434,7 @@ vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
>
> c->prog_data.param[this->uniforms * 4 + j] =&values[swiz];
> if (swiz<= last_swiz)
> - this->uniform_size[this->uniforms]++;
> + this->uniform_vector_size[this->uniforms]++;
> }
> this->uniforms++;
> }
> @@ -668,6 +671,11 @@ vec4_visitor::visit(ir_variable *ir)
> case ir_var_uniform:
> reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
>
> + /* Track how big the whole uniform variable is, in case we need to put a
> + * copy of its data into pull constants for array access.
> + */
> + this->uniform_size[this->uniforms] = type_size(ir->type);
> +
> if (!strncmp(ir->name, "gl_", 3)) {
> setup_builtin_uniform_values(ir);
> } else {
> @@ -1938,6 +1946,38 @@ vec4_visitor::get_scratch_offset(vec4_instruction *inst,
> }
> }
>
> +src_reg
> +vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
> + src_reg *reladdr, int reg_offset)
> +{
> + if (reladdr) {
> + src_reg index = src_reg(this, glsl_type::int_type);
> +
> + vec4_instruction *add = emit(BRW_OPCODE_ADD,
> + dst_reg(index),
> + *reladdr,
> + src_reg(reg_offset));
> + /* Move our new instruction from the tail to its correct place. */
> + add->remove();
> + inst->insert_before(add);
This is ugly. Can we please make a vec4_instruction constructor:
vec4_instruction(enum opcode opcode, dst_reg dst,
src_reg src0, src_reg src1, src_reg src2)
{
this->opcode = opcode;
this->dst = dst;
this->src[0] = src0;
this->src[1] = src1;
this->src[2] = src2;
}
update vec4_visitor::emit to use it, and then make this:
vec4_instruction *add =
new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst_reg(index),
*reladdr, src_reg(reg_offset));
add->ir = this->base_ir;
inst->insert_before(add);
(maybe base_ir should be passed to the constructor too; not sure)
> + /* Pre-gen6, the message header uses byte offsets instead of vec4
> + * (16-byte) offset units.
> + */
> + if (intel->gen< 6) {
> + vec4_instruction *mul = emit(BRW_OPCODE_MUL, dst_reg(index),
> + index, src_reg(16));
> + mul->remove();
> + inst->insert_before(mul);
> + }
Ditto. Alternatively, maybe make an emit_before() helper that does
insert_before() instead of push_tail().
> + return index;
> + } else {
> + int message_header_scale = intel->gen< 6 ? 16 : 1;
> + return src_reg(reg_offset * message_header_scale);
> + }
> +}
> +
> /**
> * Emits an instruction before @inst to load the value named by @orig_src
> * from scratch space at @base_offset to @temp.
> @@ -2063,6 +2103,93 @@ vec4_visitor::move_grf_array_access_to_scratch()
> }
> }
>
> +/**
> + * Emits an instruction before @inst to load the value named by @orig_src
> + * from the pull constant buffer (surface) at @base_offset to @temp.
> + */
> +void
> +vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
> + dst_reg temp, src_reg orig_src,
> + int base_offset)
> +{
> + int reg_offset = base_offset + orig_src.reg_offset;
> + src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
> +
> + vec4_instruction *const_load_inst = emit(VS_OPCODE_PULL_CONSTANT_LOAD,
> + temp, index);
> +
> + const_load_inst->base_mrf = 14;
> + const_load_inst->mlen = 1;
> + /* Move our instruction from the tail to its correct place. */
> + const_load_inst->remove();
> + inst->insert_before(const_load_inst);
Ditto. emit_before()...
> +}
> +
> +/**
> + * Implements array access of uniforms by inserting a
> + * PULL_CONSTANT_LOAD instruction.
> + *
> + * Unlike temporary GRF array access (where we don't support it due to
> + * the difficulty of doing relative addressing on instruction
> + * destinations), we could potentially do array access of uniforms
> + * that were loaded in GRF space as push constants. In real-world
> + * usage we've seen, though, the arrays being used are always larger
> + * than we could load as push constants, so just always move all
> + * uniform array access out to a pull constant buffer.
> + */
> +void
> +vec4_visitor::move_uniform_array_access_to_pull_constants()
> +{
> + int pull_constant_loc[this->uniforms];
> +
> + for (int i = 0; i< this->uniforms; i++) {
> + pull_constant_loc[i] = -1;
> + }
> +
> + /* Walk through and find array access of uniforms. Put a copy of that
> + * uniform in the pull constant buffer.
> + *
> + * Note that we don't move constant-indexed accesses to arrays. No
> + * testing has been done of the performance impact of this choice.
> + */
> + foreach_list_safe(node,&this->instructions) {
> + vec4_instruction *inst = (vec4_instruction *)node;
> +
> + for (int i = 0 ; i< 3; i++) {
> + if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
> + continue;
> +
> + int uniform = inst->src[i].reg;
> +
> + /* If this array isn't already present in the pull constant buffer,
> + * add it.
> + */
> + if (pull_constant_loc[uniform] == -1) {
> + const float **values =&prog_data->param[uniform * 4];
> +
> + pull_constant_loc[uniform] = prog_data->nr_pull_params;
> +
> + for (int j = 0; j< uniform_size[uniform] * 4; j++) {
> + prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
> + }
> + }
> +
> + /* Set up the annotation tracking for new generated instructions. */
> + base_ir = inst->ir;
> + current_annotation = inst->annotation;
> +
> + dst_reg temp = dst_reg(this, glsl_type::vec4_type);
> +
> + emit_pull_constant_load(inst, temp, inst->src[i],
> + pull_constant_loc[uniform]);
> +
> + inst->src[i].file = temp.file;
> + inst->src[i].reg = temp.reg;
> + inst->src[i].reg_offset = temp.reg_offset;
> + inst->src[i].reladdr = NULL;
> + }
> + }
> +}
>
> vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
> struct gl_shader_program *prog,
More information about the mesa-dev
mailing list