[Mesa-dev] [PATCH] i965/fs: Reduce the interference between payload regs and virtual GRFs.
Kenneth Graunke
kenneth at whitecape.org
Tue Oct 16 13:39:10 PDT 2012
On 10/15/2012 04:06 PM, Eric Anholt wrote:
> Improves performance of the Lightsmark penumbra shadows scene by 15.7% +/-
> 1.0% (n=15), by eliminating register spilling. (tested by smashing the list of
> scenes to have all other scenes have 0 duration -- includes additional
> rendering of scene description text that normally doesn't appear in that
> scene)
> ---
> src/mesa/drivers/dri/i965/brw_fs.h | 2 +
> src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 164 ++++++++++++++++++---
> 2 files changed, 147 insertions(+), 19 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index a71783c..ad717c9 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -235,6 +235,8 @@ public:
> void assign_urb_setup();
> bool assign_regs();
> void assign_regs_trivial();
> + void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
> + int first_payload_node);
> int choose_spill_reg(struct ra_graph *g);
> void spill_reg(int spill_reg);
> void split_virtual_grfs();
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> index 7b778d6..bd9789f 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> @@ -163,27 +163,154 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width, int base_reg_count)
> /**
> * Sets up interference between thread payload registers and the virtual GRFs
> * to be allocated for program temporaries.
> + *
> + * We want to be able to reallocate the payload for our virtual GRFs, notably
> + * because the setup coefficients for a full set of 16 FS inputs takes up 8 of
> + * our 128 registers.
> + *
> + * The layout of the payload registers is:
> + *
> + * 0..nr_payload_regs-1: fixed function setup (including bary coordinates).
> + * nr_payload_regs..nr_payload_regs+curb_read_lengh-1: uniform data
> + * nr_payload_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients.
> + *
> + * And we have payload_node_count nodes covering these registers in order
> + * (note that in 16-wide, a node is two registers).
> */
> -static void
> -brw_setup_payload_interference(struct ra_graph *g,
> - int payload_reg_count,
> - int first_payload_node,
> - int reg_node_count)
> +void
> +fs_visitor::setup_payload_interference(struct ra_graph *g,
> + int payload_node_count,
> + int first_payload_node)
> {
> - for (int i = 0; i < payload_reg_count; i++) {
> - /* Mark each payload reg node as being allocated to its physical register.
> + int reg_width = c->dispatch_width / 8;
> + int loop_depth = 0;
> + int loop_end_ip = 0;
> +
> + int payload_last_use_ip[payload_node_count];
> + memset(payload_last_use_ip, 0, sizeof(payload_last_use_ip));
> + int ip = 0;
> + foreach_list(node, &this->instructions) {
> + fs_inst *inst = (fs_inst *)node;
> +
> + switch (inst->opcode) {
> + case BRW_OPCODE_DO:
> + loop_depth++;
> +
> + /* Since payload regs are deffed only at the start of the shader
> + * execution, any uses of the payload within a loop mean the live
> + * interval extends to the end of the outermost loop. Find the ip of
> + * the end now.
> + */
> + if (loop_depth == 1) {
> + int scan_depth = loop_depth;
> + int scan_ip = ip;
> + for (fs_inst *scan_inst = (fs_inst *)inst->next;
> + scan_depth > 0;
> + scan_inst = (fs_inst *)scan_inst->next) {
> + switch (scan_inst->opcode) {
> + case BRW_OPCODE_DO:
> + scan_depth++;
> + break;
> + case BRW_OPCODE_WHILE:
> + scan_depth--;
> + break;
> + default:
> + break;
> + }
> + scan_ip++;
> + }
> + loop_end_ip = scan_ip;
> + }
> + break;
> + case BRW_OPCODE_WHILE:
> + loop_depth--;
> + break;
> + default:
> + break;
> + }
Wow, it's unfortunate that you have to do this. Essentially, for each
instruction in a loop, you walk through all the instructions, to the end
of the loop. That's big O(fail). :(
If we had the jump offsets, we wouldn't have to do this. But we don't
until later. Alternatively...you could probably structure the code like
this:
1. See if the instruction uses this reg.
2. If it's a loop, fast-forward inst to the WHILE
(doing both inst = inst->next and ip++)
3. Set payload_last_use_ip.
4. Proceed with the rest of your code.
Otherwise, this looks good.
> +
> + int use_ip;
> + if (loop_depth > 0)
> + use_ip = loop_end_ip;
> + else
> + use_ip = ip;
> +
> + /* Note that UNIFORM args have been turned into FIXED_HW_REG by
> + * assign_curbe_setup(), and interpolation uses fixed hardware regs from
> + * the start (see interp_reg()).
> + */
> + for (int i = 0; i < 3; i++) {
> + if (inst->src[i].file == FIXED_HW_REG &&
> + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
> + int node_nr = inst->src[i].fixed_hw_reg.nr / reg_width;
> + if (node_nr >= payload_node_count)
> + continue;
> +
> + payload_last_use_ip[node_nr] = use_ip;
> + }
> + }
> +
> + /* Special case instructions which have extra implied registers used. */
> + switch (inst->opcode) {
> + case FS_OPCODE_FB_WRITE:
> + /* We could omit this for the !inst->header_present case, except that
> + * the simulator apparently incorrectly reads from g0/g1 instead of
> + * sideband. It also really freaks out driver developers to see g0
> + * used in unusual places, so just always reserve it.
> + */
> + payload_last_use_ip[0 / reg_width] = use_ip;
> + payload_last_use_ip[1 / reg_width] = use_ip;
> + break;
> + case FS_OPCODE_DISCARD:
> + payload_last_use_ip[1 / reg_width] = use_ip;
> + break;
> +
> + case FS_OPCODE_LINTERP:
> + /* On gen6+ in 16-wide, there are 4 adjacent registers (so 2 nodes)
> + * used by PLN's sourcing of the deltas, while we list only the first
> + * two in the arguments (1 node). Pre-gen6, the deltas are computed
> + * in normal VGRFs.
> + */
> + if (intel->gen >= 6) {
> + int delta_x_arg = 0;
> + if (inst->src[delta_x_arg].file == FIXED_HW_REG &&
> + inst->src[delta_x_arg].fixed_hw_reg.file ==
> + BRW_GENERAL_REGISTER_FILE) {
> + int sechalf_node = (inst->src[delta_x_arg].fixed_hw_reg.nr /
> + reg_width) + 1;
> + assert(sechalf_node < payload_node_count);
> + payload_last_use_ip[sechalf_node] = use_ip;
> + }
> + }
> + break;
> +
> + default:
> + break;
> + }
> +
> + ip++;
> + }
> +
> + for (int i = 0; i < payload_node_count; i++) {
> + /* Mark the payload node as interfering with any virtual grf that is
> + * live between the start of the program and our last use of the payload
> + * node.
> + */
> + for (int j = 0; j < this->virtual_grf_count; j++) {
> + if (this->virtual_grf_def[j] <= payload_last_use_ip[i] ||
> + this->virtual_grf_use[j] <= payload_last_use_ip[i]) {
> + ra_add_node_interference(g, first_payload_node + i, j);
> + }
> + }
> + }
> +
> + for (int i = 0; i < payload_node_count; i++) {
> + /* Mark each payload node as being allocated to its physical register.
> *
> * The alternative would be to have per-physical-register classes, which
> * would just be silly.
> */
> ra_set_node_reg(g, first_payload_node + i, i);
> -
> - /* For now, just mark each payload node as interfering with every other
> - * node to be allocated.
> - */
> - for (int j = 0; j < reg_node_count; j++) {
> - ra_add_node_interference(g, first_payload_node + i, j);
> - }
> }
> }
>
> @@ -198,7 +325,7 @@ fs_visitor::assign_regs()
> */
> int reg_width = c->dispatch_width / 8;
> int hw_reg_mapping[this->virtual_grf_count];
> - int payload_reg_count = (ALIGN(this->first_non_payload_grf, reg_width) /
> + int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) /
> reg_width);
> int base_reg_count = max_grf / reg_width;
>
> @@ -208,7 +335,7 @@ fs_visitor::assign_regs()
>
> int node_count = this->virtual_grf_count;
> int first_payload_node = node_count;
> - node_count += payload_reg_count;
> + node_count += payload_node_count;
> struct ra_graph *g = ra_alloc_interference_graph(brw->wm.regs, node_count);
>
> for (int i = 0; i < this->virtual_grf_count; i++) {
> @@ -240,8 +367,7 @@ fs_visitor::assign_regs()
> }
> }
>
> - brw_setup_payload_interference(g, payload_reg_count, first_payload_node,
> - this->virtual_grf_count);
> + setup_payload_interference(g, payload_node_count, first_payload_node);
>
> if (!ra_allocate_no_spills(g)) {
> /* Failed to allocate registers. Spill a reg, and the caller will
> @@ -268,7 +394,7 @@ fs_visitor::assign_regs()
> * regs in the register classes back down to real hardware reg
> * numbers.
> */
> - this->grf_used = payload_reg_count * reg_width;
> + this->grf_used = payload_node_count * reg_width;
> for (int i = 0; i < this->virtual_grf_count; i++) {
> int reg = ra_get_node_reg(g, i);
>
>
More information about the mesa-dev
mailing list