[Mesa-dev] [PATCH] i965: Ask the register allocator to round-robin through registers.
Matt Turner
mattst88 at gmail.com
Wed Apr 3 11:59:37 PDT 2013
On Wed, Apr 3, 2013 at 10:25 AM, Eric Anholt <eric at anholt.net> wrote:
> The way we were allocating registers before, packing into low register
> numbers for Ironlake, resulted in an overly-constrained dependency graph
> for instruction scheduling. Improves GLBenchmark 2.1 performance by
> 3.4% +/- 0.6% (n=26)
> ---
> src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 2 ++
> src/mesa/program/register_allocate.c | 31 +++++++++++++++++++--
> src/mesa/program/register_allocate.h | 1 +
> 3 files changed, 31 insertions(+), 3 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> index 4ee7bbc..b9b0303 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
> @@ -108,6 +108,8 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width)
>
> uint8_t *ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
> struct ra_regs *regs = ra_alloc_reg_set(brw, ra_reg_count);
> + if (intel->gen >= 6)
> + ra_set_allocate_round_robin(regs);
> int *classes = ralloc_array(brw, int, class_count);
> int aligned_pairs_class = -1;
>
> diff --git a/src/mesa/program/register_allocate.c b/src/mesa/program/register_allocate.c
> index a9064c3..5f45662 100644
> --- a/src/mesa/program/register_allocate.c
> +++ b/src/mesa/program/register_allocate.c
> @@ -70,6 +70,7 @@
> * this during ra_set_finalize().
> */
>
> +#include <stdbool.h>
> #include <ralloc.h>
>
> #include "main/imports.h"
> @@ -93,6 +94,8 @@ struct ra_regs {
>
> struct ra_class **classes;
> unsigned int class_count;
> +
> + bool round_robin;
> };
>
> struct ra_class {
> @@ -185,6 +188,22 @@ ra_alloc_reg_set(void *mem_ctx, unsigned int count)
> return regs;
> }
>
> +/**
> + * The register allocator by default prefers to allocate low register numbers,
> + * since it was written for hardware (gen4/5 Intel) that is limited in its
> + * multithreadedness by the number of registers used in a given shader.
> + *
> + * However, for hardware without that restriction, densely packed register
> + * allocation can put serious constraints on instruction scheduling. This
> + * function tells the allocator to rotate around the registers if possible as
> + * it allocates the nodes.
> + */
> +void
> +ra_set_allocate_round_robin(struct ra_regs *regs)
> +{
> + regs->round_robin = true;
> +}
> +
> static void
> ra_add_conflict_list(struct ra_regs *regs, unsigned int r1, unsigned int r2)
> {
> @@ -436,16 +455,19 @@ GLboolean
> ra_select(struct ra_graph *g)
> {
> int i;
> + int start_search_reg = 0;
>
> while (g->stack_count != 0) {
> - unsigned int r;
> + unsigned int ri;
> + unsigned int r = -1;
> int n = g->stack[g->stack_count - 1];
> struct ra_class *c = g->regs->classes[g->nodes[n].class];
>
> /* Find the lowest-numbered reg which is not used by a member
> * of the graph adjacent to us.
> */
> - for (r = 0; r < g->regs->count; r++) {
> + for (ri = 0; ri < g->regs->count; ri++) {
> + r = (start_search_reg + ri) % g->regs->count;
> if (!c->regs[r])
> continue;
>
> @@ -461,12 +483,15 @@ ra_select(struct ra_graph *g)
> if (i == g->nodes[n].adjacency_count)
> break;
> }
> - if (r == g->regs->count)
> + if (ri == g->regs->count)
> return GL_FALSE;
>
> g->nodes[n].reg = r;
> g->nodes[n].in_stack = GL_FALSE;
> g->stack_count--;
> +
> + if (g->regs->round_robin)
> + start_search_reg = r;
> }
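With the s/= r/= r + 1/ change mentioned on IRC (i.e. making the last
assignment in ra_select() read `start_search_reg = r + 1;`, so the next
search begins just past the register that was allocated) to make this
work for 8-wide too, it gets my
Reviewed-by: Matt Turner <mattst88 at gmail.com>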
More information about the mesa-dev mailing list