[Mesa-dev] Re : [PATCH] Var packer optimisation.
Vincent Lejeune
vljn at ovi.com
Thu Aug 11 06:28:00 PDT 2011
Thanks for your reviews.
I'm taking note of your comments and I will have a new patch soon.
(btw, I must apologize because I forgot to add a commit when rebasing, so this new patch didn't have function comments to explain their role, and "break;" where not pushed t)
----- Mail original -----
> De : Paul Berry <stereotype441 at gmail.com>
> À : Vincent Lejeune <vljn at ovi.com>
> Cc : mesa-dev at lists.freedesktop.org
> Envoyé le : Mercredi 10 Août 2011 22h04
> Objet : Re: [Mesa-dev] [PATCH] Var packer optimisation.
>
> On 9 August 2011 14:19, Vincent Lejeune <vljn at ovi.com> wrote:
>> From: vlj <vljn at ovi.com>
>>
>> This optimisation pass will look for and pack together vector (up to vec3)
> and scalar in function body. It only concerns local variable, not temporary
> which should however be packed as a side effect. It should reduce register
> pressure.
>> ---
>> src/glsl/Makefile | 1 +
>> src/glsl/glsl_parser_extras.cpp | 1 +
>> src/glsl/ir_optimization.h | 1 +
>> src/glsl/opt_var_packer.cpp | 309
> +++++++++++++++++++++++++++++++++++++++
>> 4 files changed, 312 insertions(+), 0 deletions(-)
>> create mode 100644 src/glsl/opt_var_packer.cpp
>>
>> diff --git a/src/glsl/Makefile b/src/glsl/Makefile
>> index 4100414..fff0dc7 100644
>> --- a/src/glsl/Makefile
>> +++ b/src/glsl/Makefile
>> @@ -83,6 +83,7 @@ CXX_SOURCES = \
>> opt_structure_splitting.cpp \
>> opt_swizzle_swizzle.cpp \
>> opt_tree_grafting.cpp \
>> + opt_var_packer.cpp \
>> s_expression.cpp
>>
>> LIBS = \
>> diff --git a/src/glsl/glsl_parser_extras.cpp
> b/src/glsl/glsl_parser_extras.cpp
>> index d9aa300..3b0e84a 100644
>> --- a/src/glsl/glsl_parser_extras.cpp
>> +++ b/src/glsl/glsl_parser_extras.cpp
>> @@ -776,6 +776,7 @@ do_common_optimization(exec_list *ir, bool linked,
> unsigned max_unroll_iteration
>> {
>> GLboolean progress = GL_FALSE;
>>
>> + progress = do_var_packing(ir) || progress;
>> progress = lower_instructions(ir, SUB_TO_ADD_NEG) || progress;
>>
>> if (linked) {
>
> This optimization pass shouldn't come at the top of
> do_common_optimization, since it has the potential to decrease the
> effectiveness of optimizations that follow. For example, if it packs
> a dead variable with a live one, then dead code elimination won't be
> able to eliminate the dead variable any more. I think it may
> interfere with constant propagation and algebraic simplification too,
> but I'm less certain of that.
>
>> diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
>> index dd26567..83d124f 100644
>> --- a/src/glsl/ir_optimization.h
>> +++ b/src/glsl/ir_optimization.h
>> @@ -71,3 +71,4 @@ bool lower_variable_index_to_cond_assign(exec_list
> *instructions,
>> bool lower_input, bool lower_output, bool lower_temp, bool
> lower_uniform);
>> bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz);
>> bool optimize_redundant_jumps(exec_list *instructions);
>> +bool do_var_packing(exec_list *instructions);
>> diff --git a/src/glsl/opt_var_packer.cpp b/src/glsl/opt_var_packer.cpp
>> new file mode 100644
>> index 0000000..e7d0638
>> --- /dev/null
>> +++ b/src/glsl/opt_var_packer.cpp
>> @@ -0,0 +1,309 @@
>> +/*
>> + * Copyright © 2011 Vincent Lejeune
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the
> "Software"),
>> + * to deal in the Software without restriction, including without
> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the
> next
>> + * paragraph) shall be included in all copies or substantial portions of
> the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
> KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
> OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>> + * DEALINGS IN THE SOFTWARE.
>> + */
>> +
>> +#include "ir.h"
>> +#include "ir_hierarchical_visitor.h"
>> +#include "ir_rvalue_visitor.h"
>> +#include <cstring>
>> +
>> +class box : public exec_node
>> +{
>> +public:
>> + void* content;
>> + box(void* c):content(c)
>> + {
>> +
>> + }
>> +};
>> +
>> +
>> +
>> +class boxed_exec_list : public exec_list
>> +{
>> +public:
>> + void push_tail(void *n)
>> + {
>> + box* b = new (this) box(n);
>> + exec_list::push_tail(b);
>> + }
>> +
>> + void push_head(void *n)
>> + {
>> + box* b = new (this) box(n);
>> + exec_list::push_head(b);
>> + }
>> +
>> + static void* operator new(size_t size, void *ctx)
>> + {
>> + void *node;
>> +
>> + node = ralloc_size(ctx, size);
>> + assert(node != NULL);
>> +
>> + return node;
>> + }
>> +
>> + bool has(const void* pointer) const
>> + {
>> + foreach_list_const(tmp,this)
>> + {
>> + box* tmpb =
> reinterpret_cast<box*>(const_cast<exec_node*>(tmp));
>> + if(tmpb->content == pointer)
>> + return true;
>> + }
>> + return false;
>> + }
>> +
>> + int size()
>> + {
>> + int result=0;
>> + foreach_list_const(tmp,this)
>> + {
>> + result++;
>> + }
>> + return result;
>> + }
>> +};
>> +
>> +#define list_item(type,pointer)
> reinterpret_cast<type>(reinterpret_cast<box*>(pointer)->content)
>> +#define list_item_const(type,pointer)
> reinterpret_cast<type>(reinterpret_cast<box*>(const_cast<exec_node*>(pointer))->content)
>> +
>> +
>> +#include <iostream>
>> +using namespace std;
>> +
>> +class ir_variable_lister : public ir_hierarchical_visitor
>> +{
>> + friend class ir_packer;
>> +protected:
>> + boxed_exec_list* available_vec3;
>> + boxed_exec_list* available_vec2;
>> + boxed_exec_list* available_float;
>> +
>> + void store_variable(ir_variable* var)
>> + {
>> + if(strcmp(var->name,"_ret_val") == 0)
>> + return ;
>
> I assume this check is trying to avoid applying the optimization to
> the temporary variables introduced by function inlining for return
> values, yes? It's not obvious to me why we would want to make an
> exception for them. And even if we do, this is not the best way to do
> it, because it means that if the user happens to make a variable with
> this name, it will not get optimized, regardless of how it is used.
>
>> + switch(var->type->gl_type)
>> + {
>> + case GL_FLOAT_VEC3:
>> + if(available_vec3->has(var)) break;
>> + available_vec3->push_tail(var);
>> + break;
>> + case GL_FLOAT_VEC2:
>> + if(available_vec2->has(var)) break;
>> + available_vec2->push_tail(var);
>> + break;
>> + case GL_FLOAT:
>> + if(available_float->has(var)) break;
>> + available_float->push_tail(var);
>> + break;
>> + default:
>> + break;
>> + }
>> + }
>> +
>> + bool find_candidates(ir_variable*& var1,ir_swizzle_mask& mask1,
> ir_variable*& var2, ir_swizzle_mask& mask2)
>> + {
>> + if(available_vec3->size() > 1 &&
> available_float->size() >1)
>> + {
>> + var1 = list_item(ir_variable*,available_vec3->pop_head());
>> + var2 = list_item(ir_variable*,available_float->pop_head());
>> + mask1.x = 0;mask1.y = 1;mask1.z = 2;mask1.num_components =
> 3;mask1.has_duplicates=false;
>> + mask2.x = 3;mask2.num_components = 1;mask1.has_duplicates=false;
>> + return true;
>> + }
>> + if(available_vec2->size() > 2)
>> + {
>> + var1 =
> list_item_const(ir_variable*,available_vec2->pop_head());
>> + var2 =
> list_item_const(ir_variable*,available_vec2->pop_head());
>> + mask1.x = 0;mask1.y = 1;mask1.num_components = 2;
>> + mask2.x = 2; mask2.y = 3;mask2.num_components = 2;
>> + return true;
>> + }
>> + if(available_vec2->size() > 1 &&
> available_float->size() > 1)
>> + {
>> + var1 =
> list_item_const(ir_variable*,available_vec2->pop_head());
>> + var2 =
> list_item_const(ir_variable*,available_float->pop_head());
>> + mask1.x = 0;mask1.y = 1;mask1.num_components = 2;
>> + mask2.x = 2; mask2.num_components = 1;
>> + return true;
>> + }
>> + if(available_float->size() > 2)
>> + {
>> + var1 =
> list_item_const(ir_variable*,available_float->pop_head());
>> + var2 =
> list_item_const(ir_variable*,available_float->pop_head());
>> + mask1.x = 0;mask1.num_components = 1;
>> + mask2.x = 1;mask2.num_components = 1;
>> + return true;
>> + }
>> + return false;
>> + }
>
> I realize this code may wind up getting rewritten as a result of Ian
> and Eric's comments, but I wanted to point out something that I think
> is a bug: all the ">"s in the above function look to me like they
> should be ">=". For example, we want to combine vec3's and
> floats
> whenever there is at least one of each present, not when there are at
> least two of each present.
>
>> +
>> +public:
>> + ir_visitor_status visit_enter(ir_assignment * assign)
>> + {
>> + if(ir_dereference_variable* dref =
> assign->lhs->as_dereference_variable())
>> + {
>> + if(dref->var->mode != ir_var_auto)
>> + return visit_continue;
>> + store_variable(dref->var);
>> + }
>> + return visit_continue;
>> + }
>> +
>> + ir_variable_lister(void* ctx)
>> + {
>> + available_float = new (ctx) boxed_exec_list();
>> + available_vec2 = new (ctx) boxed_exec_list();
>> + available_vec3 = new (ctx) boxed_exec_list();
>> + }
>> +
>> +};
>> +
>> +
>> +class ir_variable_replacer : public ir_rvalue_visitor
>> +{
>> +protected:
>> + ir_variable* var1;
>> + ir_swizzle_mask mask_for_var1;
>> + ir_variable* var2;
>> + ir_swizzle_mask mask_for_var2;
>> + ir_variable* packed_var;
>> +
>> + inline
>> + unsigned cyclic_right_shift(unsigned mask,unsigned step) const
>> + {
>> + unsigned result = 0;
>> + result |= mask << step;
>> + result |= mask >> (4 - step);
>> + return result;
>> + }
>> +
>> +
>> +public:
>> + void handle_rvalue(ir_rvalue **rvalue)
>> + {
>> + if(!*rvalue)
>> + return;
>> + ir_rvalue* tmp_rvalue = *rvalue;
>> + if(ir_dereference_variable* dref =
> tmp_rvalue->as_dereference_variable())
>> + {
>> + if(dref->var == var1)
>> + {
>> + ir_dereference_variable* ndref = new (packed_var)
> ir_dereference_variable(packed_var);
>> + ir_swizzle* swz = new (dref) ir_swizzle(ndref,mask_for_var1);
>> + *rvalue = swz;
>> + }
>> + if(dref->var == var2)
>> + {
>> + ir_dereference_variable* ndref = new (packed_var)
> ir_dereference_variable(packed_var);
>> + ir_swizzle* swz = new (dref) ir_swizzle(ndref,mask_for_var2);
>> + *rvalue = swz;
>> + }
>> + }
>> + }
>> +
>> + ir_visitor_status visit_enter(ir_assignment * assign)
>> + {
>> + assign->rhs->accept(this);
>> + if(ir_dereference_variable* dref =
> assign->lhs->as_dereference_variable())
>> + {
>> + if(dref->var == var1)
>> + {
>> + dref->var = packed_var;
>> + }
>> + if(dref->var == var2)
>> + {
>> + dref->var = packed_var;
>> + assign->write_mask =
> cyclic_right_shift(assign->write_mask,mask_for_var1.num_components);
>> + }
>> + }
>> + return visit_continue;
>> + }
>> +
>> + ir_variable_replacer(ir_variable* v1,ir_swizzle_mask mask1,
> ir_variable* v2, ir_swizzle_mask mask2, ir_variable*
> newvar):var1(v1),mask_for_var1(mask1), var2(v2),mask_for_var2(mask2),
> packed_var(newvar)
>> + {
>> + }
>> +};
>> +
>> +
>> +class ir_packer : public ir_hierarchical_visitor
>> +{
>> +public:
>> + ir_visitor_status visit_enter(ir_function_signature * fonc)
>> + {
>> + int body_size = 0;
>> + foreach_list_const(tmp,&(fonc->body))
>> + {
>> + body_size++;
>> + }
>> + if(!body_size)
>> + return visit_continue;
>> + //cout << "ENTERING " <<
> fonc->function_name() << ":"<< endl;
>> + ir_variable_lister v(fonc);
>> + foreach_list(tmp,&(fonc->body))
>> + {
>> + ir_instruction* inst = (ir_instruction*) tmp;
>> + inst->accept(&v);
>> + }
>> + ir_variable *v1=0,*v2=0;
>> + ir_swizzle_mask m1,m2;
>> + if(!v.find_candidates(v1,m1,v2,m2))
>> + return visit_continue;
>> + cout << "PACKING " << v1->name<<
> " WITH " << v2->name << endl;
>> + ir_variable* newvar=0;
>> + switch(m1.num_components + m2.num_components)
>> + {
>> + case 4:
>> + newvar = new (fonc)
> ir_variable(glsl_type::vec4_type,"vec4_tmp",ir_var_temporary);
>> + break;
>> + case 3:
>> + newvar = new (fonc)
> ir_variable(glsl_type::vec3_type,"vec3_tmp",ir_var_temporary);
>> + break;
>> + case 2:
>> + newvar = new (fonc)
> ir_variable(glsl_type::vec2_type,"vec2_tmp",ir_var_temporary);
>> + break;
>> + }
>> + fonc->body.push_head(newvar);
>> + ir_variable_replacer vis2(v1,m1,v2,m2,newvar);
>> + foreach_list(tmp,&(fonc->body))
>> + {
>> + ir_instruction* inst = (ir_instruction*) tmp;
>> + inst->accept(&vis2);
>> + }
>> + return visit_continue;
>> + }
>> +};
>> +
>> +
>> +bool
>> +do_var_packing(exec_list *instructions)
>> +{
>> + ir_packer v;
>> + for(int i=0;i<5;i++)
>> + visit_list_elements(&v, instructions);
>> +
>> +
>> + return false;
>> +}
>> --
>> 1.7.3.4
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>
>
> I'm also concerned that this optimization pass doesn't pay any
> attention to variable lifetimes. Because of Mesa's aggressive
> function inlining, it's likely that a shader may have a large number
> of variables whose lifetimes don't overlap. If this optimization pass
> combines two variables whose lifetimes don't overlap, it will produce
> a variable with a much longer lifetime, and therefore reduce the
> opportunities for the individual back-ends to re-use registers. So if
> we're not lucky, it might actually cause _increased_ register pressure
> rather than decreased register pressure.
>
> Have you tested this optimization on any typical shaders and measured
> the difference in register pressure?
>
More information about the mesa-dev
mailing list