[Beignet] [PATCH V2] GBE: Optimize the bool register allocation/processing.

Zhigang Gong zhigang.gong at linux.intel.com
Mon Mar 31 19:59:38 PDT 2014


Some examples of the instruction count changes after this patch:

We use LuxMark's medium Sala scene as the example, without
setting any environment variables.
The results are as below:

(insnCnt*2)   master    ifendif    boolopt   to master   to ifendif
Intersect0     1848      1754        1524     -18%        -13%
Intersect1     4026      3720        3336     -17%        -10%
Init           9504      9396        8678     -9%         -7%
Advance      126264    127940      118008     -6%         -8%
Sampler       10560     10426        9672     -8%         -7%
InitFrame        86        94          90     +5%         -5%

You can see that almost all kernels get an obvious reduction in instruction count.

On Tue, Apr 01, 2014 at 10:10:19AM +0800, Zhigang Gong wrote:
> From: Zhigang Gong <zhigang.gong at gmail.com>
> 
> Previously, we had a global flag allocation implementation.
> After some analysis, I found that global flag allocation is not
> the best solution here.
> For a cross-block reference of a bool value, we have to
> combine it with the current emask. There is no obvious advantage to
> allocating a dedicated physical flag register for such cross-block usage.
> We just need to allocate physical flags within each BB. We need to handle
> the following cases:
> 
> 1. The bool's liveness never extends beyond this BB, and the bool is only used as
>    a dst register or a pred register. This bool value can be allocated to a
>    physical flag, provided there are enough physical flags.
>    We already identify these bools at the instruction selection stage and
>    put them in the flagBooleans set.
> 2. The bool is defined in another BB and used in this BB; then we need
>    to prepend an instruction at the position where we use it.
> 3. The bool is defined in this BB but is also used as some instruction's
>    source register rather than as the pred register. We have to keep a normal
>    GRF (UW8/UW16) register for this bool. For some CMP instructions, we need to
>    append a SEL instruction to convert the flag to the GRF register.
> 4. Even for the spilled flag, if there is only one spilled flag, we will also
>    try to reuse the temporary flag register later. This requires that all
>    instructions get their flag at the instruction selection stage and do not
>    use the physical flag number directly at the gen_context stage. Otherwise,
>    they may break the algorithm here.
> We track all the validated bool values to avoid any redundant
> validation of the same flag. But if there are not enough physical flags,
> we have to spill a previously allocated physical flag. The spilling
> policy is to spill the allocated flag whose live interval ends last.
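
The per-BB allocation and spilling policy described above can be sketched as a
small linear-scan style pass. This is only an illustrative model, not Beignet's
actual code: `BoolInterval`, `allocateFlags`, and the interval numbers below are
made up for this sketch. The key point it demonstrates is the eviction rule:
when no physical flag is free, spill whichever candidate's live interval ends last.

```cpp
#include <algorithm>
#include <map>
#include <vector>

// Hypothetical model of a bool's live range inside one basic block.
// These names are illustrative only, not Beignet's real classes.
struct BoolInterval {
  int vreg;   // virtual bool register id
  int start;  // first def/use point in the BB
  int end;    // last use point in the BB
};

// Assign up to numFlags physical flag sub-registers to the intervals.
// When none is free, spill the candidate whose live interval ends last,
// mirroring the policy described in the commit message above.
std::map<int, int> allocateFlags(std::vector<BoolInterval> intervals,
                                 int numFlags, std::vector<int> &spilled) {
  std::sort(intervals.begin(), intervals.end(),
            [](const BoolInterval &a, const BoolInterval &b) {
              return a.start < b.start;
            });
  std::map<int, int> phys;           // vreg -> physical flag index
  std::vector<BoolInterval> active;  // intervals currently holding a flag
  std::vector<int> freeFlags;
  for (int f = numFlags - 1; f >= 0; --f) freeFlags.push_back(f);

  for (const BoolInterval &cur : intervals) {
    // Release flags whose intervals ended before this point.
    for (auto it = active.begin(); it != active.end();) {
      if (it->end < cur.start) {
        freeFlags.push_back(phys[it->vreg]);
        it = active.erase(it);
      } else {
        ++it;
      }
    }
    if (!freeFlags.empty()) {
      phys[cur.vreg] = freeFlags.back();
      freeFlags.pop_back();
      active.push_back(cur);
      continue;
    }
    // No free flag: among the active intervals and the current one,
    // spill whichever lives to the farthest end point.
    auto victim = std::max_element(
        active.begin(), active.end(),
        [](const BoolInterval &a, const BoolInterval &b) {
          return a.end < b.end;
        });
    if (victim->end > cur.end) {
      spilled.push_back(victim->vreg);
      phys[cur.vreg] = phys[victim->vreg];
      phys.erase(victim->vreg);
      *victim = cur;
    } else {
      spilled.push_back(cur.vreg);
    }
  }
  return phys;
}
```

For example, with a single physical flag and intervals v1:[0,2], v2:[1,10],
v3:[3,5], the pass spills v2: it conflicts with v1 and its interval ends last,
so v1 and v3 share the flag while v2 falls back to a GRF bool.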
> 
> Let's see a real example of the improvement from this patch:
> I take compiler_vect_compare as an example. Before this patch, the
> instructions are as below:
>     (      24)  cmp.g.f1.1(8)   null            g110<8,8,1>D    0D              { align1 WE_normal 1Q };
>     (      26)  cmp.g.f1.1(8)   null            g111<8,8,1>D    0D              { align1 WE_normal 2Q };
>     (      28)  (+f1.1) sel(16) g109<1>UW       g1.2<0,1,0>UW   g1<0,1,0>UW     { align1 WE_normal 1H };
>     (      30)  cmp.g.f1.1(8)   null            g112<8,8,1>D    0D              { align1 WE_normal 1Q };
>     (      32)  cmp.g.f1.1(8)   null            g113<8,8,1>D    0D              { align1 WE_normal 2Q };
>     (      34)  (+f1.1) sel(16) g108<1>UW       g1.2<0,1,0>UW   g1<0,1,0>UW     { align1 WE_normal 1H };
>     (      36)  cmp.g.f1.1(8)   null            g114<8,8,1>D    0D              { align1 WE_normal 1Q };
>     (      38)  cmp.g.f1.1(8)   null            g115<8,8,1>D    0D              { align1 WE_normal 2Q };
>     (      40)  (+f1.1) sel(16) g107<1>UW       g1.2<0,1,0>UW   g1<0,1,0>UW     { align1 WE_normal 1H };
>     (      42)  cmp.g.f1.1(8)   null            g116<8,8,1>D    0D              { align1 WE_normal 1Q };
>     (      44)  cmp.g.f1.1(8)   null            g117<8,8,1>D    0D              { align1 WE_normal 2Q };
>     (      46)  (+f1.1) sel(16) g106<1>UW       g1.2<0,1,0>UW   g1<0,1,0>UW     { align1 WE_normal 1H };
>     (      48)  mov(16)         g104<1>F        -nanF                           { align1 WE_normal 1H };
>     (      50)  cmp.ne.f1.1(16) null            g109<8,8,1>UW   0x0UW           { align1 WE_normal 1H switch };
>     (      52)  (+f1.1) sel(16) g96<1>D         g104<8,8,1>D    0D              { align1 WE_normal 1H };
>     (      54)  cmp.ne.f1.1(16) null            g108<8,8,1>UW   0x0UW           { align1 WE_normal 1H switch };
>     (      56)  (+f1.1) sel(16) g98<1>D         g104<8,8,1>D    0D              { align1 WE_normal 1H };
>     (      58)  cmp.ne.f1.1(16) null            g107<8,8,1>UW   0x0UW           { align1 WE_normal 1H switch };
>     (      60)  (+f1.1) sel(16) g100<1>D        g104<8,8,1>D    0D              { align1 WE_normal 1H };
>     (      62)  cmp.ne.f1.1(16) null            g106<8,8,1>UW   0x0UW           { align1 WE_normal 1H switch };
>     (      64)  (+f1.1) sel(16) g102<1>D        g104<8,8,1>D    0D              { align1 WE_normal 1H };
>     (      66)  add(16)         g94<1>D         g1.3<0,1,0>D    g120<8,8,1>D    { align1 WE_normal 1H };
>     (      68)  send(16)        null            g94<8,8,1>UD
>                 data (bti: 1, rgba: 0, SIMD16, legacy, Untyped Surface Write) mlen 10 rlen 0 { align1 WE_normal 1H };
>     (      70)  mov(16)         g2<1>UW         0x1UW                           { align1 WE_normal 1H };
>     (      72)  endif(16) 2                     null                            { align1 WE_normal 1H };
> 
> After this patch, it becomes:
> 
>     (      24)  cmp.g(8)        null            g110<8,8,1>D    0D              { align1 WE_normal 1Q };
>     (      26)  cmp.g(8)        null            g111<8,8,1>D    0D              { align1 WE_normal 2Q };
>     (      28)  cmp.g.f1.1(8)   null            g112<8,8,1>D    0D              { align1 WE_normal 1Q };
>     (      30)  cmp.g.f1.1(8)   null            g113<8,8,1>D    0D              { align1 WE_normal 2Q };
>     (      32)  cmp.g.f0.1(8)   null            g114<8,8,1>D    0D              { align1 WE_normal 1Q };
>     (      34)  cmp.g.f0.1(8)   null            g115<8,8,1>D    0D              { align1 WE_normal 2Q };
>     (      36)  (+f0.1) sel(16) g109<1>UW       g1.2<0,1,0>UW   g1<0,1,0>UW     { align1 WE_normal 1H };
>     (      38)  cmp.g.f1.0(8)   null            g116<8,8,1>D    0D              { align1 WE_normal 1Q };
>     (      40)  cmp.g.f1.0(8)   null            g117<8,8,1>D    0D              { align1 WE_normal 2Q };
>     (      42)  mov(16)         g106<1>F        -nanF                           { align1 WE_normal 1H };
>     (      44)  (+f0) sel(16)   g98<1>D         g106<8,8,1>D    0D              { align1 WE_normal 1H };
>     (      46)  (+f1.1) sel(16) g100<1>D        g106<8,8,1>D    0D              { align1 WE_normal 1H };
>     (      48)  (+f0.1) sel(16) g102<1>D        g106<8,8,1>D    0D              { align1 WE_normal 1H };
>     (      50)  (+f1) sel(16)   g104<1>D        g106<8,8,1>D    0D              { align1 WE_normal 1H };
>     (      52)  add(16)         g96<1>D         g1.3<0,1,0>D    g120<8,8,1>D    { align1 WE_normal 1H };
>     (      54)  send(16)        null            g96<8,8,1>UD
>                 data (bti: 1, rgba: 0, SIMD16, legacy, Untyped Surface Write) mlen 10 rlen 0 { align1 WE_normal 1H };
>     (      56)  mov(16)         g2<1>UW         0x1UW                           { align1 WE_normal 1H };
>     (      58)  endif(16) 2                     null                            { align1 WE_normal 1H };
> 
> It reduces the instruction count from 25 to 18, saving about 28% of the instructions.
> 
> v2:
> Fix some minor bugs.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at gmail.com>

