[Mesa-dev] [PATCH 3/3] llvmpipe: drop scissor planes early if the tri is fully inside them

Tue Feb 2 01:45:38 UTC 2016

Am 01.02.2016 um 17:14 schrieb Brian Paul:
> On 01/31/2016 06:00 PM, sroland at vmware.com wrote:
>> From: Roland Scheidegger <sroland at vmware.com>
>>
>> If the tri is fully inside a scissor edge (or rather, we just use the
>> bounding box of the tri for the comparison), then we can drop these
>> additional scissor "planes" early. We do not even need to allocate
>> space for them in the tri.
>> The math actually appears to be slightly iffy due to bounding boxes
>> being rounded, but it doesn't matter in the end.
>> Those scissor rects are costly - the 4 planes from the scissor are
>> already more expensive to calculate than the 3 planes from the tri
>> itself,
>> and it also prevents us from using the specialized raster code for small
>> tris.
>> This helps openarena performance by about 8% or so. Of course, it helps
>> there that while openarena often enables scissoring (and even moves the
>> scissor rect around) I have not seen a single tri actually hit the
>> scissor rect, ever.
>>
>> v2: drop individual scissor edges, and do it earlier, not even allocating
>> space for them.
>> ---
>>   src/gallium/drivers/llvmpipe/lp_rast_tri.c   | 16 +++++
>>   src/gallium/drivers/llvmpipe/lp_setup_line.c | 81
>> ++++++++++++++---------
>>   src/gallium/drivers/llvmpipe/lp_setup_tri.c  | 98
>> +++++++++++++++++-----------
>>   3 files changed, 126 insertions(+), 69 deletions(-)
>>
>> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
>> b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
>> index f4a2f02..bf27900 100644
>> --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
>> +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
>> @@ -380,11 +380,27 @@ lp_rast_triangle_32_3_4(struct
>> lp_rasterizer_task *task,
>>       */
>>      dcdx = _mm_sub_epi32(zero, dcdx);
>>
>> +#if 0
>> +   {
>> +   __m128i lbits, fixup, xy, dcdxdy, dcdx16;
>> +   lbits = _mm_and_si128(c, _mm_set1_epi32(FIXED_ONE - 1));
>> +   fixup = _mm_cmpeq_epi32(lbits, _mm_setzero_si128());
>> +   c = _mm_srai_epi32(c, 8);
>> +   c = _mm_add_epi32(c, fixup);
>> +   xy = _mm_set1_epi32(x | y << 16);
>> +   dcdx = _mm_srai_epi32(dcdx, 8);
>> +   dcdy = _mm_srai_epi32(dcdy, 8);
>> +   dcdx16 = _mm_and_si128(_mm_set_epi16(0,-1,0,-1,0,-1,0,-1), dcdx);
>> +   dcdxdy = _mm_or_si128(dcdx16, _mm_slli_epi32(dcdy, 16));
>> +   c = _mm_add_epi32(c, _mm_madd_epi16(dcdxdy, xy));
>> +   }
>> +#else
>>      c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
>>      c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
>>
>>      /* Adjust so we can just check the sign bit (< 0 comparison),
>> instead of having to do a less efficient <= 0 comparison */
>>      c = _mm_sub_epi32(c, _mm_set1_epi32(1));
>> +#endif
>>
>>      dcdx2 = _mm_add_epi32(dcdx, dcdx);
>>      dcdx3 = _mm_add_epi32(dcdx2, dcdx);
>> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c
>> b/src/gallium/drivers/llvmpipe/lp_setup_line.c
>> index f425825..3ec9ac4 100644
>> --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
>> +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
>> @@ -336,13 +336,6 @@ try_setup_line( struct lp_setup_context *setup,
>>         layer = MIN2(layer, scene->fb_max_layer);
>>      }
>>
>> -   if (setup->scissor_test) {
>> -      nr_planes = 8;
>> -   }
>> -   else {
>> -      nr_planes = 4;
>> -   }
>> -
>>      dx = v1[0][0] - v2[0][0];
>>      dy = v1[0][1] - v2[0][1];
>>      area = (dx * dx  + dy * dy);
>> @@ -591,6 +584,20 @@ try_setup_line( struct lp_setup_context *setup,
>>      bbox.x0 = MAX2(bbox.x0, 0);
>>      bbox.y0 = MAX2(bbox.y0, 0);
>>
>> +   nr_planes = 4;
>> +   /*
>> +    * Determine how many scissor planes we need, that is drop scissor
>> +    * edges if the bounding box of the tri is fully inside that edge.
>> +    */
>> +   if (setup->scissor_test) {
>> +      /* why not just use draw_regions */
>> +      struct u_rect *scissor = &setup->scissors[viewport_index];
>> +      if (bbox.x0 < scissor->x0) nr_planes++;
>> +      if (bbox.x1 > scissor->x1) nr_planes++;
>> +      if (bbox.y0 < scissor->y0) nr_planes++;
>> +      if (bbox.y1 > scissor->y1) nr_planes++;
>> +   }
>> +
>>      line = lp_setup_alloc_triangle(scene,
>>                                     key->num_inputs,
>>                                     nr_planes,
>> @@ -708,30 +715,44 @@ try_setup_line( struct lp_setup_context *setup,
>>       * Note that otherwise, the scissor planes only vary in 'C' value,
>>       * and even then only on state-changes.  Could alternatively store
>>       * these planes elsewhere.
>> +    * (Or only store the c value together with a bit indicating which
>> +    * scissor edge this is, so rasterization would treat them
>> differently
>> +    * (easier to evaluate) to ordinary planes.)
>>       */
>> -   if (nr_planes == 8) {
>> -      const struct u_rect *scissor =
>> -         &setup->scissors[viewport_index];
>> -
>> -      plane[4].dcdx = -1 << 8;
>> -      plane[4].dcdy = 0;
>> -      plane[4].c = (1-scissor->x0) << 8;
>> -      plane[4].eo = 1 << 8;
>> -
>> -      plane[5].dcdx = 1 << 8;
>> -      plane[5].dcdy = 0;
>> -      plane[5].c = (scissor->x1+1) << 8;
>> -      plane[5].eo = 0;
>> -
>> -      plane[6].dcdx = 0;
>> -      plane[6].dcdy = 1 << 8;
>> -      plane[6].c = (1-scissor->y0) << 8;
>> -      plane[6].eo = 1 << 8;
>> -
>> -      plane[7].dcdx = 0;
>> -      plane[7].dcdy = -1 << 8;
>> -      plane[7].c = (scissor->y1+1) << 8;
>> -      plane[7].eo = 0;
>> +   if (nr_planes > 4) {
>> +      /* why not just use draw_regions */
>> +      struct u_rect *scissor = &setup->scissors[viewport_index];
>> +      unsigned scis_index = 4;
>> +
>> +      if (bbox.x0 < scissor->x0) {
>> +         plane[scis_index].dcdx = -1 << 8;
>> +         plane[scis_index].dcdy = 0;
>> +         plane[scis_index].c = (1-scissor->x0) << 8;
>> +         plane[scis_index].eo = 1 << 8;
>> +         scis_index++;
>> +      }
>> +      if (bbox.x1 > scissor->x1) {
>> +         plane[scis_index].dcdx = 1 << 8;
>> +         plane[scis_index].dcdy = 0;
>> +         plane[scis_index].c = (scissor->x1+1) << 8;
>> +         plane[scis_index].eo = 0 << 8;
>> +         scis_index++;
>> +      }
>> +      if (bbox.y0 < scissor->y0) {
>> +         plane[scis_index].dcdx = 0;
>> +         plane[scis_index].dcdy = 1 << 8;
>> +         plane[scis_index].c = (1-scissor->y0) << 8;
>> +         plane[scis_index].eo = 1 << 8;
>> +         scis_index++;
>> +      }
>> +      if (bbox.y1 > scissor->y1) {
>> +         plane[scis_index].dcdx = 0;
>> +         plane[scis_index].dcdy = -1 << 8;
>> +         plane[scis_index].c = (scissor->y1+1) << 8;
>> +         plane[scis_index].eo = 0;
>> +         scis_index++;
>> +      }
>> +      assert(scis_index == nr_planes);
>>      }
>>
>>      return lp_setup_bin_triangle(setup, line, &bbox, nr_planes,
>> viewport_index);
>> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
>> b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
>> index 1e3a750..4e9acce 100644
>> --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
>> +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
>> @@ -302,13 +302,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
>>         layer = MIN2(layer, scene->fb_max_layer);
>>      }
>>
>> -   if (setup->scissor_test) {
>> -      nr_planes = 7;
>> -   }
>> -   else {
>> -      nr_planes = 3;
>> -   }
>> -
>>      /* Bounding rectangle (in pixels) */
>>      {
>>         /* Yes this is necessary to accurately calculate bounding boxes
>> @@ -347,6 +340,20 @@ do_triangle_ccw(struct lp_setup_context *setup,
>>      bbox.x0 = MAX2(bbox.x0, 0);
>>      bbox.y0 = MAX2(bbox.y0, 0);
>>
>> +   nr_planes = 3;
>> +   /*
>> +    * Determine how many scissor planes we need, that is drop scissor
>> +    * edges if the bounding box of the tri is fully inside that edge.
>> +    */
>> +   if (setup->scissor_test) {
>> +      /* why not just use draw_regions */
>> +      struct u_rect *scissor = &setup->scissors[viewport_index];
>> +      if (bbox.x0 < scissor->x0) nr_planes++;
>> +      if (bbox.x1 > scissor->x1) nr_planes++;
>> +      if (bbox.y0 < scissor->y0) nr_planes++;
>> +      if (bbox.y1 > scissor->y1) nr_planes++;
> 
> Minor nit:
> 
> I like having the 'true' code on the line after the 'if' just in case
> I'd need to put a breakpoint in there to debug something someday.
Yes you're right that's bad style.

> 
> BTW, I was curious if this approach would generate better code:
> 
> nr_planes += (bbox.x0 < scissor->x0);
> nr_planes += (bbox.x1 > scissor->x1);
> nr_planes += (bbox.y0 < scissor->y0);
> nr_planes += (bbox.y1 > scissor->y1);
> 
> But with -O3 at least, you get identical (branch-less) code.
I guess without optimizations, it could indeed be different. But of
course performance isn't exactly the most important metric then...
So, it's mostly a matter of style.
I tend to trust the compiler to figure out how to eliminate such trivial
branches, albeit sometimes it really does weird stuff - like here:

   // nr_planes = 3 - unused in the branch...
   ..                           mov    $0x3,%r11d
   // if no scissor test
   0x00007ffff65cee51 <+609>:   je     0x7ffff65cee9f <triangle_ccw+687>
   // set reg to 0 (set instructions are all byte)
   0x00007ffff65cee53 <+611>:   xor    %r11d,%r11d
   // scissor box starts at 0x350(%r14), test left edge
   0x00007ffff65cee56 <+614>:   cmp    0x350(%r14),%r8d
   0x00007ffff65cee5d <+621>:   setl   %r11b
   0x00007ffff65cee61 <+625>:   xor    %esi,%esi
   // r11d will be either 4 or 5...
   0x00007ffff65cee63 <+627>:   add    $0x4,%r11d
   // omg gcc tries to be clever here but that looks like fail...
   // test left edge again...
   0x00007ffff65cee67 <+631>:   cmp    0x350(%r14),%r8d
   0x00007ffff65cee6e <+638>:   setl   %sil
   // esi is either 3 or 4...
   0x00007ffff65cee72 <+642>:   add    $0x3,%esi
   0x00007ffff65cee75 <+645>:   cmp    0x354(%r14),%edx
   //... and finally, the cmov - gcc has sucessfuly avoided
   // having to do math here for the right edge, reducing it to cmov...
   0x00007ffff65cee7c <+652>:   cmovle %esi,%r11d
   // top edge - looks ok
   0x00007ffff65cee80 <+656>:   xor    %edx,%edx
   0x00007ffff65cee82 <+658>:   cmp    0x358(%r14),%ecx
   0x00007ffff65cee89 <+665>:   setl   %dl
   0x00007ffff65cee8c <+668>:   add    %edx,%r11d
   // bottom edge - looks ok again albeit I wonder why gcc ended up
   // with xor/set in previous case whereas set/movzbl here...
   0x00007ffff65cee8f <+671>:   cmp    0x35c(%r14),%eax
   0x00007ffff65cee96 <+678>:   setg   %al
   0x00007ffff65cee99 <+681>:   movzbl %al,%eax
   0x00007ffff65cee9c <+684>:   add    %eax,%r11d

With your suggestion, I got (gcc moved the code having scissor around):
   0x00007ffff65cf2b1 <+1745>:  xor    %esi,%esi
   0x00007ffff65cf2b3 <+1747>:  cmp    0x350(%r10),%r11d
   0x00007ffff65cf2ba <+1754>:  setl   %sil
   0x00007ffff65cf2be <+1758>:  cmp    0x354(%r10),%edx
   0x00007ffff65cf2c5 <+1765>:  setg   %dl
   0x00007ffff65cf2c8 <+1768>:  movzbl %dl,%edx
   // of course, lea used as multi-add...
   0x00007ffff65cf2cb <+1771>:  lea    0x3(%rsi,%rdx,1),%r11d
   0x00007ffff65cf2d0 <+1776>:  xor    %edx,%edx
   0x00007ffff65cf2d2 <+1778>:  cmp    0x358(%r10),%ecx
   0x00007ffff65cf2d9 <+1785>:  setl   %dl
   0x00007ffff65cf2dc <+1788>:  add    %r11d,%edx
   0x00007ffff65cf2df <+1791>:  xor    %r11d,%r11d
   0x00007ffff65cf2e2 <+1794>:  cmp    0x35c(%r10),%eax
   0x00007ffff65cf2e9 <+1801>:  setg   %r11b
   0x00007ffff65cf2ed <+1805>:  add    %edx,%r11d

which definitely looks better - albeit I'd say gcc really should have
done this without a helping hand (it figured things out for the
top/bottom edges, I suppose it just hit some other optimization
opportunity for the other two edges)...

In any case, I think generally it's not worth bothering about the
compiler potentially being stupid. But I changed the code in any case,
because with the style issue you pointed out it would have been 8 lines
and that way it's just 4 ;-).
Of course, even simpler would be to only having to test the scissor
edges once. This would be possible, at the expense of having to waste
more binner memory (always just allocate space for 4 scissor planes if
there's scissor test).

Roland

> 
> 
>> +   }
>> +
>>      tri = lp_setup_alloc_triangle(scene,
>>                                    key->num_inputs,
>>                                    nr_planes,
>> @@ -367,13 +374,11 @@ do_triangle_ccw(struct lp_setup_context *setup,
>>
>>      /* Setup parameter interpolants:
>>       */
>> -   setup->setup.variant->jit_function( v0,
>> -                       v1,
>> -                       v2,
>> -                       frontfacing,
>> -                       GET_A0(&tri->inputs),
>> -                       GET_DADX(&tri->inputs),
>> -                       GET_DADY(&tri->inputs) );
>> +   setup->setup.variant->jit_function(v0, v1, v2,
>> +                                      frontfacing,
>> +                                      GET_A0(&tri->inputs),
>> +                                      GET_DADX(&tri->inputs),
>> +                                      GET_DADY(&tri->inputs));
>>
>>      tri->inputs.frontfacing = frontfacing;
>>      tri->inputs.disable = FALSE;
>> @@ -383,9 +388,9 @@ do_triangle_ccw(struct lp_setup_context *setup,
>>
>>      if (0)
>>         lp_dump_setup_coef(&setup->setup.variant->key,
>> -             (const float (*)[4])GET_A0(&tri->inputs),
>> -             (const float (*)[4])GET_DADX(&tri->inputs),
>> -             (const float (*)[4])GET_DADY(&tri->inputs));
>> +                         (const float (*)[4])GET_A0(&tri->inputs),
>> +                         (const float (*)[4])GET_DADX(&tri->inputs),
>> +                         (const float (*)[4])GET_DADY(&tri->inputs));
>>
>>      plane = GET_PLANES(tri);
>>
>> @@ -672,29 +677,44 @@ do_triangle_ccw(struct lp_setup_context *setup,
>>       * Note that otherwise, the scissor planes only vary in 'C' value,
>>       * and even then only on state-changes.  Could alternatively store
>>       * these planes elsewhere.
>> +    * (Or only store the c value together with a bit indicating which
>> +    * scissor edge this is, so rasterization would treat them
>> differently
>> +    * (easier to evaluate) to ordinary planes.)
>>       */
>> -   if (nr_planes == 7) {
>> -      const struct u_rect *scissor = &setup->scissors[viewport_index];
>> -
>> -      plane[3].dcdx = -1 << 8;
>> -      plane[3].dcdy = 0;
>> -      plane[3].c = (1-scissor->x0) << 8;
>> -      plane[3].eo = 1 << 8;
>> -
>> -      plane[4].dcdx = 1 << 8;
>> -      plane[4].dcdy = 0;
>> -      plane[4].c = (scissor->x1+1) << 8;
>> -      plane[4].eo = 0;
>> -
>> -      plane[5].dcdx = 0;
>> -      plane[5].dcdy = 1 << 8;
>> -      plane[5].c = (1-scissor->y0) << 8;
>> -      plane[5].eo = 1 << 8;
>> -
>> -      plane[6].dcdx = 0;
>> -      plane[6].dcdy = -1 << 8;
>> -      plane[6].c = (scissor->y1+1) << 8;
>> -      plane[6].eo = 0;
>> +   if (nr_planes > 3) {
>> +      /* why not just use draw_regions */
>> +      struct u_rect *scissor = &setup->scissors[viewport_index];
>> +      unsigned scis_index = 3;
>> +
>> +      if (bbox.x0 < scissor->x0) {
>> +         plane[scis_index].dcdx = -1 << 8;
>> +         plane[scis_index].dcdy = 0;
>> +         plane[scis_index].c = (1-scissor->x0) << 8;
>> +         plane[scis_index].eo = 1 << 8;
>> +         scis_index++;
>> +      }
>> +      if (bbox.x1 > scissor->x1) {
>> +         plane[scis_index].dcdx = 1 << 8;
>> +         plane[scis_index].dcdy = 0;
>> +         plane[scis_index].c = (scissor->x1+1) << 8;
>> +         plane[scis_index].eo = 0 << 8;
>> +         scis_index++;
>> +      }
>> +      if (bbox.y0 < scissor->y0) {
>> +         plane[scis_index].dcdx = 0;
>> +         plane[scis_index].dcdy = 1 << 8;
>> +         plane[scis_index].c = (1-scissor->y0) << 8;
>> +         plane[scis_index].eo = 1 << 8;
>> +         scis_index++;
>> +      }
>> +      if (bbox.y1 > scissor->y1) {
>> +         plane[scis_index].dcdx = 0;
>> +         plane[scis_index].dcdy = -1 << 8;
>> +         plane[scis_index].c = (scissor->y1+1) << 8;
>> +         plane[scis_index].eo = 0;
>> +         scis_index++;
>> +      }
>> +      assert(scis_index == nr_planes);
>>      }
> 
> I haven't ever examined the code, but I wonder if code like the above
> would be any more efficient if the 'plane' pointer were incremented
> inside each conditional, rather than an array index.  Probably not a big
> deal.

I thought "surely the compiler would recognize that and do that anyway".
But nope, fail again...
With arrays, what happened is gcc actually used essentially 4 fixed
offsets, then played tricks with them based on branch taken or not. But
of course, just incrementing the plane pointer looks much simpler.
I suppose my trust in compilers isn't entirely warranted... Though I
doubt those couple extra mov's would really make a difference.

One day though I'd like to address some _real_ performance problems in
llvmpipe (for starters, the big one would be that rasterization can't
run in parallel to the main thread - either raster threads are idle or
the main thread is waiting for them to finish...).

Roland