[Mesa-dev] [PATCH 3/3] llvmpipe: drop scissor planes early if the tri is fully inside them

Brian Paul brianp at vmware.com
Mon Feb 1 16:14:48 UTC 2016


On 01/31/2016 06:00 PM, sroland at vmware.com wrote:
> From: Roland Scheidegger <sroland at vmware.com>
>
> If the tri is fully inside a scissor edge (or rather, we just use the
> bounding box of the tri for the comparison), then we can drop these
> additional scissor "planes" early. We do not even need to allocate
> space for them in the tri.
> The math actually appears to be slightly iffy due to bounding boxes
> being rounded, but it doesn't matter in the end.
> Those scissor rects are costly - the 4 planes from the scissor are
> already more expensive to calculate than the 3 planes from the tri itself,
> and it also prevents us from using the specialized raster code for small
> tris.
> This helps openarena performance by about 8% or so. Of course, it helps
> there that while openarena often enables scissoring (and even moves the
> scissor rect around) I have not seen a single tri actually hit the
> scissor rect, ever.
>
> v2: drop individual scissor edges, and do it earlier, not even allocating
> space for them.
> ---
>   src/gallium/drivers/llvmpipe/lp_rast_tri.c   | 16 +++++
>   src/gallium/drivers/llvmpipe/lp_setup_line.c | 81 ++++++++++++++---------
>   src/gallium/drivers/llvmpipe/lp_setup_tri.c  | 98 +++++++++++++++++-----------
>   3 files changed, 126 insertions(+), 69 deletions(-)
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> index f4a2f02..bf27900 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> @@ -380,11 +380,27 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
>       */
>      dcdx = _mm_sub_epi32(zero, dcdx);
>
> +#if 0
> +   {
> +   __m128i lbits, fixup, xy, dcdxdy, dcdx16;
> +   lbits = _mm_and_si128(c, _mm_set1_epi32(FIXED_ONE - 1));
> +   fixup = _mm_cmpeq_epi32(lbits, _mm_setzero_si128());
> +   c = _mm_srai_epi32(c, 8);
> +   c = _mm_add_epi32(c, fixup);
> +   xy = _mm_set1_epi32(x | y << 16);
> +   dcdx = _mm_srai_epi32(dcdx, 8);
> +   dcdy = _mm_srai_epi32(dcdy, 8);
> +   dcdx16 = _mm_and_si128(_mm_set_epi16(0,-1,0,-1,0,-1,0,-1), dcdx);
> +   dcdxdy = _mm_or_si128(dcdx16, _mm_slli_epi32(dcdy, 16));
> +   c = _mm_add_epi32(c, _mm_madd_epi16(dcdxdy, xy));
> +   }
> +#else
>      c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
>      c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
>
>      /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
>      c = _mm_sub_epi32(c, _mm_set1_epi32(1));
> +#endif
>
>      dcdx2 = _mm_add_epi32(dcdx, dcdx);
>      dcdx3 = _mm_add_epi32(dcdx2, dcdx);
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
> index f425825..3ec9ac4 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
> @@ -336,13 +336,6 @@ try_setup_line( struct lp_setup_context *setup,
>         layer = MIN2(layer, scene->fb_max_layer);
>      }
>
> -   if (setup->scissor_test) {
> -      nr_planes = 8;
> -   }
> -   else {
> -      nr_planes = 4;
> -   }
> -
>      dx = v1[0][0] - v2[0][0];
>      dy = v1[0][1] - v2[0][1];
>      area = (dx * dx  + dy * dy);
> @@ -591,6 +584,20 @@ try_setup_line( struct lp_setup_context *setup,
>      bbox.x0 = MAX2(bbox.x0, 0);
>      bbox.y0 = MAX2(bbox.y0, 0);
>
> +   nr_planes = 4;
> +   /*
> +    * Determine how many scissor planes we need, that is drop scissor
> +    * edges if the bounding box of the tri is fully inside that edge.
> +    */
> +   if (setup->scissor_test) {
> +      /* why not just use draw_regions */
> +      struct u_rect *scissor = &setup->scissors[viewport_index];
> +      if (bbox.x0 < scissor->x0) nr_planes++;
> +      if (bbox.x1 > scissor->x1) nr_planes++;
> +      if (bbox.y0 < scissor->y0) nr_planes++;
> +      if (bbox.y1 > scissor->y1) nr_planes++;
> +   }
> +
>      line = lp_setup_alloc_triangle(scene,
>                                     key->num_inputs,
>                                     nr_planes,
> @@ -708,30 +715,44 @@ try_setup_line( struct lp_setup_context *setup,
>       * Note that otherwise, the scissor planes only vary in 'C' value,
>       * and even then only on state-changes.  Could alternatively store
>       * these planes elsewhere.
> +    * (Or only store the c value together with a bit indicating which
> +    * scissor edge this is, so rasterization would treat them differently
> +    * (easier to evaluate) to ordinary planes.)
>       */
> -   if (nr_planes == 8) {
> -      const struct u_rect *scissor =
> -         &setup->scissors[viewport_index];
> -
> -      plane[4].dcdx = -1 << 8;
> -      plane[4].dcdy = 0;
> -      plane[4].c = (1-scissor->x0) << 8;
> -      plane[4].eo = 1 << 8;
> -
> -      plane[5].dcdx = 1 << 8;
> -      plane[5].dcdy = 0;
> -      plane[5].c = (scissor->x1+1) << 8;
> -      plane[5].eo = 0;
> -
> -      plane[6].dcdx = 0;
> -      plane[6].dcdy = 1 << 8;
> -      plane[6].c = (1-scissor->y0) << 8;
> -      plane[6].eo = 1 << 8;
> -
> -      plane[7].dcdx = 0;
> -      plane[7].dcdy = -1 << 8;
> -      plane[7].c = (scissor->y1+1) << 8;
> -      plane[7].eo = 0;
> +   if (nr_planes > 4) {
> +      /* why not just use draw_regions */
> +      struct u_rect *scissor = &setup->scissors[viewport_index];
> +      unsigned scis_index = 4;
> +
> +      if (bbox.x0 < scissor->x0) {
> +         plane[scis_index].dcdx = -1 << 8;
> +         plane[scis_index].dcdy = 0;
> +         plane[scis_index].c = (1-scissor->x0) << 8;
> +         plane[scis_index].eo = 1 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.x1 > scissor->x1) {
> +         plane[scis_index].dcdx = 1 << 8;
> +         plane[scis_index].dcdy = 0;
> +         plane[scis_index].c = (scissor->x1+1) << 8;
> +         plane[scis_index].eo = 0 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.y0 < scissor->y0) {
> +         plane[scis_index].dcdx = 0;
> +         plane[scis_index].dcdy = 1 << 8;
> +         plane[scis_index].c = (1-scissor->y0) << 8;
> +         plane[scis_index].eo = 1 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.y1 > scissor->y1) {
> +         plane[scis_index].dcdx = 0;
> +         plane[scis_index].dcdy = -1 << 8;
> +         plane[scis_index].c = (scissor->y1+1) << 8;
> +         plane[scis_index].eo = 0;
> +         scis_index++;
> +      }
> +      assert(scis_index == nr_planes);
>      }
>
>      return lp_setup_bin_triangle(setup, line, &bbox, nr_planes, viewport_index);
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> index 1e3a750..4e9acce 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> @@ -302,13 +302,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
>         layer = MIN2(layer, scene->fb_max_layer);
>      }
>
> -   if (setup->scissor_test) {
> -      nr_planes = 7;
> -   }
> -   else {
> -      nr_planes = 3;
> -   }
> -
>      /* Bounding rectangle (in pixels) */
>      {
>         /* Yes this is necessary to accurately calculate bounding boxes
> @@ -347,6 +340,20 @@ do_triangle_ccw(struct lp_setup_context *setup,
>      bbox.x0 = MAX2(bbox.x0, 0);
>      bbox.y0 = MAX2(bbox.y0, 0);
>
> +   nr_planes = 3;
> +   /*
> +    * Determine how many scissor planes we need, that is drop scissor
> +    * edges if the bounding box of the tri is fully inside that edge.
> +    */
> +   if (setup->scissor_test) {
> +      /* why not just use draw_regions */
> +      struct u_rect *scissor = &setup->scissors[viewport_index];
> +      if (bbox.x0 < scissor->x0) nr_planes++;
> +      if (bbox.x1 > scissor->x1) nr_planes++;
> +      if (bbox.y0 < scissor->y0) nr_planes++;
> +      if (bbox.y1 > scissor->y1) nr_planes++;

Minor nit:

I like having the 'true' code on the line after the 'if' just in case 
I'd need to put a breakpoint in there to debug something someday.

BTW, I was curious if this approach would generate better code:

nr_planes += (bbox.x0 < scissor->x0);
nr_planes += (bbox.x1 > scissor->x1);
nr_planes += (bbox.y0 < scissor->y0);
nr_planes += (bbox.y1 > scissor->y1);

But with -O3 at least, you get identical (branch-less) code.


> +   }
> +
>      tri = lp_setup_alloc_triangle(scene,
>                                    key->num_inputs,
>                                    nr_planes,
> @@ -367,13 +374,11 @@ do_triangle_ccw(struct lp_setup_context *setup,
>
>      /* Setup parameter interpolants:
>       */
> -   setup->setup.variant->jit_function( v0,
> -				       v1,
> -				       v2,
> -				       frontfacing,
> -				       GET_A0(&tri->inputs),
> -				       GET_DADX(&tri->inputs),
> -				       GET_DADY(&tri->inputs) );
> +   setup->setup.variant->jit_function(v0, v1, v2,
> +                                      frontfacing,
> +                                      GET_A0(&tri->inputs),
> +                                      GET_DADX(&tri->inputs),
> +                                      GET_DADY(&tri->inputs));
>
>      tri->inputs.frontfacing = frontfacing;
>      tri->inputs.disable = FALSE;
> @@ -383,9 +388,9 @@ do_triangle_ccw(struct lp_setup_context *setup,
>
>      if (0)
>         lp_dump_setup_coef(&setup->setup.variant->key,
> -			 (const float (*)[4])GET_A0(&tri->inputs),
> -			 (const float (*)[4])GET_DADX(&tri->inputs),
> -			 (const float (*)[4])GET_DADY(&tri->inputs));
> +                         (const float (*)[4])GET_A0(&tri->inputs),
> +                         (const float (*)[4])GET_DADX(&tri->inputs),
> +                         (const float (*)[4])GET_DADY(&tri->inputs));
>
>      plane = GET_PLANES(tri);
>
> @@ -672,29 +677,44 @@ do_triangle_ccw(struct lp_setup_context *setup,
>       * Note that otherwise, the scissor planes only vary in 'C' value,
>       * and even then only on state-changes.  Could alternatively store
>       * these planes elsewhere.
> +    * (Or only store the c value together with a bit indicating which
> +    * scissor edge this is, so rasterization would treat them differently
> +    * (easier to evaluate) to ordinary planes.)
>       */
> -   if (nr_planes == 7) {
> -      const struct u_rect *scissor = &setup->scissors[viewport_index];
> -
> -      plane[3].dcdx = -1 << 8;
> -      plane[3].dcdy = 0;
> -      plane[3].c = (1-scissor->x0) << 8;
> -      plane[3].eo = 1 << 8;
> -
> -      plane[4].dcdx = 1 << 8;
> -      plane[4].dcdy = 0;
> -      plane[4].c = (scissor->x1+1) << 8;
> -      plane[4].eo = 0;
> -
> -      plane[5].dcdx = 0;
> -      plane[5].dcdy = 1 << 8;
> -      plane[5].c = (1-scissor->y0) << 8;
> -      plane[5].eo = 1 << 8;
> -
> -      plane[6].dcdx = 0;
> -      plane[6].dcdy = -1 << 8;
> -      plane[6].c = (scissor->y1+1) << 8;
> -      plane[6].eo = 0;
> +   if (nr_planes > 3) {
> +      /* why not just use draw_regions */
> +      struct u_rect *scissor = &setup->scissors[viewport_index];
> +      unsigned scis_index = 3;
> +
> +      if (bbox.x0 < scissor->x0) {
> +         plane[scis_index].dcdx = -1 << 8;
> +         plane[scis_index].dcdy = 0;
> +         plane[scis_index].c = (1-scissor->x0) << 8;
> +         plane[scis_index].eo = 1 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.x1 > scissor->x1) {
> +         plane[scis_index].dcdx = 1 << 8;
> +         plane[scis_index].dcdy = 0;
> +         plane[scis_index].c = (scissor->x1+1) << 8;
> +         plane[scis_index].eo = 0 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.y0 < scissor->y0) {
> +         plane[scis_index].dcdx = 0;
> +         plane[scis_index].dcdy = 1 << 8;
> +         plane[scis_index].c = (1-scissor->y0) << 8;
> +         plane[scis_index].eo = 1 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.y1 > scissor->y1) {
> +         plane[scis_index].dcdx = 0;
> +         plane[scis_index].dcdy = -1 << 8;
> +         plane[scis_index].c = (scissor->y1+1) << 8;
> +         plane[scis_index].eo = 0;
> +         scis_index++;
> +      }
> +      assert(scis_index == nr_planes);
>      }

I haven't ever examined the code, but I wonder if code like the above 
would be any more efficient if the 'plane' pointer were incremented 
inside each conditional, rather than an array index.  Probably not a big 
deal.

-Brian



>
>      return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, viewport_index);
>



More information about the mesa-dev mailing list