[Mesa-dev] [PATCH 3/3] llvmpipe: drop scissor planes early if the tri is fully inside them

Sun Jan 31 17:34:56 PST 2016

Am 01.02.2016 um 02:00 schrieb sroland at vmware.com:
> From: Roland Scheidegger <sroland at vmware.com>
> 
> If the tri is fully inside a scissor edge (or rather, we just use the
> bounding box of the tri for the comparison), then we can drop these
> additional scissor "planes" early. We do not even need to allocate
> space for them in the tri.
> The math actually appears to be slightly iffy due to bounding boxes
> being rounded, but it doesn't matter in the end.
> Those scissor rects are costly - the 4 planes from the scissor are
> already more expensive to calculate than the 3 planes from the tri itself,
> and it also prevents us from using the specialized raster code for small
> tris.
> This helps openarena performance by about 8% or so. Of course, it helps
> there that while openarena often enables scissoring (and even moves the
> scissor rect around), I have not seen a single tri actually hit the
> scissor rect, ever.
> 
> v2: drop individual scissor edges, and do it earlier, not even allocating
> space for them.
> ---
>  src/gallium/drivers/llvmpipe/lp_rast_tri.c   | 16 +++++
>  src/gallium/drivers/llvmpipe/lp_setup_line.c | 81 ++++++++++++++---------
>  src/gallium/drivers/llvmpipe/lp_setup_tri.c  | 98 +++++++++++++++++-----------
>  3 files changed, 126 insertions(+), 69 deletions(-)
> 
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> index f4a2f02..bf27900 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> @@ -380,11 +380,27 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
>      */
>     dcdx = _mm_sub_epi32(zero, dcdx);
>  
> +#if 0
> +   {
> +   __m128i lbits, fixup, xy, dcdxdy, dcdx16;
> +   lbits = _mm_and_si128(c, _mm_set1_epi32(FIXED_ONE - 1));
> +   fixup = _mm_cmpeq_epi32(lbits, _mm_setzero_si128());
> +   c = _mm_srai_epi32(c, 8);
> +   c = _mm_add_epi32(c, fixup);
> +   xy = _mm_set1_epi32(x | y << 16);
> +   dcdx = _mm_srai_epi32(dcdx, 8);
> +   dcdy = _mm_srai_epi32(dcdy, 8);
> +   dcdx16 = _mm_and_si128(_mm_set_epi16(0,-1,0,-1,0,-1,0,-1), dcdx);
> +   dcdxdy = _mm_or_si128(dcdx16, _mm_slli_epi32(dcdy, 16));
> +   c = _mm_add_epi32(c, _mm_madd_epi16(dcdxdy, xy));
> +   }
> +#else
>     c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
>     c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
>  
>     /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
>     c = _mm_sub_epi32(c, _mm_set1_epi32(1));
> +#endif
Oops, disregard this hunk. It's part of an experiment: use the same
down-shifting logic for the calculations that the generic rasterization
functions use, plus a different fixup calculation (with logic ops rather
than the subs used elsewhere), and, just because I can, replace the two
emulated 32-bit muls with a single 16-bit madd. All in all that's quite a
bit cheaper. The same could be done for the 32_3_16 func as well - it
works well, so maybe I'll look at that again another time.

Roland


>  
>     dcdx2 = _mm_add_epi32(dcdx, dcdx);
>     dcdx3 = _mm_add_epi32(dcdx2, dcdx);
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
> index f425825..3ec9ac4 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
> @@ -336,13 +336,6 @@ try_setup_line( struct lp_setup_context *setup,
>        layer = MIN2(layer, scene->fb_max_layer);
>     }
>  
> -   if (setup->scissor_test) {
> -      nr_planes = 8;
> -   }
> -   else {
> -      nr_planes = 4;
> -   }
> -
>     dx = v1[0][0] - v2[0][0];
>     dy = v1[0][1] - v2[0][1];
>     area = (dx * dx  + dy * dy);
> @@ -591,6 +584,20 @@ try_setup_line( struct lp_setup_context *setup,
>     bbox.x0 = MAX2(bbox.x0, 0);
>     bbox.y0 = MAX2(bbox.y0, 0);
>  
> +   nr_planes = 4;
> +   /*
> +    * Determine how many scissor planes we need, that is drop scissor
> +    * edges if the bounding box of the tri is fully inside that edge.
> +    */
> +   if (setup->scissor_test) {
> +      /* why not just use draw_regions */
> +      struct u_rect *scissor = &setup->scissors[viewport_index];
> +      if (bbox.x0 < scissor->x0) nr_planes++;
> +      if (bbox.x1 > scissor->x1) nr_planes++;
> +      if (bbox.y0 < scissor->y0) nr_planes++;
> +      if (bbox.y1 > scissor->y1) nr_planes++;
> +   }
> +
>     line = lp_setup_alloc_triangle(scene,
>                                    key->num_inputs,
>                                    nr_planes,
> @@ -708,30 +715,44 @@ try_setup_line( struct lp_setup_context *setup,
>      * Note that otherwise, the scissor planes only vary in 'C' value,
>      * and even then only on state-changes.  Could alternatively store
>      * these planes elsewhere.
> +    * (Or only store the c value together with a bit indicating which
> +    * scissor edge this is, so rasterization would treat them differently
> +    * (easier to evaluate) to ordinary planes.)
>      */
> -   if (nr_planes == 8) {
> -      const struct u_rect *scissor =
> -         &setup->scissors[viewport_index];
> -
> -      plane[4].dcdx = -1 << 8;
> -      plane[4].dcdy = 0;
> -      plane[4].c = (1-scissor->x0) << 8;
> -      plane[4].eo = 1 << 8;
> -
> -      plane[5].dcdx = 1 << 8;
> -      plane[5].dcdy = 0;
> -      plane[5].c = (scissor->x1+1) << 8;
> -      plane[5].eo = 0;
> -
> -      plane[6].dcdx = 0;
> -      plane[6].dcdy = 1 << 8;
> -      plane[6].c = (1-scissor->y0) << 8;
> -      plane[6].eo = 1 << 8;
> -
> -      plane[7].dcdx = 0;
> -      plane[7].dcdy = -1 << 8;
> -      plane[7].c = (scissor->y1+1) << 8;
> -      plane[7].eo = 0;
> +   if (nr_planes > 4) {
> +      /* why not just use draw_regions */
> +      struct u_rect *scissor = &setup->scissors[viewport_index];
> +      unsigned scis_index = 4;
> +
> +      if (bbox.x0 < scissor->x0) {
> +         plane[scis_index].dcdx = -1 << 8;
> +         plane[scis_index].dcdy = 0;
> +         plane[scis_index].c = (1-scissor->x0) << 8;
> +         plane[scis_index].eo = 1 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.x1 > scissor->x1) {
> +         plane[scis_index].dcdx = 1 << 8;
> +         plane[scis_index].dcdy = 0;
> +         plane[scis_index].c = (scissor->x1+1) << 8;
> +         plane[scis_index].eo = 0 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.y0 < scissor->y0) {
> +         plane[scis_index].dcdx = 0;
> +         plane[scis_index].dcdy = 1 << 8;
> +         plane[scis_index].c = (1-scissor->y0) << 8;
> +         plane[scis_index].eo = 1 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.y1 > scissor->y1) {
> +         plane[scis_index].dcdx = 0;
> +         plane[scis_index].dcdy = -1 << 8;
> +         plane[scis_index].c = (scissor->y1+1) << 8;
> +         plane[scis_index].eo = 0;
> +         scis_index++;
> +      }
> +      assert(scis_index == nr_planes);
>     }
>  
>     return lp_setup_bin_triangle(setup, line, &bbox, nr_planes, viewport_index);
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> index 1e3a750..4e9acce 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> @@ -302,13 +302,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
>        layer = MIN2(layer, scene->fb_max_layer);
>     }
>  
> -   if (setup->scissor_test) {
> -      nr_planes = 7;
> -   }
> -   else {
> -      nr_planes = 3;
> -   }
> -
>     /* Bounding rectangle (in pixels) */
>     {
>        /* Yes this is necessary to accurately calculate bounding boxes
> @@ -347,6 +340,20 @@ do_triangle_ccw(struct lp_setup_context *setup,
>     bbox.x0 = MAX2(bbox.x0, 0);
>     bbox.y0 = MAX2(bbox.y0, 0);
>  
> +   nr_planes = 3;
> +   /*
> +    * Determine how many scissor planes we need, that is drop scissor
> +    * edges if the bounding box of the tri is fully inside that edge.
> +    */
> +   if (setup->scissor_test) {
> +      /* why not just use draw_regions */
> +      struct u_rect *scissor = &setup->scissors[viewport_index];
> +      if (bbox.x0 < scissor->x0) nr_planes++;
> +      if (bbox.x1 > scissor->x1) nr_planes++;
> +      if (bbox.y0 < scissor->y0) nr_planes++;
> +      if (bbox.y1 > scissor->y1) nr_planes++;
> +   }
> +
>     tri = lp_setup_alloc_triangle(scene,
>                                   key->num_inputs,
>                                   nr_planes,
> @@ -367,13 +374,11 @@ do_triangle_ccw(struct lp_setup_context *setup,
>  
>     /* Setup parameter interpolants:
>      */
> -   setup->setup.variant->jit_function( v0,
> -				       v1,
> -				       v2,
> -				       frontfacing,
> -				       GET_A0(&tri->inputs),
> -				       GET_DADX(&tri->inputs),
> -				       GET_DADY(&tri->inputs) );
> +   setup->setup.variant->jit_function(v0, v1, v2,
> +                                      frontfacing,
> +                                      GET_A0(&tri->inputs),
> +                                      GET_DADX(&tri->inputs),
> +                                      GET_DADY(&tri->inputs));
>  
>     tri->inputs.frontfacing = frontfacing;
>     tri->inputs.disable = FALSE;
> @@ -383,9 +388,9 @@ do_triangle_ccw(struct lp_setup_context *setup,
>  
>     if (0)
>        lp_dump_setup_coef(&setup->setup.variant->key,
> -			 (const float (*)[4])GET_A0(&tri->inputs),
> -			 (const float (*)[4])GET_DADX(&tri->inputs),
> -			 (const float (*)[4])GET_DADY(&tri->inputs));
> +                         (const float (*)[4])GET_A0(&tri->inputs),
> +                         (const float (*)[4])GET_DADX(&tri->inputs),
> +                         (const float (*)[4])GET_DADY(&tri->inputs));
>  
>     plane = GET_PLANES(tri);
>  
> @@ -672,29 +677,44 @@ do_triangle_ccw(struct lp_setup_context *setup,
>      * Note that otherwise, the scissor planes only vary in 'C' value,
>      * and even then only on state-changes.  Could alternatively store
>      * these planes elsewhere.
> +    * (Or only store the c value together with a bit indicating which
> +    * scissor edge this is, so rasterization would treat them differently
> +    * (easier to evaluate) to ordinary planes.)
>      */
> -   if (nr_planes == 7) {
> -      const struct u_rect *scissor = &setup->scissors[viewport_index];
> -
> -      plane[3].dcdx = -1 << 8;
> -      plane[3].dcdy = 0;
> -      plane[3].c = (1-scissor->x0) << 8;
> -      plane[3].eo = 1 << 8;
> -
> -      plane[4].dcdx = 1 << 8;
> -      plane[4].dcdy = 0;
> -      plane[4].c = (scissor->x1+1) << 8;
> -      plane[4].eo = 0;
> -
> -      plane[5].dcdx = 0;
> -      plane[5].dcdy = 1 << 8;
> -      plane[5].c = (1-scissor->y0) << 8;
> -      plane[5].eo = 1 << 8;
> -
> -      plane[6].dcdx = 0;
> -      plane[6].dcdy = -1 << 8;
> -      plane[6].c = (scissor->y1+1) << 8;
> -      plane[6].eo = 0;
> +   if (nr_planes > 3) {
> +      /* why not just use draw_regions */
> +      struct u_rect *scissor = &setup->scissors[viewport_index];
> +      unsigned scis_index = 3;
> +
> +      if (bbox.x0 < scissor->x0) {
> +         plane[scis_index].dcdx = -1 << 8;
> +         plane[scis_index].dcdy = 0;
> +         plane[scis_index].c = (1-scissor->x0) << 8;
> +         plane[scis_index].eo = 1 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.x1 > scissor->x1) {
> +         plane[scis_index].dcdx = 1 << 8;
> +         plane[scis_index].dcdy = 0;
> +         plane[scis_index].c = (scissor->x1+1) << 8;
> +         plane[scis_index].eo = 0 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.y0 < scissor->y0) {
> +         plane[scis_index].dcdx = 0;
> +         plane[scis_index].dcdy = 1 << 8;
> +         plane[scis_index].c = (1-scissor->y0) << 8;
> +         plane[scis_index].eo = 1 << 8;
> +         scis_index++;
> +      }
> +      if (bbox.y1 > scissor->y1) {
> +         plane[scis_index].dcdx = 0;
> +         plane[scis_index].dcdy = -1 << 8;
> +         plane[scis_index].c = (scissor->y1+1) << 8;
> +         plane[scis_index].eo = 0;
> +         scis_index++;
> +      }
> +      assert(scis_index == nr_planes);
>     }
>  
>     return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, viewport_index);
>