[Mesa-dev] [PATCH 3/3] llvmpipe: drop scissor planes early if the tri is fully inside them
Roland Scheidegger
sroland at vmware.com
Sun Jan 31 17:34:56 PST 2016
Am 01.02.2016 um 02:00 schrieb sroland at vmware.com:
> From: Roland Scheidegger <sroland at vmware.com>
>
> If the tri is fully inside a scissor edge (or rather, we just use the
> bounding box of the tri for the comparison), then we can drop these
> additional scissor "planes" early. We do not even need to allocate
> space for them in the tri.
> The math actually appears to be slightly iffy due to bounding boxes
> being rounded, but it doesn't matter in the end.
> Those scissor rects are costly - the 4 planes from the scissor are
> already more expensive to calculate than the 3 planes from the tri itself,
> and it also prevents us from using the specialized raster code for small
> tris.
> This helps openarena performance by about 8% or so. Of course, it helps
> there that while openarena often enables scissoring (and even moves the
> scissor rect around) I have not seen a single tri actually hit the
> scissor rect, ever.
>
> v2: drop individual scissor edges, and do it earlier, not even allocating
> space for them.
> ---
> src/gallium/drivers/llvmpipe/lp_rast_tri.c | 16 +++++
> src/gallium/drivers/llvmpipe/lp_setup_line.c | 81 ++++++++++++++---------
> src/gallium/drivers/llvmpipe/lp_setup_tri.c | 98 +++++++++++++++++-----------
> 3 files changed, 126 insertions(+), 69 deletions(-)
>
> diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> index f4a2f02..bf27900 100644
> --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
> @@ -380,11 +380,27 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
> */
> dcdx = _mm_sub_epi32(zero, dcdx);
>
> +#if 0
> + {
> + __m128i lbits, fixup, xy, dcdxdy, dcdx16;
> + lbits = _mm_and_si128(c, _mm_set1_epi32(FIXED_ONE - 1));
> + fixup = _mm_cmpeq_epi32(lbits, _mm_setzero_si128());
> + c = _mm_srai_epi32(c, 8);
> + c = _mm_add_epi32(c, fixup);
> + xy = _mm_set1_epi32(x | y << 16);
> + dcdx = _mm_srai_epi32(dcdx, 8);
> + dcdy = _mm_srai_epi32(dcdy, 8);
> + dcdx16 = _mm_and_si128(_mm_set_epi16(0,-1,0,-1,0,-1,0,-1), dcdx);
> + dcdxdy = _mm_or_si128(dcdx16, _mm_slli_epi32(dcdy, 16));
> + c = _mm_add_epi32(c, _mm_madd_epi16(dcdxdy, xy));
> + }
> +#else
> c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
> c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
>
> /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
> c = _mm_sub_epi32(c, _mm_set1_epi32(1));
> +#endif
Oops disregard this hunk. That's part of some experiment (use the same
down-shifting logic for calculations used in the generic rasterization
functions, plus some different fixup calc (with logic ops rather than
the subs used elsewhere), and just because I can, replace the emulated 2
32bit muls with one single 16bit madd, all in all that's quite a bit
cheaper... could do that for the 32_3_16 func as well - works well,
maybe I'll look at that another time again...)
Roland
>
> dcdx2 = _mm_add_epi32(dcdx, dcdx);
> dcdx3 = _mm_add_epi32(dcdx2, dcdx);
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
> index f425825..3ec9ac4 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
> @@ -336,13 +336,6 @@ try_setup_line( struct lp_setup_context *setup,
> layer = MIN2(layer, scene->fb_max_layer);
> }
>
> - if (setup->scissor_test) {
> - nr_planes = 8;
> - }
> - else {
> - nr_planes = 4;
> - }
> -
> dx = v1[0][0] - v2[0][0];
> dy = v1[0][1] - v2[0][1];
> area = (dx * dx + dy * dy);
> @@ -591,6 +584,20 @@ try_setup_line( struct lp_setup_context *setup,
> bbox.x0 = MAX2(bbox.x0, 0);
> bbox.y0 = MAX2(bbox.y0, 0);
>
> + nr_planes = 4;
> + /*
> + * Determine how many scissor planes we need, that is drop scissor
> + * edges if the bounding box of the tri is fully inside that edge.
> + */
> + if (setup->scissor_test) {
> + /* why not just use draw_regions */
> + struct u_rect *scissor = &setup->scissors[viewport_index];
> + if (bbox.x0 < scissor->x0) nr_planes++;
> + if (bbox.x1 > scissor->x1) nr_planes++;
> + if (bbox.y0 < scissor->y0) nr_planes++;
> + if (bbox.y1 > scissor->y1) nr_planes++;
> + }
> +
> line = lp_setup_alloc_triangle(scene,
> key->num_inputs,
> nr_planes,
> @@ -708,30 +715,44 @@ try_setup_line( struct lp_setup_context *setup,
> * Note that otherwise, the scissor planes only vary in 'C' value,
> * and even then only on state-changes. Could alternatively store
> * these planes elsewhere.
> + * (Or only store the c value together with a bit indicating which
> + * scissor edge this is, so rasterization would treat them differently
> + * (easier to evaluate) to ordinary planes.)
> */
> - if (nr_planes == 8) {
> - const struct u_rect *scissor =
> - &setup->scissors[viewport_index];
> -
> - plane[4].dcdx = -1 << 8;
> - plane[4].dcdy = 0;
> - plane[4].c = (1-scissor->x0) << 8;
> - plane[4].eo = 1 << 8;
> -
> - plane[5].dcdx = 1 << 8;
> - plane[5].dcdy = 0;
> - plane[5].c = (scissor->x1+1) << 8;
> - plane[5].eo = 0;
> -
> - plane[6].dcdx = 0;
> - plane[6].dcdy = 1 << 8;
> - plane[6].c = (1-scissor->y0) << 8;
> - plane[6].eo = 1 << 8;
> -
> - plane[7].dcdx = 0;
> - plane[7].dcdy = -1 << 8;
> - plane[7].c = (scissor->y1+1) << 8;
> - plane[7].eo = 0;
> + if (nr_planes > 4) {
> + /* why not just use draw_regions */
> + struct u_rect *scissor = &setup->scissors[viewport_index];
> + unsigned scis_index = 4;
> +
> + if (bbox.x0 < scissor->x0) {
> + plane[scis_index].dcdx = -1 << 8;
> + plane[scis_index].dcdy = 0;
> + plane[scis_index].c = (1-scissor->x0) << 8;
> + plane[scis_index].eo = 1 << 8;
> + scis_index++;
> + }
> + if (bbox.x1 > scissor->x1) {
> + plane[scis_index].dcdx = 1 << 8;
> + plane[scis_index].dcdy = 0;
> + plane[scis_index].c = (scissor->x1+1) << 8;
> + plane[scis_index].eo = 0 << 8;
> + scis_index++;
> + }
> + if (bbox.y0 < scissor->y0) {
> + plane[scis_index].dcdx = 0;
> + plane[scis_index].dcdy = 1 << 8;
> + plane[scis_index].c = (1-scissor->y0) << 8;
> + plane[scis_index].eo = 1 << 8;
> + scis_index++;
> + }
> + if (bbox.y1 > scissor->y1) {
> + plane[scis_index].dcdx = 0;
> + plane[scis_index].dcdy = -1 << 8;
> + plane[scis_index].c = (scissor->y1+1) << 8;
> + plane[scis_index].eo = 0;
> + scis_index++;
> + }
> + assert(scis_index == nr_planes);
> }
>
> return lp_setup_bin_triangle(setup, line, &bbox, nr_planes, viewport_index);
> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> index 1e3a750..4e9acce 100644
> --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
> @@ -302,13 +302,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
> layer = MIN2(layer, scene->fb_max_layer);
> }
>
> - if (setup->scissor_test) {
> - nr_planes = 7;
> - }
> - else {
> - nr_planes = 3;
> - }
> -
> /* Bounding rectangle (in pixels) */
> {
> /* Yes this is necessary to accurately calculate bounding boxes
> @@ -347,6 +340,20 @@ do_triangle_ccw(struct lp_setup_context *setup,
> bbox.x0 = MAX2(bbox.x0, 0);
> bbox.y0 = MAX2(bbox.y0, 0);
>
> + nr_planes = 3;
> + /*
> + * Determine how many scissor planes we need, that is drop scissor
> + * edges if the bounding box of the tri is fully inside that edge.
> + */
> + if (setup->scissor_test) {
> + /* why not just use draw_regions */
> + struct u_rect *scissor = &setup->scissors[viewport_index];
> + if (bbox.x0 < scissor->x0) nr_planes++;
> + if (bbox.x1 > scissor->x1) nr_planes++;
> + if (bbox.y0 < scissor->y0) nr_planes++;
> + if (bbox.y1 > scissor->y1) nr_planes++;
> + }
> +
> tri = lp_setup_alloc_triangle(scene,
> key->num_inputs,
> nr_planes,
> @@ -367,13 +374,11 @@ do_triangle_ccw(struct lp_setup_context *setup,
>
> /* Setup parameter interpolants:
> */
> - setup->setup.variant->jit_function( v0,
> - v1,
> - v2,
> - frontfacing,
> - GET_A0(&tri->inputs),
> - GET_DADX(&tri->inputs),
> - GET_DADY(&tri->inputs) );
> + setup->setup.variant->jit_function(v0, v1, v2,
> + frontfacing,
> + GET_A0(&tri->inputs),
> + GET_DADX(&tri->inputs),
> + GET_DADY(&tri->inputs));
>
> tri->inputs.frontfacing = frontfacing;
> tri->inputs.disable = FALSE;
> @@ -383,9 +388,9 @@ do_triangle_ccw(struct lp_setup_context *setup,
>
> if (0)
> lp_dump_setup_coef(&setup->setup.variant->key,
> - (const float (*)[4])GET_A0(&tri->inputs),
> - (const float (*)[4])GET_DADX(&tri->inputs),
> - (const float (*)[4])GET_DADY(&tri->inputs));
> + (const float (*)[4])GET_A0(&tri->inputs),
> + (const float (*)[4])GET_DADX(&tri->inputs),
> + (const float (*)[4])GET_DADY(&tri->inputs));
>
> plane = GET_PLANES(tri);
>
> @@ -672,29 +677,44 @@ do_triangle_ccw(struct lp_setup_context *setup,
> * Note that otherwise, the scissor planes only vary in 'C' value,
> * and even then only on state-changes. Could alternatively store
> * these planes elsewhere.
> + * (Or only store the c value together with a bit indicating which
> + * scissor edge this is, so rasterization would treat them differently
> + * (easier to evaluate) to ordinary planes.)
> */
> - if (nr_planes == 7) {
> - const struct u_rect *scissor = &setup->scissors[viewport_index];
> -
> - plane[3].dcdx = -1 << 8;
> - plane[3].dcdy = 0;
> - plane[3].c = (1-scissor->x0) << 8;
> - plane[3].eo = 1 << 8;
> -
> - plane[4].dcdx = 1 << 8;
> - plane[4].dcdy = 0;
> - plane[4].c = (scissor->x1+1) << 8;
> - plane[4].eo = 0;
> -
> - plane[5].dcdx = 0;
> - plane[5].dcdy = 1 << 8;
> - plane[5].c = (1-scissor->y0) << 8;
> - plane[5].eo = 1 << 8;
> -
> - plane[6].dcdx = 0;
> - plane[6].dcdy = -1 << 8;
> - plane[6].c = (scissor->y1+1) << 8;
> - plane[6].eo = 0;
> + if (nr_planes > 3) {
> + /* why not just use draw_regions */
> + struct u_rect *scissor = &setup->scissors[viewport_index];
> + unsigned scis_index = 3;
> +
> + if (bbox.x0 < scissor->x0) {
> + plane[scis_index].dcdx = -1 << 8;
> + plane[scis_index].dcdy = 0;
> + plane[scis_index].c = (1-scissor->x0) << 8;
> + plane[scis_index].eo = 1 << 8;
> + scis_index++;
> + }
> + if (bbox.x1 > scissor->x1) {
> + plane[scis_index].dcdx = 1 << 8;
> + plane[scis_index].dcdy = 0;
> + plane[scis_index].c = (scissor->x1+1) << 8;
> + plane[scis_index].eo = 0 << 8;
> + scis_index++;
> + }
> + if (bbox.y0 < scissor->y0) {
> + plane[scis_index].dcdx = 0;
> + plane[scis_index].dcdy = 1 << 8;
> + plane[scis_index].c = (1-scissor->y0) << 8;
> + plane[scis_index].eo = 1 << 8;
> + scis_index++;
> + }
> + if (bbox.y1 > scissor->y1) {
> + plane[scis_index].dcdx = 0;
> + plane[scis_index].dcdy = -1 << 8;
> + plane[scis_index].c = (scissor->y1+1) << 8;
> + plane[scis_index].eo = 0;
> + scis_index++;
> + }
> + assert(scis_index == nr_planes);
> }
>
> return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, viewport_index);
>
More information about the mesa-dev
mailing list