[Mesa-dev] [PATCH] swrast: initial multi-threaded span rendering

Andreas Fänger a.faenger at e-sign.com
Wed Aug 10 02:49:02 PDT 2011


Hi Keith,

You are right. The main purpose of this patch is to speed up OSMesa rendering, as there is no llvmpipe target for it at the moment. Also, llvmpipe is currently missing some important features such as AA/FSAA and anisotropic filtering, which are available in swrast now.
So I need to stick with the old rasterizer at the moment, with some improvements.

Andreas

-----Ursprüngliche Nachricht-----
Von: Keith Whitwell [mailto:keithw at vmware.com] 
Gesendet: Mittwoch, 10. August 2011 11:17
An: Andreas Fänger
Cc: mesa-dev at lists.freedesktop.org
Betreff: Re: [Mesa-dev] [PATCH] swrast: initial multi-threaded span rendering

I'm not sure it makes a lot of sense to be optimizing swrast at this
stage.  Take a look at llvmpipe and perhaps consider improving the
multithreading already in place in that rasterizer, which is far better
optimized than swrast already.

Keith

On Wed, 2011-08-10 at 08:07 +0000, Andreas Fänger wrote:
> Optional parallel rendering of spans using OpenMP.
> Initial implementation for aa triangles. A new option for scons is
> also provided to activate the openmp support (off by default).
> ---
>  common.py                      |    1 +
>  scons/gallium.py               |   12 +++++++
>  src/mesa/swrast/s_aatritemp.h  |   68 ++++++++++++++++++++++-----------------
>  src/mesa/swrast/s_context.c    |   26 ++++++++++++---
>  src/mesa/swrast/s_texcombine.c |    4 ++
>  src/mesa/tnl/t_pipeline.c      |   12 +++++++
>  6 files changed, 87 insertions(+), 36 deletions(-)
> 
> diff --git a/common.py b/common.py
> index 8657030..cfee1b5 100644
> --- a/common.py
> +++ b/common.py
> @@ -88,6 +88,7 @@ def AddOptions(opts):
>  	opts.Add('toolchain', 'compiler toolchain', default_toolchain)
>  	opts.Add(BoolOption('gles', 'EXPERIMENTAL: enable OpenGL ES support', 'no'))
>  	opts.Add(BoolOption('llvm', 'use LLVM', default_llvm))
> +	opts.Add(BoolOption('openmp', 'EXPERIMENTAL: compile with openmp (swrast)', 'no'))
>  	opts.Add(BoolOption('debug', 'DEPRECATED: debug build', 'yes'))
>  	opts.Add(BoolOption('profile', 'DEPRECATED: profile build', 'no'))
>  	opts.Add(BoolOption('quiet', 'DEPRECATED: profile build', 'yes'))
> diff --git a/scons/gallium.py b/scons/gallium.py
> index 8cd3bc7..7135251 100755
> --- a/scons/gallium.py
> +++ b/scons/gallium.py
> @@ -596,6 +596,18 @@ def generate(env):
>          libs += ['m', 'pthread', 'dl']
>      env.Append(LIBS = libs)
>  
> +    # OpenMP
> +    if env['openmp']:
> +        if env['msvc']:
> +            env.Append(CCFLAGS = ['/openmp'])
> +            # When building openmp release VS2008 link.exe crashes with LNK1103 error.
> +            # Workaround: overwrite PDB flags with empty value as it isn't required anyways
> +            if env['build'] == 'release':
> +                env['PDB'] = ''
> +        if env['gcc']:
> +            env.Append(CCFLAGS = ['-fopenmp'])
> +            env.Append(LIBS = ['gomp'])
> +
>      # Load tools
>      env.Tool('lex')
>      env.Tool('yacc')
> diff --git a/src/mesa/swrast/s_aatritemp.h b/src/mesa/swrast/s_aatritemp.h
> index 91d4f7a..005d12c 100644
> --- a/src/mesa/swrast/s_aatritemp.h
> +++ b/src/mesa/swrast/s_aatritemp.h
> @@ -181,13 +181,18 @@
>        const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
>        const GLfloat dxdy = majDx / majDy;
>        const GLfloat xAdj = dxdy < 0.0F ? -dxdy : 0.0F;
> -      GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
>        GLint iy;
> -      for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
> +      #pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
> +      for (iy = iyMin; iy < iyMax; iy++) {
> +         GLfloat x = pMin[0] - (yMin - iy) * dxdy;
>           GLint ix, startX = (GLint) (x - xAdj);
>           GLuint count;
>           GLfloat coverage = 0.0F;
>  
> +#ifdef _OPENMP
> +         /* each thread needs to use a different (global) SpanArrays variable */
> +         span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
> +#endif
>           /* skip over fragments with zero coverage */
>           while (startX < MAX_WIDTH) {
>              coverage = compute_coveragef(pMin, pMid, pMax, startX, iy);
> @@ -228,13 +233,12 @@
>              coverage = compute_coveragef(pMin, pMid, pMax, ix, iy);
>           }
>           
> -         if (ix <= startX)
> -            continue;
> -         
> -         span.x = startX;
> -         span.y = iy;
> -         span.end = (GLuint) ix - (GLuint) startX;
> -         _swrast_write_rgba_span(ctx, &span);
> +         if (ix > startX) {
> +            span.x = startX;
> +            span.y = iy;
> +            span.end = (GLuint) ix - (GLuint) startX;
> +            _swrast_write_rgba_span(ctx, &span);
> +         }
>        }
>     }
>     else {
> @@ -244,13 +248,18 @@
>        const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
>        const GLfloat dxdy = majDx / majDy;
>        const GLfloat xAdj = dxdy > 0 ? dxdy : 0.0F;
> -      GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
>        GLint iy;
> -      for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
> +      #pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
> +      for (iy = iyMin; iy < iyMax; iy++) {
> +         GLfloat x = pMin[0] - (yMin - iy) * dxdy;
>           GLint ix, left, startX = (GLint) (x + xAdj);
>           GLuint count, n;
>           GLfloat coverage = 0.0F;
>           
> +#ifdef _OPENMP
> +         /* each thread needs to use a different (global) SpanArrays variable */
> +         span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
> +#endif
>           /* make sure we're not past the window edge */
>           if (startX >= ctx->DrawBuffer->_Xmax) {
>              startX = ctx->DrawBuffer->_Xmax - 1;
> @@ -296,31 +305,30 @@
>           ATTRIB_LOOP_END
>  #endif
>  
> -         if (startX <= ix)
> -            continue;
> +         if (startX > ix) {
> +            n = (GLuint) startX - (GLuint) ix;
>  
> -         n = (GLuint) startX - (GLuint) ix;
> +            left = ix + 1;
>  
> -         left = ix + 1;
> -
> -         /* shift all values to the left */
> -         /* XXX this is temporary */
> -         {
> -            SWspanarrays *array = span.array;
> -            GLint j;
> -            for (j = 0; j < (GLint) n; j++) {
> -               array->coverage[j] = array->coverage[j + left];
> -               COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
> +            /* shift all values to the left */
> +            /* XXX this is temporary */
> +            {
> +               SWspanarrays *array = span.array;
> +               GLint j;
> +               for (j = 0; j < (GLint) n; j++) {
> +                  array->coverage[j] = array->coverage[j + left];
> +                  COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
>  #ifdef DO_Z
> -               array->z[j] = array->z[j + left];
> +                  array->z[j] = array->z[j + left];
>  #endif
> +               }
>              }
> -         }
>  
> -         span.x = left;
> -         span.y = iy;
> -         span.end = n;
> -         _swrast_write_rgba_span(ctx, &span);
> +            span.x = left;
> +            span.y = iy;
> +            span.end = n;
> +            _swrast_write_rgba_span(ctx, &span);
> +         }
>        }
>     }
>  }
> diff --git a/src/mesa/swrast/s_context.c b/src/mesa/swrast/s_context.c
> index def1531..4434f11 100644
> --- a/src/mesa/swrast/s_context.c
> +++ b/src/mesa/swrast/s_context.c
> @@ -772,6 +772,11 @@ _swrast_CreateContext( struct gl_context *ctx )
>  {
>     GLuint i;
>     SWcontext *swrast = (SWcontext *)CALLOC(sizeof(SWcontext));
> +#ifdef _OPENMP
> +   const GLint maxThreads = omp_get_max_threads();
> +#else
> +   const GLint maxThreads = 1;
> +#endif
>  
>     if (SWRAST_DEBUG) {
>        _mesa_debug(ctx, "_swrast_CreateContext\n");
> @@ -806,19 +811,25 @@ _swrast_CreateContext( struct gl_context *ctx )
>     for (i = 0; i < MAX_TEXTURE_IMAGE_UNITS; i++)
>        swrast->TextureSample[i] = NULL;
>  
> -   swrast->SpanArrays = MALLOC_STRUCT(sw_span_arrays);
> +   /* SpanArrays is global and shared by all SWspan instances. However, when
> +    * using multiple threads, it is necessary to have one SpanArrays instance
> +    * per thread.
> +    */
> +   swrast->SpanArrays = (SWspanarrays *) MALLOC(maxThreads * sizeof(SWspanarrays));
>     if (!swrast->SpanArrays) {
>        FREE(swrast);
>        return GL_FALSE;
>     }
> -   swrast->SpanArrays->ChanType = CHAN_TYPE;
> +   for(i = 0; i < maxThreads; i++) {
> +      swrast->SpanArrays[i].ChanType = CHAN_TYPE;
>  #if CHAN_TYPE == GL_UNSIGNED_BYTE
> -   swrast->SpanArrays->rgba = swrast->SpanArrays->rgba8;
> +      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba8;
>  #elif CHAN_TYPE == GL_UNSIGNED_SHORT
> -   swrast->SpanArrays->rgba = swrast->SpanArrays->rgba16;
> +      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba16;
>  #else
> -   swrast->SpanArrays->rgba = swrast->SpanArrays->attribs[FRAG_ATTRIB_COL0];
> +      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].attribs[FRAG_ATTRIB_COL0];
>  #endif
> +   }
>  
>     /* init point span buffer */
>     swrast->PointSpan.primitive = GL_POINT;
> @@ -826,7 +837,10 @@ _swrast_CreateContext( struct gl_context *ctx )
>     swrast->PointSpan.facing = 0;
>     swrast->PointSpan.array = swrast->SpanArrays;
>  
> -   swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits *
> +   /* TexelBuffer is also global and normally shared by all SWspan instances;
> +    * when running with multiple threads, create one per thread.
> +    */
> +   swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits * maxThreads *
>                                             MAX_WIDTH * 4 * sizeof(GLfloat));
>     if (!swrast->TexelBuffer) {
>        FREE(swrast->SpanArrays);
> diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c
> index 086ed0b..80b9dff 100644
> --- a/src/mesa/swrast/s_texcombine.c
> +++ b/src/mesa/swrast/s_texcombine.c
> @@ -48,7 +48,11 @@ typedef float (*float4_array)[4];
>  static INLINE float4_array
>  get_texel_array(SWcontext *swrast, GLuint unit)
>  {
> +#ifdef _OPENMP
> +   return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4 * omp_get_num_threads() + (MAX_WIDTH * 4 * omp_get_thread_num()));
> +#else
>     return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4);
> +#endif
>  }
>  
> 
> diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c
> index 18f095f..881d5d5 100644
> --- a/src/mesa/tnl/t_pipeline.c
> +++ b/src/mesa/tnl/t_pipeline.c
> @@ -146,7 +146,17 @@ void _tnl_run_pipeline( struct gl_context *ctx )
>  	 _tnl_notify_pipeline_output_change( ctx );
>     }
>  
> +#ifndef _OPENMP
> +   /* Don't adjust FPU precision mode in case multiple threads are to be used.
> +    * This would require that the additional threads also changed the FPU mode
> +    * which is quite a mess as this had to be done in all parallelized sections;
> +    * otherwise the master thread and all other threads are running in different
> +    * modes, producing inconsistent results.
> +    * Note that no x64 implementation defines/uses START_FAST_MATH, so
> +    * this "hack" is only used in i386 mode
> +    */
>     START_FAST_MATH(__tmp);
> +#endif
>  
>     for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
>        struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
> @@ -154,7 +164,9 @@ void _tnl_run_pipeline( struct gl_context *ctx )
>  	 break;
>     }
>  
> +#ifndef _OPENMP
>     END_FAST_MATH(__tmp);
> +#endif
>  }
>  
> 





More information about the mesa-dev mailing list