[Mesa-dev] [PATCH] swrast: initial multi-threaded span rendering

Wed Aug 10 02:16:30 PDT 2011

I'm not sure it makes a lot of sense to be optimizing swrast at this
stage.  Take a look at llvmpipe and perhaps consider improving the
multithreading already in place in that rasterizer, which is far better
optimized than swrast already.

Keith

On Wed, 2011-08-10 at 08:07 +0000, Andreas Fänger wrote:
> Optional parallel rendering of spans using OpenMP.
> Initial implementation for aa triangles. A new option for scons is
> also provided to activate the openmp support (off by default).
> ---
>  common.py                      |    1 +
>  scons/gallium.py               |   12 +++++++
>  src/mesa/swrast/s_aatritemp.h  |   68 ++++++++++++++++++++++-----------------
>  src/mesa/swrast/s_context.c    |   26 ++++++++++++---
>  src/mesa/swrast/s_texcombine.c |    4 ++
>  src/mesa/tnl/t_pipeline.c      |   12 +++++++
>  6 files changed, 87 insertions(+), 36 deletions(-)
> 
> diff --git a/common.py b/common.py
> index 8657030..cfee1b5 100644
> --- a/common.py
> +++ b/common.py
> @@ -88,6 +88,7 @@ def AddOptions(opts):
>  	opts.Add('toolchain', 'compiler toolchain', default_toolchain)
>  	opts.Add(BoolOption('gles', 'EXPERIMENTAL: enable OpenGL ES support', 'no'))
>  	opts.Add(BoolOption('llvm', 'use LLVM', default_llvm))
> +	opts.Add(BoolOption('openmp', 'EXPERIMENTAL: compile with openmp (swrast)', 'no'))
>  	opts.Add(BoolOption('debug', 'DEPRECATED: debug build', 'yes'))
>  	opts.Add(BoolOption('profile', 'DEPRECATED: profile build', 'no'))
>  	opts.Add(BoolOption('quiet', 'DEPRECATED: profile build', 'yes'))
> diff --git a/scons/gallium.py b/scons/gallium.py
> index 8cd3bc7..7135251 100755
> --- a/scons/gallium.py
> +++ b/scons/gallium.py
> @@ -596,6 +596,18 @@ def generate(env):
>          libs += ['m', 'pthread', 'dl']
>      env.Append(LIBS = libs)
>  
> +    # OpenMP
> +    if env['openmp']:
> +        if env['msvc']:
> +            env.Append(CCFLAGS = ['/openmp'])
> +            # When building an OpenMP release build, VS2008 link.exe crashes with error LNK1103.
> +            # Workaround: overwrite the PDB flag with an empty value, as it isn't required anyway.
> +            if env['build'] == 'release':
> +                env['PDB'] = ''
> +        if env['gcc']:
> +            env.Append(CCFLAGS = ['-fopenmp'])
> +            env.Append(LIBS = ['gomp'])
> +
>      # Load tools
>      env.Tool('lex')
>      env.Tool('yacc')
> diff --git a/src/mesa/swrast/s_aatritemp.h b/src/mesa/swrast/s_aatritemp.h
> index 91d4f7a..005d12c 100644
> --- a/src/mesa/swrast/s_aatritemp.h
> +++ b/src/mesa/swrast/s_aatritemp.h
> @@ -181,13 +181,18 @@
>        const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
>        const GLfloat dxdy = majDx / majDy;
>        const GLfloat xAdj = dxdy < 0.0F ? -dxdy : 0.0F;
> -      GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
>        GLint iy;
> -      for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
> +      #pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
> +      for (iy = iyMin; iy < iyMax; iy++) {
> +         GLfloat x = pMin[0] - (yMin - iy) * dxdy;
>           GLint ix, startX = (GLint) (x - xAdj);
>           GLuint count;
>           GLfloat coverage = 0.0F;
>  
> +#ifdef _OPENMP
> +         /* each thread needs to use a different (global) SpanArrays variable */
> +         span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
> +#endif
>           /* skip over fragments with zero coverage */
>           while (startX < MAX_WIDTH) {
>              coverage = compute_coveragef(pMin, pMid, pMax, startX, iy);
> @@ -228,13 +233,12 @@
>              coverage = compute_coveragef(pMin, pMid, pMax, ix, iy);
>           }
>           
> -         if (ix <= startX)
> -            continue;
> -         
> -         span.x = startX;
> -         span.y = iy;
> -         span.end = (GLuint) ix - (GLuint) startX;
> -         _swrast_write_rgba_span(ctx, &span);
> +         if (ix > startX) {
> +            span.x = startX;
> +            span.y = iy;
> +            span.end = (GLuint) ix - (GLuint) startX;
> +            _swrast_write_rgba_span(ctx, &span);
> +         }
>        }
>     }
>     else {
> @@ -244,13 +248,18 @@
>        const GLfloat *pMax = vMax->attrib[FRAG_ATTRIB_WPOS];
>        const GLfloat dxdy = majDx / majDy;
>        const GLfloat xAdj = dxdy > 0 ? dxdy : 0.0F;
> -      GLfloat x = pMin[0] - (yMin - iyMin) * dxdy;
>        GLint iy;
> -      for (iy = iyMin; iy < iyMax; iy++, x += dxdy) {
> +      #pragma omp parallel for schedule(dynamic) private(iy) firstprivate(span)
> +      for (iy = iyMin; iy < iyMax; iy++) {
> +         GLfloat x = pMin[0] - (yMin - iy) * dxdy;
>           GLint ix, left, startX = (GLint) (x + xAdj);
>           GLuint count, n;
>           GLfloat coverage = 0.0F;
>           
> +#ifdef _OPENMP
> +         /* each thread needs to use a different (global) SpanArrays variable */
> +         span.array = SWRAST_CONTEXT(ctx)->SpanArrays + omp_get_thread_num();
> +#endif
>           /* make sure we're not past the window edge */
>           if (startX >= ctx->DrawBuffer->_Xmax) {
>              startX = ctx->DrawBuffer->_Xmax - 1;
> @@ -296,31 +305,30 @@
>           ATTRIB_LOOP_END
>  #endif
>  
> -         if (startX <= ix)
> -            continue;
> +         if (startX > ix) {
> +            n = (GLuint) startX - (GLuint) ix;
>  
> -         n = (GLuint) startX - (GLuint) ix;
> +            left = ix + 1;
>  
> -         left = ix + 1;
> -
> -         /* shift all values to the left */
> -         /* XXX this is temporary */
> -         {
> -            SWspanarrays *array = span.array;
> -            GLint j;
> -            for (j = 0; j < (GLint) n; j++) {
> -               array->coverage[j] = array->coverage[j + left];
> -               COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
> +            /* shift all values to the left */
> +            /* XXX this is temporary */
> +            {
> +               SWspanarrays *array = span.array;
> +               GLint j;
> +               for (j = 0; j < (GLint) n; j++) {
> +                  array->coverage[j] = array->coverage[j + left];
> +                  COPY_CHAN4(array->rgba[j], array->rgba[j + left]);
>  #ifdef DO_Z
> -               array->z[j] = array->z[j + left];
> +                  array->z[j] = array->z[j + left];
>  #endif
> +               }
>              }
> -         }
>  
> -         span.x = left;
> -         span.y = iy;
> -         span.end = n;
> -         _swrast_write_rgba_span(ctx, &span);
> +            span.x = left;
> +            span.y = iy;
> +            span.end = n;
> +            _swrast_write_rgba_span(ctx, &span);
> +         }
>        }
>     }
>  }
> diff --git a/src/mesa/swrast/s_context.c b/src/mesa/swrast/s_context.c
> index def1531..4434f11 100644
> --- a/src/mesa/swrast/s_context.c
> +++ b/src/mesa/swrast/s_context.c
> @@ -772,6 +772,11 @@ _swrast_CreateContext( struct gl_context *ctx )
>  {
>     GLuint i;
>     SWcontext *swrast = (SWcontext *)CALLOC(sizeof(SWcontext));
> +#ifdef _OPENMP
> +   const GLint maxThreads = omp_get_max_threads();
> +#else
> +   const GLint maxThreads = 1;
> +#endif
>  
>     if (SWRAST_DEBUG) {
>        _mesa_debug(ctx, "_swrast_CreateContext\n");
> @@ -806,19 +811,25 @@ _swrast_CreateContext( struct gl_context *ctx )
>     for (i = 0; i < MAX_TEXTURE_IMAGE_UNITS; i++)
>        swrast->TextureSample[i] = NULL;
>  
> -   swrast->SpanArrays = MALLOC_STRUCT(sw_span_arrays);
> +   /* SpanArrays is global and shared by all SWspan instances. However, when
> +    * using multiple threads, it is necessary to have one SpanArrays instance
> +    * per thread.
> +    */
> +   swrast->SpanArrays = (SWspanarrays *) MALLOC(maxThreads * sizeof(SWspanarrays));
>     if (!swrast->SpanArrays) {
>        FREE(swrast);
>        return GL_FALSE;
>     }
> -   swrast->SpanArrays->ChanType = CHAN_TYPE;
> +   for(i = 0; i < maxThreads; i++) {
> +      swrast->SpanArrays[i].ChanType = CHAN_TYPE;
>  #if CHAN_TYPE == GL_UNSIGNED_BYTE
> -   swrast->SpanArrays->rgba = swrast->SpanArrays->rgba8;
> +      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba8;
>  #elif CHAN_TYPE == GL_UNSIGNED_SHORT
> -   swrast->SpanArrays->rgba = swrast->SpanArrays->rgba16;
> +      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].rgba16;
>  #else
> -   swrast->SpanArrays->rgba = swrast->SpanArrays->attribs[FRAG_ATTRIB_COL0];
> +      swrast->SpanArrays[i].rgba = swrast->SpanArrays[i].attribs[FRAG_ATTRIB_COL0];
>  #endif
> +   }
>  
>     /* init point span buffer */
>     swrast->PointSpan.primitive = GL_POINT;
> @@ -826,7 +837,10 @@ _swrast_CreateContext( struct gl_context *ctx )
>     swrast->PointSpan.facing = 0;
>     swrast->PointSpan.array = swrast->SpanArrays;
>  
> -   swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits *
> +   /* TexelBuffer is also global and normally shared by all SWspan instances;
> +    * when running with multiple threads, create one per thread.
> +    */
> +   swrast->TexelBuffer = (GLfloat *) MALLOC(ctx->Const.MaxTextureImageUnits * maxThreads *
>                                             MAX_WIDTH * 4 * sizeof(GLfloat));
>     if (!swrast->TexelBuffer) {
>        FREE(swrast->SpanArrays);
> diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c
> index 086ed0b..80b9dff 100644
> --- a/src/mesa/swrast/s_texcombine.c
> +++ b/src/mesa/swrast/s_texcombine.c
> @@ -48,7 +48,11 @@ typedef float (*float4_array)[4];
>  static INLINE float4_array
>  get_texel_array(SWcontext *swrast, GLuint unit)
>  {
> +#ifdef _OPENMP
> +   return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4 * omp_get_num_threads() + (MAX_WIDTH * 4 * omp_get_thread_num()));
> +#else
>     return (float4_array) (swrast->TexelBuffer + unit * MAX_WIDTH * 4);
> +#endif
>  }
>  
> 
> diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c
> index 18f095f..881d5d5 100644
> --- a/src/mesa/tnl/t_pipeline.c
> +++ b/src/mesa/tnl/t_pipeline.c
> @@ -146,7 +146,17 @@ void _tnl_run_pipeline( struct gl_context *ctx )
>  	 _tnl_notify_pipeline_output_change( ctx );
>     }
>  
> +#ifndef _OPENMP
> +   /* Don't adjust the FPU precision mode when multiple threads may be used.
> +    * Doing so would require the additional threads to change their FPU mode
> +    * as well, which is messy because it would have to be done in every
> +    * parallelized section; otherwise the master thread and the other threads
> +    * would run in different modes, producing inconsistent results.
> +    * Note that no x64 implementation defines/uses START_FAST_MATH, so
> +    * this "hack" only affects i386 builds.
> +    */
>     START_FAST_MATH(__tmp);
> +#endif
>  
>     for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
>        struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
> @@ -154,7 +164,9 @@ void _tnl_run_pipeline( struct gl_context *ctx )
>  	 break;
>     }
>  
> +#ifndef _OPENMP
>     END_FAST_MATH(__tmp);
> +#endif
>  }
>  
>