[Mesa-dev] [PATCH 18/20] mesa: Rewrite the way uniforms are tracked and handled

Wed Jan 4 17:34:48 PST 2012

On 01/03/2012 09:43 PM, Marek Olšák wrote:
> On Fri, Oct 28, 2011 at 7:42 PM, Ian Romanick<idr at freedesktop.org>  wrote:
>> diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
>> index db2f200..50a724b 100644
>> --- a/src/mesa/main/uniform_query.cpp
>> +++ b/src/mesa/main/uniform_query.cpp
>> @@ -22,15 +22,16 @@
>>   * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
>>   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
>>   */
>> +#include<stdlib.h>
>>   #include "main/core.h"
>>   #include "main/context.h"
>>   #include "ir.h"
>>   #include "ir_uniform.h"
>> +#include "program/hash_table.h"
>>   #include "../glsl/program.h"
>>   #include "../glsl/ir_uniform.h"
>>
>>   extern "C" {
>> -#include "main/image.h"
>>   #include "main/shaderapi.h"
>>   #include "main/shaderobj.h"
>>   #include "uniforms.h"
>> @@ -44,42 +45,30 @@ _mesa_GetActiveUniformARB(GLhandleARB program, GLuint index,
>>     GET_CURRENT_CONTEXT(ctx);
>>     struct gl_shader_program *shProg =
>>        _mesa_lookup_shader_program_err(ctx, program, "glGetActiveUniform");
>> -   const struct gl_program_parameter *param;
>>
>>     if (!shProg)
>>        return;
>>
>> -   if (!shProg->Uniforms || index>= shProg->Uniforms->NumUniforms) {
>> +   if (index>= shProg->NumUserUniformStorage) {
>>        _mesa_error(ctx, GL_INVALID_VALUE, "glGetActiveUniform(index)");
>>        return;
>>     }
>>
>> -   param = get_uniform_parameter(shProg, index);
>> -   if (!param)
>> -      return;
>> -
>> -   const struct gl_uniform *const uni =&shProg->Uniforms->Uniforms[index];
>> +   const struct gl_uniform_storage *const uni =&shProg->UniformStorage[index];
>>
>>     if (nameOut) {
>> -      _mesa_copy_string(nameOut, maxLength, length, param->Name);
>> +      _mesa_copy_string(nameOut, maxLength, length, uni->name);
>>     }
>>
>>     if (size) {
>> -      GLint typeSize = _mesa_sizeof_glsl_type(uni->Type->gl_type);
>> -      if ((GLint) param->Size>  typeSize) {
>> -         /* This is an array.
>> -          * Array elements are placed on vector[4] boundaries so they're
>> -          * a multiple of four floats.  We round typeSize up to next multiple
>> -          * of four to get the right size below.
>> -          */
>> -         typeSize = (typeSize + 3)&  ~3;
>> -      }
>> -      /* Note that the returned size is in units of the<type>, not bytes */
>> -      *size = param->Size / typeSize;
>> +      /* array_elements is zero for non-arrays, but the API requires that 1 be
>> +       * returned.
>> +       */
>> +      *size = MAX2(1, uni->array_elements);
>>     }
>>
>>     if (type) {
>> -      *type = uni->Type->gl_type;
>> +      *type = uni->type->gl_type;
>>     }
>>   }
>>
>> @@ -409,12 +398,21 @@ validate_uniform_parameters(struct gl_context *ctx,
>>
>>     _mesa_uniform_split_location_offset(location, loc, array_index);
>>
>> -   if (*loc>= shProg->Uniforms->NumUniforms) {
>> +   if (*loc>= shProg->NumUserUniformStorage) {
>>        _mesa_error(ctx, GL_INVALID_OPERATION, "%s(location=%d)",
>>                   caller, location);
>>        return false;
>>     }
>>
>> +   /* This case should be impossible.  The implication is that a call like
>> +    * glGetUniformLocation(prog, "foo[8]") was successful but "foo" is not an
>> +    * array.
>> +    */
>> +   if (*array_index != 0&&  shProg->UniformStorage[*loc].array_elements == 0) {
>> +      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(location=%d)",
>> +                 caller, location);
>> +      return false;
>> +   }
>>     return true;
>>   }
>>
>> @@ -423,72 +421,81 @@ validate_uniform_parameters(struct gl_context *ctx,
>>   */
>>   extern "C" void
>>   _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location,
>> -                 GLsizei bufSize, GLenum returnType, GLvoid *paramsOut)
>> +                 GLsizei bufSize, enum glsl_base_type returnType,
>> +                 GLvoid *paramsOut)
>>   {
>>     struct gl_shader_program *shProg =
>>        _mesa_lookup_shader_program_err(ctx, program, "glGetUniformfv");
>> -   struct gl_program *prog;
>> -   GLint paramPos;
>> +   struct gl_uniform_storage *uni;
>>     unsigned loc, offset;
>>
>>     if (!validate_uniform_parameters(ctx, shProg, location, 1,
>>                                     &loc,&offset, "glGetUniform", true))
>>        return;
>>
>> -   if (!find_uniform_parameter_pos(shProg, loc,&prog,&paramPos)) {
>> -      _mesa_error(ctx, GL_INVALID_OPERATION,  "glGetUniformfv(location)");
>> -   }
>> -   else {
>> -      const struct gl_program_parameter *p =
>> -&prog->Parameters->Parameters[paramPos];
>> -      gl_constant_value (*values)[4];
>> -      GLint rows, cols, i, j, k;
>> -      GLsizei numBytes;
>> -      GLenum storage_type;
>> -
>> -      values = prog->Parameters->ParameterValues + paramPos + offset;
>> -
>> -      get_uniform_rows_cols(p,&rows,&cols);
>> -
>> -      numBytes = rows * cols * _mesa_sizeof_type(returnType);
>> -      if (bufSize<  numBytes) {
>> -         _mesa_error( ctx, GL_INVALID_OPERATION,
>> -                     "glGetnUniformfvARB(out of bounds: bufSize is %d,"
>> -                     " but %d bytes are required)", bufSize, numBytes );
>> -         return;
>> -      }
>> +   uni =&shProg->UniformStorage[loc];
>>
>> -      if (ctx->Const.NativeIntegers) {
>> -        storage_type = base_uniform_type(p->DataType);
>> -      } else {
>> -        storage_type = GL_FLOAT;
>> -      }
>> +   {
>> +      unsigned elements = (uni->type->is_sampler())
>> +        ? 1 : uni->type->components();
>>
>> -      k = 0;
>> -      for (i = 0; i<  rows; i++) {
>> -        for (j = 0; j<  cols; j++ ) {
>> -           void *out = (char *)paramsOut + 4 * k;
>> +      /* Calculate the source base address *BEFORE* modifying elements to
>> +       * account for the size of the user's buffer.
>> +       */
>> +      const union gl_constant_value *const src =
>> +&uni->storage[offset * elements];
>> +
>> +      unsigned bytes = sizeof(uni->storage[0]) * elements;
>> +      if (bytes>  bufSize) {
>> +        elements = bufSize / sizeof(uni->storage[0]);
>> +        bytes = bufSize;
>> +      }
>>
>> +      /* If the return type and the uniform's native type are "compatible,"
>> +       * just memcpy the data.  If the types are not compatible, perform a
>> +       * slower convert-and-copy process.
>> +       */
>> +      if (returnType == uni->type->base_type
>> +         || ((returnType == GLSL_TYPE_INT
>> +              || returnType == GLSL_TYPE_UINT
>> +              || returnType == GLSL_TYPE_SAMPLER)
>> +&&
>> +             (uni->type->base_type == GLSL_TYPE_INT
>> +              || uni->type->base_type == GLSL_TYPE_UINT
>> +              || uni->type->base_type == GLSL_TYPE_SAMPLER))) {
>> +        memcpy(paramsOut, src, bytes);
>> +      } else {
>> +        union gl_constant_value *const dst =
>> +           (union gl_constant_value *) paramsOut;
>> +
>> +        /* This code could be optimized by putting the loop inside the switch
>> +         * statements.  However, this is not expected to be
>> +         * performance-critical code.
>> +         */
>> +        for (unsigned i = 0; i<  elements; i++) {
>>             switch (returnType) {
>> -           case GL_FLOAT:
>> -              switch (storage_type) {
>> -              case GL_FLOAT:
>> -                 *(float *)out = values[i][j].f;
>> +           case GLSL_TYPE_FLOAT:
>> +              switch (uni->type->base_type) {
>> +              case GLSL_TYPE_UINT:
>> +                 dst[i].f = (float) src[i].u;
>> +                 break;
>> +              case GLSL_TYPE_INT:
>> +              case GLSL_TYPE_SAMPLER:
>> +                 dst[i].f = (float) src[i].i;
>>                   break;
>> -              case GL_INT:
>> -              case GL_BOOL: /* boolean is just an integer 1 or 0. */
>> -                 *(float *)out = values[i][j].i;
>> +              case GLSL_TYPE_BOOL:
>> +                 dst[i].f = src[i].i ? 1.0f : 0.0f;
>>                   break;
>> -              case GL_UNSIGNED_INT:
>> -                 *(float *)out = values[i][j].u;
>> +              default:
>> +                 assert(!"Should not get here.");
>>                   break;
>>                }
>>                break;
>>
>> -           case GL_INT:
>> -           case GL_UNSIGNED_INT:
>> -              switch (storage_type) {
>> -              case GL_FLOAT:
>> +           case GLSL_TYPE_INT:
>> +           case GLSL_TYPE_UINT:
>> +              switch (uni->type->base_type) {
>> +              case GLSL_TYPE_FLOAT:
>>                   /* While the GL 3.2 core spec doesn't explicitly
>>                    * state how conversion of float uniforms to integer
>>                    * values works, in section 6.2 "State Tables" on
>> @@ -506,23 +513,21 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location,
>>                    *      a floating-point value is rounded to the
>>                    *      nearest integer..."
>>                    */
>> -                 *(int *)out = IROUND(values[i][j].f);
>> +                 dst[i].i = IROUND(src[i].f);
>>                   break;
>> -
>> -              case GL_INT:
>> -              case GL_UNSIGNED_INT:
>> -              case GL_BOOL:
>> -                 /* type conversions for these to int/uint are just
>> -                  * copying the data.
>> -                  */
>> -                 *(int *)out = values[i][j].i;
>> +              case GLSL_TYPE_BOOL:
>> +                 dst[i].i = src[i].i ? 1 : 0;
>>                   break;
>> +              default:
>> +                 assert(!"Should not get here.");
>>                   break;
>>                }
>>                break;
>> -           }
>>
>> -           k++;
>> +           default:
>> +              assert(!"Should not get here.");
>> +              break;
>> +           }
>>          }
>>        }
>>     }
>> @@ -925,9 +930,12 @@ _mesa_uniform(struct gl_context *ctx, struct gl_shader_program *shProg,
>>               GLint location, GLsizei count,
>>                const GLvoid *values, GLenum type)
>>   {
>> -   struct gl_uniform *uniform;
>> -   GLint elems;
>>     unsigned loc, offset;
>> +   unsigned components;
>> +   unsigned src_components;
>> +   unsigned i;
>> +   enum glsl_base_type basicType;
>> +   struct gl_uniform_storage *uni;
>>
>>     ASSERT_OUTSIDE_BEGIN_END(ctx);
>>
>> @@ -935,73 +943,207 @@ _mesa_uniform(struct gl_context *ctx, struct gl_shader_program *shProg,
>>                                     &loc,&offset, "glUniform", false))
>>        return;
>>
>> -   elems = _mesa_sizeof_glsl_type(type);
>> +   uni =&shProg->UniformStorage[loc];
>>
>> -   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
>> +   /* Verify that the types are compatible.
>> +    */
>> +   switch (type) {
>> +   case GL_FLOAT:
>> +      basicType = GLSL_TYPE_FLOAT;
>> +      src_components = 1;
>> +      break;
>> +   case GL_FLOAT_VEC2:
>> +      basicType = GLSL_TYPE_FLOAT;
>> +      src_components = 2;
>> +      break;
>> +   case GL_FLOAT_VEC3:
>> +      basicType = GLSL_TYPE_FLOAT;
>> +      src_components = 3;
>> +      break;
>> +   case GL_FLOAT_VEC4:
>> +      basicType = GLSL_TYPE_FLOAT;
>> +      src_components = 4;
>> +      break;
>> +   case GL_UNSIGNED_INT:
>> +      basicType = GLSL_TYPE_UINT;
>> +      src_components = 1;
>> +      break;
>> +   case GL_UNSIGNED_INT_VEC2:
>> +      basicType = GLSL_TYPE_UINT;
>> +      src_components = 2;
>> +      break;
>> +   case GL_UNSIGNED_INT_VEC3:
>> +      basicType = GLSL_TYPE_UINT;
>> +      src_components = 3;
>> +      break;
>> +   case GL_UNSIGNED_INT_VEC4:
>> +      basicType = GLSL_TYPE_UINT;
>> +      src_components = 4;
>> +      break;
>> +   case GL_INT:
>> +      basicType = GLSL_TYPE_INT;
>> +      src_components = 1;
>> +      break;
>> +   case GL_INT_VEC2:
>> +      basicType = GLSL_TYPE_INT;
>> +      src_components = 2;
>> +      break;
>> +   case GL_INT_VEC3:
>> +      basicType = GLSL_TYPE_INT;
>> +      src_components = 3;
>> +      break;
>> +   case GL_INT_VEC4:
>> +      basicType = GLSL_TYPE_INT;
>> +      src_components = 4;
>> +      break;
>> +   case GL_BOOL:
>> +   case GL_BOOL_VEC2:
>> +   case GL_BOOL_VEC3:
>> +   case GL_BOOL_VEC4:
>> +   case GL_FLOAT_MAT2:
>> +   case GL_FLOAT_MAT2x3:
>> +   case GL_FLOAT_MAT2x4:
>> +   case GL_FLOAT_MAT3x2:
>> +   case GL_FLOAT_MAT3:
>> +   case GL_FLOAT_MAT3x4:
>> +   case GL_FLOAT_MAT4x2:
>> +   case GL_FLOAT_MAT4x3:
>> +   case GL_FLOAT_MAT4:
>> +   default:
>> +      _mesa_problem(NULL, "Invalid type in %s", __func__);
>> +      return;
>> +   }
>>
>> -   uniform =&shProg->Uniforms->Uniforms[loc];
>> +   if (uni->type->is_sampler()) {
>> +      components = 1;
>> +   } else {
>> +      components = uni->type->vector_elements;
>> +   }
>> +
>> +   bool match;
>> +   switch (uni->type->base_type) {
>> +   case GLSL_TYPE_BOOL:
>> +      match = true;
>> +      break;
>> +   case GLSL_TYPE_SAMPLER:
>> +      match = (basicType == GLSL_TYPE_INT);
>> +      break;
>> +   default:
>> +      match = (basicType == uni->type->base_type);
>> +      break;
>> +   }
>> +
>> +   if (uni->type->is_matrix() || components != src_components || !match) {
>> +      _mesa_error(ctx, GL_INVALID_OPERATION, "glUniform(type mismatch)");
>> +      return;
>> +   }
>>
>>     if (ctx->Shader.Flags&  GLSL_UNIFORMS) {
>> -      const GLenum basicType = base_uniform_type(type);
>> -      GLint i;
>> -      printf("Mesa: set program %u uniform %s (loc %d) to: ",
>> -            shProg->Name, uniform->Name, location);
>> -      if (basicType == GL_INT) {
>> -         const GLint *v = (const GLint *) values;
>> -         for (i = 0; i<  count * elems; i++) {
>> -            printf("%d ", v[i]);
>> -         }
>> -      }
>> -      else if (basicType == GL_UNSIGNED_INT) {
>> -         const GLuint *v = (const GLuint *) values;
>> -         for (i = 0; i<  count * elems; i++) {
>> -            printf("%u ", v[i]);
>> -         }
>> -      }
>> -      else {
>> -         const GLfloat *v = (const GLfloat *) values;
>> -         assert(basicType == GL_FLOAT);
>> -         for (i = 0; i<  count * elems; i++) {
>> -            printf("%g ", v[i]);
>> +      log_uniform(values, basicType, components, 1, count,
>> +                 false, shProg, location, uni);
>> +   }
>> +
>> +   /* Validate the texture unit setting.
>> +    */
>> +   /* FINISHME: I cannot find any justification for this in the GL spec.
>> +    */
>> +   if (uni->type->is_sampler()) {
>> +      for (i = 0; i<  count; i++) {
>> +        const unsigned texUnit = ((unsigned *) values)[i];
>> +
>> +         /* check that the sampler (tex unit index) is legal */
>> +         if (texUnit>= ctx->Const.MaxCombinedTextureImageUnits) {
>> +            _mesa_error(ctx, GL_INVALID_VALUE,
>> +                        "glUniform1i(invalid sampler/tex unit index for "
>> +                       "uniform %d)",
>> +                        location);
>> +            return;
>>           }
>>        }
>> -      printf("\n");
>>     }
>>
>> -   /* A uniform var may be used by both a vertex shader and a fragment
>> -    * shader.  We may need to update one or both shader's uniform here:
>> +   /* Page 82 (page 96 of the PDF) of the OpenGL 2.1 spec says:
>> +    *
>> +    *     "When loading N elements starting at an arbitrary position k in a
>> +    *     uniform declared as an array, elements k through k + N - 1 in the
>> +    *     array will be replaced with the new values. Values for any array
>> +    *     element that exceeds the highest array element index used, as
>> +    *     reported by GetActiveUniform, will be ignored by the GL."
>> +    *
>> +    * Clamp 'count' to a valid value.  Note that for non-arrays a count>  1
>> +    * will have already generated an error.
>>      */
>> -   if (shProg->_LinkedShaders[MESA_SHADER_VERTEX]) {
>> -      /* convert uniform location to program parameter index */
>> -      GLint index = uniform->VertPos;
>> -      if (index>= 0) {
>> -         set_program_uniform(ctx,
>> -                             shProg->_LinkedShaders[MESA_SHADER_VERTEX]->Program,
>> -                             index, offset, type, count, elems, values);
>> -      }
>> +   if (uni->array_elements != 0) {
>> +      if (offset>= uni->array_elements)
>> +        return;
>> +
>> +      count = MIN2(count, (uni->array_elements - offset));
>>     }
>>
>> -   if (shProg->_LinkedShaders[MESA_SHADER_FRAGMENT]) {
>> -      /* convert uniform location to program parameter index */
>> -      GLint index = uniform->FragPos;
>> -      if (index>= 0) {
>> -         set_program_uniform(ctx,
>> -                            shProg->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program,
>> -                             index, offset, type, count, elems, values);
>> +   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);
>> +
>> +   /* Store the data in the "actual type" backing storage for the uniform.
>> +    */
>> +   if (!uni->type->is_boolean()) {
>> +      memcpy(&uni->storage[components * offset], values,
>> +            sizeof(uni->storage[0]) * components * count);
>> +   } else {
>> +      const union gl_constant_value *src =
>> +        (const union gl_constant_value *) values;
>> +      union gl_constant_value *dst =&uni->storage[components * offset];
>> +      const unsigned elems = components * count;
>> +
>> +      for (i = 0; i<  elems; i++) {
>> +        if (basicType == GLSL_TYPE_FLOAT) {
>> +           dst[i].i = src[i].f != 0.0f ? 1 : 0;
>> +        } else {
>> +           dst[i].i = src[i].i != 0    ? 1 : 0;
>> +        }
>>        }
>>     }
>>
>> -   if (shProg->_LinkedShaders[MESA_SHADER_GEOMETRY]) {
>> -      /* convert uniform location to program parameter index */
>> -      GLint index = uniform->GeomPos;
>> -      if (index>= 0) {
>> -         set_program_uniform(ctx,
>> -                            shProg->_LinkedShaders[MESA_SHADER_GEOMETRY]->Program,
>> -                             index, offset, type, count, elems, values);
>> +   uni->initialized = true;
>> +   uni->dirty = true;
>> +
>> +   _mesa_propagate_uniforms_to_driver_storage(uni, offset, count);
>> +
>> +   /* If the uniform is a sampler, do the extra magic necessary to propagate
>> +    * the changes through.
>> +    */
>> +   if (uni->type->is_sampler()) {
>> +      for (i = 0; i<  count; i++) {
>> +        shProg->SamplerUnits[uni->sampler + offset + i] =
>> +           ((unsigned *) values)[i];
>>        }
>> -   }
>>
>> -   uniform->Initialized = GL_TRUE;
>> +      bool flushed = false;
>> +      for (i = 0; i<  MESA_SHADER_TYPES; i++) {
>> +        struct gl_program *prog;
>> +
>> +        if (shProg->_LinkedShaders[i] == NULL)
>> +           continue;
>> +
>> +        prog = shProg->_LinkedShaders[i]->Program;
>> +
>> +        assert(sizeof(prog->SamplerUnits) == sizeof(shProg->SamplerUnits));
>> +
>> +        if (memcmp(prog->SamplerUnits,
>> +                   shProg->SamplerUnits,
>> +                   sizeof(shProg->SamplerUnits)) != 0) {
>> +           if (!flushed) {
>> +              FLUSH_VERTICES(ctx, _NEW_TEXTURE | _NEW_PROGRAM);
>> +              flushed = true;
>> +           }
>> +
>> +           memcpy(prog->SamplerUnits,
>> +                  shProg->SamplerUnits,
>> +                  sizeof(shProg->SamplerUnits));
>> +
>> +           _mesa_update_shader_textures_used(prog);
>> +           (void) ctx->Driver.ProgramStringNotify(ctx, prog->Target, prog);
>
> Hi Ian,
>
> This ProgramStringNotify call causes excessive shader recompilations
> in the game Cogs (from some Humble Bundle). Approximately 25% of
> in-game CPU time is spent on translating shaders and register
> allocation. If the shaders were larger, it would be much worse.
> Moreover, I have realized calling ProgramStringNotify is not needed
> for st/mesa in this case at all (I just comment it out if I want
> higher frame rates), because it doesn't use the prog->SamplerUnits
> array when translating shaders. The array is only used when we set
> samplers, so setting _NEW_TEXTURE is sufficient. It seems that i965
> could do the same thing. Another option would be to cache precompiled
> shaders, but that seems more complicated than doing what st/mesa does.

The original way I had thought to fix this was to replace 
ProgramStringNotify with something like SamplersChangedNotify.  This 
would allow an easy upgrade path.  Drivers could just plug their 
existing ProgramStringNotify handler into the new SamplersChangedNotify 
vtable entry and keep working.  Drivers could then be modified, one at a 
time, to do whatever made sense for that driver.