[Mesa-dev] [PATCH] gallium: improve the pipe_stream_output_info struct

Marek Olšák maraeo at gmail.com
Mon Jan 9 11:09:05 PST 2012


On Mon, Jan 9, 2012 at 7:31 PM, Jose Fonseca <jfonseca at vmware.com> wrote:
>
>
> ----- Original Message -----
>> There are 3 changes:
>>
>> 1) stride is specified for each buffer, not just one, so that drivers
>> don't
>>    have to derive it from the outputs
>>
>> 2) new per-output property dst_offset, which specifies the offset
>>    into the buffer in dwords where the output should be stored,
>>    so that drivers don't have to compute the offsets manually;
>>    this will also be useful for gl_SkipComponents
>>    from ARB_transform_feedback3
>>
>> 3) register_mask is removed, instead, there is start_component
>>    and num_components; register_mask with non-consecutive 1s
>>    doesn't make much sense (some hardware cannot do packing of
>>    components)
>>
>> Christoph Bumiller: fixed nvc0.
>> ---
>>  src/gallium/auxiliary/draw/draw_pt_so_emit.c |    4 ++-
>>  src/gallium/auxiliary/util/u_blitter.c       |    4 +-
>>  src/gallium/auxiliary/util/u_dump_state.c    |    6 ++-
>>  src/gallium/drivers/llvmpipe/lp_state_so.c   |    2 +-
>>  src/gallium/drivers/nvc0/nvc0_program.c      |   11 ++----
>>  src/gallium/drivers/r600/r600.h              |    4 +-
>>  src/gallium/drivers/r600/r600_hw_context.c   |    8 ++--
>>  src/gallium/drivers/r600/r600_pipe.h         |    1 -
>>  src/gallium/drivers/r600/r600_shader.c       |   46
>>  +++++++-------------------
>>  src/gallium/drivers/r600/r600_state_common.c |    2 +-
>>  src/gallium/drivers/softpipe/sp_state_so.c   |    2 +-
>>  src/gallium/drivers/trace/tr_dump_state.c    |    6 ++-
>>  src/gallium/include/pipe/p_state.h           |   13 ++++---
>>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp   |   18 ++++------
>>  14 files changed, 53 insertions(+), 74 deletions(-)
>>
>> diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
>> b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
>> index 2dc9e29..7dc6937 100644
>> --- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
>> +++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
>> @@ -144,7 +144,9 @@ static void so_emit_prim(struct pt_so_emit *so,
>>           (const char *)input_ptr + (indices[i] *
>>           input_vertex_stride));
>>        for (slot = 0; slot < state->num_outputs; ++slot) {
>>           unsigned idx = state->output[slot].register_index;
>> -         unsigned writemask = state->output[slot].register_mask;
>> +         unsigned writemask =
>> +               ((1 << state->output[slot].num_components) - 1) <<
>> +               state->output[slot].start_component;
>>           unsigned written_compos = 0;
>>           unsigned compo;
>>
>> diff --git a/src/gallium/auxiliary/util/u_blitter.c
>> b/src/gallium/auxiliary/util/u_blitter.c
>> index 80fdfe0..94012c5 100644
>> --- a/src/gallium/auxiliary/util/u_blitter.c
>> +++ b/src/gallium/auxiliary/util/u_blitter.c
>> @@ -266,8 +266,8 @@ struct blitter_context
>> *util_blitter_create(struct pipe_context *pipe)
>>
>>        memset(&so, 0, sizeof(so));
>>        so.num_outputs = 1;
>> -      so.output[0].register_mask = TGSI_WRITEMASK_XYZW;
>> -      so.stride = 4;
>> +      so.output[0].num_components = 4;
>> +      so.stride[0] = 4;
>>
>>        ctx->vs_pos_only =
>>           util_make_vertex_passthrough_shader_with_so(pipe, 1,
>>           semantic_names,
>> diff --git a/src/gallium/auxiliary/util/u_dump_state.c
>> b/src/gallium/auxiliary/util/u_dump_state.c
>> index e44c619..4b5a042 100644
>> --- a/src/gallium/auxiliary/util/u_dump_state.c
>> +++ b/src/gallium/auxiliary/util/u_dump_state.c
>> @@ -441,13 +441,15 @@ util_dump_shader_state(FILE *stream, const
>> struct pipe_shader_state *state)
>>     util_dump_member_begin(stream, "stream_output");
>>     util_dump_struct_begin(stream, "pipe_stream_output_info");
>>     util_dump_member(stream, uint, &state->stream_output,
>>     num_outputs);
>> -   util_dump_member(stream, uint, &state->stream_output, stride);
>> +   util_dump_array(stream, uint, state->stream_output.stride,
>> +                   Elements(state->stream_output.stride));
>>     util_dump_array_begin(stream);
>>     for(i = 0; i < state->stream_output.num_outputs; ++i) {
>>        util_dump_elem_begin(stream);
>>        util_dump_struct_begin(stream, ""); /* anonymous */
>>        util_dump_member(stream, uint,
>>        &state->stream_output.output[i], register_index);
>> -      util_dump_member(stream, uint,
>> &state->stream_output.output[i], register_mask);
>> +      util_dump_member(stream, uint,
>> &state->stream_output.output[i], start_component);
>> +      util_dump_member(stream, uint,
>> &state->stream_output.output[i], num_components);
>>        util_dump_member(stream, uint,
>>        &state->stream_output.output[i], output_buffer);
>>        util_dump_struct_end(stream);
>>        util_dump_elem_end(stream);
>> diff --git a/src/gallium/drivers/llvmpipe/lp_state_so.c
>> b/src/gallium/drivers/llvmpipe/lp_state_so.c
>> index 108f3aa..ed2272d 100644
>> --- a/src/gallium/drivers/llvmpipe/lp_state_so.c
>> +++ b/src/gallium/drivers/llvmpipe/lp_state_so.c
>> @@ -42,7 +42,7 @@ llvmpipe_create_stream_output_state(struct
>> pipe_context *pipe,
>>
>>     if (so) {
>>        so->base.num_outputs = templ->num_outputs;
>> -      so->base.stride = templ->stride;
>> +      memcpy(so->base.stride, templ->stride, sizeof(templ->stride));
>>        memcpy(so->base.output, templ->output,
>>               templ->num_outputs * sizeof(templ->output[0]));
>>     }
>> diff --git a/src/gallium/drivers/nvc0/nvc0_program.c
>> b/src/gallium/drivers/nvc0/nvc0_program.c
>> index 605bca5..499d20e 100644
>> --- a/src/gallium/drivers/nvc0/nvc0_program.c
>> +++ b/src/gallium/drivers/nvc0/nvc0_program.c
>> @@ -496,20 +496,17 @@ nvc0_program_create_tfb_state(const struct
>> nv50_ir_prog_info *info,
>>        tfb->varying_count[b] = 0;
>>
>>        for (i = 0; i < pso->num_outputs; ++i) {
>> +         unsigned startc = pso->output[i].start_component;
>>           if (pso->output[i].output_buffer != b)
>>              continue;
>> -         for (c = 0; c < 4; ++c) {
>> -            if (!(pso->output[i].register_mask & (1 << c)))
>> -               continue;
>> +         for (c = 0; c < pso->output[i].num_components; ++c) {
>>              tfb->varying_count[b]++;
>>              tfb->varying_index[n++] =
>> -               info->out[pso->output[i].register_index].slot[c];
>> +               info->out[pso->output[i].register_index].slot[startc
>> + c];
>>           }
>>        }
>> -      tfb->stride[b] = tfb->varying_count[b] * 4;
>> +      tfb->stride[b] = pso->stride[b] * 4;
>>     }
>> -   if (pso->stride)
>> -      tfb->stride[0] = pso->stride;
>>
>>     return tfb;
>>  }
>> diff --git a/src/gallium/drivers/r600/r600.h
>> b/src/gallium/drivers/r600/r600.h
>> index 4bfb5a9..baf09c1 100644
>> --- a/src/gallium/drivers/r600/r600.h
>> +++ b/src/gallium/drivers/r600/r600.h
>> @@ -196,7 +196,7 @@ struct r600_so_target {
>>
>>       /* The buffer where BUFFER_FILLED_SIZE is stored. */
>>       struct r600_resource    *filled_size;
>> -     unsigned                stride;
>> +     unsigned                stride_in_dw;
>>       unsigned                so_index;
>>  };
>>
>> @@ -248,7 +248,7 @@ struct r600_context {
>>       struct r600_so_target           *so_targets[PIPE_MAX_SO_BUFFERS];
>>       boolean                         streamout_start;
>>       unsigned                        streamout_append_bitmask;
>> -     unsigned                        *vs_shader_so_strides;
>> +     unsigned                        *vs_so_stride_in_dw;
>>  };
>>
>>  struct r600_draw {
>> diff --git a/src/gallium/drivers/r600/r600_hw_context.c
>> b/src/gallium/drivers/r600/r600_hw_context.c
>> index 1dba966..09bc5a7 100644
>> --- a/src/gallium/drivers/r600/r600_hw_context.c
>> +++ b/src/gallium/drivers/r600/r600_hw_context.c
>> @@ -2012,7 +2012,7 @@ static void r600_set_streamout_enable(struct
>> r600_context *ctx, unsigned buffer_
>>  void r600_context_streamout_begin(struct r600_context *ctx)
>>  {
>>       struct r600_so_target **t = ctx->so_targets;
>> -     unsigned *strides = ctx->vs_shader_so_strides;
>> +     unsigned *stride_in_dw = ctx->vs_so_stride_in_dw;
>>       unsigned buffer_en, i, update_flags = 0;
>>
>>       buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
>> @@ -2043,7 +2043,7 @@ void r600_context_streamout_begin(struct
>> r600_context *ctx)
>>
>>       for (i = 0; i < ctx->num_so_targets; i++) {
>>               if (t[i]) {
>> -                     t[i]->stride = strides[i];
>> +                     t[i]->stride_in_dw = stride_in_dw[i];
>>                       t[i]->so_index = i;
>>
>>                       update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i);
>> @@ -2053,7 +2053,7 @@ void r600_context_streamout_begin(struct
>> r600_context *ctx)
>>                                                       16*i - R600_CONTEXT_REG_OFFSET) >> 2;
>>                       ctx->pm4[ctx->pm4_cdwords++] = (t[i]->b.buffer_offset +
>>                                                       t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
>> -                     ctx->pm4[ctx->pm4_cdwords++] = strides[i] >> 2;            /* VTX_STRIDE
>> (in DW) */
>> +                     ctx->pm4[ctx->pm4_cdwords++] = stride_in_dw[i];            /* VTX_STRIDE
>> (in DW) */
>>                       ctx->pm4[ctx->pm4_cdwords++] = 0;                          /* BUFFER_BASE */
>>
>>                       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
>> @@ -2165,7 +2165,7 @@ void r600_context_draw_opaque_count(struct
>> r600_context *ctx, struct r600_so_tar
>>
>>       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
>>       ctx->pm4[ctx->pm4_cdwords++] =
>>       (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE -
>>       R600_CONTEXT_REG_OFFSET) >> 2;
>> -     ctx->pm4[ctx->pm4_cdwords++] = t->stride >> 2;
>> +     ctx->pm4[ctx->pm4_cdwords++] = t->stride_in_dw;
>>
>>       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_COPY_DW, 4, 0);
>>       ctx->pm4[ctx->pm4_cdwords++] = COPY_DW_SRC_IS_MEM |
>>       COPY_DW_DST_IS_REG;
>> diff --git a/src/gallium/drivers/r600/r600_pipe.h
>> b/src/gallium/drivers/r600/r600_pipe.h
>> index 447b9dc..91eb0e8 100644
>> --- a/src/gallium/drivers/r600/r600_pipe.h
>> +++ b/src/gallium/drivers/r600/r600_pipe.h
>> @@ -147,7 +147,6 @@ struct r600_pipe_shader {
>>       struct tgsi_token               *tokens;
>>       unsigned        sprite_coord_enable;
>>       struct pipe_stream_output_info  so;
>> -     unsigned                        so_strides[4];
>>  };
>>
>>  struct r600_pipe_sampler_state {
>> diff --git a/src/gallium/drivers/r600/r600_shader.c
>> b/src/gallium/drivers/r600/r600_shader.c
>> index ad4aded..9dbbbd8 100644
>> --- a/src/gallium/drivers/r600/r600_shader.c
>> +++ b/src/gallium/drivers/r600/r600_shader.c
>> @@ -124,12 +124,14 @@ int r600_pipe_shader_create(struct pipe_context
>> *ctx, struct r600_pipe_shader *s
>>                       unsigned i;
>>                       fprintf(stderr, "STREAMOUT\n");
>>                       for (i = 0; i < shader->so.num_outputs; i++) {
>> +                             unsigned mask = ((1 << shader->so.output[i].num_components) - 1)
>> <<
>> +                                             shader->so.output[i].start_component;
>>                               fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
>>                                       shader->so.output[i].output_buffer,
>>                                       shader->so.output[i].register_index,
>> -                                     shader->so.output[i].register_mask & 1 ? "x" : "_",
>> -                                     (shader->so.output[i].register_mask >> 1) & 1 ? "y" :
>> "_",
>> -                                     (shader->so.output[i].register_mask >> 2) & 1 ? "z" :
>> "_",
>> -                                     (shader->so.output[i].register_mask >> 3) & 1 ? "w" :
>> "_");
>> +                                     mask & 1 ? "x" : "_",
>> +                                     (mask >> 1) & 1 ? "y" : "_",
>> +                                     (mask >> 2) & 1 ? "z" : "_",
>> +                                     (mask >> 3) & 1 ? "w" : "_");
>>                       }
>>               }
>>       }
>> @@ -863,11 +865,8 @@ static int r600_shader_from_tgsi(struct
>> r600_pipe_context * rctx, struct r600_pi
>>
>>       /* Add stream outputs. */
>>       if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
>> -             unsigned buffer_offset[PIPE_MAX_SO_BUFFERS] = {0};
>> -
>>               for (i = 0; i < so.num_outputs; i++) {
>>                       struct r600_bytecode_output output;
>> -                     unsigned comps;
>>
>>                       if (so.output[i].output_buffer >= 4) {
>>                               R600_ERR("exceeded the max number of stream output buffers, got:
>>                               %d\n",
>> @@ -875,36 +874,21 @@ static int r600_shader_from_tgsi(struct
>> r600_pipe_context * rctx, struct r600_pi
>>                               r = -EINVAL;
>>                               goto out_err;
>>                       }
>> -
>> -                     switch (so.output[i].register_mask) {
>> -                     case TGSI_WRITEMASK_XYZW:
>> -                             comps = 4;
>> -                             break;
>> -                     case TGSI_WRITEMASK_XYZ:
>> -                             comps = 3;
>> -                             break;
>> -                     case TGSI_WRITEMASK_XY:
>> -                             comps = 2;
>> -                             break;
>> -                     case TGSI_WRITEMASK_X:
>> -                             comps = 1;
>> -                             break;
>> -                     default:
>> -                             R600_ERR("streamout: invalid register_mask, got: %x\n",
>> -                                      so.output[i].register_mask);
>> -                             r = -EINVAL;
>> -                             goto out_err;
>> +                     if (so.output[i].start_component) {
>> +                        R600_ERR("stream_output - start_component cannot be
>> non-zero\n");
>> +                        r = -EINVAL;
>> +                        goto out_err;
>>                       }
>>
>>                       memset(&output, 0, sizeof(struct r600_bytecode_output));
>>                       output.gpr = shader->output[so.output[i].register_index].gpr;
>>                       output.elem_size = 0;
>> -                     output.array_base = buffer_offset[so.output[i].output_buffer];
>> +                     output.array_base = so.output[i].dst_offset;
>>                       output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
>>                       output.burst_count = 1;
>>                       output.barrier = 1;
>>                       output.array_size = 0;
>> -                     output.comp_mask = so.output[i].register_mask;
>> +                     output.comp_mask = (1 << so.output[i].num_components) - 1;
>>                       if (ctx.bc->chip_class >= EVERGREEN) {
>>                               switch (so.output[i].output_buffer) {
>>                               case 0:
>> @@ -939,12 +923,6 @@ static int r600_shader_from_tgsi(struct
>> r600_pipe_context * rctx, struct r600_pi
>>                       r = r600_bytecode_add_output(ctx.bc, &output);
>>                       if (r)
>>                               goto out_err;
>> -
>> -                     buffer_offset[so.output[i].output_buffer] += comps;
>> -             }
>> -
>> -             for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
>> -                     pipeshader->so_strides[i] = buffer_offset[i] * 4;
>>               }
>>       }
>>
>> diff --git a/src/gallium/drivers/r600/r600_state_common.c
>> b/src/gallium/drivers/r600/r600_state_common.c
>> index 054ab90..d96857e 100644
>> --- a/src/gallium/drivers/r600/r600_state_common.c
>> +++ b/src/gallium/drivers/r600/r600_state_common.c
>> @@ -645,7 +645,7 @@ void r600_draw_vbo(struct pipe_context *ctx,
>> const struct pipe_draw_info *dinfo)
>>               }
>>       }
>>
>> -     rctx->ctx.vs_shader_so_strides = rctx->vs_shader->so_strides;
>> +     rctx->ctx.vs_so_stride_in_dw = rctx->vs_shader->so.stride;
>>
>>       mask = (1ULL << ((unsigned)rctx->framebuffer.nr_cbufs * 4)) - 1;
>>
>> diff --git a/src/gallium/drivers/softpipe/sp_state_so.c
>> b/src/gallium/drivers/softpipe/sp_state_so.c
>> index 31ef384..f4b5cdf 100644
>> --- a/src/gallium/drivers/softpipe/sp_state_so.c
>> +++ b/src/gallium/drivers/softpipe/sp_state_so.c
>> @@ -43,7 +43,7 @@ softpipe_create_stream_output_state(struct
>> pipe_context *pipe,
>>
>>     if (so) {
>>        so->base.num_outputs = templ->num_outputs;
>> -      so->base.stride = templ->stride;
>> +      memcpy(so->base.stride, templ->stride, sizeof(templ->stride));
>>        memcpy(so->base.output, templ->output,
>>               templ->num_outputs * sizeof(templ->output[0]));
>>     }
>> diff --git a/src/gallium/drivers/trace/tr_dump_state.c
>> b/src/gallium/drivers/trace/tr_dump_state.c
>> index 038a80e..7ce477e 100644
>> --- a/src/gallium/drivers/trace/tr_dump_state.c
>> +++ b/src/gallium/drivers/trace/tr_dump_state.c
>> @@ -271,14 +271,16 @@ void trace_dump_shader_state(const struct
>> pipe_shader_state *state)
>>     trace_dump_member_begin("stream_output");
>>     trace_dump_struct_begin("pipe_stream_output_info");
>>     trace_dump_member(uint, &state->stream_output, num_outputs);
>> -   trace_dump_member(uint, &state->stream_output, stride);
>> +   trace_dump_array(uint, state->stream_output.stride,
>> PIPE_MAX_SO_BUFFERS);
>>     trace_dump_array_begin();
>>     for(i = 0; i < state->stream_output.num_outputs; ++i) {
>>        trace_dump_elem_begin();
>>        trace_dump_struct_begin(""); /* anonymous */
>>        trace_dump_member(uint, &state->stream_output.output[i],
>>        register_index);
>> -      trace_dump_member(uint, &state->stream_output.output[i],
>> register_mask);
>> +      trace_dump_member(uint, &state->stream_output.output[i],
>> start_component);
>> +      trace_dump_member(uint, &state->stream_output.output[i],
>> num_components);
>>        trace_dump_member(uint, &state->stream_output.output[i],
>>        output_buffer);
>> +      trace_dump_member(uint, &state->stream_output.output[i],
>> dst_offset);
>>        trace_dump_struct_end();
>>        trace_dump_elem_end();
>>     }
>> diff --git a/src/gallium/include/pipe/p_state.h
>> b/src/gallium/include/pipe/p_state.h
>> index f943ca5..15cc001 100644
>> --- a/src/gallium/include/pipe/p_state.h
>> +++ b/src/gallium/include/pipe/p_state.h
>> @@ -176,16 +176,19 @@ struct pipe_clip_state
>>  struct pipe_stream_output_info
>>  {
>>     unsigned num_outputs;
>> -   /** stride for an entire vertex, only used if all output_buffers
>> are 0 */
>> -   unsigned stride;
>> +   /** stride for an entire vertex for each buffer in dwords */
>> +   unsigned stride[PIPE_MAX_SO_BUFFERS];
>
> If it per output buffer then please move it inside the array below. Otherwise no objection.

The array is per shader output (there can be 16 or 32 outputs -
depending on the driver), while the strides are per output buffer
(there can be at most 4 buffers).

> BTW, why stride in dwords? Elsewhere in gallium we always use bytes.

In OpenGL, the stride is always dword-aligned and cannot be changed
explicitly. Also, core Mesa represents the stride in terms of the
number of components, which are dwords. Finally, my hardware wants the
stride to be in dwords. I decided to use everyone's representation in
the pipe interface to assure it's implementable without fallbacks
everywhere.

Marek


More information about the mesa-dev mailing list