[Mesa-dev] [PATCH 5/6] r600g: add streamout support

Fri Jul 10 07:18:38 PDT 2015

On Thu, Jul 9, 2015 at 8:43 AM, Dave Airlie <airlied at gmail.com> wrote:
> From: Glenn Kennard <glenn.kennard at gmail.com>
>
> This adds the main chunk of the geometry shader multiple stream
> support to the r600 driver.
>
> Glenn wrote the original pass, and I took his code and hacked
> it into a working state.
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>  src/gallium/drivers/r600/evergreen_state.c   |  29 ++--
>  src/gallium/drivers/r600/r600_pipe.c         |   2 +-
>  src/gallium/drivers/r600/r600_shader.c       | 200 ++++++++++++++++++++-------
>  src/gallium/drivers/r600/r600_shader.h       |   6 +-
>  src/gallium/drivers/r600/r600_state.c        |   6 +-
>  src/gallium/drivers/r600/r600_state_common.c |   7 +
>  6 files changed, 180 insertions(+), 70 deletions(-)
>
> diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
> index 4ddbc0b..788bf54 100644
> --- a/src/gallium/drivers/r600/evergreen_state.c
> +++ b/src/gallium/drivers/r600/evergreen_state.c
> @@ -2988,8 +2988,12 @@ void evergreen_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader
>         struct r600_command_buffer *cb = &shader->command_buffer;
>         struct r600_shader *rshader = &shader->shader;
>         struct r600_shader *cp_shader = &shader->gs_copy_shader->shader;
> -       unsigned gsvs_itemsize =
> -                       (cp_shader->ring_item_size * rshader->gs_max_out_vertices) >> 2;
> +       unsigned gsvs_itemsizes[4] = {
> +                       (cp_shader->ring_item_sizes[0] * rshader->gs_max_out_vertices) >> 2,
> +                       (cp_shader->ring_item_sizes[1] * rshader->gs_max_out_vertices) >> 2,
> +                       (cp_shader->ring_item_sizes[2] * rshader->gs_max_out_vertices) >> 2,
> +                       (cp_shader->ring_item_sizes[3] * rshader->gs_max_out_vertices) >> 2
> +       };
>
>         r600_init_command_buffer(cb, 64);
>
> @@ -3008,21 +3012,24 @@ void evergreen_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader
>                                 S_028B90_ENABLE(rshader->gs_num_invocations > 0));
>         }
>         r600_store_context_reg_seq(cb, R_02891C_SQ_GS_VERT_ITEMSIZE, 4);
> -       r600_store_value(cb, cp_shader->ring_item_size >> 2);
> -       r600_store_value(cb, 0);
> -       r600_store_value(cb, 0);
> -       r600_store_value(cb, 0);
> +       r600_store_value(cb, cp_shader->ring_item_sizes[0] >> 2);
> +       r600_store_value(cb, cp_shader->ring_item_sizes[1] >> 2);
> +       r600_store_value(cb, cp_shader->ring_item_sizes[2] >> 2);
> +       r600_store_value(cb, cp_shader->ring_item_sizes[3] >> 2);
>
>         r600_store_context_reg(cb, R_028900_SQ_ESGS_RING_ITEMSIZE,
> -                              (rshader->ring_item_size) >> 2);
> +                              (rshader->ring_item_sizes[0]) >> 2);
>
>         r600_store_context_reg(cb, R_028904_SQ_GSVS_RING_ITEMSIZE,
> -                              gsvs_itemsize);
> +                              gsvs_itemsizes[0] +
> +                              gsvs_itemsizes[1] +
> +                              gsvs_itemsizes[2] +
> +                              gsvs_itemsizes[3]);
>
>         r600_store_context_reg_seq(cb, R_02892C_SQ_GSVS_RING_OFFSET_1, 3);
> -       r600_store_value(cb, gsvs_itemsize);
> -       r600_store_value(cb, gsvs_itemsize);
> -       r600_store_value(cb, gsvs_itemsize);
> +       r600_store_value(cb, gsvs_itemsizes[0]);
> +       r600_store_value(cb, gsvs_itemsizes[0] + gsvs_itemsizes[1]);
> +       r600_store_value(cb, gsvs_itemsizes[0] + gsvs_itemsizes[1] + gsvs_itemsizes[2]);
>
>         /* FIXME calculate these values somehow ??? */
>         r600_store_context_reg_seq(cb, R_028A54_GS_PER_ES, 3);
> diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
> index 143e98e..67caa69 100644
> --- a/src/gallium/drivers/r600/r600_pipe.c
> +++ b/src/gallium/drivers/r600/r600_pipe.c
> @@ -352,7 +352,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
>         case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
>                 return 16384;
>         case PIPE_CAP_MAX_VERTEX_STREAMS:
> -               return 1;
> +               return family >= CHIP_CEDAR ? 4 : 1;
>
>         case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
>                 return 2047;
> diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
> index dbff313..ef19706 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -311,7 +311,9 @@ struct r600_shader_ctx {
>         int                                     gs_out_ring_offset;
>         int                                     gs_next_vertex;
>         struct r600_shader      *gs_for_vs;
> -       int                                     gs_export_gpr_treg;
> +       int                                     gs_export_gpr_tregs[4];
> +       const struct pipe_stream_output_info    *gs_stream_output_info;
> +       unsigned                                enabled_stream_buffers_mask; /* See r600_streamout.enabled_stream_buffers_mask */
>  };
>
>  struct r600_shader_tgsi_instruction {
> @@ -319,7 +321,7 @@ struct r600_shader_tgsi_instruction {
>         int (*process)(struct r600_shader_ctx *ctx);
>  };
>
> -static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind);
> +static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
>  static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
>  static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
>  static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
> @@ -1336,9 +1338,11 @@ static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
>         return 0;
>  }
>
> -static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so)
> +static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
> +                                                 int stream, unsigned *stream_item_size)
>  {
>         unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
> +       unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
>         int i, j, r;
>
>         /* Sanity checking. */
> @@ -1358,8 +1362,9 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output
>
>         /* Initialize locations where the outputs are stored. */
>         for (i = 0; i < so->num_outputs; i++) {
> -               so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
>
> +               so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
> +               start_comp[i] = so->output[i].start_component;
>                 /* Lower outputs with dst_offset < start_component.
>                  *
>                  * We can only output 4D vectors with a write mask, e.g. we can
> @@ -1385,7 +1390,7 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output
>                                 if (r)
>                                         return r;
>                         }
> -                       so->output[i].start_component = 0;
> +                       start_comp[i] = 0;
>                         so_gpr[i] = tmp;
>                 }
>         }
> @@ -1394,16 +1399,21 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output
>         for (i = 0; i < so->num_outputs; i++) {
>                 struct r600_bytecode_output output;
>
> +               if (stream != -1 && stream != so->output[i].output_buffer)
> +                       continue;
> +
>                 memset(&output, 0, sizeof(struct r600_bytecode_output));
>                 output.gpr = so_gpr[i];
> -               output.elem_size = so->output[i].num_components;
> -               output.array_base = so->output[i].dst_offset - so->output[i].start_component;
> +               output.elem_size = so->output[i].num_components - 1;
> +               if (output.elem_size == 2)
> +                       output.elem_size = 3; // 3 not supported, write 4 with junk at end
> +               output.array_base = so->output[i].dst_offset - start_comp[i];
>                 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
>                 output.burst_count = 1;
>                 /* array_size is an upper limit for the burst_count
>                  * with MEM_STREAM instructions */
>                 output.array_size = 0xFFF;
> -               output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
> +               output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
>                 if (ctx->bc->chip_class >= EVERGREEN) {
>                         switch (so->output[i].output_buffer) {
>                         case 0:
> @@ -1419,6 +1429,9 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output
>                                 output.op = CF_OP_MEM_STREAM0_BUF3;
>                                 break;
>                         }
> +                       output.op += so->output[i].stream * 4;
> +                       assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
> +                       ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
>                 } else {
>                         switch (so->output[i].output_buffer) {
>                         case 0:
> @@ -1434,6 +1447,7 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output
>                                 output.op = CF_OP_MEM_STREAM3;
>                                         break;
>                         }
> +                       ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
>                 }
>                 r = r600_bytecode_add_output(ctx->bc, &output);
>                 if (r)
> @@ -1486,7 +1500,8 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
>         struct r600_bytecode_output output;
>         struct r600_bytecode_cf *cf_jump, *cf_pop,
>                 *last_exp_pos = NULL, *last_exp_param = NULL;
> -       int i, next_clip_pos = 61, next_param = 0;
> +       int i, j, next_clip_pos = 61, next_param = 0;
> +       int ring;
>
>         cshader = calloc(1, sizeof(struct r600_pipe_shader));
>         if (!cshader)
> @@ -1506,6 +1521,9 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
>
>         ctx.bc->isa = rctx->isa;
>
> +       cf_jump = NULL;
> +       memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
> +
>         /* R0.x = R0.x & 0x3fffffff */
>         memset(&alu, 0, sizeof(alu));
>         alu.op = ALU_OP2_AND_INT;
> @@ -1524,22 +1542,10 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
>         alu.last = 1;
>         r600_bytecode_add_alu(ctx.bc, &alu);
>
> -       /* PRED_SETE_INT __, R0.y, 0 */
> -       memset(&alu, 0, sizeof(alu));
> -       alu.op = ALU_OP2_PRED_SETE_INT;
> -       alu.src[0].chan = 1;
> -       alu.src[1].sel = V_SQ_ALU_SRC_0;
> -       alu.execute_mask = 1;
> -       alu.update_pred = 1;
> -       alu.last = 1;
> -       r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
> -
> -       r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
> -       cf_jump = ctx.bc->cf_last;
> -
>         /* fetch vertex data from GSVS ring */
>         for (i = 0; i < ocnt; ++i) {
>                 struct r600_shader_io *out = &ctx.shader->output[i];
> +
>                 out->gpr = i + 1;
>                 out->ring_offset = i * 16;
>
> @@ -1549,6 +1555,7 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
>                 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
>                 vtx.offset = out->ring_offset;
>                 vtx.dst_gpr = out->gpr;
> +               vtx.src_gpr = 0;
>                 vtx.dst_sel_x = 0;
>                 vtx.dst_sel_y = 1;
>                 vtx.dst_sel_z = 2;
> @@ -1561,18 +1568,68 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
>
>                 r600_bytecode_add_vtx(ctx.bc, &vtx);
>         }
> +       ctx.temp_reg = i + 1;
> +       for (ring = 3; ring >= 0; --ring) {
> +               bool enabled = false;
> +               for (i = 0; i < so->num_outputs; i++) {
> +                       if (so->output[i].stream == ring) {
> +                               enabled = true;
> +                               break;
> +                       }
> +               }
> +               if (ring != 0 && !enabled) {
> +                       cshader->shader.ring_item_sizes[ring] = 0;
> +                       continue;
> +               }
>
> -       /* XXX handle clipvertex, streamout? */
> -       emit_streamout(&ctx, so);
> +               if (cf_jump) {
> +                       // Patch up jump label
> +                       r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
> +                       cf_pop = ctx.bc->cf_last;
> +
> +                       cf_jump->cf_addr = cf_pop->id + 2;
> +                       cf_jump->pop_count = 1;
> +                       cf_pop->cf_addr = cf_pop->id + 2;
> +                       cf_pop->pop_count = 1;
> +               }
> +
> +               /* PRED_SETE_INT __, R0.y, ring */
> +               memset(&alu, 0, sizeof(alu));
> +               alu.op = ALU_OP2_PRED_SETE_INT;
> +               alu.src[0].chan = 1;
> +               alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
> +               alu.src[1].value = ring;
> +               alu.execute_mask = 1;
> +               alu.update_pred = 1;
> +               alu.last = 1;
> +               r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
> +
> +               r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
> +               cf_jump = ctx.bc->cf_last;
> +
> +               if (enabled)
> +                       emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
> +               cshader->shader.ring_item_sizes[ring] = ocnt * 16;
> +       }
>
>         /* export vertex data */
>         /* XXX factor out common code with r600_shader_from_tgsi ? */
>         for (i = 0; i < ocnt; ++i) {
>                 struct r600_shader_io *out = &ctx.shader->output[i];
> -
> +               bool instream0 = true;
>                 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
>                         continue;
>
> +               for (j = 0; j < so->num_outputs; j++) {
> +                       if (so->output[j].register_index == i) {
> +                               if (so->output[j].stream == 0)
> +                                       break;
> +                               if (so->output[j].stream > 0)
> +                                       instream0 = false;
> +                       }
> +               }
> +               if (!instream0)
> +                       continue;
>                 memset(&output, 0, sizeof(output));
>                 output.gpr = out->gpr;
>                 output.elem_size = 3;
> @@ -1718,17 +1775,19 @@ static int generate_gs_copy_shader(struct r600_context *rctx,
>         }
>
>         gs->gs_copy_shader = cshader;
> +       cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
>
>         ctx.bc->nstack = 1;
> -       cshader->shader.ring_item_size = ocnt * 16;
>
>         return r600_bytecode_build(ctx.bc);
>  }
>
> -static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
> +static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
>  {
>         struct r600_bytecode_output output;
>         int i, k, ring_offset;
> +       int effective_stream = stream == -1 ? 0 : stream;
> +       int idx = 0;
>
>         for (i = 0; i < ctx->shader->noutput; i++) {
>                 if (ctx->gs_for_vs) {
> @@ -1745,15 +1804,18 @@ static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
>
>                         if (ring_offset == -1)
>                                 continue;
> -               } else
> -                       ring_offset = i * 16;
> +               } else {
> +                       ring_offset = idx * 16;
> +                       idx++;
> +               }
>
> +               if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
> +                       continue;
>                 /* next_ring_offset after parsing input decls contains total size of
>                  * single vertex data, gs_next_vertex - current vertex index */
>                 if (!ind)
>                         ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
>
> -               /* get a temp and add the ring offset to the next vertex base in the shader */
>                 memset(&output, 0, sizeof(struct r600_bytecode_output));
>                 output.gpr = ctx->shader->output[i].gpr;
>                 output.elem_size = 3;
> @@ -1764,28 +1826,39 @@ static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
>                         output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
>                 else
>                         output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
> -               output.op = CF_OP_MEM_RING;
>
> +               switch (stream) {
> +               default:
> +               case 0:
> +                       output.op = CF_OP_MEM_RING; break;
> +               case 1:
> +                       output.op = CF_OP_MEM_RING1; break;
> +               case 2:
> +                       output.op = CF_OP_MEM_RING2; break;
> +               case 3:
> +                       output.op = CF_OP_MEM_RING3; break;
> +               }
>
>                 if (ind) {
>                         output.array_base = ring_offset >> 2; /* in dwords */
>                         output.array_size = 0xfff;
> -                       output.index_gpr = ctx->gs_export_gpr_treg;
> +                       output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
>                 } else
>                         output.array_base = ring_offset >> 2; /* in dwords */
>                 r600_bytecode_add_output(ctx->bc, &output);
>         }
>
>         if (ind) {
> +               /* get a temp and add the ring offset to the next vertex base in the shader */
>                 struct r600_bytecode_alu alu;
>                 int r;
>
>                 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
>                 alu.op = ALU_OP2_ADD_INT;
> -               alu.src[0].sel = ctx->gs_export_gpr_treg;
> +               alu.src[0].sel = ctx->gs_export_gpr_tregs[effective_stream];
>                 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
>                 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
> -               alu.dst.sel = ctx->gs_export_gpr_treg;
> +               alu.dst.sel = ctx->gs_export_gpr_tregs[effective_stream];
>                 alu.dst.write = 1;
>                 alu.last = 1;
>                 r = r600_bytecode_add_alu(ctx->bc, &alu);
> @@ -1851,6 +1924,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
>         ctx.next_ring_offset = 0;
>         ctx.gs_out_ring_offset = 0;
>         ctx.gs_next_vertex = 0;
> +       ctx.gs_stream_output_info = &so;
>
>         shader->uses_index_registers = false;
>         ctx.face_gpr = -1;
> @@ -1937,8 +2011,11 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
>         ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
>
>         if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
> -               ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 3;
> -               ctx.temp_reg = ctx.bc->ar_reg + 4;
> +               ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
> +               ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
> +               ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
> +               ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
> +               ctx.temp_reg = ctx.bc->ar_reg + 7;
>         } else {
>                 ctx.temp_reg = ctx.bc->ar_reg + 3;
>         }
> @@ -2026,7 +2103,10 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
>                 }
>         }
>
> -       shader->ring_item_size = ctx.next_ring_offset;
> +       shader->ring_item_sizes[0] = ctx.next_ring_offset;
> +       shader->ring_item_sizes[1] = 0;
> +       shader->ring_item_sizes[2] = 0;
> +       shader->ring_item_sizes[3] = 0;
>
>         /* Process two side if needed */
>         if (shader->two_side && ctx.colors_used) {
> @@ -2150,17 +2230,18 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
>                 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
>                         struct r600_bytecode_alu alu;
>                         int r;
> -
> -                       memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> -                       alu.op = ALU_OP1_MOV;
> -                       alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
> -                       alu.src[0].value = 0;
> -                       alu.dst.sel = ctx.gs_export_gpr_treg;
> -                       alu.dst.write = 1;
> -                       alu.last = 1;
> -                       r = r600_bytecode_add_alu(ctx.bc, &alu);
> -                       if (r)
> -                               return r;
> +                       for (j = 0; j < 4; j++) {
> +                               memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +                               alu.op = ALU_OP1_MOV;
> +                               alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
> +                               alu.src[0].value = 0;
> +                               alu.dst.sel = ctx.gs_export_gpr_tregs[j];
> +                               alu.dst.write = 1;
> +                               alu.last = 1;
> +                               r = r600_bytecode_add_alu(ctx.bc, &alu);
> +                               if (r)
> +                                       return r;
> +                       }
>                 }
>                 if (shader->two_side && ctx.colors_used) {
>                         if ((r = process_twoside_color_inputs(&ctx)))
> @@ -2261,13 +2342,19 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
>         /* Add stream outputs. */
>         if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
>             so.num_outputs && !use_llvm)
> -               emit_streamout(&ctx, &so);
> +               emit_streamout(&ctx, &so, -1, NULL);
>
>         convert_edgeflag_to_int(&ctx);
>
>         if (ring_outputs) {
> -               if (key.vs_as_es)
> -                       emit_gs_ring_writes(&ctx, FALSE);
> +               if (key.vs_as_es) {
> +                       ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
> +                       ctx.gs_export_gpr_tregs[1] = -1;
> +                       ctx.gs_export_gpr_tregs[2] = -1;
> +                       ctx.gs_export_gpr_tregs[3] = -1;
> +
> +                       emit_gs_ring_writes(&ctx, &so, -1, FALSE);
> +               }
>         } else {
>                 /* Export output */
>                 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
> @@ -7217,10 +7304,17 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
>
>  static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
>  {
> +       struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
> +       int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
> +       int r;
> +
>         if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
> -               emit_gs_ring_writes(ctx, TRUE);
> +               emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
>
> -       return r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
> +       r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
> +       if (!r)
> +               ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
> +       return r;
>  }
>
>  static int tgsi_umad(struct r600_shader_ctx *ctx)
> diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
> index dd359d7..30eca52 100644
> --- a/src/gallium/drivers/r600/r600_shader.h
> +++ b/src/gallium/drivers/r600/r600_shader.h
> @@ -83,8 +83,9 @@ struct r600_shader {
>         unsigned                gs_output_prim;
>         unsigned                gs_max_out_vertices;
>         unsigned                gs_num_invocations;
> -       /* size in bytes of a data item in the ring (single vertex data) */
> -       unsigned                ring_item_size;
> +       /* Size in bytes of a data item in the ring(s) (single vertex data).
> +          For stages with only one ring, items 1, 2 and 3 will be set to 0. */
> +       unsigned                ring_item_sizes[4];
>
>         unsigned                indirect_files;
>         unsigned                max_arrays;
> @@ -125,6 +126,7 @@ struct r600_pipe_shader {
>         struct r600_shader_key  key;
>         unsigned                db_shader_control;
>         unsigned                ps_depth_export;
> +       unsigned                enabled_stream_buffers_mask; /* See r600_streamout.enabled_stream_buffers_mask */
>  };
>
>  /* return the table index 0-5 for TGSI_INTERPOLATE_LINEAR/PERSPECTIVE and
> diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
> index 960dfce..fb79a5b 100644
> --- a/src/gallium/drivers/r600/r600_state.c
> +++ b/src/gallium/drivers/r600/r600_state.c
> @@ -2650,7 +2650,7 @@ void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
>         struct r600_shader *rshader = &shader->shader;
>         struct r600_shader *cp_shader = &shader->gs_copy_shader->shader;
>         unsigned gsvs_itemsize =
> -                       (cp_shader->ring_item_size * rshader->gs_max_out_vertices) >> 2;
> +                       (cp_shader->ring_item_sizes[0] * rshader->gs_max_out_vertices) >> 2;
>
>         r600_init_command_buffer(cb, 64);
>
> @@ -2665,10 +2665,10 @@ void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
>                                r600_conv_prim_to_gs_out(rshader->gs_output_prim));
>
>         r600_store_context_reg(cb, R_0288C8_SQ_GS_VERT_ITEMSIZE,
> -                              cp_shader->ring_item_size >> 2);
> +                              cp_shader->ring_item_sizes[0] >> 2);
>
>         r600_store_context_reg(cb, R_0288A8_SQ_ESGS_RING_ITEMSIZE,
> -                              (rshader->ring_item_size) >> 2);
> +                              (rshader->ring_item_sizes[0]) >> 2);
>
>         r600_store_context_reg(cb, R_0288AC_SQ_GSVS_RING_ITEMSIZE,
>                                gsvs_itemsize);
> diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
> index 13dc9ee..f8fbaf3 100644
> --- a/src/gallium/drivers/r600/r600_state_common.c
> +++ b/src/gallium/drivers/r600/r600_state_common.c
> @@ -1208,6 +1208,13 @@ static bool r600_update_derived_state(struct r600_context *rctx)
>                                 rctx->clip_misc_state.clip_disable = rctx->gs_shader->current->shader.vs_position_window_space;
>                                 rctx->clip_misc_state.atom.dirty = true;
>                         }
> +                       /* Need to reapply R_028B98_VGT_STRMOUT_BUFFER_CONFIG streamout state, if any */
> +                       if (rctx->b.streamout.enabled_stream_buffers_mask !=
> +                               rctx->gs_shader->current->gs_copy_shader->enabled_stream_buffers_mask) {
> +                               rctx->b.streamout.enabled_stream_buffers_mask =
> +                                       rctx->gs_shader->current->gs_copy_shader->enabled_stream_buffers_mask;
> +                               rctx->b.streamout.begin_atom.dirty = true;
> +                       }

This is a very bad idea. Streamout cannot be started if it has been
started already. Things could go bananas. For each "begin", there
should be exactly one "end". The solution is to move emitting
VGT_STRMOUT_BUFFER_CONFIG to a different place, e.g.
r600_emit_streamout_enable.

Marek