[Mesa-dev] [PATCH 2/2] nvc0: implement new stream output interface

Christoph Bumiller e0425955 at student.tuwien.ac.at
Sun Nov 20 04:43:59 PST 2011


---
 src/gallium/drivers/nouveau/nv_object.xml.h    |   13 ++-
 src/gallium/drivers/nvc0/nvc0_3d.xml.h         |    8 +-
 src/gallium/drivers/nvc0/nvc0_context.c        |    2 +-
 src/gallium/drivers/nvc0/nvc0_context.h        |   25 +++--
 src/gallium/drivers/nvc0/nvc0_program.c        |   42 ++++++++
 src/gallium/drivers/nvc0/nvc0_program.h        |   11 ++-
 src/gallium/drivers/nvc0/nvc0_push.c           |   21 +++-
 src/gallium/drivers/nvc0/nvc0_query.c          |  135 +++++++++++++++---------
 src/gallium/drivers/nvc0/nvc0_screen.c         |    8 ++-
 src/gallium/drivers/nvc0/nvc0_shader_state.c   |  104 +++++++++++-------
 src/gallium/drivers/nvc0/nvc0_state.c          |  118 ++++++++++----------
 src/gallium/drivers/nvc0/nvc0_state_validate.c |   16 +++-
 src/gallium/drivers/nvc0/nvc0_stateobj.h       |   16 ++-
 src/gallium/drivers/nvc0/nvc0_surface.c        |    6 +-
 src/gallium/drivers/nvc0/nvc0_vbo.c            |   35 ++++++
 15 files changed, 371 insertions(+), 189 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv_object.xml.h b/src/gallium/drivers/nouveau/nv_object.xml.h
index a5b0d04..47dc675 100644
--- a/src/gallium/drivers/nouveau/nv_object.xml.h
+++ b/src/gallium/drivers/nouveau/nv_object.xml.h
@@ -185,15 +185,18 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define NV01_SUBCHAN_OBJECT					0x00000000
 
 
-#define NV84_SUBCHAN_QUERY_ADDRESS_HIGH				0x00000010
+#define NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH			0x00000010
 
-#define NV84_SUBCHAN_QUERY_ADDRESS_LOW				0x00000014
+#define NV84_SUBCHAN_SEMAPHORE_ADDRESS_LOW			0x00000014
 
-#define NV84_SUBCHAN_QUERY_COUNTER				0x00000018
+#define NV84_SUBCHAN_SEMAPHORE_SEQUENCE				0x00000018
 
-#define NV84_SUBCHAN_QUERY_GET					0x0000001c
+#define NV84_SUBCHAN_SEMAPHORE_TRIGGER				0x0000001c
+#define NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL		0x00000001
+#define NV84_SUBCHAN_SEMAPHORE_TRIGGER_WRITE_LONG		0x00000002
+#define NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_GEQUAL		0x00000004
 
-#define NV84_SUBCHAN_QUERY_INTR					0x00000020
+#define NV84_SUBCHAN_NOTIFY_INTR				0x00000020
 
 #define NV84_SUBCHAN_WRCACHE_FLUSH				0x00000024
 
diff --git a/src/gallium/drivers/nvc0/nvc0_3d.xml.h b/src/gallium/drivers/nvc0/nvc0_3d.xml.h
index a8d9108..c32fa3a 100644
--- a/src/gallium/drivers/nvc0/nvc0_3d.xml.h
+++ b/src/gallium/drivers/nvc0/nvc0_3d.xml.h
@@ -130,11 +130,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define NVC0_3D_TFB_BUFFER_SIZE(i0)			       (0x0000038c + 0x20*(i0))
 
-#define NVC0_3D_TFB_PRIMITIVE_ID(i0)			       (0x00000390 + 0x20*(i0))
+#define NVC0_3D_TFB_BUFFER_OFFSET(i0)			       (0x00000390 + 0x20*(i0))
 
-#define NVC0_3D_TFB_UNK07X0(i0)				       (0x00000700 + 0x10*(i0))
-#define NVC0_3D_TFB_UNK07X0__ESIZE				0x00000010
-#define NVC0_3D_TFB_UNK07X0__LEN				0x00000004
+#define NVC0_3D_TFB_STREAM(i0)				       (0x00000700 + 0x10*(i0))
+#define NVC0_3D_TFB_STREAM__ESIZE				0x00000010
+#define NVC0_3D_TFB_STREAM__LEN					0x00000004
 
 #define NVC0_3D_TFB_VARYING_COUNT(i0)			       (0x00000704 + 0x10*(i0))
 #define NVC0_3D_TFB_VARYING_COUNT__ESIZE			0x00000010
diff --git a/src/gallium/drivers/nvc0/nvc0_context.c b/src/gallium/drivers/nvc0/nvc0_context.c
index 2927a09..1cc1756 100644
--- a/src/gallium/drivers/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nvc0/nvc0_context.c
@@ -77,7 +77,7 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0)
    }
 
    for (i = 0; i < nvc0->num_tfbbufs; ++i)
-      pipe_resource_reference(&nvc0->tfbbuf[i], NULL);
+      pipe_so_target_reference(&nvc0->tfbbuf[i], NULL);
 }
 
 static void
diff --git a/src/gallium/drivers/nvc0/nvc0_context.h b/src/gallium/drivers/nvc0/nvc0_context.h
index 4435c1b..af95d1a 100644
--- a/src/gallium/drivers/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nvc0/nvc0_context.h
@@ -49,14 +49,14 @@
 #define NVC0_NEW_CONSTBUF     (1 << 18)
 #define NVC0_NEW_TEXTURES     (1 << 19)
 #define NVC0_NEW_SAMPLERS     (1 << 20)
-#define NVC0_NEW_TFB          (1 << 21)
-#define NVC0_NEW_TFB_BUFFERS  (1 << 22)
+#define NVC0_NEW_TFB_TARGETS  (1 << 21)
 
 #define NVC0_BUFCTX_CONSTANT 0
 #define NVC0_BUFCTX_FRAME    1
 #define NVC0_BUFCTX_VERTEX   2
 #define NVC0_BUFCTX_TEXTURES 3
-#define NVC0_BUFCTX_COUNT    4
+#define NVC0_BUFCTX_TFB      4
+#define NVC0_BUFCTX_COUNT    5
 
 struct nvc0_context {
    struct nouveau_context base;
@@ -75,6 +75,7 @@ struct nvc0_context {
       boolean prim_restart;
       boolean early_z;
       uint16_t scissor;
+      boolean rasterizer_discard;
       uint8_t num_vtxbufs;
       uint8_t num_vtxelts;
       uint8_t num_textures[5];
@@ -84,6 +85,7 @@ struct nvc0_context {
       uint8_t clip_enable;
       uint32_t clip_mode;
       uint32_t uniform_buffer_bound[5];
+      struct nvc0_transform_feedback_state *tfb;
    } state;
 
    struct nvc0_blend_stateobj *blend;
@@ -125,10 +127,9 @@ struct nvc0_context {
 
    boolean vbo_push_hint;
 
-   struct nvc0_transform_feedback_state *tfb;
-   struct pipe_resource *tfbbuf[4];
+   uint8_t tfbbuf_dirty;
+   struct pipe_stream_output_target *tfbbuf[4];
    unsigned num_tfbbufs;
-   unsigned tfb_offset[4];
 
    struct draw_context *draw;
 };
@@ -170,10 +171,14 @@ void nvc0_program_library_upload(struct nvc0_context *);
 
 /* nvc0_query.c */
 void nvc0_init_query_functions(struct nvc0_context *);
-void nvc0_query_pushbuf_submit(struct nvc0_context *nvc0,
-                               struct pipe_query *pq, unsigned result_offset);
-
-#define NVC0_QUERY_TFB_BUFFER_OFFSETS (PIPE_QUERY_TYPES + 0)
+void nvc0_query_pushbuf_submit(struct nouveau_channel *,
+                               struct pipe_query *, unsigned result_offset);
+void nvc0_query_fifo_wait(struct nouveau_channel *, struct pipe_query *);
+void nvc0_so_target_save_offset(struct pipe_context *,
+                                struct pipe_stream_output_target *, unsigned i,
+                                boolean *serialize);
+
+#define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
 
 /* nvc0_shader_state.c */
 void nvc0_vertprog_validate(struct nvc0_context *);
diff --git a/src/gallium/drivers/nvc0/nvc0_program.c b/src/gallium/drivers/nvc0/nvc0_program.c
index f3185b4..9e3ca4b 100644
--- a/src/gallium/drivers/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nvc0/nvc0_program.c
@@ -480,6 +480,39 @@ nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info)
    return 0;
 }
 
+static struct nvc0_transform_feedback_state * /* build per-buffer TFB varying lists from pso; NULL if MALLOC fails */
+nvc0_program_create_tfb_state(const struct nv50_ir_prog_info *info,
+                              const struct pipe_stream_output_info *pso)
+{
+   struct nvc0_transform_feedback_state *tfb;
+   int n = 0; /* next free entry in tfb->varying_index */
+   int i, c, b;
+
+   tfb = MALLOC(sizeof(*tfb) + pso->num_outputs * 4 * sizeof(uint8_t)); /* room for 4 components per output */
+   if (!tfb)
+      return NULL;
+
+   for (b = 0; b < 4; ++b) { /* one pass per buffer keeps each buffer's entries contiguous */
+      tfb->varying_count[b] = 0;
+
+      for (i = 0; i < pso->num_outputs; ++i) {
+         if (pso->output_buffer[i] != b)
+            continue;
+         for (c = 0; c < 4; ++c) {
+            if (!(pso->register_mask[i] & (1 << c))) /* component not selected by the mask */
+               continue;
+            tfb->varying_count[b]++;
+            tfb->varying_index[n++] = info->out[pso->register_index[i]].slot[c];
+         }
+      }
+      tfb->stride[b] = tfb->varying_count[b] * 4; /* default stride: 4 per captured component */
+   }
+   if (pso->stride) /* a non-zero pso->stride overrides the default for buffer 0 only */
+      tfb->stride[0] = pso->stride;
+
+   return tfb;
+}
+
 #ifdef DEBUG
 static void
 nvc0_program_dump(struct nvc0_program *prog)
@@ -577,6 +610,10 @@ nvc0_program_translate(struct nvc0_program *prog)
    if (info->io.globalAccess)
       prog->hdr[0] |= 1 << 16;
 
+   if (prog->pipe.stream_output.num_outputs)
+      prog->tfb = nvc0_program_create_tfb_state(info,
+                                                &prog->pipe.stream_output);
+
 out:
    FREE(info);
    return !ret;
@@ -675,6 +712,11 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
       FREE(prog->immd_data);
    if (prog->relocs)
       FREE(prog->relocs);
+   if (prog->tfb) {
+      if (nvc0->state.tfb == prog->tfb)
+         nvc0->state.tfb = NULL;
+      FREE(prog->tfb);
+   }
 
    memset(prog->hdr, 0, sizeof(prog->hdr));
 
diff --git a/src/gallium/drivers/nvc0/nvc0_program.h b/src/gallium/drivers/nvc0/nvc0_program.h
index b107850..10eb9f7 100644
--- a/src/gallium/drivers/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nvc0/nvc0_program.h
@@ -6,6 +6,14 @@
 
 #define NVC0_CAP_MAX_PROGRAM_TEMPS 128
 
+
+struct nvc0_transform_feedback_state { /* filled by nvc0_program_create_tfb_state */
+   uint32_t stride[4]; /* per buffer; emitted as the third word of TFB_STREAM(b) */
+   uint8_t varying_count[4]; /* per buffer: number of varying_index entries it owns */
+   uint8_t varying_index[0]; /* buffer 0's entries first, then 1, ...; storage is allocated past the struct */
+};
+
+
 #define NVC0_SHADER_HEADER_SIZE (20 * 4)
 
 struct nvc0_program {
@@ -31,7 +39,6 @@ struct nvc0_program {
       uint8_t clip_enable; /* only applies if num_ucps == 0 */
       uint8_t edgeflag;
       uint8_t num_ucps;
-      uint8_t out_pos[PIPE_MAX_SHADER_OUTPUTS];
    } vp;
    struct {
       uint8_t early_z;
@@ -44,6 +51,8 @@ struct nvc0_program {
 
    void *relocs;
 
+   struct nvc0_transform_feedback_state *tfb;
+
    struct nouveau_resource *res;
 };
 
diff --git a/src/gallium/drivers/nvc0/nvc0_push.c b/src/gallium/drivers/nvc0/nvc0_push.c
index 2e9f4c1..238671d 100644
--- a/src/gallium/drivers/nvc0/nvc0_push.c
+++ b/src/gallium/drivers/nvc0/nvc0_push.c
@@ -273,7 +273,8 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
 {
    struct push_context ctx;
    unsigned i, index_size;
-   unsigned inst = info->instance_count;
+   unsigned inst_count = info->instance_count;
+   unsigned vert_count = info->count;
    boolean apply_bias = info->indexed && info->index_bias;
 
    init_push_context(nvc0, &ctx);
@@ -312,26 +313,34 @@ nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
       index_size = 0;
       ctx.primitive_restart = FALSE;
       ctx.restart_index = 0;
+
+      if (info->count_from_stream_output) {
+         struct pipe_context *pipe = &nvc0->base.pipe;
+         struct nvc0_so_target *targ;
+         targ = nvc0_so_target(info->count_from_stream_output);
+         pipe->get_query_result(pipe, targ->pq, TRUE, &vert_count);
+         vert_count /= targ->stride;
+      }
    }
 
    ctx.instance_id = info->start_instance;
    ctx.prim = nvc0_prim_gl(info->mode);
 
-   while (inst--) {
+   while (inst_count--) {
       BEGIN_RING(ctx.chan, RING_3D(VERTEX_BEGIN_GL), 1);
       OUT_RING  (ctx.chan, ctx.prim);
       switch (index_size) {
       case 0:
-         emit_vertices_seq(&ctx, info->start, info->count);
+         emit_vertices_seq(&ctx, info->start, vert_count);
          break;
       case 1:
-         emit_vertices_i08(&ctx, info->start, info->count);
+         emit_vertices_i08(&ctx, info->start, vert_count);
          break;
       case 2:
-         emit_vertices_i16(&ctx, info->start, info->count);
+         emit_vertices_i16(&ctx, info->start, vert_count);
          break;
       case 4:
-         emit_vertices_i32(&ctx, info->start, info->count);
+         emit_vertices_i32(&ctx, info->start, vert_count);
          break;
       default:
          assert(0);
diff --git a/src/gallium/drivers/nvc0/nvc0_query.c b/src/gallium/drivers/nvc0/nvc0_query.c
index be363a2..7244d83 100644
--- a/src/gallium/drivers/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nvc0/nvc0_query.c
@@ -27,7 +27,8 @@
 
 struct nvc0_query {
    uint32_t *data;
-   uint32_t type;
+   uint16_t type;
+   uint16_t index;
    uint32_t sequence;
    struct nouveau_bo *bo;
    uint32_t base;
@@ -103,7 +104,6 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type)
    switch (type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
    case PIPE_QUERY_OCCLUSION_PREDICATE:
-   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
       q->rotate = 32;
       space = NVC0_QUERY_ALLOC_SPACE;
       break;
@@ -112,6 +112,7 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type)
       space = 512;
       break;
    case PIPE_QUERY_SO_STATISTICS:
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
       q->is64bit = TRUE;
       space = 64;
       break;
@@ -123,7 +124,7 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type)
    case PIPE_QUERY_PRIMITIVES_EMITTED:
       space = 32;
       break;
-   case NVC0_QUERY_TFB_BUFFER_OFFSETS:
+   case NVC0_QUERY_TFB_BUFFER_OFFSET:
       space = 16;
       break;
    default:
@@ -141,7 +142,9 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type)
       /* we advance before query_begin ! */
       q->offset -= q->rotate;
       q->data -= q->rotate / sizeof(*q->data);
-   }
+   } else
+   if (!q->is64bit)
+      q->data[0] = 0; /* initialize sequence */
 
    return (struct pipe_query *)q;
 }
@@ -176,8 +179,6 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
    struct nouveau_channel *chan = nvc0->screen->base.channel;
    struct nvc0_query *q = nvc0_query(pq);
 
-   const int index = 0; /* vertex stream */
-
    /* For occlusion queries we have to change the storage, because a previous
     * query might set the initial render conition to FALSE even *after* we re-
     * initialized it to TRUE.
@@ -188,12 +189,12 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
       /* XXX: can we do this with the GPU, and sync with respect to a previous
        *  query ?
        */
+      q->data[0] = q->sequence; /* initialize sequence */
       q->data[1] = 1; /* initial render condition = TRUE */
       q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
       q->data[5] = 0;
    }
-   if (!q->is64bit)
-      q->data[0] = q->sequence++; /* the previously used one */
+   q->sequence++;
 
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
@@ -208,14 +209,17 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
       }
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
-      nvc0_query_get(chan, q, 0x10, 0x06805002 | (index << 5));
+      nvc0_query_get(chan, q, 0x10, 0x06805002 | (q->index << 5));
       break;
    case PIPE_QUERY_PRIMITIVES_EMITTED:
-      nvc0_query_get(chan, q, 0x10, 0x05805002 | (index << 5));
+      nvc0_query_get(chan, q, 0x10, 0x05805002 | (q->index << 5));
       break;
    case PIPE_QUERY_SO_STATISTICS:
-      nvc0_query_get(chan, q, 0x20, 0x05805002 | (index << 5));
-      nvc0_query_get(chan, q, 0x30, 0x06805002 | (index << 5));
+      nvc0_query_get(chan, q, 0x20, 0x05805002 | (q->index << 5));
+      nvc0_query_get(chan, q, 0x30, 0x06805002 | (q->index << 5));
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      nvc0_query_get(chan, q, 0x10, 0x03005002 | (q->index << 5));
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
    case PIPE_QUERY_TIME_ELAPSED:
@@ -247,15 +251,11 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
    struct nouveau_channel *chan = nvc0->screen->base.channel;
    struct nvc0_query *q = nvc0_query(pq);
 
-   const int index = 0; /* for multiple vertex streams */
-
    if (!q->active) {
       /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */
       if (q->rotate)
          nvc0_query_rotate(nvc0, q);
-      else
-      if (!q->is64bit)
-         q->data[0] = q->sequence++;
+      q->sequence++;
    }
    q->ready = FALSE;
    q->active = FALSE;
@@ -268,17 +268,20 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
          IMMED_RING(chan, RING_3D(SAMPLECNT_ENABLE), 0);
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED:
-      nvc0_query_get(chan, q, 0, 0x06805002 | (index << 5));
+      nvc0_query_get(chan, q, 0, 0x06805002 | (q->index << 5));
       break;
    case PIPE_QUERY_PRIMITIVES_EMITTED:
-      nvc0_query_get(chan, q, 0, 0x05805002 | (index << 5));
+      nvc0_query_get(chan, q, 0, 0x05805002 | (q->index << 5));
       break;
    case PIPE_QUERY_SO_STATISTICS:
-      nvc0_query_get(chan, q, 0x00, 0x05805002 | (index << 5));
-      nvc0_query_get(chan, q, 0x10, 0x06805002 | (index << 5));
+      nvc0_query_get(chan, q, 0x00, 0x05805002 | (q->index << 5));
+      nvc0_query_get(chan, q, 0x10, 0x06805002 | (q->index << 5));
       break;
    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      nvc0_query_get(chan, q, 0x00, 0x02005002 | (index << 5));
+      /* TODO: How do we sum over all streams for render condition ? */
+      /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */
+      nvc0_query_get(chan, q, 0x00, 0x03005002 | (q->index << 5));
+      nvc0_query_get(chan, q, 0x20, 0x00005002);
       break;
    case PIPE_QUERY_TIMESTAMP:
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
@@ -300,11 +303,9 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       nvc0_query_get(chan, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
       nvc0_query_get(chan, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
       break;
-   case NVC0_QUERY_TFB_BUFFER_OFFSETS:
-      nvc0_query_get(chan, q, 0x00, 0x1d005002); /* TFB, BUFFER_OFFSET */
-      nvc0_query_get(chan, q, 0x04, 0x1d005022);
-      nvc0_query_get(chan, q, 0x08, 0x1d005042);
-      nvc0_query_get(chan, q, 0x0c, 0x1d005062);
+   case NVC0_QUERY_TFB_BUFFER_OFFSET:
+      /* indexed by TFB buffer instead of by vertex stream */
+      nvc0_query_get(chan, q, 0x00, 0x0d005002 | (q->index << 5));
       break;
    default:
       assert(0);
@@ -315,7 +316,14 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
 static INLINE boolean
 nvc0_query_ready(struct nvc0_query *q)
 {
-   return q->ready || (!q->is64bit && (q->data[0] == q->sequence));
+   if (q->is64bit) { /* no data[0] sequence compare for 64-bit queries */
+      if (nouveau_bo_map(q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_NOWAIT)) /* non-zero from the NOWAIT map is treated as "not ready" */
+         return FALSE;
+      nouveau_bo_unmap(q->bo); /* the map was only a readiness probe */
+      return TRUE;
+   } else {
+      return q->data[0] == q->sequence; /* ready once data[0] equals q->sequence */
+   }
 }
 
 static INLINE boolean
@@ -355,14 +363,12 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 
    switch (q->type) {
    case PIPE_QUERY_GPU_FINISHED:
-      res32[0] = 0;
       res8[0] = TRUE;
       break;
    case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
       res64[0] = q->data[1] - q->data[5];
       break;
    case PIPE_QUERY_OCCLUSION_PREDICATE:
-      res32[0] = 0;
       res8[0] = q->data[1] != q->data[5];
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
@@ -374,15 +380,13 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       res64[1] = data64[2] - data64[6];
       break;
    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      res32[0] = 0;
-      res8[0] = !q->data[1];
+      res8[0] = data64[0] != data64[2];
       break;
    case PIPE_QUERY_TIMESTAMP:
       res64[0] = data64[1];
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT: /* u32 sequence, u32 0, u64 time */
       res64[0] = 1000000000;
-      res32[2] = 0;
       res8[8] = (data64[1] == data64[3]) ? FALSE : TRUE;
       break;
    case PIPE_QUERY_TIME_ELAPSED:
@@ -392,11 +396,8 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       for (i = 0; i < 10; ++i)
          res64[i] = data64[i * 2] - data64[24 + i * 2];
       break;
-   case NVC0_QUERY_TFB_BUFFER_OFFSETS:
-      res32[0] = q->data[0];
-      res32[1] = q->data[1];
-      res32[2] = q->data[2];
-      res32[3] = q->data[3];
+   case NVC0_QUERY_TFB_BUFFER_OFFSET:
+      res32[0] = q->data[1];
       break;
    default:
       return FALSE;
@@ -405,6 +406,23 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    return TRUE;
 }
 
+void /* emit a semaphore ACQUIRE_EQUAL on the query's buffer, comparing against q->sequence */
+nvc0_query_fifo_wait(struct nouveau_channel *chan, struct pipe_query *pq)
+{
+   struct nvc0_query *q = nvc0_query(pq);
+   unsigned offset = q->offset;
+
+   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20; /* nvc0_query_end issues the extra sync query at 0x20 */
+
+   MARK_RING (chan, 5, 2);
+   BEGIN_RING(chan, RING_3D_(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4); /* 4 words: ADDRESS_HIGH, ADDRESS_LOW, SEQUENCE, TRIGGER */
+   OUT_RELOCh(chan, q->bo, offset, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   OUT_RELOCl(chan, q->bo, offset, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   OUT_RING  (chan, q->sequence); /* value the semaphore is compared with */
+   OUT_RING  (chan, (1 << 12) |
+              NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
+}
+
 static void
 nvc0_render_condition(struct pipe_context *pipe,
                       struct pipe_query *pq, uint mode)
@@ -427,9 +445,8 @@ nvc0_render_condition(struct pipe_context *pipe,
    /* NOTE: comparison of 2 queries only works if both have completed */
    switch (q->type) {
    case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-      /* query writes 1 if there was no overflow */
-      cond = negated ? NVC0_3D_COND_MODE_RES_NON_ZERO :
-                       NVC0_3D_COND_MODE_EQUAL;
+      cond = negated ? NVC0_3D_COND_MODE_EQUAL :
+                       NVC0_3D_COND_MODE_NOT_EQUAL;
       wait = TRUE;
       break;
    case PIPE_QUERY_OCCLUSION_COUNTER:
@@ -450,14 +467,8 @@ nvc0_render_condition(struct pipe_context *pipe,
       break;
    }
 
-   if (wait) {
-      MARK_RING (chan, 5, 2);
-      BEGIN_RING(chan, RING_3D_(NV84_SUBCHAN_QUERY_ADDRESS_HIGH), 4);
-      OUT_RELOCh(chan, q->bo, q->offset, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-      OUT_RELOCl(chan, q->bo, q->offset, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-      OUT_RING  (chan, q->sequence);
-      OUT_RING  (chan, 0x00001001);
-   }
+   if (wait)
+      nvc0_query_fifo_wait(chan, pq);
 
    MARK_RING (chan, 4, 2);
    BEGIN_RING(chan, RING_3D(COND_ADDRESS_HIGH), 3);
@@ -467,13 +478,33 @@ nvc0_render_condition(struct pipe_context *pipe,
 }
 
 void
-nvc0_query_pushbuf_submit(struct nvc0_context *nvc0,
+nvc0_query_pushbuf_submit(struct nouveau_channel *chan, /* channel is supplied by the caller */
                           struct pipe_query *pq, unsigned result_offset)
 {
    struct nvc0_query *q = nvc0_query(pq);
 
-   nouveau_pushbuf_submit(nvc0->screen->base.channel,
-                          q->bo, q->offset + result_offset, 4);
+#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8)) /* bit 23, OR'ed into the length argument below */
+
+   nouveau_pushbuf_submit(chan, q->bo, q->offset + result_offset, 4 |
+                          NVC0_IB_ENTRY_1_NO_PREFETCH); /* length 4: presumably one 32-bit word read from the query buffer - TODO confirm */
+}
+
+void /* end ptarg's offset query for buffer slot `index`; emits SERIALIZE first if *serialize is set */
+nvc0_so_target_save_offset(struct pipe_context *pipe,
+                           struct pipe_stream_output_target *ptarg,
+                           unsigned index, boolean *serialize)
+{
+   struct nvc0_so_target *targ = nvc0_so_target(ptarg);
+
+   if (*serialize) {
+      struct nouveau_channel *chan = nvc0_context(pipe)->screen->base.channel;
+      *serialize = FALSE; /* later calls sharing this flag skip the SERIALIZE */
+      IMMED_RING(chan, RING_3D(SERIALIZE), 0);
+   }
+
+   nvc0_query(targ->pq)->index = index; /* NVC0_QUERY_TFB_BUFFER_OFFSET is indexed by TFB buffer */
+
+   nvc0_query_end(pipe, targ->pq); /* emits the query that records the buffer offset */
 }
 
 void
diff --git a/src/gallium/drivers/nvc0/nvc0_screen.c b/src/gallium/drivers/nvc0/nvc0_screen.c
index 2445cd6..eef3635 100644
--- a/src/gallium/drivers/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nvc0/nvc0_screen.c
@@ -105,9 +105,15 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
       return 1;
    case PIPE_CAP_TIMER_QUERY:
    case PIPE_CAP_OCCLUSION_QUERY:
+   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
       return 1;
    case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
-      return 0;
+      return 4;
+   case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_ATTRIBS:
+      return 4;
+   case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+   case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+      return 128;
    case PIPE_CAP_BLEND_EQUATION_SEPARATE:
    case PIPE_CAP_INDEP_BLEND_ENABLE:
    case PIPE_CAP_INDEP_BLEND_FUNC:
diff --git a/src/gallium/drivers/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nvc0/nvc0_shader_state.c
index f4a12fb..446bd94 100644
--- a/src/gallium/drivers/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nvc0/nvc0_shader_state.c
@@ -90,7 +90,9 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
          return FALSE;
    }
 
-   return nvc0_program_upload_code(nvc0, prog);
+   if (likely(prog->code_size))
+      return nvc0_program_upload_code(nvc0, prog);
+   return TRUE; /* stream output info only */
 }
 
 void
@@ -212,14 +214,15 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0)
    struct nouveau_channel *chan = nvc0->screen->base.channel;
    struct nvc0_program *gp = nvc0->gmtyprog;
 
-   if (!gp) {
+   if (gp)
+      nvc0_program_validate(nvc0, gp);
+   /* we allow GPs with no code for specifying stream output state only */
+   if (!gp || !gp->code_size) {
       BEGIN_RING(chan, RING_3D(GP_SELECT), 1);
       OUT_RING  (chan, 0x40);
       IMMED_RING(chan, RING_3D(LAYER), 0);
       return;
    }
-   if (!nvc0_program_validate(nvc0, gp))
-         return;
    nvc0_program_update_context_state(nvc0, gp, 3);
 
    BEGIN_RING(chan, RING_3D(GP_SELECT), 1);
@@ -234,57 +237,76 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0)
    nvc0_program_validate_clip(nvc0, gp);
 }
 
-/* It's *is* kind of shader related. We need to inspect the program
- * to get the output locations right.
- */
 void
 nvc0_tfb_validate(struct nvc0_context *nvc0)
 {
    struct nouveau_channel *chan = nvc0->screen->base.channel;
-   struct nvc0_program *vp;
-   struct nvc0_transform_feedback_state *tfb = nvc0->tfb;
-   int b;
+   struct nvc0_transform_feedback_state *tfb;
+   unsigned b, n, i;
 
-   BEGIN_RING(chan, RING_3D(TFB_ENABLE), 1);
-   if (!tfb) {
-      OUT_RING(chan, 0);
-      return;
+   if (nvc0->gmtyprog) tfb = nvc0->gmtyprog->tfb;
+   else
+   if (nvc0->tevlprog) tfb = nvc0->tevlprog->tfb;
+   else
+      tfb = nvc0->vertprog->tfb;
+
+   IMMED_RING(chan, RING_3D(TFB_ENABLE), (tfb && nvc0->num_tfbbufs) ? 1 : 0);
+
+   if (tfb && tfb != nvc0->state.tfb) {
+      uint8_t var[128];
+
+      for (n = 0, b = 0; b < 4; n += tfb->varying_count[b++]) {
+         if (tfb->varying_count[b]) {
+            BEGIN_RING(chan, RING_3D(TFB_STREAM(b)), 3);
+            OUT_RING  (chan, 0);
+            OUT_RING  (chan, tfb->varying_count[b]);
+            OUT_RING  (chan, tfb->stride[b]);
+
+            for (i = 0; i < tfb->varying_count[b]; ++i)
+               var[i] = tfb->varying_index[n + i];
+            for (; i & 3; ++i)
+               var[i] = 0; /* zero rest of method word bits */
+
+            BEGIN_RING(chan, RING_3D(TFB_VARYING_LOCS(b, 0)), i / 4);
+            OUT_RINGp (chan, var, i / 4);
+
+            if (nvc0->tfbbuf[b])
+               nvc0_so_target(nvc0->tfbbuf[b])->stride = tfb->stride[b];
+         } else {
+            IMMED_RING(chan, RING_3D(TFB_VARYING_COUNT(b)), 0);
+         }
+      }
    }
-   OUT_RING(chan, 1);
+   nvc0->state.tfb = tfb;
 
-   vp = nvc0->vertprog ? nvc0->vertprog : nvc0->gmtyprog;
+   if (!(nvc0->dirty & NVC0_NEW_TFB_TARGETS))
+      return;
+   nvc0_bufctx_reset(nvc0, NVC0_BUFCTX_TFB);
 
    for (b = 0; b < nvc0->num_tfbbufs; ++b) {
-      uint8_t idx, var[128];
-      int i, n;
-      struct nv04_resource *buf = nv04_resource(nvc0->tfbbuf[b]);
+      struct nvc0_so_target *targ = nvc0_so_target(nvc0->tfbbuf[b]);
+      struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
 
-      BEGIN_RING(chan, RING_3D(TFB_BUFFER_ENABLE(b)), 5);
-      OUT_RING  (chan, 1);
-      OUT_RESRCh(chan, buf, nvc0->tfb_offset[b], NOUVEAU_BO_WR);
-      OUT_RESRCl(chan, buf, nvc0->tfb_offset[b], NOUVEAU_BO_WR);
-      OUT_RING  (chan, buf->base.width0 - nvc0->tfb_offset[b]);
-      OUT_RING  (chan, 0); /* TFB_PRIMITIVE_ID <- offset ? */
+      if (tfb)
+         targ->stride = tfb->stride[b];
 
-      if (!(nvc0->dirty & NVC0_NEW_TFB))
+      if (!(nvc0->tfbbuf_dirty & (1 << b)))
          continue;
 
-      BEGIN_RING(chan, RING_3D(TFB_UNK07X0(b)), 3);
-      OUT_RING  (chan, 0);
-      OUT_RING  (chan, tfb->varying_count[b]);
-      OUT_RING  (chan, tfb->stride[b]);
-
-      n = b ? tfb->varying_count[b - 1] : 0;
-      i = 0;
-      for (; i < tfb->varying_count[b]; ++i) {
-         idx = tfb->varying_index[n + i];
-         var[i] = vp->vp.out_pos[idx >> 2] + (idx & 3);
+      if (!targ->clean)
+         nvc0_query_fifo_wait(chan, targ->pq);
+      BEGIN_RING(chan, RING_3D(TFB_BUFFER_ENABLE(b)), 5);
+      OUT_RING  (chan, 1);
+      OUT_RESRCh(chan, buf, targ->pipe.buffer_offset, NOUVEAU_BO_WR);
+      OUT_RESRCl(chan, buf, targ->pipe.buffer_offset, NOUVEAU_BO_WR);
+      OUT_RING  (chan, targ->pipe.buffer_size);
+      if (!targ->clean) {
+         nvc0_query_pushbuf_submit(chan, targ->pq, 0x4);
+      } else {
+         OUT_RING(chan, 0); /* TFB_BUFFER_OFFSET */
+         targ->clean = FALSE;
       }
-      for (; i & 3; ++i)
-         var[i] = 0;
-
-      BEGIN_RING(chan, RING_3D(TFB_VARYING_LOCS(b, 0)), i / 4);
-      OUT_RINGp (chan, var, i / 4);
+      nvc0_bufctx_add_resident(nvc0, NVC0_BUFCTX_TFB, buf, NOUVEAU_BO_WR);
    }
    for (; b < 4; ++b)
       IMMED_RING(chan, RING_3D(TFB_BUFFER_ENABLE(b)), 0);
diff --git a/src/gallium/drivers/nvc0/nvc0_state.c b/src/gallium/drivers/nvc0/nvc0_state.c
index d15a956..1e334a0 100644
--- a/src/gallium/drivers/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nvc0/nvc0_state.c
@@ -520,7 +520,12 @@ nvc0_sp_state_create(struct pipe_context *pipe,
       return NULL;
 
    prog->type = type;
-   prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+
+   if (cso->tokens)
+      prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+
+   if (cso->stream_output.num_outputs)
+      prog->pipe.stream_output = cso->stream_output;
 
    return (void *)prog;
 }
@@ -747,72 +752,89 @@ nvc0_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
     nvc0->dirty |= NVC0_NEW_VERTEX;
 }
 
-static void *
-nvc0_tfb_state_create(struct pipe_context *pipe,
-                      const struct pipe_stream_output_info *pso)
+/* Create a stream output target that records offset and size within res.
+ * The target holds its own reference on res and a TFB_BUFFER_OFFSET query.
+ * Returns NULL if the target or its query cannot be allocated.
+ */
+static struct pipe_stream_output_target *
+nvc0_so_target_create(struct pipe_context *pipe,
+                      struct pipe_resource *res,
+                      unsigned offset, unsigned size)
 {
-   struct nvc0_transform_feedback_state *so;
-   int n = 0;
-   int i, c, b;
-
-   so = MALLOC(sizeof(*so) + pso->num_outputs * 4 * sizeof(uint8_t));
-   if (!so)
+   struct nvc0_so_target *targ = MALLOC_STRUCT(nvc0_so_target);
+   if (!targ)
       return NULL;
 
-   for (b = 0; b < 4; ++b) {
-      for (i = 0; i < pso->num_outputs; ++i) {
-         if (pso->output_buffer[i] != b)
-            continue;
-         for (c = 0; c < 4; ++c) {
-            if (!(pso->register_mask[i] & (1 << c)))
-               continue;
-            so->varying_count[b]++;
-            so->varying_index[n++] = (pso->register_index[i] << 2) | c;
-         }
-      }
-      so->stride[b] = so->varying_count[b] * 4;
+   targ->pq = pipe->create_query(pipe, NVC0_QUERY_TFB_BUFFER_OFFSET);
+   if (!targ->pq) {
+      FREE(targ);
+      return NULL;
    }
-   if (pso->stride)
-      so->stride[0] = pso->stride;
+   targ->clean = TRUE;
 
-   return so;
-}
+   targ->pipe.buffer_size = size;
+   targ->pipe.buffer_offset = offset;
+   targ->pipe.context = pipe;
+   targ->pipe.buffer = NULL;
+   pipe_resource_reference(&targ->pipe.buffer, res);
+   pipe_reference_init(&targ->pipe.reference, 1);
 
-static void
-nvc0_tfb_state_delete(struct pipe_context *pipe, void *hwcso)
-{
-   FREE(hwcso);
+   return &targ->pipe;
 }
 
+/* Destroy a stream output target, releasing its query and the buffer
+ * reference taken in nvc0_so_target_create.
+ */
 static void
-nvc0_tfb_state_bind(struct pipe_context *pipe, void *hwcso)
+nvc0_so_target_destroy(struct pipe_context *pipe,
+                       struct pipe_stream_output_target *ptarg)
 {
-   nvc0_context(pipe)->tfb = hwcso;
-   nvc0_context(pipe)->dirty |= NVC0_NEW_TFB;
+   struct nvc0_so_target *targ = nvc0_so_target(ptarg);
+   pipe->destroy_query(pipe, targ->pq);
+   pipe_resource_reference(&targ->pipe.buffer, NULL);
+   FREE(targ);
 }
 
+/* Bind num_targets stream output targets. A slot is skipped when its target
+ * is unchanged and its append_mask bit is set; a target that gets replaced
+ * or unbound has its offset saved first via nvc0_so_target_save_offset.
+ */
 static void
-nvc0_set_transform_feedback_buffers(struct pipe_context *pipe,
-                                    struct pipe_resource **buffers,
-                                    int *offsets,
-                                    int num_buffers)
+nvc0_set_transform_feedback_targets(struct pipe_context *pipe,
+                                    unsigned num_targets,
+                                    struct pipe_stream_output_target **targets,
+                                    unsigned append_mask)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
-   int i;
+   unsigned i;
+   boolean serialize = TRUE;
 
-   assert(num_buffers >= 0 && num_buffers <= 4); /* why signed ? */
+   assert(num_targets <= 4);
 
-   for (i = 0; i < num_buffers; ++i) {
-       assert(offsets[i] >= 0);
-       nvc0->tfb_offset[i] = offsets[i];
-       pipe_resource_reference(&nvc0->tfbbuf[i], buffers[i]);
-   }
-   for (; i < nvc0->num_tfbbufs; ++i)
-      pipe_resource_reference(&nvc0->tfbbuf[i], NULL);
+   for (i = 0; i < num_targets; ++i) {
+      if (nvc0->tfbbuf[i] == targets[i] && (append_mask & (1 << i)))
+         continue;
+      nvc0->tfbbuf_dirty |= 1 << i;
 
-   nvc0->num_tfbbufs = num_buffers;
+      if (nvc0->tfbbuf[i] && nvc0->tfbbuf[i] != targets[i])
+         nvc0_so_target_save_offset(pipe, nvc0->tfbbuf[i], i, &serialize);
+
+      if (targets[i] && !(append_mask & (1 << i)))
+         nvc0_so_target(targets[i])->clean = TRUE;
+
+      pipe_so_target_reference(&nvc0->tfbbuf[i], targets[i]);
+   }
+   for (; i < nvc0->num_tfbbufs; ++i) {
+      if (!nvc0->tfbbuf[i])
+         continue; /* slot was bound to NULL: nothing to save or release */
+      nvc0->tfbbuf_dirty |= 1 << i;
+      nvc0_so_target_save_offset(pipe, nvc0->tfbbuf[i], i, &serialize);
+      pipe_so_target_reference(&nvc0->tfbbuf[i], NULL);
+   }
+   nvc0->num_tfbbufs = num_targets;
 
-   nvc0->dirty |= NVC0_NEW_TFB_BUFFERS;
+   if (nvc0->tfbbuf_dirty)
+      nvc0->dirty |= NVC0_NEW_TFB_TARGETS;
 }
 
 void
@@ -871,17 +879,9 @@ nvc0_init_state_functions(struct nvc0_context *nvc0)
    pipe->set_vertex_buffers = nvc0_set_vertex_buffers;
    pipe->set_index_buffer = nvc0_set_index_buffer;
 
-#if 0
-   pipe->create_stream_output_state = nvc0_tfb_state_create;
-   pipe->delete_stream_output_state = nvc0_tfb_state_delete;
-   pipe->bind_stream_output_state = nvc0_tfb_state_bind;
-   pipe->set_stream_output_buffers = nvc0_set_transform_feedback_buffers;
-#else
-   (void)nvc0_tfb_state_create;
-   (void)nvc0_tfb_state_delete;
-   (void)nvc0_tfb_state_bind;
-   (void)nvc0_set_transform_feedback_buffers;
-#endif
+   pipe->create_stream_output_target = nvc0_so_target_create;
+   pipe->stream_output_target_destroy = nvc0_so_target_destroy;
+   pipe->set_stream_output_targets = nvc0_set_transform_feedback_targets;
 
    pipe->redefine_user_buffer = u_default_redefine_user_buffer;
 }
diff --git a/src/gallium/drivers/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nvc0/nvc0_state_validate.c
index 1ec95b7..0dc822a 100644
--- a/src/gallium/drivers/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nvc0/nvc0_state_validate.c
@@ -428,6 +428,7 @@ nvc0_validate_derived_1(struct nvc0_context *nvc0)
 {
    struct nouveau_channel *chan = nvc0->screen->base.channel;
    boolean early_z;
+   boolean rasterizer_discard;
 
    early_z = nvc0->fragprog->fp.early_z && !nvc0->zsa->pipe.alpha.enabled;
 
@@ -435,6 +436,16 @@ nvc0_validate_derived_1(struct nvc0_context *nvc0)
       nvc0->state.early_z = early_z;
       IMMED_RING(chan, RING_3D(EARLY_FRAGMENT_TESTS), early_z);
    }
+
+   rasterizer_discard = (!nvc0->fragprog || !nvc0->fragprog->hdr[18]) &&
+      !nvc0->zsa->pipe.depth.enabled && !nvc0->zsa->pipe.stencil[0].enabled;
+   rasterizer_discard = rasterizer_discard ||
+      nvc0->rast->pipe.rasterizer_discard;
+
+   if (rasterizer_discard != nvc0->state.rasterizer_discard) {
+      nvc0->state.rasterizer_discard = rasterizer_discard;
+      IMMED_RING(chan, RING_3D(RASTERIZE_ENABLE), !rasterizer_discard);
+   }
 }
 
 static void
@@ -484,13 +495,14 @@ static struct state_validate {
     { nvc0_tevlprog_validate,      NVC0_NEW_TEVLPROG },
     { nvc0_gmtyprog_validate,      NVC0_NEW_GMTYPROG },
     { nvc0_fragprog_validate,      NVC0_NEW_FRAGPROG },
-    { nvc0_validate_derived_1,     NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA },
+    { nvc0_validate_derived_1,     NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA |
+                                   NVC0_NEW_RASTERIZER },
     { nvc0_validate_clip,          NVC0_NEW_CLIP },
     { nvc0_constbufs_validate,     NVC0_NEW_CONSTBUF },
     { nvc0_validate_textures,      NVC0_NEW_TEXTURES },
     { nvc0_validate_samplers,      NVC0_NEW_SAMPLERS },
     { nvc0_vertex_arrays_validate, NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS },
-    { nvc0_tfb_validate,           NVC0_NEW_TFB | NVC0_NEW_TFB_BUFFERS }
+    { nvc0_tfb_validate,           NVC0_NEW_TFB_TARGETS | NVC0_NEW_GMTYPROG }
 };
 #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
 
diff --git a/src/gallium/drivers/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nvc0/nvc0_stateobj.h
index b508000..5c0d0c1 100644
--- a/src/gallium/drivers/nvc0/nvc0_stateobj.h
+++ b/src/gallium/drivers/nvc0/nvc0_stateobj.h
@@ -50,11 +50,17 @@ struct nvc0_vertex_stateobj {
    struct nvc0_vertex_element element[0];
 };
 
-/* will have to lookup index -> location qualifier from nvc0_program */
-struct nvc0_transform_feedback_state {
-   uint32_t stride[4];
-   uint8_t varying_count[4];
-   uint8_t varying_index[0];
+struct nvc0_so_target {
+   struct pipe_stream_output_target pipe; /* base object; keep first, nvc0_so_target() casts a pointer to it */
+   struct pipe_query *pq; /* released with destroy_query when the target is freed */
+   unsigned stride; /* emitted as DRAW_TFB_STRIDE by nvc0_draw_stream_output() */
+   boolean clean; /* set TRUE when the target is bound without its append_mask bit */
 };
 
+static INLINE struct nvc0_so_target * /* driver-side view of a gallium stream output target */
+nvc0_so_target(struct pipe_stream_output_target *ptarg)
+{
+   return (struct nvc0_so_target *)ptarg; /* unchecked downcast; relies on 'pipe' being the first member */
+}
+
 #endif
diff --git a/src/gallium/drivers/nvc0/nvc0_surface.c b/src/gallium/drivers/nvc0/nvc0_surface.c
index f807535..a2e1a85 100644
--- a/src/gallium/drivers/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nvc0/nvc0_surface.c
@@ -744,7 +744,8 @@ nvc0_blitctx_prepare_state(struct nvc0_blitctx *blit)
    IMMED_RING(chan, RING_3D(STENCIL_ENABLE), 0);
    IMMED_RING(chan, RING_3D(ALPHA_TEST_ENABLE), 0);
 
-   /* transform feedback ? */
+   /* disable transform feedback */
+   IMMED_RING(chan, RING_3D(TFB_ENABLE), 0);
 }
 
 static void
@@ -830,7 +831,8 @@ nvc0_blitctx_post_blit(struct nvc0_context *nvc0, struct nvc0_blitctx *blit)
        NVC0_NEW_RASTERIZER | NVC0_NEW_ZSA | NVC0_NEW_BLEND |
        NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS |
        NVC0_NEW_VERTPROG | NVC0_NEW_FRAGPROG |
-       NVC0_NEW_TCTLPROG | NVC0_NEW_TEVLPROG | NVC0_NEW_GMTYPROG);
+       NVC0_NEW_TCTLPROG | NVC0_NEW_TEVLPROG | NVC0_NEW_GMTYPROG |
+       NVC0_NEW_TFB_TARGETS);
 }
 
 static void
diff --git a/src/gallium/drivers/nvc0/nvc0_vbo.c b/src/gallium/drivers/nvc0/nvc0_vbo.c
index 50e99ac..7cf6991 100644
--- a/src/gallium/drivers/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nvc0/nvc0_vbo.c
@@ -569,6 +569,38 @@ nvc0_draw_elements(struct nvc0_context *nvc0, boolean shorten,
    }
 }
 
+static void
+nvc0_draw_stream_output(struct nvc0_context *nvc0, /* draw path taken when info->count_from_stream_output is set */
+                        const struct pipe_draw_info *info)
+{
+   struct nouveau_channel *chan = nvc0->screen->base.channel;
+   struct nvc0_so_target *so = nvc0_so_target(info->count_from_stream_output);
+   struct nv04_resource *res = nv04_resource(so->pipe.buffer);
+   unsigned mode = nvc0_prim_gl(info->mode);
+   unsigned num_instances = info->instance_count;
+
+   if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { /* target buffer still flagged as GPU-written */
+      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; /* flag is cleared once the sync below is queued */
+      IMMED_RING(chan, RING_3D(SERIALIZE), 0);
+      nvc0_query_fifo_wait(chan, so->pq); /* NOTE(review): presumably stalls the FIFO on the target's query - confirm */
+      IMMED_RING(chan, RING_3D(VERTEX_ARRAY_FLUSH), 0);
+   }
+
+   while (num_instances--) { /* one VERTEX_BEGIN_GL/VERTEX_END_GL pair per instance */
+      BEGIN_RING(chan, RING_3D(VERTEX_BEGIN_GL), 1);
+      OUT_RING  (chan, mode);
+      BEGIN_RING(chan, RING_3D(DRAW_TFB_BASE), 1);
+      OUT_RING  (chan, 0);
+      BEGIN_RING(chan, RING_3D(DRAW_TFB_STRIDE), 1);
+      OUT_RING  (chan, so->stride);
+      BEGIN_RING(chan, RING_3D(DRAW_TFB_BYTES), 1); /* no OUT_RING follows: the next call supplies the data word */
+      nvc0_query_pushbuf_submit(chan, so->pq, 0x4); /* NOTE(review): looks like the word at offset 0x4 of so->pq - confirm */
+      IMMED_RING(chan, RING_3D(VERTEX_END_GL), 0);
+
+      mode |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT; /* every instance after the first carries INSTANCE_NEXT */
+   }
+}
+
 void
 nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
@@ -615,6 +647,9 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       nvc0->base.vbo_dirty = FALSE;
    }
 
+   if (unlikely(info->count_from_stream_output)) {
+      nvc0_draw_stream_output(nvc0, info);
+   } else
    if (!info->indexed) {
       nvc0_draw_arrays(nvc0,
                        info->mode, info->start, info->count,
-- 
1.7.3.4



More information about the mesa-dev mailing list