[Mesa-dev] [PATCH v2] i965 gen6: Initial implementation of transform feedback.

Mon Dec 19 11:05:41 PST 2011

This patch adds basic transform feedback capability for Gen6 hardware.
This consists of several related pieces of functionality:

(1) In gen6_sol.c, we set up binding table entries for use by
transform feedback.  We use one binding table entry per transform
feedback varying (this allows us to avoid doing pointer arithmetic in
the shader, since we can set up the binding table entries with the
appropriate offsets and surface pitches to place each varying at the
correct address).

(2) In brw_context.c, we advertise the hardware capabilities, which
are as follows:

   MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS 64
   MAX_TRANSFORM_FEEDBACK_SEPARATE_ATTRIBS        4
   MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS    16

OpenGL 3.0 requires these values to be at least 64, 4, and 4,
respectively.  The reason we advertise a larger value than required
for MAX_TRANSFORM_FEEDBACK_SEPARATE_COMPONENTS is that we have already
set aside 64 binding table entries, so we might as well make them all
available in both separate attribs and interleaved modes.

(3) We set aside a single SVBI ("streamed vertex buffer index") for
use by transform feedback.  The hardware supports four independent
SVBI's, but we only need one, since vertices are added to all
transform feedback buffers at the same rate.  Note: at the moment this
index is reset to 0 only when the driver is initialized.  It needs to
be reset to 0 whenever BeginTransformFeedback() is called, and
otherwise preserved.

(4) In brw_gs_emit.c and brw_gs.c, we modify the geometry shader
program to output transform feedback data as a side effect.

(5) In gen6_gs_state.c, we configure the geometry shader stage to
handle the SVBI pointer correctly.

Note: ordering of vertices is not yet correct for triangle strips
(alternate triangles are improperly oriented).  This will be addressed
in a future patch.

Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
---

v2 has been updated to reflect Eric Anholt's comments on 12/17
11:50am, as well as Marek Olšák's suggestion to compute offsets in the
linker (from "Re: [Mesa-dev] [PATCH 1/8] mesa: Record transform
feedback stride in linker output.", 12/14 05:48am).

 src/mesa/drivers/dri/i965/Makefile.sources       |    1 +
 src/mesa/drivers/dri/i965/brw_context.c          |   20 +++++
 src/mesa/drivers/dri/i965/brw_context.h          |   47 +++++++++++-
 src/mesa/drivers/dri/i965/brw_defines.h          |    6 ++
 src/mesa/drivers/dri/i965/brw_eu.h               |    7 ++
 src/mesa/drivers/dri/i965/brw_eu_emit.c          |   39 +++++++++
 src/mesa/drivers/dri/i965/brw_gs.c               |   26 ++++++-
 src/mesa/drivers/dri/i965/brw_gs.h               |   20 +++++
 src/mesa/drivers/dri/i965/brw_gs_emit.c          |   93 ++++++++++++++++++++-
 src/mesa/drivers/dri/i965/brw_misc_state.c       |    2 +-
 src/mesa/drivers/dri/i965/brw_state.h            |    1 +
 src/mesa/drivers/dri/i965/brw_state_upload.c     |    1 +
 src/mesa/drivers/dri/i965/brw_wm_surface_state.c |   85 ++++++++++++++++++++
 src/mesa/drivers/dri/i965/gen6_gs_state.c        |    8 ++-
 src/mesa/drivers/dri/i965/gen6_sol.c             |   71 ++++++++++++++++
 15 files changed, 417 insertions(+), 10 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/gen6_sol.c

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index cd6a8f4..e50f9c3 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -93,6 +93,7 @@ i965_C_SOURCES := \
 	gen6_sampler_state.c \
 	gen6_scissor_state.c \
 	gen6_sf_state.c \
+        gen6_sol.c \
 	gen6_urb.c \
 	gen6_viewport_state.c \
 	gen6_vs_state.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index faa02bf..eb68bf4 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -178,6 +178,26 @@ brwCreateContext(int api,
    
    ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 
+   /* Hardware only supports a limited number of transform feedback buffers.
+    * So we need to override the Mesa default (which is based only on software
+    * limits).
+    */
+   ctx->Const.MaxTransformFeedbackSeparateAttribs = BRW_MAX_SOL_BUFFERS;
+
+   /* On Gen6, in the worst case, we use up one binding table entry per
+    * transform feedback component (see comments above the definition of
+    * BRW_MAX_SOL_BINDINGS, in brw_context.h), so we need to advertise a value
+    * for MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS equal to
+    * BRW_MAX_SOL_BINDINGS.
+    *
+    * In "separate components" mode, we need to divide this value by
+    * BRW_MAX_SOL_BUFFERS, so that the total number of binding table entries
+    * used up by all buffers will not exceed BRW_MAX_SOL_BINDINGS.
+    */
+   ctx->Const.MaxTransformFeedbackInterleavedComponents = BRW_MAX_SOL_BINDINGS;
+   ctx->Const.MaxTransformFeedbackSeparateComponents =
+      BRW_MAX_SOL_BINDINGS / BRW_MAX_SOL_BUFFERS;
+
    /* if conformance mode is set, swrast can handle any size AA point */
    ctx->Const.MaxPointSizeAA = 255.0;
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 70a45c7..febd4fe 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -368,6 +368,12 @@ struct brw_clip_prog_data {
 struct brw_gs_prog_data {
    GLuint urb_read_length;
    GLuint total_grf;
+
+   /**
+    * Gen6 transform feedback: Amount by which the streaming vertex buffer
+    * indices should be incremented each time the GS is invoked.
+    */
+   unsigned svbi_postincrement_value;
 };
 
 struct brw_vs_prog_data {
@@ -407,6 +413,34 @@ struct brw_vs_ouput_sizes {
 #define BRW_MAX_DRAW_BUFFERS 8
 
 /**
+ * Max number of binding table entries used for stream output.
+ *
+ * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the
+ * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64.
+ *
+ * On Gen6, the size of transform feedback data is limited not by the number
+ * of components but by the number of binding table entries we set aside.  We
+ * use one binding table entry for a float, one entry for a vector, and one
+ * entry per matrix column.  Since the only way we can communicate our
+ * transform feedback capabilities to the client is via
+ * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the
+ * worst case, in which all the varyings are floats, so we use up one binding
+ * table entry per component.  Therefore we need to set aside at least 64
+ * binding table entries for use by transform feedback.
+ *
+ * Note: since we don't currently pack varyings, it is currently impossible
+ * for the client to actually use up all of these binding table entries--if
+ * all of their varyings were floats, they would run out of varying slots and
+ * fail to link.  But that's a bug, so it seems prudent to go ahead and
+ * allocate the number of binding table entries we will need once the bug is
+ * fixed.
+ */
+#define BRW_MAX_SOL_BINDINGS 64
+
+/** Maximum number of actual buffers used for stream output */
+#define BRW_MAX_SOL_BUFFERS 4
+
+/**
  * Helpers to create Surface Binding Table indexes for draw buffers,
  * textures, and constant buffers.
  *
@@ -436,6 +470,11 @@ struct brw_vs_ouput_sizes {
  *    |   . |     .                   |
  *    |   : |     :                   |
  *    |  25 | Texture 15              |
+ *    +-----|-------------------------+
+ *    |  26 | SOL Binding 0           |
+ *    |   . |     .                   |
+ *    |   : |     :                   |
+ *    |  89 | SOL Binding 63          |
  *    +-------------------------------+
  *
  * Note that nothing actually uses the SURF_INDEX_DRAW macro, so it has to be
@@ -446,9 +485,10 @@ struct brw_vs_ouput_sizes {
 #define SURF_INDEX_VERT_CONST_BUFFER (BRW_MAX_DRAW_BUFFERS + 0)
 #define SURF_INDEX_FRAG_CONST_BUFFER (BRW_MAX_DRAW_BUFFERS + 1)
 #define SURF_INDEX_TEXTURE(t)        (BRW_MAX_DRAW_BUFFERS + 2 + (t))
+#define SURF_INDEX_SOL_BINDING(t)    (SURF_INDEX_TEXTURE(BRW_MAX_TEX_UNIT) + (t))
 
 /** Maximum size of the binding table. */
-#define BRW_MAX_SURFACES (BRW_MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 2)
+#define BRW_MAX_SURFACES SURF_INDEX_SOL_BINDING(BRW_MAX_SOL_BINDINGS)
 
 enum brw_cache_id {
    BRW_BLEND_STATE,
@@ -1026,6 +1066,11 @@ brw_compute_barycentric_interp_modes(bool shade_model_flat,
 
 /* brw_wm_surface_state.c */
 void brw_init_surface_formats(struct brw_context *brw);
+void
+brw_update_sol_surface(struct brw_context *brw,
+                       struct gl_buffer_object *buffer_obj,
+                       uint32_t *out_offset, unsigned num_vector_components,
+                       unsigned stride_dwords, unsigned offset_dwords);
 
 /* gen6_clip_state.c */
 bool
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 0eb7512..6b27338 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1423,6 +1423,12 @@ enum brw_wm_barycentric_interp_mode {
 #define URB_WRITE_PRIM_START		0x2
 #define URB_WRITE_PRIM_TYPE_SHIFT	2
 
+
+/* Maximum number of entries that can be addressed using a binding table
+ * pointer of type SURFTYPE_BUFFER
+ */
+#define BRW_MAX_NUM_BUFFER_ENTRIES	(1 << 27)
+
 #include "intel_chipset.h"
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 596be02..1529ec6 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -912,6 +912,13 @@ void brw_ff_sync(struct brw_compile *p,
 		   GLuint response_length,
 		   bool eot);
 
+void brw_svb_write(struct brw_compile *p,
+                   struct brw_reg dest,
+                   GLuint msg_reg_nr,
+                   struct brw_reg src0,
+                   GLuint binding_table_index,
+                   bool   send_commit_msg);
+
 void brw_fb_WRITE(struct brw_compile *p,
 		  int dispatch_width,
 		   GLuint msg_reg_nr,
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index d48753c..f6726fc 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -2390,3 +2390,42 @@ void brw_ff_sync(struct brw_compile *p,
 			   response_length,
 			   eot);
 }
+
+/**
+ * Emit the SEND instruction necessary to generate stream output data on Gen6
+ * (for transform feedback).
+ *
+ * If send_commit_msg is true, this is the last piece of stream output data
+ * from this thread, so send the data as a committed write.  According to the
+ * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
+ *
+ *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
+ *   writes are complete by sending the final write as a committed write."
+ */
+void
+brw_svb_write(struct brw_compile *p,
+              struct brw_reg dest,
+              GLuint msg_reg_nr,
+              struct brw_reg src0,
+              GLuint binding_table_index,
+              bool   send_commit_msg)
+{
+   struct brw_instruction *insn;
+
+   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+   insn = next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, brw_imm_d(0));
+   brw_set_dp_write_message(p, insn,
+                            binding_table_index,
+                            0, /* msg_control: ignored */
+                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
+                            1, /* msg_length */
+                            true, /* header_present */
+                            0, /* last_render_target: ignored */
+                            send_commit_msg, /* response_length */
+                            0, /* end_of_thread */
+                            send_commit_msg); /* send_commit_msg */
+}
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index f5d5898..9a6edf6 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -183,7 +183,31 @@ static void populate_key( struct brw_context *brw,
    } else if (intel->gen == 6) {
       /* On Gen6, GS is used for transform feedback. */
       /* _NEW_TRANSFORM_FEEDBACK */
-      key->need_gs_prog = ctx->TransformFeedback.CurrentObject->Active;
+      if (ctx->TransformFeedback.CurrentObject->Active) {
+         const struct gl_shader_program *shaderprog =
+            ctx->Shader.CurrentVertexProgram;
+         const struct gl_transform_feedback_info *linked_xfb_info =
+            &shaderprog->LinkedTransformFeedback;
+         int i;
+
+         /* Make sure that the VUE slots won't overflow the unsigned chars in
+          * key->transform_feedback_bindings[].
+          */
+         STATIC_ASSERT (BRW_VERT_RESULT_MAX <= 256);
+
+         /* Make sure that we don't need more binding table entries than we've
+          * set aside for use in transform feedback.  (We shouldn't, since we
+          * set aside enough binding table entries to have one per component).
+          */
+         assert (linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
+
+         key->need_gs_prog = true;
+         key->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
+         for (i = 0; i < key->num_transform_feedback_bindings; ++i) {
+            key->transform_feedback_bindings[i] =
+               linked_xfb_info->Outputs[i].OutputRegister;
+         }
+      }
    } else {
       /* Pre-gen6, GS is used to transform QUADLIST, QUADSTRIP, and LINELOOP
        * into simpler primitives.
diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h
index ecab3ef..33d8d7a 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.h
+++ b/src/mesa/drivers/dri/i965/brw_gs.h
@@ -50,6 +50,18 @@ struct brw_gs_prog_key {
    GLuint pv_first:1;
    GLuint need_gs_prog:1;
    GLuint userclip_active:1;
+
+   /**
+    * Number of varyings that are output to transform feedback.
+    */
+   GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */
+
+   /**
+    * Map from the index of a transform feedback binding table entry to the
+    * gl_vert_result that should be streamed out through that binding table
+    * entry.
+    */
+   unsigned char transform_feedback_bindings[BRW_MAX_SOL_BINDINGS];
 };
 
 struct brw_gs_compile {
@@ -59,6 +71,14 @@ struct brw_gs_compile {
    
    struct {
       struct brw_reg R0;
+
+      /**
+       * Register holding streamed vertex buffer pointers -- see the Sandy
+       * Bridge PRM, volume 2 part 1, section 4.4.2 (GS Thread Payload
+       * [DevSNB]).  These pointers are delivered in GRF 1.
+       */
+      struct brw_reg SVBI;
+
       struct brw_reg vertex[MAX_GS_VERTS];
       struct brw_reg header;
       struct brw_reg temp;
diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
index 322f9bd..3062c33 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
@@ -42,8 +42,16 @@
 #include "brw_eu.h"
 #include "brw_gs.h"
 
+/**
+ * Allocate registers for GS.
+ *
+ * If svbi_payload_enable is true, then the thread will be spawned with the
+ * "SVBI Payload Enable" bit set, so GRF 1 needs to be set aside to hold the
+ * streamed vertex buffer indices.
+ */
 static void brw_gs_alloc_regs( struct brw_gs_compile *c,
-			       GLuint nr_verts )
+			       GLuint nr_verts,
+                               bool svbi_payload_enable )
 {
    GLuint i = 0,j;
 
@@ -51,6 +59,10 @@ static void brw_gs_alloc_regs( struct brw_gs_compile *c,
     */
    c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
 
+   /* Streamed vertex buffer indices */
+   if (svbi_payload_enable)
+      c->reg.SVBI = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
+
    /* Payload vertices plus space for more generated vertices:
     */
    for (j = 0; j < nr_verts; j++) {
@@ -212,7 +224,7 @@ void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
 {
    struct intel_context *intel = &c->func.brw->intel;
 
-   brw_gs_alloc_regs(c, 4);
+   brw_gs_alloc_regs(c, 4, false);
    brw_gs_initialize_header(c);
    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
     * is the PV for quads, but vertex 0 for polygons:
@@ -250,7 +262,7 @@ void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
 {
    struct intel_context *intel = &c->func.brw->intel;
 
-   brw_gs_alloc_regs(c, 4);
+   brw_gs_alloc_regs(c, 4, false);
    brw_gs_initialize_header(c);
    
    if (intel->needs_ff_sync)
@@ -286,7 +298,7 @@ void brw_gs_lines( struct brw_gs_compile *c )
 {
    struct intel_context *intel = &c->func.brw->intel;
 
-   brw_gs_alloc_regs(c, 2);
+   brw_gs_alloc_regs(c, 2, false);
    brw_gs_initialize_header(c);
 
    if (intel->needs_ff_sync)
@@ -310,10 +322,81 @@ gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key,
 	         unsigned num_verts, bool check_edge_flags)
 {
    struct brw_compile *p = &c->func;
+   c->prog_data.svbi_postincrement_value = num_verts;
 
-   brw_gs_alloc_regs(c, num_verts);
+   brw_gs_alloc_regs(c, num_verts, true);
    brw_gs_initialize_header(c);
 
+   if (key->num_transform_feedback_bindings > 0) {
+      unsigned vertex, binding;
+      /* Note: since we use the binding table to keep track of buffer offsets
+       * and stride, the GS doesn't need to keep track of a separate pointer
+       * into each buffer; it uses a single pointer which increments by 1 for
+       * each vertex.  So we use SVBI0 for this pointer, regardless of whether
+       * transform feedback is in interleaved or separate attribs mode.
+       */
+      brw_MOV(p, get_element_ud(c->reg.header, 5),
+              get_element_ud(c->reg.SVBI, 0));
+      /* For each vertex, generate code to output each varying using the
+       * appropriate binding table entry.
+       */
+      for (vertex = 0; vertex < num_verts; ++vertex) {
+         for (binding = 0; binding < key->num_transform_feedback_bindings;
+              ++binding) {
+            unsigned char vert_result =
+               key->transform_feedback_bindings[binding];
+            unsigned char slot = c->vue_map.vert_result_to_slot[vert_result];
+            /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
+             *
+             *   "Prior to End of Thread with a URB_WRITE, the kernel must
+             *   ensure that all writes are complete by sending the final
+             *   write as a committed write."
+             */
+            bool final_write =
+               binding == key->num_transform_feedback_bindings - 1 &&
+               vertex == num_verts - 1;
+            struct brw_reg vertex_slot = c->reg.vertex[vertex];
+            vertex_slot.nr += slot / 2;
+            vertex_slot.subnr = (slot % 2) * 16;
+            brw_MOV(p, stride(c->reg.header, 4, 4, 1),
+                    retype(vertex_slot, BRW_REGISTER_TYPE_UD));
+            brw_svb_write(p,
+                          final_write ? c->reg.temp : brw_null_reg(), /* dest */
+                          1, /* msg_reg_nr */
+                          c->reg.header, /* src0 */
+                          SURF_INDEX_SOL_BINDING(binding), /* binding_table_index */
+                          final_write); /* send_commit_msg */
+         }
+
+         /* If there are more vertices to output, increment the pointer so
+          * that we will start outputting to the next location in the
+          * transform feedback buffers.
+          */
+         if (vertex != num_verts - 1) {
+            brw_ADD(p, get_element_ud(c->reg.header, 5),
+                    get_element_ud(c->reg.header, 5), brw_imm_ud(1));
+         }
+      }
+
+      /* Now, reinitialize the header register from R0 to restore the parts of
+       * the register that we overwrote while streaming out transform feedback
+       * data.
+       */
+      brw_gs_initialize_header(c);
+
+      /* Finally, wait for the write commit to occur so that we can proceed to
+       * other things safely.
+       *
+       * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
+       *
+       *   The write commit does not modify the destination register, but
+       *   merely clears the dependency associated with the destination
+       *   register. Thus, a simple “mov” instruction using the register as a
+       *   source is sufficient to wait for the write commit to occur.
+       */
+      brw_MOV(p, c->reg.temp, c->reg.temp);
+   }
+
    brw_gs_ff_sync(c, 1);
 
    brw_gs_overwrite_header_dw2_from_r0(c);
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 3e8cb3f..81bac34 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -116,7 +116,7 @@ static void upload_gen6_binding_table_pointers(struct brw_context *brw)
 	     GEN6_BINDING_TABLE_MODIFY_PS |
 	     (4 - 2));
    OUT_BATCH(brw->bind.bo_offset); /* vs */
-   OUT_BATCH(0); /* gs */
+   OUT_BATCH(brw->bind.bo_offset); /* gs */
    OUT_BATCH(brw->bind.bo_offset); /* wm/ps */
    ADVANCE_BATCH();
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 59fe81a..a3a470f 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -92,6 +92,7 @@ extern const struct brw_tracked_state gen6_gs_state;
 extern const struct brw_tracked_state gen6_renderbuffer_surfaces;
 extern const struct brw_tracked_state gen6_sampler_state;
 extern const struct brw_tracked_state gen6_scissor_state;
+extern const struct brw_tracked_state gen6_sol_surface;
 extern const struct brw_tracked_state gen6_sf_state;
 extern const struct brw_tracked_state gen6_sf_vp;
 extern const struct brw_tracked_state gen6_urb;
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index bd32815..4636892 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -145,6 +145,7 @@ static const struct brw_tracked_state *gen6_atoms[] =
    &brw_wm_pull_constants,
    &gen6_renderbuffer_surfaces,
    &brw_texture_surfaces,
+   &gen6_sol_surface,
    &brw_binding_table,
 
    &brw_samplers,
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index f9b0b71..9056077 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -38,6 +38,7 @@
 #include "intel_batchbuffer.h"
 #include "intel_tex.h"
 #include "intel_fbo.h"
+#include "intel_buffer_objects.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
@@ -705,6 +706,90 @@ brw_create_constant_surface(struct brw_context *brw,
 			   I915_GEM_DOMAIN_SAMPLER, 0);
 }
 
+/**
+ * Set up a binding table entry for use by stream output logic (transform
+ * feedback).
+ *
+ * buffer_size_minus_1 must me less than BRW_MAX_NUM_BUFFER_ENTRIES.
+ */
+void
+brw_update_sol_surface(struct brw_context *brw,
+                       struct gl_buffer_object *buffer_obj,
+                       uint32_t *out_offset, unsigned num_vector_components,
+                       unsigned stride_dwords, unsigned offset_dwords)
+{
+   drm_intel_bo *bo = intel_buffer_object(buffer_obj)->buffer;
+   uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
+                                    out_offset);
+   uint32_t pitch_minus_1 = 4*stride_dwords - 1;
+   uint32_t offset_bytes = 4 * offset_dwords;
+   size_t size_dwords = buffer_obj->Size / 4;
+   uint32_t buffer_size_minus_1, width, height, depth, surface_format;
+
+   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
+    * too big to map using a single binding table entry?
+    */
+   assert ((size_dwords - offset_dwords) / stride_dwords
+           <= BRW_MAX_NUM_BUFFER_ENTRIES);
+
+   if (size_dwords > offset_dwords + num_vector_components) {
+      /* There is room for at least 1 transform feedback output in the buffer.
+       * Compute the number of additional transform feedback outputs the
+       * buffer has room for.
+       */
+      buffer_size_minus_1 =
+         (size_dwords - offset_dwords - num_vector_components) / stride_dwords;
+   } else {
+      /* There isn't even room for a single transform feedback output in the
+       * buffer.  We can't configure the binding table entry to prevent output
+       * entirely; we'll have to rely on the geometry shader to detect
+       * overflow.  But to minimize the damage in case of a bug, set up the
+       * binding table entry to just allow a single output.
+       */
+      buffer_size_minus_1 = 0;
+   }
+   width = buffer_size_minus_1 & 0x7f;
+   height = (buffer_size_minus_1 & 0xfff80) >> 7;
+   depth = (buffer_size_minus_1 & 0x7f00000) >> 20;
+
+   switch (num_vector_components) {
+   case 1:
+      surface_format = BRW_SURFACEFORMAT_R32_FLOAT;
+      break;
+   case 2:
+      surface_format = BRW_SURFACEFORMAT_R32G32_FLOAT;
+      break;
+   case 3:
+      surface_format = BRW_SURFACEFORMAT_R32G32B32_FLOAT;
+      break;
+   case 4:
+      surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+      break;
+   default:
+      assert (!"Invalid vector size for transform feedback output");
+      surface_format = BRW_SURFACEFORMAT_R32_FLOAT;
+      break;
+   }
+
+   surf[0] = BRW_SURFACE_BUFFER << BRW_SURFACE_TYPE_SHIFT |
+      BRW_SURFACE_MIPMAPLAYOUT_BELOW << BRW_SURFACE_MIPLAYOUT_SHIFT |
+      surface_format << BRW_SURFACE_FORMAT_SHIFT |
+      BRW_SURFACE_RC_READ_WRITE;
+   surf[1] = bo->offset + offset_bytes; /* reloc */
+   surf[2] = (width << BRW_SURFACE_WIDTH_SHIFT |
+	      height << BRW_SURFACE_HEIGHT_SHIFT);
+   surf[3] = (depth << BRW_SURFACE_DEPTH_SHIFT |
+              pitch_minus_1 << BRW_SURFACE_PITCH_SHIFT);
+   surf[4] = 0;
+   surf[5] = 0;
+
+   /* Emit relocation to surface contents. */
+   drm_intel_bo_emit_reloc(brw->intel.batch.bo,
+			   *out_offset + 4,
+			   bo, offset_bytes,
+			   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
+}
+
 /* Creates a new WM constant buffer reflecting the current fragment program's
  * constants, if needed by the fragment program.
  *
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
index 42962a6..fdad5d4 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
@@ -50,13 +50,17 @@ upload_gs_state(struct brw_context *brw)
       OUT_BATCH(brw->gs.prog_offset);
       OUT_BATCH(GEN6_GS_SPF_MODE | GEN6_GS_VECTOR_MASK_ENABLE);
       OUT_BATCH(0); /* no scratch space */
-      OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
+      OUT_BATCH((2 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
 	        (brw->gs.prog_data->urb_read_length << GEN6_GS_URB_READ_LENGTH_SHIFT));
       OUT_BATCH(((brw->max_gs_threads - 1) << GEN6_GS_MAX_THREADS_SHIFT) |
 	        GEN6_GS_STATISTICS_ENABLE |
 		GEN6_GS_SO_STATISTICS_ENABLE |
 		GEN6_GS_RENDERING_ENABLE);
-      OUT_BATCH(GEN6_GS_ENABLE);
+      OUT_BATCH(GEN6_GS_SVBI_PAYLOAD_ENABLE |
+                GEN6_GS_SVBI_POSTINCREMENT_ENABLE |
+                (brw->gs.prog_data->svbi_postincrement_value <<
+                 GEN6_GS_SVBI_POSTINCREMENT_VALUE_SHIFT) |
+                GEN6_GS_ENABLE);
       ADVANCE_BATCH();
    } else {
       BEGIN_BATCH(7);
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
new file mode 100644
index 0000000..491b39c
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** \file gen6_sol.c
+ *
+ * Code to initialize the binding table entries used by transform feedback.
+ */
+
+#include "brw_context.h"
+#include "brw_defines.h"
+
+static void
+gen6_update_sol_surfaces(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->intel.ctx;
+   /* _NEW_TRANSFORM_FEEDBACK */
+   struct gl_transform_feedback_object *xfb_obj =
+      ctx->TransformFeedback.CurrentObject;
+   /* BRW_NEW_VERTEX_PROGRAM */
+   const struct gl_shader_program *shaderprog =
+      ctx->Shader.CurrentVertexProgram;
+   const struct gl_transform_feedback_info *linked_xfb_info =
+      &shaderprog->LinkedTransformFeedback;
+   int i;
+
+   for (i = 0; i < BRW_MAX_SOL_BINDINGS; ++i) {
+      const int surf_index = SURF_INDEX_SOL_BINDING(i);
+      if (xfb_obj->Active && i < linked_xfb_info->NumOutputs) {
+         unsigned buffer = linked_xfb_info->Outputs[i].OutputBuffer;
+         unsigned buffer_offset =
+            xfb_obj->Offset[buffer] / 4 +
+            linked_xfb_info->Outputs[i].DstOffset;
+         brw_update_sol_surface(
+            brw, xfb_obj->Buffers[buffer], &brw->bind.surf_offset[surf_index],
+            linked_xfb_info->Outputs[i].NumComponents,
+            linked_xfb_info->BufferStride[buffer], buffer_offset);
+      } else {
+         brw->bind.surf_offset[surf_index] = 0;
+      }
+   }
+}
+
+const struct brw_tracked_state gen6_sol_surface = {
+   .dirty = {
+      .mesa = _NEW_TRANSFORM_FEEDBACK,
+      .brw = (BRW_NEW_BATCH |
+              BRW_NEW_VERTEX_PROGRAM),
+      .cache = 0
+   },
+   .emit = gen6_update_sol_surfaces,
+};
-- 
1.7.6.4