On 22 December 2011 16:54, Eric Anholt <span dir="ltr">&lt;<a href="mailto:eric@anholt.net" target="_blank">eric@anholt.net</a>&gt;</span> wrote:<br><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


Fixes almost all of the transform feedback piglit tests.  Remaining<br>
are a few tests related to tessellation for<br>
quads/trifans/tristrips/polygons with flat shading.<br>
---<br>
 src/mesa/drivers/dri/i965/gen7_sol_state.c |  199 ++++++++++++++++++++++++++-<br>
 1 files changed, 191 insertions(+), 8 deletions(-)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c<br>
index 650f625..a5e28b6 100644<br>
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c<br>
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c<br>
@@ -32,31 +32,214 @@<br>
 #include &quot;brw_state.h&quot;<br>
 #include &quot;brw_defines.h&quot;<br>
 #include &quot;intel_batchbuffer.h&quot;<br>
+#include &quot;intel_buffer_objects.h&quot;<br>
<br>
 static void<br>
-upload_sol_state(struct brw_context *brw)<br>
+upload_3dstate_so_buffers(struct brw_context *brw)<br>
+{<br>
+   struct intel_context *intel = &amp;brw-&gt;intel;<br>
+   struct gl_context *ctx = &amp;intel-&gt;ctx;<br>
+   /* BRW_NEW_VERTEX_PROGRAM */<br>
+   const struct gl_shader_program *vs_prog =<br>
+      ctx-&gt;Shader.CurrentVertexProgram;<br>
+   const struct gl_transform_feedback_info *linked_xfb_info =<br>
+      &amp;vs_prog-&gt;LinkedTransformFeedback;<br>
+   struct gl_transform_feedback_object *xfb_obj =<br>
+      ctx-&gt;TransformFeedback.CurrentObject;<br></blockquote><div><br>Can we have a &quot;/* _NEW_TRANSFORM_FEEDBACK */&quot; comment here?<br> </div><blockquote class="gmail_quote" style="margin:0pt 0pt 0pt 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">



+   int i;<br>
+<br>
+   /* Set up the up to 4 output buffers.  These are the ranges defined in the<br>
+    * gl_transform_feedback_object.<br>
+    */<br>
+   for (i = 0; i &lt; 4; i++) {<br>
+      struct gl_buffer_object *bufferobj = xfb_obj-&gt;Buffers[i];<br>
+      drm_intel_bo *bo;<br>
+      uint32_t start, end;<br>
+<br>
+      if (!xfb_obj-&gt;Buffers[i]) {<br>
+        /* The pitch of 0 in this command indicates that the buffer is<br>
+         * unbound and won&#39;t be written to.<br>
+         */<br>
+        BEGIN_BATCH(4);<br>
+        OUT_BATCH(_3DSTATE_SO_BUFFER &lt;&lt; 16 | (4 - 2));<br>
+        OUT_BATCH((i &lt;&lt; SO_BUFFER_INDEX_SHIFT));<br>
+        OUT_BATCH(0);<br>
+        OUT_BATCH(0);<br>
+        ADVANCE_BATCH();<br>
+<br>
+        continue;<br>
+      }<br>
+<br>
+      bo = intel_buffer_object(bufferobj)-&gt;buffer;<br>
+<br>
+      start = xfb_obj-&gt;Offset[i];<br>
+      assert(start % 4 == 0);<br>
+      end = ALIGN(start + xfb_obj-&gt;Size[i], 4);<br>
+      assert(end &lt;= bo-&gt;size);<br>
+<br>
+      BEGIN_BATCH(4);<br>
+      OUT_BATCH(_3DSTATE_SO_BUFFER &lt;&lt; 16 | (4 - 2));<br>
+      OUT_BATCH((i &lt;&lt; SO_BUFFER_INDEX_SHIFT) |<br>
+               ((linked_xfb_info-&gt;BufferStride[i] * 4) &lt;&lt;<br>
+                SO_BUFFER_PITCH_SHIFT));<br></blockquote><div><br>It looks like we&#39;re not setting &quot;SO Buffer Object Control State&quot;.  Is that ok?  I&#39;m not too familiar with memory object control states so I&#39;m not sure, but it seemed to me that it might be sensible to mark the stream output as L3 cacheable.<br>


 </div><blockquote class="gmail_quote" style="margin:0pt 0pt 0pt 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
+      OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start);<br>
+      OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, end);<br>
+      ADVANCE_BATCH();<br>
+   }<br>
+}<br>
+<br>
+/**<br>
+ * Outputs the 3DSTATE_SO_DECL_LIST command.<br>
+ *<br>
+ * The data output is a series of 64-bit entries containing a SO_DECL per<br>
+ * stream.  We only have one stream of rendering coming out of the GS unit, so<br>
+ * we only emit stream 0 (low 16 bits) SO_DECLs.<br>
+ */<br>
+static void<br>
+upload_3dstate_so_decl_list(struct brw_context *brw,<br>
+                           struct brw_vue_map *vue_map)<br>
+{<br>
+   struct intel_context *intel = &amp;brw-&gt;intel;<br>
+   struct gl_context *ctx = &amp;intel-&gt;ctx;<br>
+   /* BRW_NEW_VERTEX_PROGRAM */<br>
+   const struct gl_shader_program *vs_prog =<br>
+      ctx-&gt;Shader.CurrentVertexProgram;<br>
+   /* NEW_TRANSFORM_FEEDBACK */<br>
+   const struct gl_transform_feedback_info *linked_xfb_info =<br>
+      &amp;vs_prog-&gt;LinkedTransformFeedback;<br>
+   int i;<br>
+   uint16_t so_decl[128];<br></blockquote><div><br>Can we add an assertion to verify that there is no danger of overflowing this array?  I think STATIC_ASSERT(ARRAY_SIZE(so_decl) &gt;= MAX_PROGRAM_OUTPUTS) ought to do the trick.<br>


 </div><blockquote class="gmail_quote" style="margin:0pt 0pt 0pt 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
+   int buffer_mask = 0;<br>
+   int next_offset[4] = {0, 0, 0, 0}; <br></blockquote><blockquote class="gmail_quote" style="margin:0pt 0pt 0pt 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
+<br>
+   /* Construct the list of SO_DECLs to be emitted.  The formatting of the<br>
+    * command feels strange -- each dword pair contains a SO_DECL per stream.<br>
+    */<br>
+   for (i = 0; i &lt; linked_xfb_info-&gt;NumOutputs; i++) {<br>
+      int buffer = linked_xfb_info-&gt;Outputs[i].OutputBuffer;<br>
+      uint16_t decl = 0;<br>
+      int vert_result = linked_xfb_info-&gt;Outputs[i].OutputRegister;<br>
+<br>
+      buffer_mask |= 1 &lt;&lt; buffer;<br>
+<br>
+      decl |= buffer &lt;&lt; SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT;<br>
+      decl |= vue_map-&gt;vert_result_to_slot[vert_result] &lt;&lt;<br>
+        SO_DECL_REGISTER_INDEX_SHIFT;<br>
+      decl |= ((1 &lt;&lt; linked_xfb_info-&gt;Outputs[i].NumComponents) - 1) &lt;&lt;<br>
+        SO_DECL_COMPONENT_MASK_SHIFT;<br>
+<br>
+      /* FINISHME */<br>
+      assert(linked_xfb_info-&gt;Outputs[i].DstOffset == next_offset[buffer]);<br></blockquote><div><br>FYI, this assertion should hold true until we implement ARB_transform_feedback3 (which allows holes in the transform feedback structure).  I think Marek has some plans to implement that for Gallium (not sure of his timeframe though), so we may want to keep an eye out.<br>


 </div><blockquote class="gmail_quote" style="margin:0pt 0pt 0pt 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
+<br>
+      next_offset[buffer] += linked_xfb_info-&gt;Outputs[i].NumComponents;<br>
+<br>
+      so_decl[i] = decl;<br>
+   }<br>
+<br>
+   BEGIN_BATCH(linked_xfb_info-&gt;NumOutputs * 2 + 3);<br>
+   OUT_BATCH(_3DSTATE_SO_DECL_LIST &lt;&lt; 16 |<br>
+            (linked_xfb_info-&gt;NumOutputs * 2 + 1));<br>
+<br>
+   OUT_BATCH((buffer_mask &lt;&lt; SO_STREAM_TO_BUFFER_SELECTS_0_SHIFT) |<br>
+            (0 &lt;&lt; SO_STREAM_TO_BUFFER_SELECTS_1_SHIFT) |<br>
+            (0 &lt;&lt; SO_STREAM_TO_BUFFER_SELECTS_2_SHIFT) |<br>
+            (0 &lt;&lt; SO_STREAM_TO_BUFFER_SELECTS_3_SHIFT));<br>
+<br>
+   OUT_BATCH((linked_xfb_info-&gt;NumOutputs &lt;&lt; SO_NUM_ENTRIES_0_SHIFT) |<br>
+            (0 &lt;&lt; SO_NUM_ENTRIES_1_SHIFT) |<br>
+            (0 &lt;&lt; SO_NUM_ENTRIES_2_SHIFT) |<br>
+            (0 &lt;&lt; SO_NUM_ENTRIES_3_SHIFT));<br>
+<br>
+   for (i = 0; i &lt; linked_xfb_info-&gt;NumOutputs; i++) {<br>
+      OUT_BATCH(so_decl[i]);<br>
+      OUT_BATCH(0);<br>
+   }<br>
+<br>
+   ADVANCE_BATCH();<br>
+}<br>
+<br>
+static void<br>
+upload_3dstate_streamout(struct brw_context *brw, bool active,<br>
+                        struct brw_vue_map *vue_map)<br>
 {<br>
    struct intel_context *intel = &amp;brw-&gt;intel;<br>
    struct gl_context *ctx = &amp;intel-&gt;ctx;<br>
-   uint32_t dw1 = 0;<br>
+   /* _NEW_TRANSFORM_FEEDBACK */<br>
+   struct gl_transform_feedback_object *xfb_obj =<br>
+      ctx-&gt;TransformFeedback.CurrentObject;<br>
+   uint32_t dw1 = 0, dw2 = 0;<br>
+   int i;<br>
<br>
    /* _NEW_RASTERIZER_DISCARD */<br>
    if (ctx-&gt;RasterDiscard)<br>
       dw1 |= SO_RENDERING_DISABLE;<br>
<br>
-   /* Disable the SOL stage */<br>
+   if (active) {<br>
+      int urb_entry_read_offset = 0;<br>
+      int urb_entry_read_length = (vue_map-&gt;num_slots + 1) / 2 -<br>
+        urb_entry_read_offset;<br>
+<br>
+      dw1 |= SO_FUNCTION_ENABLE;<br>
+      dw1 |= SO_STATISTICS_ENABLE;<br></blockquote><blockquote class="gmail_quote" style="margin:0pt 0pt 0pt 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
+<br>
+      for (i = 0; i &lt; 4; i++) {<br>
+        if (xfb_obj-&gt;Buffers[i]) {<br>
+           dw1 |= SO_BUFFER_ENABLE_0 &lt;&lt; i; <br></blockquote><blockquote class="gmail_quote" style="margin:0pt 0pt 0pt 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
+        }<br>
+      }<br>
+<br>
+      /* We always read the whole vertex.  This could be reduced at some<br>
+       * point by reading less and offsetting the register index in the<br>
+       * SO_DECLs.<br>
+       */<br>
+      dw2 |= urb_entry_read_offset &lt;&lt; SO_STREAM_0_VERTEX_READ_OFFSET_SHIFT;<br>
+      dw2 |= (urb_entry_read_length - 1) &lt;&lt;<br>
+        SO_STREAM_0_VERTEX_READ_LENGTH_SHIFT;<br>
+   }<br>
+<br>
    BEGIN_BATCH(3);<br>
    OUT_BATCH(_3DSTATE_STREAMOUT &lt;&lt; 16 | (3 - 2));<br>
-   OUT_BATCH(0);<br>
-   OUT_BATCH(0);<br>
+   OUT_BATCH(dw1);<br>
+   OUT_BATCH(dw2);<br>
    ADVANCE_BATCH();<br>
 }<br>
<br>
+static void<br>
+upload_sol_state(struct brw_context *brw)<br>
+{<br>
+   struct intel_context *intel = &amp;brw-&gt;intel;<br>
+   struct gl_context *ctx = &amp;intel-&gt;ctx;<br>
+   /* _NEW_TRANSFORM_FEEDBACK */<br>
+   struct gl_transform_feedback_object *xfb_obj =<br>
+      ctx-&gt;TransformFeedback.CurrentObject;<br>
+   bool active = xfb_obj-&gt;Active &amp;&amp; !xfb_obj-&gt;Paused;<br>
+   struct brw_vue_map vue_map;<br>
+<br>
+   /* _NEW_TRANSFORM, CACHE_NEW_VS_PROG */<br>
+   brw_compute_vue_map(&amp;vue_map, intel, ctx-&gt;Transform.ClipPlanesEnabled != 0,<br>
+                       brw-&gt;vs.prog_data-&gt;outputs_written);<br>
+<br>
+   if (active) {<br>
+      upload_3dstate_so_buffers(brw);<br>
+      upload_3dstate_so_decl_list(brw, &amp;vue_map);<br>
+   }<br>
+<br>
+   /* Finally, set up the SOL stage.  This command must always follow updates to<br>
+    * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or<br>
+    * MMIO register updates (currently performed by the kernel at each batch<br>
+    * emit).<br>
+    */<br>
+   upload_3dstate_streamout(brw, active, &amp;vue_map);<br>
+}<br>
+<br>
 const struct brw_tracked_state gen7_sol_state = {<br>
    .dirty = {<br>
-      .mesa  = _NEW_RASTERIZER_DISCARD,<br>
-      .brw   = BRW_NEW_BATCH,<br>
-      .cache = 0,<br>
+      .mesa  = (_NEW_RASTERIZER_DISCARD |<br>
+               _NEW_TRANSFORM_FEEDBACK |<br>
+               _NEW_TRANSFORM),<br>
+      .brw   = (BRW_NEW_BATCH |<br>
+               BRW_NEW_VERTEX_PROGRAM),<br>
+      .cache = CACHE_NEW_VS_PROG,<br>
    },<br>
    .emit = upload_sol_state,<br>
 };<br>
<span><font color="#888888">--<br>
1.7.7.3<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank">mesa-dev@lists.freedesktop.org</a><br>
<a href="http://lists.freedesktop.org/mailman/listinfo/mesa-dev" target="_blank">http://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br>