[Mesa-dev] [PATCH v03 25/38] i965: Port gen7+ 3DSTATE_SOL to genxml.

Tue May 2 01:43:13 UTC 2017

Emit the SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST and
3DSTATE_STREAMOUT) on Gen7+ using the brw_batch_emit helpers, which use the
pack structs from genxml.

v2:
   - Add helpers to assign struct brw_address (Kristian)
v3:
   - Rename MOCS -> SOBufferMOCS
   - Do not re-declare MOCS macros (Ken).
   - Style and code reorganization (Ken).

Signed-off-by: Rafael Antognolli <rafael.antognolli at intel.com>
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/Makefile.sources    |   1 +-
 src/mesa/drivers/dri/i965/brw_state.h         |   6 +-
 src/mesa/drivers/dri/i965/gen7_sol_state.c    | 307 +-----------------
 src/mesa/drivers/dri/i965/gen8_sol_state.c    |  95 +-----
 src/mesa/drivers/dri/i965/genX_state_upload.c | 341 ++++++++++++++++++-
 5 files changed, 338 insertions(+), 412 deletions(-)
 delete mode 100644 src/mesa/drivers/dri/i965/gen8_sol_state.c

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 47680a7..bfcf57c 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -111,7 +111,6 @@ i965_FILES = \
 	gen8_hs_state.c \
 	gen8_multisample_state.c \
 	gen8_ps_state.c \
-	gen8_sol_state.c \
 	gen8_surface_state.c \
 	gen8_viewport_state.c \
 	gen8_vs_state.c \
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 3df975a..94f758b 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -135,7 +135,6 @@ extern const struct brw_tracked_state gen7_l3_state;
 extern const struct brw_tracked_state gen7_ps_state;
 extern const struct brw_tracked_state gen7_push_constant_space;
 extern const struct brw_tracked_state gen7_sf_clip_viewport;
-extern const struct brw_tracked_state gen7_sol_state;
 extern const struct brw_tracked_state gen7_te_state;
 extern const struct brw_tracked_state gen7_tes_push_constants;
 extern const struct brw_tracked_state gen7_urb;
@@ -299,11 +298,6 @@ void gen8_upload_ps_state(struct brw_context *brw,
 void gen8_upload_ps_extra(struct brw_context *brw,
                           const struct brw_wm_prog_data *prog_data);
 
-/* gen7_sol_state.c */
-void gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
-                                      const struct brw_vue_map *vue_map);
-void gen8_upload_3dstate_so_buffers(struct brw_context *brw);
-
 /* gen8_surface_state.c */
 
 void gen8_init_vtable_surface_functions(struct brw_context *brw);
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index f1bd19c..f54b370 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -35,313 +35,6 @@
 #include "intel_buffer_objects.h"
 #include "main/transformfeedback.h"
 
-static void
-upload_3dstate_so_buffers(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TRANSFORM_FEEDBACK */
-   struct gl_transform_feedback_object *xfb_obj =
-      ctx->TransformFeedback.CurrentObject;
-   const struct gl_transform_feedback_info *linked_xfb_info =
-      xfb_obj->program->sh.LinkedTransformFeedback;
-   int i;
-
-   /* Set up the up to 4 output buffers.  These are the ranges defined in the
-    * gl_transform_feedback_object.
-    */
-   for (i = 0; i < 4; i++) {
-      struct intel_buffer_object *bufferobj =
-	 intel_buffer_object(xfb_obj->Buffers[i]);
-      struct brw_bo *bo;
-      uint32_t start, end;
-      uint32_t stride;
-
-      if (!xfb_obj->Buffers[i]) {
-	 /* The pitch of 0 in this command indicates that the buffer is
-	  * unbound and won't be written to.
-	  */
-	 BEGIN_BATCH(4);
-	 OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2));
-	 OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT));
-	 OUT_BATCH(0);
-	 OUT_BATCH(0);
-	 ADVANCE_BATCH();
-
-	 continue;
-      }
-
-      stride = linked_xfb_info->Buffers[i].Stride * 4;
-
-      start = xfb_obj->Offset[i];
-      assert(start % 4 == 0);
-      end = ALIGN(start + xfb_obj->Size[i], 4);
-      bo = intel_bufferobj_buffer(brw, bufferobj, start, end - start);
-      assert(end <= bo->size);
-
-      BEGIN_BATCH(4);
-      OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2));
-      OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT) | stride);
-      OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start);
-      OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, end);
-      ADVANCE_BATCH();
-   }
-}
-
-/**
- * Outputs the 3DSTATE_SO_DECL_LIST command.
- *
- * The data output is a series of 64-bit entries containing a SO_DECL per
- * stream.  We only have one stream of rendering coming out of the GS unit, so
- * we only emit stream 0 (low 16 bits) SO_DECLs.
- */
-void
-gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
-                                 const struct brw_vue_map *vue_map)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TRANSFORM_FEEDBACK */
-   struct gl_transform_feedback_object *xfb_obj =
-      ctx->TransformFeedback.CurrentObject;
-   const struct gl_transform_feedback_info *linked_xfb_info =
-      xfb_obj->program->sh.LinkedTransformFeedback;
-   uint16_t so_decl[MAX_VERTEX_STREAMS][128];
-   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
-   int next_offset[BRW_MAX_SOL_BUFFERS] = {0, 0, 0, 0};
-   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
-   int max_decls = 0;
-   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
-
-   memset(so_decl, 0, sizeof(so_decl));
-
-   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
-    * command is feels strange -- each dword pair contains a SO_DECL per stream.
-    */
-   for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
-      int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
-      uint16_t decl = 0;
-      int varying = linked_xfb_info->Outputs[i].OutputRegister;
-      const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
-      unsigned component_mask = (1 << components) - 1;
-      unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
-      unsigned decl_buffer_slot = buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT;
-      assert(stream_id < MAX_VERTEX_STREAMS);
-
-      /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
-       * gl_Layer is stored in VARYING_SLOT_PSIZ.y
-       * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
-       */
-      if (varying == VARYING_SLOT_PSIZ) {
-         assert(components == 1);
-         component_mask <<= 3;
-      } else if (varying == VARYING_SLOT_LAYER) {
-         assert(components == 1);
-         component_mask <<= 1;
-      } else if (varying == VARYING_SLOT_VIEWPORT) {
-         assert(components == 1);
-         component_mask <<= 2;
-      } else {
-         component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset;
-      }
-
-      buffer_mask[stream_id] |= 1 << buffer;
-
-      decl |= decl_buffer_slot;
-      if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
-         decl |= vue_map->varying_to_slot[VARYING_SLOT_PSIZ] <<
-            SO_DECL_REGISTER_INDEX_SHIFT;
-      } else {
-         assert(vue_map->varying_to_slot[varying] >= 0);
-         decl |= vue_map->varying_to_slot[varying] <<
-            SO_DECL_REGISTER_INDEX_SHIFT;
-      }
-      decl |= component_mask << SO_DECL_COMPONENT_MASK_SHIFT;
-
-      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
-       * array.  Instead, it simply increments DstOffset for the following
-       * input by the number of components that should be skipped.
-       *
-       * Our hardware is unusual in that it requires us to program SO_DECLs
-       * for fake "hole" components, rather than simply taking the offset
-       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
-       * program as many size = 4 holes as we can, then a final hole to
-       * accommodate the final 1, 2, or 3 remaining.
-       */
-      int skip_components =
-         linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer];
-
-      next_offset[buffer] += skip_components;
-
-      while (skip_components >= 4) {
-         so_decl[stream_id][decls[stream_id]++] =
-            SO_DECL_HOLE_FLAG | 0xf | decl_buffer_slot;
-         skip_components -= 4;
-      }
-      if (skip_components > 0)
-         so_decl[stream_id][decls[stream_id]++] =
-            SO_DECL_HOLE_FLAG | ((1 << skip_components) - 1) |
-            decl_buffer_slot;
-
-      assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);
-
-      next_offset[buffer] += components;
-
-      so_decl[stream_id][decls[stream_id]++] = decl;
-
-      if (decls[stream_id] > max_decls)
-         max_decls = decls[stream_id];
-   }
-
-   BEGIN_BATCH(max_decls * 2 + 3);
-   OUT_BATCH(_3DSTATE_SO_DECL_LIST << 16 | (max_decls * 2 + 1));
-
-   OUT_BATCH((buffer_mask[0] << SO_STREAM_TO_BUFFER_SELECTS_0_SHIFT) |
-             (buffer_mask[1] << SO_STREAM_TO_BUFFER_SELECTS_1_SHIFT) |
-             (buffer_mask[2] << SO_STREAM_TO_BUFFER_SELECTS_2_SHIFT) |
-             (buffer_mask[3] << SO_STREAM_TO_BUFFER_SELECTS_3_SHIFT));
-
-   OUT_BATCH((decls[0] << SO_NUM_ENTRIES_0_SHIFT) |
-             (decls[1] << SO_NUM_ENTRIES_1_SHIFT) |
-             (decls[2] << SO_NUM_ENTRIES_2_SHIFT) |
-             (decls[3] << SO_NUM_ENTRIES_3_SHIFT));
-
-   for (int i = 0; i < max_decls; i++) {
-      /* Stream 1 | Stream 0 */
-      OUT_BATCH(((uint32_t) so_decl[1][i]) << 16 | so_decl[0][i]);
-      /* Stream 3 | Stream 2 */
-      OUT_BATCH(((uint32_t) so_decl[3][i]) << 16 | so_decl[2][i]);
-   }
-
-   ADVANCE_BATCH();
-}
-
-static bool
-query_active(struct gl_query_object *q)
-{
-   return q && q->Active;
-}
-
-static void
-upload_3dstate_streamout(struct brw_context *brw, bool active,
-			 const struct brw_vue_map *vue_map)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TRANSFORM_FEEDBACK */
-   struct gl_transform_feedback_object *xfb_obj =
-      ctx->TransformFeedback.CurrentObject;
-   uint32_t dw1 = 0, dw2 = 0, dw3 = 0, dw4 = 0;
-   int i;
-
-   if (active) {
-      const struct gl_transform_feedback_info *linked_xfb_info =
-         xfb_obj->program->sh.LinkedTransformFeedback;
-      int urb_entry_read_offset = 0;
-      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
-	 urb_entry_read_offset;
-
-      dw1 |= SO_FUNCTION_ENABLE;
-      dw1 |= SO_STATISTICS_ENABLE;
-
-      /* BRW_NEW_RASTERIZER_DISCARD */
-      if (ctx->RasterDiscard) {
-         if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
-            dw1 |= SO_RENDERING_DISABLE;
-         } else {
-            perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
-                       "query active relies on the clipper.");
-         }
-      }
-
-      /* _NEW_LIGHT */
-      if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
-	 dw1 |= SO_REORDER_TRAILING;
-
-      if (brw->gen < 8) {
-         for (i = 0; i < 4; i++) {
-            if (xfb_obj->Buffers[i]) {
-               dw1 |= SO_BUFFER_ENABLE(i);
-            }
-         }
-      }
-
-      /* We always read the whole vertex.  This could be reduced at some
-       * point by reading less and offsetting the register index in the
-       * SO_DECLs.
-       */
-      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_0_VERTEX_READ_OFFSET);
-      dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_0_VERTEX_READ_LENGTH);
-
-      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_1_VERTEX_READ_OFFSET);
-      dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_1_VERTEX_READ_LENGTH);
-
-      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_2_VERTEX_READ_OFFSET);
-      dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_2_VERTEX_READ_LENGTH);
-
-      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_3_VERTEX_READ_OFFSET);
-      dw2 |= SET_FIELD(urb_entry_read_length - 1, SO_STREAM_3_VERTEX_READ_LENGTH);
-
-      if (brw->gen >= 8) {
-	 /* Set buffer pitches; 0 means unbound. */
-	 if (xfb_obj->Buffers[0])
-	    dw3 |= linked_xfb_info->Buffers[0].Stride * 4;
-	 if (xfb_obj->Buffers[1])
-	    dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16;
-	 if (xfb_obj->Buffers[2])
-	    dw4 |= linked_xfb_info->Buffers[2].Stride * 4;
-	 if (xfb_obj->Buffers[3])
-	    dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16;
-      }
-   }
-
-   const int dwords = brw->gen >= 8 ? 5 : 3;
-
-   BEGIN_BATCH(dwords);
-   OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (dwords - 2));
-   OUT_BATCH(dw1);
-   OUT_BATCH(dw2);
-   if (dwords > 3) {
-      OUT_BATCH(dw3);
-      OUT_BATCH(dw4);
-   }
-   ADVANCE_BATCH();
-}
-
-static void
-upload_sol_state(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TRANSFORM_FEEDBACK */
-   bool active = _mesa_is_xfb_active_and_unpaused(ctx);
-
-   if (active) {
-      if (brw->gen >= 8)
-         gen8_upload_3dstate_so_buffers(brw);
-      else
-         upload_3dstate_so_buffers(brw);
-
-      /* BRW_NEW_VUE_MAP_GEOM_OUT */
-      gen7_upload_3dstate_so_decl_list(brw, &brw->vue_map_geom_out);
-   }
-
-   /* Finally, set up the SOL stage.  This command must always follow updates to
-    * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
-    * MMIO register updates (current performed by the kernel at each batch
-    * emit).
-    */
-   upload_3dstate_streamout(brw, active, &brw->vue_map_geom_out);
-}
-
-const struct brw_tracked_state gen7_sol_state = {
-   .dirty = {
-      .mesa  = _NEW_LIGHT,
-      .brw   = BRW_NEW_BATCH |
-               BRW_NEW_BLORP |
-               BRW_NEW_RASTERIZER_DISCARD |
-               BRW_NEW_VUE_MAP_GEOM_OUT |
-               BRW_NEW_TRANSFORM_FEEDBACK,
-   },
-   .emit = upload_sol_state,
-};
-
 void
 gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
                               struct gl_transform_feedback_object *obj)
diff --git a/src/mesa/drivers/dri/i965/gen8_sol_state.c b/src/mesa/drivers/dri/i965/gen8_sol_state.c
deleted file mode 100644
index 6866539..0000000
--- a/src/mesa/drivers/dri/i965/gen8_sol_state.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-/**
- * @file gen8_sol_state.c
- *
- * Controls the stream output logic (SOL) stage of the gen8 hardware, which is
- * used to implement GL_EXT_transform_feedback.
- */
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-#include "intel_batchbuffer.h"
-#include "intel_buffer_objects.h"
-#include "main/transformfeedback.h"
-
-void
-gen8_upload_3dstate_so_buffers(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TRANSFORM_FEEDBACK */
-   struct gl_transform_feedback_object *xfb_obj =
-      ctx->TransformFeedback.CurrentObject;
-   struct brw_transform_feedback_object *brw_obj =
-      (struct brw_transform_feedback_object *) xfb_obj;
-   uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
-
-   /* Set up the up to 4 output buffers.  These are the ranges defined in the
-    * gl_transform_feedback_object.
-    */
-   for (int i = 0; i < 4; i++) {
-      struct intel_buffer_object *bufferobj =
-         intel_buffer_object(xfb_obj->Buffers[i]);
-
-      if (!bufferobj) {
-         BEGIN_BATCH(8);
-         OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2));
-         OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT));
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-         OUT_BATCH(0);
-         ADVANCE_BATCH();
-         continue;
-      }
-
-      uint32_t start = xfb_obj->Offset[i];
-      assert(start % 4 == 0);
-      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
-      struct brw_bo *bo =
-         intel_bufferobj_buffer(brw, bufferobj, start, end - start);
-      assert(end <= bo->size);
-
-      BEGIN_BATCH(8);
-      OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2));
-      OUT_BATCH(GEN8_SO_BUFFER_ENABLE | (i << SO_BUFFER_INDEX_SHIFT) |
-                GEN8_SO_BUFFER_OFFSET_WRITE_ENABLE |
-                GEN8_SO_BUFFER_OFFSET_ADDRESS_ENABLE |
-                (mocs_wb << 22));
-      OUT_RELOC64(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start);
-      OUT_BATCH(xfb_obj->Size[i] / 4 - 1);
-      OUT_RELOC64(brw_obj->offset_bo,
-                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
-                  i * sizeof(uint32_t));
-      if (brw_obj->zero_offsets)
-         OUT_BATCH(0); /* Zero out the offset and write that to offset_bo */
-      else
-         OUT_BATCH(0xFFFFFFFF); /* Use offset_bo as the "Stream Offset." */
-      ADVANCE_BATCH();
-   }
-   brw_obj->zero_offsets = false;
-}
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
index a9d87fe..8bf5bc4 100644
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -31,11 +31,13 @@
 #include "brw_util.h"
 
 #include "intel_batchbuffer.h"
+#include "intel_buffer_objects.h"
 #include "intel_fbo.h"
 
 #include "main/fbobject.h"
 #include "main/framebuffer.h"
 #include "main/stencil.h"
+#include "main/transformfeedback.h"
 
 #include "compiler/brw_defines_common.h"
 
@@ -82,6 +84,28 @@ __gen_combine_address(struct brw_context *brw, void *location,
    }
 }
 
+static inline struct brw_address
+render_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .read_domains = I915_GEM_DOMAIN_RENDER,
+            .write_domain = I915_GEM_DOMAIN_RENDER,
+   };
+}
+
+static inline struct brw_address
+instruction_bo(struct brw_bo *bo, uint32_t offset)
+{
+   return (struct brw_address) {
+            .bo = bo,
+            .offset = offset,
+            .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
+            .write_domain = I915_GEM_DOMAIN_INSTRUCTION,
+   };
+}
+
 #include "genxml/genX_pack.h"
 
 #define _brw_cmd_length(cmd) cmd ## _length
@@ -96,11 +120,12 @@ __gen_combine_address(struct brw_context *brw, void *location,
         _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),   \
         _dst = NULL)
 
-#define brw_batch_emitn(brw, cmd, n) ({                \
+#define brw_batch_emitn(brw, cmd, n, ...) ({           \
       uint32_t *_dw = emit_dwords(brw, n);             \
       struct cmd template = {                          \
          _brw_cmd_header(cmd),                         \
          .DWordLength = n - _brw_cmd_length_bias(cmd), \
+         __VA_ARGS__                                   \
       };                                               \
       _brw_cmd_pack(cmd)(brw, _dw, &template);         \
       _dw + 1; /* Array starts at dw[1] */             \
@@ -864,6 +889,316 @@ static const struct brw_tracked_state genX(sbe_state) = {
    },
    .emit = genX(upload_sbe),
 };
+
+/* ---------------------------------------------------------------------- */
+
+/**
+ * Outputs the 3DSTATE_SO_DECL_LIST command.
+ *
+ * The data output is a series of 64-bit entries, each holding one SO_DECL
+ * for every vertex stream (0-3).  Streams with fewer declarations than the
+ * longest stream are padded with zeroed SO_DECLs.
+ */
+static void
+genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
+                                  const struct brw_vue_map *vue_map)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_TRANSFORM_FEEDBACK */
+   struct gl_transform_feedback_object *xfb_obj =
+      ctx->TransformFeedback.CurrentObject;
+   const struct gl_transform_feedback_info *linked_xfb_info =
+      xfb_obj->program->sh.LinkedTransformFeedback;
+   struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
+   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+   int max_decls = 0;
+   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
+
+   memset(so_decl, 0, sizeof(so_decl));
+
+   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
+    * command feels strange -- each dword pair contains a SO_DECL per stream.
+    */
+   for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
+      int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
+      struct GENX(SO_DECL) decl = {0};
+      int varying = linked_xfb_info->Outputs[i].OutputRegister;
+      const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
+      unsigned component_mask = (1 << components) - 1;
+      unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
+      unsigned decl_buffer_slot = buffer;
+      assert(stream_id < MAX_VERTEX_STREAMS);
+
+      /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
+       * gl_Layer is stored in VARYING_SLOT_PSIZ.y
+       * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
+       */
+      if (varying == VARYING_SLOT_PSIZ) {
+         assert(components == 1);
+         component_mask <<= 3;
+      } else if (varying == VARYING_SLOT_LAYER) {
+         assert(components == 1);
+         component_mask <<= 1;
+      } else if (varying == VARYING_SLOT_VIEWPORT) {
+         assert(components == 1);
+         component_mask <<= 2;
+      } else {
+         component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset;
+      }
+
+      buffer_mask[stream_id] |= 1 << buffer;
+
+      decl.OutputBufferSlot = decl_buffer_slot;
+      if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
+         decl.RegisterIndex = vue_map->varying_to_slot[VARYING_SLOT_PSIZ];
+      } else {
+         assert(vue_map->varying_to_slot[varying] >= 0);
+         decl.RegisterIndex = vue_map->varying_to_slot[varying];
+      }
+      decl.ComponentMask = component_mask;
+
+      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
+       * array.  Instead, it simply increments DstOffset for the following
+       * input by the number of components that should be skipped.
+       *
+       * Our hardware is unusual in that it requires us to program SO_DECLs
+       * for fake "hole" components, rather than simply taking the offset
+       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
+       * program as many size = 4 holes as we can, then a final hole to
+       * accommodate the final 1, 2, or 3 remaining.
+       */
+      int skip_components =
+         linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer];
+
+      next_offset[buffer] += skip_components;
+
+      while (skip_components >= 4) {
+         struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
+         d->HoleFlag = 1;
+         d->OutputBufferSlot = decl_buffer_slot;
+         d->ComponentMask = 0xf;
+         skip_components -= 4;
+      }
+
+      if (skip_components > 0) {
+         struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
+         d->HoleFlag = 1;
+         d->OutputBufferSlot = decl_buffer_slot;
+         d->ComponentMask = (1 << skip_components) - 1;
+      }
+
+      assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);
+
+      next_offset[buffer] += components;
+
+      so_decl[stream_id][decls[stream_id]++] = decl;
+
+      if (decls[stream_id] > max_decls)
+         max_decls = decls[stream_id];
+   }
+
+   uint32_t *dw;
+   dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
+                        .StreamtoBufferSelects0 = buffer_mask[0],
+                        .StreamtoBufferSelects1 = buffer_mask[1],
+                        .StreamtoBufferSelects2 = buffer_mask[2],
+                        .StreamtoBufferSelects3 = buffer_mask[3],
+                        .NumEntries0 = decls[0],
+                        .NumEntries1 = decls[1],
+                        .NumEntries2 = decls[2],
+                        .NumEntries3 = decls[3]);
+
+   for (int i = 0; i < max_decls; i++) {
+      GENX(SO_DECL_ENTRY_pack)(
+         brw, dw + 2 + i * 2,
+         &(struct GENX(SO_DECL_ENTRY)) {
+            .Stream0Decl = so_decl[0][i],
+            .Stream1Decl = so_decl[1][i],
+            .Stream2Decl = so_decl[2][i],
+            .Stream3Decl = so_decl[3][i],
+         });
+   }
+}
+
+static void
+genX(upload_3dstate_so_buffers)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_TRANSFORM_FEEDBACK */
+   struct gl_transform_feedback_object *xfb_obj =
+      ctx->TransformFeedback.CurrentObject;
+#if GEN_GEN < 8
+   const struct gl_transform_feedback_info *linked_xfb_info =
+      xfb_obj->program->sh.LinkedTransformFeedback;
+#else
+   struct brw_transform_feedback_object *brw_obj =
+      (struct brw_transform_feedback_object *) xfb_obj;
+   uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+#endif
+
+   /* Set up the up to 4 output buffers.  These are the ranges defined in the
+    * gl_transform_feedback_object.
+    */
+   for (int i = 0; i < 4; i++) {
+      struct intel_buffer_object *bufferobj =
+         intel_buffer_object(xfb_obj->Buffers[i]);
+
+      if (!bufferobj) {
+         brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
+            sob.SOBufferIndex = i;
+         }
+         continue;
+      }
+
+      uint32_t start = xfb_obj->Offset[i];
+      assert(start % 4 == 0);
+      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
+      struct brw_bo *bo =
+         intel_bufferobj_buffer(brw, bufferobj, start, end - start);
+      assert(end <= bo->size);
+
+      brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
+         sob.SOBufferIndex = i;
+
+         sob.SurfaceBaseAddress = render_bo(bo, start);
+#if GEN_GEN < 8
+         sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
+         sob.SurfaceEndAddress = render_bo(bo, end);
+#else
+         sob.SOBufferEnable = true;
+         sob.StreamOffsetWriteEnable = true;
+         sob.StreamOutputBufferOffsetAddressEnable = true;
+         sob.SOBufferMOCS = mocs_wb;
+
+         sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
+         sob.StreamOutputBufferOffsetAddress =
+            instruction_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
+
+         if (brw_obj->zero_offsets) {
+            /* Zero out the offset and write that to offset_bo */
+            sob.StreamOffset = 0;
+         } else {
+            /* Use offset_bo as the "Stream Offset." */
+            sob.StreamOffset = 0xFFFFFFFF;
+         }
+#endif
+      }
+   }
+
+#if GEN_GEN >= 8
+   brw_obj->zero_offsets = false;
+#endif
+}
+
+static inline bool
+query_active(struct gl_query_object *q)
+{
+   return q && q->Active;
+}
+
+static void
+genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
+                               const struct brw_vue_map *vue_map)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_TRANSFORM_FEEDBACK */
+   struct gl_transform_feedback_object *xfb_obj =
+      ctx->TransformFeedback.CurrentObject;
+
+   brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
+      if (active) {
+         int urb_entry_read_offset = 0;
+         int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
+            urb_entry_read_offset;
+
+         sos.SOFunctionEnable = true;
+         sos.SOStatisticsEnable = true;
+
+         /* BRW_NEW_RASTERIZER_DISCARD */
+         if (ctx->RasterDiscard) {
+            if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
+               sos.RenderingDisable = true;
+            } else {
+               perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
+                          "query active relies on the clipper.");
+            }
+         }
+
+         /* _NEW_LIGHT */
+         if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
+            sos.ReorderMode = TRAILING;
+
+#if GEN_GEN < 8
+         sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
+         sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
+         sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
+         sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
+#else
+         const struct gl_transform_feedback_info *linked_xfb_info =
+            xfb_obj->program->sh.LinkedTransformFeedback;
+         /* Set buffer pitches; 0 means unbound. */
+         if (xfb_obj->Buffers[0])
+            sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
+         if (xfb_obj->Buffers[1])
+            sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
+         if (xfb_obj->Buffers[2])
+            sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
+         if (xfb_obj->Buffers[3])
+            sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
+#endif
+
+         /* We always read the whole vertex.  This could be reduced at some
+          * point by reading less and offsetting the register index in the
+          * SO_DECLs.
+          */
+         sos.Stream0VertexReadOffset = urb_entry_read_offset;
+         sos.Stream0VertexReadLength = urb_entry_read_length - 1;
+         sos.Stream1VertexReadOffset = urb_entry_read_offset;
+         sos.Stream1VertexReadLength = urb_entry_read_length - 1;
+         sos.Stream2VertexReadOffset = urb_entry_read_offset;
+         sos.Stream2VertexReadLength = urb_entry_read_length - 1;
+         sos.Stream3VertexReadOffset = urb_entry_read_offset;
+         sos.Stream3VertexReadLength = urb_entry_read_length - 1;
+      }
+   }
+}
+
+static void
+genX(upload_sol)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   /* BRW_NEW_TRANSFORM_FEEDBACK */
+   bool active = _mesa_is_xfb_active_and_unpaused(ctx);
+
+   if (active) {
+      genX(upload_3dstate_so_buffers)(brw);
+
+      /* BRW_NEW_VUE_MAP_GEOM_OUT */
+      genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
+   }
+
+   /* Finally, set up the SOL stage.  This command must always follow updates to
+    * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
+    * MMIO register updates (currently performed by the kernel at each batch
+    * emit).
+    */
+   genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
+}
+
+static const struct brw_tracked_state genX(sol_state) = {
+   .dirty = {
+      .mesa  = _NEW_LIGHT,
+      .brw   = BRW_NEW_BATCH |
+               BRW_NEW_BLORP |
+               BRW_NEW_RASTERIZER_DISCARD |
+               BRW_NEW_VUE_MAP_GEOM_OUT |
+               BRW_NEW_TRANSFORM_FEEDBACK,
+   },
+   .emit = genX(upload_sol),
+};
+
 #endif
 
 /* ---------------------------------------------------------------------- */
@@ -1182,7 +1517,7 @@ genX(init_atoms)(struct brw_context *brw)
       &gen7_te_state,
       &gen7_ds_state,
       &gen7_gs_state,
-      &gen7_sol_state,
+      &genX(sol_state),
       &genX(clip_state),
       &genX(sbe_state),
       &genX(sf_state),
@@ -1269,7 +1604,7 @@ genX(init_atoms)(struct brw_context *brw)
       &gen7_te_state,
       &gen8_ds_state,
       &gen8_gs_state,
-      &gen7_sol_state,
+      &genX(sol_state),
       &genX(clip_state),
       &genX(raster_state),
       &genX(sbe_state),
-- 
git-series 0.9.1