<div dir="ltr"><div><div><div>On 28 August 2013 20:40, Kenneth Graunke <span dir="ltr"><<a href="mailto:kenneth@whitecape.org" target="_blank">kenneth@whitecape.org</a>></span> wrote:<br><div class="gmail_extra"><div class="gmail_quote">
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><div class=""><div class="h5">On 08/26/2013 03:12 PM, Paul Berry wrote:<br>
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
---<br>
src/mesa/drivers/dri/i965/brw_<u></u>state.h | 41 ++++++++++<br>
src/mesa/drivers/dri/i965/<u></u>gen7_vs_state.c | 123 ++++++++++++++++++++----------<br>
2 files changed, 122 insertions(+), 42 deletions(-)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/<u></u>brw_state.h b/src/mesa/drivers/dri/i965/<u></u>brw_state.h<br>
index b54338a..efef994 100644<br>
--- a/src/mesa/drivers/dri/i965/<u></u>brw_state.h<br>
+++ b/src/mesa/drivers/dri/i965/<u></u>brw_state.h<br>
@@ -128,6 +128,38 @@ extern const struct brw_tracked_state gen7_wm_state;<br>
extern const struct brw_tracked_state haswell_cut_index;<br>
<br>
<br>
+/**<br>
+ * Parameters that differ between Gen7 VS and GS state upload commands.<br>
+ */<br>
+struct gen7_vec4_upload_params<br>
+{<br>
+ /**<br>
+ * Command used to set the binding table pointers for this stage.<br>
+ */<br>
+ unsigned binding_table_pointers_cmd;<br>
+<br>
+ /**<br>
+ * Command used to set the sampler state pointers for this stage.<br>
+ */<br>
+ unsigned sampler_state_pointers_cmd;<br>
+<br>
+ /**<br>
+ * Command used to send constants for this stage.<br>
+ */<br>
+ unsigned constant_cmd;<br>
+<br>
+ /**<br>
+ * Command used to send state for this stage.<br>
+ */<br>
+ unsigned state_cmd;<br>
+<br>
+ /**<br>
+ * Size of the state command for this stage.<br>
+ */<br>
+ unsigned state_cmd_size;<br>
+};<br>
+<br>
+<br>
/* brw_misc_state.c */<br>
void brw_upload_invariant_state(<u></u>struct brw_context *brw);<br>
uint32_t<br>
@@ -240,6 +272,15 @@ brw_vec4_upload_binding_table(<u></u>struct brw_context *brw,<br>
struct brw_vec4_context_base *vec4_ctx,<br>
const struct brw_vec4_prog_data *prog_data);<br>
<br>
+/* gen7_vs_state.c */<br>
+void<br>
+gen7_upload_vec4_state(struct brw_context *brw,<br>
+ const struct gen7_vec4_upload_params *upload_params,<br>
+ const struct brw_vec4_context_base *vec4_ctx,<br>
+ bool active, bool alt_floating_point_mode,<br>
+ const struct brw_vec4_prog_data *prog_data,<br>
+ const unsigned *stage_specific_cmd_data);<br>
+<br>
#ifdef __cplusplus<br>
}<br>
#endif<br>
diff --git a/src/mesa/drivers/dri/i965/<u></u>gen7_vs_state.c b/src/mesa/drivers/dri/i965/<u></u>gen7_vs_state.c<br>
index 30fe802..fd81112 100644<br>
--- a/src/mesa/drivers/dri/i965/<u></u>gen7_vs_state.c<br>
+++ b/src/mesa/drivers/dri/i965/<u></u>gen7_vs_state.c<br>
@@ -29,33 +29,31 @@<br>
#include "program/prog_statevars.h"<br>
#include "intel_batchbuffer.h"<br>
<br>
-static void<br>
-upload_vs_state(struct brw_context *brw)<br>
-{<br>
- struct gl_context *ctx = &brw->ctx;<br>
- const struct brw_vec4_context_base *vec4_ctx = &brw->vs.base;<br>
- uint32_t floating_point_mode = 0;<br>
- const int max_threads_shift = brw->is_haswell ?<br>
- HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;<br>
<br>
- gen7_emit_vs_workaround_flush(<u></u>brw);<br>
-<br>
- /* BRW_NEW_VS_BINDING_TABLE */<br>
+void<br>
+gen7_upload_vec4_state(struct brw_context *brw,<br>
+ const struct gen7_vec4_upload_params *upload_params,<br>
+ const struct brw_vec4_context_base *vec4_ctx,<br>
+ bool active, bool alt_floating_point_mode,<br>
+ const struct brw_vec4_prog_data *prog_data,<br>
+ const unsigned *stage_specific_cmd_data)<br>
+{<br>
+ /* BRW_NEW_*_BINDING_TABLE */<br>
BEGIN_BATCH(2);<br>
- OUT_BATCH(_3DSTATE_BINDING_<u></u>TABLE_POINTERS_VS << 16 | (2 - 2));<br>
+ OUT_BATCH(upload_params-><u></u>binding_table_pointers_cmd << 16 | (2 - 2));<br>
OUT_BATCH(vec4_ctx->bind_bo_<u></u>offset);<br>
ADVANCE_BATCH();<br>
<br>
/* CACHE_NEW_SAMPLER */<br>
BEGIN_BATCH(2);<br>
- OUT_BATCH(_3DSTATE_SAMPLER_<u></u>STATE_POINTERS_VS << 16 | (2 - 2));<br>
+ OUT_BATCH(upload_params-><u></u>sampler_state_pointers_cmd << 16 | (2 - 2));<br>
OUT_BATCH(vec4_ctx->sampler_<u></u>offset);<br>
ADVANCE_BATCH();<br>
<br>
- if (vec4_ctx->push_const_size == 0) {<br>
+ if (!active || vec4_ctx->push_const_size == 0) {<br>
/* Disable the push constant buffers. */<br>
BEGIN_BATCH(7);<br>
- OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (7 - 2));<br>
+ OUT_BATCH(upload_params-><u></u>constant_cmd << 16 | (7 - 2));<br>
OUT_BATCH(0);<br>
OUT_BATCH(0);<br>
OUT_BATCH(0);<br>
@@ -65,10 +63,10 @@ upload_vs_state(struct brw_context *brw)<br>
ADVANCE_BATCH();<br>
} else {<br>
BEGIN_BATCH(7);<br>
- OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (7 - 2));<br>
+ OUT_BATCH(upload_params-><u></u>constant_cmd << 16 | (7 - 2));<br>
OUT_BATCH(vec4_ctx->push_<u></u>const_size);<br>
OUT_BATCH(0);<br>
- /* Pointer to the VS constant buffer. Covered by the set of<br>
+ /* Pointer to the stage's constant buffer. Covered by the set of<br>
* state flags from gen6_prepare_wm_contants<br>
*/<br>
OUT_BATCH(vec4_ctx->push_<u></u>const_offset | GEN7_MOCS_L3);<br>
@@ -78,36 +76,77 @@ upload_vs_state(struct brw_context *brw)<br>
ADVANCE_BATCH();<br>
}<br>
<br>
+ BEGIN_BATCH(upload_params-><u></u>state_cmd_size);<br>
+ OUT_BATCH(upload_params-><u></u>state_cmd << 16 |<br>
+ (upload_params->state_cmd_size - 2));<br>
+ if (active) {<br>
+ OUT_BATCH(vec4_ctx->prog_<u></u>offset);<br>
+ OUT_BATCH((alt_floating_point_<u></u>mode ? GEN6_FLOATING_POINT_MODE_ALT<br>
+ : GEN6_FLOATING_POINT_MODE_IEEE_<u></u>754) |<br>
+ ((ALIGN(vec4_ctx->sampler_<u></u>count, 4)/4) <<<br>
+ GEN6_SAMPLER_COUNT_SHIFT));<br>
+<br>
+ if (prog_data->total_scratch) {<br>
+ OUT_RELOC(vec4_ctx->scratch_<u></u>bo,<br>
+ I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,<br>
+ ffs(prog_data->total_scratch) - 11);<br>
+ } else {<br>
+ OUT_BATCH(0);<br>
+ }<br>
+ } else {<br>
+ OUT_BATCH(0); /* prog_bo */<br>
+ OUT_BATCH((0 << GEN6_SAMPLER_COUNT_SHIFT) |<br>
+ (0 << GEN6_BINDING_TABLE_ENTRY_<u></u>COUNT_SHIFT));<br>
+ OUT_BATCH(0); /* scratch space base offset */<br>
+ }<br>
+ for (int i = 0; i < upload_params->state_cmd_size - 4; ++i)<br>
+ OUT_BATCH(stage_specific_cmd_<u></u>data[i]);<br>
+ ADVANCE_BATCH();<br>
+}<br>
+<br>
+<br>
+static const struct gen7_vec4_upload_params vs_upload_params = {<br>
+ .binding_table_pointers_cmd = _3DSTATE_BINDING_TABLE_<u></u>POINTERS_VS,<br>
+ .sampler_state_pointers_cmd = _3DSTATE_SAMPLER_STATE_<u></u>POINTERS_VS,<br>
+ .constant_cmd = _3DSTATE_CONSTANT_VS,<br>
+ .state_cmd = _3DSTATE_VS,<br>
+ .state_cmd_size = 6,<br>
+};<br>
+<br>
+<br>
+static void<br>
+upload_vs_state(struct brw_context *brw)<br>
+{<br>
+ struct gl_context *ctx = &brw->ctx;<br>
+ const struct brw_vec4_context_base *vec4_ctx = &brw->vs.base;<br>
+ const int max_threads_shift = brw->is_haswell ?<br>
+ HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;<br>
+ /* CACHE_NEW_VS_PROG */<br>
+ const struct brw_vec4_prog_data *prog_data = &brw->vs.prog_data->base;<br>
+<br>
+ gen7_emit_vs_workaround_flush(<u></u>brw);<br>
+<br>
/* Use ALT floating point mode for ARB vertex programs, because they<br>
* require 0^0 == 1.<br>
*/<br>
- if (ctx->Shader.<u></u>CurrentVertexProgram == NULL)<br>
- floating_point_mode = GEN6_FLOATING_POINT_MODE_ALT;<br>
-<br>
- BEGIN_BATCH(6);<br>
- OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));<br>
- OUT_BATCH(vec4_ctx->prog_<u></u>offset);<br>
- OUT_BATCH(floating_point_mode |<br>
- ((ALIGN(vec4_ctx->sampler_<u></u>count, 4)/4) <<<br>
- GEN6_SAMPLER_COUNT_SHIFT));<br>
-<br>
- if (brw->vs.prog_data->base.<u></u>total_scratch) {<br>
- OUT_RELOC(vec4_ctx->scratch_<u></u>bo,<br>
- I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,<br>
- ffs(brw->vs.prog_data->base.<u></u>total_scratch) - 11);<br>
- } else {<br>
- OUT_BATCH(0);<br>
- }<br>
+ bool alt_floating_point_mode = (ctx->Shader.<u></u>CurrentVertexProgram == NULL);<br>
<br>
- OUT_BATCH((brw->vs.prog_data-><u></u>base.dispatch_grf_start_reg <<<br>
- GEN6_VS_DISPATCH_START_GRF_<u></u>SHIFT) |<br>
- (brw->vs.prog_data->base.urb_<u></u>read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |<br>
- (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_<u></u>SHIFT));<br>
+ unsigned stage_specific_cmd_data[2];<br>
+ stage_specific_cmd_data[0] =<br>
+ (prog_data->dispatch_grf_<u></u>start_reg <<<br>
+ GEN6_VS_DISPATCH_START_GRF_<u></u>SHIFT) |<br>
+ (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |<br>
+ (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_<u></u>SHIFT);<br>
+ stage_specific_cmd_data[1] =<br>
+ ((brw->max_vs_threads - 1) << max_threads_shift) |<br>
+ GEN6_VS_STATISTICS_ENABLE |<br>
+ GEN6_VS_ENABLE;<br>
<br>
- OUT_BATCH(((brw->max_vs_<u></u>threads - 1) << max_threads_shift) |<br>
- GEN6_VS_STATISTICS_ENABLE |<br>
- GEN6_VS_ENABLE);<br>
- ADVANCE_BATCH();<br>
+ /* BRW_NEW_VS_BINDING_TABLE */<br>
+ /* CACHE_NEW_SAMPLER */<br>
+ gen7_upload_vec4_state(brw, &vs_upload_params, vec4_ctx, true /* active */,<br>
+ alt_floating_point_mode, prog_data,<br>
+ stage_specific_cmd_data);<br>
}<br>
<br>
const struct brw_tracked_state gen7_vs_state = {<br>
<br>
</blockquote>
<br></div></div>
I can't quite put my finger on why, but I really dislike this code reuse. It just seems really awkward to me.<br>
<br>
I feel like the code reuse is minimal; we're sharing really simple code here - nothing complicated. Instead we get one packet split across multiple functions in separate files, making it harder to follow.<br>
<br>
Plus, you end up with a function that awkwardly emits four separate packets...but it doesn't even know the names of the packets. You have to pass them in via a struct. It doesn't even know the length of the final packet.<br>
<br>
Personally, I'd like to split the binding table and sampler table code out of the unit state. (I had originally done them as a separate atom, but Eric suggested doing it this way.)<br>
<br>
Instead, I'd envisioned uploading the new tables when we create them: the function that creates a new VS binding table would also emit 3DSTATE_BINDING_TABLE_<u></u>POINTERS_VS to upload it. That way it's all tied neatly together in one place. (We started doing something similar with BLEND_STATE recently.) That doesn't necessarily need to happen today, but I feel like this pushes us away from it.<br>
<br>
I've even thought about splitting out the 3DSTATE_CONSTANT commands into a separate atom, so we don't need to re-upload the program if only uniforms change. Not sure whether that's a good idea yet.<br>
<br>
For now, I'd be much happier to see patches 21 and 22 replaced by a patch that emits 3DSTATE_GS in the current style with no reuse. I'm ambivalent about patch 20.<br>
</blockquote></div><br></div><div class="gmail_extra">Would you have some time tomorrow to do some in-person brainstorming about other ways we might be able to structure this patch without completely giving up on reuse?<br>
<br></div><div class="gmail_extra">I'm asking because I'm really concerned about the amount of code duplication we have in the back-end state emission logic right now. In my experience, plans of the form "duplicate code now, we can always consolidate it later", while usually well-meaning, often don't work out because people get busy and "later" becomes "never". Our current code base seems to exhibit a lot of signs of that effect--that's why a lot of state atoms have separate (but nearly identical) vs and fs versions, or separate (but usually somewhat less identical) gen4-5, gen6, and gen7 versions. Some state atoms are even split both ways.<br>
<br></div><div class="gmail_extra">The situation is going to get a lot worse when we add gen8, geometry shaders, tessellation control shaders, and tessellation evaluation shaders to the mix. I'd prefer to shift from a "duplicate code now, consolidate later" strategy to a "generalize the code first, then add the new cases", as I've done here. If we can't do that, I would at least like to have an idea of what consolidation we are planning to do in the near future so that we don't just perpetuate the code duplication status quo.<br>
</div><br><br><div class="gmail_extra">Incidentally, I can think of a few ways I could have made the code reuse a lot cleaner using C++, however I am still gun-shy after how negatively my use of C++ in blorp was received. Here are some examples.<br>
<br></div><div class="gmail_extra">In C++, we could make a base class to represent the first 4 (shared) DWORDs of the 3DSTATE_{VS,GS} packets, and derived classes to represent the remaining DWORDS, like this (to simplify the discussion, let's assume that the stage is always active):<br>
<br>class gen7_3dstate_vec4<br>{<br>public:<br> init(struct brw_context *brw, unsigned opcode, unsigned size,<br> const struct brw_vec4_context_base *vec4_ctx,<br> const struct brw_vec4_prog_data *prog_data)<br>
{<br> dw0 = opcode << 16 | (size - 2);<br> dw1 = vec4_ctx->prog_offset;<br> dw2 = (ALIGN(vec4_ctx->sampler_count, 4)/4) <<<br> GEN6_SAMPLER_COUNT_SHIFT;<br> dw3 = ffs(prog_data->total_scratch) - 11;<br>
drm_intel_bo_emit_reloc(...dw3...);<br> }<br><br>protected:<br> unsigned dw0;<br> unsigned dw1;<br> unsigned dw2;<br> unsigned dw3;<br>};<br><br>class gen7_3dstate_vs : public gen7_3dstate_vec4<br>{<br>public:<br>
init(struct brw_context *brw)<br> {<br> const struct brw_vec4_context_base *vec4_ctx = &brw->vs.base;<br> const struct brw_vec4_prog_data *prog_data = &brw->vs.prog_data->base;<br> gen7_3dstate_vec4::init(brw, _3DSTATE_VS, 6, vec4_ctx, prog_data);<br>
</div><div class="gmail_extra"> if (brw->ctx.Shader.CurrentVertexProgram == NULL)<br></div><div class="gmail_extra"> dw2 |= GEN6_FLOATING_POINT_MODE_ALT;<br></div><div class="gmail_extra"> dw4 = (prog_data->dispatch_grf_start_reg <<<br>
GEN6_VS_DISPATCH_START_GRF_SHIFT) |<br> (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |<br> (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT);<br> dw5 = ((brw->max_vs_threads - 1) << max_threads_shift) |<br>
GEN6_VS_STATISTICS_ENABLE |<br> GEN6_VS_ENABLE;<br> }<br><br>protected:<br> unsigned dw4;<br> unsigned dw5;<br>};<br><br></div><div class="gmail_extra"><br></div>Then upload_vs_state() would shrink down to:<br>
<div class="gmail_extra"><br></div><div class="gmail_extra">static void<br>upload_vs_state(struct brw_context *brw)<br>{<br></div><div class="gmail_extra"> /* For simplicity assume we only have to output the 3DSTATE_VS command */<br>
</div><div class="gmail_extra"> void *p = allocate sizeof(gen7_3dstate_vs) bytes of space in the batch;<br></div><div class="gmail_extra"> gen7_3dstate_vs *cmd = new(p) gen7_3dstate_vs();<br></div><div class="gmail_extra">
cmd->init(brw);<br></div><div class="gmail_extra"> ADVANCE_BATCH();<br></div><div class="gmail_extra">}<br></div><div class="gmail_extra"><br><br></div><div class="gmail_extra">A further improvement could be made if state atoms used a C++ dispatch mechanism like this:<br>
<br>class brw_tracked_state {<br></div><div class="gmail_extra">public:<br></div><div class="gmail_extra"> virtual void emit(struct brw_context *brw) const = 0;<br></div><div class="gmail_extra"> /* leaving out dirty bits for simplicity */<br>
</div>};<br><div class="gmail_extra"><br></div><div class="gmail_extra">In that case, I wouldn't have had to make a separate gen7_vec4_upload_params struct; the extra parameters could have simply lived in the derived class for the state atom.<br>
<br></div><div class="gmail_extra"><br></div><div class="gmail_extra">Using a C++ dispatch mechanism for brw_tracked_state would also open up more possible ways to share code between state atoms. Here's an example that leverages the gen7_3dstate_vs class above:<br>
</div><div class="gmail_extra"><br>template<class command><br>class gen7_vec4_state<br>{<br></div><div class="gmail_extra">public:<br></div><div class="gmail_extra"> virtual void emit(struct brw_context *brw) const<br>
{<br></div><div class="gmail_extra"> void *p = allocate sizeof(command) bytes of space in the batch;<br></div><div class="gmail_extra"> command *cmd = new(p) command();<br></div><div class="gmail_extra"> cmd->init(brw);<br>
</div><div class="gmail_extra"> ADVANCE_BATCH();<br> }<br></div><div class="gmail_extra">};<br><br></div><div class="gmail_extra">With the above template, we wouldn't need separate upload_vs_state and upload_gs_state functions. We would simply instantiate:<br>
</div><div class="gmail_extra"><br>gen7_vec4_state<gen7_3dstate_vs> gen7_vs_state;<br></div><div class="gmail_extra">gen7_vec4_state<gen7_3dstate_gs> gen7_gs_state;<br><br></div><div class="gmail_extra">and the compiler would generate the appropriate emit() functions for us. With suitable use of inline functions, there's no reason this technique would be any less performant than what we have now, and IMHO it would be much easier to extend and maintain.<br>
</div></div></div></div></div>