[Intel-gfx] [PATCH 1/2] rendercopy/bdw: Enable hw-generated binding tables
Abdiel Janulgue
abdiel.janulgue at linux.intel.com
Wed May 7 00:09:01 CEST 2014
On Wednesday, May 07, 2014 12:38:04 AM Ville Syrjälä wrote:
> On Tue, May 06, 2014 at 10:48:02PM +0300, Abdiel Janulgue wrote:
> > Use on-chip hw-binding table generator to generate binding
> > tables when the test emits SURFACE_STATES packets. The hw generates
> > these binding table packets internally so we don't have to
> > allocate space on the batchbuffer.
> >
> > Signed-off-by: Abdiel Janulgue <abdiel.janulgue at linux.intel.com>
> > ---
> >
> > lib/gen8_render.h | 13 +++++++
> > lib/intel_batchbuffer.c | 12 ++++++
> > lib/intel_batchbuffer.h | 3 ++
> > lib/intel_reg.h | 3 ++
> > lib/rendercopy_gen8.c | 97
> > ++++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 123
> > insertions(+), 5 deletions(-)
> >
> > diff --git a/lib/gen8_render.h b/lib/gen8_render.h
> > index fffc100..4d5f7ba 100644
> > --- a/lib/gen8_render.h
> > +++ b/lib/gen8_render.h
> > @@ -83,6 +83,19 @@
> >
> > /* Random shifts */
> > #define GEN8_3DSTATE_PS_MAX_THREADS_SHIFT 23
> >
> > +/* GEN8+ resource streamer */
> > +#define GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC (0x7919 << 16) /*
> > GEN8 */ +# define BINDING_TABLE_POOL_ENABLE 0x0800
> > +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_VS (0x7843 << 16) /*
> > GEN8 */ +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_GS (0x7844 <<
> > 16) /* GEN8 */ +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_HS
> > (0x7845 << 16) /* GEN8 */ +#define GEN8_3DSTATE_BINDING_TABLE_EDIT_DS
> > (0x7846 << 16) /* GEN8 */ +#define
> > GEN8_3DSTATE_BINDING_TABLE_EDIT_PS (0x7847 << 16) /* GEN8 */ +
> > +/* Toggling RS needs PIPE_CONTROL SC invalidate */
> > +#define _3DSTATE_PIPE_CONTROL ((0x3 << 29) | (3 << 27) | (2 << 24))
> > +#define PIPE_CONTROL_STATE_CACHE_INVALIDATE (1 << 2)
> > +
> >
> > /* Shamelessly ripped from mesa */
> > struct gen8_surface_state
> > {
> >
> > diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
> > index e868922..26c9b89 100644
> > --- a/lib/intel_batchbuffer.c
> > +++ b/lib/intel_batchbuffer.c
> > @@ -81,6 +81,14 @@ intel_batchbuffer_reset(struct intel_batchbuffer
> > *batch)
> >
> > memset(batch->buffer, 0, sizeof(batch->buffer));
> >
> > batch->ptr = batch->buffer;
> >
> > +
> > + if (batch->hw_bt_pool_bo != NULL) {
> > + drm_intel_bo_unreference(batch->hw_bt_pool_bo);
> > + batch->hw_bt_pool_bo = NULL;
> > + }
> > +
> > + batch->hw_bt_pool_bo = drm_intel_bo_alloc(batch->bufmgr, "hw_bt",
> > + 131072, 4096);
> >
> > }
> >
> > /**
> >
> > @@ -116,6 +124,10 @@ intel_batchbuffer_free(struct intel_batchbuffer
> > *batch)>
> > {
> >
> > drm_intel_bo_unreference(batch->bo);
> > batch->bo = NULL;
> >
> > +
> > + drm_intel_bo_unreference(batch->hw_bt_pool_bo);
> > + batch->hw_bt_pool_bo = NULL;
> > +
> >
> > free(batch);
> >
> > }
> >
> > diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
> > index 49dbcf0..2a516bc 100644
> > --- a/lib/intel_batchbuffer.h
> > +++ b/lib/intel_batchbuffer.h
> > @@ -18,6 +18,9 @@ struct intel_batchbuffer {
> >
> > uint8_t buffer[BATCH_SZ];
> > uint8_t *ptr;
> > uint8_t *state;
> >
> > +
> > + drm_intel_bo *hw_bt_pool_bo;
> > + int use_resource_streamer;
> >
> > };
> >
> > struct intel_batchbuffer *intel_batchbuffer_alloc(drm_intel_bufmgr
> > *bufmgr,
> >
> > diff --git a/lib/intel_reg.h b/lib/intel_reg.h
> > index 4b3a102..1cb9a29 100644
> > --- a/lib/intel_reg.h
> > +++ b/lib/intel_reg.h
> > @@ -2560,6 +2560,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > SOFTWARE.>
> > #define MI_BATCH_NON_SECURE (1)
> > #define MI_BATCH_NON_SECURE_I965 (1 << 8)
> >
> > +/* Resource Streamer */
> > +#define MI_RS_CONTROL (0x6 << 23)
> > +
> >
> > #define MAX_DISPLAY_PIPES 2
> >
> > typedef enum {
> >
> > diff --git a/lib/rendercopy_gen8.c b/lib/rendercopy_gen8.c
> > index 6f5a698..b0dd0f3 100644
> > --- a/lib/rendercopy_gen8.c
> > +++ b/lib/rendercopy_gen8.c
> > @@ -169,12 +169,14 @@ static void
> >
> > gen6_render_flush(struct intel_batchbuffer *batch,
> >
> > drm_intel_context *context, uint32_t batch_end)
> >
> > {
> >
> > - int ret;
> > + int ret = 0;
> > + uint32_t flags = 0;
> >
> > ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
> >
> > + flags = batch->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0;
> >
> > if (ret == 0)
> >
> > ret = drm_intel_gem_bo_context_exec(batch->bo, context,
> >
> > - batch_end, 0);
> > + batch_end, flags | I915_EXEC_RENDER);
> >
> > assert(ret == 0);
> >
> > }
> >
> > @@ -228,18 +230,83 @@ gen8_bind_buf(struct intel_batchbuffer *batch,
> > struct igt_buf *buf,>
> > return offset;
> >
> > }
> >
> > +static void
> > +gen8_hw_binding_table(struct intel_batchbuffer *batch, bool enable)
> > +{
> > + if (!enable) {
> > + OUT_BATCH(MI_RS_CONTROL | 0x0);
> > +
> > + OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC | (4 - 2));
> > + /* binding table pool base address */
> > + OUT_BATCH(0);
> > + OUT_BATCH(0);
> > + /* Upper bound */
> > + OUT_BATCH(0);
> > +
>
> WaCsStallBeforeStateCacheInvalidate says you need to emit another
> PIPE_CONTROL here with CS stall bit set. And thanks to the other
> restrictions that PIPE_CONTORL needs one of the other magic bits also
> set. Stall at scoreboard perhaps? I think that's what we set in the
> kernel usually.
It used to have the CS stall bit. But it seems to work without it. In any
case, you might be right. It's a good idea to put in back in.
>
> > + OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
> > + OUT_BATCH(PIPE_CONTROL_STATE_CACHE_INVALIDATE);
> > + OUT_BATCH(0);
> > + OUT_BATCH(0);
> > + OUT_BATCH(0);
> > + OUT_BATCH(0);
> > +
> > + return;
> > + }
> > + OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_POOL_ALLOC | (4 - 2));
> > +
> > + /* binding table pool base address */
> > + OUT_RELOC(batch->hw_bt_pool_bo, I915_GEM_DOMAIN_SAMPLER, 0,
> > + BINDING_TABLE_POOL_ENABLE);
> > + OUT_BATCH(0);
>
> I suppose it doesn't matter much which domain we use here. But the
> hardware writes to the BT pool so maybe you should set write domain
> as well?
>
> > +
> > + /* Upper bound */
> > + OUT_RELOC(batch->hw_bt_pool_bo, I915_GEM_DOMAIN_SAMPLER, 0,
> > + batch->hw_bt_pool_bo->size);
>
> AFAICS on bdw this should be just the bt pool size and not a reloc.
>
> > +
>
> Again need another CS stall PIPE_CONTROL.
>
> > + OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
> > + OUT_BATCH(PIPE_CONTROL_STATE_CACHE_INVALIDATE);
> > + OUT_BATCH(0);
> > + OUT_BATCH(0);
> > + OUT_BATCH(0);
> > + OUT_BATCH(0);
> > +}
> > +
> > +static uint32_t
> > +gen8_rs_bind_surfaces(struct intel_batchbuffer *batch,
> > + struct igt_buf *src,
> > + struct igt_buf *dst,
> > + uint32_t *surf0, uint32_t *surf1)
> > +{
> > + *surf0 = gen8_bind_buf(batch, dst, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM,
> > 1);
> > + *surf1 = gen8_bind_buf(batch, src, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM,
> > 0);
> > +
> > + return 0;
> > +}
> > +
> > +static void
> > +gen8_rs_edit_surfaces(struct intel_batchbuffer *batch,
> > + uint32_t surf0, uint32_t surf1)
> > +{
> > + OUT_BATCH(GEN8_3DSTATE_BINDING_TABLE_EDIT_PS | (4 - 2));
> > + OUT_BATCH(0x3);
>
> So I take it there's no need to clear the entries first?
I don't think it's necessary on this case. The clear is only
neded if the binding table pool is re-used from the previous
batch - which might contain stale entries. At least it causes
hungs in Mesa. In this test, we are allocating the pool same
time as the batch is allocated.
>
> > + {
> > + OUT_BATCH(0 << 16 | surf0 >> 6);
> > + OUT_BATCH(1 << 16 | surf1 >> 6);
> > + }
> > +}
> > +
> >
> > static uint32_t
> > gen8_bind_surfaces(struct intel_batchbuffer *batch,
> >
> > struct igt_buf *src,
> > struct igt_buf *dst)
> >
> > {
> >
> > - uint32_t *binding_table, offset;
> > + uint32_t offset = 0;
> > + uint32_t *binding_table;
> >
> > binding_table = batch_alloc(batch, 8, 32);
> > offset = batch_offset(batch, binding_table);
> > annotation_add_state(&aub_annotations, AUB_TRACE_BINDING_TABLE,
> >
> > offset, 8);
> >
> > -
> >
> > binding_table[0] =
> >
> > gen8_bind_buf(batch, dst, GEN6_SURFACEFORMAT_B8G8R8A8_UNORM, 1);
> >
> > binding_table[1] =
> >
> > @@ -512,6 +579,9 @@ gen7_emit_push_constants(struct intel_batchbuffer
> > *batch) {>
> > static void
> > gen8_emit_state_base_address(struct intel_batchbuffer *batch) {
> >
> > + if (batch->use_resource_streamer)
> > + OUT_BATCH(MI_RS_CONTROL | 0x0);
> > +
> >
> > OUT_BATCH(GEN6_STATE_BASE_ADDRESS | (16 - 2));
> >
> > /* general */
> >
> > @@ -546,6 +616,9 @@ gen8_emit_state_base_address(struct intel_batchbuffer
> > *batch) {>
> > OUT_BATCH(0xfffff000 | 1);
> > /* intruction buffer size */
> > OUT_BATCH(1 << 12 | 1);
> >
> > +
> > + if (batch->use_resource_streamer)
> > + OUT_BATCH(MI_RS_CONTROL | 0x1);
> >
> > }
> >
> > static void
> >
> > @@ -918,6 +991,7 @@ void gen8_render_copyfunc(struct intel_batchbuffer
> > *batch,>
> > uint32_t scissor_state;
> > uint32_t vertex_buffer;
> > uint32_t batch_end;
> >
> > + uint32_t surf0 = 0, surf1 = 1;
> >
> > intel_batchbuffer_flush_with_context(batch, context);
> >
> > @@ -927,7 +1001,11 @@ void gen8_render_copyfunc(struct intel_batchbuffer
> > *batch,>
> > annotation_init(&aub_annotations);
> >
> > - ps_binding_table = gen8_bind_surfaces(batch, src, dst);
> > + if (batch->use_resource_streamer)
> > + ps_binding_table = gen8_rs_bind_surfaces(batch, src, dst,
> > + &surf0, &surf1);
> > + else
> > + ps_binding_table = gen8_bind_surfaces(batch, src, dst);
> >
> > ps_sampler_state = gen8_create_sampler(batch);
> > ps_kernel_off = gen8_fill_ps(batch, ps_kernel, sizeof(ps_kernel));
> > vertex_buffer = gen7_fill_vertex_buffer_data(batch, src,
> >
> > @@ -955,6 +1033,9 @@ void gen8_render_copyfunc(struct intel_batchbuffer
> > *batch,>
> > gen8_emit_state_base_address(batch);
> >
> > + if (batch->use_resource_streamer)
> > + gen8_hw_binding_table(batch, true);
> > +
> >
> > OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC);
> > OUT_BATCH(viewport.cc_state);
> > OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
> >
> > @@ -978,6 +1059,9 @@ void gen8_render_copyfunc(struct intel_batchbuffer
> > *batch,>
> > gen8_emit_sf(batch);
> >
> > + if (batch->use_resource_streamer)
> > + gen8_rs_edit_surfaces(batch, surf0, surf1);
> > +
> >
> > OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS);
> > OUT_BATCH(ps_binding_table);
> >
> > @@ -1001,6 +1085,9 @@ void gen8_render_copyfunc(struct intel_batchbuffer
> > *batch,>
> > gen8_emit_vf_topology(batch);
> > gen8_emit_primitive(batch, vertex_buffer);
> >
> > + if (batch->use_resource_streamer)
> > + gen8_hw_binding_table(batch, false);
> > +
> >
> > OUT_BATCH(MI_BATCH_BUFFER_END);
> >
> > batch_end = batch_align(batch, 8);
Thanks for the review. I'll update v2 soon to address the comments
-abdiel
More information about the Intel-gfx
mailing list