[Intel-gfx] [PATCH v2 2/2] drm/i915/gen8: Apply Per-context workarounds using W/A batch buffers

Mon Mar 2 09:43:10 PST 2015

On Mon, Mar 02, 2015 at 11:07:20AM +0000, Arun Siluvery wrote:
> Some of the workarounds are to be applied during context save but before
> restore and some at the end of context save/restore but before executing
> the instructions in the ring. Workaround batch buffers are created for
> this purpose as they cannot be applied using normal means. HW executes
> them at specific stages during context save/restore.
> 
> In this method we initialize batch buffer with w/a commands and its address
> is supplied using context offset pointers when a context is initialized.
> 
> This patch introduces indirect and per-context batch buffers using which
> following workarounds are applied. These are required to fix issues
> observed with preemption related workloads.
> 
> In Indirect context w/a batch buffer,
> +WaDisableCtxRestoreArbitration
> +WaFlushCoherentL3CacheLinesAtContextSwitch
> +WaClearSlmSpaceAtContextSwitch
> 
> In Per context w/a batch buffer,
> +WaDisableCtxRestoreArbitration
> +WaRsRestoreWithPerCtxtBb
> 
> v2: Use GTT address type for all privileged instructions, update as
> per dynamic pinning changes, minor simplifications, rename variables
> as follows to keep lines under 80 chars and rebase.
> s/indirect_ctx_wa_ringbuf/indirect_ctx_wa_bb
> s/per_ctx_wa_ringbuf/per_ctx_wa_bb
> 
> v3: Modify WA BB initialization to Gen specific.
> 
> v4: s/PIPE_CONTROL_FLUSH_RO_CACHES/PIPE_CONTROL_FLUSH_L3 (Ville)
> This patches modifies definitions of MI_LOAD_REGISTER_MEM and
> MI_LOAD_REGISTER_REG; Add GEN8 specific defines for these instructions
> so as to not break any future users of existing definitions (Michel)
> 
> Change-Id: I0cedb536b7f6d9f10ba9e81ba625848e7bab603c
> Signed-off-by: Rafael Barbalho <rafael.barbalho at intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |   3 +
>  drivers/gpu/drm/i915/i915_reg.h         |  28 ++++
>  drivers/gpu/drm/i915/intel_lrc.c        | 231 +++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |   3 +
>  4 files changed, 258 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index d42040f..86cdb52 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -774,6 +774,9 @@ struct intel_context {
>  
>  	/* Execlists */
>  	bool rcs_initialized;
> +	struct intel_ringbuffer *indirect_ctx_wa_bb;
> +	struct intel_ringbuffer *per_ctx_wa_bb;

Why is this per-ctx and not per-engine? Also your patch splitting doesn't
make that much sense: Patch 1 only adds a static function without any
users (resulting in gcc being unhappy). Imo a better split would be:
- wire up wa batch/ring allocation/freing functions
- wire up the changes to the lrc initial reg state code
- one patch per w/a entry you add

Cheers, Daniel

> +
>  	struct {
>  		struct drm_i915_gem_object *state;
>  		struct intel_ringbuffer *ringbuf;
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 55143cb..3048494 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -347,6 +347,26 @@
>  #define   MI_INVALIDATE_BSD		(1<<7)
>  #define   MI_FLUSH_DW_USE_GTT		(1<<2)
>  #define   MI_FLUSH_DW_USE_PPGTT		(0<<2)
> +#define MI_ATOMIC(len)	MI_INSTR(0x2F, (len-2))
> +#define   MI_ATOMIC_MEMORY_TYPE_GGTT	(1<<22)
> +#define   MI_ATOMIC_INLINE_DATA		(1<<18)
> +#define   MI_ATOMIC_CS_STALL		(1<<17)
> +#define   MI_ATOMIC_RETURN_DATA_CTL	(1<<16)
> +#define MI_ATOMIC_OP_MASK(op)  ((op) << 8)
> +#define MI_ATOMIC_AND	MI_ATOMIC_OP_MASK(0x01)
> +#define MI_ATOMIC_OR	MI_ATOMIC_OP_MASK(0x02)
> +#define MI_ATOMIC_XOR	MI_ATOMIC_OP_MASK(0x03)
> +#define MI_ATOMIC_MOVE	MI_ATOMIC_OP_MASK(0x04)
> +#define MI_ATOMIC_INC	MI_ATOMIC_OP_MASK(0x05)
> +#define MI_ATOMIC_DEC	MI_ATOMIC_OP_MASK(0x06)
> +#define MI_ATOMIC_ADD	MI_ATOMIC_OP_MASK(0x07)
> +#define MI_ATOMIC_SUB	MI_ATOMIC_OP_MASK(0x08)
> +#define MI_ATOMIC_RSUB	MI_ATOMIC_OP_MASK(0x09)
> +#define MI_ATOMIC_IMAX	MI_ATOMIC_OP_MASK(0x0A)
> +#define MI_ATOMIC_IMIN	MI_ATOMIC_OP_MASK(0x0B)
> +#define MI_ATOMIC_UMAX	MI_ATOMIC_OP_MASK(0x0C)
> +#define MI_ATOMIC_UMIN	MI_ATOMIC_OP_MASK(0x0D)
> +
>  #define MI_BATCH_BUFFER		MI_INSTR(0x30, 1)
>  #define   MI_BATCH_NON_SECURE		(1)
>  /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
> @@ -410,6 +430,7 @@
>  #define   DISPLAY_PLANE_A           (0<<20)
>  #define   DISPLAY_PLANE_B           (1<<20)
>  #define GFX_OP_PIPE_CONTROL(len)	((0x3<<29)|(0x3<<27)|(0x2<<24)|(len-2))
> +#define   PIPE_CONTROL_FLUSH_L3				(1<<27)
>  #define   PIPE_CONTROL_GLOBAL_GTT_IVB			(1<<24) /* gen7+ */
>  #define   PIPE_CONTROL_MMIO_WRITE			(1<<23)
>  #define   PIPE_CONTROL_STORE_DATA_INDEX			(1<<21)
> @@ -426,6 +447,7 @@
>  #define   PIPE_CONTROL_INDIRECT_STATE_DISABLE		(1<<9)
>  #define   PIPE_CONTROL_NOTIFY				(1<<8)
>  #define   PIPE_CONTROL_FLUSH_ENABLE			(1<<7) /* gen7+ */
> +#define   PIPE_CONTROL_DC_FLUSH_ENABLE			(1<<5)
>  #define   PIPE_CONTROL_VF_CACHE_INVALIDATE		(1<<4)
>  #define   PIPE_CONTROL_CONST_CACHE_INVALIDATE		(1<<3)
>  #define   PIPE_CONTROL_STATE_CACHE_INVALIDATE		(1<<2)
> @@ -451,6 +473,10 @@
>  #define   MI_REPORT_PERF_COUNT_GGTT (1<<0)
>  #define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 0)
>  #define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 0)
> +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
> +#define MI_LRM_USE_GLOBAL_GTT (1<<22)
> +#define MI_LRM_ASYNC_MODE_ENABLE (1<<21)
> +#define MI_LOAD_REGISTER_REG_GEN8 MI_INSTR(0x2A, 1)
>  #define MI_RS_STORE_DATA_IMM    MI_INSTR(0x2B, 0)
>  #define MI_LOAD_URB_MEM         MI_INSTR(0x2C, 0)
>  #define MI_STORE_URB_MEM        MI_INSTR(0x2D, 0)
> @@ -1520,6 +1546,8 @@ enum skl_disp_power_wells {
>  #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE	(1 << 12)
>  #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE	(1<<10)
>  
> +#define GEN8_RS_PREEMPT_STATUS		0x215C
> +
>  /* Fuse readout registers for GT */
>  #define CHV_FUSE_GT			(VLV_DISPLAY_BASE + 0x2168)
>  #define   CHV_FGT_EU_DIS_SS0_R0_SHIFT	16
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ea37a56..326539b 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -202,6 +202,7 @@ enum {
>  	FAULT_AND_CONTINUE /* Unsupported */
>  };
>  #define GEN8_CTX_ID_SHIFT 32
> +#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
>  
>  static int intel_lr_context_pin(struct intel_engine_cs *ring,
>  		struct intel_context *ctx);
> @@ -1139,6 +1140,169 @@ create_wa_bb(struct intel_engine_cs *ring, uint32_t bb_size)
>  	return ringbuf;
>  }
>  
> +static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
> +				    struct intel_context *ctx)
> +{
> +	unsigned long flags = 0;
> +	u32 scratch_addr;
> +	struct intel_ringbuffer *ringbuf = NULL;
> +
> +	if (ring->scratch.obj == NULL) {
> +		DRM_ERROR("scratch page not allocated for %s\n", ring->name);
> +		return -EINVAL;
> +	}
> +
> +	ringbuf = create_wa_bb(ring, PAGE_SIZE);
> +	if (!ringbuf)
> +		return -ENOMEM;
> +
> +	ctx->indirect_ctx_wa_bb = ringbuf;
> +
> +	/* WaDisableCtxRestoreArbitration:bdw,chv */
> +	intel_logical_ring_emit(ringbuf, MI_ARB_ON_OFF | MI_ARB_DISABLE);
> +
> +	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw,chv */
> +	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
> +	intel_logical_ring_emit(ringbuf, PIPE_CONTROL_DC_FLUSH_ENABLE);
> +	intel_logical_ring_emit(ringbuf, 0);
> +	intel_logical_ring_emit(ringbuf, 0);
> +	intel_logical_ring_emit(ringbuf, 0);
> +	intel_logical_ring_emit(ringbuf, 0);
> +
> +	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
> +	flags = PIPE_CONTROL_FLUSH_L3 |
> +		PIPE_CONTROL_GLOBAL_GTT_IVB |
> +		PIPE_CONTROL_CS_STALL |
> +		PIPE_CONTROL_QW_WRITE;
> +
> +	/* Actual scratch location is at 128 bytes offset */
> +	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
> +	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
> +
> +	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
> +	intel_logical_ring_emit(ringbuf, flags);
> +	intel_logical_ring_emit(ringbuf, scratch_addr);
> +	intel_logical_ring_emit(ringbuf, 0);
> +	intel_logical_ring_emit(ringbuf, 0);
> +	intel_logical_ring_emit(ringbuf, 0);
> +
> +	/* Padding to align with cache line */
> +	intel_logical_ring_emit(ringbuf, 0);
> +	intel_logical_ring_emit(ringbuf, 0);
> +	intel_logical_ring_emit(ringbuf, 0);
> +
> +	/*
> +	 * No MI_BATCH_BUFFER_END is required in Indirect ctx BB because
> +	 * execution depends on the size defined in CTX_RCS_INDIRECT_CTX
> +	 */
> +
> +	return 0;
> +}
> +
> +static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
> +			       struct intel_context *ctx)
> +{
> +	unsigned long flags = 0;
> +	u32 scratch_addr;
> +	struct intel_ringbuffer *ringbuf = NULL;
> +
> +	if (ring->scratch.obj == NULL) {
> +		DRM_ERROR("scratch page not allocated for %s\n", ring->name);
> +		return -EINVAL;
> +	}
> +
> +	ringbuf = create_wa_bb(ring, PAGE_SIZE);
> +	if (!ringbuf)
> +		return -ENOMEM;
> +
> +	ctx->per_ctx_wa_bb = ringbuf;
> +
> +	/* Actual scratch location is at 128 bytes offset */
> +	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
> +	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
> +
> +	/* WaDisableCtxRestoreArbitration:bdw,chv */
> +	intel_logical_ring_emit(ringbuf, MI_ARB_ON_OFF | MI_ARB_ENABLE);
> +
> +	/*
> +	 * As per Bspec, to workaround a known HW issue, SW must perform the
> +	 * below programming sequence prior to programming MI_BATCH_BUFFER_END.
> +	 *
> +	 * This is only applicable for Gen8.
> +	 */
> +
> +	/* WaRsRestoreWithPerCtxtBb:bdw,chv */
> +	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
> +	intel_logical_ring_emit(ringbuf, INSTPM);
> +	intel_logical_ring_emit(ringbuf,
> +				_MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING));
> +
> +	flags = MI_ATOMIC_MEMORY_TYPE_GGTT |
> +		MI_ATOMIC_INLINE_DATA |
> +		MI_ATOMIC_CS_STALL |
> +		MI_ATOMIC_RETURN_DATA_CTL |
> +		MI_ATOMIC_MOVE;
> +
> +	intel_logical_ring_emit(ringbuf, MI_ATOMIC(5) | flags);
> +	intel_logical_ring_emit(ringbuf, scratch_addr);
> +	intel_logical_ring_emit(ringbuf, 0);
> +	intel_logical_ring_emit(ringbuf,
> +				_MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
> +	intel_logical_ring_emit(ringbuf,
> +				_MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
> +
> +	/*
> +	 * Bspec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
> +	 * MI_BATCH_BUFFER_END instructions in this sequence need to be
> +	 * in the same cacheline.
> +	 */
> +	while (((unsigned long) ringbuf->tail % CACHELINE_BYTES) != 0)
> +		intel_logical_ring_emit(ringbuf, MI_NOOP);
> +
> +	intel_logical_ring_emit(ringbuf,
> +				MI_LOAD_REGISTER_MEM_GEN8 |
> +				MI_LRM_USE_GLOBAL_GTT |
> +				MI_LRM_ASYNC_MODE_ENABLE);
> +	intel_logical_ring_emit(ringbuf, INSTPM);
> +	intel_logical_ring_emit(ringbuf, 0);
> +	intel_logical_ring_emit(ringbuf, scratch_addr);
> +
> +	/*
> +	 * Bspec says there should not be any commands programmed
> +	 * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so
> +	 * do not add any new commands
> +	 */
> +	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_REG_GEN8);
> +	intel_logical_ring_emit(ringbuf, GEN8_RS_PREEMPT_STATUS);
> +	intel_logical_ring_emit(ringbuf, GEN8_RS_PREEMPT_STATUS);
> +
> +	intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_END);
> +
> +	return 0;
> +}
> +
> +static int intel_init_workaround_bb(struct intel_engine_cs *ring,
> +				    struct intel_context *ctx)
> +{
> +	int ret;
> +	struct drm_device *dev = ring->dev;
> +
> +	WARN_ON(ring->id != RCS);
> +
> +	if (IS_GEN8(dev)) {
> +		ret = gen8_init_indirectctx_bb(ring, ctx);
> +		if (ret)
> +			return ret;
> +
> +		ret = gen8_init_perctx_bb(ring, ctx);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +
> +}
> +
>  static int gen8_init_common_ring(struct intel_engine_cs *ring)
>  {
>  	struct drm_device *dev = ring->dev;
> @@ -1490,6 +1654,7 @@ static int logical_render_ring_init(struct drm_device *dev)
>  	else
>  		ring->init_hw = gen8_init_render_ring;
>  	ring->init_context = gen8_init_rcs_context;
> +	ring->init_context_bb = intel_init_workaround_bb;
>  	ring->cleanup = intel_fini_pipe_control;
>  	ring->get_seqno = gen8_get_seqno;
>  	ring->set_seqno = gen8_set_seqno;
> @@ -1500,11 +1665,16 @@ static int logical_render_ring_init(struct drm_device *dev)
>  	ring->emit_bb_start = gen8_emit_bb_start;
>  
>  	ring->dev = dev;
> +
> +	ret = intel_init_pipe_control(ring);
> +	if (ret)
> +		return ret;
> +
>  	ret = logical_ring_init(dev, ring);
>  	if (ret)
>  		return ret;
>  
> -	return intel_init_pipe_control(ring);
> +	return 0;
>  }
>  
>  static int logical_bsd_ring_init(struct drm_device *dev)
> @@ -1784,15 +1954,29 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
>  	reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118;
>  	reg_state[CTX_SECOND_BB_STATE+1] = 0;
>  	if (ring->id == RCS) {
> -		/* TODO: according to BSpec, the register state context
> -		 * for CHV does not have these. OTOH, these registers do
> -		 * exist in CHV. I'm waiting for a clarification */
>  		reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0;
> -		reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
> +
> +		if (ctx->per_ctx_wa_bb)
> +			reg_state[CTX_BB_PER_CTX_PTR + 1] =
> +				i915_gem_obj_ggtt_offset(
> +					ctx->per_ctx_wa_bb->obj) | 0x01;
> +		else
> +			reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
> +
>  		reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4;
> -		reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
>  		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8;
> -		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
> +
> +		if (ctx->indirect_ctx_wa_bb) {
> +			reg_state[CTX_RCS_INDIRECT_CTX + 1] =
> +				i915_gem_obj_ggtt_offset(
> +				ctx->indirect_ctx_wa_bb->obj) | 0x01;
> +
> +			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET + 1] =
> +				CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
> +		} else {
> +			reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
> +			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
> +		}
>  	}
>  	reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9);
>  	reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
> @@ -1859,6 +2043,18 @@ void intel_lr_context_free(struct intel_context *ctx)
>  			drm_gem_object_unreference(&ctx_obj->base);
>  		}
>  	}
> +
> +	if (ctx->indirect_ctx_wa_bb) {
> +		intel_unpin_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
> +		intel_destroy_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
> +		kfree(ctx->indirect_ctx_wa_bb);
> +	}
> +
> +	if (ctx->per_ctx_wa_bb) {
> +		intel_unpin_ringbuffer_obj(ctx->per_ctx_wa_bb);
> +		intel_destroy_ringbuffer_obj(ctx->per_ctx_wa_bb);
> +		kfree(ctx->per_ctx_wa_bb);
> +	}
>  }
>  
>  static uint32_t get_lr_context_size(struct intel_engine_cs *ring)
> @@ -1985,6 +2181,16 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>  
>  	}
>  
> +	if (ring->id == RCS && !ctx->rcs_initialized) {
> +		if (ring->init_context_bb) {
> +			ret = ring->init_context_bb(ring, ctx);
> +			if (ret) {
> +				DRM_ERROR("ring init context bb: %d\n", ret);
> +				goto error;
> +			}
> +		}
> +	}
> +
>  	ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf);
>  	if (ret) {
>  		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
> @@ -2013,6 +2219,17 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>  	return 0;
>  
>  error:
> +	if (ctx->indirect_ctx_wa_bb) {
> +		intel_unpin_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
> +		intel_destroy_ringbuffer_obj(ctx->indirect_ctx_wa_bb);
> +		kfree(ctx->indirect_ctx_wa_bb);
> +	}
> +	if (ctx->per_ctx_wa_bb) {
> +		intel_unpin_ringbuffer_obj(ctx->per_ctx_wa_bb);
> +		intel_destroy_ringbuffer_obj(ctx->per_ctx_wa_bb);
> +		kfree(ctx->per_ctx_wa_bb);
> +	}
> +
>  	if (is_global_default_ctx)
>  		intel_unpin_ringbuffer_obj(ringbuf);
>  error_destroy_rbuf:
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 39183fc..839d698 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -146,6 +146,9 @@ struct  intel_engine_cs {
>  	int		(*init_context)(struct intel_engine_cs *ring,
>  					struct intel_context *ctx);
>  
> +	int		(*init_context_bb)(struct intel_engine_cs *ring,
> +					   struct intel_context *ctx);
> +
>  	void		(*write_tail)(struct intel_engine_cs *ring,
>  				      u32 value);
>  	int __must_check (*flush)(struct intel_engine_cs *ring,
> -- 
> 2.3.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch