[Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv
Chris Wilson
chris at chris-wilson.co.uk
Wed Mar 2 20:50:19 UTC 2022
Quoting fei.yang at intel.com (2022-03-02 18:26:57)
> From: Fei Yang <fei.yang at intel.com>
>
> GPU hangs have been observed when multiple engines write to the
> same aux_inv register at the same time. To avoid this each engine
> should only invalidate its own auxiliary table. The function
> gen12_emit_flush_xcs() currently invalidate the auxiliary table for
> all engines because the rq->engine is not necessarily the engine
> eventually carrying out the request, and potentially the engine
> could even be a virtual one (with engine->instance being -1).
> With this patch, auxiliary table invalidation is done only for the
> engine executing the request. And the mmio address for the aux_inv
> register is set after the engine instance becomes certain.
>
> Signed-off-by: Chris Wilson <chris.p.wilson at intel.com>
> Signed-off-by: Fei Yang <fei.yang at intel.com>
> ---
> drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 41 ++++---------------
> .../drm/i915/gt/intel_execlists_submission.c | 38 +++++++++++++++++
> drivers/gpu/drm/i915/i915_request.h | 2 +
> 3 files changed, 47 insertions(+), 34 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> index b1b9c3fd7bf9..af62e2bc2c9b 100644
> --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> @@ -165,30 +165,6 @@ static u32 preparser_disable(bool state)
> return MI_ARB_CHECK | 1 << 8 | state;
> }
>
> -static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
> -{
> - static const i915_reg_t vd[] = {
> - GEN12_VD0_AUX_NV,
> - GEN12_VD1_AUX_NV,
> - GEN12_VD2_AUX_NV,
> - GEN12_VD3_AUX_NV,
> - };
> -
> - static const i915_reg_t ve[] = {
> - GEN12_VE0_AUX_NV,
> - GEN12_VE1_AUX_NV,
> - };
> -
> - if (engine->class == VIDEO_DECODE_CLASS)
> - return vd[engine->instance];
> -
> - if (engine->class == VIDEO_ENHANCEMENT_CLASS)
> - return ve[engine->instance];
> -
> - GEM_BUG_ON("unknown aux_inv reg\n");
> - return INVALID_MMIO_REG;
> -}
> -
> static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
> {
> *cs++ = MI_LOAD_REGISTER_IMM(1);
> @@ -288,7 +264,7 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
> if (mode & EMIT_INVALIDATE)
> aux_inv = rq->engine->mask & ~BIT(BCS0);
> if (aux_inv)
> - cmd += 2 * hweight32(aux_inv) + 2;
> + cmd += 4;
>
> cs = intel_ring_begin(rq, cmd);
> if (IS_ERR(cs))
> @@ -319,16 +295,13 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
> *cs++ = 0; /* value */
>
> if (aux_inv) { /* hsdes: 1809175790 */
> - struct intel_engine_cs *engine;
> - unsigned int tmp;
> -
> - *cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
> - for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
> - *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
> - *cs++ = AUX_INV;
> - }
> + *cs++ = MI_LOAD_REGISTER_IMM(1);
> + rq->vd_ve_aux_inv = cs;
> + *cs++ = 0; /* address to be set at submission to HW */
> + *cs++ = AUX_INV;
> *cs++ = MI_NOOP;
> - }
> + } else
> + rq->vd_ve_aux_inv = NULL;
>
> if (mode & EMIT_INVALIDATE)
> *cs++ = preparser_disable(false);
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 1c602d4ae297..a018de6dcac5 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -1258,6 +1258,34 @@ static bool completed(const struct i915_request *rq)
> return __i915_request_is_complete(rq);
> }
>
> +static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
> +{
> + static const i915_reg_t vd[] = {
> + GEN12_VD0_AUX_NV,
> + GEN12_VD1_AUX_NV,
> + GEN12_VD2_AUX_NV,
> + GEN12_VD3_AUX_NV,
> + };
> +
> + static const i915_reg_t ve[] = {
> + GEN12_VE0_AUX_NV,
> + GEN12_VE1_AUX_NV,
> + };
> +
> + if (engine->class == VIDEO_DECODE_CLASS) {
> + GEM_BUG_ON(engine->instance >= ARRAY_SIZE(vd));
> + return vd[engine->instance];
> + }
> +
> + if (engine->class == VIDEO_ENHANCEMENT_CLASS) {
> + GEM_BUG_ON(engine->instance >= ARRAY_SIZE(ve));
> + return ve[engine->instance];
> + }
> +
> + GEM_BUG_ON("unknown aux_inv reg\n");
> + return INVALID_MMIO_REG;
> +}
> +
> static void execlists_dequeue(struct intel_engine_cs *engine)
> {
> struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -1538,6 +1566,16 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> }
>
> if (__i915_request_submit(rq)) {
> + /* hsdes: 1809175790 */
> + if ((GRAPHICS_VER(engine->i915) == 12) &&
> + rq->vd_ve_aux_inv &&
> + (engine->class == VIDEO_DECODE_CLASS ||
> + engine->class == VIDEO_ENHANCEMENT_CLASS)) {
We do not need the extra checks here; we just do as we are told. We only
tell ourselves to apply the fixup when required.
> + *rq->vd_ve_aux_inv = i915_mmio_reg_offset
Likewise, vd_ve is overspecific, aux_inv_fixup or aux_inv_wa (or
wa_aux_iv, fixup_aux_inv).
> + (aux_inv_reg(engine));
> + rq->vd_ve_aux_inv = NULL;
Move this to i915_request initialisation so that we only set aux_inv
when required, which probably explains the extra defence.
> + rq->execution_mask = engine->mask;
> + }
> if (!merge) {
> *port++ = i915_request_get(last);
> last = NULL;
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 28b1f9db5487..69de32e5e15d 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -350,6 +350,8 @@ struct i915_request {
> struct list_head link;
> unsigned long delay;
> } mock;)
> +
> + u32 *vd_ve_aux_inv;
Not at the end of the struct; that's where we put things in the dungeon.
The selftest struct should be last; I do hope no one has been putting
things at random places in the struct without considering the layout and
semantics. From the flow, this is akin to batch, capture_list; before
emitted_jiffies would be a good spot.
-Chris
More information about the Intel-gfx
mailing list