[Intel-gfx] [PATCH] drm/i915: Fix NPD in PMU during driver teardown

Summers, Stuart stuart.summers at intel.com
Thu Jun 30 19:04:18 UTC 2022


On Thu, 2022-06-30 at 11:29 +0100, Tvrtko Ursulin wrote:
> On 29/06/2022 19:46, Stuart Summers wrote:
> > In the driver teardown, we are unregistering the gt prior
> > to unregistering the PMU. This means there is a small window
> > of time in which the application can request metrics from the
> > PMU, some of which are calling into the uapi engines list,
> > while the engines are not available. In this case we can
> > see null pointer dereferences.
> > 
> > Prevent this by simply checking if the engines are present
> > when those PMU events come through. Print a debug message
> > indicating when they aren't available.
> 
> Obvious question - can we just move PMU unregister PMU to before 
> unregister GT?

Well I wanted to push the workaround asap to get feedback. I submitted
to trybot in parallel and it looks valid. I agree that's a more valid
solution, but IMO we should have these checks anyway.

At any rate, I'll resubmit with the move of the registrations.

Thanks,
Stuart

> 
> Regards,
> 
> Tvrtko
> 
> > Signed-off-by: Stuart Summers <stuart.summers at intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_pmu.c | 72 +++++++++++++++++++---------
> > -----
> >   1 file changed, 42 insertions(+), 30 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_pmu.c
> > b/drivers/gpu/drm/i915/i915_pmu.c
> > index 958b37123bf1..796a1d8e36f2 100644
> > --- a/drivers/gpu/drm/i915/i915_pmu.c
> > +++ b/drivers/gpu/drm/i915/i915_pmu.c
> > @@ -670,21 +670,28 @@ static void i915_pmu_enable(struct perf_event
> > *event)
> >   	if (is_engine_event(event)) {
> >   		u8 sample = engine_event_sample(event);
> >   		struct intel_engine_cs *engine;
> > -
> > -		engine = intel_engine_lookup_user(i915,
> > -						  engine_event_class(ev
> > ent),
> > -						  engine_event_instance
> > (event));
> > -
> > -		BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.enable_count) !=
> > -			     I915_ENGINE_SAMPLE_COUNT);
> > -		BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.sample) !=
> > -			     I915_ENGINE_SAMPLE_COUNT);
> > -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine-
> > >pmu.enable_count));
> > -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
> > -		GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
> > -
> > -		engine->pmu.enable |= BIT(sample);
> > -		engine->pmu.enable_count[sample]++;
> > +		u8 class = engine_event_class(event);
> > +		u8 instance = engine_event_instance(event);
> > +
> > +		engine = intel_engine_lookup_user(i915, class,
> > instance);
> > +		if (engine) {
> > +			BUILD_BUG_ON(ARRAY_SIZE(engine-
> > >pmu.enable_count) !=
> > +				     I915_ENGINE_SAMPLE_COUNT);
> > +			BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.sample) !=
> > +				     I915_ENGINE_SAMPLE_COUNT);
> > +			GEM_BUG_ON(sample >=
> > +				   ARRAY_SIZE(engine-
> > >pmu.enable_count));
> > +			GEM_BUG_ON(sample >=
> > +				   ARRAY_SIZE(engine->pmu.sample));
> > +			GEM_BUG_ON(engine->pmu.enable_count[sample] ==
> > ~0);
> > +
> > +			engine->pmu.enable |= BIT(sample);
> > +			engine->pmu.enable_count[sample]++;
> > +		} else {
> > +			drm_dbg(&i915->drm,
> > +				"Invalid engine event: { class:%d,
> > inst:%d }\n",
> > +				class, instance);
> > +		}
> >   	}
> >   
> >   	spin_unlock_irqrestore(&pmu->lock, flags);
> > @@ -714,21 +721,26 @@ static void i915_pmu_disable(struct
> > perf_event *event)
> >   	if (is_engine_event(event)) {
> >   		u8 sample = engine_event_sample(event);
> >   		struct intel_engine_cs *engine;
> > -
> > -		engine = intel_engine_lookup_user(i915,
> > -						  engine_event_class(ev
> > ent),
> > -						  engine_event_instance
> > (event));
> > -
> > -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine-
> > >pmu.enable_count));
> > -		GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
> > -		GEM_BUG_ON(engine->pmu.enable_count[sample] == 0);
> > -
> > -		/*
> > -		 * Decrement the reference count and clear the enabled
> > -		 * bitmask when the last listener on an event goes
> > away.
> > -		 */
> > -		if (--engine->pmu.enable_count[sample] == 0)
> > -			engine->pmu.enable &= ~BIT(sample);
> > +		u8 class = engine_event_class(event);
> > +		u8 instance = engine_event_instance(event);
> > +
> > +		engine = intel_engine_lookup_user(i915, class,
> > instance);
> > +		if (engine) {
> > +			GEM_BUG_ON(sample >= ARRAY_SIZE(engine-
> > >pmu.enable_count));
> > +			GEM_BUG_ON(sample >= ARRAY_SIZE(engine-
> > >pmu.sample));
> > +			GEM_BUG_ON(engine->pmu.enable_count[sample] ==
> > 0);
> > +
> > +			/*
> > +			 * Decrement the reference count and clear the
> > enabled
> > +			 * bitmask when the last listener on an event
> > goes away.
> > +			 */
> > +			if (--engine->pmu.enable_count[sample] == 0)
> > +				engine->pmu.enable &= ~BIT(sample);
> > +		} else {
> > +			drm_dbg(&i915->drm,
> > +				"Invalid engine event: { class:%d,
> > inst:%d }\n",
> > +				class, instance);
> > +		}
> >   	}
> >   
> >   	GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));


More information about the Intel-gfx mailing list