[Intel-gfx] [PATCH] drm/i915: Fix NPD in PMU during driver teardown
Umesh Nerlige Ramappa
umesh.nerlige.ramappa@intel.com
Fri Jul 1 00:11:48 UTC 2022
On Thu, Jun 30, 2022 at 09:00:28PM +0000, Stuart Summers wrote:
>In the driver teardown, we are unregistering the gt prior
>to unregistering the PMU. This means there is a small window
>of time in which the application can request metrics from the
>PMU, some of which are calling into the uapi engines list,
>while the engines are not available. In this case we can
>see null pointer dereferences.
>
>Fix this ordering in both the driver load and unload sequences.
>
>Additionally add a check for engine presence to prevent this
>NPD in the event this ordering is accidentally reversed. Print
>a debug message indicating when they aren't available.
>
>v1: Actually address the driver load/unload ordering issue
>
>Signed-off-by: Stuart Summers <stuart.summers@intel.com>
>---
My initial thought was that this is happening because intel_gpu_top is
running in the background when i915 is unloaded. I tried a quick repro:
I don't see the unload succeed ("fatal module in use", not sure if this
was a partial unload), but when I then kill intel_gpu_top, I get an NPD.
This is in the event disable path - i915_pmu_event_stop ->
i915_pmu_disable.
It's likely that you are seeing a different path (unload) leading to the
same issue.
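For reference, the stop path in question looks roughly like this
(paraphrasing from my tree, so double-check against the exact code):

    static void i915_pmu_event_stop(struct perf_event *event, int flags)
    {
            if (flags & PERF_EF_UPDATE)
                    i915_pmu_event_read(event);
            /* The engine lookup that explodes happens in here. */
            i915_pmu_disable(event);
            /* Only marked stopped after the disable has already run. */
            event->hw.state = PERF_HES_STOPPED;
    }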
I think i915_pmu_enable/disable should be aware of the event->hw.state
and/or pmu->closed states before accessing the event. Maybe something
like:

if (event->hw.state != PERF_HES_STOPPED && is_engine_event(event)) {
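Fleshed out a bit, just to illustrate what I mean (untested sketch that
reuses the locals already in i915_pmu_disable(); whether to key this off
pmu->closed, hw.state, or only the engine lookup is exactly what I am
unsure about):

    spin_lock_irqsave(&pmu->lock, flags);

    if (!pmu->closed &&
        event->hw.state != PERF_HES_STOPPED &&
        is_engine_event(event)) {
            u8 sample = engine_event_sample(event);
            struct intel_engine_cs *engine;

            engine = intel_engine_lookup_user(i915,
                                              engine_event_class(event),
                                              engine_event_instance(event));
            /* Engines may already be gone if the gt was unregistered first. */
            if (engine && --engine->pmu.enable_count[sample] == 0)
                    engine->pmu.enable &= ~BIT(sample);
    }

    spin_unlock_irqrestore(&pmu->lock, flags);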
@Tvrtko, I am wondering if this case is exercised by igt@perf_pmu@module-unload.
I am not clear whether we should use event->hw.state or pmu->closed here,
or if/how the two are related. IMO, for this issue, the engine check alone
is good enough, so we don't really need the PMU state checks. Thoughts?
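For context on how the two relate (quoting roughly from memory, so please
verify): pmu->closed is global to the PMU and is set right at the top of
i915_pmu_unregister(), before perf_pmu_unregister():

    void i915_pmu_unregister(struct drm_i915_private *i915)
    {
            struct i915_pmu *pmu = &i915->pmu;

            /* "Disconnect" the PMU callbacks before tearing anything down. */
            pmu->closed = true;
            synchronize_rcu();
            ...
    }

event->hw.state, on the other hand, is per event and only flips to
PERF_HES_STOPPED via the start/stop callbacks, so it tells us whether that
particular event is counting, not whether the PMU is being torn down.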
Regards,
Umesh
> drivers/gpu/drm/i915/i915_driver.c | 11 ++---
> drivers/gpu/drm/i915/i915_pmu.c | 72 +++++++++++++++++-------------
> 2 files changed, 48 insertions(+), 35 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
>index deb8a8b76965..ee4dcb85d206 100644
>--- a/drivers/gpu/drm/i915/i915_driver.c
>+++ b/drivers/gpu/drm/i915/i915_driver.c
>@@ -717,7 +717,6 @@ static void i915_driver_register(struct drm_i915_private *dev_priv)
> struct drm_device *dev = &dev_priv->drm;
>
> i915_gem_driver_register(dev_priv);
>- i915_pmu_register(dev_priv);
>
> intel_vgpu_register(dev_priv);
>
>@@ -731,11 +730,12 @@ static void i915_driver_register(struct drm_i915_private *dev_priv)
> i915_debugfs_register(dev_priv);
> i915_setup_sysfs(dev_priv);
>
>+ intel_gt_driver_register(to_gt(dev_priv));
>+
> /* Depends on sysfs having been initialized */
>+ i915_pmu_register(dev_priv);
> i915_perf_register(dev_priv);
>
>- intel_gt_driver_register(to_gt(dev_priv));
>-
> intel_display_driver_register(dev_priv);
>
> intel_power_domains_enable(dev_priv);
>@@ -762,11 +762,12 @@ static void i915_driver_unregister(struct drm_i915_private *dev_priv)
>
> intel_display_driver_unregister(dev_priv);
>
>- intel_gt_driver_unregister(to_gt(dev_priv));
>-
> i915_perf_unregister(dev_priv);
>+ /* GT should be available until PMU is gone */
> i915_pmu_unregister(dev_priv);
>
>+ intel_gt_driver_unregister(to_gt(dev_priv));
>+
> i915_teardown_sysfs(dev_priv);
> drm_dev_unplug(&dev_priv->drm);
>
>diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
>index 958b37123bf1..796a1d8e36f2 100644
>--- a/drivers/gpu/drm/i915/i915_pmu.c
>+++ b/drivers/gpu/drm/i915/i915_pmu.c
>@@ -670,21 +670,28 @@ static void i915_pmu_enable(struct perf_event *event)
> if (is_engine_event(event)) {
> u8 sample = engine_event_sample(event);
> struct intel_engine_cs *engine;
>-
>- engine = intel_engine_lookup_user(i915,
>- engine_event_class(event),
>- engine_event_instance(event));
>-
>- BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.enable_count) !=
>- I915_ENGINE_SAMPLE_COUNT);
>- BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.sample) !=
>- I915_ENGINE_SAMPLE_COUNT);
>- GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.enable_count));
>- GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
>- GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
>-
>- engine->pmu.enable |= BIT(sample);
>- engine->pmu.enable_count[sample]++;
>+ u8 class = engine_event_class(event);
>+ u8 instance = engine_event_instance(event);
>+
>+ engine = intel_engine_lookup_user(i915, class, instance);
>+ if (engine) {
>+ BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.enable_count) !=
>+ I915_ENGINE_SAMPLE_COUNT);
>+ BUILD_BUG_ON(ARRAY_SIZE(engine->pmu.sample) !=
>+ I915_ENGINE_SAMPLE_COUNT);
>+ GEM_BUG_ON(sample >=
>+ ARRAY_SIZE(engine->pmu.enable_count));
>+ GEM_BUG_ON(sample >=
>+ ARRAY_SIZE(engine->pmu.sample));
>+ GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
>+
>+ engine->pmu.enable |= BIT(sample);
>+ engine->pmu.enable_count[sample]++;
>+ } else {
>+ drm_dbg(&i915->drm,
>+ "Invalid engine event: { class:%d, inst:%d }\n",
>+ class, instance);
>+ }
> }
>
> spin_unlock_irqrestore(&pmu->lock, flags);
>@@ -714,21 +721,26 @@ static void i915_pmu_disable(struct perf_event *event)
> if (is_engine_event(event)) {
> u8 sample = engine_event_sample(event);
> struct intel_engine_cs *engine;
>-
>- engine = intel_engine_lookup_user(i915,
>- engine_event_class(event),
>- engine_event_instance(event));
>-
>- GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.enable_count));
>- GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
>- GEM_BUG_ON(engine->pmu.enable_count[sample] == 0);
>-
>- /*
>- * Decrement the reference count and clear the enabled
>- * bitmask when the last listener on an event goes away.
>- */
>- if (--engine->pmu.enable_count[sample] == 0)
>- engine->pmu.enable &= ~BIT(sample);
>+ u8 class = engine_event_class(event);
>+ u8 instance = engine_event_instance(event);
>+
>+ engine = intel_engine_lookup_user(i915, class, instance);
>+ if (engine) {
>+ GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.enable_count));
>+ GEM_BUG_ON(sample >= ARRAY_SIZE(engine->pmu.sample));
>+ GEM_BUG_ON(engine->pmu.enable_count[sample] == 0);
>+
>+ /*
>+ * Decrement the reference count and clear the enabled
>+ * bitmask when the last listener on an event goes away.
>+ */
>+ if (--engine->pmu.enable_count[sample] == 0)
>+ engine->pmu.enable &= ~BIT(sample);
>+ } else {
>+ drm_dbg(&i915->drm,
>+ "Invalid engine event: { class:%d, inst:%d }\n",
>+ class, instance);
>+ }
> }
>
> GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
>--
>2.25.1
>