[Intel-gfx] [PATCH v2 2/2] drm/i915/pmu: Fix CPU hotplug with multiple GPUs

Chris Wilson chris at chris-wilson.co.uk
Tue Oct 20 16:19:04 UTC 2020


Quoting Tvrtko Ursulin (2020-10-20 17:11:44)
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> Since we keep a driver-global mask of online CPUs and use it to decide
> whether the PMU needs to be migrated, we need to make sure the migration
> is done for all registered PMUs (that is, for all GPUs).
> 
> To do this we need to track the current CPU for each PMU and decide
> whether to migrate by comparing the global and the per-PMU state.
> 
> At the same time, since dynamic CPU hotplug notification slots are a
> scarce resource, and since we already register a multi-instance type
> state, we can and should add multiple instances of the i915 PMU to this
> one state instead of allocating a new slot for every GPU.
> 
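For anyone following along who hasn't used the multi-instance cpuhp API,
the shape this moves to is roughly the below (state name is illustrative
and error handling is elided):

        /* Module init: claim one dynamic slot shared by all i915 instances. */
        static enum cpuhp_state cpuhp_slot = CPUHP_INVALID;

        ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
                                      "i915:online", /* illustrative name */
                                      i915_pmu_cpu_online,
                                      i915_pmu_cpu_offline);
        if (ret >= 0)
                cpuhp_slot = ret;

        /* Per-GPU register/unregister only adds/removes an instance node. */
        cpuhp_state_add_instance(cpuhp_slot, &pmu->cpuhp.node);
        cpuhp_state_remove_instance(cpuhp_slot, &pmu->cpuhp.node);

        /* Module exit: release the shared slot. */
        cpuhp_remove_multi_state(cpuhp_slot);

which is what the hunks below implement.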
> v2:
>  * Use pr_notice. (Chris)
> 
> v3:
>  * Handle a nasty interaction where unregistration triggers a false
>    CPU offline event. (Chris)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Suggested-by: Daniel Vetter <daniel.vetter at intel.com> # dynamic slot optimisation
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_pci.c |  7 +++-
>  drivers/gpu/drm/i915/i915_pmu.c | 57 +++++++++++++++++++++------------
>  drivers/gpu/drm/i915/i915_pmu.h |  6 +++-
>  3 files changed, 48 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> index 27964ac0638a..a384f51c91c1 100644
> --- a/drivers/gpu/drm/i915/i915_pci.c
> +++ b/drivers/gpu/drm/i915/i915_pci.c
> @@ -1150,9 +1150,13 @@ static int __init i915_init(void)
>                 return 0;
>         }
>  
> +       i915_pmu_init();
> +
>         err = pci_register_driver(&i915_pci_driver);
> -       if (err)
> +       if (err) {
> +               i915_pmu_exit();
>                 return err;

We could do an onion bhaji (i.e. unwind in reverse order) and call
i915_globals_exit() as well.
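Something like (sketch of the suggested unwind, untested):

        err = pci_register_driver(&i915_pci_driver);
        if (err) {
                i915_pmu_exit();
                i915_globals_exit();
                return err;
        }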

> +       }
>  
>         i915_perf_sysctl_register();
>         return 0;
> @@ -1166,6 +1170,7 @@ static void __exit i915_exit(void)
>         i915_perf_sysctl_unregister();
>         pci_unregister_driver(&i915_pci_driver);
>         i915_globals_exit();
> +       i915_pmu_exit();

pmu_exit then globals_exit to pair with i915_init?
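
i.e. roughly:

        static void __exit i915_exit(void)
        {
                i915_perf_sysctl_unregister();
                pci_unregister_driver(&i915_pci_driver);
                i915_pmu_exit();
                i915_globals_exit();
        }

so the teardown mirrors i915_init() in reverse.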

>  }
>  
>  module_init(i915_init);
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index 51ed7d0efcdc..cd786ad12be7 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -30,6 +30,7 @@
>  #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS)
>  
>  static cpumask_t i915_pmu_cpumask;
> +static unsigned int i915_pmu_target_cpu = -1;
>  
>  static u8 engine_config_sample(u64 config)
>  {
> @@ -1049,25 +1050,39 @@ static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
>  static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
>  {
>         struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
> -       unsigned int target;
> +       unsigned int target = i915_pmu_target_cpu;
>  
>         GEM_BUG_ON(!pmu->base.event_init);
>  
> +       /*
> +        * Unregistering an instance generates a CPU offline event which we must
> +        * ignore to avoid incorrectly modifying the shared i915_pmu_cpumask.
> +        */
> +       if (pmu->closed)
> +               return 0;
> +
>         if (cpumask_test_and_clear_cpu(cpu, &i915_pmu_cpumask)) {
>                 target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
> +
>                 /* Migrate events if there is a valid target */
>                 if (target < nr_cpu_ids) {
>                         cpumask_set_cpu(target, &i915_pmu_cpumask);
> -                       perf_pmu_migrate_context(&pmu->base, cpu, target);
> +                       i915_pmu_target_cpu = target;
>                 }
>         }
>  
> +       if (target < nr_cpu_ids && target != pmu->cpuhp.cpu) {
> +               perf_pmu_migrate_context(&pmu->base, cpu, target);
> +               pmu->cpuhp.cpu = target;
> +       }
> +
>         return 0;
>  }
>  
> -static int i915_pmu_register_cpuhp_state(struct i915_pmu *pmu)
> +static enum cpuhp_state cpuhp_slot = CPUHP_INVALID;
> +
> +void i915_pmu_init(void)
>  {
> -       enum cpuhp_state slot;
>         int ret;
>  
>         ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
> @@ -1075,27 +1090,29 @@ static int i915_pmu_register_cpuhp_state(struct i915_pmu *pmu)
>                                       i915_pmu_cpu_online,
>                                       i915_pmu_cpu_offline);
>         if (ret < 0)
> -               return ret;
> +               pr_notice("Failed to setup cpuhp state for i915 PMU! (%d)\n",
> +                         ret);
> +       else
> +               cpuhp_slot = ret;
> +}
>  
> -       slot = ret;
> -       ret = cpuhp_state_add_instance(slot, &pmu->cpuhp.node);
> -       if (ret) {
> -               cpuhp_remove_multi_state(slot);
> -               return ret;
> -       }
> +void i915_pmu_exit(void)
> +{
> +       if (cpuhp_slot != CPUHP_INVALID)
> +               cpuhp_remove_multi_state(cpuhp_slot);
> +}
>  
> -       pmu->cpuhp.slot = slot;
> -       return 0;
> +static int i915_pmu_register_cpuhp_state(struct i915_pmu *pmu)
> +{
> +       if (cpuhp_slot == CPUHP_INVALID)
> +               return -EINVAL;
> +
> +       return cpuhp_state_add_instance(cpuhp_slot, &pmu->cpuhp.node);
>  }
>  
>  static void i915_pmu_unregister_cpuhp_state(struct i915_pmu *pmu)
>  {
> -       struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu);
> -
> -       drm_WARN_ON(&i915->drm, pmu->cpuhp.slot == CPUHP_INVALID);
> -       drm_WARN_ON(&i915->drm, cpuhp_state_remove_instance(pmu->cpuhp.slot, &pmu->cpuhp.node));
> -       cpuhp_remove_multi_state(pmu->cpuhp.slot);
> -       pmu->cpuhp.slot = CPUHP_INVALID;
> +       cpuhp_state_remove_instance(cpuhp_slot, &pmu->cpuhp.node);
>  }
>  
>  static bool is_igp(struct drm_i915_private *i915)
> @@ -1129,7 +1146,7 @@ void i915_pmu_register(struct drm_i915_private *i915)
>         spin_lock_init(&pmu->lock);
>         hrtimer_init(&pmu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>         pmu->timer.function = i915_sample;
> -       pmu->cpuhp.slot = CPUHP_INVALID;
> +       pmu->cpuhp.cpu = -1;

To the best of my limited understanding of cpuhp,
Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
-Chris

