[PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI

Andrey Grodzovsky Andrey.Grodzovsky at amd.com
Mon Dec 2 22:05:08 UTC 2019


On 12/2/19 6:42 AM, Ma, Le wrote:
>
> From: Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>
> Sent: Saturday, November 30, 2019 12:22 AM
> To: Ma, Le <Le.Ma at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Chen, Guchun <Guchun.Chen at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
>
> On 11/28/19 4:00 AM, Ma, Le wrote:
>
>     -----Original Message-----
>     From: Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>
>     Sent: Wednesday, November 27, 2019 11:46 PM
>     To: Ma, Le <Le.Ma at amd.com>; amd-gfx at lists.freedesktop.org
>     Cc: Chen, Guchun <Guchun.Chen at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
>     Subject: Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
>
>     On 11/27/19 4:15 AM, Le Ma wrote:
>
>     > Currently each XGMI node reset wq does not run in parallel, because
>     > the same work item bound to the same cpu runs in sequence. So change
>     > to bind the xgmi_reset_work item to different cpus.
>
>     It's not the same work item, see more below
>
>     > XGMI requires all nodes to enter baco within very close proximity
>     > before any node exits baco. So schedule the xgmi_reset_work wq twice
>     > for enter/exit baco respectively.
>     >
>     > The default reset code path and methods do not change for vega20 production:
>     >   - baco reset without xgmi/ras
>     >   - psp reset with xgmi/ras
>     >
>     > To enable baco for the XGMI/RAS case, both conditions below are needed:
>     >   - amdgpu_ras_enable=2
>     >   - baco-supported smu firmware
>     >
>     > The case where PSP reset and baco reset coexist within an XGMI hive
>     > is not considered.
>     >
>     > Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
>     > Signed-off-by: Le Ma <le.ma at amd.com>
>     > ---
>     >  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
>     >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----
>     >  2 files changed, 70 insertions(+), 10 deletions(-)
>
>     > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>     > index d120fe5..08929e6 100644
>     > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>     > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>     > @@ -998,6 +998,8 @@ struct amdgpu_device {
>     >  	int				pstate;
>     >  	/* enable runtime pm on the device */
>     >  	bool				runpm;
>     > +
>     > +	bool				in_baco;
>     >  };
>     >
>     >  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
>     > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     > index bd387bb..71abfe9 100644
>     > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>     > @@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>     >  	struct amdgpu_device *adev =
>     >  		container_of(__work, struct amdgpu_device, xgmi_reset_work);
>     >
>     > -	adev->asic_reset_res =  amdgpu_asic_reset(adev);
>     > +	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>     > +		adev->asic_reset_res = (adev->in_baco == false) ?
>     > +				amdgpu_device_baco_enter(adev->ddev) :
>     > +				amdgpu_device_baco_exit(adev->ddev);
>     > +	else
>     > +		adev->asic_reset_res = amdgpu_asic_reset(adev);
>     > +
>     >  	if (adev->asic_reset_res)
>     >  		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>     >  			 adev->asic_reset_res, adev->ddev->unique);
>     > @@ -3796,6 +3802,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>     >  	struct amdgpu_device *tmp_adev = NULL;
>     >  	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
>     >  	int r = 0;
>     > +	int cpu = smp_processor_id();
>     >
>     >  	/*
>     >  	 * ASIC reset has to be done on all HGMI hive nodes ASAP
>     > @@ -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>     >  	 */
>     >  	if (need_full_reset) {
>     >  		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>     > -			/* For XGMI run all resets in parallel to speed up the process */
>     > +			/*
>     > +			 * For XGMI run all resets in parallel to speed up the
>     > +			 * process by scheduling the highpri wq on different
>     > +			 * cpus. For XGMI with baco reset, all nodes must enter
>     > +			 * baco within close proximity before anyone exit.
>     > +			 */
>     >  			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
>     > -				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
>
>     Note that tmp_adev->xgmi_reset_work (the work item) is per device in
>     the XGMI hive and not the same work item. So I don't see why you need
>     to explicitly queue them on different CPUs; they should run in
>     parallel already.
>
>     Andrey
>
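To illustrate the "per device" point: the work item is embedded in each
device structure, roughly like this (paraphrasing the relevant part of
amdgpu.h - this is the member that container_of() resolves in the reset
function above):

	struct amdgpu_device {
		...
		/* one work item per device in the hive */
		struct work_struct	xgmi_reset_work;
		...
	};

so every tmp_adev in the hive hands a distinct work_struct to
queue_work().
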
>     [Le]: It's also beyond my understanding that the 2 node reset work
>     items scheduled to the same cpu do not run in parallel. But from the
>     experiment results on my side, the 2nd work item always runs after
>     the 1st work item has finished. Based on this result, I changed to
>     queue them on different CPUs to make sure the case with more XGMI
>     nodes runs in parallel, because baco requires all nodes to enter
>     baco within very close proximity.
>
>     The experiment code is as follows for your reference. When the card0
>     worker is running, the card1 worker is not observed to run.
>
> The code below will only test that they don't run concurrently - but
> that doesn't mean they don't run on different CPUs and threads. I don't
> have an XGMI setup at hand to test this theory, but what if there is
> some locking dependency between them that serializes their execution?
> Can you just add a one line print inside amdgpu_device_xgmi_reset_func
> that prints the CPU id, thread name/id and card number?
>
> Andrey
>
> [Le]: I checked that if queue_work() is used directly several times,
> the same CPU worker thread will be used, and the worker per CPU will
> execute the items one by one. Our goal here is to make xgmi_reset_func
> run concurrently for the XGMI BACO case. That's why I schedule them on
> different CPUs to run in parallel. And I can share the XGMI system with
> you if you'd like to verify more.
>
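
One way to check that claim outside the driver would be a tiny
experiment like the sketch below (hypothetical code, not part of the
patch - the names here are made up). It queues two independent items on
system_highpri_wq from the same CPU and logs who runs them:

	static struct work_struct test_work[2];

	static void test_func(struct work_struct *work)
	{
		/* log which item, cpu and worker thread we are on */
		pr_info("test_work %d on cpu %d, worker %s (pid %d)\n",
			(int)(work - test_work), raw_smp_processor_id(),
			current->comm, task_pid_nr(current));
		msleep(100);	/* keep this worker occupied long enough to overlap */
	}

	/* somewhere in init code: */
	INIT_WORK(&test_work[0], test_func);
	INIT_WORK(&test_work[1], test_func);
	queue_work(system_highpri_wq, &test_work[0]);
	queue_work(system_highpri_wq, &test_work[1]);

If the prints come ~100ms apart from the same kworker, the items really
are serialized; if they overlap, the serialization we are chasing comes
from somewhere inside the reset path itself.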

I tried today to set up an XGMI 2P system to test this but wasn't able
to load with the XGMI bridge in place (maybe a faulty bridge) - so yes,
maybe leave me your setup before your changes (the original code) so I
can add some kernel traces that show the CPU id and thread id to check
this. It's just so weird that system_highpri_wq, which is documented to
be multi-cpu and multi-threaded, wouldn't queue those work items to
different cpus and worker threads.
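
Something like the print below inside amdgpu_device_xgmi_reset_func is
the kind of trace I have in mind (an untested sketch - the exact format
is just an illustration):

	static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
	{
		struct amdgpu_device *adev =
			container_of(__work, struct amdgpu_device, xgmi_reset_work);

		/* which card, cpu and worker thread actually runs this item */
		printk("xgmi reset: card 0x%x on cpu %d, worker %s (pid %d)\n",
		       adev->pdev->bus->number, raw_smp_processor_id(),
		       current->comm, task_pid_nr(current));
		...
	}

If the two cards report different worker threads but still serialize,
that would point to a lock in the reset path rather than to the
workqueue itself.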

Andrey


>     +atomic_t card0_in_baco = ATOMIC_INIT(0);
>     +atomic_t card1_in_baco = ATOMIC_INIT(0);
>     +
>      static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>      {
>      	struct amdgpu_device *adev =
>      		container_of(__work, struct amdgpu_device, xgmi_reset_work);
>     +	printk("lema1: card 0x%x goes into reset wq\n", adev->pdev->bus->number);
>     +	if (adev->pdev->bus->number == 0x7) {
>     +		atomic_set(&card1_in_baco, 1);
>     +		printk("lema1: card1 in baco from card1 view\n");
>     +	}
>     +
>      	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>      		adev->asic_reset_res = (adev->in_baco == false) ?
>      			amdgpu_device_baco_enter(adev->ddev) :
>     @@ -2664,6 +2673,23 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>      	if (adev->asic_reset_res)
>      		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>      			 adev->asic_reset_res, adev->ddev->unique);
>     +
>     +	if (adev->pdev->bus->number == 0x4) {
>     +		atomic_set(&card0_in_baco, 1);
>     +		printk("lema1: card0 in baco from card0 view\n");
>     +
>     +		while (true)
>     +			if (!!atomic_read(&card1_in_baco))
>     +				break;
>     +		printk("lema1: card1 in baco from card0 view\n");
>     +	}
>     +
>     +	if (adev->pdev->bus->number == 0x7) {
>     +		while (true)
>     +			if (!!atomic_read(&card0_in_baco))
>     +				break;
>     +		printk("lema1: card0 in baco from card1 view\n");
>     +	}
>
>     > +				if (!queue_work_on(cpu, system_highpri_wq,
>     > +						   &tmp_adev->xgmi_reset_work))
>     >  					r = -EALREADY;
>     > +				cpu = cpumask_next(cpu, cpu_online_mask);
>     >  			} else
>     >  				r = amdgpu_asic_reset(tmp_adev);
>     > -
>     > -			if (r) {
>     > -				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
>     > -					  r, tmp_adev->ddev->unique);
>     > +			if (r)
>     >  				break;
>     > -			}
>     >  		}
>     >
>     > -		/* For XGMI wait for all PSP resets to complete before proceed */
>     > +		/* For XGMI wait for all work to complete before proceed */
>     >  		if (!r) {
>     >  			list_for_each_entry(tmp_adev, device_list_handle,
>     >  					    gmc.xgmi.head) {
>     > @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>     >  					r = tmp_adev->asic_reset_res;
>     >  					if (r)
>     >  						break;
>     > +					if(AMD_RESET_METHOD_BACO ==
>     > +					   amdgpu_asic_reset_method(tmp_adev))
>     > +						tmp_adev->in_baco = true;
>     >  				}
>     >  			}
>     >  		}
>     > -	}
>     >
>     > +		/*
>     > +		 * For XGMI with baco reset, need exit baco phase by scheduling
>     > +		 * xgmi_reset_work one more time. PSP reset skips this phase.
>     > +		 * Not assume the situation that PSP reset and baco reset
>     > +		 * coexist within an XGMI hive.
>     > +		 */
>     > +
>     > +		if (!r) {
>     > +			cpu = smp_processor_id();
>     > +			list_for_each_entry(tmp_adev, device_list_handle,
>     > +					    gmc.xgmi.head) {
>     > +				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
>     > +				    && AMD_RESET_METHOD_BACO ==
>     > +				       amdgpu_asic_reset_method(tmp_adev)) {
>     > +					if (!queue_work_on(cpu,
>     > +							   system_highpri_wq,
>     > +							   &tmp_adev->xgmi_reset_work))
>     > +						r = -EALREADY;
>     > +					if (r)
>     > +						break;
>     > +					cpu = cpumask_next(cpu, cpu_online_mask);
>     > +				}
>     > +			}
>     > +		}
>     > +
>     > +		if (!r) {
>     > +			list_for_each_entry(tmp_adev, device_list_handle,
>     > +					    gmc.xgmi.head) {
>     > +				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
>     > +				    && AMD_RESET_METHOD_BACO ==
>     > +				       amdgpu_asic_reset_method(tmp_adev)) {
>     > +					flush_work(&tmp_adev->xgmi_reset_work);
>     > +					r = tmp_adev->asic_reset_res;
>     > +					if (r)
>     > +						break;
>     > +					tmp_adev->in_baco = false;
>     > +				}
>     > +			}
>     > +		}
>     > +
>     > +		if (r) {
>     > +			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
>     > +				  r, tmp_adev->ddev->unique);
>     > +			goto end;
>     > +		}
>     > +	}
>     >
>     >  	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>     >  		if (need_full_reset) {
>


More information about the amd-gfx mailing list