[PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI
Andrey Grodzovsky
Andrey.Grodzovsky at amd.com
Wed Nov 27 15:46:20 UTC 2019
On 11/27/19 4:15 AM, Le Ma wrote:
> Currently each XGMI node reset wq does not run in parallel because the same
> work item bound to the same cpu runs in sequence. So change to bind the
> xgmi_reset_work item to different cpus.
It's not the same work item, see more below
>
> XGMI requires all nodes to enter baco within very close proximity before
> any node exits baco. So schedule the xgmi_reset_work item twice, for baco
> enter and exit respectively.
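In other words, this is a two-phase, driver-driven sequence: schedule the
enter step on every node, wait for all of them to finish, and only then
schedule the exit step on every node. A minimal userspace sketch of that
shape (plain pthreads, invented node count, not amdgpu code):

/* Hypothetical illustration only: the driver thread runs the enter phase
 * on every "node", joins them all, and only then starts the exit phase,
 * mirroring "all nodes enter baco before any node exits". */
#include <pthread.h>
#include <stdio.h>

#define NUM_NODES 4

static void *baco_enter(void *arg)
{
        printf("node %ld: enter baco\n", (long)arg);
        return NULL;
}

static void *baco_exit(void *arg)
{
        printf("node %ld: exit baco\n", (long)arg);
        return NULL;
}

static void run_phase(void *(*fn)(void *))
{
        pthread_t t[NUM_NODES];
        long i;

        for (i = 0; i < NUM_NODES; i++)
                pthread_create(&t[i], NULL, fn, (void *)i);
        for (i = 0; i < NUM_NODES; i++) /* wait for every node */
                pthread_join(t[i], NULL);
}

int main(void)
{
        run_phase(baco_enter); /* phase 1: all nodes enter baco */
        run_phase(baco_exit);  /* phase 2: only now may any node exit */
        return 0;
}

Build with gcc -pthread; the joins after the first run_phase() call play
the role of the flush_work() loop between the two scheduling passes in
the patch.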
>
> The default reset code path and methods do not change for vega20 production:
> - baco reset without xgmi/ras
> - psp reset with xgmi/ras
>
> To enable baco for the XGMI/RAS case, both conditions below are needed:
> - amdgpu_ras_enable=2
> - baco-supported smu firmware
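For the first condition, assuming amdgpu_ras_enable is exposed as the
driver's ras_enable module parameter, this maps to loading the driver with
ras_enable=2 ("modprobe amdgpu ras_enable=2", or amdgpu.ras_enable=2 on the
kernel command line).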
>
> The case where PSP reset and baco reset coexist within an XGMI hive is not
> taken into consideration.
>
> Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
> Signed-off-by: Le Ma <le.ma at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----
> 2 files changed, 70 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d120fe5..08929e6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -998,6 +998,8 @@ struct amdgpu_device {
> int pstate;
> /* enable runtime pm on the device */
> bool runpm;
> +
> + bool in_baco;
> };
>
> static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index bd387bb..71abfe9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
> struct amdgpu_device *adev =
> container_of(__work, struct amdgpu_device, xgmi_reset_work);
>
> - adev->asic_reset_res = amdgpu_asic_reset(adev);
> + if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
> + adev->asic_reset_res = (adev->in_baco == false) ?
> + amdgpu_device_baco_enter(adev->ddev) :
> + amdgpu_device_baco_exit(adev->ddev);
> + else
> + adev->asic_reset_res = amdgpu_asic_reset(adev);
> +
> if (adev->asic_reset_res)
> DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
> adev->asic_reset_res, adev->ddev->unique);
> @@ -3796,6 +3802,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> struct amdgpu_device *tmp_adev = NULL;
> bool need_full_reset = *need_full_reset_arg, vram_lost = false;
> int r = 0;
> + int cpu = smp_processor_id();
>
> /*
> * ASIC reset has to be done on all HGMI hive nodes ASAP
> @@ -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> */
> if (need_full_reset) {
> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> - /* For XGMI run all resets in parallel to speed up the process */
> + /*
> + * For XGMI run all resets in parallel to speed up the
> + * process by scheduling the highpri wq on different
> + * cpus. For XGMI with baco reset, all nodes must enter
> + * baco within close proximity before anyone exit.
> + */
> if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
> - if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
Note that tmp_adev->xgmi_reset_work (the work item) is per device in the
XGMI hive and not the same work item. So I don't see why you need to
explicitly queue them on different CPUs; they should run in parallel
already.
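For anyone comparing the two variants being discussed, a minimal
self-contained sketch (not amdgpu code; the node count, names and the
wrap-around handling are invented for illustration) of per-device work
items queued on successive online CPUs with queue_work_on(), the way the
patch does it, versus plain queue_work() which targets the local CPU:

/*
 * Hypothetical, self-contained sketch (not amdgpu code): one work item
 * per fake "device", queued on successive online CPUs the way the patch
 * does with queue_work_on()/cpumask_next(). Plain queue_work() on
 * system_highpri_wq would instead target the local CPU for every item.
 */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

#define NUM_FAKE_NODES 4

static struct work_struct fake_reset_work[NUM_FAKE_NODES];

static void fake_reset_func(struct work_struct *work)
{
        /* report which fake device this is and where the worker ran */
        pr_info("fake node %td reset ran on cpu %d\n",
                work - fake_reset_work, raw_smp_processor_id());
}

static int __init fake_reset_init(void)
{
        unsigned int cpu = cpumask_first(cpu_online_mask);
        int i;

        for (i = 0; i < NUM_FAKE_NODES; i++) {
                INIT_WORK(&fake_reset_work[i], fake_reset_func);
                if (!queue_work_on(cpu, system_highpri_wq,
                                   &fake_reset_work[i]))
                        pr_warn("fake node %d already queued\n", i);
                cpu = cpumask_next(cpu, cpu_online_mask);
                if (cpu >= nr_cpu_ids) /* defensive wrap, not in the patch */
                        cpu = cpumask_first(cpu_online_mask);
        }

        /* equivalent of the flush_work() loop in the patch */
        for (i = 0; i < NUM_FAKE_NODES; i++)
                flush_work(&fake_reset_work[i]);

        return 0;
}

static void __exit fake_reset_exit(void)
{
}

module_init(fake_reset_init);
module_exit(fake_reset_exit);
MODULE_LICENSE("GPL");

Either way each "device" has its own work_struct, which is the point made
above; the question raised is only whether the items need to be spread
over different CPUs to actually run concurrently.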
Andrey
> + if (!queue_work_on(cpu, system_highpri_wq,
> + &tmp_adev->xgmi_reset_work))
> r = -EALREADY;
> + cpu = cpumask_next(cpu, cpu_online_mask);
> } else
> r = amdgpu_asic_reset(tmp_adev);
> -
> - if (r) {
> - DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
> - r, tmp_adev->ddev->unique);
> + if (r)
> break;
> - }
> }
>
> - /* For XGMI wait for all PSP resets to complete before proceed */
> + /* For XGMI wait for all work to complete before proceed */
> if (!r) {
> list_for_each_entry(tmp_adev, device_list_handle,
> gmc.xgmi.head) {
> @@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> r = tmp_adev->asic_reset_res;
> if (r)
> break;
> + if(AMD_RESET_METHOD_BACO ==
> + amdgpu_asic_reset_method(tmp_adev))
> + tmp_adev->in_baco = true;
> }
> }
> }
> - }
>
> + /*
> + * For XGMI with baco reset, need exit baco phase by scheduling
> + * xgmi_reset_work one more time. PSP reset skips this phase.
> + * Not assume the situation that PSP reset and baco reset
> + * coexist within an XGMI hive.
> + */
> +
> + if (!r) {
> + cpu = smp_processor_id();
> + list_for_each_entry(tmp_adev, device_list_handle,
> + gmc.xgmi.head) {
> + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
> + && AMD_RESET_METHOD_BACO ==
> + amdgpu_asic_reset_method(tmp_adev)) {
> + if (!queue_work_on(cpu,
> + system_highpri_wq,
> + &tmp_adev->xgmi_reset_work))
> + r = -EALREADY;
> + if (r)
> + break;
> + cpu = cpumask_next(cpu, cpu_online_mask);
> + }
> + }
> + }
> +
> + if (!r) {
> + list_for_each_entry(tmp_adev, device_list_handle,
> + gmc.xgmi.head) {
> + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
> + && AMD_RESET_METHOD_BACO ==
> + amdgpu_asic_reset_method(tmp_adev)) {
> + flush_work(&tmp_adev->xgmi_reset_work);
> + r = tmp_adev->asic_reset_res;
> + if (r)
> + break;
> + tmp_adev->in_baco = false;
> + }
> + }
> + }
> +
> + if (r) {
> + DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
> + r, tmp_adev->ddev->unique);
> + goto end;
> + }
> + }
>
> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> if (need_full_reset) {