[PATCH] drm/amdgpu: fix a race in GPU reset with IB test (v2)
Christian König
ckoenig.leichtzumerken at gmail.com
Fri May 31 13:55:47 UTC 2019
Am 29.05.19 um 20:44 schrieb Alex Deucher:
> Split late_init into two functions, one (do_late_init) which
> just does the hw init, and late_init which calls do_late_init
> and schedules the IB test work. Call do_late_init in
> the GPU reset code to run the init code, but not schedule
> the IB test code. The IB test code is called directly
> in the gpu reset code so no need to run the IB tests
> in a separate work thread. If we do, we end up racing.
>
> v2: Rework late_init. Pull out the mgpu fan boost and xgmi
> pstate code into late_init so they get called in all cases.
> rename the late_init worker thread to delayed work since it's
> just the IB tests now which can happen later. Schedule the
> work at init and resume time. It's not needed at reset time
> because the IB tests are called directly.
>
> Cc: Xinhui Pan <xinhui.pan at amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
Reviewed-by: Christian König <christian.koenig at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 116 +++++++++++----------
> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +-
> 3 files changed, 61 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d355e9a09ad1..19a00282e34c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -925,7 +925,7 @@ struct amdgpu_device {
> const struct amdgpu_df_funcs *df_funcs;
>
> /* delayed work_func for deferring clockgating during resume */
> - struct delayed_work late_init_work;
> + struct delayed_work delayed_init_work;
>
> struct amdgpu_virt virt;
> /* firmware VRAM reservation */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 7a8c2201cd04..d00fd5dd307a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1869,6 +1869,43 @@ static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_power
> return 0;
> }
>
> +static int amdgpu_device_enable_mgpu_fan_boost(void)
> +{
> + struct amdgpu_gpu_instance *gpu_ins;
> + struct amdgpu_device *adev;
> + int i, ret = 0;
> +
> + mutex_lock(&mgpu_info.mutex);
> +
> + /*
> + * MGPU fan boost feature should be enabled
> + * only when there are two or more dGPUs in
> + * the system
> + */
> + if (mgpu_info.num_dgpu < 2)
> + goto out;
> +
> + for (i = 0; i < mgpu_info.num_dgpu; i++) {
> + gpu_ins = &(mgpu_info.gpu_ins[i]);
> + adev = gpu_ins->adev;
> + if (!(adev->flags & AMD_IS_APU) &&
> + !gpu_ins->mgpu_fan_enabled &&
> + adev->powerplay.pp_funcs &&
> + adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
> + ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
> + if (ret)
> + break;
> +
> + gpu_ins->mgpu_fan_enabled = 1;
> + }
> + }
> +
> +out:
> + mutex_unlock(&mgpu_info.mutex);
> +
> + return ret;
> +}
> +
> /**
> * amdgpu_device_ip_late_init - run late init for hardware IPs
> *
> @@ -1902,11 +1939,15 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
> amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
> amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
>
> - queue_delayed_work(system_wq, &adev->late_init_work,
> - msecs_to_jiffies(AMDGPU_RESUME_MS));
> -
> amdgpu_device_fill_reset_magic(adev);
>
> + r = amdgpu_device_enable_mgpu_fan_boost();
> + if (r)
> + DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
> +
> + /* set to low pstate by default */
> + amdgpu_xgmi_set_pstate(adev, 0);
> +
> return 0;
> }
>
> @@ -2005,65 +2046,20 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
> return 0;
> }
>
> -static int amdgpu_device_enable_mgpu_fan_boost(void)
> -{
> - struct amdgpu_gpu_instance *gpu_ins;
> - struct amdgpu_device *adev;
> - int i, ret = 0;
> -
> - mutex_lock(&mgpu_info.mutex);
> -
> - /*
> - * MGPU fan boost feature should be enabled
> - * only when there are two or more dGPUs in
> - * the system
> - */
> - if (mgpu_info.num_dgpu < 2)
> - goto out;
> -
> - for (i = 0; i < mgpu_info.num_dgpu; i++) {
> - gpu_ins = &(mgpu_info.gpu_ins[i]);
> - adev = gpu_ins->adev;
> - if (!(adev->flags & AMD_IS_APU) &&
> - !gpu_ins->mgpu_fan_enabled &&
> - adev->powerplay.pp_funcs &&
> - adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
> - ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
> - if (ret)
> - break;
> -
> - gpu_ins->mgpu_fan_enabled = 1;
> - }
> - }
> -
> -out:
> - mutex_unlock(&mgpu_info.mutex);
> -
> - return ret;
> -}
> -
> /**
> - * amdgpu_device_ip_late_init_func_handler - work handler for ib test
> + * amdgpu_device_delayed_init_work_handler - work handler for IB tests
> *
> * @work: work_struct.
> */
> -static void amdgpu_device_ip_late_init_func_handler(struct work_struct *work)
> +static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
> {
> struct amdgpu_device *adev =
> - container_of(work, struct amdgpu_device, late_init_work.work);
> + container_of(work, struct amdgpu_device, delayed_init_work.work);
> int r;
>
> r = amdgpu_ib_ring_tests(adev);
> if (r)
> DRM_ERROR("ib ring test failed (%d).\n", r);
> -
> - r = amdgpu_device_enable_mgpu_fan_boost();
> - if (r)
> - DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
> -
> - /*set to low pstate by default */
> - amdgpu_xgmi_set_pstate(adev, 0);
> -
> }
>
> static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
> @@ -2535,8 +2531,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> INIT_LIST_HEAD(&adev->ring_lru_list);
> spin_lock_init(&adev->ring_lru_list_lock);
>
> - INIT_DELAYED_WORK(&adev->late_init_work,
> - amdgpu_device_ip_late_init_func_handler);
> + INIT_DELAYED_WORK(&adev->delayed_init_work,
> + amdgpu_device_delayed_init_work_handler);
> INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
> amdgpu_device_delay_enable_gfx_off);
>
> @@ -2749,6 +2745,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> /* must succeed. */
> amdgpu_ras_resume(adev);
>
> + queue_delayed_work(system_wq, &adev->delayed_init_work,
> + msecs_to_jiffies(AMDGPU_RESUME_MS));
> +
> r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
> if (r) {
> dev_err(adev->dev, "Could not create pcie_replay_count");
> @@ -2796,7 +2795,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
> adev->firmware.gpu_info_fw = NULL;
> }
> adev->accel_working = false;
> - cancel_delayed_work_sync(&adev->late_init_work);
> + cancel_delayed_work_sync(&adev->delayed_init_work);
> /* free i2c buses */
> if (!amdgpu_device_has_dc_support(adev))
> amdgpu_i2c_fini(adev);
> @@ -2859,7 +2858,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
> if (fbcon)
> amdgpu_fbdev_set_suspend(adev, 1);
>
> - cancel_delayed_work_sync(&adev->late_init_work);
> + cancel_delayed_work_sync(&adev->delayed_init_work);
>
> if (!amdgpu_device_has_dc_support(adev)) {
> /* turn off display hw */
> @@ -2979,6 +2978,9 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
> if (r)
> return r;
>
> + queue_delayed_work(system_wq, &adev->delayed_init_work,
> + msecs_to_jiffies(AMDGPU_RESUME_MS));
> +
> if (!amdgpu_device_has_dc_support(adev)) {
> /* pin cursors */
> list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
> @@ -3002,7 +3004,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
> return r;
>
> /* Make sure IB tests flushed */
> - flush_delayed_work(&adev->late_init_work);
> + flush_delayed_work(&adev->delayed_init_work);
>
> /* blat the mode back in */
> if (fbcon) {
> @@ -3593,7 +3595,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>
> dev_info(adev->dev, "GPU reset begin!\n");
>
> - cancel_delayed_work_sync(&adev->late_init_work);
> + cancel_delayed_work_sync(&adev->delayed_init_work);
>
> hive = amdgpu_get_xgmi_hive(adev, false);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 87a93874d71e..be97771525e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -974,7 +974,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
> int r, pasid;
>
> /* Ensure IB tests are run on ring */
> - flush_delayed_work(&adev->late_init_work);
> + flush_delayed_work(&adev->delayed_init_work);
>
> file_priv->driver_priv = NULL;
>
More information about the amd-gfx
mailing list