[PATCH 1/1] drm/amdgpu: Use device wedged event
kernel test robot
lkp at intel.com
Fri Dec 20 14:06:33 UTC 2024
Hi André,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on drm-intel/for-linux-next drm-intel/for-linux-next-fixes drm-tip/drm-tip v6.13-rc3 next-20241220]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Andr-Almeida/drm-amdgpu-Use-device-wedged-event/20241213-031134
base: linus/master
patch link: https://lore.kernel.org/r/20241212190909.28559-2-andrealmeid%40igalia.com
patch subject: [PATCH 1/1] drm/amdgpu: Use device wedged event
config: s390-randconfig-001-20241220 (https://download.01.org/0day-ci/archive/20241220/202412202144.SalviKDh-lkp@intel.com/config)
compiler: s390-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241220/202412202144.SalviKDh-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp at intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202412202144.SalviKDh-lkp@intel.com/
All errors (new ones prefixed by >>):
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c: In function 'amdgpu_device_gpu_recover':
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:6061:9: error: implicit declaration of function 'drm_dev_wedged_event' [-Wimplicit-function-declaration]
6061 | drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
| ^~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:6061:49: error: 'DRM_WEDGE_RECOVERY_NONE' undeclared (first use in this function); did you mean 'DRM_MODE_ENCODER_NONE'?
6061 | drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
| ^~~~~~~~~~~~~~~~~~~~~~~
| DRM_MODE_ENCODER_NONE
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:6061:49: note: each undeclared identifier is reported only once for each function it appears in
vim +/drm_dev_wedged_event +6061 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
5794
5795 /**
5796 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5797 *
5798 * @adev: amdgpu_device pointer
5799 * @job: which job trigger hang
5800 * @reset_context: amdgpu reset context pointer
5801 *
5802 * Attempt to reset the GPU if it has hung (all asics).
5803 * Attempt to do soft-reset or full-reset and reinitialize Asic
5804 * Returns 0 for success or an error on failure.
5805 */
5806
5807 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5808 struct amdgpu_job *job,
5809 struct amdgpu_reset_context *reset_context)
5810 {
5811 struct list_head device_list, *device_list_handle = NULL;
5812 bool job_signaled = false;
5813 struct amdgpu_hive_info *hive = NULL;
5814 struct amdgpu_device *tmp_adev = NULL;
5815 int i, r = 0;
5816 bool need_emergency_restart = false;
5817 bool audio_suspended = false;
5818 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5819
5820 /*
5821 * Special case: RAS triggered and full reset isn't supported
5822 */
5823 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5824
5825 /*
5826 * Flush RAM to disk so that after reboot
5827 * the user can read log and see why the system rebooted.
5828 */
5829 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5830 amdgpu_ras_get_context(adev)->reboot) {
5831 DRM_WARN("Emergency reboot.");
5832
5833 ksys_sync_helper();
5834 emergency_restart();
5835 }
5836
5837 dev_info(adev->dev, "GPU %s begin!\n",
5838 need_emergency_restart ? "jobs stop":"reset");
5839
5840 if (!amdgpu_sriov_vf(adev))
5841 hive = amdgpu_get_xgmi_hive(adev);
5842 if (hive)
5843 mutex_lock(&hive->hive_lock);
5844
5845 reset_context->job = job;
5846 reset_context->hive = hive;
5847 /*
5848 * Build list of devices to reset.
5849 * In case we are in XGMI hive mode, resort the device list
5850 * to put adev in the 1st position.
5851 */
5852 INIT_LIST_HEAD(&device_list);
5853 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5854 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5855 list_add_tail(&tmp_adev->reset_list, &device_list);
5856 if (adev->shutdown)
5857 tmp_adev->shutdown = true;
5858 }
5859 if (!list_is_first(&adev->reset_list, &device_list))
5860 list_rotate_to_front(&adev->reset_list, &device_list);
5861 device_list_handle = &device_list;
5862 } else {
5863 list_add_tail(&adev->reset_list, &device_list);
5864 device_list_handle = &device_list;
5865 }
5866
5867 if (!amdgpu_sriov_vf(adev)) {
5868 r = amdgpu_device_health_check(device_list_handle);
5869 if (r)
5870 goto end_reset;
5871 }
5872
5873 /* We need to lock reset domain only once both for XGMI and single device */
5874 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5875 reset_list);
5876 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5877
5878 /* block all schedulers and reset given job's ring */
5879 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5880
5881 amdgpu_device_set_mp1_state(tmp_adev);
5882
5883 /*
5884 * Try to put the audio codec into suspend state
5885 * before gpu reset started.
5886 *
5887 * Due to the power domain of the graphics device
5888 * is shared with AZ power domain. Without this,
5889 * we may change the audio hardware from behind
5890 * the audio driver's back. That will trigger
5891 * some audio codec errors.
5892 */
5893 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5894 audio_suspended = true;
5895
5896 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5897
5898 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5899
5900 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5901
5902 /*
5903 * Mark these ASICs to be reseted as untracked first
5904 * And add them back after reset completed
5905 */
5906 amdgpu_unregister_gpu_instance(tmp_adev);
5907
5908 drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
5909
5910 /* disable ras on ALL IPs */
5911 if (!need_emergency_restart &&
5912 amdgpu_device_ip_need_full_reset(tmp_adev))
5913 amdgpu_ras_suspend(tmp_adev);
5914
5915 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5916 struct amdgpu_ring *ring = tmp_adev->rings[i];
5917
5918 if (!amdgpu_ring_sched_ready(ring))
5919 continue;
5920
5921 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5922
5923 if (need_emergency_restart)
5924 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5925 }
5926 atomic_inc(&tmp_adev->gpu_reset_counter);
5927 }
5928
5929 if (need_emergency_restart)
5930 goto skip_sched_resume;
5931
5932 /*
5933 * Must check guilty signal here since after this point all old
5934 * HW fences are force signaled.
5935 *
5936 * job->base holds a reference to parent fence
5937 */
5938 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5939 job_signaled = true;
5940 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5941 goto skip_hw_reset;
5942 }
5943
5944 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5945 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5946 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5947 /*TODO Should we stop ?*/
5948 if (r) {
5949 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5950 r, adev_to_drm(tmp_adev)->unique);
5951 tmp_adev->asic_reset_res = r;
5952 }
5953 }
5954
5955 /* Actual ASIC resets if needed.*/
5956 /* Host driver will handle XGMI hive reset for SRIOV */
5957 if (amdgpu_sriov_vf(adev)) {
5958 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
5959 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
5960 amdgpu_ras_set_fed(adev, true);
5961 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5962 }
5963
5964 r = amdgpu_device_reset_sriov(adev, reset_context);
5965 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
5966 amdgpu_virt_release_full_gpu(adev, true);
5967 goto retry;
5968 }
5969 if (r)
5970 adev->asic_reset_res = r;
5971 } else {
5972 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5973 if (r && r == -EAGAIN)
5974 goto retry;
5975 }
5976
5977 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5978 /*
5979 * Drop any pending non scheduler resets queued before reset is done.
5980 * Any reset scheduled after this point would be valid. Scheduler resets
5981 * were already dropped during drm_sched_stop and no new ones can come
5982 * in before drm_sched_start.
5983 */
5984 amdgpu_device_stop_pending_resets(tmp_adev);
5985 }
5986
5987 skip_hw_reset:
5988
5989 /* Post ASIC reset for all devs .*/
5990 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5991
5992 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5993 struct amdgpu_ring *ring = tmp_adev->rings[i];
5994
5995 if (!amdgpu_ring_sched_ready(ring))
5996 continue;
5997
5998 drm_sched_start(&ring->sched, 0);
5999 }
6000
6001 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6002 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6003
6004 if (tmp_adev->asic_reset_res)
6005 r = tmp_adev->asic_reset_res;
6006
6007 tmp_adev->asic_reset_res = 0;
6008
6009 if (r) {
6010 /* bad news, how to tell it to userspace ?
6011 * for ras error, we should report GPU bad status instead of
6012 * reset failure
6013 */
6014 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6015 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6016 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6017 atomic_read(&tmp_adev->gpu_reset_counter));
6018 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6019 } else {
6020 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6021 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6022 DRM_WARN("smart shift update failed\n");
6023 }
6024 }
6025
6026 skip_sched_resume:
6027 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6028 /* unlock kfd: SRIOV would do it separately */
6029 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6030 amdgpu_amdkfd_post_reset(tmp_adev);
6031
6032 /* kfd_post_reset will do nothing if kfd device is not initialized,
6033 * need to bring up kfd here if it's not be initialized before
6034 */
6035 if (!adev->kfd.init_complete)
6036 amdgpu_amdkfd_device_init(adev);
6037
6038 if (audio_suspended)
6039 amdgpu_device_resume_display_audio(tmp_adev);
6040
6041 amdgpu_device_unset_mp1_state(tmp_adev);
6042
6043 amdgpu_ras_set_error_query_ready(tmp_adev, true);
6044 }
6045
6046 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6047 reset_list);
6048 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6049
6050 end_reset:
6051 if (hive) {
6052 mutex_unlock(&hive->hive_lock);
6053 amdgpu_put_xgmi_hive(hive);
6054 }
6055
6056 if (r)
6057 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
6058
6059 atomic_set(&adev->reset_domain->reset_res, r);
6060
> 6061 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
6062
6063 return r;
6064 }
6065
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
More information about the amd-gfx
mailing list