[bug report] drm/amdgpu: Multi-GPU DPC recovery support
Dan Carpenter
dan.carpenter at linaro.org
Wed Apr 9 08:46:48 UTC 2025
Hello Ce Sun,
Commit 8ba904f54148 ("drm/amdgpu: Multi-GPU DPC recovery support")
from Mar 21, 2025 (linux-next), leads to the following Smatch static
checker warning:
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:6820 amdgpu_pci_slot_reset()
warn: iterator used outside loop: 'tmp_adev'
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
6753 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6754 {
6755 struct drm_device *dev = pci_get_drvdata(pdev);
6756 struct amdgpu_device *adev = drm_to_adev(dev);
6757 struct amdgpu_reset_context reset_context;
6758 struct amdgpu_device *tmp_adev = NULL;
6759 struct amdgpu_hive_info *hive = NULL;
6760 struct list_head device_list;
6761 int r = 0, i;
6762 u32 memsize;
6763
6764 /* PCI error slot reset should be skipped During RAS recovery */
6765 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6766 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6767 amdgpu_ras_in_recovery(adev))
6768 return PCI_ERS_RESULT_RECOVERED;
6769
6770 dev_info(adev->dev, "PCI error: slot reset callback!!\n");
6771
6772 memset(&reset_context, 0, sizeof(reset_context));
6773
6774 /* wait for asic to come out of reset */
6775 msleep(700);
6776
6777 /* Restore PCI confspace */
6778 amdgpu_device_load_pci_state(pdev);
6779
6780 /* confirm ASIC came out of reset */
6781 for (i = 0; i < adev->usec_timeout; i++) {
6782 memsize = amdgpu_asic_get_config_memsize(adev);
6783
6784 if (memsize != 0xffffffff)
6785 break;
6786 udelay(1);
6787 }
6788 if (memsize == 0xffffffff) {
6789 r = -ETIME;
6790 goto out;
6791 }
6792
6793 reset_context.method = AMD_RESET_METHOD_NONE;
6794 reset_context.reset_req_dev = adev;
6795 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6796 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
6797 INIT_LIST_HEAD(&device_list);
6798
6799 hive = amdgpu_get_xgmi_hive(adev);
6800 if (hive) {
6801 mutex_lock(&hive->hive_lock);
6802 reset_context.hive = hive;
6803 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
6804 tmp_adev->pcie_reset_ctx.in_link_reset = true;
6805 list_add_tail(&tmp_adev->reset_list, &device_list);
6806 }
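After this loop completes, tmp_adev is an invalid non-NULL pointer: list_for_each_entry() leaves the iterator pointing at the container of the list head rather than at a real amdgpu_device entry.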
6807 } else {
6808 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6809 list_add_tail(&adev->reset_list, &device_list);
6810 }
6811
6812 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);
6813 out:
6814 if (!r) {
6815 if (amdgpu_device_cache_pci_state(adev->pdev))
6816 pci_restore_state(adev->pdev);
6817 dev_info(adev->dev, "PCIe error recovery succeeded\n");
6818 } else {
6819 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r);
--> 6820 if (tmp_adev) {
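This looks like it might have been intentional, since tmp_adev is only non-NULL when the hive branch was taken, but it would be more
readable to check if (hive) { instead of testing the stale iterator.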
6821 list_for_each_entry(tmp_adev, &device_list, reset_list)
6822 amdgpu_device_unset_mp1_state(tmp_adev);
6823 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6824 }
6825 }
6826
6827 if (hive) {
6828 mutex_unlock(&hive->hive_lock);
6829 amdgpu_put_xgmi_hive(hive);
6830 }
6831
6832 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6833 }
regards,
dan carpenter
More information about the amd-gfx
mailing list