[bug report] drm/amdgpu: parse legacy RAS bad page mixed with new data in various NPS modes

Dan Carpenter dan.carpenter at linaro.org
Fri Dec 13 07:41:14 UTC 2024


Hello Tao Zhou,

Commit a8d133e625ce ("drm/amdgpu: parse legacy RAS bad page mixed
with new data in various NPS modes") from Oct 31, 2024 (linux-next),
leads to the following Smatch static checker warning:

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:2849 amdgpu_ras_add_bad_pages() warn: missing error code here? 'amdgpu_umc_pages_in_a_row()' failed.
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:2884 amdgpu_ras_add_bad_pages() warn: missing error code here? 'amdgpu_ras_mca2pa()' failed.

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
    2800 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
    2801                 struct eeprom_table_record *bps, int pages, bool from_rom)
    2802 {
    2803         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
    2804         struct ras_err_handler_data *data;
    2805         struct ras_err_data err_data;
    2806         struct eeprom_table_record *err_rec;
    2807         struct amdgpu_ras_eeprom_control *control =
    2808                         &adev->psp.ras_context.ras->eeprom_control;
    2809         enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
    2810         int ret = 0;
    2811         uint32_t i, j, loop_cnt = 1;
    2812         bool find_pages_per_pa = false;
    2813 
    2814         if (!con || !con->eh_data || !bps || pages <= 0)
    2815                 return 0;
    2816 
    2817         if (from_rom) {
    2818                 err_data.err_addr =
    2819                         kcalloc(adev->umc.retire_unit,
    2820                                 sizeof(struct eeprom_table_record), GFP_KERNEL);
    2821                 if (!err_data.err_addr) {
    2822                         dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
    2823                         ret = -ENOMEM;
    2824                         goto out;
    2825                 }
    2826 
    2827                 err_rec = err_data.err_addr;
    2828                 loop_cnt = adev->umc.retire_unit;
    2829                 if (adev->gmc.gmc_funcs->query_mem_partition_mode)
    2830                         nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
    2831         }
    2832 
    2833         mutex_lock(&con->recovery_lock);
    2834         data = con->eh_data;
    2835         if (!data)
    2836                 goto free;

I guess returning success here is intentional?

    2837 
    2838         for (i = 0; i < pages; i++) {
    2839                 if (from_rom &&
    2840                     control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
    2841                         if (!find_pages_per_pa) {
    2842                                 if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
    2843                                         if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
    2844                                                 /* may use old RAS TA, use PA to find pages in
    2845                                                  * one row
    2846                                                  */
    2847                                                 if (amdgpu_umc_pages_in_a_row(adev, &err_data,
    2848                                                                 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
    2849                                                         goto free;

This should set an error code: ret is still 0 here, so the failure is reported to the caller as success.

    2850                                                 else
    2851                                                         find_pages_per_pa = true;
    2852                                         } else {
    2853                                                 /* unsupported cases */
    2854                                                 goto free;

Missing error code here too.

    2855                                         }
    2856                                 }
    2857                         } else {
    2858                                 if (amdgpu_umc_pages_in_a_row(adev, &err_data,
    2859                                                 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
    2860                                         goto free;

Missing error code here too.

    2861                         }
    2862                 } else {
    2863                         if (from_rom && !find_pages_per_pa) {
    2864                                 if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
    2865                                         /* bad page in any NPS mode in eeprom */
    2866                                         if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data))
    2867                                                 goto free;

Missing error code here too.

    2868                                 } else {
    2869                                         /* legacy bad page in eeprom, generated only in
    2870                                          * NPS1 mode
    2871                                          */
    2872                                         if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
    2873                                                 /* old RAS TA or ASICs which don't support to
    2874                                                  * convert addrss via mca address
    2875                                                  */
    2876                                                 if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
    2877                                                         find_pages_per_pa = true;
    2878                                                         err_rec = &bps[i];
    2879                                                         loop_cnt = 1;
    2880                                                 } else {
    2881                                                         /* non-nps1 mode, old RAS TA
    2882                                                          * can't support it
    2883                                                          */
--> 2884                                                         goto free;

Missing error code here too (this is the second Smatch warning, line 2884).

    2885                                                 }
    2886                                         }
    2887                                 }
    2888 
    2889                                 if (!find_pages_per_pa)
    2890                                         i += (adev->umc.retire_unit - 1);
    2891                         } else {
    2892                                 err_rec = &bps[i];
    2893                         }
    2894                 }
    2895 
    2896                 for (j = 0; j < loop_cnt; j++) {
    2897                         if (amdgpu_ras_check_bad_page_unlock(con,
    2898                                 err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
    2899                                 continue;
    2900 
    2901                         if (!data->space_left &&
    2902                             amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
    2903                                 ret = -ENOMEM;
    2904                                 goto free;
    2905                         }
    2906 
    2907                         amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
    2908 
    2909                         memcpy(&data->bps[data->count], &(err_rec[j]),
    2910                                         sizeof(struct eeprom_table_record));
    2911                         data->count++;
    2912                         data->space_left--;
    2913                 }
    2914         }
    2915 
    2916 free:
    2917         if (from_rom)
    2918                 kfree(err_data.err_addr);
    2919 out:
    2920         mutex_unlock(&con->recovery_lock);
    2921 
    2922         return ret;
    2923 }

regards,
dan carpenter


More information about the amd-gfx mailing list