[bug report] drm/amdgpu: parse legacy RAS bad page mixed with new data in various NPS modes
Dan Carpenter
dan.carpenter at linaro.org
Fri Dec 13 07:41:14 UTC 2024
Hello Tao Zhou,
Commit a8d133e625ce ("drm/amdgpu: parse legacy RAS bad page mixed
with new data in various NPS modes") from Oct 31, 2024 (linux-next),
leads to the following Smatch static checker warning:
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:2849 amdgpu_ras_add_bad_pages() warn: missing error code here? 'amdgpu_umc_pages_in_a_row()' failed.
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:2884 amdgpu_ras_add_bad_pages() warn: missing error code here? 'amdgpu_ras_mca2pa()' failed.
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
2800 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
2801 struct eeprom_table_record *bps, int pages, bool from_rom)
2802 {
2803 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2804 struct ras_err_handler_data *data;
2805 struct ras_err_data err_data;
2806 struct eeprom_table_record *err_rec;
2807 struct amdgpu_ras_eeprom_control *control =
2808 &adev->psp.ras_context.ras->eeprom_control;
2809 enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
2810 int ret = 0;
2811 uint32_t i, j, loop_cnt = 1;
2812 bool find_pages_per_pa = false;
2813
2814 if (!con || !con->eh_data || !bps || pages <= 0)
2815 return 0;
2816
2817 if (from_rom) {
2818 err_data.err_addr =
2819 kcalloc(adev->umc.retire_unit,
2820 sizeof(struct eeprom_table_record), GFP_KERNEL);
2821 if (!err_data.err_addr) {
2822 dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n");
2823 ret = -ENOMEM;
2824 goto out;
2825 }
2826
2827 err_rec = err_data.err_addr;
2828 loop_cnt = adev->umc.retire_unit;
2829 if (adev->gmc.gmc_funcs->query_mem_partition_mode)
2830 nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
2831 }
2832
2833 mutex_lock(&con->recovery_lock);
2834 data = con->eh_data;
2835 if (!data)
2836 goto free;
I guess returning success here is intentional?
2837
2838 for (i = 0; i < pages; i++) {
2839 if (from_rom &&
2840 control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
2841 if (!find_pages_per_pa) {
2842 if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
2843 if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
2844 /* may use old RAS TA, use PA to find pages in
2845 * one row
2846 */
2847 if (amdgpu_umc_pages_in_a_row(adev, &err_data,
2848 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
2849 goto free;
This should set an error code; "ret" is still 0 here, so the failure is returned as success.
2850 else
2851 find_pages_per_pa = true;
2852 } else {
2853 /* unsupported cases */
2854 goto free;
Error code.
2855 }
2856 }
2857 } else {
2858 if (amdgpu_umc_pages_in_a_row(adev, &err_data,
2859 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
2860 goto free;
Error code.
2861 }
2862 } else {
2863 if (from_rom && !find_pages_per_pa) {
2864 if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
2865 /* bad page in any NPS mode in eeprom */
2866 if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data))
2867 goto free;
Error code.
2868 } else {
2869 /* legacy bad page in eeprom, generated only in
2870 * NPS1 mode
2871 */
2872 if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) {
2873 /* old RAS TA or ASICs which don't support to
2874 * convert addrss via mca address
2875 */
2876 if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) {
2877 find_pages_per_pa = true;
2878 err_rec = &bps[i];
2879 loop_cnt = 1;
2880 } else {
2881 /* non-nps1 mode, old RAS TA
2882 * can't support it
2883 */
--> 2884 goto free;
Error code.
2885 }
2886 }
2887 }
2888
2889 if (!find_pages_per_pa)
2890 i += (adev->umc.retire_unit - 1);
2891 } else {
2892 err_rec = &bps[i];
2893 }
2894 }
2895
2896 for (j = 0; j < loop_cnt; j++) {
2897 if (amdgpu_ras_check_bad_page_unlock(con,
2898 err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
2899 continue;
2900
2901 if (!data->space_left &&
2902 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
2903 ret = -ENOMEM;
2904 goto free;
2905 }
2906
2907 amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
2908
2909 memcpy(&data->bps[data->count], &(err_rec[j]),
2910 sizeof(struct eeprom_table_record));
2911 data->count++;
2912 data->space_left--;
2913 }
2914 }
2915
2916 free:
2917 if (from_rom)
2918 kfree(err_data.err_addr);
2919 out:
2920 mutex_unlock(&con->recovery_lock);
2921
2922 return ret;
2923 }
regards,
dan carpenter
More information about the amd-gfx
mailing list