[PATCH] drm/amdgpu: fix device attribute node create failed with multi gpu

Wang, Kevin(Yang) Kevin1.Wang at amd.com
Fri May 22 15:43:29 UTC 2020


[AMD Official Use Only - Internal Distribution Only]



________________________________
From: Alex Deucher <alexdeucher at gmail.com>
Sent: Friday, May 22, 2020 11:16 PM
To: Wang, Kevin(Yang) <Kevin1.Wang at amd.com>
Cc: amd-gfx list <amd-gfx at lists.freedesktop.org>; Deucher, Alexander <Alexander.Deucher at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
Subject: Re: [PATCH] drm/amdgpu: fix device attribute node create failed with multi gpu

On Fri, May 22, 2020 at 10:57 AM Kevin Wang <kevin1.wang at amd.com> wrote:
>
> the origin design will use varible of "attr->states" to save node
> supported states on current gpu device, but for multi gpu device, when
> probe second gpu device, the driver will check attribute node states
> from previous gpu device wthether to create attribute node.
> it will cause other gpu device create attribute node faild.
>
> 1. add member attr_list into amdgpu_device to link supported device attribute node.
> 2. add new structure "struct amdgpu_device_attr_entry{}" to track device attribute state.
>
> fix:
> drm/amdgpu: optimize amdgpu device attribute code
>
> Signed-off-by: Kevin Wang <kevin1.wang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h    |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 75 +++++++++++++++-----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_pm.h | 13 +++--
>  3 files changed, 52 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index bfce0931f9c1..b84146339ea3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -995,6 +995,7 @@ struct amdgpu_device {
>         char                            serial[16];
>
>         struct amdgpu_autodump          autodump;
> +       struct list_head                attr_list;

Might want to call this pm_attr_list or even move this to the
amdgpu_pm struct, but either way, assuming you've tested this with
multiple GPUs:
Reviewed-by: Alex Deucher <alexander.deucher at amd.com>


[kevin]:
thanks,  the patch v2 will fix it.
and this patch is test passed on local witl multi gpu.

>  };
>
>  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> index 55815b899942..ac99aa0a16a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> @@ -1725,7 +1725,7 @@ static struct amdgpu_device_attr amdgpu_device_attrs[] = {
>  };
>
>  static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_attr *attr,
> -                              uint32_t mask)
> +                              uint32_t mask, enum amdgpu_device_attr_states *states)
>  {
>         struct device_attribute *dev_attr = &attr->dev_attr;
>         const char *attr_name = dev_attr->attr.name;
> @@ -1733,7 +1733,7 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_
>         enum amd_asic_type asic_type = adev->asic_type;
>
>         if (!(attr->flags & mask)) {
> -               attr->states = ATTR_STATE_UNSUPPORTED;
> +               *states = ATTR_STATE_UNSUPPORTED;
>                 return 0;
>         }
>
> @@ -1741,34 +1741,34 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_
>
>         if (DEVICE_ATTR_IS(pp_dpm_socclk)) {
>                 if (asic_type <= CHIP_VEGA10)
> -                       attr->states = ATTR_STATE_UNSUPPORTED;
> +                       *states = ATTR_STATE_UNSUPPORTED;
>         } else if (DEVICE_ATTR_IS(pp_dpm_dcefclk)) {
>                 if (asic_type <= CHIP_VEGA10 || asic_type == CHIP_ARCTURUS)
> -                       attr->states = ATTR_STATE_UNSUPPORTED;
> +                       *states = ATTR_STATE_UNSUPPORTED;
>         } else if (DEVICE_ATTR_IS(pp_dpm_fclk)) {
>                 if (asic_type < CHIP_VEGA20)
> -                       attr->states = ATTR_STATE_UNSUPPORTED;
> +                       *states = ATTR_STATE_UNSUPPORTED;
>         } else if (DEVICE_ATTR_IS(pp_dpm_pcie)) {
>                 if (asic_type == CHIP_ARCTURUS)
> -                       attr->states = ATTR_STATE_UNSUPPORTED;
> +                       *states = ATTR_STATE_UNSUPPORTED;
>         } else if (DEVICE_ATTR_IS(pp_od_clk_voltage)) {
> -               attr->states = ATTR_STATE_UNSUPPORTED;
> +               *states = ATTR_STATE_UNSUPPORTED;
>                 if ((is_support_sw_smu(adev) && adev->smu.od_enabled) ||
>                     (!is_support_sw_smu(adev) && hwmgr->od_enabled))
> -                       attr->states = ATTR_STATE_UNSUPPORTED;
> +                       *states = ATTR_STATE_UNSUPPORTED;
>         } else if (DEVICE_ATTR_IS(mem_busy_percent)) {
>                 if (adev->flags & AMD_IS_APU || asic_type == CHIP_VEGA10)
> -                       attr->states = ATTR_STATE_UNSUPPORTED;
> +                       *states = ATTR_STATE_UNSUPPORTED;
>         } else if (DEVICE_ATTR_IS(pcie_bw)) {
>                 /* PCIe Perf counters won't work on APU nodes */
>                 if (adev->flags & AMD_IS_APU)
> -                       attr->states = ATTR_STATE_UNSUPPORTED;
> +                       *states = ATTR_STATE_UNSUPPORTED;
>         } else if (DEVICE_ATTR_IS(unique_id)) {
>                 if (!adev->unique_id)
> -                       attr->states = ATTR_STATE_UNSUPPORTED;
> +                       *states = ATTR_STATE_UNSUPPORTED;
>         } else if (DEVICE_ATTR_IS(pp_features)) {
>                 if (adev->flags & AMD_IS_APU || asic_type <= CHIP_VEGA10)
> -                       attr->states = ATTR_STATE_UNSUPPORTED;
> +                       *states = ATTR_STATE_UNSUPPORTED;
>         }
>
>         if (asic_type == CHIP_ARCTURUS) {
> @@ -1794,22 +1794,24 @@ static int amdgpu_device_attr_create(struct amdgpu_device *adev,
>         int ret = 0;
>         struct device_attribute *dev_attr = &attr->dev_attr;
>         const char *name = dev_attr->attr.name;
> +       enum amdgpu_device_attr_states attr_states = ATTR_STATE_SUPPORTED;
> +       struct amdgpu_device_attr_entry *attr_entry;
> +
>         int (*attr_update)(struct amdgpu_device *adev, struct amdgpu_device_attr *attr,
> -                          uint32_t mask) = default_attr_update;
> +                          uint32_t mask, enum amdgpu_device_attr_states *states) = default_attr_update;
>
>         BUG_ON(!attr);
>
>         attr_update = attr->attr_update ? attr_update : default_attr_update;
>
> -       ret = attr_update(adev, attr, mask);
> +       ret = attr_update(adev, attr, mask, &attr_states);
>         if (ret) {
>                 dev_err(adev->dev, "failed to update device file %s, ret = %d\n",
>                         name, ret);
>                 return ret;
>         }
>
> -       /* the attr->states maybe changed after call attr->attr_update function */
> -       if (attr->states == ATTR_STATE_UNSUPPORTED)
> +       if (attr_states == ATTR_STATE_UNSUPPORTED)
>                 return 0;
>
>         ret = device_create_file(adev->dev, dev_attr);
> @@ -1818,7 +1820,14 @@ static int amdgpu_device_attr_create(struct amdgpu_device *adev,
>                         name, ret);
>         }
>
> -       attr->states = ATTR_STATE_SUPPORTED;
> +       attr_entry = kmalloc(sizeof(*attr_entry), GFP_KERNEL);
> +       if (!attr_entry)
> +               return -ENOMEM;
> +
> +       attr_entry->attr = attr;
> +       INIT_LIST_HEAD(&attr_entry->entry);
> +
> +       list_add_tail(&attr_entry->entry, &adev->attr_list);
>
>         return ret;
>  }
> @@ -1827,14 +1836,12 @@ static void amdgpu_device_attr_remove(struct amdgpu_device *adev, struct amdgpu_
>  {
>         struct device_attribute *dev_attr = &attr->dev_attr;
>
> -       if (attr->states == ATTR_STATE_UNSUPPORTED)
> -               return;
> -
>         device_remove_file(adev->dev, dev_attr);
> -
> -       attr->states = ATTR_STATE_UNSUPPORTED;
>  }
>
> +static void amdgpu_device_attr_remove_groups(struct amdgpu_device *adev,
> +                                            struct list_head *attr_list);
> +
>  static int amdgpu_device_attr_create_groups(struct amdgpu_device *adev,
>                                             struct amdgpu_device_attr *attrs,
>                                             uint32_t counts,
> @@ -1852,20 +1859,24 @@ static int amdgpu_device_attr_create_groups(struct amdgpu_device *adev,
>         return 0;
>
>  failed:
> -       while (i--)
> -               amdgpu_device_attr_remove(adev, &attrs[i]);
> +       amdgpu_device_attr_remove_groups(adev, &adev->attr_list);
>
>         return ret;
>  }
>
>  static void amdgpu_device_attr_remove_groups(struct amdgpu_device *adev,
> -                                            struct amdgpu_device_attr *attrs,
> -                                            uint32_t counts)
> +                                            struct list_head *attr_list)
>  {
> -       uint32_t i = 0;
> +       struct amdgpu_device_attr_entry *entry, *entry_tmp;
>
> -       for (i = 0; i < counts; i++)
> -               amdgpu_device_attr_remove(adev, &attrs[i]);
> +       if (list_empty(&adev->attr_list))
> +               return ;
> +
> +       list_for_each_entry_safe(entry, entry_tmp, attr_list, entry) {
> +               amdgpu_device_attr_remove(adev, entry->attr);
> +               list_del(&entry->entry);
> +               kfree(entry);
> +       }
>  }
>
>  static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
> @@ -3276,6 +3287,8 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev)
>         if (adev->pm.dpm_enabled == 0)
>                 return 0;
>
> +       INIT_LIST_HEAD(&adev->attr_list);
> +
>         adev->pm.int_hwmon_dev = hwmon_device_register_with_groups(adev->dev,
>                                                                    DRIVER_NAME, adev,
>                                                                    hwmon_groups);
> @@ -3319,9 +3332,7 @@ void amdgpu_pm_sysfs_fini(struct amdgpu_device *adev)
>         if (adev->pm.int_hwmon_dev)
>                 hwmon_device_unregister(adev->pm.int_hwmon_dev);
>
> -       amdgpu_device_attr_remove_groups(adev,
> -                                        amdgpu_device_attrs,
> -                                        ARRAY_SIZE(amdgpu_device_attrs));
> +       amdgpu_device_attr_remove_groups(adev, &adev->attr_list);
>  }
>
>  void amdgpu_pm_compute_clocks(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.h
> index 48e8086baf33..d9ae2b49a402 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.h
> @@ -47,10 +47,14 @@ enum amdgpu_device_attr_states {
>  struct amdgpu_device_attr {
>         struct device_attribute dev_attr;
>         enum amdgpu_device_attr_flags flags;
> -       enum amdgpu_device_attr_states states;
> -       int (*attr_update)(struct amdgpu_device *adev,
> -                          struct amdgpu_device_attr* attr,
> -                          uint32_t mask);
> +       int (*attr_update)(struct amdgpu_device *adev, struct amdgpu_device_attr *attr,
> +                          uint32_t mask, enum amdgpu_device_attr_states *states);
> +
> +};
> +
> +struct amdgpu_device_attr_entry {
> +       struct list_head entry;
> +       struct amdgpu_device_attr *attr;
>  };
>
>  #define to_amdgpu_device_attr(_dev_attr) \
> @@ -59,7 +63,6 @@ struct amdgpu_device_attr {
>  #define __AMDGPU_DEVICE_ATTR(_name, _mode, _show, _store, _flags, ...) \
>         { .dev_attr = __ATTR(_name, _mode, _show, _store),              \
>           .flags = _flags,                                              \
> -         .states = ATTR_STATE_SUPPORTED,                                       \
>           ##__VA_ARGS__, }
>
>  #define AMDGPU_DEVICE_ATTR(_name, _mode, _flags, ...)                  \
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Ckevin1.wang%40amd.com%7Cdd2ce58bbf7f4cbc34c408d7fe6317d8%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637257573884409063&sdata=%2FYecffjaMNwPS%2BU0QM%2BWkQ%2BmVBDFP9pw3eYE%2FBgfU60%3D&reserved=0
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20200522/2abad0a4/attachment-0001.htm>


More information about the amd-gfx mailing list