[PATCH] drm/amdgpu: Estimate RAS reservation when report capacity

Zhang, Hawking Hawking.Zhang at amd.com
Tue May 28 05:36:57 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

Hi Tao,

We don't plan to apply the change to gfx adapters; it is only applicable to aldebran and aqua_vanjaram. I will add aldebran back in v2.

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Tuesday, May 28, 2024 10:41
To: Zhang, Hawking <Hawking.Zhang at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Kuehling, Felix <Felix.Kuehling at amd.com>; Kasiviswanathan, Harish <Harish.Kasiviswanathan at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>
Subject: RE: [PATCH] drm/amdgpu: Estimate RAS reservation when report capacity

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
> Hawking Zhang
> Sent: Tuesday, May 28, 2024 10:21 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Kuehling, Felix
> <Felix.Kuehling at amd.com>; Kasiviswanathan, Harish
> <Harish.Kasiviswanathan at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: Estimate RAS reservation when report
> capacity
>
> Add an estimate of how much vram we need to reserve for RAS when
> calculating the total available vram.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c   |  9 +++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c        | 18 ++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h        |  2 ++
>  3 files changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index e98927529f61..ad813772f8a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -173,6 +173,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
> amdgpu_device *adev,  {
>       uint64_t reserved_for_pt =
>               ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes
> + : 0);
>       size_t system_mem_needed, ttm_mem_needed, vram_needed;
>       int ret = 0;
>       uint64_t vram_size = 0;
> @@ -221,7 +223,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct
> amdgpu_device *adev,
>           (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
>            kfd_mem_limit.max_ttm_mem_limit) ||
>           (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] +
> vram_needed >
> -          vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size)
> +
> +          vram_size - reserved_for_pt - reserved_for_ras -
> +atomic64_read(&adev->vram_pin_size) +
>            atomic64_read(&adev->kfd.vram_pinned))) {
>               ret = -ENOMEM;
>               goto release;
> @@ -1694,6 +1696,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct
> amdgpu_device *adev,  {
>       uint64_t reserved_for_pt =
>               ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes
> + : 0);
>       ssize_t available;
>       uint64_t vram_available, system_mem_available,
> ttm_mem_available;
>
> @@ -1702,7 +1706,8 @@ size_t amdgpu_amdkfd_get_available_memory(struct
> amdgpu_device *adev,
>               - adev->kfd.vram_used_aligned[xcp_id]
>               - atomic64_read(&adev->vram_pin_size)
>               + atomic64_read(&adev->kfd.vram_pinned)
> -             - reserved_for_pt;
> +             - reserved_for_pt
> +             - reserved_for_ras;
>
>       if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
>               system_mem_available = no_system_mem_limit ?
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index ecce022c657b..a6334e0e62dc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3317,6 +3317,22 @@ static void amdgpu_ras_event_mgr_init(struct
> amdgpu_device *adev)
>               amdgpu_put_xgmi_hive(hive);  }
>
> +static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device
> +*adev) {
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +
> +     if (!con || (adev->flags & AMD_IS_APU))
> +             return;
> +
> +     switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
> +     case IP_VERSION(13, 0, 6):

[Tao] can we apply the change for all ASICs which support RAS?

> +             con->reserved_pages_in_bytes =
> AMDGPU_RAS_RESERVED_VRAM_SIZE;
> +             break;
> +     default:
> +             break;
> +     }
> +}
> +
>  int amdgpu_ras_init(struct amdgpu_device *adev)  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@
> -3422,6
> +3438,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>       /* Get RAS schema for particular SOC */
>       con->schema = amdgpu_get_ras_schema(adev);
>
> +     amdgpu_ras_init_reserved_vram_size(adev);
> +
>       if (amdgpu_ras_fs_init(adev)) {
>               r = -EINVAL;
>               goto release_con;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 6a8c7b1609df..bee622c4268a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -64,6 +64,7 @@ struct amdgpu_iv_entry;  #define
> AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29  #define
> AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000
>
> +#define AMDGPU_RAS_RESERVED_VRAM_SIZE        (16ULL << 20)

[Tao] It would be better to add a comment here explaining why the value is 16MB.

>  /* The high three bits indicates socketid */  #define
> AMDGPU_RAS_GET_FEATURES(val)  ((val) &
> ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
>
> @@ -541,6 +542,7 @@ struct amdgpu_ras {
>       struct ras_event_manager __event_mgr;
>       struct ras_event_manager *event_mgr;
>
> +     uint64_t reserved_pages_in_bytes;
>  };
>
>  struct ras_fs_data {
> --
> 2.17.1




More information about the amd-gfx mailing list