[PATCH V2] drm/amdkfd: add late initialization support for amdkfd device
Zhang, Jesse(Jie)
Jesse.Zhang at amd.com
Tue Jun 3 03:04:48 UTC 2025
[Public]
Hi Jon
1. Is it possible to just push KFD device initialization later during the KGD initialization?
The challenge is that SDMA reset capabilities depend on SMU initialization, which happens late in the IP init sequence.
Specifically, in sdma_v4_4_2.c, we can't directly access the SMU version early.
We rely on smu_v13_0_6_reset_sdma_is_supported() to query SMU for SDMA reset support,
but this requires SMU to be fully initialized first. Hence, the need for a late-init phase.
2. From a brief look, it doesn't seem like the KFD would operate (let alone the KMS driver not surviving) if late IP initialization failed anyway.
Chunking KFD topology settings into separate phases seems a bit awkward.
If late_init fails at amdgpu device init, it should has a fatal error . The driver will unload kms
kfd should exit as well.
how about kfd get the reset flag at node show show ? will update the patch.
Regards
Jesse
-----Original Message-----
From: Kim, Jonathan <Jonathan.Kim at amd.com>
Sent: Friday, May 30, 2025 10:48 PM
To: Zhang, Jesse(Jie) <Jesse.Zhang at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>; Zhang, Jesse(Jie) <Jesse.Zhang at amd.com>
Subject: RE: [PATCH V2] drm/amdkfd: add late initialization support for amdkfd device
[Public]
Is it possible to just push KFD device initialization later during the KGD initialization?
>From a brief look, it doesn't seem like the KFD would operate (let alone the KMS driver not surviving) if late IP initialization failed anyway.
Chunking KFD topology settings into separate phases seems a bit awkward.
Jon
> -----Original Message-----
> From: Jesse.Zhang <Jesse.Zhang at amd.com>
> Sent: Thursday, May 29, 2025 3:58 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Kuehling, Felix
> <Felix.Kuehling at amd.com>; Kim, Jonathan <Jonathan.Kim at amd.com>; Zhang,
> Jesse(Jie) <Jesse.Zhang at amd.com>
> Subject: [PATCH V2] drm/amdkfd: add late initialization support for
> amdkfd device
>
> Add support for late initialization of KFD device capabilities that
> depend on information only available after IP blocks are fully initialized.
> This is particularly needed for SDMA queue reset capabilities which
> require sdma.supported_reset to be populated during AMDGPU IP late init.
>
> Key changes:
> 1. Added amdgpu_amdkfd_device_late_init() interface 2. Implemented
> kgd2kfd_device_late_init() in KFD 3. Added
> kfd_topology_update_capabilities() to update node properties 4.
> Integrated into amdgpu_device_ip_late_init() sequence
>
> v2: remove the include "kfd_priv.h"
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 +++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 7 +++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++++
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 6 ++++++
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
> drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 22
> ++++++++++++++++++++++
> 6 files changed, 47 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 4cec3a873995..d80745f60873 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -232,6 +232,11 @@ void amdgpu_amdkfd_device_init(struct
> amdgpu_device
> *adev)
> }
> }
>
> +int amdgpu_amdkfd_device_late_init(struct amdgpu_device *adev) {
> + return kgd2kfd_device_late_init(adev->kfd.dev);
> +}
> +
> void amdgpu_amdkfd_device_fini_sw(struct amdgpu_device *adev) {
> if (adev->kfd.dev) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index b6ca41859b53..6c8bbcc7f177 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -160,6 +160,7 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device
> *adev,
> const void *ih_ring_entry); void
> amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); void
> amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
> +int amdgpu_amdkfd_device_late_init(struct amdgpu_device *adev);
> void amdgpu_amdkfd_device_fini_sw(struct amdgpu_device *adev); int
> amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev); void
> amdgpu_amdkfd_unlock_kfd(struct amdgpu_device *adev); @@ -410,6 +411,7
> @@ void kgd2kfd_exit(void); struct kfd_dev *kgd2kfd_probe(struct
> amdgpu_device *adev, bool vf); bool kgd2kfd_device_init(struct
> kfd_dev *kfd,
> const struct kgd2kfd_shared_resources
> *gpu_resources);
> +int kgd2kfd_device_late_init(struct kfd_dev *kfd);
> void kgd2kfd_device_exit(struct kfd_dev *kfd); void
> kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm); int
> kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm); @@ -433,6 +435,11 @@
> static inline int kgd2kfd_init(void)
> return -ENOENT;
> }
>
> +static inline int kgd2kfd_device_late_init(struct kfd_dev *kfd) {
> + return -ENOENT;
> +}
> +
> static inline void kgd2kfd_exit(void) { } diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index d9d8cd063829..b7c0281cb6ad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3395,6 +3395,12 @@ static int amdgpu_device_ip_late_init(struct
> amdgpu_device *adev)
> return r;
> }
>
> + amdgpu_amdkfd_device_late_init(adev);
> + if (r) {
> + DRM_ERROR("amdkfd late init failed %d", r);
> + return r;
> + }
> +
> if (!amdgpu_reset_in_recovery(adev))
> amdgpu_ras_set_error_query_ready(adev, true);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index b9c82be6ce13..3aece03ad092 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -947,6 +947,12 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
> return kfd->init_complete;
> }
>
> +int kgd2kfd_device_late_init(struct kfd_dev *kfd) {
> + kfd_topology_update_capabilities(kfd);
> + return 0;
> +}
> +
> void kgd2kfd_device_exit(struct kfd_dev *kfd) {
> if (kfd->init_complete) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index d221c58dccc3..1eee4d625ba2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1134,6 +1134,7 @@ int kfd_topology_init(void); void
> kfd_topology_shutdown(void); int kfd_topology_add_device(struct
> kfd_node *gpu); int kfd_topology_remove_device(struct kfd_node *gpu);
> +void kfd_topology_update_capabilities(struct kfd_dev *kfd);
> struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
> uint32_t
> proximity_domain); struct kfd_topology_device
> *kfd_topology_device_by_proximity_domain_no_lock(
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 09011d78f700..052215faff76 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -2026,6 +2026,28 @@ static void
> kfd_topology_set_capabilities(struct
> kfd_topology_device *dev)
> kfd_topology_set_dbg_firmware_support(dev);
> }
>
> +void kfd_topology_update_capabilities(struct kfd_dev *kfd) {
> + struct amdgpu_device *adev = kfd->adev;
> + struct kfd_topology_device *kdev;
> + struct amdgpu_device *node_adev;
> +
> + list_for_each_entry(kdev, &topology_device_list, list) {
> +
> + if (!kdev->gpu || !kdev->gpu->adev)
> + continue;
> +
> + /* Compare the underlying adev pointers, not the
> + top-level structs
> directly */
> + if (kdev->gpu->adev != adev)
> + continue;
> +
> + node_adev = kdev->gpu->adev;
> + if (KFD_GC_VERSION(kdev->gpu) < IP_VERSION(10, 0, 0) &&
> + (node_adev->sdma.supported_reset &
> AMDGPU_RESET_TYPE_PER_QUEUE))
> + kdev->node_props.capability2 |=
> HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED;
> + }
> +}
> +
> int kfd_topology_add_device(struct kfd_node *gpu) {
> uint32_t gpu_id;
> --
> 2.49.0
More information about the amd-gfx
mailing list