[PATCH V2] drm/amdkfd: add late initialization support for amdkfd device
Jesse.Zhang
Jesse.Zhang at amd.com
Thu May 29 07:57:32 UTC 2025
Add support for late initialization of KFD device capabilities that
depend on information only available after IP blocks are fully initialized.
This is particularly needed for SDMA queue reset capabilities which require
sdma.supported_reset to be populated during AMDGPU IP late init.
Key changes:
1. Added amdgpu_amdkfd_device_late_init() interface
2. Implemented kgd2kfd_device_late_init() in KFD
3. Added kfd_topology_update_capabilities() to update node properties
4. Integrated into amdgpu_device_ip_late_init() sequence
v2: remove the include "kfd_priv.h"
Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 +++++
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 7 +++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++++
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 6 ++++++
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 22 ++++++++++++++++++++++
6 files changed, 47 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 4cec3a873995..d80745f60873 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -232,6 +232,11 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
}
}
+/*
+ * Late-init hook for KFD. Returns 0 without doing anything when
+ * adev->kfd.dev is NULL; otherwise returns the result of
+ * kgd2kfd_device_late_init().
+ */
+int amdgpu_amdkfd_device_late_init(struct amdgpu_device *adev)
+{
+	/* Same guard as amdgpu_amdkfd_device_fini_sw(): kfd.dev may be unset. */
+	if (!adev->kfd.dev)
+		return 0;
+
+	return kgd2kfd_device_late_init(adev->kfd.dev);
+}
+
void amdgpu_amdkfd_device_fini_sw(struct amdgpu_device *adev)
{
if (adev->kfd.dev) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index b6ca41859b53..6c8bbcc7f177 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -160,6 +160,7 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
const void *ih_ring_entry);
void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
void amdgpu_amdkfd_device_init(struct amdgpu_device *adev);
+int amdgpu_amdkfd_device_late_init(struct amdgpu_device *adev);
void amdgpu_amdkfd_device_fini_sw(struct amdgpu_device *adev);
int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev);
void amdgpu_amdkfd_unlock_kfd(struct amdgpu_device *adev);
@@ -410,6 +411,7 @@ void kgd2kfd_exit(void);
struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf);
bool kgd2kfd_device_init(struct kfd_dev *kfd,
const struct kgd2kfd_shared_resources *gpu_resources);
+int kgd2kfd_device_late_init(struct kfd_dev *kfd);
void kgd2kfd_device_exit(struct kfd_dev *kfd);
void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
@@ -433,6 +435,11 @@ static inline int kgd2kfd_init(void)
return -ENOENT;
}
+/*
+ * Inline stub variant of kgd2kfd_device_late_init(). There is no KFD work to
+ * do here, so report success instead of an error that the late-init caller
+ * would treat as fatal.
+ */
+static inline int kgd2kfd_device_late_init(struct kfd_dev *kfd)
+{
+	return 0;
+}
+
static inline void kgd2kfd_exit(void)
{
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d9d8cd063829..b7c0281cb6ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3395,6 +3395,12 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
return r;
}
+	/*
+	 * Capture the KFD late-init result so the check below is meaningful.
+	 * Skip the call entirely when there is no KFD device.
+	 */
+	r = adev->kfd.dev ? amdgpu_amdkfd_device_late_init(adev) : 0;
+	if (r) {
+		DRM_ERROR("amdkfd late init failed %d\n", r);
+		return r;
+	}
+
if (!amdgpu_reset_in_recovery(adev))
amdgpu_ras_set_error_query_ready(adev, true);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index b9c82be6ce13..3aece03ad092 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -947,6 +947,12 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
return kfd->init_complete;
}
+/*
+ * Late-init entry point for a KFD device: refreshes the topology node
+ * capabilities via kfd_topology_update_capabilities(). Always returns 0.
+ */
+int kgd2kfd_device_late_init(struct kfd_dev *kfd)
+{
+	/*
+	 * Nothing to update for a missing or not fully initialized device
+	 * (same init_complete gate as kgd2kfd_device_exit()).
+	 */
+	if (!kfd || !kfd->init_complete)
+		return 0;
+
+	kfd_topology_update_capabilities(kfd);
+	return 0;
+}
+
void kgd2kfd_device_exit(struct kfd_dev *kfd)
{
if (kfd->init_complete) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d221c58dccc3..1eee4d625ba2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1134,6 +1134,7 @@ int kfd_topology_init(void);
void kfd_topology_shutdown(void);
int kfd_topology_add_device(struct kfd_node *gpu);
int kfd_topology_remove_device(struct kfd_node *gpu);
+void kfd_topology_update_capabilities(struct kfd_dev *kfd);
struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
uint32_t proximity_domain);
struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 09011d78f700..052215faff76 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -2026,6 +2026,28 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
kfd_topology_set_dbg_firmware_support(dev);
}
+/*
+ * kfd_topology_update_capabilities - refresh late-known node capabilities
+ * @kfd: KFD device whose topology nodes should be updated
+ *
+ * Walks topology_device_list and, for every node backed by @kfd's amdgpu
+ * device, sets HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED when the node's GC
+ * version is below 10.0.0 and sdma.supported_reset advertises
+ * AMDGPU_RESET_TYPE_PER_QUEUE. Does nothing when @kfd or its adev is NULL.
+ */
+void kfd_topology_update_capabilities(struct kfd_dev *kfd)
+{
+	struct kfd_topology_device *kdev;
+	struct amdgpu_device *adev;
+
+	if (!kfd || !kfd->adev)
+		return;
+
+	adev = kfd->adev;
+
+	/* node_props is modified below, so take the topology lock for writing. */
+	down_write(&topology_lock);
+	list_for_each_entry(kdev, &topology_device_list, list) {
+		/* Skip nodes without a GPU and GPU nodes of other devices. */
+		if (!kdev->gpu || kdev->gpu->adev != adev)
+			continue;
+
+		if (KFD_GC_VERSION(kdev->gpu) < IP_VERSION(10, 0, 0) &&
+		    (adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE))
+			kdev->node_props.capability2 |= HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED;
+	}
+	up_write(&topology_lock);
+}
+
int kfd_topology_add_device(struct kfd_node *gpu)
{
uint32_t gpu_id;
--
2.49.0
More information about the amd-gfx
mailing list