[v4 1/7] drm/amd/amdgpu: Simplify SDMA reset mechanism by removing dynamic callbacks
Jesse.zhang@amd.com
jesse.zhang at amd.com
Tue Apr 8 08:36:27 UTC 2025
Since KFD no longer registers its own callbacks for SDMA resets and only KGD uses the reset mechanism,
the SDMA reset flow can be simplified by calling the instance's `stop_kernel_queue` and `start_kernel_queue` functions directly.
This patch removes the dynamic callback invocations and prepares the registration mechanism for eventual removal.
1. **Remove Dynamic Callbacks**:
- The `pre_reset` and `post_reset` callback invocations in `amdgpu_sdma_reset_engine` are removed.
- Instead, the instance's `stop_kernel_queue` and `start_kernel_queue` functions are called directly during the reset process (see the sketch below).
2. **Prepare for Deprecation of Dynamic Mechanism**:
- With the callback invocations gone, the dynamic callback registration mechanism can be removed in a follow-up patch.
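A minimal sketch of the resulting flow in `amdgpu_sdma_reset_engine` (condensed from the diff below; locking, scheduler work-queue handling and error paths omitted):

    struct amdgpu_sdma_instance *sdma_instance =
            &adev->sdma.instance[instance_id];

    /* quiesce the kernel queue and save its state before the reset */
    if (sdma_instance->funcs->stop_kernel_queue)
            sdma_instance->funcs->stop_kernel_queue(&sdma_instance->ring);

    /* reset only this SDMA instance */
    ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);

    /* on success, restore the queue state saved above */
    if (!ret && sdma_instance->funcs->start_kernel_queue)
            sdma_instance->funcs->start_kernel_queue(&sdma_instance->ring);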
v2: Update stop_queue/start_queue function parameters to use a ring pointer instead of device/instance (Christian)
v3: Follow the general coding style and declare variables like "i" or "r" last, i.e. longest lines first and shortest last (Christian)
v4: Move stop_queue/start_queue into struct amdgpu_sdma_instance and rename them (Alex)
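In a nutshell, the v4 hookup (lifted from the sdma_v4_4_2.c hunk below) gives each SDMA IP version a const funcs table that is assigned per instance in sw_init:

    static const struct amdgpu_sdma_funcs sdma_v4_4_2_sdma_funcs = {
            .stop_kernel_queue = &sdma_v4_4_2_stop_queue,
            .start_kernel_queue = &sdma_v4_4_2_restore_queue,
    };

    /* in sdma_v4_4_2_sw_init(), for each instance i: */
    adev->sdma.instance[i].funcs = &sdma_v4_4_2_sdma_funcs;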
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 54 ++++++------------------
drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 7 ++-
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 22 ++++++----
3 files changed, 34 insertions(+), 49 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 0a9893fee828..541b349e0310 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -558,20 +558,14 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
* @adev: Pointer to the AMDGPU device
* @instance_id: ID of the SDMA engine instance to reset
*
- * This function performs the following steps:
- * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save their state.
- * 2. Resets the specified SDMA engine instance.
- * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to restore their state.
- *
* Returns: 0 on success, or a negative error code on failure.
*/
int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
{
- struct sdma_on_reset_funcs *funcs;
int ret = 0;
struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
- struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
- struct amdgpu_ring *page_ring = &sdma_instance->page;
+ struct amdgpu_ring *sdma_gfx_ring = &sdma_instance->ring;
+ struct amdgpu_ring *sdma_page_ring = &sdma_instance->page;
bool gfx_sched_stopped = false, page_sched_stopped = false;
mutex_lock(&sdma_instance->engine_reset_mutex);
@@ -579,28 +573,18 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
* This ensures that no new tasks are submitted to the queues while
* the reset is in progress.
*/
- if (!amdgpu_ring_sched_ready(gfx_ring)) {
- drm_sched_wqueue_stop(&gfx_ring->sched);
+ if (!amdgpu_ring_sched_ready(sdma_gfx_ring)) {
+ drm_sched_wqueue_stop(&sdma_gfx_ring->sched);
gfx_sched_stopped = true;
}
- if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
- drm_sched_wqueue_stop(&page_ring->sched);
+ if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(sdma_page_ring)) {
+ drm_sched_wqueue_stop(&sdma_page_ring->sched);
page_sched_stopped = true;
}
- /* Invoke all registered pre_reset callbacks */
- list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
- if (funcs->pre_reset) {
- ret = funcs->pre_reset(adev, instance_id);
- if (ret) {
- dev_err(adev->dev,
- "beforeReset callback failed for instance %u: %d\n",
- instance_id, ret);
- goto exit;
- }
- }
- }
+ if (sdma_instance->funcs->stop_kernel_queue)
+ sdma_instance->funcs->stop_kernel_queue(sdma_gfx_ring);
/* Perform the SDMA reset for the specified instance */
ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
@@ -609,18 +593,8 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
goto exit;
}
- /* Invoke all registered post_reset callbacks */
- list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
- if (funcs->post_reset) {
- ret = funcs->post_reset(adev, instance_id);
- if (ret) {
- dev_err(adev->dev,
- "afterReset callback failed for instance %u: %d\n",
- instance_id, ret);
- goto exit;
- }
- }
- }
+ if (sdma_instance->funcs->start_kernel_queue)
+ sdma_instance->funcs->start_kernel_queue(sdma_gfx_ring);
exit:
/* Restart the scheduler's work queue for the GFX and page rings
@@ -628,11 +602,11 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
* to be submitted to the queues after the reset is complete.
*/
if (!ret) {
- if (gfx_sched_stopped && amdgpu_ring_sched_ready(gfx_ring)) {
- drm_sched_wqueue_start(&gfx_ring->sched);
+ if (gfx_sched_stopped && amdgpu_ring_sched_ready(sdma_gfx_ring)) {
+ drm_sched_wqueue_start(&sdma_gfx_ring->sched);
}
- if (page_sched_stopped && amdgpu_ring_sched_ready(page_ring)) {
- drm_sched_wqueue_start(&page_ring->sched);
+ if (page_sched_stopped && amdgpu_ring_sched_ready(sdma_page_ring)) {
+ drm_sched_wqueue_start(&sdma_page_ring->sched);
}
}
mutex_unlock(&sdma_instance->engine_reset_mutex);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 47d56fd0589f..620fd7663526 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -50,6 +50,11 @@ enum amdgpu_sdma_irq {
#define NUM_SDMA(x) hweight32(x)
+struct amdgpu_sdma_funcs {
+ int (*stop_kernel_queue)(struct amdgpu_ring *ring);
+ int (*start_kernel_queue)(struct amdgpu_ring *ring);
+};
+
struct amdgpu_sdma_instance {
/* SDMA firmware */
const struct firmware *fw;
@@ -68,7 +73,7 @@ struct amdgpu_sdma_instance {
/* track guilty state of GFX and PAGE queues */
bool gfx_guilty;
bool page_guilty;
-
+ const struct amdgpu_sdma_funcs *funcs;
};
enum amdgpu_sdma_ras_memory_id {
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 688a720bbbbd..c663c63485f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -108,6 +108,8 @@ static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev);
+static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring);
+static int sdma_v4_4_2_restore_queue(struct amdgpu_ring *ring);
static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
u32 instance, u32 offset)
@@ -1333,6 +1335,11 @@ static bool sdma_v4_4_2_fw_support_paging_queue(struct amdgpu_device *adev)
}
}
+static const struct amdgpu_sdma_funcs sdma_v4_4_2_sdma_funcs = {
+ .stop_kernel_queue = &sdma_v4_4_2_stop_queue,
+ .start_kernel_queue = &sdma_v4_4_2_restore_queue,
+};
+
static int sdma_v4_4_2_early_init(struct amdgpu_ip_block *ip_block)
{
struct amdgpu_device *adev = ip_block->adev;
@@ -1352,7 +1359,6 @@ static int sdma_v4_4_2_early_init(struct amdgpu_ip_block *ip_block)
sdma_v4_4_2_set_irq_funcs(adev);
sdma_v4_4_2_set_ras_funcs(adev);
sdma_v4_4_2_set_engine_reset_funcs(adev);
-
return 0;
}
@@ -1447,6 +1453,7 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
/* Initialize guilty flags for GFX and PAGE queues */
adev->sdma.instance[i].gfx_guilty = false;
adev->sdma.instance[i].page_guilty = false;
+ adev->sdma.instance[i].funcs = &sdma_v4_4_2_sdma_funcs;
ring = &adev->sdma.instance[i].ring;
ring->ring_obj = NULL;
@@ -1678,11 +1685,12 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
return r;
}
-static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_id)
+static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring)
{
+ struct amdgpu_device *adev = ring->adev;
+ u32 instance_id = GET_INST(SDMA0, ring->me);
u32 inst_mask;
uint64_t rptr;
- struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
if (amdgpu_sriov_vf(adev))
return -EINVAL;
@@ -1715,11 +1723,11 @@ static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_
return 0;
}
-static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t instance_id)
+static int sdma_v4_4_2_restore_queue(struct amdgpu_ring *ring)
{
- int i;
+ struct amdgpu_device *adev = ring->adev;
u32 inst_mask;
- struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
+ int i;
inst_mask = 1 << ring->me;
udelay(50);
@@ -1740,8 +1748,6 @@ static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t instan
}
static struct sdma_on_reset_funcs sdma_v4_4_2_engine_reset_funcs = {
- .pre_reset = sdma_v4_4_2_stop_queue,
- .post_reset = sdma_v4_4_2_restore_queue,
};
static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev)
--
2.25.1