[PATCH 25/32] drm/amdkfd: add debug suspend and resume process queues operation

Felix Kuehling felix.kuehling at amd.com
Tue Mar 21 22:16:03 UTC 2023


On 2023-01-25 14:53, Jonathan Kim wrote:
> In order to inspect waves from the saved context at any point during a
> debug session, the debugger must be able to preempt queues to trigger
> context save by suspending them.
>
> On queue suspend, the KFD will copy the context save header information
> so that the debugger can correctly crawl the appropriate size of the saved
> context. The debugger must then also be allowed to resume suspended queues.
>
> A queue that is newly created cannot be suspended because queue ids are
> recycled after destruction so the debugger needs to know that this has
> occurred.  Query functions will be later added that will clear a given
> queue of its new queue status.
>
> A queue cannot be destroyed while it is suspended to preserve its saved
> context during debugger inspection.  Have queue destruction block while
> a queue is suspended and unblocked when it is resumed.  Likewise, if a
> queue is about to be destroyed, it cannot be suspended.
>
> Return the number of queues successfully suspended or resumed along with
> a per queue status array where the upper bits per queue status show that
> the request was invalid (new/destroyed queue suspend request, missing
> queue) or an error occurred (HWS in a fatal state so it can't suspend or
> resume queues).
>
> v2: add gfx11/mes support.
> prevent header copy on suspend from overwriting user fields.
> simplify resume_queues function.
> address other nit-picks
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |   5 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |   1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  11 +
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        |   7 +
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 446 +++++++++++++++++-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  10 +
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  14 +
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  11 +-
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  18 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   5 +-
>   10 files changed, 518 insertions(+), 10 deletions(-)
>
[snip]
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> index 50da16dd4c96..047c43418a1a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> @@ -288,6 +288,11 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
>   			  u32 *save_area_used_size)
>   {
>   	struct v9_mqd *m;
> +	struct kfd_context_save_area_header header;
> +	size_t header_copy_size = sizeof(header.control_stack_size) +
> +		sizeof(header.wave_state_size) +
> +		sizeof(header.wave_state_offset) +
> +		sizeof(header.control_stack_offset);

This makes assumptions about the structure layout. I'd feel better if 
these fields were in a sub-structure, which would make this easier and 
safer to handle.

struct kfd_context_save_area_header {
	struct {
		__u32 control_stack_offset;
		__u32 control_stack_size;
		__u32 wave_state_offset;
		__u32 wave_state_size;
	} wave_state;
	...
};

...

|static int get_wave_state(...) { struct kfd_context_save_area_header 
header; ... header.wave_state.control_stack_size = *ctl_stack_used_size; 
header.wave_state.wave_state_size = *save_area_used_size; 
header.wave_state.wave_state_offset = m->cp_hqd_wg_state_offset; 
header.wave_state.control_stack_offset = m->cp_hqd_cntl_stack_offset; if 
(copy_to_user(ctl_stack, &header.wave_state, sizeof(header.wave_state))) 
return -EFAULT; ... } |

This way you're sure you only copy initialized data. The only assumption 
this still makes is, that wave_state is at the start of the header 
structure.

Regards,
   Felix


>   
>   	/* Control stack is located one page after MQD. */
>   	void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
> @@ -299,7 +304,18 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
>   	*save_area_used_size = m->cp_hqd_wg_state_offset -
>   		m->cp_hqd_cntl_stack_size;
>   
> -	if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
> +	header.control_stack_size = *ctl_stack_used_size;
> +	header.wave_state_size = *save_area_used_size;
> +
> +	header.wave_state_offset = m->cp_hqd_wg_state_offset;
> +	header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
> +
> +	if (copy_to_user(ctl_stack, &header, header_copy_size))
> +		return -EFAULT;
> +
> +	if (copy_to_user(ctl_stack + m->cp_hqd_cntl_stack_offset,
> +				mqd_ctl_stack + m->cp_hqd_cntl_stack_offset,
> +				*ctl_stack_used_size))
>   		return -EFAULT;
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 6f7dc23af104..8dc7cc1e18a5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -477,6 +477,8 @@ struct queue_properties {
>   	uint32_t doorbell_off;
>   	bool is_interop;
>   	bool is_evicted;
> +	bool is_suspended;
> +	bool is_being_destroyed;
>   	bool is_active;
>   	bool is_gws;
>   	bool is_dbg_wa;
> @@ -501,7 +503,8 @@ struct queue_properties {
>   #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
>   			    (q).queue_address != 0 &&	\
>   			    (q).queue_percent > 0 &&	\
> -			    !(q).is_evicted)
> +			    !(q).is_evicted &&		\
> +			    !(q).is_suspended)
>   
>   enum mqd_update_flag {
>   	UPDATE_FLAG_DBG_WA_ENABLE = 1,


More information about the amd-gfx mailing list