[PATCH 21/29] drm/amdkfd: add debug wave launch mode operation

Felix Kuehling felix.kuehling at amd.com
Thu Dec 1 00:02:38 UTC 2022


On 2022-10-31 12:23, Jonathan Kim wrote:
> Allow the debugger to set wave behaviour on to either normally operate,
> halt at launch, trap on every instruction, terminate immediately or
> stall on allocation.
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> ---
>   .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  | 18 ++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  1 +
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c    | 27 +++++++++++++++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h    |  3 ++
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |  3 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 34 +++++++++++++++++++
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  3 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  3 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 27 ++++++++++++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_debug.h        |  5 ++-
>   10 files changed, 119 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> index a5003f6f05bf..91c7fdee883e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
> @@ -99,6 +99,23 @@ static uint32_t kgd_aldebaran_set_wave_launch_trap_override(struct amdgpu_device
>   	return data;
>   }
>   
> +static uint32_t kgd_aldebaran_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +	bool is_stall_mode = wave_launch_mode == KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT;
> +
> +	if (is_stall_mode)
> +		data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, STALL_VMID,
> +									1);
> +	else
> +		data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, LAUNCH_MODE,
> +							wave_launch_mode);
> +
> +	return data;
> +}
> +
>   const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>   	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> @@ -120,6 +137,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
>   	.disable_debug_trap = kgd_aldebaran_disable_debug_trap,
>   	.validate_trap_override_request = kgd_aldebaran_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_aldebaran_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index b3682758184f..10470f4a4eaf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -399,6 +399,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
>   	.disable_debug_trap = kgd_arcturus_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 32a6e5fbeacd..66a83e6fb9e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -854,6 +854,32 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +	bool is_stall_mode = wave_launch_mode == KFD_DBG_TRAP_WAVE_LAUNCH_MODE_STALL;
> +	bool is_mode_set = wave_launch_mode && !is_stall_mode;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v10_set_wave_launch_stall(adev, vmid, true);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +			VMID_MASK, is_mode_set ? 1 << vmid : 0);
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +			MODE, is_mode_set ? wave_launch_mode : 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
> +
> +	if (!is_stall_mode)
> +		kgd_gfx_v10_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -941,6 +967,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
>   	.program_trap_handler_settings = program_trap_handler_settings,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> index 85c929fc2926..34c04a2bb83b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
> @@ -36,6 +36,9 @@ uint32_t kgd_gfx_v10_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   					     uint32_t trap_mask_request,
>   					     uint32_t *trap_mask_prev,
>   					     uint32_t kfd_dbg_trap_cntl_prev);
> +uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,
> +					 uint8_t wave_launch_mode,
> +					 uint32_t vmid);
>   void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, uint32_t *wait_times);
>   void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
>   					       uint32_t wait_times,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> index ae3ead207df4..8627c5458973 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
> @@ -675,6 +675,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
>   	.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
>   	.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,
> -	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override
> +	.set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode
>   
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index cb0044bbfae5..3bba7ca21926 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -777,6 +777,39 @@ uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   	return 0;
>   }
>   
> +uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid)
> +{
> +	uint32_t data = 0;
> +	bool is_stall_mode = wave_launch_mode ==
> +				KFD_DBG_TRAP_WAVE_LAUNCH_MODE_STALL;
> +	bool is_mode_set = wave_launch_mode && !is_stall_mode;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);
> +
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +		VMID_MASK, is_mode_set ? 1 << vmid : 0);
> +	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
> +		MODE, is_mode_set ? wave_launch_mode : 0);
> +	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);
> +
> +	/* Although Pre-GFX9.4.1 stalls globally, the per-VMID stall for
> +	 * GFX9.4.1 effectively does the same thing as global STALL_RA as
> +	 * all other VMID allocations are back logged by the stalled VMID.
> +	 *
> +	 * Use with caution.
> +	 */

This is potentially problematic. Discussing this with the debugger team, 
it turned out that they're not actually using stall mode. So we should 
remove it from the code and the API.

Other than that, this patch looks good to me.

Regards,
   Felix


> +	if (!is_stall_mode)
> +		kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);
> +
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return 0;
> +}
> +
>   /* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
>    * The values read are:
>    *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
> @@ -1047,6 +1080,7 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
>   	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
>   	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
>   	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
> +	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
>   	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
>   	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
>   	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index 47cff392b434..2a2ab42037e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -67,6 +67,9 @@ uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
>   int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
>   					     uint32_t trap_override,
>   					     uint32_t *trap_mask_supported);
> +uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
> +					uint8_t wave_launch_mode,
> +					uint32_t vmid);
>   uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
>   					     uint32_t vmid,
>   					     uint32_t trap_override,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 1f0ee2413b13..63665279ce4d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2899,6 +2899,9 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
>   				&args->launch_override.support_request_mask);
>   		break;
>   	case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
> +		r = kfd_dbg_trap_set_wave_launch_mode(target,
> +				args->launch_mode.launch_mode);
> +		break;
>   	case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
>   	case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
>   	case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> index 8add359d1cb9..210851f2cdb3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> @@ -239,8 +239,10 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
>   {
>   	int i, count = 0;
>   
> -	if (!unwind)
> +	if (!unwind) {
>   		cancel_work_sync(&target->debug_event_workarea);
> +		kfd_dbg_trap_set_wave_launch_mode(target, 0);
> +	}
>   
>   	for (i = 0; i < target->n_pdds; i++) {
>   		struct kfd_process_device *pdd = target->pdds[i];
> @@ -507,6 +509,29 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
>   	return r;
>   }
>   
> +int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
> +					uint8_t wave_launch_mode)
> +{
> +	int r = 0, i;
> +
> +	for (i = 0; i < target->n_pdds; i++) {
> +		struct kfd_process_device *pdd = target->pdds[i];
> +
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
> +		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
> +				pdd->dev->adev,
> +				wave_launch_mode,
> +				pdd->dev->vm_info.last_vmid_kfd);
> +		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
> +
> +		r = debug_refresh_runlist(pdd->dev->dqm);
> +		if (r)
> +			break;
> +	}
> +
> +	return r;
> +}
> +
>   void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
>   					uint64_t exception_set_mask)
>   {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> index b54a50a5d310..ca3ab1f01985 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h
> @@ -25,9 +25,6 @@
>   
>   #include "kfd_priv.h"
>   
> -void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
> -					uint32_t vmid,
> -					bool stall);
>   void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count);
>   int kfd_dbg_trap_activate(struct kfd_process *target);
>   bool kfd_set_dbg_ev_from_interrupt(struct kfd_dev *dev,
> @@ -51,6 +48,8 @@ int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
>   					uint32_t trap_mask_request,
>   					uint32_t *trap_mask_prev,
>   					uint32_t *trap_mask_supported);
> +int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
> +					uint8_t wave_launch_mode);
>   
>   int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
>   					unsigned int dev_id,


More information about the amd-gfx mailing list