[PATCH 1/1] drm/amdkfd: Consistently apply noretry setting

Yang, Philip Philip.Yang at amd.com
Thu Jul 4 17:16:31 UTC 2019


On 2019-07-04 12:02 p.m., Kuehling, Felix wrote:
> On 2019-07-03 6:19 p.m., Yang, Philip wrote:
>> amdgpu_noretry default value is 0, this will generate VM fault storm
>> because the vm fault is not recovered. It may slow down the machine and
>> need reboot after application VM fault. Maybe change default value to 1?
> 
> This is the same as the current behaviour. My change doesn't modify the
> default behaviour, it only makes it configurable in a sensible way.
> 
> If we want to change the default, I'd do that in a separate commit.
>
I misunderstood VM_CONTEXT1_CNT/RETRY_PERMISSION_OR_INVALID_PAGE_FAULT 
change, yes, this is the same behavior. retry was already enabled by 
another patch "drm/amdgpu: re-enable retry faults" before. But I don't 
find the patch to silent retry storm on the mailing list.

Regards,
Philip

> Regards,
>     Felix
> 
>>
>> Other than that, this is reviewed by Philip Yang <Philip.Yang at amd.com>
>>
>> On 2019-07-02 3:05 p.m., Kuehling, Felix wrote:
>>> Ping.
>>>
>>> Christian, Philip, any opinion about this patch?
>>>
>>> On 2019-06-21 8:20 p.m., Kuehling, Felix wrote:
>>>> Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This
>>>> makes the noretry param no longer KFD-specific. On GFX10 I'm not
>>>> changing SH_MEM_CONFIG in this commit because GFX10 has different
>>>> retry behaviour in the SQ and I don't have a way to test it at the
>>>> moment.
>>>>
>>>> Suggested-by: Christian König <Christian.Koenig at amd.com>
>>>> CC: Philip Yang <Philip.Yang at amd.com>
>>>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu.h              |  1 +
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 16 +++++-----------
>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c            |  4 ++++
>>>>      drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c         |  3 ++-
>>>>      drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c         |  3 ++-
>>>>      drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c          |  3 ++-
>>>>      drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c          |  3 ++-
>>>>      .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c |  2 +-
>>>>      drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  2 +-
>>>>      9 files changed, 20 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index 9b1efdf94bdf..05875279c09e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -164,6 +164,7 @@ extern int amdgpu_async_gfx_ring;
>>>>      extern int amdgpu_mcbp;
>>>>      extern int amdgpu_discovery;
>>>>      extern int amdgpu_mes;
>>>> +extern int amdgpu_noretry;
>>>>      
>>>>      #ifdef CONFIG_DRM_AMDGPU_SI
>>>>      extern int amdgpu_si_support;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> index 7cf6ab07b113..0d578d95be93 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>>> @@ -140,6 +140,7 @@ int amdgpu_async_gfx_ring = 1;
>>>>      int amdgpu_mcbp = 0;
>>>>      int amdgpu_discovery = 0;
>>>>      int amdgpu_mes = 0;
>>>> +int amdgpu_noretry;
>>>>      
>>>>      struct amdgpu_mgpu_info mgpu_info = {
>>>>      	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
>>>> @@ -591,6 +592,10 @@ MODULE_PARM_DESC(mes,
>>>>      	"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");
>>>>      module_param_named(mes, amdgpu_mes, int, 0444);
>>>>      
>>>> +MODULE_PARM_DESC(noretry,
>>>> +	"Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");
>>>> +module_param_named(noretry, amdgpu_noretry, int, 0644);
>>>> +
>>>>      #ifdef CONFIG_HSA_AMD
>>>>      /**
>>>>       * DOC: sched_policy (int)
>>>> @@ -666,17 +671,6 @@ module_param(ignore_crat, int, 0444);
>>>>      MODULE_PARM_DESC(ignore_crat,
>>>>      	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");
>>>>      
>>>> -/**
>>>> - * DOC: noretry (int)
>>>> - * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry.
>>>> - * Setting 1 disables retry.
>>>> - * Retry is needed for recoverable page faults.
>>>> - */
>>>> -int noretry;
>>>> -module_param(noretry, int, 0644);
>>>> -MODULE_PARM_DESC(noretry,
>>>> -	"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");
>>>> -
>>>>      /**
>>>>       * DOC: halt_if_hws_hang (int)
>>>>       * Halt if HWS hang is detected. Default value, 0, disables the halt on hang.
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index e0f3014e76ea..c4e715170bfe 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -1938,11 +1938,15 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev)
>>>>      		if (i == 0) {
>>>>      			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,
>>>>      					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);
>>>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,
>>>> +					    !!amdgpu_noretry);
>>>>      			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);
>>>>      			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0);
>>>>      		} else {
>>>>      			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,
>>>>      					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);
>>>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,
>>>> +					    !!amdgpu_noretry);
>>>>      			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);
>>>>      			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,
>>>>      				(adev->gmc.private_aperture_start >> 48));
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
>>>> index 9f0f189fc111..15986748f59f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
>>>> @@ -236,7 +236,8 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)
>>>>      				    block_size);
>>>>      		/* Send no-retry XNACK on fault to suppress VM fault storm. */
>>>>      		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);
>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
>>>> +				    !amdgpu_noretry);
>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);
>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
>>>> index b7de60a15623..d605b4963f8a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
>>>> @@ -215,7 +215,8 @@ static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)
>>>>      				adev->vm_manager.block_size - 9);
>>>>      		/* Send no-retry XNACK on fault to suppress VM fault storm. */
>>>>      		tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,
>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);
>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
>>>> +				    !amdgpu_noretry);
>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp);
>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
>>>> index 05d1d448c8f5..dc5ce03034d3 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
>>>> @@ -265,7 +265,8 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)
>>>>      				    block_size);
>>>>      		/* Send no-retry XNACK on fault to suppress VM fault storm. */
>>>>      		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);
>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
>>>> +				    !amdgpu_noretry);
>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);
>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
>>>> index 37a1a318ae63..0f9549f19ade 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
>>>> @@ -205,7 +205,8 @@ static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)
>>>>      				    adev->vm_manager.block_size - 9);
>>>>      		/* Send no-retry XNACK on fault to suppress VM fault storm. */
>>>>      		tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL,
>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);
>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
>>>> +				    !amdgpu_noretry);
>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp);
>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
>>>> index e9fe39382371..95a82ac455f2 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
>>>> @@ -61,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,
>>>>      		qpd->sh_mem_config =
>>>>      				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
>>>>      					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
>>>> -		if (noretry &&
>>>> +		if (amdgpu_noretry &&
>>>>      		    !dqm->dev->device_info->needs_iommu_device)
>>>>      			qpd->sh_mem_config |=
>>>>      				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> index d4bba0124d29..aa7bf20d20f8 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>>> @@ -157,7 +157,7 @@ extern int ignore_crat;
>>>>      /*
>>>>       * Set sh_mem_config.retry_disable on Vega10
>>>>       */
>>>> -extern int noretry;
>>>> +extern int amdgpu_noretry;
>>>>      
>>>>      /*
>>>>       * Halt if HWS hang is detected


More information about the amd-gfx mailing list