[PATCH 1/1] drm/amdkfd: Consistently apply noretry setting

Kuehling, Felix Felix.Kuehling at amd.com
Thu Jul 4 16:02:23 UTC 2019


On 2019-07-03 6:19 p.m., Yang, Philip wrote:
> amdgpu_noretry default value is 0, this will generate VM fault storm
> because the vm fault is not recovered. It may slow down the machine and
> need reboot after application VM fault. Maybe change default value to 1?

This is the same as the current behaviour. My change doesn't modify the 
default behaviour, it only makes it configurable in a sensible way.

If we want to change the default, I'd do that in a separate commit.

Regards,
   Felix

>
> Other than that, this is reviewed by Philip Yang <Philip.Yang at amd.com>
>
> On 2019-07-02 3:05 p.m., Kuehling, Felix wrote:
>> Ping.
>>
>> Christian, Philip, any opinion about this patch?
>>
>> On 2019-06-21 8:20 p.m., Kuehling, Felix wrote:
>>> Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This
>>> makes the noretry param no longer KFD-specific. On GFX10 I'm not
>>> changing SH_MEM_CONFIG in this commit because GFX10 has different
>>> retry behaviour in the SQ and I don't have a way to test it at the
>>> moment.
>>>
>>> Suggested-by: Christian König <Christian.Koenig at amd.com>
>>> CC: Philip Yang <Philip.Yang at amd.com>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h              |  1 +
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 16 +++++-----------
>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c            |  4 ++++
>>>     drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c         |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c         |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c          |  3 ++-
>>>     drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c          |  3 ++-
>>>     .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c |  2 +-
>>>     drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  2 +-
>>>     9 files changed, 20 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 9b1efdf94bdf..05875279c09e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -164,6 +164,7 @@ extern int amdgpu_async_gfx_ring;
>>>     extern int amdgpu_mcbp;
>>>     extern int amdgpu_discovery;
>>>     extern int amdgpu_mes;
>>> +extern int amdgpu_noretry;
>>>     
>>>     #ifdef CONFIG_DRM_AMDGPU_SI
>>>     extern int amdgpu_si_support;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index 7cf6ab07b113..0d578d95be93 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -140,6 +140,7 @@ int amdgpu_async_gfx_ring = 1;
>>>     int amdgpu_mcbp = 0;
>>>     int amdgpu_discovery = 0;
>>>     int amdgpu_mes = 0;
>>> +int amdgpu_noretry;
>>>     
>>>     struct amdgpu_mgpu_info mgpu_info = {
>>>     	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
>>> @@ -591,6 +592,10 @@ MODULE_PARM_DESC(mes,
>>>     	"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");
>>>     module_param_named(mes, amdgpu_mes, int, 0444);
>>>     
>>> +MODULE_PARM_DESC(noretry,
>>> +	"Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");
>>> +module_param_named(noretry, amdgpu_noretry, int, 0644);
>>> +
>>>     #ifdef CONFIG_HSA_AMD
>>>     /**
>>>      * DOC: sched_policy (int)
>>> @@ -666,17 +671,6 @@ module_param(ignore_crat, int, 0444);
>>>     MODULE_PARM_DESC(ignore_crat,
>>>     	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");
>>>     
>>> -/**
>>> - * DOC: noretry (int)
>>> - * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry.
>>> - * Setting 1 disables retry.
>>> - * Retry is needed for recoverable page faults.
>>> - */
>>> -int noretry;
>>> -module_param(noretry, int, 0644);
>>> -MODULE_PARM_DESC(noretry,
>>> -	"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");
>>> -
>>>     /**
>>>      * DOC: halt_if_hws_hang (int)
>>>      * Halt if HWS hang is detected. Default value, 0, disables the halt on hang.
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index e0f3014e76ea..c4e715170bfe 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -1938,11 +1938,15 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev)
>>>     		if (i == 0) {
>>>     			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,
>>>     					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);
>>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,
>>> +					    !!amdgpu_noretry);
>>>     			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);
>>>     			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0);
>>>     		} else {
>>>     			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,
>>>     					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);
>>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,
>>> +					    !!amdgpu_noretry);
>>>     			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);
>>>     			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,
>>>     				(adev->gmc.private_aperture_start >> 48));
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
>>> index 9f0f189fc111..15986748f59f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
>>> @@ -236,7 +236,8 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)
>>>     				    block_size);
>>>     		/* Send no-retry XNACK on fault to suppress VM fault storm. */
>>>     		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);
>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
>>> +				    !amdgpu_noretry);
>>>     		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);
>>>     		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
>>>     		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
>>> index b7de60a15623..d605b4963f8a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
>>> @@ -215,7 +215,8 @@ static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)
>>>     				adev->vm_manager.block_size - 9);
>>>     		/* Send no-retry XNACK on fault to suppress VM fault storm. */
>>>     		tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,
>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);
>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
>>> +				    !amdgpu_noretry);
>>>     		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp);
>>>     		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
>>>     		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
>>> index 05d1d448c8f5..dc5ce03034d3 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
>>> @@ -265,7 +265,8 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)
>>>     				    block_size);
>>>     		/* Send no-retry XNACK on fault to suppress VM fault storm. */
>>>     		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);
>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
>>> +				    !amdgpu_noretry);
>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);
>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
>>> index 37a1a318ae63..0f9549f19ade 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
>>> @@ -205,7 +205,8 @@ static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)
>>>     				    adev->vm_manager.block_size - 9);
>>>     		/* Send no-retry XNACK on fault to suppress VM fault storm. */
>>>     		tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL,
>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);
>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
>>> +				    !amdgpu_noretry);
>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp);
>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
>>> index e9fe39382371..95a82ac455f2 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
>>> @@ -61,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,
>>>     		qpd->sh_mem_config =
>>>     				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
>>>     					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
>>> -		if (noretry &&
>>> +		if (amdgpu_noretry &&
>>>     		    !dqm->dev->device_info->needs_iommu_device)
>>>     			qpd->sh_mem_config |=
>>>     				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index d4bba0124d29..aa7bf20d20f8 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -157,7 +157,7 @@ extern int ignore_crat;
>>>     /*
>>>      * Set sh_mem_config.retry_disable on Vega10
>>>      */
>>> -extern int noretry;
>>> +extern int amdgpu_noretry;
>>>     
>>>     /*
>>>      * Halt if HWS hang is detected


More information about the amd-gfx mailing list