<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<meta name="Generator" content="Microsoft Exchange Server">
<!-- converted from text --><style><!-- .EmailQuote { margin-left: 1pt; padding-left: 4pt; border-left: #800000 2px solid; } --></style>
</head>
<body>
<div>
<div dir="auto">Patches to silence retry faults are already on the mailing list.
<div dir="auto"><br>
</div>
<div dir="auto">But I need to get back from vacation to take a closer look at this once more.</div>
<div dir="auto"><br>
</div>
<div dir="auto">Christian.</div>
</div>
<div class="x_gmail_extra"><br>
<div class="x_gmail_quote">Am 04.07.2019 00:19 schrieb "Yang, Philip" &lt;Philip.Yang@amd.com&gt;:<br type="attribution">
</div>
</div>
</div>
<font size="2"><span style="font-size:11pt;">
<div class="PlainText">The amdgpu_noretry default value is 0, which will generate a VM fault storm
<br>
because the VM fault is not recovered. It may slow down the machine and <br>
require a reboot after an application VM fault. Maybe change the default value to 1?<br>
<br>
Other than that, this is reviewed by Philip Yang &lt;Philip.Yang@amd.com&gt;<br>
<br>
On 2019-07-02 3:05 p.m., Kuehling, Felix wrote:<br>
> Ping.<br>
> <br>
> Christian, Philip, any opinion about this patch?<br>
> <br>
> On 2019-06-21 8:20 p.m., Kuehling, Felix wrote:<br>
>> Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This<br>
>> makes the noretry param no longer KFD-specific. On GFX10 I'm not<br>
>> changing SH_MEM_CONFIG in this commit because GFX10 has different<br>
>> retry behaviour in the SQ and I don't have a way to test it at the<br>
>> moment.<br>
>><br>
>> Suggested-by: Christian König &lt;Christian.Koenig@amd.com&gt;<br>
>> CC: Philip Yang &lt;Philip.Yang@amd.com&gt;<br>
>> Signed-off-by: Felix Kuehling &lt;Felix.Kuehling@amd.com&gt;<br>
>> ---<br>
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h              |  1 +<br>
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 16 +++++-----------<br>
>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c            |  4 ++++<br>
>>    drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c         |  3 ++-<br>
>>    drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c         |  3 ++-<br>
>>    drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c          |  3 ++-<br>
>>    drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c          |  3 ++-<br>
>>    .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c |  2 +-<br>
>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  2 +-<br>
>>    9 files changed, 20 insertions(+), 17 deletions(-)<br>
>><br>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
>> index 9b1efdf94bdf..05875279c09e 100644<br>
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
>> @@ -164,6 +164,7 @@ extern int amdgpu_async_gfx_ring;<br>
>>    extern int amdgpu_mcbp;<br>
>>    extern int amdgpu_discovery;<br>
>>    extern int amdgpu_mes;<br>
>> +extern int amdgpu_noretry;<br>
>>    <br>
>>    #ifdef CONFIG_DRM_AMDGPU_SI<br>
>>    extern int amdgpu_si_support;<br>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c<br>
>> index 7cf6ab07b113..0d578d95be93 100644<br>
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c<br>
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c<br>
>> @@ -140,6 +140,7 @@ int amdgpu_async_gfx_ring = 1;<br>
>>    int amdgpu_mcbp = 0;<br>
>>    int amdgpu_discovery = 0;<br>
>>    int amdgpu_mes = 0;<br>
>> +int amdgpu_noretry;<br>
>>    <br>
>>    struct amdgpu_mgpu_info mgpu_info = {<br>
>>       .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),<br>
>> @@ -591,6 +592,10 @@ MODULE_PARM_DESC(mes,<br>
>>       "Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");<br>
>>    module_param_named(mes, amdgpu_mes, int, 0444);<br>
>>    <br>
>> +MODULE_PARM_DESC(noretry,<br>
>> +    "Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");<br>
>> +module_param_named(noretry, amdgpu_noretry, int, 0644);<br>
>> +<br>
>>    #ifdef CONFIG_HSA_AMD<br>
>>    /**<br>
>>     * DOC: sched_policy (int)<br>
>> @@ -666,17 +671,6 @@ module_param(ignore_crat, int, 0444);<br>
>>    MODULE_PARM_DESC(ignore_crat,<br>
>>       "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");<br>
>>    <br>
>> -/**<br>
>> - * DOC: noretry (int)<br>
>> - * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry.<br>
>> - * Setting 1 disables retry.<br>
>> - * Retry is needed for recoverable page faults.<br>
>> - */<br>
>> -int noretry;<br>
>> -module_param(noretry, int, 0644);<br>
>> -MODULE_PARM_DESC(noretry,<br>
>> -    "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");<br>
>> -<br>
>>    /**<br>
>>     * DOC: halt_if_hws_hang (int)<br>
>>     * Halt if HWS hang is detected. Default value, 0, disables the halt on hang.<br>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c<br>
>> index e0f3014e76ea..c4e715170bfe 100644<br>
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c<br>
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c<br>
>> @@ -1938,11 +1938,15 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev)<br>
>>               if (i == 0) {<br>
>>                       tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,<br>
>>                                           SH_MEM_ALIGNMENT_MODE_UNALIGNED);<br>
>> +                    tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,<br>
>> +                                        !!amdgpu_noretry);<br>
>>                       WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);<br>
>>                       WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0);<br>
>>               } else {<br>
>>                       tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,<br>
>>                                           SH_MEM_ALIGNMENT_MODE_UNALIGNED);<br>
>> +                    tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,<br>
>> +                                        !!amdgpu_noretry);<br>
>>                       WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);<br>
>>                       tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,<br>
>>                               (adev->gmc.private_aperture_start >> 48));<br>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c<br>
>> index 9f0f189fc111..15986748f59f 100644<br>
>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c<br>
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c<br>
>> @@ -236,7 +236,8 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)<br>
>>                                   block_size);<br>
>>               /* Send no-retry XNACK on fault to suppress VM fault storm. */<br>
>>               tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,<br>
>> -                                RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);<br>
>> +                                RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,<br>
>> +                                !amdgpu_noretry);<br>
>>               WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);<br>
>>               WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);<br>
>>               WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);<br>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c<br>
>> index b7de60a15623..d605b4963f8a 100644<br>
>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c<br>
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c<br>
>> @@ -215,7 +215,8 @@ static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)<br>
>>                               adev->vm_manager.block_size - 9);<br>
>>               /* Send no-retry XNACK on fault to suppress VM fault storm. */<br>
>>               tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,<br>
>> -                                RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);<br>
>> +                                RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,<br>
>> +                                !amdgpu_noretry);<br>
>>               WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp);<br>
>>               WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);<br>
>>               WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);<br>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c<br>
>> index 05d1d448c8f5..dc5ce03034d3 100644<br>
>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c<br>
>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c<br>
>> @@ -265,7 +265,8 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)<br>
>>                                   block_size);<br>
>>               /* Send no-retry XNACK on fault to suppress VM fault storm. */<br>
>>               tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,<br>
>> -                                RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);<br>
>> +                                RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,<br>
>> +                                !amdgpu_noretry);<br>
>>               WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);<br>
>>               WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);<br>
>>               WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);<br>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c<br>
>> index 37a1a318ae63..0f9549f19ade 100644<br>
>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c<br>
>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c<br>
>> @@ -205,7 +205,8 @@ static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)<br>
>>                                   adev->vm_manager.block_size - 9);<br>
>>               /* Send no-retry XNACK on fault to suppress VM fault storm. */<br>
>>               tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL,<br>
>> -                                RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);<br>
>> +                                RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,<br>
>> +                                !amdgpu_noretry);<br>
>>               WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp);<br>
>>               WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);<br>
>>               WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);<br>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c<br>
>> index e9fe39382371..95a82ac455f2 100644<br>
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c<br>
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c<br>
>> @@ -61,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,<br>
>>               qpd->sh_mem_config =<br>
>>                               SH_MEM_ALIGNMENT_MODE_UNALIGNED <<<br>
>>                                       SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;<br>
>> -            if (noretry &&<br>
>> +            if (amdgpu_noretry &&<br>
>>                   !dqm->dev->device_info->needs_iommu_device)<br>
>>                       qpd->sh_mem_config |=<br>
>>                               1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;<br>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h<br>
>> index d4bba0124d29..aa7bf20d20f8 100644<br>
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h<br>
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h<br>
>> @@ -157,7 +157,7 @@ extern int ignore_crat;<br>
>>    /*<br>
>>     * Set sh_mem_config.retry_disable on Vega10<br>
>>     */<br>
>> -extern int noretry;<br>
>> +extern int amdgpu_noretry;<br>
>>    <br>
>>    /*<br>
>>     * Halt if HWS hang is detected<br>
</div>
</span></font>
</body>
</html>