[PATCH 2/2] drm/amdgpu: add work function for GPU reset event

Tue Mar 8 17:20:07 UTC 2022

On 3/8/2022 10:00 PM, Sharma, Shashank wrote:
> Hello Andrey
>
> On 3/8/2022 5:26 PM, Andrey Grodzovsky wrote:
>>
>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>> From: Shashank Sharma <shashank.sharma at amd.com>
>>>
>>> This patch adds a work function, which will get scheduled
>>> in the event of a GPU reset, and will send a uevent to userspace with
>>> some reset context information, like a PID and some flags.
>>
>>
>> Where is the actual scheduling of the work function ? Shouldn't
>> there be a patch for that too ?
>>
>
> Yes, Amar is working on that patch, on top of these patches. They 
> should be out soon. I thought it was a good idea to get quick feedback 
> on the basic patches before we build something on top of it.
>
schedule_work() will be called in the function amdgpu_do_asic_reset(),
after getting the vram_lost info:

vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

We will then update amdgpu_reset_event_ctx with the following and call
schedule_work():

  * vram_lost
  * reset_context->job->vm->task_info.process_name
  * reset_context->job->vm->task_info.pid

Regards,
S.Amarnath
> - Shashank
>
>> Andrey
>>
>>
>>>
>>> The userspace can do some recovery and post-processing work
>>> based on this event.
>>>
>>> V2:
>>> - Changed the name of the work to gpu_reset_event_work
>>>    (Christian)
>>> - Added a structure to accommodate some additional information
>>>    (like a PID and some flags)
>>>
>>> Cc: Alexander Deucher <alexander.deucher at amd.com>
>>> Cc: Christian Koenig <christian.koenig at amd.com>
>>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>>>   2 files changed, 26 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index d8b854fcbffa..7df219fe363f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -813,6 +813,11 @@ struct amd_powerplay {
>>>   #define AMDGPU_RESET_MAGIC_NUM 64
>>>   #define AMDGPU_MAX_DF_PERFMONS 4
>>>   #define AMDGPU_PRODUCT_NAME_LEN 64
>>> +struct amdgpu_reset_event_ctx {
>>> +    uint64_t pid;
>>> +    uint32_t flags;
>>> +};
>>> +
>>>   struct amdgpu_device {
>>>       struct device            *dev;
>>>       struct pci_dev            *pdev;
>>> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>>>       int asic_reset_res;
>>>       struct work_struct        xgmi_reset_work;
>>> +    struct work_struct        gpu_reset_event_work;
>>>       struct list_head        reset_list;
>>>       long                gfx_timeout;
>>> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>>>       pci_channel_state_t        pci_channel_state;
>>>       struct amdgpu_reset_control     *reset_cntl;
>>> +    struct amdgpu_reset_event_ctx   reset_event_ctx;
>>>       uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>>       bool                ram_is_direct_mapped;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index ed077de426d9..c43d099da06d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -73,6 +73,7 @@
>>>   #include <linux/pm_runtime.h>
>>>   #include <drm/drm_drv.h>
>>> +#include <drm/drm_sysfs.h>
>>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct 
>>> amdgpu_device *adev)
>>>       return amdgpu_device_asic_has_dc_support(adev->asic_type);
>>>   }
>>> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
>>> +{
>>> +    struct amdgpu_device *adev = container_of(__work, struct 
>>> amdgpu_device,
>>> +                          gpu_reset_event_work);
>>> +    struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
>>> +
>>> +    /*
>>> +     * A GPU reset has happened, indicate the userspace and pass the
>>> +     * following information:
>>> +     *    - pid of the process involved,
>>> +     *    - if the VRAM is valid or not,
>>> +     *    - indicate that userspace may want to collect the ftrace 
>>> event
>>> +     * data from the trace event.
>>> +     */
>>> +    drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, 
>>> event_ctx->flags);
>>> +}
>>> +
>>>   static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>>>   {
>>>       struct amdgpu_device *adev =
>>> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device 
>>> *adev,
>>>                 amdgpu_device_delay_enable_gfx_off);
>>>       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>>> +    INIT_WORK(&adev->gpu_reset_event_work, 
>>> amdgpu_device_reset_event_func);
>>>       adev->gfx.gfx_off_req_count = 1;
>>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20220308/0376ec07/attachment-0001.htm>