[PATCH 2/2] drm/amdgpu: add work function for GPU reset event
Andrey Grodzovsky
andrey.grodzovsky at amd.com
Tue Mar 8 18:53:41 UTC 2022
On 2022-03-08 12:20, Somalapuram, Amaranath wrote:
>
>
> On 3/8/2022 10:00 PM, Sharma, Shashank wrote:
>> Hello Andrey
>>
>> On 3/8/2022 5:26 PM, Andrey Grodzovsky wrote:
>>>
>>> On 2022-03-07 11:26, Shashank Sharma wrote:
>>>> From: Shashank Sharma <shashank.sharma at amd.com>
>>>>
>>>> This patch adds a work function, which will get scheduled
>>>> in the event of a GPU reset, and will send a uevent to userspace with
>>>> some reset context information, like a PID and some flags.
>>>
>>>
>>> Where is the actual scheduling of the work function? Shouldn't
>>> there be a patch for that too?
>>>
>>
>> Yes, Amar is working on that patch, on top of these patches. They
>> should be out soon. I thought it was a good idea to get quick
>> feedback on the basic patches before we build something on top of them.
>>
> schedule_work() will be called in the function amdgpu_do_asic_reset()
>
I didn't follow the requirements closely, so I don't know, but what about
a job timeout that was soft recovered - do you need to cover this case
too? Or is there no need to restart the user application in that case,
and hence you don't care?

Andrey
> after getting vram_lost info:
>
> vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
>
> update amdgpu_reset_event_ctx and call schedule_work()
>
> * vram_lost
> * reset_context->job->vm->task_info.process_name
> * reset_context->job->vm->task_info.pid
>
> Regards,
> S.Amarnath
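For reference, a rough sketch of what that could look like inside
amdgpu_do_asic_reset(), following the description above (only a sketch
against this series; AMDGPU_RESET_VRAM_LOST is a placeholder flag name
here, not something these patches define):

	vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

	/* Sketch only: fill the reset event context from the guilty job's
	 * task info (when available) and kick the uevent work item. */
	if (reset_context->job && reset_context->job->vm)
		tmp_adev->reset_event_ctx.pid =
			reset_context->job->vm->task_info.pid;

	tmp_adev->reset_event_ctx.flags =
		vram_lost ? AMDGPU_RESET_VRAM_LOST : 0;

	schedule_work(&tmp_adev->gpu_reset_event_work);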
>> - Shashank
>>
>>> Andrey
>>>
>>>
>>>>
>>>> Userspace can do some recovery and post-processing work
>>>> based on this event.
>>>>
>>>> V2:
>>>> - Changed the name of the work to gpu_reset_event_work
>>>> (Christian)
>>>> - Added a structure to accommodate some additional information
>>>> (like a PID and some flags)
>>>>
>>>> Cc: Alexander Deucher <alexander.deucher at amd.com>
>>>> Cc: Christian Koenig <christian.koenig at amd.com>
>>>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>>>> ---
>>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +++++++
>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>>>> 2 files changed, 26 insertions(+)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index d8b854fcbffa..7df219fe363f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -813,6 +813,11 @@ struct amd_powerplay {
>>>> #define AMDGPU_RESET_MAGIC_NUM 64
>>>> #define AMDGPU_MAX_DF_PERFMONS 4
>>>> #define AMDGPU_PRODUCT_NAME_LEN 64
>>>> +struct amdgpu_reset_event_ctx {
>>>> +	uint64_t pid;
>>>> +	uint32_t flags;
>>>> +};
>>>> +
>>>> struct amdgpu_device {
>>>> struct device *dev;
>>>> struct pci_dev *pdev;
>>>> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>>>> int asic_reset_res;
>>>> struct work_struct xgmi_reset_work;
>>>> + struct work_struct gpu_reset_event_work;
>>>> struct list_head reset_list;
>>>> long gfx_timeout;
>>>> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>>>> pci_channel_state_t pci_channel_state;
>>>> struct amdgpu_reset_control *reset_cntl;
>>>> + struct amdgpu_reset_event_ctx reset_event_ctx;
>>>> uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>>> bool ram_is_direct_mapped;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index ed077de426d9..c43d099da06d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -73,6 +73,7 @@
>>>> #include <linux/pm_runtime.h>
>>>> #include <drm/drm_drv.h>
>>>> +#include <drm/drm_sysfs.h>
>>>> MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>>> MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>>>> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
>>>>  	return amdgpu_device_asic_has_dc_support(adev->asic_type);
>>>>  }
>>>>
>>>> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
>>>> +{
>>>> +	struct amdgpu_device *adev = container_of(__work, struct amdgpu_device,
>>>> +						  gpu_reset_event_work);
>>>> +	struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
>>>> +
>>>> +	/*
>>>> +	 * A GPU reset has happened, notify userspace and pass the
>>>> +	 * following information:
>>>> +	 * - pid of the process involved,
>>>> +	 * - whether the VRAM is still valid or not,
>>>> +	 * - indicate that userspace may want to collect the ftrace event
>>>> +	 *   data from the trace event.
>>>> +	 */
>>>> +	drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, event_ctx->flags);
>>>> +}
>>>> +
>>>>  static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>>>>  {
>>>>  	struct amdgpu_device *adev =
>>>> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>  			  amdgpu_device_delay_enable_gfx_off);
>>>>
>>>>  	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>>>> +	INIT_WORK(&adev->gpu_reset_event_work, amdgpu_device_reset_event_func);
>>>>
>>>>  	adev->gfx.gfx_off_req_count = 1;
>>>>  	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
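On the "userspace can do some recovery and post-processing" part of the
commit message, below is a minimal sketch of what a listener for this
uevent could look like with libudev. The "PID" and "FLAGS" property names
are assumptions here; the real keys are whatever drm_sysfs_reset_event()
from patch 1/2 puts into the event (build with -ludev):

#include <libudev.h>
#include <poll.h>
#include <stdio.h>

int main(void)
{
	struct udev *udev = udev_new();
	struct udev_monitor *mon =
		udev_monitor_new_from_netlink(udev, "udev");

	/* Only look at events coming from DRM devices. */
	udev_monitor_filter_add_match_subsystem_devtype(mon, "drm", NULL);
	udev_monitor_enable_receiving(mon);

	struct pollfd pfd = {
		.fd = udev_monitor_get_fd(mon),
		.events = POLLIN,
	};

	while (poll(&pfd, 1, -1) > 0) {
		struct udev_device *dev = udev_monitor_receive_device(mon);
		const char *pid, *flags;

		if (!dev)
			continue;

		/* Property names are assumed, see the note above. */
		pid = udev_device_get_property_value(dev, "PID");
		flags = udev_device_get_property_value(dev, "FLAGS");
		if (pid)
			printf("GPU reset: pid=%s flags=%s\n",
			       pid, flags ? flags : "0");

		udev_device_unref(dev);
	}

	udev_monitor_unref(mon);
	udev_unref(udev);
	return 0;
}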