[PATCH 2/2] drm/amdgpu: add work function for GPU reset event

Sharma, Shashank shashank.sharma at amd.com
Tue Mar 8 16:30:16 UTC 2022


Hello Andrey

On 3/8/2022 5:26 PM, Andrey Grodzovsky wrote:
> 
> On 2022-03-07 11:26, Shashank Sharma wrote:
>> From: Shashank Sharma <shashank.sharma at amd.com>
>>
>> This patch adds a work function, which will get scheduled
>> in the event of a GPU reset, and will send a uevent to userspace with
>> some reset context information, like a PID and some flags.
> 
> 
> Where is the actual scheduling of the work function ? Shouldn't
> there be a patch for that too ?
> 

Yes, Amar is working on that patch, on top of these patches. It should 
be out soon. I thought it was a good idea to get quick feedback on the 
basic patches before we build something on top of them.

- Shashank

> Andrey
> 
> 
>>
>> The userspace can do some recovery and post-processing work
>> based on this event.
>>
>> V2:
>> - Changed the name of the work to gpu_reset_event_work
>>    (Christian)
>> - Added a structure to accommodate some additional information
>>    (like a PID and some flags)
>>
>> Cc: Alexander Deucher <alexander.deucher at amd.com>
>> Cc: Christian Koenig <christian.koenig at amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  7 +++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++
>>   2 files changed, 26 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index d8b854fcbffa..7df219fe363f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -813,6 +813,11 @@ struct amd_powerplay {
>>   #define AMDGPU_RESET_MAGIC_NUM 64
>>   #define AMDGPU_MAX_DF_PERFMONS 4
>>   #define AMDGPU_PRODUCT_NAME_LEN 64
>> +struct amdgpu_reset_event_ctx {
>> +    uint64_t pid;
>> +    uint32_t flags;
>> +};
>> +
>>   struct amdgpu_device {
>>       struct device            *dev;
>>       struct pci_dev            *pdev;
>> @@ -1063,6 +1068,7 @@ struct amdgpu_device {
>>       int asic_reset_res;
>>       struct work_struct        xgmi_reset_work;
>> +    struct work_struct        gpu_reset_event_work;
>>       struct list_head        reset_list;
>>       long                gfx_timeout;
>> @@ -1097,6 +1103,7 @@ struct amdgpu_device {
>>       pci_channel_state_t        pci_channel_state;
>>       struct amdgpu_reset_control     *reset_cntl;
>> +    struct amdgpu_reset_event_ctx   reset_event_ctx;
>>       uint32_t                        
>> ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
>>       bool                ram_is_direct_mapped;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index ed077de426d9..c43d099da06d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -73,6 +73,7 @@
>>   #include <linux/pm_runtime.h>
>>   #include <drm/drm_drv.h>
>> +#include <drm/drm_sysfs.h>
>>   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
>>   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
>> @@ -3277,6 +3278,23 @@ bool amdgpu_device_has_dc_support(struct 
>> amdgpu_device *adev)
>>       return amdgpu_device_asic_has_dc_support(adev->asic_type);
>>   }
>> +static void amdgpu_device_reset_event_func(struct work_struct *__work)
>> +{
>> +    struct amdgpu_device *adev = container_of(__work, struct 
>> amdgpu_device,
>> +                          gpu_reset_event_work);
>> +    struct amdgpu_reset_event_ctx *event_ctx = &adev->reset_event_ctx;
>> +
>> +    /*
>> +     * A GPU reset has happened; notify userspace and pass the
>> +     * following information:
>> +     *    - pid of the process involved,
>> +     *    - if the VRAM is valid or not,
>> +     *    - indicate that userspace may want to collect the ftrace event
>> +     * data from the trace event.
>> +     */
>> +    drm_sysfs_reset_event(&adev->ddev, event_ctx->pid, 
>> event_ctx->flags);
>> +}
>> +
>>   static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>>   {
>>       struct amdgpu_device *adev =
>> @@ -3525,6 +3543,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>                 amdgpu_device_delay_enable_gfx_off);
>>       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
>> +    INIT_WORK(&adev->gpu_reset_event_work, 
>> amdgpu_device_reset_event_func);
>>       adev->gfx.gfx_off_req_count = 1;
>>       adev->pm.ac_power = power_supply_is_system_supplied() > 0;


More information about the amd-gfx mailing list