[PATCH v3 07/12] drm/sched: Prevent any job recoveries after device is unplugged.
Luben Tuikov
luben.tuikov at amd.com
Tue Nov 24 01:12:34 UTC 2020
On 2020-11-23 3:06 a.m., Christian König wrote:
> Am 23.11.20 um 06:37 schrieb Andrey Grodzovsky:
>>
>> On 11/22/20 6:57 AM, Christian König wrote:
>>> Am 21.11.20 um 06:21 schrieb Andrey Grodzovsky:
>>>> There is no point in attempting recovery if the device is gone; it's meaningless.
>>>
>>> I think that this should go into the device specific recovery
>>> function and not in the scheduler.
>>
>>
>> The timeout timer is rearmed here, so this prevents any new recovery
>> work from being restarted from here
>> after drm_dev_unplug was executed from amdgpu_pci_remove. It will not
>> cover other places like
>> job cleanup or starting a new job, but those should stop once the
>> scheduler thread is stopped later.
>
> Yeah, but this is rather unclean. We should probably instead return an
> error code indicating whether or not the timer should be rearmed.
Christian, this is exactly the work I told you about
last week on Wednesday in our weekly meeting, and
which I wrote to you about in an email around this
time last year.
So what do we do now?
I can submit those changes without the last part,
which builds on this change.
I'm still testing the last part and was hoping
to submit it all in one sequence of patches,
after my testing.
Regards,
Luben
>
> Christian.
>
>>
>> Andrey
>>
>>
>>>
>>> Christian.
>>>
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>>>> ---
>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 +-
>>>> drivers/gpu/drm/etnaviv/etnaviv_sched.c | 3 ++-
>>>> drivers/gpu/drm/lima/lima_sched.c | 3 ++-
>>>> drivers/gpu/drm/panfrost/panfrost_job.c | 2 +-
>>>> drivers/gpu/drm/scheduler/sched_main.c | 15 ++++++++++++++-
>>>> drivers/gpu/drm/v3d/v3d_sched.c | 15 ++++++++++-----
>>>> include/drm/gpu_scheduler.h | 6 +++++-
>>>> 7 files changed, 35 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> index d56f402..d0b0021 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> @@ -487,7 +487,7 @@ int amdgpu_fence_driver_init_ring(struct
>>>> amdgpu_ring *ring,
>>>> r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>>>> num_hw_submission, amdgpu_job_hang_limit,
>>>> - timeout, ring->name);
>>>> + timeout, ring->name, &adev->ddev);
>>>> if (r) {
>>>> DRM_ERROR("Failed to create scheduler on ring %s.\n",
>>>> ring->name);
>>>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>>> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>>> index cd46c88..7678287 100644
>>>> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>>>> @@ -185,7 +185,8 @@ int etnaviv_sched_init(struct etnaviv_gpu *gpu)
>>>> ret = drm_sched_init(&gpu->sched, &etnaviv_sched_ops,
>>>> etnaviv_hw_jobs_limit, etnaviv_job_hang_limit,
>>>> - msecs_to_jiffies(500), dev_name(gpu->dev));
>>>> + msecs_to_jiffies(500), dev_name(gpu->dev),
>>>> + gpu->drm);
>>>> if (ret)
>>>> return ret;
>>>> diff --git a/drivers/gpu/drm/lima/lima_sched.c
>>>> b/drivers/gpu/drm/lima/lima_sched.c
>>>> index dc6df9e..8a7e5d7ca 100644
>>>> --- a/drivers/gpu/drm/lima/lima_sched.c
>>>> +++ b/drivers/gpu/drm/lima/lima_sched.c
>>>> @@ -505,7 +505,8 @@ int lima_sched_pipe_init(struct lima_sched_pipe
>>>> *pipe, const char *name)
>>>> return drm_sched_init(&pipe->base, &lima_sched_ops, 1,
>>>> lima_job_hang_limit, msecs_to_jiffies(timeout),
>>>> - name);
>>>> + name,
>>>> + pipe->ldev->ddev);
>>>> }
>>>> void lima_sched_pipe_fini(struct lima_sched_pipe *pipe)
>>>> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c
>>>> b/drivers/gpu/drm/panfrost/panfrost_job.c
>>>> index 30e7b71..37b03b01 100644
>>>> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
>>>> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
>>>> @@ -520,7 +520,7 @@ int panfrost_job_init(struct panfrost_device
>>>> *pfdev)
>>>> ret = drm_sched_init(&js->queue[j].sched,
>>>> &panfrost_sched_ops,
>>>> 1, 0, msecs_to_jiffies(500),
>>>> - "pan_js");
>>>> + "pan_js", pfdev->ddev);
>>>> if (ret) {
>>>> dev_err(pfdev->dev, "Failed to create scheduler: %d.",
>>>> ret);
>>>> goto err_sched;
>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>> index c3f0bd0..95db8c6 100644
>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>> @@ -53,6 +53,7 @@
>>>> #include <drm/drm_print.h>
>>>> #include <drm/gpu_scheduler.h>
>>>> #include <drm/spsc_queue.h>
>>>> +#include <drm/drm_drv.h>
>>>> #define CREATE_TRACE_POINTS
>>>> #include "gpu_scheduler_trace.h"
>>>> @@ -283,8 +284,16 @@ static void drm_sched_job_timedout(struct
>>>> work_struct *work)
>>>> struct drm_gpu_scheduler *sched;
>>>> struct drm_sched_job *job;
>>>> + int idx;
>>>> +
>>>> sched = container_of(work, struct drm_gpu_scheduler,
>>>> work_tdr.work);
>>>> + if (!drm_dev_enter(sched->ddev, &idx)) {
>>>> + DRM_INFO("%s - device unplugged skipping recovery on
>>>> scheduler:%s",
>>>> + __func__, sched->name);
>>>> + return;
>>>> + }
>>>> +
>>>> /* Protects against concurrent deletion in
>>>> drm_sched_get_cleanup_job */
>>>> spin_lock(&sched->job_list_lock);
>>>> job = list_first_entry_or_null(&sched->ring_mirror_list,
>>>> @@ -316,6 +325,8 @@ static void drm_sched_job_timedout(struct
>>>> work_struct *work)
>>>> spin_lock(&sched->job_list_lock);
>>>> drm_sched_start_timeout(sched);
>>>> spin_unlock(&sched->job_list_lock);
>>>> +
>>>> + drm_dev_exit(idx);
>>>> }
>>>> /**
>>>> @@ -845,7 +856,8 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>> unsigned hw_submission,
>>>> unsigned hang_limit,
>>>> long timeout,
>>>> - const char *name)
>>>> + const char *name,
>>>> + struct drm_device *ddev)
>>>> {
>>>> int i, ret;
>>>> sched->ops = ops;
>>>> @@ -853,6 +865,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>> sched->name = name;
>>>> sched->timeout = timeout;
>>>> sched->hang_limit = hang_limit;
>>>> + sched->ddev = ddev;
>>>> for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT;
>>>> i++)
>>>> drm_sched_rq_init(sched, &sched->sched_rq[i]);
>>>> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c
>>>> b/drivers/gpu/drm/v3d/v3d_sched.c
>>>> index 0747614..f5076e5 100644
>>>> --- a/drivers/gpu/drm/v3d/v3d_sched.c
>>>> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
>>>> @@ -401,7 +401,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>>>> &v3d_bin_sched_ops,
>>>> hw_jobs_limit, job_hang_limit,
>>>> msecs_to_jiffies(hang_limit_ms),
>>>> - "v3d_bin");
>>>> + "v3d_bin",
>>>> + &v3d->drm);
>>>> if (ret) {
>>>> dev_err(v3d->drm.dev, "Failed to create bin scheduler:
>>>> %d.", ret);
>>>> return ret;
>>>> @@ -411,7 +412,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>>>> &v3d_render_sched_ops,
>>>> hw_jobs_limit, job_hang_limit,
>>>> msecs_to_jiffies(hang_limit_ms),
>>>> - "v3d_render");
>>>> + "v3d_render",
>>>> + &v3d->drm);
>>>> if (ret) {
>>>> dev_err(v3d->drm.dev, "Failed to create render scheduler:
>>>> %d.",
>>>> ret);
>>>> @@ -423,7 +425,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>>>> &v3d_tfu_sched_ops,
>>>> hw_jobs_limit, job_hang_limit,
>>>> msecs_to_jiffies(hang_limit_ms),
>>>> - "v3d_tfu");
>>>> + "v3d_tfu",
>>>> + &v3d->drm);
>>>> if (ret) {
>>>> dev_err(v3d->drm.dev, "Failed to create TFU scheduler: %d.",
>>>> ret);
>>>> @@ -436,7 +439,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>>>> &v3d_csd_sched_ops,
>>>> hw_jobs_limit, job_hang_limit,
>>>> msecs_to_jiffies(hang_limit_ms),
>>>> - "v3d_csd");
>>>> + "v3d_csd",
>>>> + &v3d->drm);
>>>> if (ret) {
>>>> dev_err(v3d->drm.dev, "Failed to create CSD scheduler:
>>>> %d.",
>>>> ret);
>>>> @@ -448,7 +452,8 @@ v3d_sched_init(struct v3d_dev *v3d)
>>>> &v3d_cache_clean_sched_ops,
>>>> hw_jobs_limit, job_hang_limit,
>>>> msecs_to_jiffies(hang_limit_ms),
>>>> - "v3d_cache_clean");
>>>> + "v3d_cache_clean",
>>>> + &v3d->drm);
>>>> if (ret) {
>>>> dev_err(v3d->drm.dev, "Failed to create CACHE_CLEAN
>>>> scheduler: %d.",
>>>> ret);
>>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>>> index 9243655..a980709 100644
>>>> --- a/include/drm/gpu_scheduler.h
>>>> +++ b/include/drm/gpu_scheduler.h
>>>> @@ -32,6 +32,7 @@
>>>> struct drm_gpu_scheduler;
>>>> struct drm_sched_rq;
>>>> +struct drm_device;
>>>> /* These are often used as an (initial) index
>>>> * to an array, and as such should start at 0.
>>>> @@ -267,6 +268,7 @@ struct drm_sched_backend_ops {
>>>> * @score: score to help loadbalancer pick a idle sched
>>>> * @ready: marks if the underlying HW is ready to work
>>>> * @free_guilty: A hit to time out handler to free the guilty job.
>>>> + * @ddev: Pointer to drm device of this scheduler.
>>>> *
>>>> * One scheduler is implemented for each hardware ring.
>>>> */
>>>> @@ -288,12 +290,14 @@ struct drm_gpu_scheduler {
>>>> atomic_t score;
>>>> bool ready;
>>>> bool free_guilty;
>>>> + struct drm_device *ddev;
>>>> };
>>>> int drm_sched_init(struct drm_gpu_scheduler *sched,
>>>> const struct drm_sched_backend_ops *ops,
>>>> uint32_t hw_submission, unsigned hang_limit, long timeout,
>>>> - const char *name);
>>>> + const char *name,
>>>> + struct drm_device *ddev);
>>>> void drm_sched_fini(struct drm_gpu_scheduler *sched);
>>>> int drm_sched_job_init(struct drm_sched_job *job,
>>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7Cluben.tuikov%40amd.com%7C7206e081871546cde52408d88f86a3c3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637417155725505874%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=NejMsBm%2Fk9gheoQJv29vIe9f59jelk12ViF9%2Bt2UUWU%3D&reserved=0
>
> _______________________________________________
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Fdri-devel&data=04%7C01%7Cluben.tuikov%40amd.com%7C7206e081871546cde52408d88f86a3c3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637417155725515872%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=WtDcIF22HvCMJHObfEhLD%2F7%2BZ37%2FxQC1465YoOrMEjc%3D&reserved=0
>
More information about the dri-devel
mailing list