[RFC v4 02/11] drm/amdgpu: Move scheduler init to after XGMI is ready
JingWen Chen
jingwech at amd.com
Thu Feb 24 06:47:33 UTC 2022
Hi Andrey,
Will you port this patch into amd-staging-drm-next?
on 2/10/22 2:06 AM, Andrey Grodzovsky wrote:
> All comments are fixed and code pushed. Thanks for everyone
> who helped reviewing.
>
> Andrey
>
> On 2022-02-09 02:53, Christian König wrote:
>> Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:
>>> Before we initialize schedulers we must know which reset
>>> domain we are in - for a single device there is a single
>>> domain per device and so a single wq per device. For XGMI
>>> the reset domain spans the entire XGMI hive and so the
>>> reset wq is per hive.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>>
>> One more comment below, with that fixed Reviewed-by: Christian König <christian.koenig at amd.com>.
>>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 34 ++--------------
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +
>>> 3 files changed, 51 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 9704b0e1fd82..00123b0013d3 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2287,6 +2287,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>> return r;
>>> }
>>> +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
>>> +{
>>> + long timeout;
>>> + int r, i;
>>> +
>>> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> + struct amdgpu_ring *ring = adev->rings[i];
>>> +
>>> + /* No need to setup the GPU scheduler for rings that don't need it */
>>> + if (!ring || ring->no_scheduler)
>>> + continue;
>>> +
>>> + switch (ring->funcs->type) {
>>> + case AMDGPU_RING_TYPE_GFX:
>>> + timeout = adev->gfx_timeout;
>>> + break;
>>> + case AMDGPU_RING_TYPE_COMPUTE:
>>> + timeout = adev->compute_timeout;
>>> + break;
>>> + case AMDGPU_RING_TYPE_SDMA:
>>> + timeout = adev->sdma_timeout;
>>> + break;
>>> + default:
>>> + timeout = adev->video_timeout;
>>> + break;
>>> + }
>>> +
>>> + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>>> + ring->num_hw_submission, amdgpu_job_hang_limit,
>>> + timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
>>> + if (r) {
>>> + DRM_ERROR("Failed to create scheduler on ring %s.\n",
>>> + ring->name);
>>> + return r;
>>> + }
>>> + }
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +
>>> /**
>>> * amdgpu_device_ip_init - run init for hardware IPs
>>> *
>>> @@ -2419,6 +2460,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
>>> }
>>> }
>>> + r = amdgpu_device_init_schedulers(adev);
>>> + if (r)
>>> + goto init_failed;
>>> +
>>> /* Don't init kfd if whole hive need to be reset during init */
>>> if (!adev->gmc.xgmi.pending_reset)
>>> amdgpu_amdkfd_device_init(adev);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> index 45977a72b5dd..fa302540c69a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> @@ -457,8 +457,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>>> atomic_t *sched_score)
>>> {
>>> struct amdgpu_device *adev = ring->adev;
>>> - long timeout;
>>> - int r;
>>> if (!adev)
>>> return -EINVAL;
>>> @@ -478,36 +476,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>>> spin_lock_init(&ring->fence_drv.lock);
>>> ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
>>> GFP_KERNEL);
>>> - if (!ring->fence_drv.fences)
>>> - return -ENOMEM;
>>> - /* No need to setup the GPU scheduler for rings that don't need it */
>>> - if (ring->no_scheduler)
>>> - return 0;
>>> + ring->num_hw_submission = num_hw_submission;
>>> + ring->sched_score = sched_score;
>>
>> Let's move this into the caller and then use ring->num_hw_submission in the fence code as well.
>>
>> The maximum number of jobs on the ring is not really fence specific.
>>
>> Regards,
>> Christian.
>>
>>> - switch (ring->funcs->type) {
>>> - case AMDGPU_RING_TYPE_GFX:
>>> - timeout = adev->gfx_timeout;
>>> - break;
>>> - case AMDGPU_RING_TYPE_COMPUTE:
>>> - timeout = adev->compute_timeout;
>>> - break;
>>> - case AMDGPU_RING_TYPE_SDMA:
>>> - timeout = adev->sdma_timeout;
>>> - break;
>>> - default:
>>> - timeout = adev->video_timeout;
>>> - break;
>>> - }
>>> -
>>> - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>>> - num_hw_submission, amdgpu_job_hang_limit,
>>> - timeout, NULL, sched_score, ring->name);
>>> - if (r) {
>>> - DRM_ERROR("Failed to create scheduler on ring %s.\n",
>>> - ring->name);
>>> - return r;
>>> - }
>>> + if (!ring->fence_drv.fences)
>>> + return -ENOMEM;
>>> return 0;
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index fae7d185ad0d..7f20ce73a243 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -251,6 +251,8 @@ struct amdgpu_ring {
>>> bool has_compute_vm_bug;
>>> bool no_scheduler;
>>> int hw_prio;
>>> + unsigned num_hw_submission;
>>> + atomic_t *sched_score;
>>> };
>>> #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
>>
More information about the amd-gfx
mailing list