[RFC 2/6] drm/amdgpu: Move scheduler init to after XGMI is ready

Andrey Grodzovsky andrey.grodzovsky at amd.com
Mon Dec 20 21:51:30 UTC 2021


On 2021-12-20 2:16 a.m., Christian König wrote:
>
>
> Am 17.12.21 um 23:27 schrieb Andrey Grodzovsky:
>> Before we initialize schedulers we must know which reset
>> domain are we in - for single device there iis a single
>> domain per device and so single wq per device. For XGMI
>> the reset domain spans the entire XGMI hive and so the
>> reset wq is per hive.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--------------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
>>   3 files changed, 51 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 5f13195d23d1..b595e6d699b5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2284,6 +2284,47 @@ static int amdgpu_device_fw_loading(struct 
>> amdgpu_device *adev)
>>       return r;
>>   }
>>   +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
>> +{
>> +    long timeout;
>> +    int r, i;
>> +
>> +    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +        struct amdgpu_ring *ring = adev->rings[i];
>> +
>> +        /* No need to setup the GPU scheduler for rings that don't 
>> need it */
>> +        if (!ring || ring->no_scheduler)
>> +            continue;
>> +
>> +        switch (ring->funcs->type) {
>> +        case AMDGPU_RING_TYPE_GFX:
>> +            timeout = adev->gfx_timeout;
>> +            break;
>> +        case AMDGPU_RING_TYPE_COMPUTE:
>> +            timeout = adev->compute_timeout;
>> +            break;
>> +        case AMDGPU_RING_TYPE_SDMA:
>> +            timeout = adev->sdma_timeout;
>> +            break;
>> +        default:
>> +            timeout = adev->video_timeout;
>> +            break;
>> +        }
>> +
>
>
>> +        r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>> +                   ring->num_hw_submission, amdgpu_job_hang_limit,
>> +                   timeout, adev->reset_domain.wq, 
>> ring->sched_score, ring->name);
>> +        if (r) {
>> +            DRM_ERROR("Failed to create scheduler on ring %s.\n",
>> +                  ring->name);
>> +            return r;
>> +        }
>
> Maybe better put that into amdgpu_ring.c. But not really a hard 
> requirement, more a gut feeling.
>
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +
>>   /**
>>    * amdgpu_device_ip_init - run init for hardware IPs
>>    *
>> @@ -2412,6 +2453,10 @@ static int amdgpu_device_ip_init(struct 
>> amdgpu_device *adev)
>>           }
>>       }
>>   +    r = amdgpu_device_init_schedulers(adev);
>> +    if (r)
>> +        goto init_failed;
>> +
>>       /* Don't init kfd if whole hive need to be reset during init */
>>       if (!adev->gmc.xgmi.pending_reset)
>>           amdgpu_amdkfd_device_init(adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index 3b7e86ea7167..5527c68c51de 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -456,8 +456,6 @@ int amdgpu_fence_driver_init_ring(struct 
>> amdgpu_ring *ring,
>>                     atomic_t *sched_score)
>>   {
>>       struct amdgpu_device *adev = ring->adev;
>> -    long timeout;
>> -    int r;
>>         if (!adev)
>>           return -EINVAL;
>> @@ -477,36 +475,12 @@ int amdgpu_fence_driver_init_ring(struct 
>> amdgpu_ring *ring,
>>       spin_lock_init(&ring->fence_drv.lock);
>>       ring->fence_drv.fences = kcalloc(num_hw_submission * 2, 
>> sizeof(void *),
>>                        GFP_KERNEL);
>> -    if (!ring->fence_drv.fences)
>> -        return -ENOMEM;
>>   -    /* No need to setup the GPU scheduler for rings that don't 
>> need it */
>> -    if (ring->no_scheduler)
>> -        return 0;
>> +    ring->num_hw_submission = num_hw_submission;
>> +    ring->sched_score = sched_score;
>
> Probably better to set that in the caller and drop the parameters from 
> the amdgpu_fence_driver_init_ring() function completely.
>
> Christian.


I noticed that at least num_hw_submission is validated within the 
function so not sure we should then discard the parameters.

Andrey


>
>>   -    switch (ring->funcs->type) {
>> -    case AMDGPU_RING_TYPE_GFX:
>> -        timeout = adev->gfx_timeout;
>> -        break;
>> -    case AMDGPU_RING_TYPE_COMPUTE:
>> -        timeout = adev->compute_timeout;
>> -        break;
>> -    case AMDGPU_RING_TYPE_SDMA:
>> -        timeout = adev->sdma_timeout;
>> -        break;
>> -    default:
>> -        timeout = adev->video_timeout;
>> -        break;
>> -    }
>> -
>> -    r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>> -               num_hw_submission, amdgpu_job_hang_limit,
>> -               timeout, NULL, sched_score, ring->name);
>> -    if (r) {
>> -        DRM_ERROR("Failed to create scheduler on ring %s.\n",
>> -              ring->name);
>> -        return r;
>> -    }
>> +    if (!ring->fence_drv.fences)
>> +        return -ENOMEM;
>>         return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 4d380e79752c..a4b8279e3011 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -253,6 +253,8 @@ struct amdgpu_ring {
>>       bool            has_compute_vm_bug;
>>       bool            no_scheduler;
>>       int            hw_prio;
>> +    unsigned         num_hw_submission;
>> +    atomic_t        *sched_score;
>>   };
>>     #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), 
>> (ib)))
>


More information about the amd-gfx mailing list