[RFC v4 02/11] drm/amdgpu: Move scheduler init to after XGMI is ready
Andrey Grodzovsky
andrey.grodzovsky at amd.com
Wed Feb 9 18:06:24 UTC 2022
All comments are fixed and the code is pushed. Thanks to everyone
who helped with the review.
Andrey
On 2022-02-09 02:53, Christian König wrote:
> Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:
>> Before we initialize the schedulers we must know which reset
>> domain we are in - for a single device there is a single
>> domain per device and so a single wq per device. For XGMI
>> the reset domain spans the entire XGMI hive and so the
>> reset wq is per hive.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>
> One more comment below, with that fixed Reviewed-by: Christian König
> <christian.koenig at amd.com>.
>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 34 ++--------------
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +
>> 3 files changed, 51 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 9704b0e1fd82..00123b0013d3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2287,6 +2287,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>> return r;
>> }
>> +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
>> +{
>> + long timeout;
>> + int r, i;
>> +
>> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> + struct amdgpu_ring *ring = adev->rings[i];
>> +
>> + /* No need to setup the GPU scheduler for rings that don't need it */
>> + if (!ring || ring->no_scheduler)
>> + continue;
>> +
>> + switch (ring->funcs->type) {
>> + case AMDGPU_RING_TYPE_GFX:
>> + timeout = adev->gfx_timeout;
>> + break;
>> + case AMDGPU_RING_TYPE_COMPUTE:
>> + timeout = adev->compute_timeout;
>> + break;
>> + case AMDGPU_RING_TYPE_SDMA:
>> + timeout = adev->sdma_timeout;
>> + break;
>> + default:
>> + timeout = adev->video_timeout;
>> + break;
>> + }
>> +
>> + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>> + ring->num_hw_submission, amdgpu_job_hang_limit,
>> + timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
>> + if (r) {
>> + DRM_ERROR("Failed to create scheduler on ring %s.\n",
>> + ring->name);
>> + return r;
>> + }
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +
>> /**
>> * amdgpu_device_ip_init - run init for hardware IPs
>> *
>> @@ -2419,6 +2460,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
>> }
>> }
>> + r = amdgpu_device_init_schedulers(adev);
>> + if (r)
>> + goto init_failed;
>> +
>> /* Don't init kfd if whole hive need to be reset during init */
>> if (!adev->gmc.xgmi.pending_reset)
>> amdgpu_amdkfd_device_init(adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index 45977a72b5dd..fa302540c69a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -457,8 +457,6 @@ int amdgpu_fence_driver_init_ring(struct
>> amdgpu_ring *ring,
>> atomic_t *sched_score)
>> {
>> struct amdgpu_device *adev = ring->adev;
>> - long timeout;
>> - int r;
>> if (!adev)
>> return -EINVAL;
>> @@ -478,36 +476,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>> spin_lock_init(&ring->fence_drv.lock);
>> ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
>> GFP_KERNEL);
>> - if (!ring->fence_drv.fences)
>> - return -ENOMEM;
>> - /* No need to setup the GPU scheduler for rings that don't need it */
>> - if (ring->no_scheduler)
>> - return 0;
>> + ring->num_hw_submission = num_hw_submission;
>> + ring->sched_score = sched_score;
>
> Let's move this into the caller and then use ring->num_hw_submission
> in the fence code as well.
>
> The maximum number of jobs on the ring is not really fence specific.
>
> Regards,
> Christian.
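A quick sketch of that suggestion (the call site shown is amdgpu_ring_init(),
the existing caller; exact placement and the final signature are up to the
follow-up patch):

    /* In the caller (amdgpu_ring_init()), record the submission count and
     * score on the ring itself before fence driver init ...
     */
    ring->num_hw_submission = num_hw_submission;
    ring->sched_score = sched_score;

    r = amdgpu_fence_driver_init_ring(ring);
    if (r)
            return r;

    /* ... so amdgpu_fence_driver_init_ring() can drop those parameters
     * and simply read them back from the ring:
     */
    ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2,
                                     sizeof(void *), GFP_KERNEL);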
>
>> - switch (ring->funcs->type) {
>> - case AMDGPU_RING_TYPE_GFX:
>> - timeout = adev->gfx_timeout;
>> - break;
>> - case AMDGPU_RING_TYPE_COMPUTE:
>> - timeout = adev->compute_timeout;
>> - break;
>> - case AMDGPU_RING_TYPE_SDMA:
>> - timeout = adev->sdma_timeout;
>> - break;
>> - default:
>> - timeout = adev->video_timeout;
>> - break;
>> - }
>> -
>> - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>> - num_hw_submission, amdgpu_job_hang_limit,
>> - timeout, NULL, sched_score, ring->name);
>> - if (r) {
>> - DRM_ERROR("Failed to create scheduler on ring %s.\n",
>> - ring->name);
>> - return r;
>> - }
>> + if (!ring->fence_drv.fences)
>> + return -ENOMEM;
>> return 0;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index fae7d185ad0d..7f20ce73a243 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -251,6 +251,8 @@ struct amdgpu_ring {
>> bool has_compute_vm_bug;
>> bool no_scheduler;
>> int hw_prio;
>> + unsigned num_hw_submission;
>> + atomic_t *sched_score;
>> };
>> #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
>