[RFC v4 02/11] drm/amdgpu: Move scheduler init to after XGMI is ready
Christian König
ckoenig.leichtzumerken at gmail.com
Wed Feb 9 07:53:33 UTC 2022
On 09.02.22 01:23, Andrey Grodzovsky wrote:
> Before we initialize schedulers we must know which reset
> domain we are in - for a single device there is a single
> domain per device and so a single wq per device. For XGMI
> the reset domain spans the entire XGMI hive and so the
> reset wq is per hive.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
One more comment below; with that fixed, Reviewed-by: Christian König
<christian.koenig at amd.com>.
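
As background for the reset-domain model the quoted commit message describes, here is a rough sketch of that ownership (illustrative only -- the struct and helper names below are made up for this mail and are not the series code): a reset domain owns a single ordered workqueue, a standalone GPU gets its own domain, and every device in an XGMI hive points at the hive-wide domain, so all schedulers in the hive feed their timeout handling into the same single-threaded queue.

#include <linux/workqueue.h>

/* Illustrative only -- not the actual amdgpu structures. */
struct example_reset_domain {
	struct workqueue_struct *wq;	/* serializes all reset/TDR work */
};

/* Standalone GPU: one domain, and therefore one ordered wq, per device. */
static int example_init_device_domain(struct example_reset_domain *dom)
{
	dom->wq = alloc_ordered_workqueue("example-reset-domain", 0);
	return dom->wq ? 0 : -ENOMEM;
}

/*
 * XGMI: a device in a hive simply reuses the hive-wide domain/wq, which
 * is why the schedulers can only be initialized once we know whether
 * the device ends up in a hive.
 */
static void example_join_hive_domain(struct example_reset_domain *dev_dom,
				     const struct example_reset_domain *hive_dom)
{
	dev_dom->wq = hive_dom->wq;
}

drm_sched_init() then gets that workqueue as its timeout workqueue, which is exactly what the amdgpu_device_init_schedulers() hunk below does with adev->reset_domain.wq.
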
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 34 ++--------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +
> 3 files changed, 51 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 9704b0e1fd82..00123b0013d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2287,6 +2287,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
> return r;
> }
>
> +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
> +{
> + long timeout;
> + int r, i;
> +
> + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> + struct amdgpu_ring *ring = adev->rings[i];
> +
> + /* No need to setup the GPU scheduler for rings that don't need it */
> + if (!ring || ring->no_scheduler)
> + continue;
> +
> + switch (ring->funcs->type) {
> + case AMDGPU_RING_TYPE_GFX:
> + timeout = adev->gfx_timeout;
> + break;
> + case AMDGPU_RING_TYPE_COMPUTE:
> + timeout = adev->compute_timeout;
> + break;
> + case AMDGPU_RING_TYPE_SDMA:
> + timeout = adev->sdma_timeout;
> + break;
> + default:
> + timeout = adev->video_timeout;
> + break;
> + }
> +
> + r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
> + ring->num_hw_submission, amdgpu_job_hang_limit,
> + timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
> + if (r) {
> + DRM_ERROR("Failed to create scheduler on ring %s.\n",
> + ring->name);
> + return r;
> + }
> + }
> +
> + return 0;
> +}
> +
> +
> /**
> * amdgpu_device_ip_init - run init for hardware IPs
> *
> @@ -2419,6 +2460,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
> }
> }
>
> + r = amdgpu_device_init_schedulers(adev);
> + if (r)
> + goto init_failed;
> +
> /* Don't init kfd if whole hive need to be reset during init */
> if (!adev->gmc.xgmi.pending_reset)
> amdgpu_amdkfd_device_init(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 45977a72b5dd..fa302540c69a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -457,8 +457,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
> atomic_t *sched_score)
> {
> struct amdgpu_device *adev = ring->adev;
> - long timeout;
> - int r;
>
> if (!adev)
> return -EINVAL;
> @@ -478,36 +476,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
> spin_lock_init(&ring->fence_drv.lock);
> ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
> GFP_KERNEL);
> - if (!ring->fence_drv.fences)
> - return -ENOMEM;
>
> - /* No need to setup the GPU scheduler for rings that don't need it */
> - if (ring->no_scheduler)
> - return 0;
> + ring->num_hw_submission = num_hw_submission;
> + ring->sched_score = sched_score;
Let's move this into the caller and then use ring->num_hw_submission in
the fence code as well.
The maximum number of jobs on the ring is not really fence specific.
Regards,
Christian.
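
Something along these lines is what I mean (untested sketch, only to illustrate the idea; the exact call site and final function signature are of course up to you):

	/* In the caller, e.g. amdgpu_ring_init(), fill the ring fields
	 * first and let the fence code pick them up from the ring: */
	ring->num_hw_submission = num_hw_submission;
	ring->sched_score = sched_score;

	r = amdgpu_fence_driver_init_ring(ring);
	if (r)
		return r;

	/* ... and in amdgpu_fence_driver_init_ring(), drop the extra
	 * parameters and reuse ring->num_hw_submission when sizing the
	 * fence array: */
	ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2,
					 sizeof(void *), GFP_KERNEL);
	if (!ring->fence_drv.fences)
		return -ENOMEM;

That way the maximum number of in-flight jobs lives in one place on the ring, and both the scheduler setup and the fence code read it from there.
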
>
> - switch (ring->funcs->type) {
> - case AMDGPU_RING_TYPE_GFX:
> - timeout = adev->gfx_timeout;
> - break;
> - case AMDGPU_RING_TYPE_COMPUTE:
> - timeout = adev->compute_timeout;
> - break;
> - case AMDGPU_RING_TYPE_SDMA:
> - timeout = adev->sdma_timeout;
> - break;
> - default:
> - timeout = adev->video_timeout;
> - break;
> - }
> -
> - r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
> - num_hw_submission, amdgpu_job_hang_limit,
> - timeout, NULL, sched_score, ring->name);
> - if (r) {
> - DRM_ERROR("Failed to create scheduler on ring %s.\n",
> - ring->name);
> - return r;
> - }
> + if (!ring->fence_drv.fences)
> + return -ENOMEM;
>
> return 0;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index fae7d185ad0d..7f20ce73a243 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -251,6 +251,8 @@ struct amdgpu_ring {
> bool has_compute_vm_bug;
> bool no_scheduler;
> int hw_prio;
> + unsigned num_hw_submission;
> + atomic_t *sched_score;
> };
>
> #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))