[PATCH] drm/amdgpu: Always emit GDS switch when GDS/GWS/OA is used

Christian König christian.koenig at amd.com
Fri Jul 7 06:56:58 UTC 2023



Am 07.07.23 um 08:28 schrieb Friedrich Vock:
> During gfxoff, the per-VMID GDS registers are reset and not restored
> afterwards.

Huh? Since when? Those registers should be part of the saved ones.

Did you find this by observation?

Thanks,
Christian.


>   The kernel needs to emit a GDS switch to manually update the
> GWS registers in this case. Since gfxoff can happen between any two
> submissions and the kernel has no way of knowing, emit the GDS switch
> before every submission.
>
> Fixes: 56b0989e29 ("drm/amdgpu: fix GDS/GWS/OA switch handling")
> Cc: stable at vger.kernel.org
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2530
> Signed-off-by: Friedrich Vock <friedrich.vock at gmx.de>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 22 +++++++---------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 10 ++++++++--
>   3 files changed, 15 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> index ff1ea99292fb..de73797e9279 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> @@ -165,24 +165,17 @@ bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev,
>   		atomic_read(&adev->gpu_reset_counter);
>   }
>
> -/* Check if we need to switch to another set of resources */
> -static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
> -					  struct amdgpu_job *job)
> -{
> -	return id->gds_base != job->gds_base ||
> -		id->gds_size != job->gds_size ||
> -		id->gws_base != job->gws_base ||
> -		id->gws_size != job->gws_size ||
> -		id->oa_base != job->oa_base ||
> -		id->oa_size != job->oa_size;
> -}
> -
>   /* Check if the id is compatible with the job */
>   static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id,
>   				   struct amdgpu_job *job)
>   {
>   	return  id->pd_gpu_addr == job->vm_pd_addr &&
> -		!amdgpu_vmid_gds_switch_needed(id, job);
> +		id->gds_base == job->gds_base &&
> +		id->gds_size == job->gds_size &&
> +		id->gws_base == job->gws_base &&
> +		id->gws_size == job->gws_size &&
> +		id->oa_base == job->oa_base &&
> +		id->oa_size == job->oa_size;
>   }
>
>   /**
> @@ -434,7 +427,6 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>   		list_move_tail(&id->list, &id_mgr->ids_lru);
>   	}
>
> -	job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job);
>   	if (job->vm_needs_flush) {
>   		id->flushed_updates = amdgpu_vm_tlb_seq(vm);
>   		dma_fence_put(id->last_flush);
> @@ -503,7 +495,7 @@ void amdgpu_vmid_free_reserved(struct amdgpu_device *adev,
>    * @vmhub: vmhub type
>    * @vmid: vmid number to use
>    *
> - * Reset saved GDW, GWS and OA to force switch on next flush.
> + * Reset saved GDS, GWS and OA data.
>    */
>   void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>   		       unsigned vmid)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index a963a25ddd62..2898508b1ce4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -53,7 +53,6 @@ struct amdgpu_job {
>   	uint32_t		preamble_status;
>   	uint32_t                preemption_status;
>   	bool                    vm_needs_flush;
> -	bool			gds_switch_needed;
>   	bool			spm_update_needed;
>   	uint64_t		vm_pd_addr;
>   	unsigned		vmid;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 291977b93b1d..61856040cae2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -557,6 +557,12 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev)
>   	}
>   }
>
> +/* Check if the job needs a GDS switch */
> +static bool amdgpu_vm_need_gds_switch(struct amdgpu_job *job)
> +{
> +	return job->gds_size || job->gws_size || job->oa_size;
> +}
> +
>   /**
>    * amdgpu_vm_need_pipeline_sync - Check if pipe sync is needed for job.
>    *
> @@ -579,7 +585,7 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>   	if (job->vm_needs_flush || ring->has_compute_vm_bug)
>   		return true;
>
> -	if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
> +	if (ring->funcs->emit_gds_switch && amdgpu_vm_need_gds_switch(job))
>   		return true;
>
>   	if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid]))
> @@ -609,7 +615,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   	struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
>   	bool spm_update_needed = job->spm_update_needed;
>   	bool gds_switch_needed = ring->funcs->emit_gds_switch &&
> -		job->gds_switch_needed;
> +		amdgpu_vm_need_gds_switch(job);
>   	bool vm_flush_needed = job->vm_needs_flush;
>   	struct dma_fence *fence = NULL;
>   	bool pasid_mapping_needed = false;
> --
> 2.41.0
>



More information about the amd-gfx mailing list