[PATCH] drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues
Marek Olšák
maraeo at gmail.com
Mon Jan 28 17:25:12 UTC 2019
Ping
On Tue, Jan 22, 2019 at 3:05 PM Marek Olšák <maraeo at gmail.com> wrote:
> From: Marek Olšák <marek.olsak at amd.com>
>
> I'm not increasing the DRM version because GDS isn't totally without bugs
> yet.
>
> v2: update emit_ib_size
>
> Signed-off-by: Marek Olšák <marek.olsak at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h | 2 ++
> drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 19 +++++++++++-
> drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 21 +++++++++++--
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 40 +++++++++++++++++++++++--
> include/uapi/drm/amdgpu_drm.h | 5 ++++
> 5 files changed, 82 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> index ecbcefe49a98..f89f5734d985 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> @@ -30,20 +30,22 @@ struct amdgpu_bo;
> struct amdgpu_gds_asic_info {
> uint32_t total_size;
> uint32_t gfx_partition_size;
> uint32_t cs_partition_size;
> };
>
> struct amdgpu_gds {
> struct amdgpu_gds_asic_info mem;
> struct amdgpu_gds_asic_info gws;
> struct amdgpu_gds_asic_info oa;
> + uint32_t gds_compute_max_wave_id;
> +
> /* At present, GDS, GWS and OA resources for gfx (graphics)
> * is always pre-allocated and available for graphics operation.
> * Such resource is shared between all gfx clients.
> * TODO: move this operation to user space
> * */
> struct amdgpu_bo* gds_gfx_bo;
> struct amdgpu_bo* gws_gfx_bo;
> struct amdgpu_bo* oa_gfx_bo;
> };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index 7984292f9282..a59e0fdf5a97 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -2257,20 +2257,36 @@ static void gfx_v7_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
> }
>
> static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
> struct amdgpu_job *job,
> struct amdgpu_ib *ib,
> uint32_t flags)
> {
> unsigned vmid = AMDGPU_JOB_GET_VMID(job);
> u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>
> + /* Currently, there is a high possibility to get wave ID mismatch
> + * between ME and GDS, leading to a hw deadlock, because ME generates
> + * different wave IDs than the GDS expects. This situation happens
> + * randomly when at least 5 compute pipes use GDS ordered append.
> + * The wave IDs generated by ME are also wrong after suspend/resume.
> + * Those are probably bugs somewhere else in the kernel driver.
> + *
> + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
> + * GDS to 0 for this ring (me/pipe).
> + */
> + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
> + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> + }
> +
> amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
> amdgpu_ring_write(ring,
> #ifdef __BIG_ENDIAN
> (2 << 0) |
> #endif
> (ib->gpu_addr & 0xFFFFFFFC));
> amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
> amdgpu_ring_write(ring, control);
> }
>
> @@ -4993,21 +5009,21 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
> .get_rptr = gfx_v7_0_ring_get_rptr,
> .get_wptr = gfx_v7_0_ring_get_wptr_compute,
> .set_wptr = gfx_v7_0_ring_set_wptr_compute,
> .emit_frame_size =
> 20 + /* gfx_v7_0_ring_emit_gds_switch */
> 7 + /* gfx_v7_0_ring_emit_hdp_flush */
> 5 + /* hdp invalidate */
> 7 + /* gfx_v7_0_ring_emit_pipeline_sync */
> CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
> 7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
> - .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */
> + .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */
> .emit_ib = gfx_v7_0_ring_emit_ib_compute,
> .emit_fence = gfx_v7_0_ring_emit_fence_compute,
> .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
> .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
> .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
> .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
> .test_ring = gfx_v7_0_ring_test_ring,
> .test_ib = gfx_v7_0_ring_test_ib,
> .insert_nop = amdgpu_ring_insert_nop,
> .pad_ib = amdgpu_ring_generic_pad_ib,
> @@ -5050,20 +5066,21 @@ static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev)
> adev->gfx.priv_inst_irq.num_types = 1;
> adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
> }
>
> static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
> {
> /* init asci gds info */
> adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
> adev->gds.gws.total_size = 64;
> adev->gds.oa.total_size = 16;
> + adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
>
> if (adev->gds.mem.total_size == 64 * 1024) {
> adev->gds.mem.gfx_partition_size = 4096;
> adev->gds.mem.cs_partition_size = 4096;
>
> adev->gds.gws.gfx_partition_size = 4;
> adev->gds.gws.cs_partition_size = 4;
>
> adev->gds.oa.gfx_partition_size = 4;
> adev->gds.oa.cs_partition_size = 1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index a26747681ed6..b8e50a34bdb3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6077,20 +6077,36 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
> }
>
> static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
> struct amdgpu_job *job,
> struct amdgpu_ib *ib,
> uint32_t flags)
> {
> unsigned vmid = AMDGPU_JOB_GET_VMID(job);
> u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>
> + /* Currently, there is a high possibility to get wave ID mismatch
> + * between ME and GDS, leading to a hw deadlock, because ME generates
> + * different wave IDs than the GDS expects. This situation happens
> + * randomly when at least 5 compute pipes use GDS ordered append.
> + * The wave IDs generated by ME are also wrong after suspend/resume.
> + * Those are probably bugs somewhere else in the kernel driver.
> + *
> + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
> + * GDS to 0 for this ring (me/pipe).
> + */
> + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
> + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> + }
> +
> amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
> amdgpu_ring_write(ring,
> #ifdef __BIG_ENDIAN
> (2 << 0) |
> #endif
> (ib->gpu_addr & 0xFFFFFFFC));
> amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
> amdgpu_ring_write(ring, control);
> }
>
> @@ -6883,21 +6899,21 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
> .get_rptr = gfx_v8_0_ring_get_rptr,
> .get_wptr = gfx_v8_0_ring_get_wptr_compute,
> .set_wptr = gfx_v8_0_ring_set_wptr_compute,
> .emit_frame_size =
> 20 + /* gfx_v8_0_ring_emit_gds_switch */
> 7 + /* gfx_v8_0_ring_emit_hdp_flush */
> 5 + /* hdp_invalidate */
> 7 + /* gfx_v8_0_ring_emit_pipeline_sync */
> VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
> 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
> - .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
> + .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
> .emit_ib = gfx_v8_0_ring_emit_ib_compute,
> .emit_fence = gfx_v8_0_ring_emit_fence_compute,
> .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
> .emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
> .emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
> .emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
> .test_ring = gfx_v8_0_ring_test_ring,
> .test_ib = gfx_v8_0_ring_test_ib,
> .insert_nop = amdgpu_ring_insert_nop,
> .pad_ib = amdgpu_ring_generic_pad_ib,
> @@ -6913,21 +6929,21 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
> .get_rptr = gfx_v8_0_ring_get_rptr,
> .get_wptr = gfx_v8_0_ring_get_wptr_compute,
> .set_wptr = gfx_v8_0_ring_set_wptr_compute,
> .emit_frame_size =
> 20 + /* gfx_v8_0_ring_emit_gds_switch */
> 7 + /* gfx_v8_0_ring_emit_hdp_flush */
> 5 + /* hdp_invalidate */
> 7 + /* gfx_v8_0_ring_emit_pipeline_sync */
> 17 + /* gfx_v8_0_ring_emit_vm_flush */
> 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */
> - .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
> + .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
> .emit_fence = gfx_v8_0_ring_emit_fence_kiq,
> .test_ring = gfx_v8_0_ring_test_ring,
> .insert_nop = amdgpu_ring_insert_nop,
> .pad_ib = amdgpu_ring_generic_pad_ib,
> .emit_rreg = gfx_v8_0_ring_emit_rreg,
> .emit_wreg = gfx_v8_0_ring_emit_wreg,
> };
>
> static void gfx_v8_0_set_ring_funcs(struct amdgpu_device *adev)
> {
> @@ -6989,20 +7005,21 @@ static void gfx_v8_0_set_rlc_funcs(struct amdgpu_device *adev)
> {
> adev->gfx.rlc.funcs = &iceland_rlc_funcs;
> }
>
> static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
> {
> /* init asci gds info */
> adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
> adev->gds.gws.total_size = 64;
> adev->gds.oa.total_size = 16;
> + adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
>
> if (adev->gds.mem.total_size == 64 * 1024) {
> adev->gds.mem.gfx_partition_size = 4096;
> adev->gds.mem.cs_partition_size = 4096;
>
> adev->gds.gws.gfx_partition_size = 4;
> adev->gds.gws.cs_partition_size = 4;
>
> adev->gds.oa.gfx_partition_size = 4;
> adev->gds.oa.cs_partition_size = 1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 262ee3cf6f1c..5533f6e4f4a4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4003,20 +4003,36 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
> }
>
> static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
> struct amdgpu_job *job,
> struct amdgpu_ib *ib,
> uint32_t flags)
> {
> unsigned vmid = AMDGPU_JOB_GET_VMID(job);
> u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>
> + /* Currently, there is a high possibility to get wave ID mismatch
> + * between ME and GDS, leading to a hw deadlock, because ME generates
> + * different wave IDs than the GDS expects. This situation happens
> + * randomly when at least 5 compute pipes use GDS ordered append.
> + * The wave IDs generated by ME are also wrong after suspend/resume.
> + * Those are probably bugs somewhere else in the kernel driver.
> + *
> + * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
> + * GDS to 0 for this ring (me/pipe).
> + */
> + if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> + amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> + amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
> + amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> + }
> +
> amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
> BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
> amdgpu_ring_write(ring,
> #ifdef __BIG_ENDIAN
> (2 << 0) |
> #endif
> lower_32_bits(ib->gpu_addr));
> amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
> amdgpu_ring_write(ring, control);
> }
> @@ -4722,21 +4738,21 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
> .set_wptr = gfx_v9_0_ring_set_wptr_compute,
> .emit_frame_size =
> 20 + /* gfx_v9_0_ring_emit_gds_switch */
> 7 + /* gfx_v9_0_ring_emit_hdp_flush */
> 5 + /* hdp invalidate */
> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> 2 + /* gfx_v9_0_ring_emit_vm_flush */
> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> - .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
> + .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
> .emit_ib = gfx_v9_0_ring_emit_ib_compute,
> .emit_fence = gfx_v9_0_ring_emit_fence,
> .emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
> .emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
> .emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
> .emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
> .test_ring = gfx_v9_0_ring_test_ring,
> .test_ib = gfx_v9_0_ring_test_ib,
> .insert_nop = amdgpu_ring_insert_nop,
> .pad_ib = amdgpu_ring_generic_pad_ib,
> @@ -4757,21 +4773,21 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
> .set_wptr = gfx_v9_0_ring_set_wptr_compute,
> .emit_frame_size =
> 20 + /* gfx_v9_0_ring_emit_gds_switch */
> 7 + /* gfx_v9_0_ring_emit_hdp_flush */
> 5 + /* hdp invalidate */
> 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> 2 + /* gfx_v9_0_ring_emit_vm_flush */
> 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
> - .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
> + .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
> .emit_fence = gfx_v9_0_ring_emit_fence_kiq,
> .test_ring = gfx_v9_0_ring_test_ring,
> .insert_nop = amdgpu_ring_insert_nop,
> .pad_ib = amdgpu_ring_generic_pad_ib,
> .emit_rreg = gfx_v9_0_ring_emit_rreg,
> .emit_wreg = gfx_v9_0_ring_emit_wreg,
> .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> };
>
> @@ -4839,20 +4855,40 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
> adev->gds.mem.total_size = 0x10000;
> break;
> case CHIP_RAVEN:
> adev->gds.mem.total_size = 0x1000;
> break;
> default:
> adev->gds.mem.total_size = 0x10000;
> break;
> }
>
> + switch (adev->asic_type) {
> + case CHIP_VEGA10:
> + case CHIP_VEGA20:
> + adev->gds.gds_compute_max_wave_id = 0x7ff;
> + break;
> + case CHIP_VEGA12:
> + adev->gds.gds_compute_max_wave_id = 0x27f;
> + break;
> + case CHIP_RAVEN:
> + if (adev->rev_id >= 0x8)
> + adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
> + else
> + adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
> + break;
> + default:
> + /* this really depends on the chip */
> + adev->gds.gds_compute_max_wave_id = 0x7ff;
> + break;
> + }
> +
> adev->gds.gws.total_size = 64;
> adev->gds.oa.total_size = 16;
>
> if (adev->gds.mem.total_size == 64 * 1024) {
> adev->gds.mem.gfx_partition_size = 4096;
> adev->gds.mem.cs_partition_size = 4096;
>
> adev->gds.gws.gfx_partition_size = 4;
> adev->gds.gws.cs_partition_size = 4;
>
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index faaad04814e4..662d379ea624 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -561,20 +561,25 @@ union drm_amdgpu_cs {
> /* Preamble flag, which means the IB could be dropped if no context switch */
> #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
>
> /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
> #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
>
> /* The IB fence should do the L2 writeback but not invalidate any shader
> * caches (L2/vL1/sL1/I$). */
> #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
>
> +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
> + * This will reset wave ID counters for the IB.
> + */
> +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
> +
> struct drm_amdgpu_cs_chunk_ib {
> __u32 _pad;
> /** AMDGPU_IB_FLAG_* */
> __u32 flags;
> /** Virtual address to begin IB execution */
> __u64 va_start;
> /** Size of submission */
> __u32 ib_bytes;
> /** HW IP to submit to */
> __u32 ip_type;
> --
> 2.17.1
>
>
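
For reference, the emit_ib_size bump from 4 to 7 in each ring-funcs table matches the new worst case shown in the diff: the optional PACKET3_SET_CONFIG_REG write takes 3 dwords (header, register offset, value) and the PACKET3_INDIRECT_BUFFER packet takes 4 dwords (header, two address dwords, control), so 3 + 4 = 7 dwords per IB emission.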
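
To show how userspace would opt in, here is a minimal, hypothetical C sketch filling out the drm_amdgpu_cs_chunk_ib from the uapi hunk above. The helper name, the flag-gating parameter, and the surrounding chunk-packing/CS-ioctl code are illustrative assumptions, not part of this patch, and the include path can differ between libdrm installations.

    #include <string.h>
    #include <amdgpu_drm.h> /* path may vary; sometimes <drm/amdgpu_drm.h> */

    /* Hypothetical helper: describe a compute IB and request the wave ID
     * counter reset that this patch wires up in the kernel. */
    static void fill_compute_ib_chunk(struct drm_amdgpu_cs_chunk_ib *ib,
                                      __u64 ib_va, __u32 ib_bytes,
                                      int uses_gds_ordered_append)
    {
            memset(ib, 0, sizeof(*ib));
            ib->va_start = ib_va;               /* GPU VA of the IB */
            ib->ib_bytes = ib_bytes;            /* IB size in bytes */
            ib->ip_type = AMDGPU_HW_IP_COMPUTE; /* compute ring */

            /* Only needed when this IB uses GDS ordered append; the kernel
             * then emits the GDS_COMPUTE_MAX_WAVE_ID write before the IB. */
            if (uses_gds_ordered_append)
                    ib->flags |= AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID;
    }

The flag is per-IB, so a submitter would set it only on compute submissions that actually use GDS ordered append, as the uapi comment suggests.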