[PATCH 22/26] drm/amdgpu: implement ring set_priority for gfx_v8 compute v5
Alex Deucher
alexdeucher at gmail.com
Tue Apr 11 22:35:10 UTC 2017
On Thu, Apr 6, 2017 at 2:21 AM, Andres Rodriguez <andresx7 at gmail.com> wrote:
> Programming CP_HQD_QUEUE_PRIORITY enables a queue to take priority over
> other queues on the same pipe. Multiple queues on a pipe are timesliced
> so this gives us full precedence over other queues.
>
> Programming CP_HQD_PIPE_PRIORITY changes the SPI_ARB_PRIORITY of the
> wave as follows:
> 0x2: CS_H
> 0x1: CS_M
> 0x0: CS_L
>
> The SPI block will then dispatch work according to the policy set by
> SPI_ARB_PRIORITY. In the current policy CS_H is higher priority than
> gfx.
>
> In order to prevent getting stuck in loops of CUs bouncing between GFX
> and high priority compute and introducing further latency, we reserve
> CUs 2+ for high priority compute on-demand.
>
> v2: fix srbm_select to ring->queue and use ring->funcs->type
> v3: use AMD_SCHED_PRIORITY_* instead of AMDGPU_CTX_PRIORITY_*
> v4: switch int to enum amd_sched_priority
> v5: corresponding changes for srbm_lock
>
> Acked-by: Christian König <christian.koenig at amd.com>
> Signed-off-by: Andres Rodriguez <andresx7 at gmail.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 +
> drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 96 +++++++++++++++++++++++++++++-
> 3 files changed, 99 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b9a4161..c56a884 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1044,20 +1044,23 @@ struct amdgpu_gfx {
> uint32_t me_feature_version;
> uint32_t ce_feature_version;
> uint32_t pfp_feature_version;
> uint32_t rlc_feature_version;
> uint32_t mec_feature_version;
> uint32_t mec2_feature_version;
> struct amdgpu_ring gfx_ring[AMDGPU_MAX_GFX_RINGS];
> unsigned num_gfx_rings;
> struct amdgpu_ring compute_ring[AMDGPU_MAX_COMPUTE_RINGS];
> unsigned num_compute_rings;
> + spinlock_t cu_reserve_lock;
> + uint32_t cu_reserve_pipe_mask;
> + uint32_t cu_reserve_queue_mask[AMDGPU_MAX_COMPUTE_RINGS];
> struct amdgpu_irq_src eop_irq;
> struct amdgpu_irq_src priv_reg_irq;
> struct amdgpu_irq_src priv_inst_irq;
> /* gfx status */
> uint32_t gfx_current_status;
> /* ce ram size*/
> unsigned ce_ram_size;
> struct amdgpu_cu_info cu_info;
> const struct amdgpu_gfx_funcs *funcs;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 07f16b4..29b45bb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1874,20 +1874,21 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> /* Registers mapping */
> /* TODO: block userspace mapping of io register */
> spin_lock_init(&adev->mmio_idx_lock);
> spin_lock_init(&adev->smc_idx_lock);
> spin_lock_init(&adev->pcie_idx_lock);
> spin_lock_init(&adev->uvd_ctx_idx_lock);
> spin_lock_init(&adev->didt_idx_lock);
> spin_lock_init(&adev->gc_cac_idx_lock);
> spin_lock_init(&adev->audio_endpt_idx_lock);
> spin_lock_init(&adev->mm_stats.lock);
> + spin_lock_init(&adev->gfx.cu_reserve_lock);
>
> INIT_LIST_HEAD(&adev->shadow_list);
> mutex_init(&adev->shadow_list_lock);
>
> INIT_LIST_HEAD(&adev->gtt_list);
> spin_lock_init(&adev->gtt_list_lock);
>
> INIT_LIST_HEAD(&adev->ring_lru_list);
> spin_lock_init(&adev->ring_lru_list_lock);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 3cfe3c0..f94d532 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -46,21 +46,24 @@
> #include "gca/gfx_8_0_sh_mask.h"
> #include "gca/gfx_8_0_enum.h"
>
> #include "dce/dce_10_0_d.h"
> #include "dce/dce_10_0_sh_mask.h"
>
> #include "smu/smu_7_1_3_d.h"
>
> #define GFX8_NUM_GFX_RINGS 1
> #define GFX8_MEC_HPD_SIZE 2048
> -
> +#define GFX8_CU_RESERVE_RESOURCES 0x45888
> +#define GFX8_CU_NUM 8
> +#define GFX8_UNRESERVED_CU_NUM 2
> +#define GFX8_CU_RESERVE_PIPE_SHIFT 7
>
> #define TOPAZ_GB_ADDR_CONFIG_GOLDEN 0x22010001
> #define CARRIZO_GB_ADDR_CONFIG_GOLDEN 0x22010001
> #define POLARIS11_GB_ADDR_CONFIG_GOLDEN 0x22011002
> #define TONGA_GB_ADDR_CONFIG_GOLDEN 0x22011003
>
> #define ARRAY_MODE(x) ((x) << GB_TILE_MODE0__ARRAY_MODE__SHIFT)
> #define PIPE_CONFIG(x) ((x) << GB_TILE_MODE0__PIPE_CONFIG__SHIFT)
> #define TILE_SPLIT(x) ((x) << GB_TILE_MODE0__TILE_SPLIT__SHIFT)
> #define MICRO_TILE_MODE_NEW(x) ((x) << GB_TILE_MODE0__MICRO_TILE_MODE_NEW__SHIFT)
> @@ -6710,20 +6713,110 @@ static u64 gfx_v8_0_ring_get_wptr_compute(struct amdgpu_ring *ring)
>
> static void gfx_v8_0_ring_set_wptr_compute(struct amdgpu_ring *ring)
> {
> struct amdgpu_device *adev = ring->adev;
>
> /* XXX check if swapping is necessary on BE */
> adev->wb.wb[ring->wptr_offs] = lower_32_bits(ring->wptr);
> WDOORBELL32(ring->doorbell_index, lower_32_bits(ring->wptr));
> }
>
> +static void gfx_v8_0_cu_reserve(struct amdgpu_device *adev,
> + struct amdgpu_ring *ring, bool acquire)
> +{
> + int i, resources;
> + int tmp = 0, queue_mask = 0, type_mask = 0;
> + int reserve_res_reg, reserve_en_reg;
> +
> + /* gfx_v8_0_cu_reserve only supports compute path */
> + if (ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
> + return;
> +
> + spin_lock(&adev->gfx.cu_reserve_lock);
> + if (acquire) {
> + adev->gfx.cu_reserve_pipe_mask |= (1 << ring->pipe);
> + adev->gfx.cu_reserve_queue_mask[ring->pipe] |= (1 << ring->queue);
> + } else {
> + adev->gfx.cu_reserve_pipe_mask &= ~(1 << ring->pipe);
> + adev->gfx.cu_reserve_queue_mask[ring->pipe] &= ~(1 << ring->queue);
> + }
> +
> + /* compute pipe 0 starts at GFX8_CU_RESERVE_PIPE_SHIFT */
> + type_mask = (adev->gfx.cu_reserve_pipe_mask << GFX8_CU_RESERVE_PIPE_SHIFT);
> +
> + /* HW only has one register for queue mask, so we collapse them */
> + for (i = 0; i < AMDGPU_MAX_COMPUTE_RINGS; i++)
> + queue_mask |= adev->gfx.cu_reserve_queue_mask[i];
> +
> + /* leave the first CUs for general processing */
> + for (i = GFX8_UNRESERVED_CU_NUM; i < GFX8_CU_NUM; i++) {
> + reserve_res_reg = mmSPI_RESOURCE_RESERVE_CU_0 + i;
> + reserve_en_reg = mmSPI_RESOURCE_RESERVE_EN_CU_0 + i;
> +
> + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0,
> + TYPE_MASK, type_mask);
> + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0,
> + QUEUE_MASK, queue_mask);
> + if (queue_mask) {
> + resources = GFX8_CU_RESERVE_RESOURCES;
> + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0,
> + EN, 1);
> + } else {
> + resources = 0;
> + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0,
> + EN, 0);
> + }
> + /* Commit */
> + WREG32(reserve_res_reg, resources);
> + WREG32(reserve_en_reg, tmp);
> + }
Should these registers (SPI_RESOURCE_RESERVE_CU_* and
SPI_RESOURCE_RESERVE_EN_CU_*) be programmed via the KIQ rather than
MMIO? I think there may even be a special packet for this. John? Felix?
> +
> + spin_unlock(&adev->gfx.cu_reserve_lock);
> +}
> +
> +static void gfx_v8_0_set_spi_priority(struct amdgpu_device *adev,
> + struct amdgpu_ring *ring,
> + enum amd_sched_priority priority)
> +{
> + spin_lock(&adev->srbm_lock);
> + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> +
> + switch (priority) {
> + case AMD_SCHED_PRIORITY_NORMAL:
> + WREG32(mmCP_HQD_PIPE_PRIORITY, 0x0);
> + WREG32(mmCP_HQD_QUEUE_PRIORITY, 0x0);
> + break;
> + case AMD_SCHED_PRIORITY_HIGH:
> + WREG32(mmCP_HQD_PIPE_PRIORITY, 0x2);
> + WREG32(mmCP_HQD_QUEUE_PRIORITY, 0xf);
> + break;
> + default:
> + WARN(1, "Attempt to set invalid SPI priority:%d for ring:%d\n",
> + priority, ring->idx);
> + break;
> + }
I wonder if it would be better to program CP_HQD_PIPE_PRIORITY and
CP_HQD_QUEUE_PRIORITY via the KIQ rather than MMIO.
> +
> + vi_srbm_select(adev, 0, 0, 0, 0);
> + spin_unlock(&adev->srbm_lock);
> +}
> +static void gfx_v8_0_ring_set_priority_compute(struct amdgpu_ring *ring,
> + enum amd_sched_priority priority)
> +{
> + struct amdgpu_device *adev = ring->adev;
> +
> + if (ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
> + return;
> +
> + gfx_v8_0_set_spi_priority(adev, ring, priority);
> + gfx_v8_0_cu_reserve(adev, ring, priority == AMD_SCHED_PRIORITY_HIGH);
> +}
> +
> static void gfx_v8_0_ring_emit_fence_compute(struct amdgpu_ring *ring,
> u64 addr, u64 seq,
> unsigned flags)
> {
> bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
> bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
>
> /* RELEASE_MEM - flush caches, send int */
> amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 5));
> amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> @@ -7140,20 +7233,21 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
> .emit_fence = gfx_v8_0_ring_emit_fence_compute,
> .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
> .emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
> .emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
> .emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
> .emit_hdp_invalidate = gfx_v8_0_ring_emit_hdp_invalidate,
> .test_ring = gfx_v8_0_ring_test_ring,
> .test_ib = gfx_v8_0_ring_test_ib,
> .insert_nop = amdgpu_ring_insert_nop,
> .pad_ib = amdgpu_ring_generic_pad_ib,
> + .set_priority = gfx_v8_0_ring_set_priority_compute,
> };
>
> static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
> .type = AMDGPU_RING_TYPE_KIQ,
> .align_mask = 0xff,
> .nop = PACKET3(PACKET3_NOP, 0x3FFF),
> .support_64bit_ptrs = false,
> .get_rptr = gfx_v8_0_ring_get_rptr,
> .get_wptr = gfx_v8_0_ring_get_wptr_compute,
> .set_wptr = gfx_v8_0_ring_set_wptr_compute,
> --
> 2.9.3
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list