[PATCH 09/25] drm/amdgpu: allow split of queues with kfd at queue granularity v2

Tue Apr 4 22:05:35 UTC 2017

Previously the queue/pipe split with kfd operated with pipe
granularity. This patch allows amdgpu to take ownership of an arbitrary
set of queues.

It also consolidates the last few magic numbers in the compute
initialization process into mec_init.

v2: support for gfx9

Reviewed-by: Edward O'Callaghan <funfunctor at folklore1984.net>
Acked-by: Christian König <christian.koenig at amd.com>
Signed-off-by: Andres Rodriguez <andresx7 at gmail.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h             |  7 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c           | 83 +++++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c           | 81 +++++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c           | 84 +++++++++++++++++++++++--
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h |  1 +
 5 files changed, 212 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b92f6cb..e2d8243 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -39,20 +39,22 @@
 #include <ttm/ttm_bo_api.h>
 #include <ttm/ttm_bo_driver.h>
 #include <ttm/ttm_placement.h>
 #include <ttm/ttm_module.h>
 #include <ttm/ttm_execbuf_util.h>
 
 #include <drm/drmP.h>
 #include <drm/drm_gem.h>
 #include <drm/amdgpu_drm.h>
 
+#include <kgd_kfd_interface.h>
+
 #include "amd_shared.h"
 #include "amdgpu_mode.h"
 #include "amdgpu_ih.h"
 #include "amdgpu_irq.h"
 #include "amdgpu_ucode.h"
 #include "amdgpu_ttm.h"
 #include "amdgpu_psp.h"
 #include "amdgpu_gds.h"
 #include "amdgpu_sync.h"
 #include "amdgpu_ring.h"
@@ -899,29 +901,34 @@ struct amdgpu_rlc {
 	u32 reg_list_format_start;
 	u32 reg_list_format_separate_start;
 	u32 starting_offsets_start;
 	u32 reg_list_format_size_bytes;
 	u32 reg_list_size_bytes;
 
 	u32 *register_list_format;
 	u32 *register_restore;
 };
 
+#define AMDGPU_MAX_QUEUES KGD_MAX_QUEUES
+
 struct amdgpu_mec {
 	struct amdgpu_bo	*hpd_eop_obj;
 	u64			hpd_eop_gpu_addr;
 	struct amdgpu_bo	*mec_fw_obj;
 	u64			mec_fw_gpu_addr;
 	u32 num_mec;
 	u32 num_pipe_per_mec;
 	u32 num_queue_per_pipe;
 	void			*mqd_backup[AMDGPU_MAX_COMPUTE_RINGS + 1];
+
+	/* These are the resources for which amdgpu takes ownership */
+	DECLARE_BITMAP(queue_bitmap, AMDGPU_MAX_QUEUES);
 };
 
 struct amdgpu_kiq {
 	u64			eop_gpu_addr;
 	struct amdgpu_bo	*eop_obj;
 	struct amdgpu_ring	ring;
 	struct amdgpu_irq_src	irq;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 3340012..0586f1c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -42,21 +42,20 @@
 #include "gca/gfx_7_2_enum.h"
 #include "gca/gfx_7_2_sh_mask.h"
 
 #include "gmc/gmc_7_0_d.h"
 #include "gmc/gmc_7_0_sh_mask.h"
 
 #include "oss/oss_2_0_d.h"
 #include "oss/oss_2_0_sh_mask.h"
 
 #define GFX7_NUM_GFX_RINGS     1
-#define GFX7_NUM_COMPUTE_RINGS 8
 #define GFX7_MEC_HPD_SIZE      2048
 
 
 static void gfx_v7_0_set_ring_funcs(struct amdgpu_device *adev);
 static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev);
 static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev);
 
 MODULE_FIRMWARE("radeon/bonaire_pfp.bin");
 MODULE_FIRMWARE("radeon/bonaire_me.bin");
 MODULE_FIRMWARE("radeon/bonaire_ce.bin");
@@ -2817,47 +2816,79 @@ static void gfx_v7_0_mec_fini(struct amdgpu_device *adev)
 		if (unlikely(r != 0))
 			dev_warn(adev->dev, "(%d) reserve HPD EOP bo failed\n", r);
 		amdgpu_bo_unpin(adev->gfx.mec.hpd_eop_obj);
 		amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
 
 		amdgpu_bo_unref(&adev->gfx.mec.hpd_eop_obj);
 		adev->gfx.mec.hpd_eop_obj = NULL;
 	}
 }
 
+static void gfx_v7_0_compute_queue_acquire(struct amdgpu_device *adev)
+{
+	int i, queue, pipe, mec;
+
+	/* policy for amdgpu compute queue ownership */
+	for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
+		queue = i % adev->gfx.mec.num_queue_per_pipe;
+		pipe = (i / adev->gfx.mec.num_queue_per_pipe)
+			% adev->gfx.mec.num_pipe_per_mec;
+		mec = (i / adev->gfx.mec.num_queue_per_pipe)
+			/ adev->gfx.mec.num_pipe_per_mec;
+
+		/* we've run out of HW */
+		if (mec > adev->gfx.mec.num_mec)
+			break;
+
+		/* policy: amdgpu owns all queues in the first pipe */
+		if (mec == 0 && pipe == 0)
+			set_bit(i, adev->gfx.mec.queue_bitmap);
+	}
+
+	/* update the number of active compute rings */
+	adev->gfx.num_compute_rings =
+		bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
+
+	/* If you hit this case and edited the policy, you probably just
+	 * need to increase AMDGPU_MAX_COMPUTE_RINGS */
+	WARN_ON(adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS);
+	if (adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS)
+		adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
+}
+
 static int gfx_v7_0_mec_init(struct amdgpu_device *adev)
 {
 	int r;
 	u32 *hpd;
 	size_t mec_hpd_size;
 
-	/*
-	 * KV:    2 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 64 Queues total
-	 * CI/KB: 1 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 32 Queues total
-	 * Nonetheless, we assign only 1 pipe because all other pipes will
-	 * be handled by KFD
-	 */
+	bitmap_zero(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
+
 	switch (adev->asic_type) {
 	case CHIP_KAVERI:
 		adev->gfx.mec.num_mec = 2;
 		break;
 	case CHIP_BONAIRE:
 	case CHIP_HAWAII:
 	case CHIP_KABINI:
 	case CHIP_MULLINS:
 	default:
 		adev->gfx.mec.num_mec = 1;
 		break;
 	}
 	adev->gfx.mec.num_pipe_per_mec = 4;
 	adev->gfx.mec.num_queue_per_pipe = 8;
 
+	/* take ownership of the relevant compute queues */
+	gfx_v7_0_compute_queue_acquire(adev);
+
+	/* allocate space for ALL pipes (even the ones we don't own) */
 	mec_hpd_size = adev->gfx.mec.num_mec * adev->gfx.mec.num_pipe_per_mec
 		* GFX7_MEC_HPD_SIZE * 2;
 	if (adev->gfx.mec.hpd_eop_obj == NULL) {
 		r = amdgpu_bo_create(adev,
 				     mec_hpd_size,
 				     PAGE_SIZE, true,
 				     AMDGPU_GEM_DOMAIN_GTT, 0, NULL, NULL,
 				     &adev->gfx.mec.hpd_eop_obj);
 		if (r) {
 			dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
@@ -4522,21 +4553,21 @@ static const struct amdgpu_gfx_funcs gfx_v7_0_gfx_funcs = {
 static const struct amdgpu_rlc_funcs gfx_v7_0_rlc_funcs = {
 	.enter_safe_mode = gfx_v7_0_enter_rlc_safe_mode,
 	.exit_safe_mode = gfx_v7_0_exit_rlc_safe_mode
 };
 
 static int gfx_v7_0_early_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	adev->gfx.num_gfx_rings = GFX7_NUM_GFX_RINGS;
-	adev->gfx.num_compute_rings = GFX7_NUM_COMPUTE_RINGS;
+	adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
 	adev->gfx.funcs = &gfx_v7_0_gfx_funcs;
 	adev->gfx.rlc.funcs = &gfx_v7_0_rlc_funcs;
 	gfx_v7_0_set_ring_funcs(adev);
 	gfx_v7_0_set_irq_funcs(adev);
 	gfx_v7_0_set_gds_init(adev);
 
 	return 0;
 }
 
 static int gfx_v7_0_late_init(void *handle)
@@ -4718,21 +4749,21 @@ static void gfx_v7_0_gpu_early_init(struct amdgpu_device *adev)
 		gb_addr_config |= (2 << GB_ADDR_CONFIG__ROW_SIZE__SHIFT);
 		break;
 	}
 	adev->gfx.config.gb_addr_config = gb_addr_config;
 }
 
 static int gfx_v7_0_sw_init(void *handle)
 {
 	struct amdgpu_ring *ring;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-	int i, r;
+	int i, r, ring_id;
 
 	/* EOP Event */
 	r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_LEGACY, 181, &adev->gfx.eop_irq);
 	if (r)
 		return r;
 
 	/* Privileged reg */
 	r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_LEGACY, 184,
 			      &adev->gfx.priv_reg_irq);
 	if (r)
@@ -4769,42 +4800,52 @@ static int gfx_v7_0_sw_init(void *handle)
 		ring = &adev->gfx.gfx_ring[i];
 		ring->ring_obj = NULL;
 		sprintf(ring->name, "gfx");
 		r = amdgpu_ring_init(adev, ring, 1024,
 				     &adev->gfx.eop_irq, AMDGPU_CP_IRQ_GFX_EOP);
 		if (r)
 			return r;
 	}
 
 	/* set up the compute queues */
-	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+	for (i = 0, ring_id = 0; i < AMDGPU_MAX_QUEUES; i++) {
 		unsigned irq_type;
 
-		/* max 32 queues per MEC */
-		if ((i >= 32) || (i >= AMDGPU_MAX_COMPUTE_RINGS)) {
-			DRM_ERROR("Too many (%d) compute rings!\n", i);
-			break;
-		}
-		ring = &adev->gfx.compute_ring[i];
+		if (!test_bit(i, adev->gfx.mec.queue_bitmap))
+			continue;
+
+		ring = &adev->gfx.compute_ring[ring_id];
+
+		/* mec0 is me1 */
+		ring->me = ((i / adev->gfx.mec.num_queue_per_pipe)
+				/ adev->gfx.mec.num_pipe_per_mec)
+				+ 1;
+		ring->pipe = (i / adev->gfx.mec.num_queue_per_pipe)
+				% adev->gfx.mec.num_pipe_per_mec;
+		ring->queue = i % adev->gfx.mec.num_queue_per_pipe;
+
 		ring->ring_obj = NULL;
 		ring->use_doorbell = true;
-		ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + i;
-		ring->me = 1; /* first MEC */
-		ring->pipe = i / 8;
-		ring->queue = i % 8;
+		ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + ring_id;
 		sprintf(ring->name, "comp_%d.%d.%d", ring->me, ring->pipe, ring->queue);
-		irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + ring->pipe;
+
+		irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
+			+ ((ring->me - 1) * adev->gfx.mec.num_pipe_per_mec)
+			+ ring->pipe;
+
 		/* type-2 packets are deprecated on MEC, use type-3 instead */
 		r = amdgpu_ring_init(adev, ring, 1024,
 				     &adev->gfx.eop_irq, irq_type);
 		if (r)
 			return r;
+
+		ring_id++;
 	}
 
 	/* reserve GDS, GWS and OA resource for gfx */
 	r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
 				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
 				    &adev->gds.gds_gfx_bo, NULL, NULL);
 	if (r)
 		return r;
 
 	r = amdgpu_bo_create_kernel(adev, adev->gds.gws.gfx_partition_size,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 86cdcb8..177992c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -45,21 +45,20 @@
 #include "gca/gfx_8_0_enum.h"
 #include "gca/gfx_8_0_sh_mask.h"
 #include "gca/gfx_8_0_enum.h"
 
 #include "dce/dce_10_0_d.h"
 #include "dce/dce_10_0_sh_mask.h"
 
 #include "smu/smu_7_1_3_d.h"
 
 #define GFX8_NUM_GFX_RINGS     1
-#define GFX8_NUM_COMPUTE_RINGS 8
 #define GFX8_MEC_HPD_SIZE 2048
 
 
 #define TOPAZ_GB_ADDR_CONFIG_GOLDEN 0x22010001
 #define CARRIZO_GB_ADDR_CONFIG_GOLDEN 0x22010001
 #define POLARIS11_GB_ADDR_CONFIG_GOLDEN 0x22011002
 #define TONGA_GB_ADDR_CONFIG_GOLDEN 0x22011003
 
 #define ARRAY_MODE(x)					((x) << GB_TILE_MODE0__ARRAY_MODE__SHIFT)
 #define PIPE_CONFIG(x)					((x) << GB_TILE_MODE0__PIPE_CONFIG__SHIFT)
@@ -1410,47 +1409,82 @@ static int gfx_v8_0_kiq_init_ring(struct amdgpu_device *adev,
 }
 static void gfx_v8_0_kiq_free_ring(struct amdgpu_ring *ring,
 				   struct amdgpu_irq_src *irq)
 {
 	amdgpu_wb_free(ring->adev, ring->adev->virt.reg_val_offs);
 	amdgpu_ring_fini(ring);
 }
 
 #define GFX8_MEC_HPD_SIZE 2048
 
+static void gfx_v8_0_compute_queue_acquire(struct amdgpu_device *adev)
+{
+	int i, queue, pipe, mec;
+
+	/* policy for amdgpu compute queue ownership */
+	for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
+		queue = i % adev->gfx.mec.num_queue_per_pipe;
+		pipe = (i / adev->gfx.mec.num_queue_per_pipe)
+			% adev->gfx.mec.num_pipe_per_mec;
+		mec = (i / adev->gfx.mec.num_queue_per_pipe)
+			/ adev->gfx.mec.num_pipe_per_mec;
+
+		/* we've run out of HW */
+		if (mec > adev->gfx.mec.num_mec)
+			break;
+
+		/* policy: amdgpu owns all queues in the first pipe */
+		if (mec == 0 && pipe == 0)
+			set_bit(i, adev->gfx.mec.queue_bitmap);
+	}
+
+	/* update the number of active compute rings */
+	adev->gfx.num_compute_rings =
+		bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
+
+	/* If you hit this case and edited the policy, you probably just
+	 * need to increase AMDGPU_MAX_COMPUTE_RINGS */
+	if (WARN_ON(adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS))
+		adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
+}
+
 static int gfx_v8_0_mec_init(struct amdgpu_device *adev)
 {
 	int r;
 	u32 *hpd;
 	size_t mec_hpd_size;
 
+	bitmap_zero(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
+
 	switch (adev->asic_type) {
 	case CHIP_FIJI:
 	case CHIP_TONGA:
 	case CHIP_POLARIS11:
 	case CHIP_POLARIS12:
 	case CHIP_POLARIS10:
 	case CHIP_CARRIZO:
 		adev->gfx.mec.num_mec = 2;
 		break;
 	case CHIP_TOPAZ:
 	case CHIP_STONEY:
 	default:
 		adev->gfx.mec.num_mec = 1;
 		break;
 	}
 
 	adev->gfx.mec.num_pipe_per_mec = 4;
 	adev->gfx.mec.num_queue_per_pipe = 8;
 
-	/* only 1 pipe of the first MEC is owned by amdgpu */
-	mec_hpd_size = 1 * 1 * adev->gfx.mec.num_queue_per_pipe * GFX8_MEC_HPD_SIZE;
+	/* take ownership of the relevant compute queues */
+	gfx_v8_0_compute_queue_acquire(adev);
+
+	mec_hpd_size = adev->gfx.num_compute_rings * GFX8_MEC_HPD_SIZE;
 
 	if (adev->gfx.mec.hpd_eop_obj == NULL) {
 		r = amdgpu_bo_create(adev,
 				     mec_hpd_size,
 				     PAGE_SIZE, true,
 				     AMDGPU_GEM_DOMAIN_GTT, 0, NULL, NULL,
 				     &adev->gfx.mec.hpd_eop_obj);
 		if (r) {
 			dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
 			return r;
@@ -2083,21 +2117,21 @@ static int gfx_v8_0_gpu_early_init(struct amdgpu_device *adev)
 		gb_addr_config = REG_SET_FIELD(gb_addr_config, GB_ADDR_CONFIG, ROW_SIZE, 2);
 		break;
 	}
 	adev->gfx.config.gb_addr_config = gb_addr_config;
 
 	return 0;
 }
 
 static int gfx_v8_0_sw_init(void *handle)
 {
-	int i, r;
+	int i, r, ring_id;
 	struct amdgpu_ring *ring;
 	struct amdgpu_kiq *kiq;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	/* KIQ event */
 	r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_LEGACY, 178, &adev->gfx.kiq.irq);
 	if (r)
 		return r;
 
 	/* EOP Event */
@@ -2150,43 +2184,56 @@ static int gfx_v8_0_sw_init(void *handle)
 			ring->doorbell_index = AMDGPU_DOORBELL_GFX_RING0;
 		}
 
 		r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
 				     AMDGPU_CP_IRQ_GFX_EOP);
 		if (r)
 			return r;
 	}
 
 	/* set up the compute queues */
-	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+	for (i = 0, ring_id = 0; i < AMDGPU_MAX_QUEUES; i++) {
 		unsigned irq_type;
 
-		/* max 32 queues per MEC */
-		if ((i >= 32) || (i >= AMDGPU_MAX_COMPUTE_RINGS)) {
-			DRM_ERROR("Too many (%d) compute rings!\n", i);
+		if (!test_bit(i, adev->gfx.mec.queue_bitmap))
+			continue;
+
+		if (WARN_ON(ring_id >= AMDGPU_MAX_COMPUTE_RINGS))
 			break;
-		}
-		ring = &adev->gfx.compute_ring[i];
+
+		ring = &adev->gfx.compute_ring[ring_id];
+
+		/* mec0 is me1 */
+		ring->me = ((i / adev->gfx.mec.num_queue_per_pipe)
+				/ adev->gfx.mec.num_pipe_per_mec)
+				+ 1;
+		ring->pipe = (i / adev->gfx.mec.num_queue_per_pipe)
+				% adev->gfx.mec.num_pipe_per_mec;
+		ring->queue = i % adev->gfx.mec.num_queue_per_pipe;
+
 		ring->ring_obj = NULL;
 		ring->use_doorbell = true;
-		ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + i;
-		ring->me = 1; /* first MEC */
-		ring->pipe = i / 8;
-		ring->queue = i % 8;
-		ring->eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr + (i * GFX8_MEC_HPD_SIZE);
+		ring->eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr + (ring_id * GFX8_MEC_HPD_SIZE);
+		ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + ring_id;
 		sprintf(ring->name, "comp_%d.%d.%d", ring->me, ring->pipe, ring->queue);
-		irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + ring->pipe;
+
+		irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
+			+ ((ring->me - 1) * adev->gfx.mec.num_pipe_per_mec)
+			+ ring->pipe;
+
 		/* type-2 packets are deprecated on MEC, use type-3 instead */
 		r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
 				     irq_type);
 		if (r)
 			return r;
+
+		ring_id++;
 	}
 
 	r = gfx_v8_0_kiq_init(adev);
 	if (r) {
 		DRM_ERROR("Failed to init KIQ BOs!\n");
 		return r;
 	}
 
 	kiq = &adev->gfx.kiq;
 	r = gfx_v8_0_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
@@ -5686,21 +5733,21 @@ static const struct amdgpu_gfx_funcs gfx_v8_0_gfx_funcs = {
 	.select_se_sh = &gfx_v8_0_select_se_sh,
 	.read_wave_data = &gfx_v8_0_read_wave_data,
 	.read_wave_sgprs = &gfx_v8_0_read_wave_sgprs,
 };
 
 static int gfx_v8_0_early_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	adev->gfx.num_gfx_rings = GFX8_NUM_GFX_RINGS;
-	adev->gfx.num_compute_rings = GFX8_NUM_COMPUTE_RINGS;
+	adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
 	adev->gfx.funcs = &gfx_v8_0_gfx_funcs;
 	gfx_v8_0_set_ring_funcs(adev);
 	gfx_v8_0_set_irq_funcs(adev);
 	gfx_v8_0_set_gds_init(adev);
 	gfx_v8_0_set_rlc_funcs(adev);
 
 	return 0;
 }
 
 static int gfx_v8_0_late_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 1a7b743..de6e537 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -31,21 +31,20 @@
 #include "vega10/GC/gc_9_0_offset.h"
 #include "vega10/GC/gc_9_0_sh_mask.h"
 #include "vega10/vega10_enum.h"
 #include "vega10/HDP/hdp_4_0_offset.h"
 
 #include "soc15_common.h"
 #include "clearstate_gfx9.h"
 #include "v9_structs.h"
 
 #define GFX9_NUM_GFX_RINGS     1
-#define GFX9_NUM_COMPUTE_RINGS 8
 #define GFX9_NUM_SE		4
 #define RLCG_UCODE_LOADING_START_ADDRESS 0x2000
 
 MODULE_FIRMWARE("amdgpu/vega10_ce.bin");
 MODULE_FIRMWARE("amdgpu/vega10_pfp.bin");
 MODULE_FIRMWARE("amdgpu/vega10_me.bin");
 MODULE_FIRMWARE("amdgpu/vega10_mec.bin");
 MODULE_FIRMWARE("amdgpu/vega10_mec2.bin");
 MODULE_FIRMWARE("amdgpu/vega10_rlc.bin");
 
@@ -469,45 +468,79 @@ static void gfx_v9_0_mec_fini(struct amdgpu_device *adev)
 		amdgpu_bo_unpin(adev->gfx.mec.mec_fw_obj);
 		amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_obj);
 
 		amdgpu_bo_unref(&adev->gfx.mec.mec_fw_obj);
 		adev->gfx.mec.mec_fw_obj = NULL;
 	}
 }
 
 #define MEC_HPD_SIZE 2048
 
+static void gfx_v9_0_compute_queue_acquire(struct amdgpu_device *adev)
+{
+	int i, queue, pipe, mec;
+
+	/* policy for amdgpu compute queue ownership */
+	for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
+		queue = i % adev->gfx.mec.num_queue_per_pipe;
+		pipe = (i / adev->gfx.mec.num_queue_per_pipe)
+			% adev->gfx.mec.num_pipe_per_mec;
+		mec = (i / adev->gfx.mec.num_queue_per_pipe)
+			/ adev->gfx.mec.num_pipe_per_mec;
+
+		/* we've run out of HW */
+		if (mec > adev->gfx.mec.num_mec)
+			break;
+
+		/* policy: amdgpu owns all queues in the first pipe */
+		if (mec == 0 && pipe == 0)
+			set_bit(i, adev->gfx.mec.queue_bitmap);
+	}
+
+	/* update the number of active compute rings */
+	adev->gfx.num_compute_rings =
+		bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
+
+	/* If you hit this case and edited the policy, you probably just
+	 * need to increase AMDGPU_MAX_COMPUTE_RINGS */
+	if (WARN_ON(adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS))
+		adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
+}
+
 static int gfx_v9_0_mec_init(struct amdgpu_device *adev)
 {
 	int r;
 	u32 *hpd;
 	const __le32 *fw_data;
 	unsigned fw_size;
 	u32 *fw;
 	size_t mec_hpd_size;
 
 	const struct gfx_firmware_header_v1_0 *mec_hdr;
 
+	bitmap_zero(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
+
 	switch (adev->asic_type) {
 	case CHIP_VEGA10:
 		adev->gfx.mec.num_mec = 2;
 		break;
 	default:
 		adev->gfx.mec.num_mec = 1;
 		break;
 	}
 
 	adev->gfx.mec.num_pipe_per_mec = 4;
 	adev->gfx.mec.num_queue_per_pipe = 8;
 
-	/* only 1 pipe of the first MEC is owned by amdgpu */
-	mec_hpd_size = 1 * 1 * adev->gfx.mec.num_queue_per_pipe * MEC_HPD_SIZE;
+	/* take ownership of the relevant compute queues */
+	gfx_v9_0_compute_queue_acquire(adev);
+	mec_hpd_size = adev->gfx.num_compute_rings * MEC_HPD_SIZE;
 
 	if (adev->gfx.mec.hpd_eop_obj == NULL) {
 		r = amdgpu_bo_create(adev,
 				     mec_hpd_size,
 				     PAGE_SIZE, true,
 				     AMDGPU_GEM_DOMAIN_GTT, 0, NULL, NULL,
 				     &adev->gfx.mec.hpd_eop_obj);
 		if (r) {
 			dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
 			return r;
@@ -1024,21 +1057,21 @@ static int gfx_v9_0_ngg_en(struct amdgpu_device *adev)
 	gfx_v9_0_write_data_to_reg(ring, 0, false,
 				   amdgpu_gds_reg_offset[0].mem_size, 0);
 
 	amdgpu_ring_commit(ring);
 
 	return 0;
 }
 
 static int gfx_v9_0_sw_init(void *handle)
 {
-	int i, r;
+	int i, r, ring_id;
 	struct amdgpu_ring *ring;
 	struct amdgpu_kiq *kiq;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	/* KIQ event */
 	r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_GRBM_CP, 178, &adev->gfx.kiq.irq);
 	if (r)
 		return r;
 
 	/* EOP Event */
@@ -1081,21 +1114,60 @@ static int gfx_v9_0_sw_init(void *handle)
 		sprintf(ring->name, "gfx");
 		ring->use_doorbell = true;
 		ring->doorbell_index = AMDGPU_DOORBELL64_GFX_RING0 << 1;
 		r = amdgpu_ring_init(adev, ring, 1024,
 				     &adev->gfx.eop_irq, AMDGPU_CP_IRQ_GFX_EOP);
 		if (r)
 			return r;
 	}
 
 	/* set up the compute queues */
-	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+	for (i = 0, ring_id = 0; i < AMDGPU_MAX_QUEUES; i++) {
+		unsigned irq_type;
+
+		if (!test_bit(i, adev->gfx.mec.queue_bitmap))
+			continue;
+
+		if (WARN_ON(ring_id >= AMDGPU_MAX_COMPUTE_RINGS))
+			break;
+
+		ring = &adev->gfx.compute_ring[ring_id];
+
+		/* mec0 is me1 */
+		ring->me = ((i / adev->gfx.mec.num_queue_per_pipe)
+				/ adev->gfx.mec.num_pipe_per_mec)
+				+ 1;
+		ring->pipe = (i / adev->gfx.mec.num_queue_per_pipe)
+				% adev->gfx.mec.num_pipe_per_mec;
+		ring->queue = i % adev->gfx.mec.num_queue_per_pipe;
+
+		ring->ring_obj = NULL;
+		ring->use_doorbell = true;
+		ring->eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr + (ring_id * MEC_HPD_SIZE);
+		ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + ring_id;
+		sprintf(ring->name, "comp_%d.%d.%d", ring->me, ring->pipe, ring->queue);
+
+		irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
+			+ ((ring->me - 1) * adev->gfx.mec.num_pipe_per_mec)
+			+ ring->pipe;
+
+		/* type-2 packets are deprecated on MEC, use type-3 instead */
+		r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
+				     irq_type);
+		if (r)
+			return r;
+
+		ring_id++;
+	}
+
+	/* set up the compute queues */
+	for (i = 0, ring_id = 0; i < AMDGPU_MAX_QUEUES; i++) {
 		unsigned irq_type;
 
 		/* max 32 queues per MEC */
 		if ((i >= 32) || (i >= AMDGPU_MAX_COMPUTE_RINGS)) {
 			DRM_ERROR("Too many (%d) compute rings!\n", i);
 			break;
 		}
 		ring = &adev->gfx.compute_ring[i];
 		ring->ring_obj = NULL;
 		ring->use_doorbell = true;
@@ -2655,21 +2727,21 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
 	gfx_v9_0_write_data_to_reg(ring, 0, false,
 				   amdgpu_gds_reg_offset[vmid].oa,
 				   (1 << (oa_size + oa_base)) - (1 << oa_base));
 }
 
 static int gfx_v9_0_early_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	adev->gfx.num_gfx_rings = GFX9_NUM_GFX_RINGS;
-	adev->gfx.num_compute_rings = GFX9_NUM_COMPUTE_RINGS;
+	adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
 	gfx_v9_0_set_ring_funcs(adev);
 	gfx_v9_0_set_irq_funcs(adev);
 	gfx_v9_0_set_gds_init(adev);
 	gfx_v9_0_set_rlc_funcs(adev);
 
 	return 0;
 }
 
 static int gfx_v9_0_late_init(void *handle)
 {
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index a09d9f3..67f6d19 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -26,20 +26,21 @@
  */
 
 #ifndef KGD_KFD_INTERFACE_H_INCLUDED
 #define KGD_KFD_INTERFACE_H_INCLUDED
 
 #include <linux/types.h>
 
 struct pci_dev;
 
 #define KFD_INTERFACE_VERSION 1
+#define KGD_MAX_QUEUES 128
 
 struct kfd_dev;
 struct kgd_dev;
 
 struct kgd_mem;
 
 enum kgd_memory_pool {
 	KGD_POOL_SYSTEM_CACHEABLE = 1,
 	KGD_POOL_SYSTEM_WRITECOMBINE = 2,
 	KGD_POOL_FRAMEBUFFER = 3,
-- 
2.9.3