[PATCH 1/5] drm/amdgpu: Add work pool to reset domain

Lijo Lazar lijo.lazar at amd.com
Fri Aug 11 06:02:30 UTC 2023


Add a work pool to the reset domain. The work pool is used to schedule
tasks in the reset domain. If a successful reset of the domain is
indicated by a flag in the reset context, all queued work items are
drained and their work handlers are not executed.

Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 104 +++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  22 +++++
 2 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 02d874799c16..713362a60c9f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -117,6 +117,51 @@ void amdgpu_reset_destroy_reset_domain(struct kref *ref)
 	kvfree(reset_domain);
 }
 
+static void amdgpu_reset_domain_cancel_all_work(struct work_struct *work) /* clear-work handler: cancel every pooled reset work item */
+{
+	struct amdgpu_reset_domain *reset_domain =
+		container_of(work, struct amdgpu_reset_domain, clear);
+	int i;
+
+	for (i = 0; i < AMDGPU_MAX_RESET_WORK; ++i)
+		if (atomic_cmpxchg(&reset_domain->work[i].in_use, 1, 0)) /* reclaim slots still marked busy */
+			cancel_work(&reset_domain->work[i].work); /* stop it if not yet running */
+
+	drain_workqueue(reset_domain->wq); /* wait out any in-flight handlers */
+	reset_domain->drain = false; /* NOTE(review): plain bool written here, read in the work handler with no locking/barrier — confirm ordering is safe or use an atomic */
+}
+
+static void amdgpu_reset_work_handler(struct work_struct *work) /* pool work handler: runs the caller-supplied reset function */
+{
+	struct amdgpu_reset_work *reset_work =
+		container_of(work, struct amdgpu_reset_work, work);
+
+	/* Don't do anything if reset domain is in drain mode */
+	if (reset_work->domain->drain) /* NOTE(review): unsynchronized bool read; in_use stays set on this path and is cleared by cancel_all_work's cmpxchg — confirm a racing return here cannot leak the slot */
+		return;
+
+	reset_work->handler(&reset_work->context); /* invoke on the work item's private context copy */
+	if (reset_work->context.flags & (1U << AMDGPU_RESET_CANCEL_ALL)) { /* handler asked for domain-wide cancel */
+		reset_work->domain->drain = true;
+		schedule_work(&reset_work->domain->clear); /* run the drain on the system wq, not the domain wq being drained */
+	}
+
+	atomic_set(&reset_work->in_use, 0); /* return this slot to the pool */
+}
+
+static void
+amdgpu_reset_init_work_pool(struct amdgpu_reset_domain *reset_domain)
+{
+	struct amdgpu_reset_work *rwork; /* slot being initialized */
+	int idx;
+	/* Prepare every pooled slot: bound to this domain, marked free. */
+	for (idx = 0; idx < AMDGPU_MAX_RESET_WORK; ++idx) {
+		rwork = &reset_domain->work[idx];
+		rwork->domain = reset_domain;
+		atomic_set(&rwork->in_use, 0);
+		INIT_WORK(&rwork->work, amdgpu_reset_work_handler);
+	}
+}
+
 struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
 							     char *wq_name)
 {
@@ -139,6 +184,8 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
 
 	}
 
+	INIT_WORK(&reset_domain->clear, amdgpu_reset_domain_cancel_all_work);
+	amdgpu_reset_init_work_pool(reset_domain);
 	atomic_set(&reset_domain->in_gpu_reset, 0);
 	atomic_set(&reset_domain->reset_res, 0);
 	init_rwsem(&reset_domain->sem);
@@ -152,12 +199,67 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
 	down_write(&reset_domain->sem);
 }
 
-
 void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
 {
 	atomic_set(&reset_domain->in_gpu_reset, 0);
 	up_write(&reset_domain->sem);
 }
 
+static int
+amdgpu_reset_domain_get_work(struct amdgpu_reset_domain *reset_domain,
+			     struct amdgpu_reset_work **reset_work)
+{
+	int idx;
 
+	if (!reset_work)
+		return -EINVAL;
+
+	*reset_work = NULL;
+	/* Claim the first free slot; cmpxchg 0->1 returns 0 on success. */
+	for (idx = 0; idx < AMDGPU_MAX_RESET_WORK; ++idx) {
+		if (atomic_cmpxchg(&reset_domain->work[idx].in_use, 0, 1))
+			continue;
+		*reset_work = &reset_domain->work[idx];
+		return 0;
+	}
+
+	return -EBUSY; /* every pooled slot is currently in use */
+}
+
+static void amdgpu_reset_init_work(struct amdgpu_reset_work *reset_work,
+				   struct amdgpu_reset_context *reset_context,
+				   amdgpu_reset_work_func_t reset_work_handler)
+{
+	reset_work->context = *reset_context; /* snapshot; caller's copy may go out of scope */
+	reset_work->handler = reset_work_handler;
+}
+
+/* Schedule @reset_work_handler on the device's reset-domain workqueue
+ * with a private copy of @reset_context. When AMDGPU_RESET_SCHEDULE_NOW
+ * is set in the context flags, wait for the handler to complete.
+ * Returns 0 on success, -EINVAL on bad arguments, or -EBUSY when all
+ * pooled work slots are in use. */
+int amdgpu_reset_schedule_work(struct amdgpu_device *adev,
+			       struct amdgpu_reset_context *reset_context,
+			       amdgpu_reset_work_func_t reset_work_handler)
+{
+	struct amdgpu_reset_work *reset_work;
+	int ret;
+
+	if (!reset_context || !reset_context->reset_req_dev ||
+	    !reset_work_handler)
+		return -EINVAL;
+
+	ret = amdgpu_reset_domain_get_work(adev->reset_domain, &reset_work);
+	if (ret)
+		return ret;
+
+	/* ret is 0 past this point; the former "if (!ret)" guard was dead code */
+	amdgpu_reset_init_work(reset_work, reset_context, reset_work_handler);
+	queue_work(adev->reset_domain->wq, &reset_work->work);
+
+	if (reset_context->flags & (1U << AMDGPU_RESET_SCHEDULE_NOW))
+		flush_work(&reset_work->work);
+
+	return 0;
+}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 471d789b33a5..d1393050d3ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -27,12 +27,16 @@
 #include "amdgpu.h"
 
 #define AMDGPU_RESET_MAX_HANDLERS 5
+#define AMDGPU_MAX_RESET_WORK 5
 
 enum AMDGPU_RESET_FLAGS {
 
 	AMDGPU_NEED_FULL_RESET = 0,
 	AMDGPU_SKIP_HW_RESET = 1,
 	AMDGPU_RESET_FOR_DEVICE_REMOVE = 2,
+	AMDGPU_RESET_XCP = 3, /* NOTE(review): not referenced anywhere in this patch — confirm intended consumer */
+	AMDGPU_RESET_SCHEDULE_NOW = 4, /* amdgpu_reset_schedule_work flushes the work synchronously */
+	AMDGPU_RESET_CANCEL_ALL = 5, /* set by a handler to cancel/drain all queued reset work */
 };
 
 struct amdgpu_reset_context {
@@ -80,13 +84,28 @@ enum amdgpu_reset_domain_type {
 	XGMI_HIVE
 };
 
+typedef void (*amdgpu_reset_work_func_t)(
+	struct amdgpu_reset_context *reset_context); /* handler type run by the reset work pool */
+
+struct amdgpu_reset_work { /* one schedulable slot in a reset domain's work pool */
+	struct work_struct work; /* queued on the domain's workqueue */
+	struct amdgpu_reset_context context; /* private copy of the caller's context */
+	struct amdgpu_reset_domain *domain; /* back-pointer to the owning domain */
+	atomic_t in_use; /* 0 = free, 1 = claimed */
+
+	amdgpu_reset_work_func_t handler; /* invoked with &context by the work handler */
+};
+
 struct amdgpu_reset_domain {
 	struct kref refcount;
 	struct workqueue_struct *wq;
 	enum amdgpu_reset_domain_type type;
+	struct amdgpu_reset_work work[AMDGPU_MAX_RESET_WORK]; /* fixed pool of schedulable reset work slots */
+	struct work_struct clear; /* runs amdgpu_reset_domain_cancel_all_work (scheduled on the system wq) */
 	struct rw_semaphore sem;
 	atomic_t in_gpu_reset;
 	atomic_t reset_res;
+	bool drain; /* NOTE(review): read/written from multiple contexts without locking — consider atomic_t or READ_ONCE/WRITE_ONCE */
 };
 
 
@@ -129,6 +148,9 @@ static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma
 void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
 
 void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
+int amdgpu_reset_schedule_work(struct amdgpu_device *adev,
+			       struct amdgpu_reset_context *reset_context,
+			       amdgpu_reset_work_func_t handler);
 
 #define for_each_handler(i, handler, reset_ctl)                  \
 	for (i = 0; (i < AMDGPU_RESET_MAX_HANDLERS) &&           \
-- 
2.25.1



More information about the amd-gfx mailing list