[PATCH v2] drm/amdkfd: Replace pr_err with dev_err
Asad Kamal
asad.kamal at amd.com
Sat Aug 26 13:41:48 UTC 2023
Replace pr_err with dev_err so that kfd queue error messages
include the bus-id of the failing device.
Signed-off-by: Asad Kamal <asad.kamal at amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar at amd.com>
---
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 116 +++++++++++-------
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
2 files changed, 71 insertions(+), 47 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index b166f30f083e..cd6cfffd6436 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -232,8 +232,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
queue_type = convert_to_mes_queue_type(q->properties.type);
if (queue_type < 0) {
- pr_err("Queue type not supported with MES, queue:%d\n",
- q->properties.type);
+ dev_err(adev->dev, "Queue type not supported with MES, queue:%d\n",
+ q->properties.type);
return -EINVAL;
}
queue_input.queue_type = (uint32_t)queue_type;
@@ -244,9 +244,9 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
amdgpu_mes_unlock(&adev->mes);
if (r) {
- pr_err("failed to add hardware queue to MES, doorbell=0x%x\n",
+ dev_err(adev->dev, "failed to add hardware queue to MES, doorbell=0x%x\n",
q->properties.doorbell_off);
- pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
+ dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
kfd_hws_hang(dqm);
}
@@ -272,9 +272,9 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
amdgpu_mes_unlock(&adev->mes);
if (r) {
- pr_err("failed to remove hardware queue from MES, doorbell=0x%x\n",
+ dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
q->properties.doorbell_off);
- pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
+ dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
kfd_hws_hang(dqm);
}
@@ -284,6 +284,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
static int remove_all_queues_mes(struct device_queue_manager *dqm)
{
struct device_process_node *cur;
+ struct device *dev = dqm->dev->adev->dev;
struct qcm_process_device *qpd;
struct queue *q;
int retval = 0;
@@ -294,7 +295,7 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm)
if (q->properties.is_active) {
retval = remove_queue_mes(dqm, q, qpd);
if (retval) {
- pr_err("%s: Failed to remove queue %d for dev %d",
+ dev_err(dev, "%s: Failed to remove queue %d for dev %d",
__func__,
q->properties.queue_id,
dqm->dev->id);
@@ -443,6 +444,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
{
+ struct device *dev = dqm->dev->adev->dev;
int allocated_vmid = -1, i;
for (i = dqm->dev->vm_info.first_vmid_kfd;
@@ -454,7 +456,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
}
if (allocated_vmid < 0) {
- pr_err("no more vmid to allocate\n");
+ dev_err(dev, "no more vmid to allocate\n");
return -ENOSPC;
}
@@ -510,10 +512,12 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
{
+ struct device *dev = dqm->dev->adev->dev;
+
/* On GFX v7, CP doesn't flush TC at dequeue */
if (q->device->adev->asic_type == CHIP_HAWAII)
if (flush_texture_cache_nocpsch(q->device, qpd))
- pr_err("Failed to flush TC\n");
+ dev_err(dev, "Failed to flush TC\n");
kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY);
@@ -708,7 +712,7 @@ static int dbgdev_wave_reset_wavefronts(struct kfd_node *dev, struct kfd_process
pr_debug("Killing all process wavefronts\n");
if (!dev->kfd2kgd->get_atc_vmid_pasid_mapping_info) {
- pr_err("no vmid pasid mapping supported \n");
+ dev_err(dev->adev->dev, "no vmid pasid mapping supported\n");
return -EOPNOTSUPP;
}
@@ -729,7 +733,7 @@ static int dbgdev_wave_reset_wavefronts(struct kfd_node *dev, struct kfd_process
}
if (vmid > last_vmid_to_scan) {
- pr_err("Didn't find vmid for pasid 0x%x\n", p->pasid);
+ dev_err(dev->adev->dev, "Didn't find vmid for pasid 0x%x\n", p->pasid);
return -EFAULT;
}
@@ -821,6 +825,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
{
int retval;
uint64_t sdma_val = 0;
+ struct device *dev = dqm->dev->adev->dev;
struct kfd_process_device *pdd = qpd_to_pdd(qpd);
struct mqd_manager *mqd_mgr =
dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];
@@ -831,7 +836,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
retval = read_sdma_queue_counter((uint64_t __user *)q->properties.read_ptr,
&sdma_val);
if (retval)
- pr_err("Failed to read SDMA queue counter for queue: %d\n",
+ dev_err(dev, "Failed to read SDMA queue counter for queue: %d\n",
q->properties.queue_id);
}
@@ -850,6 +855,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
struct mqd_update_info *minfo)
{
int retval = 0;
+ struct device *dev = dqm->dev->adev->dev;
struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
bool prev_active = false;
@@ -875,7 +881,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
retval = remove_queue_mes(dqm, q, &pdd->qpd);
if (retval) {
- pr_err("unmap queue failed\n");
+ dev_err(dev, "unmap queue failed\n");
goto out_unlock;
}
} else if (prev_active &&
@@ -894,7 +900,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN),
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
if (retval) {
- pr_err("destroy mqd failed\n");
+ dev_err(dev, "destroy mqd failed\n");
goto out_unlock;
}
}
@@ -1088,6 +1094,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
+ struct device *dev = dqm->dev->adev->dev;
struct kfd_process_device *pdd;
int retval = 0;
@@ -1121,7 +1128,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
if (dqm->dev->kfd->shared_resources.enable_mes) {
retval = remove_queue_mes(dqm, q, qpd);
if (retval) {
- pr_err("Failed to evict queue %d\n",
+ dev_err(dev, "Failed to evict queue %d\n",
q->properties.queue_id);
goto out;
}
@@ -1225,6 +1232,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
+ struct device *dev = dqm->dev->adev->dev;
struct kfd_process_device *pdd;
uint64_t eviction_duration;
int retval = 0;
@@ -1265,7 +1273,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
if (dqm->dev->kfd->shared_resources.enable_mes) {
retval = add_queue_mes(dqm, q, qpd);
if (retval) {
- pr_err("Failed to restore queue %d\n",
+ dev_err(dev, "Failed to restore queue %d\n",
q->properties.queue_id);
goto out;
}
@@ -1474,18 +1482,19 @@ static void pre_reset(struct device_queue_manager *dqm)
static int allocate_sdma_queue(struct device_queue_manager *dqm,
struct queue *q, const uint32_t *restore_sdma_id)
{
+ struct device *dev = dqm->dev->adev->dev;
int bit;
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
if (bitmap_empty(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
- pr_err("No more SDMA queue to allocate\n");
+ dev_err(dev, "No more SDMA queue to allocate\n");
return -ENOMEM;
}
if (restore_sdma_id) {
/* Re-use existing sdma_id */
if (!test_bit(*restore_sdma_id, dqm->sdma_bitmap)) {
- pr_err("SDMA queue already in use\n");
+ dev_err(dev, "SDMA queue already in use\n");
return -EBUSY;
}
clear_bit(*restore_sdma_id, dqm->sdma_bitmap);
@@ -1504,13 +1513,13 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
kfd_get_num_sdma_engines(dqm->dev);
} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
if (bitmap_empty(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
- pr_err("No more XGMI SDMA queue to allocate\n");
+ dev_err(dev, "No more XGMI SDMA queue to allocate\n");
return -ENOMEM;
}
if (restore_sdma_id) {
/* Re-use existing sdma_id */
if (!test_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap)) {
- pr_err("SDMA queue already in use\n");
+ dev_err(dev, "SDMA queue already in use\n");
return -EBUSY;
}
clear_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap);
@@ -1562,6 +1571,7 @@ static int set_sched_resources(struct device_queue_manager *dqm)
{
int i, mec;
struct scheduling_resources res;
+ struct device *dev = dqm->dev->adev->dev;
res.vmid_mask = dqm->dev->compute_vmid_bitmap;
@@ -1582,7 +1592,7 @@ static int set_sched_resources(struct device_queue_manager *dqm)
* definition of res.queue_mask needs updating
*/
if (WARN_ON(i >= (sizeof(res.queue_mask)*8))) {
- pr_err("Invalid queue enabled by amdgpu: %d\n", i);
+ dev_err(dev, "Invalid queue enabled by amdgpu: %d\n", i);
break;
}
@@ -1625,6 +1635,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
static int start_cpsch(struct device_queue_manager *dqm)
{
+ struct device *dev = dqm->dev->adev->dev;
int retval;
retval = 0;
@@ -1671,7 +1682,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
retval = pm_update_grace_period(&dqm->packet_mgr,
grace_period);
if (retval)
- pr_err("Setting grace timeout failed\n");
+ dev_err(dev, "Setting grace timeout failed\n");
else if (dqm->dev->kfd2kgd->build_grace_period_packet_info)
/* Update dqm->wait_times maintained in software */
dqm->dev->kfd2kgd->build_grace_period_packet_info(
@@ -1881,15 +1892,17 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
return retval;
}
-int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
- uint64_t fence_value,
- unsigned int timeout_ms)
+int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm,
+ uint64_t fence_value,
+ unsigned int timeout_ms)
{
unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies;
+ struct device *dev = dqm->dev->adev->dev;
+ uint64_t *fence_addr = dqm->fence_addr;
while (*fence_addr != fence_value) {
if (time_after(jiffies, end_jiffies)) {
- pr_err("qcm fence wait loop timeout expired\n");
+ dev_err(dev, "qcm fence wait loop timeout expired\n");
/* In HWS case, this is used to halt the driver thread
* in order not to mess up CP states before doing
* scandumps for FW debugging.
@@ -1908,6 +1921,7 @@ int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
/* dqm->lock mutex has to be locked before calling this function */
static int map_queues_cpsch(struct device_queue_manager *dqm)
{
+ struct device *dev = dqm->dev->adev->dev;
int retval;
if (!dqm->sched_running)
@@ -1920,7 +1934,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
retval = pm_send_runlist(&dqm->packet_mgr, &dqm->queues);
pr_debug("%s sent runlist\n", __func__);
if (retval) {
- pr_err("failed to execute runlist\n");
+ dev_err(dev, "failed to execute runlist\n");
return retval;
}
dqm->active_runlist = true;
@@ -1935,8 +1949,9 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
uint32_t grace_period,
bool reset)
{
- int retval = 0;
+ struct device *dev = dqm->dev->adev->dev;
struct mqd_manager *mqd_mgr;
+ int retval = 0;
if (!dqm->sched_running)
return 0;
@@ -1959,10 +1974,10 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr,
KFD_FENCE_COMPLETED);
/* should be timed out */
- retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
- queue_preemption_timeout_ms);
+ retval = amdkfd_fence_wait_timeout(dqm, KFD_FENCE_COMPLETED,
+ queue_preemption_timeout_ms);
if (retval) {
- pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+ dev_err(dev, "The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
kfd_hws_hang(dqm);
return retval;
}
@@ -1977,7 +1992,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
*/
mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
if (mqd_mgr->read_doorbell_id(dqm->packet_mgr.priv_queue->queue->mqd)) {
- pr_err("HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n");
+ dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n");
while (halt_if_hws_hang)
schedule();
return -ETIME;
@@ -1987,7 +2002,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
if (pm_update_grace_period(&dqm->packet_mgr,
USE_DEFAULT_GRACE_PERIOD))
- pr_err("Failed to reset grace period\n");
+ dev_err(dev, "Failed to reset grace period\n");
}
pm_release_ib(&dqm->packet_mgr);
@@ -2061,6 +2076,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
struct mqd_manager *mqd_mgr;
uint64_t sdma_val = 0;
struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+ struct device *dev = dqm->dev->adev->dev;
/* Get the SDMA queue stats */
if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
@@ -2068,7 +2084,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
retval = read_sdma_queue_counter((uint64_t __user *)q->properties.read_ptr,
&sdma_val);
if (retval)
- pr_err("Failed to read SDMA queue counter for queue: %d\n",
+ dev_err(dev, "Failed to read SDMA queue counter for queue: %d\n",
q->properties.queue_id);
}
@@ -2349,6 +2365,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
{
int retval;
struct queue *q;
+ struct device *dev = dqm->dev->adev->dev;
struct kernel_queue *kq, *kq_next;
struct mqd_manager *mqd_mgr;
struct device_process_node *cur, *next_dpn;
@@ -2382,7 +2399,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
if (dqm->dev->kfd->shared_resources.enable_mes) {
retval = remove_queue_mes(dqm, q, qpd);
if (retval)
- pr_err("Failed to remove queue %d\n",
+ dev_err(dev, "Failed to remove queue %d\n",
q->properties.queue_id);
}
}
@@ -2437,12 +2454,13 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
static int init_mqd_managers(struct device_queue_manager *dqm)
{
int i, j;
+ struct device *dev = dqm->dev->adev->dev;
struct mqd_manager *mqd_mgr;
for (i = 0; i < KFD_MQD_TYPE_MAX; i++) {
mqd_mgr = dqm->asic_ops.mqd_manager_init(i, dqm->dev);
if (!mqd_mgr) {
- pr_err("mqd manager [%d] initialization failed\n", i);
+ dev_err(dev, "mqd manager [%d] initialization failed\n", i);
goto out_free;
}
dqm->mqd_mgrs[i] = mqd_mgr;
@@ -2552,7 +2570,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
dqm->ops.checkpoint_mqd = checkpoint_mqd;
break;
default:
- pr_err("Invalid scheduling policy %d\n", dqm->sched_policy);
+ dev_err(dev->adev->dev, "Invalid scheduling policy %d\n", dqm->sched_policy);
goto out_free;
}
@@ -2590,7 +2608,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
goto out_free;
if (!dev->kfd->shared_resources.enable_mes && allocate_hiq_sdma_mqd(dqm)) {
- pr_err("Failed to allocate hiq sdma mqd trunk buffer\n");
+ dev_err(dev->adev->dev, "Failed to allocate hiq sdma mqd trunk buffer\n");
goto out_free;
}
@@ -2649,17 +2667,18 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
int r;
+ struct device *dev = dqm->dev->adev->dev;
int updated_vmid_mask;
if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
- pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+ dev_err(dev, "Unsupported on sched_policy: %i\n", dqm->sched_policy);
return -EINVAL;
}
dqm_lock(dqm);
if (dqm->trap_debug_vmid != 0) {
- pr_err("Trap debug id already reserved\n");
+ dev_err(dev, "Trap debug id already reserved\n");
r = -EBUSY;
goto out_unlock;
}
@@ -2695,19 +2714,20 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
int release_debug_trap_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
+ struct device *dev = dqm->dev->adev->dev;
int r;
int updated_vmid_mask;
uint32_t trap_debug_vmid;
if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
- pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+ dev_err(dev, "Unsupported on sched_policy: %i\n", dqm->sched_policy);
return -EINVAL;
}
dqm_lock(dqm);
trap_debug_vmid = dqm->trap_debug_vmid;
if (dqm->trap_debug_vmid == 0) {
- pr_err("Trap debug id is not reserved\n");
+ dev_err(dev, "Trap debug id is not reserved\n");
r = -EINVAL;
goto out_unlock;
}
@@ -2844,6 +2864,7 @@ int resume_queues(struct kfd_process *p,
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];
struct device_queue_manager *dqm = pdd->dev->dqm;
+ struct device *dev = dqm->dev->adev->dev;
struct qcm_process_device *qpd = &pdd->qpd;
struct queue *q;
int r, per_device_resumed = 0;
@@ -2894,7 +2915,7 @@ int resume_queues(struct kfd_process *p,
0,
USE_DEFAULT_GRACE_PERIOD);
if (r) {
- pr_err("Failed to resume process queues\n");
+ dev_err(dev, "Failed to resume process queues\n");
if (queue_ids) {
list_for_each_entry(q, &qpd->queues_list, list) {
int q_idx = q_array_get_index(
@@ -2946,6 +2967,7 @@ int suspend_queues(struct kfd_process *p,
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];
struct device_queue_manager *dqm = pdd->dev->dqm;
+ struct device *dev = dqm->dev->adev->dev;
struct qcm_process_device *qpd = &pdd->qpd;
struct queue *q;
int r, per_device_suspended = 0;
@@ -2994,7 +3016,7 @@ int suspend_queues(struct kfd_process *p,
grace_period);
if (r)
- pr_err("Failed to suspend process queues.\n");
+ dev_err(dev, "Failed to suspend process queues.\n");
else
total_suspended += per_device_suspended;
@@ -3081,10 +3103,11 @@ void set_queue_snapshot_entry(struct queue *q,
int debug_lock_and_unmap(struct device_queue_manager *dqm)
{
+ struct device *dev = dqm->dev->adev->dev;
int r;
if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
- pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+ dev_err(dev, "Unsupported on sched_policy: %i\n", dqm->sched_policy);
return -EINVAL;
}
@@ -3102,10 +3125,11 @@ int debug_lock_and_unmap(struct device_queue_manager *dqm)
int debug_map_and_unlock(struct device_queue_manager *dqm)
{
+ struct device *dev = dqm->dev->adev->dev;
int r;
if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
- pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+ dev_err(dev, "Unsupported on sched_policy: %i\n", dqm->sched_policy);
return -EINVAL;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 3d9ce44d88da..b315311dfe2a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1343,7 +1343,7 @@ int pqm_get_queue_snapshot(struct process_queue_manager *pqm,
int *num_qss_entries,
uint32_t *entry_size);
-int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
+int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm,
uint64_t fence_value,
unsigned int timeout_ms);
--
2.34.1
More information about the amd-gfx
mailing list