[PATCH 13/19] drm/amdkfd: Handle remaining BUG_ONs more gracefully

Oded Gabbay oded.gabbay at gmail.com
Sat Aug 12 14:39:18 UTC 2017


On Sat, Aug 12, 2017 at 12:56 AM, Felix Kuehling <Felix.Kuehling at amd.com> wrote:
> In most cases, BUG_ONs can be replaced with WARN_ON with an error
> return. In some void functions just turn them into a WARN_ON and
> possibly an early exit.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c            |  3 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c            |  3 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c            | 16 ++++----
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 19 ++++-----
>  .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c  |  2 +-
>  .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c   |  2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c      | 20 +++++++---
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c   |  3 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c    |  3 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    | 45 +++++++++++++---------
>  drivers/gpu/drm/amd/amdkfd/kfd_pasid.c             |  4 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c           |  9 ++---
>  .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  7 ++--
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c          |  4 +-
>  14 files changed, 84 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
> index 3841cad..0aa021a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
> @@ -60,7 +60,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
>         unsigned int *ib_packet_buff;
>         int status;
>
> -       BUG_ON(!size_in_bytes);
> +       if (WARN_ON(!size_in_bytes))
> +               return -EINVAL;
>
>         kq = dbgdev->kq;
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
> index 2d5555c..3da25f7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
> @@ -64,7 +64,8 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev)
>         enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ;
>         struct kfd_dbgmgr *new_buff;
>
> -       BUG_ON(!pdev->init_complete);
> +       if (WARN_ON(!pdev->init_complete))
> +               return false;
>
>         new_buff = kfd_alloc_struct(new_buff);
>         if (!new_buff) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 416955f..f628ac3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -98,7 +98,7 @@ static const struct kfd_device_info *lookup_device_info(unsigned short did)
>
>         for (i = 0; i < ARRAY_SIZE(supported_devices); i++) {
>                 if (supported_devices[i].did == did) {
> -                       BUG_ON(!supported_devices[i].device_info);
> +                       WARN_ON(!supported_devices[i].device_info);
>                         return supported_devices[i].device_info;
>                 }
>         }
> @@ -212,9 +212,8 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
>                         flags);
>
>         dev = kfd_device_by_pci_dev(pdev);
> -       BUG_ON(!dev);
> -
> -       kfd_signal_iommu_event(dev, pasid, address,
> +       if (!WARN_ON(!dev))
> +               kfd_signal_iommu_event(dev, pasid, address,
>                         flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC);
>
>         return AMD_IOMMU_INV_PRI_RSP_INVALID;
> @@ -397,9 +396,12 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
>  {
>         unsigned int num_of_longs;
>
> -       BUG_ON(buf_size < chunk_size);
> -       BUG_ON(buf_size == 0);
> -       BUG_ON(chunk_size == 0);
> +       if (WARN_ON(buf_size < chunk_size))
> +               return -EINVAL;
> +       if (WARN_ON(buf_size == 0))
> +               return -EINVAL;
> +       if (WARN_ON(chunk_size == 0))
> +               return -EINVAL;
>
>         kfd->gtt_sa_chunk_size = chunk_size;
>         kfd->gtt_sa_num_of_chunks = buf_size / chunk_size;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 43bc1b5..5dac29d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -386,7 +386,8 @@ static struct mqd_manager *get_mqd_manager_nocpsch(
>  {
>         struct mqd_manager *mqd;
>
> -       BUG_ON(type >= KFD_MQD_TYPE_MAX);
> +       if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
> +               return NULL;
>
>         pr_debug("mqd type %d\n", type);
>
> @@ -511,7 +512,7 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm)
>  {
>         int i;
>
> -       BUG_ON(dqm->queue_count > 0 || dqm->processes_count > 0);
> +       WARN_ON(dqm->queue_count > 0 || dqm->processes_count > 0);
>
>         kfree(dqm->allocated_queues);
>         for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++)
> @@ -1127,8 +1128,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>                 dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
>                 break;
>         default:
> -               BUG();
> -               break;
> +               pr_err("Invalid scheduling policy %d\n", sched_policy);
> +               goto out_free;
>         }
>
>         switch (dev->device_info->asic_family) {
> @@ -1141,12 +1142,12 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>                 break;
>         }
>
> -       if (dqm->ops.initialize(dqm)) {
> -               kfree(dqm);
> -               return NULL;
> -       }
> +       if (!dqm->ops.initialize(dqm))
> +               return dqm;
>
> -       return dqm;
> +out_free:
> +       kfree(dqm);
> +       return NULL;
>  }
>
>  void device_queue_manager_uninit(struct device_queue_manager *dqm)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
> index 43194b4..fadc56a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
> @@ -65,7 +65,7 @@ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
>          * for LDS/Scratch and GPUVM.
>          */
>
> -       BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE ||
> +       WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE ||
>                 top_address_nybble == 0);
>
>         return PRIVATE_BASE(top_address_nybble << 12) |
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
> index 47ef910..15e81ae 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
> @@ -67,7 +67,7 @@ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
>          * for LDS/Scratch and GPUVM.
>          */
>
> -       BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE ||
> +       WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE ||
>                 top_address_nybble == 0);
>
>         return top_address_nybble << 12 |
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 970bc07..0e4d4a9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -41,7 +41,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
>         int retval;
>         union PM4_MES_TYPE_3_HEADER nop;
>
> -       BUG_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ);
> +       if (WARN_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ))
> +               return false;
>
>         pr_debug("Initializing queue type %d size %d\n", KFD_QUEUE_TYPE_HIQ,
>                         queue_size);
> @@ -62,8 +63,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
>                                                 KFD_MQD_TYPE_HIQ);
>                 break;
>         default:
> -               BUG();
> -               break;
> +               pr_err("Invalid queue type %d\n", type);
> +               return false;
>         }
>
>         if (!kq->mqd)
> @@ -305,6 +306,7 @@ void kernel_queue_uninit(struct kernel_queue *kq)
>         kfree(kq);
>  }
>
> +/* FIXME: Can this test be removed? */
>  static __attribute__((unused)) void test_kq(struct kfd_dev *dev)
>  {
>         struct kernel_queue *kq;
> @@ -314,10 +316,18 @@ static __attribute__((unused)) void test_kq(struct kfd_dev *dev)
>         pr_err("Starting kernel queue test\n");
>
>         kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ);
> -       BUG_ON(!kq);
> +       if (unlikely(!kq)) {
> +               pr_err("  Failed to initialize HIQ\n");
> +               pr_err("Kernel queue test failed\n");
> +               return;
> +       }
>
>         retval = kq->ops.acquire_packet_buffer(kq, 5, &buffer);
> -       BUG_ON(retval != 0);
> +       if (unlikely(retval != 0)) {
> +               pr_err("  Failed to acquire packet buffer\n");
> +               pr_err("Kernel queue test failed\n");
> +               return;
> +       }
>         for (i = 0; i < 5; i++)
>                 buffer[i] = kq->nop_packet;
>         kq->ops.submit_packet(kq);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> index a11477d..7e0ec6b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> @@ -387,7 +387,8 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
>  {
>         struct mqd_manager *mqd;
>
> -       BUG_ON(type >= KFD_MQD_TYPE_MAX);
> +       if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
> +               return NULL;
>
>         mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
>         if (!mqd)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> index d638c2c..f4c8c23 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> @@ -233,7 +233,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
>  {
>         struct mqd_manager *mqd;
>
> -       BUG_ON(type >= KFD_MQD_TYPE_MAX);
> +       if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
> +               return NULL;
>
>         mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
>         if (!mqd)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index aacd5a3..77a6f2b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -35,7 +35,8 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
>  {
>         unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t);
>
> -       BUG_ON((temp * sizeof(uint32_t)) > buffer_size_bytes);
> +       WARN((temp * sizeof(uint32_t)) > buffer_size_bytes,
> +            "Runlist IB overflow");
>         *wptr = temp;
>  }
>
> @@ -94,7 +95,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
>  {
>         int retval;
>
> -       BUG_ON(pm->allocated);
> +       if (WARN_ON(pm->allocated))
> +               return -EINVAL;
>
>         pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
>
> @@ -119,7 +121,8 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
>  {
>         struct pm4_runlist *packet;
>
> -       BUG_ON(!ib);
> +       if (WARN_ON(!ib))
> +               return -EFAULT;
>
>         packet = (struct pm4_runlist *)buffer;
>
> @@ -211,9 +214,8 @@ static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer,
>                 use_static = false; /* no static queues under SDMA */
>                 break;
>         default:
> -               pr_err("queue type %d\n", q->properties.type);
> -               BUG();
> -               break;
> +               WARN(1, "queue type %d", q->properties.type);
> +               return -EINVAL;
>         }
>         packet->bitfields3.doorbell_offset =
>                         q->properties.doorbell_off;
> @@ -266,8 +268,8 @@ static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer,
>                 use_static = false; /* no static queues under SDMA */
>                 break;
>         default:
> -               BUG();
> -               break;
> +               WARN(1, "queue type %d", q->properties.type);
> +               return -EINVAL;
>         }
>
>         packet->mes_map_queues_ordinals[0].bitfields3.doorbell_offset =
> @@ -392,14 +394,16 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
>         pr_debug("Finished map process and queues to runlist\n");
>
>         if (is_over_subscription)
> -               pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr,
> -                               alloc_size_bytes / sizeof(uint32_t), true);
> +               retval = pm_create_runlist(pm, &rl_buffer[rl_wptr],
> +                                       *rl_gpu_addr,
> +                                       alloc_size_bytes / sizeof(uint32_t),
> +                                       true);
>
>         for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
>                 pr_debug("0x%2X ", rl_buffer[i]);
>         pr_debug("\n");
>
> -       return 0;
> +       return retval;
>  }
>
>  int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
> @@ -512,7 +516,8 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
>         int retval;
>         struct pm4_query_status *packet;
>
> -       BUG_ON(!fence_address);
> +       if (WARN_ON(!fence_address))
> +               return -EFAULT;
>
>         mutex_lock(&pm->lock);
>         retval = pm->priv_queue->ops.acquire_packet_buffer(
> @@ -577,8 +582,9 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
>                         engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
>                 break;
>         default:
> -               BUG();
> -               break;
> +               WARN(1, "queue type %d", type);
> +               retval = -EINVAL;
> +               goto err_invalid;
>         }
>
>         if (reset)
> @@ -610,12 +616,15 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
>                                 queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only;
>                 break;
>         default:
> -               BUG();
> -               break;
> +               WARN(1, "filter %d", mode);
> +               retval = -EINVAL;
>         }
>
> -       pm->priv_queue->ops.submit_packet(pm->priv_queue);
> -
> +err_invalid:
> +       if (!retval)
> +               pm->priv_queue->ops.submit_packet(pm->priv_queue);
> +       else
> +               pm->priv_queue->ops.rollback_packet(pm->priv_queue);

I don't feel comfortable putting a valid code path under an "err_invalid" label.
This defeats the purpose of goto statement and common cleanup code,
making the code unreadable.
Also, the rollback packet function was not in the original code. Why
did you add it here ?

>  err_acquire_packet_buffer:
>         mutex_unlock(&pm->lock);
>         return retval;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
> index b3f7d43..1e06de0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
> @@ -92,6 +92,6 @@ unsigned int kfd_pasid_alloc(void)
>
>  void kfd_pasid_free(unsigned int pasid)
>  {
> -       BUG_ON(pasid == 0 || pasid >= pasid_limit);
> -       clear_bit(pasid, pasid_bitmap);
> +       if (!WARN_ON(pasid == 0 || pasid >= pasid_limit))
> +               clear_bit(pasid, pasid_bitmap);
>  }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index d030d76..41a9976 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -79,8 +79,6 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread)
>  {
>         struct kfd_process *process;
>
> -       BUG_ON(!kfd_process_wq);
> -
>         if (!thread->mm)
>                 return ERR_PTR(-EINVAL);
>
> @@ -202,10 +200,10 @@ static void kfd_process_destroy_delayed(struct rcu_head *rcu)
>         struct kfd_process_release_work *work;
>         struct kfd_process *p;
>
> -       BUG_ON(!kfd_process_wq);
> +       WARN_ON(!kfd_process_wq);
I think this is redundant, as kfd_process_wq is later derefernced
inside queue_work (as *wq). So we will get a violation there anyway.

>
>         p = container_of(rcu, struct kfd_process, rcu);
> -       BUG_ON(atomic_read(&p->mm->mm_count) <= 0);
> +       WARN_ON(atomic_read(&p->mm->mm_count) <= 0);
>
>         mmdrop(p->mm);
>
> @@ -229,7 +227,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
>          * mmu_notifier srcu is read locked
>          */
>         p = container_of(mn, struct kfd_process, mmu_notifier);
> -       BUG_ON(p->mm != mm);
> +       if (WARN_ON(p->mm != mm))
> +               return;
>
>         mutex_lock(&kfd_processes_mutex);
>         hash_del_rcu(&p->kfd_processes);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index f6ecdff..1cae95e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -218,8 +218,8 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>                                                         kq, &pdd->qpd);
>                 break;
>         default:
> -               BUG();
> -               break;
> +               WARN(1, "Invalid queue type %d", type);
> +               retval = -EINVAL;
>         }
>
>         if (retval != 0) {
> @@ -272,7 +272,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>                 dev = pqn->kq->dev;
>         if (pqn->q)
>                 dev = pqn->q->device;
> -       BUG_ON(!dev);
> +       if (WARN_ON(!dev))
> +               return -ENODEV;
>
>         pdd = kfd_get_process_device_data(dev, pqm->process);
>         if (!pdd) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index e5486f4..19ce590 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -799,10 +799,12 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
>         int ret;
>         uint32_t i;
>
> +       if (WARN_ON(dev->kobj_node))
> +               return -EEXIST;
> +
>         /*
>          * Creating the sysfs folders
>          */
> -       BUG_ON(dev->kobj_node);
>         dev->kobj_node = kfd_alloc_struct(dev->kobj_node);
>         if (!dev->kobj_node)
>                 return -ENOMEM;
> --
> 2.7.4
>


More information about the amd-gfx mailing list