[PATCH 14/24] drm/amdkfd: Handle remaining BUG_ONs more gracefully v2

Oded Gabbay oded.gabbay at gmail.com
Wed Aug 16 09:45:27 UTC 2017


On Wed, Aug 16, 2017 at 6:00 AM, Felix Kuehling <Felix.Kuehling at amd.com> wrote:
> In most cases, BUG_ONs can be replaced with WARN_ON with an error
> return. In some void functions just turn them into a WARN_ON and
> possibly an early exit.
>
> v2:
> * Cleaned up error handling in pm_send_unmap_queue
> * Removed redundant WARN_ON in kfd_process_destroy_delayed
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c            |  3 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c            |  3 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c            | 16 ++++----
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 19 +++++-----
>  .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c  |  2 +-
>  .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c   |  2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c      | 20 +++++++---
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c   |  3 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c    |  3 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    | 44 ++++++++++++++--------
>  drivers/gpu/drm/amd/amdkfd/kfd_pasid.c             |  4 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c           |  9 ++---
>  .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  7 ++--
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c          |  4 +-
>  14 files changed, 84 insertions(+), 55 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
> index 3841cad..0aa021a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
> @@ -60,7 +60,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
>         unsigned int *ib_packet_buff;
>         int status;
>
> -       BUG_ON(!size_in_bytes);
> +       if (WARN_ON(!size_in_bytes))
> +               return -EINVAL;
>
>         kq = dbgdev->kq;
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
> index 2d5555c..3da25f7 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
> @@ -64,7 +64,8 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev)
>         enum DBGDEV_TYPE type = DBGDEV_TYPE_DIQ;
>         struct kfd_dbgmgr *new_buff;
>
> -       BUG_ON(!pdev->init_complete);
> +       if (WARN_ON(!pdev->init_complete))
> +               return false;
>
>         new_buff = kfd_alloc_struct(new_buff);
>         if (!new_buff) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 416955f..f628ac3 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -98,7 +98,7 @@ static const struct kfd_device_info *lookup_device_info(unsigned short did)
>
>         for (i = 0; i < ARRAY_SIZE(supported_devices); i++) {
>                 if (supported_devices[i].did == did) {
> -                       BUG_ON(!supported_devices[i].device_info);
> +                       WARN_ON(!supported_devices[i].device_info);
>                         return supported_devices[i].device_info;
>                 }
>         }
> @@ -212,9 +212,8 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
>                         flags);
>
>         dev = kfd_device_by_pci_dev(pdev);
> -       BUG_ON(!dev);
> -
> -       kfd_signal_iommu_event(dev, pasid, address,
> +       if (!WARN_ON(!dev))
> +               kfd_signal_iommu_event(dev, pasid, address,
>                         flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC);
>
>         return AMD_IOMMU_INV_PRI_RSP_INVALID;
> @@ -397,9 +396,12 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
>  {
>         unsigned int num_of_longs;
>
> -       BUG_ON(buf_size < chunk_size);
> -       BUG_ON(buf_size == 0);
> -       BUG_ON(chunk_size == 0);
> +       if (WARN_ON(buf_size < chunk_size))
> +               return -EINVAL;
> +       if (WARN_ON(buf_size == 0))
> +               return -EINVAL;
> +       if (WARN_ON(chunk_size == 0))
> +               return -EINVAL;
>
>         kfd->gtt_sa_chunk_size = chunk_size;
>         kfd->gtt_sa_num_of_chunks = buf_size / chunk_size;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 2486dfb..e553c5e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -388,7 +388,8 @@ static struct mqd_manager *get_mqd_manager_nocpsch(
>  {
>         struct mqd_manager *mqd;
>
> -       BUG_ON(type >= KFD_MQD_TYPE_MAX);
> +       if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
> +               return NULL;
>
>         pr_debug("mqd type %d\n", type);
>
> @@ -513,7 +514,7 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm)
>  {
>         int i;
>
> -       BUG_ON(dqm->queue_count > 0 || dqm->processes_count > 0);
> +       WARN_ON(dqm->queue_count > 0 || dqm->processes_count > 0);
>
>         kfree(dqm->allocated_queues);
>         for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++)
> @@ -1129,8 +1130,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>                 dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
>                 break;
>         default:
> -               BUG();
> -               break;
> +               pr_err("Invalid scheduling policy %d\n", sched_policy);
> +               goto out_free;
>         }
>
>         switch (dev->device_info->asic_family) {
> @@ -1143,12 +1144,12 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
>                 break;
>         }
>
> -       if (dqm->ops.initialize(dqm)) {
> -               kfree(dqm);
> -               return NULL;
> -       }
> +       if (!dqm->ops.initialize(dqm))
> +               return dqm;
>
> -       return dqm;
> +out_free:
> +       kfree(dqm);
> +       return NULL;
>  }
>
>  void device_queue_manager_uninit(struct device_queue_manager *dqm)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
> index 43194b4..fadc56a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
> @@ -65,7 +65,7 @@ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
>          * for LDS/Scratch and GPUVM.
>          */
>
> -       BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE ||
> +       WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE ||
>                 top_address_nybble == 0);
>
>         return PRIVATE_BASE(top_address_nybble << 12) |
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
> index 47ef910..15e81ae 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
> @@ -67,7 +67,7 @@ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
>          * for LDS/Scratch and GPUVM.
>          */
>
> -       BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE ||
> +       WARN_ON((top_address_nybble & 1) || top_address_nybble > 0xE ||
>                 top_address_nybble == 0);
>
>         return top_address_nybble << 12 |
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 970bc07..0e4d4a9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -41,7 +41,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
>         int retval;
>         union PM4_MES_TYPE_3_HEADER nop;
>
> -       BUG_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ);
> +       if (WARN_ON(type != KFD_QUEUE_TYPE_DIQ && type != KFD_QUEUE_TYPE_HIQ))
> +               return false;
>
>         pr_debug("Initializing queue type %d size %d\n", KFD_QUEUE_TYPE_HIQ,
>                         queue_size);
> @@ -62,8 +63,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
>                                                 KFD_MQD_TYPE_HIQ);
>                 break;
>         default:
> -               BUG();
> -               break;
> +               pr_err("Invalid queue type %d\n", type);
> +               return false;
>         }
>
>         if (!kq->mqd)
> @@ -305,6 +306,7 @@ void kernel_queue_uninit(struct kernel_queue *kq)
>         kfree(kq);
>  }
>
> +/* FIXME: Can this test be removed? */
>  static __attribute__((unused)) void test_kq(struct kfd_dev *dev)
>  {
>         struct kernel_queue *kq;
> @@ -314,10 +316,18 @@ static __attribute__((unused)) void test_kq(struct kfd_dev *dev)
>         pr_err("Starting kernel queue test\n");
>
>         kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ);
> -       BUG_ON(!kq);
> +       if (unlikely(!kq)) {
> +               pr_err("  Failed to initialize HIQ\n");
> +               pr_err("Kernel queue test failed\n");
> +               return;
> +       }
>
>         retval = kq->ops.acquire_packet_buffer(kq, 5, &buffer);
> -       BUG_ON(retval != 0);
> +       if (unlikely(retval != 0)) {
> +               pr_err("  Failed to acquire packet buffer\n");
> +               pr_err("Kernel queue test failed\n");
> +               return;
> +       }
>         for (i = 0; i < 5; i++)
>                 buffer[i] = kq->nop_packet;
>         kq->ops.submit_packet(kq);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> index a11477d..7e0ec6b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
> @@ -387,7 +387,8 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
>  {
>         struct mqd_manager *mqd;
>
> -       BUG_ON(type >= KFD_MQD_TYPE_MAX);
> +       if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
> +               return NULL;
>
>         mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
>         if (!mqd)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> index d638c2c..f4c8c23 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> @@ -233,7 +233,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
>  {
>         struct mqd_manager *mqd;
>
> -       BUG_ON(type >= KFD_MQD_TYPE_MAX);
> +       if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
> +               return NULL;
>
>         mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
>         if (!mqd)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index aacd5a3..0816d11 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -35,7 +35,8 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
>  {
>         unsigned int temp = *wptr + increment_bytes / sizeof(uint32_t);
>
> -       BUG_ON((temp * sizeof(uint32_t)) > buffer_size_bytes);
> +       WARN((temp * sizeof(uint32_t)) > buffer_size_bytes,
> +            "Runlist IB overflow");
>         *wptr = temp;
>  }
>
> @@ -94,7 +95,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
>  {
>         int retval;
>
> -       BUG_ON(pm->allocated);
> +       if (WARN_ON(pm->allocated))
> +               return -EINVAL;
>
>         pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
>
> @@ -119,7 +121,8 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
>  {
>         struct pm4_runlist *packet;
>
> -       BUG_ON(!ib);
> +       if (WARN_ON(!ib))
> +               return -EFAULT;
>
>         packet = (struct pm4_runlist *)buffer;
>
> @@ -211,9 +214,8 @@ static int pm_create_map_queue_vi(struct packet_manager *pm, uint32_t *buffer,
>                 use_static = false; /* no static queues under SDMA */
>                 break;
>         default:
> -               pr_err("queue type %d\n", q->properties.type);
> -               BUG();
> -               break;
> +               WARN(1, "queue type %d", q->properties.type);
> +               return -EINVAL;
>         }
>         packet->bitfields3.doorbell_offset =
>                         q->properties.doorbell_off;
> @@ -266,8 +268,8 @@ static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer,
>                 use_static = false; /* no static queues under SDMA */
>                 break;
>         default:
> -               BUG();
> -               break;
> +               WARN(1, "queue type %d", q->properties.type);
> +               return -EINVAL;
>         }
>
>         packet->mes_map_queues_ordinals[0].bitfields3.doorbell_offset =
> @@ -392,14 +394,16 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
>         pr_debug("Finished map process and queues to runlist\n");
>
>         if (is_over_subscription)
> -               pm_create_runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr,
> -                               alloc_size_bytes / sizeof(uint32_t), true);
> +               retval = pm_create_runlist(pm, &rl_buffer[rl_wptr],
> +                                       *rl_gpu_addr,
> +                                       alloc_size_bytes / sizeof(uint32_t),
> +                                       true);
>
>         for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
>                 pr_debug("0x%2X ", rl_buffer[i]);
>         pr_debug("\n");
>
> -       return 0;
> +       return retval;
>  }
>
>  int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
> @@ -512,7 +516,8 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
>         int retval;
>         struct pm4_query_status *packet;
>
> -       BUG_ON(!fence_address);
> +       if (WARN_ON(!fence_address))
> +               return -EFAULT;
>
>         mutex_lock(&pm->lock);
>         retval = pm->priv_queue->ops.acquire_packet_buffer(
> @@ -577,8 +582,9 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
>                         engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
>                 break;
>         default:
> -               BUG();
> -               break;
> +               WARN(1, "queue type %d", type);
> +               retval = -EINVAL;
> +               goto err_invalid;
>         }
>
>         if (reset)
> @@ -610,12 +616,18 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
>                                 queue_sel__mes_unmap_queues__perform_request_on_dynamic_queues_only;
>                 break;
>         default:
> -               BUG();
> -               break;
> +               WARN(1, "filter %d", mode);
> +               retval = -EINVAL;
> +               goto err_invalid;
>         }
>
>         pm->priv_queue->ops.submit_packet(pm->priv_queue);
>
> +       mutex_unlock(&pm->lock);
> +       return 0;
> +
> +err_invalid:
> +       pm->priv_queue->ops.rollback_packet(pm->priv_queue);
>  err_acquire_packet_buffer:
>         mutex_unlock(&pm->lock);
>         return retval;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
> index b3f7d43..1e06de0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
> @@ -92,6 +92,6 @@ unsigned int kfd_pasid_alloc(void)
>
>  void kfd_pasid_free(unsigned int pasid)
>  {
> -       BUG_ON(pasid == 0 || pasid >= pasid_limit);
> -       clear_bit(pasid, pasid_bitmap);
> +       if (!WARN_ON(pasid == 0 || pasid >= pasid_limit))
> +               clear_bit(pasid, pasid_bitmap);
>  }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index d030d76..c74cf22 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -79,8 +79,6 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread)
>  {
>         struct kfd_process *process;
>
> -       BUG_ON(!kfd_process_wq);
> -
>         if (!thread->mm)
>                 return ERR_PTR(-EINVAL);
>
> @@ -202,10 +200,8 @@ static void kfd_process_destroy_delayed(struct rcu_head *rcu)
>         struct kfd_process_release_work *work;
>         struct kfd_process *p;
>
> -       BUG_ON(!kfd_process_wq);
> -
>         p = container_of(rcu, struct kfd_process, rcu);
> -       BUG_ON(atomic_read(&p->mm->mm_count) <= 0);
> +       WARN_ON(atomic_read(&p->mm->mm_count) <= 0);
>
>         mmdrop(p->mm);
>
> @@ -229,7 +225,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
>          * mmu_notifier srcu is read locked
>          */
>         p = container_of(mn, struct kfd_process, mmu_notifier);
> -       BUG_ON(p->mm != mm);
> +       if (WARN_ON(p->mm != mm))
> +               return;
>
>         mutex_lock(&kfd_processes_mutex);
>         hash_del_rcu(&p->kfd_processes);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index f6ecdff..1cae95e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -218,8 +218,8 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>                                                         kq, &pdd->qpd);
>                 break;
>         default:
> -               BUG();
> -               break;
> +               WARN(1, "Invalid queue type %d", type);
> +               retval = -EINVAL;
>         }
>
>         if (retval != 0) {
> @@ -272,7 +272,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>                 dev = pqn->kq->dev;
>         if (pqn->q)
>                 dev = pqn->q->device;
> -       BUG_ON(!dev);
> +       if (WARN_ON(!dev))
> +               return -ENODEV;
>
>         pdd = kfd_get_process_device_data(dev, pqm->process);
>         if (!pdd) {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index e5486f4..19ce590 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -799,10 +799,12 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
>         int ret;
>         uint32_t i;
>
> +       if (WARN_ON(dev->kobj_node))
> +               return -EEXIST;
> +
>         /*
>          * Creating the sysfs folders
>          */
> -       BUG_ON(dev->kobj_node);
>         dev->kobj_node = kfd_alloc_struct(dev->kobj_node);
>         if (!dev->kobj_node)
>                 return -ENOMEM;
> --
> 2.7.4
>
This patch is:
Reviewed-by: Oded Gabbay <oded.gabbay at gmail.com>


More information about the amd-gfx mailing list