[PATCH 3/4] drm/amdkfd: Add CWSR support
Oded Gabbay
oded.gabbay at gmail.com
Sun Nov 19 13:37:47 UTC 2017
On Tue, Nov 14, 2017 at 11:41 PM, Felix Kuehling <Felix.Kuehling at amd.com> wrote:
> This hardware feature allows the GPU to preempt shader execution in
> the middle of a compute wave, save the state and restore it later
> to resume execution.
>
> Memory for saving the state is allocated per queue in user mode and
> the address and size passed to the create_queue ioctl. The size
Is this a correct description?
It seems to me the memory is allocated in kfd_process_init_cwsr() and
the address is saved internally, not passed in via the create_queue ioctl.
Which raises the question: why isn't it allocated by user mode
and then passed in through the create_queue ioctl?
> depends on the number of waves that can be in flight simultaneously
> on a given ASIC.
>
> Signed-off-by: Shaoyun.liu <shaoyun.liu at amd.com>
> Signed-off-by: Yong Zhao <yong.zhao at amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 7 +-
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 20 ++++-
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 6 ++
> drivers/gpu/drm/amd/amdkfd/kfd_module.c | 4 +
> drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 27 +++++++
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 31 +++++++-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 87 +++++++++++++++++++++-
> include/uapi/linux/kfd_ioctl.h | 3 +-
> 8 files changed, 179 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index 505d391..2a4612d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -117,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file *filep)
> return -EPERM;
> }
>
> - process = kfd_create_process(current);
> + process = kfd_create_process(filep);
> if (IS_ERR(process))
> return PTR_ERR(process);
>
> @@ -206,6 +206,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
> q_properties->ctx_save_restore_area_address =
> args->ctx_save_restore_address;
> q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
> + q_properties->ctl_stack_size = args->ctl_stack_size;
> if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
> args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
> q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
> @@ -1088,6 +1089,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
> KFD_MMAP_EVENTS_MASK) {
> vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK;
> return kfd_event_mmap(process, vma);
> + } else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) ==
> + KFD_MMAP_RESERVED_MEM_MASK) {
> + vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK;
> + return kfd_reserved_mem_mmap(process, vma);
> }
>
> return -EFAULT;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 621a3b5..4f05eac 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -27,6 +27,7 @@
> #include "kfd_priv.h"
> #include "kfd_device_queue_manager.h"
> #include "kfd_pm4_headers_vi.h"
> +#include "cwsr_trap_handler_gfx8.asm"
>
> #define MQD_SIZE_ALIGNED 768
>
> @@ -38,7 +39,8 @@ static const struct kfd_device_info kaveri_device_info = {
> .ih_ring_entry_size = 4 * sizeof(uint32_t),
> .event_interrupt_class = &event_interrupt_class_cik,
> .num_of_watch_points = 4,
> - .mqd_size_aligned = MQD_SIZE_ALIGNED
> + .mqd_size_aligned = MQD_SIZE_ALIGNED,
> + .supports_cwsr = false,
> };
>
> static const struct kfd_device_info carrizo_device_info = {
> @@ -49,7 +51,8 @@ static const struct kfd_device_info carrizo_device_info = {
> .ih_ring_entry_size = 4 * sizeof(uint32_t),
> .event_interrupt_class = &event_interrupt_class_cik,
> .num_of_watch_points = 4,
> - .mqd_size_aligned = MQD_SIZE_ALIGNED
> + .mqd_size_aligned = MQD_SIZE_ALIGNED,
> + .supports_cwsr = true,
> };
>
> struct kfd_deviceid {
> @@ -212,6 +215,17 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
> return AMD_IOMMU_INV_PRI_RSP_INVALID;
> }
>
> +static void kfd_cwsr_init(struct kfd_dev *kfd)
> +{
> + if (cwsr_enable && kfd->device_info->supports_cwsr) {
> + BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
> +
> + kfd->cwsr_isa = cwsr_trap_gfx8_hex;
> + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
> + kfd->cwsr_enabled = true;
> + }
> +}
> +
> bool kgd2kfd_device_init(struct kfd_dev *kfd,
> const struct kgd2kfd_shared_resources *gpu_resources)
> {
> @@ -286,6 +300,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
> goto device_iommu_pasid_error;
> }
>
> + kfd_cwsr_init(kfd);
> +
> if (kfd_resume(kfd))
> goto kfd_resume_error;
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index e202921..5c06502 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -173,6 +173,9 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> *allocated_vmid = qpd->vmid;
> q->properties.vmid = qpd->vmid;
>
> + q->properties.tba_addr = qpd->tba_addr;
> + q->properties.tma_addr = qpd->tma_addr;
> +
> if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
> retval = create_compute_queue_nocpsch(dqm, q, qpd);
> else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
> @@ -846,6 +849,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
> }
>
> dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
> +
> + q->properties.tba_addr = qpd->tba_addr;
> + q->properties.tma_addr = qpd->tma_addr;
> retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
> &q->gart_mqd_addr, &q->properties);
> if (retval)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
> index 6c5a9ca..4b2423b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
> @@ -49,6 +49,10 @@ module_param(sched_policy, int, 0444);
> MODULE_PARM_DESC(sched_policy,
> "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)");
>
> +int cwsr_enable = 1;
> +module_param(cwsr_enable, int, 0444);
> +MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
> +
> int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
> module_param(max_num_of_queues_per_device, int, 0444);
> MODULE_PARM_DESC(max_num_of_queues_per_device,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> index 2ba7cea..00e1f1a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
> @@ -89,6 +89,28 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
> if (q->format == KFD_QUEUE_FORMAT_AQL)
> m->cp_hqd_iq_rptr = 1;
>
> + if (q->tba_addr) {
> + m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8);
> + m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8);
> + m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8);
> + m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8);
Why the >> 8 shift on the addresses?
> + m->compute_pgm_rsrc2 |=
> + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
> + }
> +
> + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) {
> + m->cp_hqd_persistent_state |=
> + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
> + m->cp_hqd_ctx_save_base_addr_lo =
> + lower_32_bits(q->ctx_save_restore_area_address);
> + m->cp_hqd_ctx_save_base_addr_hi =
> + upper_32_bits(q->ctx_save_restore_area_address);
> + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
> + m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
> + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
> + m->cp_hqd_wg_state_offset = q->ctl_stack_size;
Just wanted to make sure the last two lines are not a copy-paste error
from the third line.
> + }
> +
> *mqd = m;
> if (gart_addr)
> *gart_addr = addr;
> @@ -167,6 +189,11 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
> 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT;
> }
>
> + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address)
> + m->cp_hqd_ctx_save_control =
> + atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT |
> + mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT;
> +
> q->is_active = (q->queue_size > 0 &&
> q->queue_address != 0 &&
> q->queue_percent > 0);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 4750473..a668764 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -41,6 +41,7 @@
>
> #define KFD_MMAP_DOORBELL_MASK 0x8000000000000
> #define KFD_MMAP_EVENTS_MASK 0x4000000000000
> +#define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000
>
> /*
> * When working with cp scheduler we should assign the HIQ manually or via
> @@ -63,6 +64,15 @@
> #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024
>
> /*
> + * Size of the per-process TBA+TMA buffer: 2 pages
> + *
> + * The first page is the TBA used for the CWSR ISA code. The second
> + * page is used as TMA for daisy-chaining a user-mode trap handler.
> + */
> +#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2)
> +#define KFD_CWSR_TMA_OFFSET PAGE_SIZE
> +
> +/*
> * Kernel module parameter to specify maximum number of supported queues per
> * device
> */
> @@ -78,6 +88,8 @@ extern int max_num_of_queues_per_device;
> /* Kernel module parameter to specify the scheduling policy */
> extern int sched_policy;
>
> +extern int cwsr_enable;
> +
> /*
> * Kernel module parameter to specify whether to send sigterm to HSA process on
> * unhandled exception
> @@ -131,6 +143,7 @@ struct kfd_device_info {
> size_t ih_ring_entry_size;
> uint8_t num_of_watch_points;
> uint16_t mqd_size_aligned;
> + bool supports_cwsr;
> };
>
> struct kfd_mem_obj {
> @@ -200,6 +213,11 @@ struct kfd_dev {
>
> /* Debug manager */
> struct kfd_dbgmgr *dbgmgr;
> +
> + /* CWSR */
> + bool cwsr_enabled;
> + const void *cwsr_isa;
> + unsigned int cwsr_isa_size;
> };
>
> /* KGD2KFD callbacks */
> @@ -332,6 +350,9 @@ struct queue_properties {
> uint32_t eop_ring_buffer_size;
> uint64_t ctx_save_restore_area_address;
> uint32_t ctx_save_restore_area_size;
> + uint32_t ctl_stack_size;
> + uint64_t tba_addr;
> + uint64_t tma_addr;
> };
>
> /**
> @@ -439,6 +460,11 @@ struct qcm_process_device {
> uint32_t num_gws;
> uint32_t num_oac;
> uint32_t sh_hidden_private_base;
> +
> + /* CWSR memory */
> + void *cwsr_kaddr;
> + uint64_t tba_addr;
> + uint64_t tma_addr;
> };
>
>
> @@ -563,7 +589,7 @@ struct amdkfd_ioctl_desc {
>
> void kfd_process_create_wq(void);
> void kfd_process_destroy_wq(void);
> -struct kfd_process *kfd_create_process(const struct task_struct *);
> +struct kfd_process *kfd_create_process(struct file *filep);
> struct kfd_process *kfd_get_process(const struct task_struct *);
> struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
>
> @@ -577,6 +603,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
> struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
> struct kfd_process *p);
>
> +int kfd_reserved_mem_mmap(struct kfd_process *process,
> + struct vm_area_struct *vma);
> +
> /* Process device data iterator */
> struct kfd_process_device *kfd_get_first_process_device_data(
> struct kfd_process *p);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 1bb9b26..39f4c19 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -28,6 +28,7 @@
> #include <linux/amd-iommu.h>
> #include <linux/notifier.h>
> #include <linux/compat.h>
> +#include <linux/mman.h>
>
> struct mm_struct;
>
> @@ -53,6 +54,8 @@ struct kfd_process_release_work {
>
> static struct kfd_process *find_process(const struct task_struct *thread);
> static struct kfd_process *create_process(const struct task_struct *thread);
> +static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep);
> +
>
> void kfd_process_create_wq(void)
> {
> @@ -68,9 +71,10 @@ void kfd_process_destroy_wq(void)
> }
> }
>
> -struct kfd_process *kfd_create_process(const struct task_struct *thread)
> +struct kfd_process *kfd_create_process(struct file *filep)
> {
> struct kfd_process *process;
> + struct task_struct *thread = current;
>
> if (!thread->mm)
> return ERR_PTR(-EINVAL);
> @@ -101,6 +105,8 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread)
>
> up_write(&thread->mm->mmap_sem);
>
> + kfd_process_init_cwsr(process, filep);
> +
> return process;
> }
>
> @@ -168,6 +174,11 @@ static void kfd_process_wq_release(struct work_struct *work)
> amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
>
> list_del(&pdd->per_device_list);
> +
> + if (pdd->qpd.cwsr_kaddr)
> + free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
> + get_order(KFD_CWSR_TBA_TMA_SIZE));
> +
> kfree(pdd);
> }
>
> @@ -260,6 +271,46 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
> .release = kfd_process_notifier_release,
> };
>
> +static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep)
> +{
> + int err = 0;
> + unsigned long offset;
> + struct kfd_process_device *temp, *pdd = NULL;
> + struct kfd_dev *dev = NULL;
> + struct qcm_process_device *qpd = NULL;
> +
> + mutex_lock(&p->mutex);
> + list_for_each_entry_safe(pdd, temp, &p->per_device_data,
> + per_device_list) {
> + dev = pdd->dev;
> + qpd = &pdd->qpd;
> + if (!dev->cwsr_enabled || qpd->cwsr_kaddr)
> + continue;
> + offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT;
> + qpd->tba_addr = (int64_t)vm_mmap(filep, 0,
> + KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC,
> + MAP_SHARED, offset);
> +
> + if (IS_ERR_VALUE(qpd->tba_addr)) {
> + pr_err("Failure to set tba address. error -%d.\n",
> + (int)qpd->tba_addr);
> + err = qpd->tba_addr;
> + qpd->tba_addr = 0;
> + qpd->cwsr_kaddr = NULL;
> + goto out;
> + }
> +
> + memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
> +
> + qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
> + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
> + qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
> + }
> +out:
> + mutex_unlock(&p->mutex);
> + return err;
> +}
> +
> static struct kfd_process *create_process(const struct task_struct *thread)
> {
> struct kfd_process *process;
> @@ -535,3 +586,37 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid)
>
> return p;
> }
> +
> +int kfd_reserved_mem_mmap(struct kfd_process *process,
> + struct vm_area_struct *vma)
> +{
> + struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff);
> + struct kfd_process_device *pdd;
> + struct qcm_process_device *qpd;
> +
> + if (!dev)
> + return -EINVAL;
> + if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) {
> + pr_err("Incorrect CWSR mapping size.\n");
> + return -EINVAL;
> + }
> +
> + pdd = kfd_get_process_device_data(dev, process);
> + if (!pdd)
> + return -EINVAL;
> + qpd = &pdd->qpd;
> +
> + qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
> + get_order(KFD_CWSR_TBA_TMA_SIZE));
> + if (!qpd->cwsr_kaddr) {
> + pr_err("Error allocating per process CWSR buffer.\n");
> + return -ENOMEM;
> + }
> +
> + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND
> + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP;
> + /* Mapping pages to user process */
> + return remap_pfn_range(vma, vma->vm_start,
> + PFN_DOWN(__pa(qpd->cwsr_kaddr)),
> + KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
> +}
> diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> index 731d0df..7039f16 100644
> --- a/include/uapi/linux/kfd_ioctl.h
> +++ b/include/uapi/linux/kfd_ioctl.h
> @@ -58,7 +58,8 @@ struct kfd_ioctl_create_queue_args {
> __u64 eop_buffer_address; /* to KFD */
> __u64 eop_buffer_size; /* to KFD */
> __u64 ctx_save_restore_address; /* to KFD */
> - __u64 ctx_save_restore_size; /* to KFD */
> + __u32 ctx_save_restore_size; /* to KFD */
> + __u32 ctl_stack_size; /* to KFD */
> };
>
> struct kfd_ioctl_destroy_queue_args {
> --
> 2.7.4
>
More information about the amd-gfx
mailing list