[PATCH v2 11/25] amdkfd: Add basic modules to amdkfd

Jerome Glisse j.glisse at gmail.com
Sun Jul 20 16:02:40 PDT 2014


On Thu, Jul 17, 2014 at 04:29:18PM +0300, Oded Gabbay wrote:
> From: Andrew Lewycky <Andrew.Lewycky at amd.com>
> 
> This patch adds the process module and 4 helper modules:
> 
> - kfd_process, which handles process which open /dev/kfd
> - kfd_doorbell, which provides helper functions for doorbell allocation, release and mapping to userspace
> - kfd_pasid, which provides helper functions for pasid allocation and release
> - kfd_vidmem, which provides helper functions for allocation and release of memory from the gfx driver
> - kfd_aperture, which provides helper functions for managing the LDS, Local GPU memory and Scratch memory apertures of the process
> 
> This patch only contains the basic kfd_process module, which doesn't contain the reference to the queue scheduler. This was done to allow easier code review.
> 
> Also, this patch doesn't contain the calls to the IOMMU driver for binding the pasid to the device. Again, this was done to allow easier code review
> 
> The kfd_process object is created when a process opens /dev/kfd and is closed when the mm_struct of that process is teared-down.

So valid arguments were made for having one file per device, and because this
is not a common HSA architecture I am rather reluctant to add the /dev/kfd
directory just for a temporary solution until people inside the HSA Foundation
get their act together and work on a common API.

So I would rather have all of the temporary kfd solution inside the radeon
driver under the drm folder. I think we have enough ioctls left to accommodate you.

> 
> Signed-off-by: Andrew Lewycky <Andrew.Lewycky at amd.com>
> Signed-off-by: Oded Gabbay <oded.gabbay at amd.com>
> ---
>  drivers/gpu/drm/radeon/amdkfd/Makefile       |   4 +-
>  drivers/gpu/drm/radeon/amdkfd/kfd_aperture.c | 123 +++++++++
>  drivers/gpu/drm/radeon/amdkfd/kfd_chardev.c  |  36 ++-
>  drivers/gpu/drm/radeon/amdkfd/kfd_device.c   |   2 +
>  drivers/gpu/drm/radeon/amdkfd/kfd_doorbell.c | 264 +++++++++++++++++++
>  drivers/gpu/drm/radeon/amdkfd/kfd_module.c   |  22 ++
>  drivers/gpu/drm/radeon/amdkfd/kfd_pasid.c    |  97 +++++++
>  drivers/gpu/drm/radeon/amdkfd/kfd_priv.h     | 148 +++++++++++
>  drivers/gpu/drm/radeon/amdkfd/kfd_process.c  | 374 +++++++++++++++++++++++++++
>  drivers/gpu/drm/radeon/amdkfd/kfd_vidmem.c   |  96 +++++++
>  10 files changed, 1163 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/gpu/drm/radeon/amdkfd/kfd_aperture.c
>  create mode 100644 drivers/gpu/drm/radeon/amdkfd/kfd_doorbell.c
>  create mode 100644 drivers/gpu/drm/radeon/amdkfd/kfd_pasid.c
>  create mode 100644 drivers/gpu/drm/radeon/amdkfd/kfd_process.c
>  create mode 100644 drivers/gpu/drm/radeon/amdkfd/kfd_vidmem.c
> 
> diff --git a/drivers/gpu/drm/radeon/amdkfd/Makefile b/drivers/gpu/drm/radeon/amdkfd/Makefile
> index 08ecfcd..daf75a8 100644
> --- a/drivers/gpu/drm/radeon/amdkfd/Makefile
> +++ b/drivers/gpu/drm/radeon/amdkfd/Makefile
> @@ -4,6 +4,8 @@
>  
>  ccflags-y := -Iinclude/drm
>  
> -amdkfd-y	:= kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o
> +amdkfd-y	:= kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
> +		kfd_pasid.o kfd_doorbell.o kfd_vidmem.o kfd_aperture.o \
> +		kfd_process.o
>  
>  obj-$(CONFIG_HSA_RADEON)	+= amdkfd.o
> diff --git a/drivers/gpu/drm/radeon/amdkfd/kfd_aperture.c b/drivers/gpu/drm/radeon/amdkfd/kfd_aperture.c
> new file mode 100644
> index 0000000..0468114
> --- /dev/null
> +++ b/drivers/gpu/drm/radeon/amdkfd/kfd_aperture.c
> @@ -0,0 +1,123 @@
> +/*
> + * Copyright 2014 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/device.h>
> +#include <linux/export.h>
> +#include <linux/err.h>
> +#include <linux/fs.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/compat.h>
> +#include <uapi/linux/kfd_ioctl.h>
> +#include <linux/time.h>
> +#include "kfd_priv.h"
> +#include <linux/mm.h>
> +#include <uapi/asm-generic/mman-common.h>
> +#include <asm/processor.h>
> +
> +
> +#define MAKE_GPUVM_APP_BASE(gpu_num) (((uint64_t)(gpu_num) << 61) + 0x1000000000000)
> +#define MAKE_GPUVM_APP_LIMIT(base) (((uint64_t)(base) & 0xFFFFFF0000000000) | 0xFFFFFFFFFF)
> +#define MAKE_SCRATCH_APP_BASE(gpu_num) (((uint64_t)(gpu_num) << 61) + 0x100000000)
> +#define MAKE_SCRATCH_APP_LIMIT(base) (((uint64_t)base & 0xFFFFFFFF00000000) | 0xFFFFFFFF)
> +#define MAKE_LDS_APP_BASE(gpu_num) (((uint64_t)(gpu_num) << 61) + 0x0)
> +#define MAKE_LDS_APP_LIMIT(base) (((uint64_t)(base) & 0xFFFFFFFF00000000) | 0xFFFFFFFF)
> +
> +#define HSA_32BIT_LDS_APP_SIZE 0x10000
> +#define HSA_32BIT_LDS_APP_ALIGNMENT 0x10000
> +
> +static unsigned long kfd_reserve_aperture(struct kfd_process *process, unsigned long len, unsigned long alignment)
> +{
> +
> +	unsigned long addr = 0;
> +	unsigned long start_address;
> +
> +	/*
> +	 * Go bottom up and find the first available aligned address.
> +	 * We may narrow space to scan by getting mmap range limits.
> +	 */
> +	for (start_address =  alignment; start_address < (TASK_SIZE - alignment); start_address += alignment) {
> +		addr = vm_mmap(NULL, start_address, len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0);

So forcing this aperture into the process address space is not really
welcome. Userspace has no idea this will happen, and valid existing
programs may already statically allocate those addresses through mmap,
either before or after they trigger this code.

As I said in the general answer, I think the best option here is to use
the kernel reserved area to map this. You can work around the gate
page if the gate page matters to you.

This of course raises the question of what happens if the GPU tries to
access something inside the kernel region. Does the IOMMU respect the
system flag of the page table? Or does it just happily allow the GPU to
access the whole kernel area?

I guess I should go dive into the IOMMUv2 datasheet to find out.

> +		if (!IS_ERR_VALUE(addr)) {
> +			if (addr == start_address)
> +				return addr;
> +			vm_munmap(addr, len);
> +		}
> +	}
> +	return 0;
> +
> +}
> +
> +int kfd_init_apertures(struct kfd_process *process)
> +{
> +	uint8_t id  = 0;
> +	struct kfd_dev *dev;
> +	struct kfd_process_device *pdd;
> +
> +	mutex_lock(&process->mutex);
> +
> +	/*Iterating over all devices*/
> +	while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && id < NUM_OF_SUPPORTED_GPUS) {
> +
> +		pdd = kfd_get_process_device_data(dev, process);
> +
> +		/*for 64 bit process aperture will be statically reserved in the non canonical process address space

What does "non canonical process address space" mean? Is this x86-64
terminology or something else?

> +		 *for 32 bit process the aperture will be reserved in the process address space
> +		 */
> +		if (process->is_32bit_user_mode) {
> +			/*try to reserve aperture. continue on failure, just put the aperture size to be 0*/
> +			pdd->lds_base = kfd_reserve_aperture(
> +						process,
> +						HSA_32BIT_LDS_APP_SIZE,
> +						HSA_32BIT_LDS_APP_ALIGNMENT);
> +
> +			if (pdd->lds_base)
> +				pdd->lds_limit = pdd->lds_base + HSA_32BIT_LDS_APP_SIZE - 1;
> +			else
> +				pdd->lds_limit = 0;
> +
> +			/*GPUVM and Scratch apertures are not supported*/
> +			pdd->gpuvm_base = pdd->gpuvm_limit = pdd->scratch_base = pdd->scratch_limit = 0;
> +		} else {
> +			/*node id couldn't be 0 - the three MSB bits of aperture shoudn't be 0*/
> +			pdd->lds_base = MAKE_LDS_APP_BASE(id + 1);
> +			pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
> +			pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
> +			pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base);
> +			pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1);
> +			pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
> +		}
> +
> +		dev_dbg(kfd_device, "node id %u, gpu id %u, lds_base %llX lds_limit %llX gpuvm_base %llX gpuvm_limit %llX scratch_base %llX scratch_limit %llX",
> +				id, pdd->dev->id, pdd->lds_base, pdd->lds_limit, pdd->gpuvm_base, pdd->gpuvm_limit, pdd->scratch_base, pdd->scratch_limit);

Break this debug output into several debug messages. Not all of us have
a 30" monitor.

> +
> +		id++;
> +	}
> +
> +	mutex_unlock(&process->mutex);
> +
> +	return 0;
> +}
> +
> +
> diff --git a/drivers/gpu/drm/radeon/amdkfd/kfd_chardev.c b/drivers/gpu/drm/radeon/amdkfd/kfd_chardev.c
> index b98bcb7..d6580a6 100644
> --- a/drivers/gpu/drm/radeon/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/radeon/amdkfd/kfd_chardev.c
> @@ -38,6 +38,7 @@
>  
>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>  static int kfd_open(struct inode *, struct file *);
> +static int kfd_mmap(struct file *, struct vm_area_struct *);
>  
>  static const char kfd_dev_name[] = "kfd";
>  
> @@ -46,6 +47,7 @@ static const struct file_operations kfd_fops = {
>  	.unlocked_ioctl = kfd_ioctl,
>  	.compat_ioctl = kfd_ioctl,
>  	.open = kfd_open,
> +	.mmap = kfd_mmap,
>  };
>  
>  static int kfd_char_dev_major = -1;
> @@ -96,9 +98,22 @@ struct device *kfd_chardev(void)
>  
>  static int kfd_open(struct inode *inode, struct file *filep)
>  {
> +	struct kfd_process *process;
> +
>  	if (iminor(inode) != 0)
>  		return -ENODEV;
>  
> +	process = kfd_create_process(current);
> +	if (IS_ERR(process))
> +		return PTR_ERR(process);
> +
> +	process->is_32bit_user_mode = is_compat_task();
> +
> +	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
> +		process->pasid, process->is_32bit_user_mode);
> +
> +	kfd_init_apertures(process);
> +
>  	return 0;
>  }
>  
> @@ -152,8 +167,9 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
>  		"ioctl cmd 0x%x (#%d), arg 0x%lx\n",
>  		cmd, _IOC_NR(cmd), arg);
>  
> -	/* TODO: add function that retrieves process */
> -	process = NULL;
> +	process = kfd_get_process(current);
> +	if (IS_ERR(process))
> +		return PTR_ERR(process);
>  
>  	switch (cmd) {
>  	case KFD_IOC_CREATE_QUEUE:
> @@ -201,3 +217,19 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
>  
>  	return err;
>  }
> +
> +static int
> +kfd_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +	unsigned long pgoff = vma->vm_pgoff;
> +	struct kfd_process *process;
> +
> +	process = kfd_get_process(current);
> +	if (IS_ERR(process))
> +		return PTR_ERR(process);
> +
> +	if (pgoff >= KFD_MMAP_DOORBELL_START && pgoff < KFD_MMAP_DOORBELL_END)
> +		return kfd_doorbell_mmap(process, vma);
> +
> +	return -EINVAL;
> +}
> diff --git a/drivers/gpu/drm/radeon/amdkfd/kfd_device.c b/drivers/gpu/drm/radeon/amdkfd/kfd_device.c
> index 4138694..f6a7cf7 100644
> --- a/drivers/gpu/drm/radeon/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/radeon/amdkfd/kfd_device.c
> @@ -100,6 +100,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>  {
>  	kfd->shared_resources = *gpu_resources;
>  
> +	kfd_doorbell_init(kfd);
> +
>  	if (kfd_topology_add_device(kfd) != 0)
>  		return false;
>  
> diff --git a/drivers/gpu/drm/radeon/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/radeon/amdkfd/kfd_doorbell.c
> new file mode 100644
> index 0000000..972eaea
> --- /dev/null
> +++ b/drivers/gpu/drm/radeon/amdkfd/kfd_doorbell.c
> @@ -0,0 +1,264 @@
> +/*
> + * Copyright 2014 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include "kfd_priv.h"
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/slab.h>
> +
> +/*
> + * This extension supports a kernel level doorbells management for
> + * the kernel queues.
> + * Basically the last doorbells page is devoted to kernel queues
> + * and that's assures that any user process won't get access to the
> + * kernel doorbells page
> + */
> +static DEFINE_MUTEX(doorbell_mutex);
> +static unsigned long doorbell_available_index[DIV_ROUND_UP(MAX_PROCESS_QUEUES, BITS_PER_LONG)] = { 0 };
> +#define KERNEL_DOORBELL_PASID 1
> +
> +/*
> + * Each device exposes a doorbell aperture, a PCI MMIO aperture that
> + * receives 32-bit writes that are passed to queues as wptr values.
> + * The doorbells are intended to be written by applications as part
> + * of queueing work on user-mode queues.
> + * We assign doorbells to applications in PAGE_SIZE-sized and aligned chunks.
> + * We map the doorbell address space into user-mode when a process creates
> + * its first queue on each device.
> + * Although the mapping is done by KFD, it is equivalent to an mmap of
> + * the /dev/kfd with the particular device encoded in the mmap offset.
> + * There will be other uses for mmap of /dev/kfd, so only a range of
> + * offsets (KFD_MMAP_DOORBELL_START-END) is used for doorbells.
> + */

The mapping should not be done by the driver. Instead, you should provide
the offset to userspace and have userspace call mmap with the proper
arguments. I do not think having a device driver do an mmap behind the back
of an ioctl would be a welcome idea.

> +
> +/* # of doorbell bytes allocated for each process. */
> +static inline size_t doorbell_process_allocation(void)
> +{
> +	return roundup(sizeof(doorbell_t) * MAX_PROCESS_QUEUES, PAGE_SIZE);
> +}

This whole doorbell situation needs some cleanup. Instead of passing
everything as bytes and byte offsets, you should rather pass everything as
pfn and pgoffset, so it is clear that a doorbell is at page granularity and
you will not have to clutter the code with all kinds of aligning and
rounding up. That is just cleaner and safer.

> +
> +/* Doorbell calculations for device init. */
> +void kfd_doorbell_init(struct kfd_dev *kfd)
> +{
> +	size_t doorbell_start_offset;
> +	size_t doorbell_aperture_size;
> +	size_t doorbell_process_limit;
> +
> +	/*
> +	 * We start with calculations in bytes because the input data might
> +	 * only be byte-aligned.
> +	 * Only after we have done the rounding can we assume any alignment.
> +	 */
> +
> +	doorbell_start_offset = roundup(kfd->shared_resources.doorbell_start_offset,
> +					doorbell_process_allocation());
> +	doorbell_aperture_size = rounddown(kfd->shared_resources.doorbell_aperture_size,
> +					doorbell_process_allocation());
> +
> +	if (doorbell_aperture_size > doorbell_start_offset)
> +		doorbell_process_limit =
> +			(doorbell_aperture_size - doorbell_start_offset) / doorbell_process_allocation();
> +	else
> +		doorbell_process_limit = 0;
> +
> +	kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address + doorbell_start_offset;
> +	kfd->doorbell_id_offset = doorbell_start_offset / sizeof(doorbell_t);
> +	kfd->doorbell_process_limit = doorbell_process_limit - 1;
> +
> +	kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, doorbell_process_allocation());
> +	BUG_ON(!kfd->doorbell_kernel_ptr);
> +
> +	pr_debug("kfd: doorbell initialization\n"
> +				 "     doorbell base           == 0x%08lX\n"
> +				 "     doorbell_id_offset      == 0x%08lu\n"
> +				 "     doorbell_process_limit  == 0x%08lu\n"
> +				 "     doorbell_kernel_offset  == 0x%08lX\n"
> +				 "     doorbell aperture size  == 0x%08lX\n"
> +				 "     doorbell kernel address == 0x%08lX\n",
> +				 (uintptr_t)kfd->doorbell_base,
> +				 kfd->doorbell_id_offset,
> +				 doorbell_process_limit,
> +				 (uintptr_t)kfd->doorbell_base,
> +				 kfd->shared_resources.doorbell_aperture_size,
> +				 (uintptr_t)kfd->doorbell_kernel_ptr);

This is kind of ugly and will break some kernel log managers; you need to
do one pr_debug call per line.

> +
> +}
> +
> +/*
> + * This is the /dev/kfd mmap (for doorbell) implementation.
> + * We intend that this is only called through map_doorbells, not through
> + * user-mode mmap of /dev/kfd
> + */
> +int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
> +{
> +	unsigned int device_index;
> +	struct kfd_dev *dev;
> +	phys_addr_t start;
> +
> +	BUG_ON(vma->vm_pgoff < KFD_MMAP_DOORBELL_START || vma->vm_pgoff >= KFD_MMAP_DOORBELL_END);
> +
> +	/* For simplicitly we only allow mapping of the entire doorbell allocation of a single device & process. */
> +	if (vma->vm_end - vma->vm_start != doorbell_process_allocation())
> +		return -EINVAL;
> +
> +	/* device_index must be GPU ID!! */
> +	device_index = vma->vm_pgoff - KFD_MMAP_DOORBELL_START;
> +
> +	dev = kfd_device_by_id(device_index);
> +	if (dev == NULL)
> +		return -EINVAL;
> +
> +	vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP;
> +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +
> +	start = dev->doorbell_base + process->pasid * doorbell_process_allocation();
> +
> +	pr_debug("kfd: mapping doorbell page in kfd_doorbell_mmap\n"
> +		 "     target user address == 0x%016llX\n"
> +		 "     physical address    == 0x%016llX\n"
> +		 "     vm_flags            == 0x%08lX\n"
> +		 "     size                == 0x%08lX\n",
> +		 (long long unsigned int) vma->vm_start, start, vma->vm_flags,
> +		 doorbell_process_allocation());
> +
> +	return io_remap_pfn_range(vma,
> +				vma->vm_start,
> +				start >> PAGE_SHIFT,
> +				doorbell_process_allocation(),
> +				vma->vm_page_prot);
> +}
> +
> +/*
> + * Map the doorbells for a single process & device.
> + * This will indirectly call kfd_doorbell_mmap.
> + * This assumes that the process mutex is being held.
> + */
> +static int map_doorbells(struct file *devkfd, struct kfd_process *process,
> +				struct kfd_dev *dev)
> +{
> +	struct kfd_process_device *pdd = kfd_get_process_device_data(dev, process);
> +
> +	if (pdd == NULL)
> +		return -ENOMEM;
> +
> +	if (pdd->doorbell_mapping == NULL) {
> +		unsigned long offset = (KFD_MMAP_DOORBELL_START + dev->id) << PAGE_SHIFT;
> +		doorbell_t __user *doorbell_mapping;
> +
> +		doorbell_mapping = (doorbell_t __user *)vm_mmap(devkfd, 0, doorbell_process_allocation(), PROT_WRITE,
> +								MAP_SHARED, offset);

As said above, have userspace do that. Do not do it inside
the kernel.

> +		if (IS_ERR(doorbell_mapping))
> +			return PTR_ERR(doorbell_mapping);
> +
> +		pdd->doorbell_mapping = doorbell_mapping;
> +	}
> +
> +	return 0;
> +}
> +
> +/* get kernel iomem pointer for a doorbell */
> +u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, unsigned int *doorbell_off)
> +{
> +	u32 inx;
> +
> +	BUG_ON(!kfd || !doorbell_off);
> +
> +	mutex_lock(&doorbell_mutex);
> +	inx = find_first_zero_bit(doorbell_available_index, MAX_PROCESS_QUEUES);
> +	__set_bit(inx, doorbell_available_index);
> +	mutex_unlock(&doorbell_mutex);
> +
> +	if (inx >= MAX_PROCESS_QUEUES)
> +		return NULL;
> +
> +	/* caluculating the kernel doorbell offset using "faked" kernel pasid that allocated for kernel queues only */
> +	*doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation()/sizeof(doorbell_t)) + inx;
> +
> +	pr_debug("kfd: get kernel queue doorbell\n"
> +			 "     doorbell offset   == 0x%08d\n"
> +			 "     kernel address    == 0x%08lX\n",
> +			 *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx));
> +
> +	return kfd->doorbell_kernel_ptr + inx;
> +}
> +
> +void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr)
> +{
> +	unsigned int inx;
> +
> +	BUG_ON(!kfd || !db_addr);
> +
> +	inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr);
> +
> +	mutex_lock(&doorbell_mutex);
> +	__clear_bit(inx, doorbell_available_index);
> +	mutex_unlock(&doorbell_mutex);
> +}
> +
> +inline void write_kernel_doorbell(u32 __iomem *db, u32 value)
> +{
> +	if (db) {
> +		writel(value, db);
> +		pr_debug("writing %d to doorbell address 0x%p\n", value, db);
> +	}
> +}
> +
> +/*
> + * Get the user-mode address of a doorbell.
> + * Assumes that the process mutex is being held.
> + */
> +doorbell_t __user *kfd_get_doorbell(struct file *devkfd,
> +					struct kfd_process *process,
> +					struct kfd_dev *dev,
> +					unsigned int doorbell_index)
> +{
> +	struct kfd_process_device *pdd;
> +	int err;
> +
> +	BUG_ON(doorbell_index > MAX_DOORBELL_INDEX);
> +
> +	err = map_doorbells(devkfd, process, dev);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	pdd = kfd_get_process_device_data(dev, process);
> +	BUG_ON(pdd == NULL); /* map_doorbells would have failed otherwise */
> +
> +	pr_debug("doorbell value on creation 0x%x\n", pdd->doorbell_mapping[doorbell_index]);
> +
> +	return &pdd->doorbell_mapping[doorbell_index];
> +}
> +
> +/*
> + * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1
> + * to doorbells with the process's doorbell page
> + */
> +unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, struct kfd_process *process, unsigned int queue_id)
> +{
> +	/*
> +	 * doorbell_id_offset accounts for doorbells taken by KGD.
> +	 * pasid * doorbell_process_allocation/sizeof(doorbell_t) adjusts
> +	 * to the process's doorbells
> +	 */
> +	return kfd->doorbell_id_offset + process->pasid * (doorbell_process_allocation()/sizeof(doorbell_t)) + queue_id;
> +}
> +
> diff --git a/drivers/gpu/drm/radeon/amdkfd/kfd_module.c b/drivers/gpu/drm/radeon/amdkfd/kfd_module.c
> index c51f981..dc08f51 100644
> --- a/drivers/gpu/drm/radeon/amdkfd/kfd_module.c
> +++ b/drivers/gpu/drm/radeon/amdkfd/kfd_module.c
> @@ -65,14 +65,30 @@ void kgd2kfd_exit(void)
>  {
>  }
>  
> +extern int kfd_process_exit(struct notifier_block *nb,
> +				unsigned long action, void *data);
> +
> +static struct notifier_block kfd_mmput_nb = {
> +	.notifier_call		= kfd_process_exit,
> +	.priority		= 3,
> +};
> +
>  static int __init kfd_module_init(void)
>  {
>  	int err;
>  
> +	err = kfd_pasid_init();
> +	if (err < 0)
> +		goto err_pasid;
> +
>  	err = kfd_chardev_init();
>  	if (err < 0)
>  		goto err_ioctl;
>  
> +	err = mmput_register_notifier(&kfd_mmput_nb);
> +	if (err)
> +		goto err_mmu_notifier;
> +
>  	err = kfd_topology_init();
>  	if (err < 0)
>  		goto err_topology;
> @@ -82,15 +98,21 @@ static int __init kfd_module_init(void)
>  	return 0;
>  
>  err_topology:
> +	mmput_unregister_notifier(&kfd_mmput_nb);
> +err_mmu_notifier:
>  	kfd_chardev_exit();
>  err_ioctl:
> +	kfd_pasid_exit();
> +err_pasid:
>  	return err;
>  }
>  
>  static void __exit kfd_module_exit(void)
>  {
>  	kfd_topology_shutdown();
> +	mmput_unregister_notifier(&kfd_mmput_nb);
>  	kfd_chardev_exit();
> +	kfd_pasid_exit();
>  	dev_info(kfd_device, "Removed module\n");
>  }
>  
> diff --git a/drivers/gpu/drm/radeon/amdkfd/kfd_pasid.c b/drivers/gpu/drm/radeon/amdkfd/kfd_pasid.c
> new file mode 100644
> index 0000000..0b594e4
> --- /dev/null
> +++ b/drivers/gpu/drm/radeon/amdkfd/kfd_pasid.c
> @@ -0,0 +1,97 @@
> +/*
> + * Copyright 2014 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include "kfd_priv.h"
> +
> +#define INITIAL_PASID_LIMIT (1<<20)
> +
> +static unsigned long *pasid_bitmap;
> +static pasid_t pasid_limit;
> +static DEFINE_MUTEX(pasid_mutex);
> +
> +int kfd_pasid_init(void)
> +{
> +	pasid_limit = INITIAL_PASID_LIMIT;
> +
> +	pasid_bitmap = kzalloc(DIV_ROUND_UP(INITIAL_PASID_LIMIT, BITS_PER_BYTE), GFP_KERNEL);
> +	if (!pasid_bitmap)
> +		return -ENOMEM;
> +
> +	set_bit(0, pasid_bitmap); /* PASID 0 is reserved. */
> +
> +	return 0;
> +}
> +
> +void kfd_pasid_exit(void)
> +{
> +	kfree(pasid_bitmap);
> +}
> +
> +bool kfd_set_pasid_limit(pasid_t new_limit)
> +{
> +	if (new_limit < pasid_limit) {
> +		bool ok;
> +
> +		mutex_lock(&pasid_mutex);
> +
> +		/* ensure that no pasids >= new_limit are in-use */
> +		ok = (find_next_bit(pasid_bitmap, pasid_limit, new_limit) == pasid_limit);
> +		if (ok)
> +			pasid_limit = new_limit;
> +
> +		mutex_unlock(&pasid_mutex);
> +
> +		return ok;
> +	}
> +
> +	return true;
> +}
> +
> +inline pasid_t kfd_get_pasid_limit(void)
> +{
> +	return pasid_limit;
> +}
> +
> +pasid_t kfd_pasid_alloc(void)
> +{
> +	pasid_t found;
> +
> +	mutex_lock(&pasid_mutex);
> +
> +	found = find_first_zero_bit(pasid_bitmap, pasid_limit);
> +	if (found == pasid_limit)
> +		found = 0;
> +	else
> +		set_bit(found, pasid_bitmap);
> +
> +	mutex_unlock(&pasid_mutex);
> +
> +	return found;
> +}
> +
> +void kfd_pasid_free(pasid_t pasid)
> +{
> +	BUG_ON(pasid == 0 || pasid >= pasid_limit);
> +	clear_bit(pasid, pasid_bitmap);
> +}
> diff --git a/drivers/gpu/drm/radeon/amdkfd/kfd_priv.h b/drivers/gpu/drm/radeon/amdkfd/kfd_priv.h
> index b391e24..af5a5e4 100644
> --- a/drivers/gpu/drm/radeon/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/radeon/amdkfd/kfd_priv.h
> @@ -32,14 +32,39 @@
>  #include <linux/spinlock.h>
>  #include "../radeon_kfd.h"
>  
> +/*
> + * Per-process limit. Each process can only
> + * create MAX_PROCESS_QUEUES across all devices
> + */
> +#define MAX_PROCESS_QUEUES 1024
> +
> +#define MAX_DOORBELL_INDEX MAX_PROCESS_QUEUES
>  #define KFD_SYSFS_FILE_MODE 0444
>  
> +/*
> + * We multiplex different sorts of mmap-able memory onto /dev/kfd.
> + * We figure out what type of memory the caller wanted by comparing
> + * the mmap page offset to known ranges.
> + */
> +#define KFD_MMAP_DOORBELL_START	(((1ULL << 32)*1) >> PAGE_SHIFT)
> +#define KFD_MMAP_DOORBELL_END	(((1ULL << 32)*2) >> PAGE_SHIFT)
> +
>  /* GPU ID hash width in bits */
>  #define KFD_GPU_ID_HASH_WIDTH 16
>  
>  /* Macro for allocating structures */
>  #define kfd_alloc_struct(ptr_to_struct)	((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL))
>  
> +/*
> + * Large enough to hold the maximum usable pasid + 1.
> + * It must also be able to store the number of doorbells
> + * reported by a KFD device.
> + */
> +typedef unsigned int pasid_t;
> +
> +/* Type that represents a HW doorbell slot. */
> +typedef u32 doorbell_t;
> +
>  struct kfd_device_info {
>  	const struct kfd_scheduler_class *scheduler_class;
>  	unsigned int max_pasid_bits;
> @@ -56,6 +81,17 @@ struct kfd_dev {
>  
>  	unsigned int id;		/* topology stub index */
>  
> +	phys_addr_t doorbell_base;	/* Start of actual doorbells used by
> +					 * KFD. It is aligned for mapping
> +					 * into user mode
> +					 */
> +	size_t doorbell_id_offset;	/* Doorbell offset (from KFD doorbell
> +					 * to HW doorbell, GFX reserved some
> +					 * at the start)
> +					 */
> +	size_t doorbell_process_limit;	/* Number of processes we have doorbell space for. */
> +	u32 __iomem *doorbell_kernel_ptr; /* this is a pointer for a doorbells page used by kernel queue */
> +
>  	struct kgd2kfd_shared_resources shared_resources;
>  };
>  
> @@ -68,15 +104,124 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd);
>  
>  extern const struct kfd2kgd_calls *kfd2kgd;
>  
> +/* Dummy struct just to make kfd_mem_obj* a unique pointer type. */
> +struct kfd_mem_obj_s;
> +typedef struct kfd_mem_obj_s *kfd_mem_obj;

IIRC the rule is no more typedefs in the kernel. Or maybe I just dreamt
that rule.

> +
> +enum kfd_mempool {
> +	KFD_MEMPOOL_SYSTEM_CACHEABLE = 1,
> +	KFD_MEMPOOL_SYSTEM_WRITECOMBINE = 2,
> +	KFD_MEMPOOL_FRAMEBUFFER = 3,
> +};
> +
> +
> +int kfd_vidmem_alloc(struct kfd_dev *kfd, size_t size, size_t alignment,
> +			enum kfd_mempool pool, kfd_mem_obj *mem_obj);
> +void kfd_vidmem_free(struct kfd_dev *kfd, kfd_mem_obj mem_obj);
> +int kfd_vidmem_gpumap(struct kfd_dev *kfd, kfd_mem_obj mem_obj, uint64_t *vmid0_address);
> +void kfd_vidmem_ungpumap(struct kfd_dev *kfd, kfd_mem_obj mem_obj);
> +int kfd_vidmem_kmap(struct kfd_dev *kfd, kfd_mem_obj mem_obj, void **ptr);
> +void kfd_vidmem_unkmap(struct kfd_dev *kfd, kfd_mem_obj mem_obj);
> +int kfd_vidmem_alloc_map(struct kfd_dev *kfd, kfd_mem_obj *mem_obj, void **ptr,
> +			uint64_t *vmid0_address, size_t size);
> +void kfd_vidmem_free_unmap(struct kfd_dev *kfd, kfd_mem_obj mem_obj);
>  /* Character device interface */
>  int kfd_chardev_init(void);
>  void kfd_chardev_exit(void);
>  struct device *kfd_chardev(void);
>  
> +
> +/* Data that is per-process-per device. */
> +struct kfd_process_device {
> +	/*
> +	 * List of all per-device data for a process.
> +	 * Starts from kfd_process.per_device_data.
> +	 */
> +	struct list_head per_device_list;
> +
> +	/* The device that owns this data. */
> +	struct kfd_dev *dev;
> +
> +	/* The user-mode address of the doorbell mapping for this device. */
> +	doorbell_t __user *doorbell_mapping;
> +
> +	/* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
> +	bool bound;

Best to put the boolean at the end of the structure ...

> +
> +	/*Apertures*/
> +	uint64_t lds_base;
> +	uint64_t lds_limit;
> +	uint64_t gpuvm_base;
> +	uint64_t gpuvm_limit;
> +	uint64_t scratch_base;
> +	uint64_t scratch_limit;
> +};
> +
>  /* Process data */
>  struct kfd_process {
> +	struct list_head processes_list;
> +
> +	struct mm_struct *mm;
> +
> +	struct mutex mutex;
> +
> +	/*
> +	 * In any process, the thread that started main() is the lead
> +	 * thread and outlives the rest.
> +	 * It is here because amd_iommu_bind_pasid wants a task_struct.
> +	 */
> +	struct task_struct *lead_thread;
> +
> +	pasid_t pasid;
> +
> +	/*
> +	 * List of kfd_process_device structures,
> +	 * one for each device the process is using.
> +	 */
> +	struct list_head per_device_data;
> +
> +	/* The process's queues. */
> +	size_t queue_array_size;
> +
> +	/* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */
> +	struct kfd_queue **queues;
> +
> +	unsigned long allocated_queue_bitmap[DIV_ROUND_UP(MAX_PROCESS_QUEUES, BITS_PER_LONG)];
> +
> +	/*Is the user space process 32 bit?*/
> +	bool is_32bit_user_mode;
>  };
>  
> +struct kfd_process *kfd_create_process(const struct task_struct *);
> +struct kfd_process *kfd_get_process(const struct task_struct *);
> +
> +struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
> +							struct kfd_process *p);
> +
> +/* PASIDs */
> +int kfd_pasid_init(void);
> +void kfd_pasid_exit(void);
> +bool kfd_set_pasid_limit(pasid_t new_limit);
> +pasid_t kfd_get_pasid_limit(void);
> +pasid_t kfd_pasid_alloc(void);
> +void kfd_pasid_free(pasid_t pasid);
> +
> +/* Doorbells */
> +void kfd_doorbell_init(struct kfd_dev *kfd);
> +int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma);
> +doorbell_t __user *kfd_get_doorbell(struct file *devkfd,
> +					struct kfd_process *process,
> +					struct kfd_dev *dev,
> +					unsigned int doorbell_index);
> +u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
> +					unsigned int *doorbell_off);
> +void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr);
> +u32 read_kernel_doorbell(u32 __iomem *db);
> +void write_kernel_doorbell(u32 __iomem *db, u32 value);
> +unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd,
> +					struct kfd_process *process,
> +					unsigned int queue_id);
> +
>  extern struct device *kfd_device;
>  
>  /* Topology */
> @@ -95,4 +240,7 @@ void kgd2kfd_interrupt(struct kfd_dev *dev, const void *ih_ring_entry);
>  void kgd2kfd_suspend(struct kfd_dev *dev);
>  int kgd2kfd_resume(struct kfd_dev *dev);
>  
> +/* amdkfd Apertures */
> +int kfd_init_apertures(struct kfd_process *process);
> +
>  #endif
> diff --git a/drivers/gpu/drm/radeon/amdkfd/kfd_process.c b/drivers/gpu/drm/radeon/amdkfd/kfd_process.c
> new file mode 100644
> index 0000000..5efbce0
> --- /dev/null
> +++ b/drivers/gpu/drm/radeon/amdkfd/kfd_process.c
> @@ -0,0 +1,374 @@
> +/*
> + * Copyright 2014 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include <linux/mutex.h>
> +#include <linux/log2.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/notifier.h>
> +struct mm_struct;
> +
> +#include "kfd_priv.h"
> +
> +/*
> + * Initial size for the array of queues.
> + * The allocated size is doubled each time
> + * it is exceeded up to MAX_PROCESS_QUEUES.
> + */
> +#define INITIAL_QUEUE_ARRAY_SIZE 16
> +
> +/* List of struct kfd_process */
> +static struct list_head kfd_processes_list = LIST_HEAD_INIT(kfd_processes_list);
> +
> +static DEFINE_MUTEX(kfd_processes_mutex);
> +
> +static struct kfd_process *create_process(const struct task_struct *thread);
> +
> +struct kfd_process *kfd_create_process(const struct task_struct *thread)
> +{
> +	struct kfd_process *process;
> +
> +	if (thread->mm == NULL)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* Only the pthreads threading model is supported. */
> +	if (thread->group_leader->mm != thread->mm)
> +		return ERR_PTR(-EINVAL);
> +
> +	/*
> +	 * take kfd processes mutex before starting of process creation
> +	 * so there won't be a case where two threads of the same process
> +	 * create two kfd_process structures
> +	 */
> +	mutex_lock(&kfd_processes_mutex);
> +
> +	/* A prior open of /dev/kfd could have already created the process. */
> +	process = thread->mm->kfd_process;
> +	if (process)
> +		pr_debug("kfd: process already found\n");
> +
> +	if (!process)
> +		process = create_process(thread);
> +
> +	mutex_unlock(&kfd_processes_mutex);
> +
> +	return process;
> +}
> +
> +struct kfd_process *kfd_get_process(const struct task_struct *thread)
> +{
> +	struct kfd_process *process;
> +
> +	if (thread->mm == NULL)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* Only the pthreads threading model is supported. */
> +	if (thread->group_leader->mm != thread->mm)
> +		return ERR_PTR(-EINVAL);
> +
> +	process = thread->mm->kfd_process;
> +
> +	return process;
> +}
> +
> +static void free_process(struct kfd_process *p)
> +{
> +	struct kfd_process_device *pdd, *temp;
> +
> +	BUG_ON(p == NULL);
> +
> +	list_for_each_entry_safe(pdd, temp, &p->per_device_data, per_device_list) {
> +		list_del(&pdd->per_device_list);
> +		kfree(pdd);
> +	}
> +
> +	kfd_pasid_free(p->pasid);
> +
> +	mutex_destroy(&p->mutex);
> +
> +	kfree(p->queues);
> +
> +	list_del(&p->processes_list);
> +
> +	kfree(p);
> +}
> +
> +int kfd_process_exit(struct notifier_block *nb,
> +			unsigned long action, void *data)
> +{
> +	struct mm_struct *mm = data;
> +	struct kfd_process *p;
> +
> +	mutex_lock(&kfd_processes_mutex);
> +
> +	p = mm->kfd_process;
> +	if (p) {
> +		free_process(p);
> +		mm->kfd_process = NULL;
> +	}
> +
> +	mutex_unlock(&kfd_processes_mutex);
> +
> +	return 0;
> +}
> +
> +static struct kfd_process *create_process(const struct task_struct *thread)
> +{
> +	struct kfd_process *process;
> +	int err = -ENOMEM;
> +
> +	process = kzalloc(sizeof(*process), GFP_KERNEL);
> +
> +	if (!process)
> +		goto err_alloc_process;
> +
> +	process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE, sizeof(process->queues[0]), GFP_KERNEL);
> +	if (!process->queues)
> +		goto err_alloc_queues;
> +
> +	process->pasid = kfd_pasid_alloc();
> +	if (process->pasid == 0)
> +		goto err_alloc_pasid;
> +
> +	mutex_init(&process->mutex);
> +
> +	process->mm = thread->mm;
> +	thread->mm->kfd_process = process;
> +	list_add_tail(&process->processes_list, &kfd_processes_list);
> +
> +	process->lead_thread = thread->group_leader;
> +
> +	process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE;
> +
> +	INIT_LIST_HEAD(&process->per_device_data);
> +
> +	return process;
> +
> +err_alloc_pasid:
> +	kfree(process->queues);
> +err_alloc_queues:
> +	kfree(process);
> +err_alloc_process:
> +	return ERR_PTR(err);
> +}
> +
> +struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
> +							struct kfd_process *p)
> +{
> +	struct kfd_process_device *pdd;
> +
> +	list_for_each_entry(pdd, &p->per_device_data, per_device_list)
> +		if (pdd->dev == dev)
> +			return pdd;
> +
> +	pdd = kzalloc(sizeof(*pdd), GFP_KERNEL);
> +	if (pdd != NULL) {
> +		pdd->dev = dev;
> +		list_add(&pdd->per_device_list, &p->per_device_data);
> +	}
> +
> +	return pdd;
> +}
> +
> +/*
> + * Direct the IOMMU to bind the process (specifically the pasid->mm) to the device.
> + * Unbinding occurs when the process dies or the device is removed.
> + *
> + * Assumes that the process lock is held.
> + */
> +struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
> +							struct kfd_process *p)
> +{
> +	struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p);
> +
> +	if (pdd == NULL)
> +		return ERR_PTR(-ENOMEM);
> +
> +	if (pdd->bound)
> +		return pdd;
> +
> +	pdd->bound = true;
> +
> +	return pdd;
> +}
> +
> +void kfd_unbind_process_from_device(struct kfd_dev *dev, pasid_t pasid)
> +{
> +	struct kfd_process *p;
> +	struct kfd_process_device *pdd;
> +
> +	BUG_ON(dev == NULL);
> +
> +	mutex_lock(&kfd_processes_mutex);
> +
> +	list_for_each_entry(p, &kfd_processes_list, processes_list)
> +		if (p->pasid == pasid)
> +			break;
> +
> +	mutex_unlock(&kfd_processes_mutex);
> +
> +	BUG_ON(p->pasid != pasid);
> +
> +	pdd = kfd_get_process_device_data(dev, p);
> +
> +	BUG_ON(pdd == NULL);
> +
> +	mutex_lock(&p->mutex);
> +
> +	/*
> +	 * Just mark pdd as unbound, because we still need it to call
> +	 * amd_iommu_unbind_pasid() in when the process exits.
> +	 * We don't call amd_iommu_unbind_pasid() here
> +	 * because the IOMMU called us.
> +	 */
> +	pdd->bound = false;
> +
> +	mutex_unlock(&p->mutex);
> +}
> +
> +/*
> + * Ensure that the process's queue array is large enough to hold
> + * the queue at queue_id.
> + * Assumes that the process lock is held.
> + */
> +static bool ensure_queue_array_size(struct kfd_process *p, unsigned int queue_id)
> +{
> +	size_t desired_size;
> +	struct kfd_queue **new_queues;
> +
> +	compiletime_assert(INITIAL_QUEUE_ARRAY_SIZE > 0, "INITIAL_QUEUE_ARRAY_SIZE must not be 0");
> +	compiletime_assert(INITIAL_QUEUE_ARRAY_SIZE <= MAX_PROCESS_QUEUES,
> +			   "INITIAL_QUEUE_ARRAY_SIZE must be less than MAX_PROCESS_QUEUES");
> +	/* Ensure that doubling the current size won't ever overflow. */
> +	compiletime_assert(MAX_PROCESS_QUEUES < SIZE_MAX / 2, "MAX_PROCESS_QUEUES must be less than SIZE_MAX/2");
> +
> +	/*
> +	 * These & queue_id < MAX_PROCESS_QUEUES guarantee that
> +	 * the desired_size calculation will end up <= MAX_PROCESS_QUEUES
> +	 */
> +	compiletime_assert(is_power_of_2(INITIAL_QUEUE_ARRAY_SIZE), "INITIAL_QUEUE_ARRAY_SIZE must be power of 2.");
> +	compiletime_assert(MAX_PROCESS_QUEUES % INITIAL_QUEUE_ARRAY_SIZE == 0,
> +			   "MAX_PROCESS_QUEUES must be multiple of INITIAL_QUEUE_ARRAY_SIZE.");
> +	compiletime_assert(is_power_of_2(MAX_PROCESS_QUEUES / INITIAL_QUEUE_ARRAY_SIZE),
> +			   "MAX_PROCESS_QUEUES must be a power-of-2 multiple of INITIAL_QUEUE_ARRAY_SIZE.");
> +
> +	if (queue_id < p->queue_array_size)
> +		return true;
> +
> +	if (queue_id >= MAX_PROCESS_QUEUES)
> +		return false;
> +
> +	desired_size = p->queue_array_size;
> +	while (desired_size <= queue_id)
> +		desired_size *= 2;
> +
> +	BUG_ON(desired_size < queue_id || desired_size > MAX_PROCESS_QUEUES);
> +	BUG_ON(desired_size % INITIAL_QUEUE_ARRAY_SIZE != 0 || !is_power_of_2(desired_size / INITIAL_QUEUE_ARRAY_SIZE));
> +
> +	new_queues = kmalloc_array(desired_size, sizeof(p->queues[0]), GFP_KERNEL);
> +	if (!new_queues)
> +		return false;
> +
> +	memcpy(new_queues, p->queues, p->queue_array_size * sizeof(p->queues[0]));
> +
> +	kfree(p->queues);
> +	p->queues = new_queues;
> +	p->queue_array_size = desired_size;
> +
> +	return true;
> +}
> +
> +/* Assumes that the process lock is held. */
> +bool kfd_allocate_queue_id(struct kfd_process *p, unsigned int *queue_id)
> +{
> +	unsigned int qid = find_first_zero_bit(p->allocated_queue_bitmap, MAX_PROCESS_QUEUES);
> +
> +	if (qid >= MAX_PROCESS_QUEUES)
> +		return false;
> +
> +	if (!ensure_queue_array_size(p, qid))
> +		return false;
> +
> +	__set_bit(qid, p->allocated_queue_bitmap);
> +
> +	p->queues[qid] = NULL;
> +	*queue_id = qid;
> +
> +	return true;
> +}
> +
> +/*
> + * Install a queue into a previously-allocated queue id.
> + * Assumes that the process lock is held.
> + */
> +void kfd_install_queue(struct kfd_process *p, unsigned int queue_id, struct kfd_queue *queue)
> +{
> +	/* Have to call allocate_queue_id before install_queue. */
> +	BUG_ON(queue_id >= p->queue_array_size);
> +	BUG_ON(queue == NULL);
> +
> +	p->queues[queue_id] = queue;
> +}
> +
> +/*
> + * Remove a queue from the open queue list and deallocate the queue id.
> + * This can be called whether or not a queue was installed.
> + * Assumes that the process lock is held.
> + */
> +void kfd_remove_queue(struct kfd_process *p, unsigned int queue_id)
> +{
> +	BUG_ON(!test_bit(queue_id, p->allocated_queue_bitmap));
> +	BUG_ON(queue_id >= p->queue_array_size);
> +
> +	__clear_bit(queue_id, p->allocated_queue_bitmap);
> +}
> +
> +/* Assumes that the process lock is held. */
> +struct kfd_queue *kfd_get_queue(struct kfd_process *p, unsigned int queue_id)
> +{
> +	/*
> +	 * test_bit because the contents of unallocated
> +	 * queue slots are undefined.
> +	 * Otherwise ensure_queue_array_size would have to clear new entries and
> +	 * remove_queue would have to NULL removed queues.
> +	 */
> +	return (queue_id < p->queue_array_size &&
> +		test_bit(queue_id, p->allocated_queue_bitmap)) ?
> +			p->queues[queue_id] : NULL;
> +}
> +
> +struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p)
> +{
> +	return list_first_entry(&p->per_device_data, struct kfd_process_device, per_device_list);
> +}
> +
> +struct kfd_process_device *kfd_get_next_process_device_data(struct kfd_process *p, struct kfd_process_device *pdd)
> +{
> +	if (list_is_last(&pdd->per_device_list, &p->per_device_data))
> +		return NULL;
> +	return list_next_entry(pdd, per_device_list);
> +}
> +
> +bool kfd_has_process_device_data(struct kfd_process *p)
> +{
> +	return !(list_empty(&p->per_device_data));
> +}
> diff --git a/drivers/gpu/drm/radeon/amdkfd/kfd_vidmem.c b/drivers/gpu/drm/radeon/amdkfd/kfd_vidmem.c
> new file mode 100644
> index 0000000..a2c4d30
> --- /dev/null
> +++ b/drivers/gpu/drm/radeon/amdkfd/kfd_vidmem.c
> @@ -0,0 +1,96 @@
> +/*
> + * Copyright 2014 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include "kfd_priv.h"
> +
> +int kfd_vidmem_alloc(struct kfd_dev *kfd, size_t size, size_t alignment,
> +			enum kfd_mempool pool, kfd_mem_obj *mem_obj)
> +{
> +	return kfd2kgd->allocate_mem(kfd->kgd,
> +					size,
> +					alignment,
> +					(enum kgd_memory_pool)pool,
> +					(struct kgd_mem **)mem_obj);
> +}
> +
> +void kfd_vidmem_free(struct kfd_dev *kfd, kfd_mem_obj mem_obj)
> +{
> +	kfd2kgd->free_mem(kfd->kgd, (struct kgd_mem *)mem_obj);
> +}
> +
> +int kfd_vidmem_gpumap(struct kfd_dev *kfd, kfd_mem_obj mem_obj,
> +			uint64_t *vmid0_address)
> +{
> +	return kfd2kgd->gpumap_mem(kfd->kgd,
> +					(struct kgd_mem *)mem_obj,
> +					vmid0_address);

As discussed previously, this will not fly: pinning GPU memory is a big NACK.

> +}
> +
> +void kfd_vidmem_ungpumap(struct kfd_dev *kfd, kfd_mem_obj mem_obj)
> +{
> +	kfd2kgd->ungpumap_mem(kfd->kgd, (struct kgd_mem *)mem_obj);
> +}
> +
> +int kfd_vidmem_kmap(struct kfd_dev *kfd, kfd_mem_obj mem_obj, void **ptr)
> +{
> +	return kfd2kgd->kmap_mem(kfd->kgd, (struct kgd_mem *)mem_obj, ptr);
> +}
> +
> +void kfd_vidmem_unkmap(struct kfd_dev *kfd, kfd_mem_obj mem_obj)
> +{
> +	kfd2kgd->unkmap_mem(kfd->kgd, (struct kgd_mem *)mem_obj);
> +}
> +
> +int kfd_vidmem_alloc_map(struct kfd_dev *kfd, kfd_mem_obj *mem_obj,
> +			void **ptr, uint64_t *vmid0_address, size_t size)
> +{
> +	int retval;
> +
> +	retval = kfd_vidmem_alloc(kfd, size, PAGE_SIZE,
> +				KFD_MEMPOOL_SYSTEM_WRITECOMBINE, mem_obj);
> +	if (retval != 0)
> +		goto fail_vidmem_alloc;
> +
> +	retval = kfd_vidmem_kmap(kfd, *mem_obj, ptr);
> +	if (retval != 0)
> +		goto fail_vidmem_kmap;
> +
> +	retval = kfd_vidmem_gpumap(kfd, *mem_obj, vmid0_address);
> +	if (retval != 0)
> +		goto fail_vidmem_gpumap;
> +
> +	return 0;
> +
> +fail_vidmem_gpumap:
> +	kfd_vidmem_unkmap(kfd, *mem_obj);
> +fail_vidmem_kmap:
> +	kfd_vidmem_free(kfd, *mem_obj);
> +fail_vidmem_alloc:
> +	return retval;
> +}
> +
> +void kfd_vidmem_free_unmap(struct kfd_dev *kfd, kfd_mem_obj mem_obj)
> +{
> +	kfd_vidmem_ungpumap(kfd, mem_obj);
> +	kfd_vidmem_unkmap(kfd, mem_obj);
> +	kfd_vidmem_free(kfd, mem_obj);
> +}
> -- 
> 1.9.1
> 


More information about the dri-devel mailing list