[Intel-gfx] [PATCH 6/9] drm/i915: driver based PASID handling

Thu Oct 8 08:57:11 PDT 2015

On Fri, Sep 04, 2015 at 09:59:00AM -0700, Jesse Barnes wrote:
> New file with VT-d SVM and PASID handling functions and page table
> management.  This belongs in the IOMMU code (along with some extra bits
> for waiting for invalidations and page faults to complete, flushing the
> device IOTLB, etc.)
> 
> FIXME:
>   need work queue for re-submitting contexts
>   TE bit handling on SKL
> ---
>  drivers/gpu/drm/i915/Makefile           |    5 +-
>  drivers/gpu/drm/i915/i915_drv.h         |   43 ++
>  drivers/gpu/drm/i915/i915_gem.c         |    3 +
>  drivers/gpu/drm/i915/i915_gem_context.c |    3 +
>  drivers/gpu/drm/i915/i915_irq.c         |    7 +
>  drivers/gpu/drm/i915/i915_reg.h         |   47 ++
>  drivers/gpu/drm/i915/i915_svm.c         | 1102 +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_lrc.c        |  120 +++-
>  drivers/gpu/drm/i915/intel_lrc.h        |    1 +
>  9 files changed, 1299 insertions(+), 32 deletions(-)
>  create mode 100644 drivers/gpu/drm/i915/i915_svm.c
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 44d290a..e4883a7 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -38,7 +38,8 @@ i915-y += i915_cmd_parser.o \
>  	  intel_lrc.o \
>  	  intel_mocs.o \
>  	  intel_ringbuffer.o \
> -	  intel_uncore.o
> +	  intel_uncore.o \
> +	  i915_svm.o

Correct me if I am wrong, but it looks like i915_svm implements the
lowlevel interface with the hardware, so by convention is intel_svm.c

>  # general-purpose microcontroller (GuC) support
>  i915-y += intel_guc_loader.o \
> @@ -93,6 +94,8 @@ i915-y += dvo_ch7017.o \
>  # virtual gpu code
>  i915-y += i915_vgpu.o
>  
> +i915-$(CONFIG_MMU_NOTIFIER) += i915_svm.o

Added twice?

> +
>  # legacy horrors
>  i915-y += i915_dma.o
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 20beb51..ca38a7a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -47,6 +47,7 @@
>  #include <drm/drm_gem.h>
>  #include <linux/backlight.h>
>  #include <linux/hashtable.h>
> +#include <linux/mmu_notifier.h>
>  #include <linux/intel-iommu.h>
>  #include <linux/kref.h>
>  #include <linux/pm_qos.h>
> @@ -848,6 +849,13 @@ struct i915_ctx_hang_stats {
>  	bool banned;
>  };
>  
> +struct intel_mm_struct {
> +	struct kref kref;
> +	struct mmu_notifier notifier;
> +	struct drm_i915_private *dev_priv;
> +	struct list_head context_list;
> +};

Doesn't this look kind of familiar? struct i915_mm_struct perhaps?

> +
>  /* This must match up with the value previously used for execbuf2.rsvd1. */
>  #define DEFAULT_CONTEXT_HANDLE 0
>  
> @@ -874,6 +882,9 @@ struct i915_ctx_hang_stats {
>  struct intel_context {
>  	struct kref ref;
>  	int user_handle;
> +	bool is_svm; /* shares x86 page tables */
> +	u32 pasid; /* 20 bits */
> +	struct intel_mm_struct *ims;
>  	uint8_t remap_slice;
>  	struct drm_i915_private *i915;
>  	int flags;
> @@ -895,6 +906,9 @@ struct intel_context {
>  		int pin_count;
>  	} engine[I915_NUM_RINGS];
>  
> +	struct list_head mm_list;

This is a link, name it so.

> +	struct task_struct *tsk;

One task? A context can be passed by the device fd to another process.
Do we inherit the VM along with the context? I don't anything to prevent
such.

> +static void gpu_mm_segv(struct task_struct *tsk, unsigned long address,
> +			int si_code)
> +{
> +	siginfo_t info;
> +
> +	/* Need specific signal info here */
> +	info.si_signo	= SIGSEGV;
> +	info.si_errno	= EIO;
> +	info.si_code	= si_code;
> +	info.si_addr	= (void __user *)address;
> +
> +	force_sig_info(SIGSEGV, &info, tsk);

force_sig_info() is not exported, ah you builtin i915-svm.c

> +}
> +
> +/*
> + * Read the fault descriptor and handle the fault:
> + *   get PML4 from PASID
> + *   get mm struct
> + *   get the vma
> + *   verify the address is valid
> + *   call handle_mm_fault after taking the mm->mmap_sem
> + */
> +void intel_gpu_fault_work(struct work_struct *work)
> +{
> +	struct i915_svm_state *svm = container_of(work, struct i915_svm_state,
> +						  work);
> +	struct drm_i915_private *dev_priv =
> +		container_of(svm, struct drm_i915_private, svm);
> +	struct drm_device *dev = dev_priv->dev;
> +	struct intel_ringbuffer *ringbuf;
> +	struct page_request_dsc desc;
> +	struct page_group_response_dsc resp;
> +	struct intel_context *ctx;
> +	struct task_struct *tsk;
> +	struct mm_struct *mm;
> +	struct vm_area_struct *vma;
> +	u64 address;
> +	int ret;
> +
> +	DRM_ERROR("PRQ updated, head 0x%08x, tail 0x%08x\n",
> +		  I915_READ(SVM_PRQ_HEAD), I915_READ(SVM_PRQ_TAIL));
> +	prq_read_descriptor(dev, &desc);
> +	DRM_ERROR("page fault on addr 0x%016llx, PASID %d, srr %d\n",
> +		  (u64)(desc.addr << PAGE_SHIFT), desc.pasid, desc.srr);
> +
> +	spin_lock(&dev_priv->svm.lock);
> +	ctx = dev_priv->svm.pasid_ctx[desc.pasid];
> +	tsk = ctx->tsk;
> +	mm = tsk->mm;
> +	address = desc.addr << PAGE_SHIFT;
> +	ringbuf = ctx->engine[RCS].ringbuf;
> +	spin_unlock(&dev_priv->svm.lock);

All of the above can disappear at anytime after the unlock?

> +
> +	down_read_trylock(&mm->mmap_sem);
> +	vma = find_extend_vma(mm, address);
> +	if (!vma || address < vma->vm_start) {
> +		DRM_ERROR("bad VMA or address out of range\n");
> +		gpu_mm_segv(tsk, address, SEGV_MAPERR);
> +		goto out_unlock; /* need to kill process */
> +	}
> +
> +	ret = handle_mm_fault(mm, vma, address,
> +			      desc.wr_req ? FAULT_FLAG_WRITE : 0);
> +	if (ret & VM_FAULT_ERROR) {
> +		gpu_mm_segv(tsk, address, SEGV_ACCERR); /* ? */
> +		goto out_unlock;
> +	}
> +
> +	if (ret & VM_FAULT_MAJOR)
> +		tsk->maj_flt++;
> +	else
> +		tsk->min_flt++;
> +
> +	if (desc.srr)
> +		resp.dsc_type = PAGE_STREAM_RESP_DSC;
> +	else
> +		resp.dsc_type = PAGE_GRP_RESP_DSC;
> +	resp.pasid = desc.pasid;
> +	resp.pasid_present = 1;
> +	resp.requestor_id = PCI_DEVID(0, PCI_DEVFN(2,0));
> +	resp.resp_code = RESP_CODE_SUCCESS;
> +	resp.prg_index = desc.prg_index;
> +	resp.private = desc.private;
> +	ivq_write_resp_descriptor(dev, &resp);
> +out_unlock:
> +	up_read(&mm->mmap_sem);
> +
> +	/* FIXME: wait for page response to be serviced */
> +
> +	/* FIXME: queue context for re-submit */
> +	/* execlists_context_queue(req); */
> +}

> +/* Make sure GPU writes can't hit the mm that's about to go away */
> +static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> +{
> +	struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
> +						   notifier);
> +	struct drm_i915_private *dev_priv = ims->dev_priv;
> +	struct drm_device *dev = dev_priv->dev;
> +	struct intel_context *ctx;
> +
> +	/*
> +	 * Wait for any outstanding activity and unbind the mm.  Since
> +	 * each context has its own ring, we can simply wait for the ring
> +	 * to idle before invalidating the PASID and flushing the TLB.
> +	 */
> +	mutex_lock(&dev->struct_mutex);
> +	list_for_each_entry(ctx, &ims->context_list, mm_list) {
> +		intel_ring_idle(ctx->engine[RCS].ringbuf->ring);
> +	}
> +
> +	intel_iommu_tlb_flush(dev_priv->dev);
> +	mutex_unlock(&dev->struct_mutex);

Erm, what! So you halt the GPU everytime? But you've already invalidated
the shadow PTE -- ah, invalidate-range looks to be a wip.

> +static void intel_flush_page_locked(struct drm_device *dev, int pasid,
> +				    unsigned long address)
> +{
> +	struct ext_iotlb_inv_dsc dsc = { 0 };
> +
> +	dsc.dsc_type = EXT_IOTLB_INV_DSC;
> +	dsc.g = EXT_IOTLB_INV_G_PASID_PAGE_SELECT;
> +	dsc.pasid = pasid;
> +	dsc.ih = 0;
> +	dsc.addr = address;
> +	dsc.am = 1;
> +	ivq_write_ext_iotlb_inv_descriptor(dev, &dsc);
> +}
> +
> +static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
> +			     unsigned long address, pte_t pte)
> +{
> +	struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
> +						   notifier);
> +	struct drm_i915_private *dev_priv = ims->dev_priv;
> +	struct drm_device *dev = dev_priv->dev;
> +
> +	struct intel_context *ctx;
> +
> +	mutex_lock(&dev->struct_mutex);
> +	list_for_each_entry(ctx, &ims->context_list, mm_list)
> +		intel_flush_page_locked(dev, ctx->pasid, address);
> +	mutex_unlock(&dev->struct_mutex);

Suggests you really want a ims->spinlock for context_list instead.

> +}
> +
> +static void intel_invalidate_page(struct mmu_notifier *mn,
> +				  struct mm_struct *mm,
> +				  unsigned long address)
> +{
> +	struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
> +						   notifier);
> +	struct drm_i915_private *dev_priv = ims->dev_priv;
> +	struct drm_device *dev = dev_priv->dev;
> +	struct intel_context *ctx;
> +
> +	mutex_lock(&dev->struct_mutex);
> +	list_for_each_entry(ctx, &ims->context_list, mm_list)
> +		intel_flush_page_locked(dev, ctx->pasid, address);
> +	mutex_unlock(&dev->struct_mutex);
> +}
> +
> +/* Need to unmap this range and make sure it doesn't get re-faulted */
> +static void intel_invalidate_range_start(struct mmu_notifier *mn,
> +					 struct mm_struct *mm,
> +					 unsigned long start, unsigned long end)
> +{
> +	struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
> +						   notifier);
> +	struct drm_i915_private *dev_priv = ims->dev_priv;
> +	struct drm_device *dev = dev_priv->dev;
> +
> +	/* FIXME: invalidate page only */
> +	intel_iommu_tlb_flush(dev);
> +}
> +
> +/* Pages have been freed at this point */
> +static void intel_invalidate_range_end(struct mmu_notifier *mn,
> +				       struct mm_struct *mm,
> +				       unsigned long start, unsigned long end)
> +{
> +	struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
> +						   notifier);
> +	struct drm_i915_private *dev_priv = ims->dev_priv;
> +	struct drm_device *dev = dev_priv->dev;
> +
> +	/* FIXME: invalidate page only */
> +	intel_iommu_tlb_flush(dev);
> +}
> +
> +static const struct mmu_notifier_ops intel_mmuops = {
> +	.release = intel_mm_release,
> +	/* no clear_flush_young, we just share the x86 bits */
> +	/* no test_young, we just share the x86 bits */
> +	.change_pte = intel_change_pte,
> +	.invalidate_page = intel_invalidate_page,
> +	.invalidate_range_start = intel_invalidate_range_start,
> +	.invalidate_range_end = intel_invalidate_range_end,
> +};
> +
> +struct intel_mm_struct *intel_bind_mm(struct drm_device *dev,
> +				      struct intel_context *ctx)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct intel_mm_struct *ims;
> +	struct mmu_notifier *mn;
> +	int ret;
> +
> +	WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
> +
> +	mn = mmu_find_ops(current->mm, &intel_mmuops);

Magic function, I am missing its definition

> +	if (mn) {
> +		ims = container_of(mn, struct intel_mm_struct, notifier);
> +		kref_get(&ims->kref);
> +		goto out;
> +	}
> +
> +	ims = kzalloc(sizeof(*ims), GFP_KERNEL);
> +	if (!ims) {
> +		ret = -ENOMEM;
> +		goto error;
> +	}
> +	INIT_LIST_HEAD(&ims->context_list);
> +
> +	ims->notifier.ops = &intel_mmuops;
> +
> +	ret = mmu_notifier_register(&ims->notifier, current->mm);

This has lock inversion between struct_mutex and mm->mmap_sem.

> +	if (ret)
> +		goto error;
> +
> +	ims->dev_priv = dev->dev_private;
> +
> +out:
> +	list_add(&ctx->mm_list, &ims->context_list);
> +	return ims;
> +error:
> +	kfree(ims);
> +	return ERR_PTR(ret);
> +}
> +
> +static void intel_mm_free(struct kref *ims_ref)
> +{
> +	struct intel_mm_struct *ims =
> +		container_of(ims_ref, struct intel_mm_struct, kref);
> +
> +	mmu_notifier_unregister(&ims->notifier, current->mm);

More lock inversion.

> +	kfree(ims);
> +}
> +
> +void intel_unbind_mm(struct intel_context *ctx)
> +{
> +	struct drm_i915_private *dev_priv = ctx->ims->dev_priv;
> +
> +	WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
> +
> +	list_del(&ctx->mm_list);
> +	kref_put(&ctx->ims->kref, intel_mm_free);
> +
> +	return;
> +}
> +
> +int intel_exec_mm_ioctl(struct drm_device *dev, void *data,
> +			struct drm_file *file)
> +{
> +//	struct drm_i915_exec_mm *exec_mm = data;
> +//	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	/* Load new context into context reg */

Ah, there is a modicum of user API here.

> +	return 0;
> +}
> +
> +/*
> + * The PASID table has 32 entries in the current config, rotate through
> + * them as needed.
> + */
> +int intel_alloc_pasid(struct drm_device *dev, struct intel_context *ctx)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct pasid_table_entry *table;
> +	int i;
> +
> +	WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
> +
> +	spin_lock(&dev_priv->svm.lock);
> +	table = dev_priv->svm.pasid_table;
> +
> +	for (i = 0; i < PASID_COUNT; i++) {
> +		if (!table[i].present)
> +			goto found;
> +	}
> +
> +	spin_unlock(&dev_priv->svm.lock);
> +	return -1;
> +
> +found:
> +	table[i].pml4 = __pa(current->mm->pgd) >> PAGE_SHIFT;
> +	table[i].present = 1;
> +
> +	ctx->pasid = i;
> +	dev_priv->svm.pasid_ctx[ctx->pasid] = NULL;
> +	spin_unlock(&dev_priv->svm.lock);
> +
> +	intel_iommu_tlb_flush(dev);
> +
> +	return 0;
> +}
> +
> +void intel_free_pasid(struct drm_device *dev, struct intel_context *ctx)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct pasid_table_entry *table;
> +
> +	WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
> +
> +	if (ctx->pasid >= PASID_COUNT)
> +		return;
> +
> +	spin_lock(&dev_priv->svm.lock);
> +	table = dev_priv->svm.pasid_table;
> +	memset(&table[ctx->pasid], 0, sizeof(struct pasid_table_entry));
> +	dev_priv->svm.pasid_ctx[ctx->pasid] = NULL;
> +	ctx->pasid = -1;
> +	spin_unlock(&dev_priv->svm.lock);
> +
> +	intel_iommu_tlb_flush(dev);
> +}
> +
> +/*
> + * Each root table entry is 16 bytes wide.  In legacy mode, only
> + * the lower 64 bits are used:
> + *   Bits 38:12: context table pointer
> + *   Bit 0: present
> + *   all other bits reserved
> + * In extended mode (what we use for SVM):
> + *   Bits 102:76: upper context table pointer
> + *   Bit 64: upper present
> + *   Bits 38:12: lower context table pointer
> + *   Bit 0: lower present
> + *   all other bits reserved
> + *
> + * The context entries are 128 bit in legacy mode:
> + *   Bits 87:72: Domain ID
> + *   Bits 70:67: Available
> + *   Bits 66:64: Address width
> + *   Bits 38:12: Page table pointer
> + *   Bits 3:2: Translation type
> + *     00 - only untranslated DMA requests go through this table
> + *          translated and translation requests are blocked
> + *     01 - untranslated, translated, and translation requests supported
> + *     10 - untranslated requests are treated as pass through (HPA == GPA),
> + *          translated DMA requests and translation requests are blocked
> + *     11 - reserved
> + *   Bit 1: fault disable
> + *   Bit 0: Present
> + * and 256 bit in extended:
> + *   Bits 230:204: PASID state table pointer
> + *   Bits 166:140: PASID table pointer
> + *   Bits 131:128: PASID table size
> + *   Bits 127:96: Page table attribute (PAT)
> + *   Bit 92: SL64KPE
> + *   Bit 91: SLEE
> + *   Bit 90: ERE
> + *   Bit 89: SRE
> + *   Bit 88: SMEP
> + *   Bits 87:72: Domain ID
> + *   Bit 71: Extended memory type enable
> + *   Bit 70: cache disable (CD)
> + *   Bit 69: write protect (WP)
> + *   Bit 68: no execute enable (NXE)
> + *   Bit 67: page global enable (PGE)
> + *   Bits 66:64: address width
> + *   Bits 38:12: 2nd level (VT-d) page table pointer
> + *   Bit 11: PASID enable
> + *   Bit 10: Nesting enable
> + *   Bit 9: Page Request enable
> + *   Bit 8: Lazy-Invalidate enable
> + *   Bits 7:5: Extended Memory Type (VT-d)
> + *   Bits 4:2: Translation type
> + *     000 - Only Untranslated DMA requests are translated through this page
> + *           table. Translated DMA requests and Translation Requests are
> + *           blocked.  Untranslated requests-without-PASID are remapped using
> + *           the second-level page-table referenced through SLPTPTR field.
> + *           If PASIDE field is Set, Untranslated requests-with-PASID are
> + *           remapped using the PASID Table referenced through PASIDPTPTR
> + *           field. If PASIDE field is Clear, Untranslated requests-with-PASID
> + *           are blocked.  Translation requests (with or without PASID), and
> + *           Translated Requests are blocked.
> + *     001 - Un-translated and Translation requests without PASID supported
> + *           (and with PASID supported, if PASID Enable Set); Translate
> + *           requests bypass address translation.  Untranslated
> + *           requests-without-PASID and Translation requests-without-PASID are
> + *           remapped using the second level page-table referenced through
> + *           SLPTPTR field. If PASIDE field is Set, Untranslated
> + *           requests-with-PASID and Translation requests-with-PASID are
> + *           remapped using the PASID Table referenced through PASIDPTPTR
> + *           field. If PASIDE field is Clear, Untranslated requests-with-PASID,
> + *           and Translation requests-with-PASID, are blocked. Translated
> + *           requests bypass address translation.
> + *     010 - If Pass-through Supported (GT supports pass-through),
> + *           Un-translated requests without PASID bypass address translation;
> + *           All other requests (with or without PASID) blocked. Untranslated
> + *           requests-without-PASID bypass address translation and are
> + *           processed as passthrough. SLPTPTR field is ignored by hardware.
> + *           Untranslated requests-with-PASID, Translation requests (with or
> + *           without PASID), and Translated requests are blocked.
> + *     011 - Reserved.
> + *     100 - Un-translated requests without PASID bypass address translation;
> + *           Un-translated requests with PASID supported, if PASID Enable Set;
> + *           All other requests blocked. Untranslated requests-without-PASID
> + *           bypass address translation and are processed as passthrough.
> + *           SLPTPTR field is ignored by hardware. Untranslated
> + *           requests-with-PASID are remapped using the PASID Table referenced
> + *           through PASIDPTPTR field. Translation requests (with or without
> + *           PASID) and Translated requests are blocked.
> + *     101 - Un-translated and Translation requests without PASID bypass
> + *           address translation; Un-translated and Translation requests with
> + *           PASID supported, if PASID Enable Set; Translated requests bypass
> + *           address translation.  Untranslated requests-without-PASID bypass
> + *           address translation and are processed as passthrough. SLPTPTR
> + *           field is ignored by hardware.  Translation requests-without-PASID
> + *           are responded with Untranslated access only bit Set (U=1) along
> + *           with read and write permissions (R=W=1). SLPTPTR field is ignored
> + *           by hardware. Untranslated requests-with-PASID, and Translation
> + *           requests-with-PASID are remapped using the PASID Table referenced
> + *           through PASIDPTPTR field.  Translated requests bypass address
> + *           translation.
> + *     110 - Un-translated requests without PASID are blocked; Un-translated
> + *           requests with PASID supported, if PASID Enable Set; All other
> + *           requests blocked – Not applicable to GFX, GT should treat this as
> + *           reserved.
> + *     111 - Un-translated and Translation requests without PASID blocked;
> + *           Un-translated and Translation requests with PASID supported, if
> + *           PASID Enable Set; Translated requests bypass address translation.
> + *           Note: Not applicable to GFX, GT should treat this as reserved.
> + *   Bit 1: Fault disable
> + *   Bit 0: Present
> + *
> + * Page walks for graphics addresses can go through one or two levels of
> + * translation, depending on whether VT-d is enabled.
> + *
> + * If we're in driver mode (currently the only supported mode), we always
> + * use a single level of translation, meaning the second level page table
> + * pointer (if present) is ignored.
> + *
> + * The full walk starts at the root table, which indexes into the upper
> + * and lower context tables.  Those tables point to PASID mapping and state
> + * tables and potentially a second level page table for VT-d (which, as noted
> + * above, is unused currently).  The PASID mapping table points to a PML4
> + * (x86 compatible) page table, while the state table indicates other
> + * information about the PASID involved in the request, which ultimately comes
> + * from the execlist port submission of the context descriptor.
> + *
> + * To enable a shared CPU/GPU address space, we can use a couple of different
> + * translation types, either 101 or 01 w/o nesting.  The main requirement
> + * is that requests with PASID are translated through the page tables provided,
> + * potentially with nesting if we're running in a VT-d context (which we
> + * don't currently support).
> + */
> +#define CONTEXT_OFFSET (PAGE_SIZE * 1)
> +#define PASID_OFFSET (PAGE_SIZE * 2)
> +#define PASID_STATE_OFFSET (PAGE_SIZE * 3)
> +#define PRQ_OFFSET (PAGE_SIZE * 4)
> +#define IVQ_OFFSET (PAGE_SIZE * 5)
> +static void intel_init_svm_root_table(struct drm_device *dev,
> +				      drm_dma_handle_t *tables)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct extended_root_table_entry *root_table;
> +	struct extended_context_table_entry *context;
> +	struct pasid_table_entry *pasid_table;
> +	struct pasid_state_table_entry *pasid_state_table;
> +	u64 *tmp;
> +
> +	root_table = tables->vaddr;
> +	context = tables->vaddr + CONTEXT_OFFSET;
> +        pasid_table = tables->vaddr + PASID_OFFSET;
> +	pasid_state_table = tables->vaddr + PASID_STATE_OFFSET;
> +
> +	DRM_ERROR("programmed PASID table, vaddr %p, busaddr 0x%16llx\n",
> +		  pasid_table, tables->busaddr + PASID_OFFSET);
> +
> +	/* Context entry for gfx device */
> +	context[16].pat = 0x66666666;
> +	context[16].ere = 1;
> +	context[16].sre = 1;
> +	context[16].smep = 1;
> +	context[16].domain_id = 1;
> +	context[16].addr_width = AGAW_48; /* full x86 walk */
> +	context[16].pasid_en = 1;
> +	context[16].nesting_en = 0; /* not yet */
> +	context[16].pg_req_en = 1;
> +	context[16].lazy_invalidate_en = 1;
> +	context[16].ext_mem_type = EXTENDED_MTYPE_WB;
> +	context[16].translation_type = EXTENDED_TTYPE_UT_TR_PASID_PT;
> +	context[16].fault_disable = 0;
> +	context[16].present = 1;
> +	context[16].pasid_state_table_addr = (tables->busaddr + PASID_STATE_OFFSET) >> PAGE_SHIFT;
> +	context[16].pasid_table_addr = (tables->busaddr + PASID_OFFSET) >>
> +		PAGE_SHIFT;
> +	context[16].pasid_table_size = 0; /* 2^(5+x) */
> +
> +	tmp = (u64 *)&context[16];
> +	DRM_ERROR("root entry: 0x%016llx%016llx\n", tmp[1], tmp[0]);
> +
> +	DRM_ERROR("programmed context table, vaddr %p, busaddr 0x%16llx\n",
> +		  context, tables->busaddr + CONTEXT_OFFSET);
> +
> +	/* Root table */
> +	root_table[0].lo_ctx_addr = (tables->busaddr + CONTEXT_OFFSET) >>
> +		PAGE_SHIFT;
> +	root_table[0].lo_present = 1;
> +	root_table[0].hi_present = 0;
> +
> +	tmp = (u64 *)&root_table[0];
> +	DRM_ERROR("root entry: 0x%016llx%016llx\n", tmp[1], tmp[0]);
> +
> +	dev_priv->svm.root_table = root_table;
> +	dev_priv->svm.context = context;
> +        dev_priv->svm.pasid_table = pasid_table;
> +	dev_priv->svm.pasid_state_table = pasid_state_table;
> +	dev_priv->svm.prq_ring = tables->vaddr + PRQ_OFFSET;
> +	dev_priv->svm.ivq_ring = tables->vaddr + IVQ_OFFSET;
> +
> +	/* Enable the page request queue */
> +	I915_WRITE64(SVM_PRQA, tables->busaddr + PRQ_OFFSET);
> +	I915_WRITE(SVM_PRQ_HEAD, 0);
> +	I915_WRITE(SVM_PRQ_TAIL, 0);
> +	I915_WRITE(SVM_PRECTL, 0);
> +
> +	/* Set up the invalidation request queue */
> +	I915_WRITE64(SVM_IQA, tables->busaddr + IVQ_OFFSET);
> +	I915_WRITE(SVM_IVQ_HEAD, 0);
> +	I915_WRITE(SVM_IVQ_TAIL, 0);
> +	I915_WRITE(SVM_IECTL, 0);
> +
> +	I915_WRITE(SVM_GCMD, GCMD_QIE);
> +	if (wait_for(I915_READ(SVM_GSTS) & GSTS_QIES, 500))
> +		DRM_ERROR("timed out waiting for queued invalidation enable\n");
> +
> +	/* All set, program the root */
> +	I915_WRITE(SVM_RTADDR, tables->busaddr | SVM_RTT_TYPE_EXT);
> +
> +	I915_WRITE(SVM_GCMD, GCMD_SRTP);
> +	if (wait_for(I915_READ(SVM_GSTS) & GSTS_RTPS, 500))
> +		DRM_ERROR("timed out waiting for root table to load\n");
> +
> +	DRM_ERROR("programmed SVM root, vaddr %p, busaddr 0x%16llx\n",
> +		  tables->vaddr, tables->busaddr);
> +
> +	intel_iommu_tlb_flush(dev);
> +}
> +
> +/*
> + * Probe for SVM capability.  If found:
> + *  - try to switch to driver mode
> + *  - set up root PASID table
> + *  - enable page fault and error handling interrupts
> + *  - allow SVM ioctls
> + */
> +void intel_init_svm(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	drm_dma_handle_t *tables;
> +	u32 dev_mode;
> +	int num_tables = 6;
> +
> +	dev_mode = I915_READ(BDW_SVM_DEV_MODE_CNFG);
> +	I915_WRITE(BDW_SVM_DEV_MODE_CNFG, dev_mode | BDW_SVM_MODE_DRIVER);
> +	dev_mode = I915_READ(BDW_SVM_DEV_MODE_CNFG);
> +#if defined(CONFIG_INTEL_IOMMU) || defined(IOMMU_SUPPORT)
> +#error must disable IOMMU support
> +#endif
> +	if (!dev_mode & BDW_SVM_MODE_DRIVER) {
> +		DRM_ERROR("driver mode not available, disabling SVM\n");
> +		goto err;
> +	}
> +
> +	tables = drm_pci_alloc(dev, PAGE_SIZE*num_tables, PAGE_SIZE);
> +	if (!tables) {
> +		DRM_ERROR("table alloc failed, disabling SVM\n");
> +		goto err;
> +	}
> +
> +	memset(tables->vaddr, 0, PAGE_SIZE*num_tables);
> +
> +	intel_init_svm_root_table(dev, tables);
> +
> +	spin_lock_init(&dev_priv->svm.lock);
> +
> +#if 0
> +	I915_WRITE(SVM_GCMD, GCMD_TE);
> +	if (wait_for(I915_READ(SVM_GSTS) & GSTS_TES, 500))
> +		DRM_ERROR("timed out waiting for translation enable\n");
> +#endif
> +	INIT_WORK(&dev_priv->svm.work, intel_gpu_fault_work);
> +
> +	DRM_ERROR("SVM driver mode enabled\n");
> +	dev_priv->svm.svm_available = true;
> +	return;
> +
> +err:
> +	dev_priv->svm.svm_available = false;
> +	return;
> +}
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 40cbba4..1450491 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -217,6 +217,7 @@ enum {
>  	FAULT_AND_STREAM,
>  	FAULT_AND_CONTINUE /* Unsupported */
>  };
> +#define GEN8_CTX_FAULT_SHIFT 6
>  #define GEN8_CTX_ID_SHIFT 32
>  #define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
>  
> @@ -289,12 +290,21 @@ uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
>  	WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
>  
>  	desc = GEN8_CTX_VALID;
> -	desc |= GEN8_CTX_ADDRESSING_MODE(dev) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
> -	if (IS_GEN8(ctx_obj->base.dev))
> -		desc |= GEN8_CTX_L3LLC_COHERENT;
> -	desc |= GEN8_CTX_PRIVILEGE;
> -	desc |= lrca;
> -	desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
> +	if (ctx->is_svm) {
> +		desc |= ADVANCED_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT;
> +		desc |= FAULT_AND_STREAM << GEN8_CTX_FAULT_SHIFT;
> +		desc |= lrca;
> +		desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
> +	} else {
> +		desc |= GEN8_CTX_ADDRESSING_MODE(dev) <<
> +			GEN8_CTX_ADDRESSING_MODE_SHIFT;
> +		if (IS_GEN8(ctx_obj->base.dev))
> +			desc |= GEN8_CTX_L3LLC_COHERENT;
> +		desc |= GEN8_CTX_PRIVILEGE;
> +		desc |= lrca;
> +		desc |= (u64)intel_execlists_ctx_id(ctx_obj) <<
> +			GEN8_CTX_ID_SHIFT;
> +	}
>  
>  	/* TODO: WaDisableLiteRestore when we start using semaphore
>  	 * signalling between Command Streamers */
> @@ -545,7 +555,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
>  		   _MASKED_FIELD(0x07 << 8, ((u32)ring->next_context_status_buffer & 0x07) << 8));
>  }
>  
> -static int execlists_context_queue(struct drm_i915_gem_request *request)
> +int execlists_context_queue(struct drm_i915_gem_request *request)
>  {
>  	struct intel_engine_cs *ring = request->ring;
>  	struct drm_i915_gem_request *cursor;
> @@ -2273,31 +2283,40 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
>  	reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
>  	reg_state[CTX_CTX_TIMESTAMP] = ring->mmio_base + 0x3a8;
>  	reg_state[CTX_CTX_TIMESTAMP+1] = 0;
> -	reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3);
> -	reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3);
> -	reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2);
> -	reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2);
> -	reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1);
> -	reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1);
> -	reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
> -	reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
> -
> -	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
> -		/* 64b PPGTT (48bit canonical)
> -		 * PDP0_DESCRIPTOR contains the base address to PML4 and
> -		 * other PDP Descriptors are ignored.
> -		 */
> -		ASSIGN_CTX_PML4(ppgtt, reg_state);
> +
> +	if (ctx->is_svm) {
> +		reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
> +		reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
> +		reg_state[CTX_PDP0_UDW+1] = 0;
> +		reg_state[CTX_PDP0_LDW+1] = ctx->pasid;
>  	} else {
> -		/* 32b PPGTT
> -		 * PDP*_DESCRIPTOR contains the base address of space supported.
> -		 * With dynamic page allocation, PDPs may not be allocated at
> -		 * this point. Point the unallocated PDPs to the scratch page
> -		 */
> -		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
> -		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
> -		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
> -		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
> +		reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3);
> +		reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3);
> +		reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2);
> +		reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2);
> +		reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1);
> +		reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1);
> +		reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
> +		reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
> +
> +		if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
> +			/* 64b PPGTT (48bit canonical)
> +			 * PDP0_DESCRIPTOR contains the base address to PML4 and
> +			 * other PDP Descriptors are ignored.
> +			 */
> +			ASSIGN_CTX_PML4(ppgtt, reg_state);
> +		} else {
> +			/* 32b PPGTT
> +			 * PDP*_DESCRIPTOR contains the base address of space
> +			 * supported. With dynamic page allocation, PDPs may
> +			 * not be allocated at this point. Point the
> +			 * unallocated PDPs to the scratch page
> +			 */
> +			ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
> +			ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
> +			ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
> +			ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
> +		}
>  	}
>  
>  	if (ring->id == RCS) {
> @@ -2327,6 +2346,12 @@ void intel_lr_context_free(struct intel_context *ctx)
>  {
>  	int i;
>  
> +        if (ctx->is_svm) {
> +                intel_free_pasid(ctx->ims->dev_priv->dev, ctx);
> +                intel_unbind_mm(ctx);
> +		put_task_struct(ctx->tsk);
> +       }
> +
>  	for (i = 0; i < I915_NUM_RINGS; i++) {
>  		struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state;
>  
> @@ -2480,6 +2505,37 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>  
>  	}
>  
> +	if (ctx->is_svm) {
> +		/* FIXME: just skip here, don't bail and trash the ctx */
> +		if (ring->id != RCS) {
> +			DRM_DEBUG_DRIVER("svm context only allowed on RCS\n");

That's fairly useless then :)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre