[Intel-gfx] [PATCH 6/9] drm/i915: driver based PASID handling

Jesse Barnes jbarnes at virtuousgeek.org
Fri Sep 4 09:59:00 PDT 2015


New file with VT-d SVM and PASID handling functions and page table
management.  This belongs in the IOMMU code (along with some extra bits
for waiting for invalidations and page faults to complete, flushing the
device IOTLB, etc.)

FIXME:
  need work queue for re-submitting contexts
  TE bit handling on SKL
---
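For illustration, a rough sketch of the intended usage from userspace
(not part of this patch): create_context() is a hypothetical helper,
and the I915_GEM_CONTEXT_ENABLE_SVM flag is wired up to context
creation elsewhere in this series.

	/* hypothetical: request an SVM (shared address space) context */
	ctx_id = create_context(fd, I915_GEM_CONTEXT_ENABLE_SVM);

	/* ordinary CPU allocation, no GEM object or relocation needed */
	buf = malloc(buf_size);

	/*
	 * Batches submitted against ctx_id may reference 'buf' by its
	 * CPU virtual address; faults on not-yet-present pages are
	 * serviced through the page request queue added below.
	 */
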
 drivers/gpu/drm/i915/Makefile           |    5 +-
 drivers/gpu/drm/i915/i915_drv.h         |   43 ++
 drivers/gpu/drm/i915/i915_gem.c         |    3 +
 drivers/gpu/drm/i915/i915_gem_context.c |    3 +
 drivers/gpu/drm/i915/i915_irq.c         |    7 +
 drivers/gpu/drm/i915/i915_reg.h         |   47 ++
 drivers/gpu/drm/i915/i915_svm.c         | 1102 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c        |  120 +++-
 drivers/gpu/drm/i915/intel_lrc.h        |    1 +
 9 files changed, 1299 insertions(+), 32 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_svm.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 44d290a..e4883a7 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -38,7 +38,8 @@ i915-y += i915_cmd_parser.o \
 	  intel_lrc.o \
 	  intel_mocs.o \
 	  intel_ringbuffer.o \
-	  intel_uncore.o
+	  intel_uncore.o \
+	  i915_svm.o
 
 # general-purpose microcontroller (GuC) support
 i915-y += intel_guc_loader.o \
@@ -93,6 +94,8 @@ i915-y += dvo_ch7017.o \
 # virtual gpu code
 i915-y += i915_vgpu.o
 
+i915-$(CONFIG_MMU_NOTIFIER) += i915_svm.o
+
 # legacy horrors
 i915-y += i915_dma.o
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 20beb51..ca38a7a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -47,6 +47,7 @@
 #include <drm/drm_gem.h>
 #include <linux/backlight.h>
 #include <linux/hashtable.h>
+#include <linux/mmu_notifier.h>
 #include <linux/intel-iommu.h>
 #include <linux/kref.h>
 #include <linux/pm_qos.h>
@@ -848,6 +849,13 @@ struct i915_ctx_hang_stats {
 	bool banned;
 };
 
+struct intel_mm_struct {
+	struct kref kref;
+	struct mmu_notifier notifier;
+	struct drm_i915_private *dev_priv;
+	struct list_head context_list;
+};
+
 /* This must match up with the value previously used for execbuf2.rsvd1. */
 #define DEFAULT_CONTEXT_HANDLE 0
 
@@ -874,6 +882,9 @@ struct i915_ctx_hang_stats {
 struct intel_context {
 	struct kref ref;
 	int user_handle;
+	bool is_svm; /* shares x86 page tables */
+	u32 pasid; /* 20 bits */
+	struct intel_mm_struct *ims;
 	uint8_t remap_slice;
 	struct drm_i915_private *i915;
 	int flags;
@@ -895,6 +906,9 @@ struct intel_context {
 		int pin_count;
 	} engine[I915_NUM_RINGS];
 
+	struct list_head mm_list;
+	struct task_struct *tsk;
+
 	struct list_head link;
 };
 
@@ -1334,6 +1348,21 @@ struct i915_gem_mm {
 	u32 object_count;
 };
 
+#define PASID_COUNT 32
+
+struct i915_svm_state {
+	bool svm_available;
+	struct extended_root_table_entry *root_table;
+	struct extended_context_table_entry *context;
+	struct pasid_table_entry *pasid_table;
+	struct pasid_state_table_entry *pasid_state_table;
+	struct intel_context *pasid_ctx[PASID_COUNT];
+	u8 *prq_ring;
+	u8 *ivq_ring;
+	struct work_struct work;
+	spinlock_t lock; /* protects pasid_ctx, prq, and ivq rings */
+};
+
 struct drm_i915_error_state_buf {
 	struct drm_i915_private *i915;
 	unsigned bytes;
@@ -1942,6 +1971,8 @@ struct drm_i915_private {
 
 	struct i915_runtime_pm pm;
 
+	struct i915_svm_state svm;
+
 	/* Abstract the submission mechanism (legacy ringbuffer or execlists) away */
 	struct {
 		int (*execbuf_submit)(struct i915_execbuffer_params *params,
@@ -3342,6 +3373,18 @@ int i915_reg_read_ioctl(struct drm_device *dev, void *data,
 int i915_get_reset_stats_ioctl(struct drm_device *dev, void *data,
 			       struct drm_file *file);
 
+/* svm */
+extern struct intel_mm_struct *intel_bind_mm(struct drm_device *dev,
+					     struct intel_context *ctx);
+extern void intel_unbind_mm(struct intel_context *ctx);
+extern int intel_alloc_pasid(struct drm_device *dev,
+			     struct intel_context *ctx);
+extern void intel_free_pasid(struct drm_device *dev,
+			     struct intel_context *ctx);
+extern void intel_init_svm(struct drm_device *dev);
+extern void intel_iommu_tlb_flush(struct drm_device *dev);
+extern void intel_gpu_fault(struct drm_device *dev);
+
 /* overlay */
 extern struct intel_overlay_error_state *intel_overlay_capture_error_state(struct drm_device *dev);
 extern void intel_overlay_print_error_state(struct drm_i915_error_state_buf *e,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 41263cd..5341591 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4665,6 +4665,9 @@ i915_gem_init_hw(struct drm_device *dev)
 		}
 	}
 
+	if (INTEL_INFO(dev)->gen >= 8)
+		intel_init_svm(dev);
+
 	i915_gem_init_swizzling(dev);
 
 	/*
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index e9ec2f3..3207bbf 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -254,6 +254,9 @@ i915_gem_create_context(struct drm_device *dev,
 	if (IS_ERR(ctx))
 		return ctx;
 
+	if (flags & I915_GEM_CONTEXT_ENABLE_SVM)
+		ctx->is_svm = true;
+
 	if (is_global_default_ctx && ctx->legacy_hw_ctx.rcs_state) {
 		/* We may need to do things with the shrinker which
 		 * require us to immediately switch back to the default
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2063279..192ad0c 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2176,6 +2176,8 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 			ret = IRQ_HANDLED;
 			if (tmp & GEN8_DE_MISC_GSE)
 				intel_opregion_asle_intr(dev);
+			else if (tmp & GEN8_DE_MISC_SVM_PRQ)
+				queue_work(dev_priv->wq, &dev_priv->svm.work);
 			else
 				DRM_ERROR("Unexpected DE Misc interrupt\n");
 		}
@@ -3571,6 +3573,11 @@ static void gen8_de_irq_postinstall(struct drm_i915_private *dev_priv)
 					  de_pipe_enables);
 
 	GEN5_IRQ_INIT(GEN8_DE_PORT_, ~de_port_masked, de_port_enables);
+
+	I915_WRITE(0x42a4, (1<<29));
+	I915_WRITE(GEN8_DE_MISC_IMR, ~GEN8_DE_MISC_SVM_PRQ);
+	I915_WRITE(GEN8_DE_MISC_IER, GEN8_DE_MISC_SVM_PRQ);
+	POSTING_READ(GEN8_DE_MISC_IER);
 }
 
 static int gen8_irq_postinstall(struct drm_device *dev)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 84ed9ab..618bd33 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2806,6 +2806,52 @@ enum skl_disp_power_wells {
 				INTERVAL_1_33_US(us) : \
 				INTERVAL_1_28_US(us))
 
+/* GFXVTBAR mirror */
+#define GFXVTBAR_BASE		0x118000
+
+#define BDW_SVM_DEV_MODE_CNFG	(0x110000)
+#define   BDW_SVM_MODE_DRIVER	(1<<0)
+
+#define SVM_GCMD		(GFXVTBAR_BASE + 0x18)
+#define   GCMD_TE		(1<<31)
+#define   GCMD_SRTP		(1<<30)
+#define   GCMD_QIE		(1<<26)
+
+#define SVM_GSTS		(GFXVTBAR_BASE + 0x1c)
+#define   GSTS_TES		(1<<31)
+#define   GSTS_RTPS		(1<<30)
+#define   GSTS_QIES		(1<<26)
+
+#define SVM_RTADDR		(GFXVTBAR_BASE + 0x20)
+#define   SVM_RTT_TYPE_EXT	(1<<11)
+
+#define SVM_CCMD		(GFXVTBAR_BASE + 0x28)
+#define   CCMD_ICC		(1ULL<<63)
+#define   CCMD_CIRG_GLOBAL	(1ULL<<61)
+
+#define SVM_IVQ_HEAD		(GFXVTBAR_BASE + 0x80)
+#define SVM_IVQ_TAIL		(GFXVTBAR_BASE + 0x88)
+#define SVM_IQA			(GFXVTBAR_BASE + 0x90)
+#define SVM_IECTL		(GFXVTBAR_BASE + 0xa0)
+
+#define SVM_PRQ_HEAD		(GFXVTBAR_BASE + 0xc0)
+#define SVM_PRQ_TAIL		(GFXVTBAR_BASE + 0xc8)
+#define SVM_PRQA		(GFXVTBAR_BASE + 0xd0)
+
+#define SVM_PRECTL		(GFXVTBAR_BASE + 0xe0)
+#define   PRE_IM		(1<<31)
+
+#define SVM_IOTLB		(GFXVTBAR_BASE + 0x508)
+#define   IOTLB_IVT		(1ULL<<63)
+#define   IOTLB_GLOBAL		(1ULL<<60)
+#define   IOTLB_DOMAIN		(2ULL<<60)
+#define   IOTLB_PAGE		(3ULL<<60)
+#define   IOTLB_IIRG_MASK	(3ULL<<60)
+#define   IOTLB_GLOBAL_DONE	(1ULL<<57)
+#define   IOTLB_DOMAIN_DONE	(2ULL<<57)
+#define   IOTLB_PAGE_DONE	(3ULL<<57)
+#define   IOTLB_IAIG_MASK	(3ULL<<57)
+
 /*
  * Logical Context regs
  */
@@ -5791,6 +5837,7 @@ enum skl_disp_power_wells {
 #define GEN8_DE_MISC_IIR 0x44468
 #define GEN8_DE_MISC_IER 0x4446c
 #define  GEN8_DE_MISC_GSE		(1 << 27)
+#define  GEN8_DE_MISC_SVM_PRQ		(1 << 22)
 
 #define GEN8_PCU_ISR 0x444e0
 #define GEN8_PCU_IMR 0x444e4
diff --git a/drivers/gpu/drm/i915/i915_svm.c b/drivers/gpu/drm/i915/i915_svm.c
new file mode 100644
index 0000000..1d05318
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_svm.c
@@ -0,0 +1,1102 @@
+/*
+ * Copyright 2013-2014 Intel Corporation
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <drm/drmP.h>
+#include <drm/i915_drm.h>
+#include "i915_drv.h"
+#include "i915_trace.h"
+#include "intel_drv.h"
+
+/*
+ * The root tables have 256 entries, one per bus in the PCI domain.  Each
+ * of them points to a context table (or two context tables in the extended
+ * case), which have entries for each devfn in the domain.  So entry 0 in
+ * each table is devfn(0,0), entry 8 is devfn(1,0) etc.  For gfx, we just
+ * care about entry 16, which is devfn(2,0).
+ */
+
+/**
+ * legacy_root_table_entry - translation root table w/o extended features
+ * @present: entry present & valid
+ * @ctx_addr: address of a legacy_context_table_entry
+ */
+struct legacy_root_table_entry {
+	u64 present:1;
+	u64 rsvd2:11;
+	u64 ctx_addr:27;
+	u64 rsvd1:25;
+	u64 rsvd0;
+} __attribute__((packed));
+
+/**
+ * extended_root_table_entry - translation root table w/extended features
+ * @lo_present: low ctx addr entry present & valid
+ * @lo_ctx_addr: low bits of address of an extended_context_table_entry
+ * @hi_present: high ctx addr entry present & valid
+ * @hi_ctx_addr: high bits of address of an extended_context_table_entry
+ */
+struct extended_root_table_entry {
+	u64 lo_present:1;
+	u64 rsvd3:11;
+	u64 lo_ctx_addr:27;
+	u64 rsvd2:25;
+	u64 hi_present:1;
+	u64 rsvd1:11;
+	u64 hi_ctx_addr:27;
+	u64 rsvd0:25;
+} __attribute__((packed));
+
+/*
+ * Only untranslated requests walk the page tables; translated DMA requests and
+ * translation requests are blocked
+ */
+#define LEGACY_TTYPE_UT_ONLY		0
+/* All types are translated with the page tables */
+#define LEGACY_TTYPE_ALL		1
+/*
+ * Untranslated requests are treated as passthrough (host physical ==
+ * guest physical) regardless of translation enable status.  Translated
+ * DMA requests and translation requests are blocked.
+ */
+#define LEGACY_TTYPE_UT_PT		2
+
+#define AGAW_30		0 /* 2 level page tables */
+#define AGAW_39		1 /* 3 level page tables */
+#define AGAW_48		2 /* 4 level page tables */
+#define AGAW_57		3 /* 5 level page tables */
+#define AGAW_64		4 /* 6 level page tables */
+
+/**
+ * legacy_context_table_entry - context table for a non-PASID context
+ * @present: entry present & valid
+ * @fault_disable: disable fault reporting of non-recoverable faults
+ * @translation_type: see legacy translation type defines
+ * @pgt_addr: address of a PML4 page table root
+ * @addr_width: see AGAW defines
+ * @domain_id: domain of request (not applicable to GFX)
+ */
+struct legacy_context_table_entry {
+	/* Bits 0:63 */
+	u64 present:1;
+	u64 fault_disable:1;
+	u64 translation_type:2;
+	u64 rsvd0:8;
+	u64 pgt_addr:27;
+	u64 rsvd1:25;
+	/* Bits 64:127 */
+	u64 addr_width:3;
+	u64 available:4;
+	u64 rsvd2:1;
+	u64 domain_id:16;
+	u64 rsvd3:40;
+} __attribute__((packed));
+
+
+/*
+ * Only untranslated requests walk the page tables; translated DMA requests and
+ * translation requests are blocked
+ */
+#define EXTENDED_TTYPE_UT		0
+/*
+ * Untranslated and translation requests with or without PASID are translated;
+ * translated requests bypass translation.
+ */
+#define EXTENDED_TTYPE_UT_TR		1
+/*
+ * Untranslated requests w/o PASID bypass translation, all other requests
+ * blocked.
+ */
+#define EXTENDED_TTYPE_UT_PT		2
+/*
+ * Untranslated requests w/o PASID bypass, untranslated requests with PASID
+ * are translated, all other requests blocked.
+ */
+#define EXTENDED_TTYPE_UT_PT_PASID	4
+/*
+ * Untranslated and translation requests w/o PASID bypass, untranslated
+ * and translation requests w/PASID are translated.  Translated requests
+ * bypass.
+ */
+#define EXTENDED_TTYPE_UT_TR_PASID_PT	5
+/*
+ * Untranslated requests w/o PASID are blocked.  Untranslated requests with
+ * PASID are translated.  All other requests blocked.  N/A for gfx.
+ */
+#define EXTENDED_TTYPE_UT_PASID		6
+#define EXTENDED_TTYPE_UT_TR_PASID	7
+
+#define EXTENDED_MTYPE_UC		0
+#define EXTENDED_MTYPE_WB		6
+
+/**
+ * extended_context_table_entry - context table for a PASID context
+ * @present: entry present & valid
+ * @fault_disable: disable fault reporting of non-recoverable faults
+ * @translation_type: see extended translation type defines
+ * @ext_mem_type: extended memory type lookup requested
+ * @lazy_invalidate_en: allow deferred invalidate for PASIDs not in use
+ * @pg_req_en: report page requests through SW queue
+ * @nesting_en: allow second level page table walk through VT-d
+ * @slpt_addr: second level (VT-d) page table root (if nesting_en set)
+ * @addr_width: see AGAW defines
+ * @pge: page global enable
+ * @nxe: no execute enable
+ * @wp: write protect
+ * @cd: cache disable
+ * @ext_mem_type_en: enable extended memory type lookup
+ * @domain_id: domain of request (not applicable to GFX)
+ * @smep: supervisor mode execution protection
+ * @sre: supervisor request enable
+ * @ere: execute request enable
+ * @slee: second level execute enable
+ * @sl64kpe: second level 64KB page enable
+ * @pat: page attribute table
+ * @pasid_table_size: size of PASID table (2^(x+5) entries)
+ * @pasid_table_addr: address of PASID table
+ * @pasid_state_table_addr: address of PASID state table
+ */
+struct extended_context_table_entry {
+	/* Bits 0:63 */
+	u64 present:1;
+	u64 fault_disable:1;
+	u64 translation_type:3;
+	u64 ext_mem_type:3;
+	u64 lazy_invalidate_en:1;
+	u64 pg_req_en:1;
+	u64 nesting_en:1;
+	u64 pasid_en:1;
+	u64 slpt_addr:27;
+	u64 rsvd0:25;
+	/* Bits 64:127 */
+	u64 addr_width:3;
+	u64 pge:1;
+	u64 nxe:1;
+	u64 wp:1;
+	u64 cd:1;
+	u64 ext_mem_type_en:1;
+	u64 domain_id:16;
+	u64 smep:1;
+	u64 sre:1;
+	u64 ere:1;
+	u64 slee:1;
+	u64 sl64kpe:1;
+	u64 rsvd1:3;
+	u64 pat:32;
+	/* Bits 128:191 */
+	u64 pasid_table_size:4;
+	u64 rsvd2:8;
+	u64 pasid_table_addr:27;
+	u64 rsvd3:25;
+	/* Bits 192:255 */
+	u64 rsvd4:12;
+	u64 pasid_state_table_addr:27;
+	u64 rsvd5:25;
+} __attribute__((packed));
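+
+/*
+ * Note: with pasid_table_size programmed to 0 below, the PASID table
+ * holds 2^(0 + 5) = 32 entries, matching the driver's PASID_COUNT.
+ */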
+
+/**
+ * pasid_state_table_entry - state for a PASID
+ * @active_refcnt: active count managed by GPU hw
+ * @lazy_invalidate: use to defer invalidations, not used for gfx
+ */
+struct pasid_state_table_entry {
+	u64 rsvd0:32;
+	u64 active_refcnt:16;
+	u64 rsvd1:15;
+	u64 lazy_invalidate:1;
+} __attribute__((packed));
+
+/**
+ * pasid_table_entry - per-PASID PML4 and ancillary data
+ * @present: entry present & valid
+ * @pwt: write through (not used for gfx)
+ * @pcd: cache disable (not used for gfx)
+ * @eafe: extended access flag enable
+ * @fl64kpe: enable bit 11 IPS handling in first level page tables
+ * @pml4: physical addr of PML4 root of page tables
+ */
+struct pasid_table_entry {
+	u64 present:1;
+	u64 rsvd0:2;
+	u64 pwt:1;
+	u64 pcd:1;
+	u64 rsvd1:5;
+	u64 eafe:1;
+	u64 fl64kpe:1;
+	u64 pml4:27;
+	u64 rsvd2:25;
+} __attribute__((packed));
+
+/* Invalidation queue descriptors */
+#define CC_INV_DSC		1
+#define IOTLB_INV_DSC		2
+#define DEV_IOTLB_INV_DSC	3
+#define IEC_INV_DSC		4
+#define INV_WAIT_DSC		5
+#define EXT_IOTLB_INV_DSC	6
+#define PTC_INV_DSC		7
+#define EXT_DEV_IOTLB_INV_DSC	8
+struct inv_dsc {
+	u64 lo;
+	u64 hi;
+};
+
+#define INV_PAGES_WITH_PASID	0
+#define INV_NON_GLOBALS_WITH_PASID 1
+#define INV_NON_GLOBALS_ALL_PASIDS 2
+#define INV_GLOBAL		3
+struct iotlb_inv_dsc {
+	u64 dsc_type:4;
+	u64 g:2;
+	u64 dw:1;
+	u64 dr:1;
+	u64 rsvd1:8;
+	u64 did:16;
+	u64 rsvd2:32;
+	u64 am:6;
+	u64 ih:1;
+	u64 rsvd3:5;
+	u64 addr:52;
+};
+
+#define EXT_IOTLB_INV_G_ALL_PASIDS		0
+#define EXT_IOTLB_INV_G_ALL_PASIDS_NON_GLOBAL	1
+#define EXT_IOTLB_INV_G_PASID_NON_GLOBAL	2
+#define EXT_IOTLB_INV_G_PASID_PAGE_SELECT	3
+struct ext_iotlb_inv_dsc {
+	u64 dsc_type:4;
+	u64 g:2;
+	u64 rsvd1:10;
+	u64 did:16;
+	u64 pasid:20;
+	u64 rsvd2:12;
+	u64 am:6;
+	u64 ih:1;
+	u64 gl:1;
+	u64 rsvd3:4;
+	u64 addr:52;
+};
+
+struct dev_iotlb_inv_dsc {
+	u64 dsc_type:4;
+	u64 rsvd1:12;
+	u64 max_invs_pend:5;
+	u64 rsvd2:11;
+	u64 sid:16;
+	u64 rsvd3:16;
+	u64 s:1;
+	u64 rsvd4:11;
+	u64 addr:52;
+};
+
+#define PAGE_GRP_RESP_DSC	9
+#define PAGE_STREAM_RESP_DSC	10
+
+#define RESP_CODE_SUCCESS	0
+#define RESP_CODE_NOPAGE	1
+#define RESP_CODE_ERROR		15
+struct page_group_response_dsc {
+	u64 dsc_type:4;
+	u64 pasid_present:1;
+	u64 rsvd1:11;
+	u64 requestor_id:16;
+	u64 pasid:20;
+	u64 rsvd2:12;
+	u64 resp_code:4;
+	u64 rsvd3:28;
+	u64 private:23;
+	u64 prg_index:9;
+};
+
+/* Page request queue descriptor */
+struct page_request_dsc {
+	u64 srr:1;
+	u64 bof:1;
+	u64 pasid_present:1;
+	u64 lpig:1;
+	u64 pasid:20;
+	u64 bus:8;
+	u64 private:23;
+	u64 prg_index:9;
+	u64 rd_req:1;
+	u64 wr_req:1;
+	u64 exe_req:1;
+	u64 priv_req:1;
+	u64 devfn:8;
+	u64 addr:52;
+} __attribute__((packed));
+
+/*
+ * TODO:
+ *  - variable size prq and invq rings
+ *  - split out root, context, and pasid table initialization, generalize
+ *  - sync object list in exec_mm to prevent execution
+ *  - fine grained command events with MI_USER_INTERRUPT and events
+ *  - error handling in page fault handler
+ *  - invalidation queue error handling
+ *  - debug support for page faults
+ *    - notification when in-use page is unmapped
+ *  - adv context cleanup
+ *    - allocate one on open by default
+ *    - split from intel_context
+ *  - VT-d nesting support
+ *    - push code to IOMMU layer
+ *  - PASID allocation & wraparound for invalidation handling
+ */
+#if 0
+void intel_iommu_tlb_flush(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	I915_WRITE(0x4260, 1);
+	if (wait_for(!(I915_READ(0x4260) & 1), 10))
+		DRM_ERROR("render TLB invalidate timed out\n");
+
+	I915_WRITE64(SVM_IOTLB, IOTLB_IVT | IOTLB_GLOBAL);
+	if (wait_for(!(I915_READ64(SVM_IOTLB) & IOTLB_IVT), 10))
+		DRM_ERROR("IOMMU TLB invalidate timed out\n");
+	I915_WRITE64(SVM_CCMD, CCMD_ICC | CCMD_CIRG_GLOBAL);
+	if (wait_for(!(I915_READ64(SVM_CCMD) & CCMD_ICC), 10))
+		DRM_ERROR("IOMMU context cache invalidate timed out\n");
+}
+#else
+static void ivq_write_ext_iotlb_inv_descriptor(struct drm_device *dev,
+					       struct ext_iotlb_inv_dsc *desc);
+
+void intel_iommu_tlb_flush(struct drm_device *dev)
+{
+	struct ext_iotlb_inv_dsc dsc = { 0 };
+
+	dsc.dsc_type = EXT_IOTLB_INV_DSC;
+	dsc.g = EXT_IOTLB_INV_G_ALL_PASIDS;
+	dsc.ih = 0;
+	ivq_write_ext_iotlb_inv_descriptor(dev, &dsc);
+}
+#endif
+
+static void prq_read_descriptor(struct drm_device *dev,
+				struct page_request_dsc *desc)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u8 *prq_ring = dev_priv->svm.prq_ring;
+	struct page_request_dsc *addr;
+	int offset;
+
+	spin_lock(&dev_priv->svm.lock);
+	offset = I915_READ(SVM_PRQ_HEAD);
+	if (offset * sizeof(*addr) > PAGE_SIZE)
+		offset = 0;
+	prq_ring += offset;
+	addr = (struct page_request_dsc *)prq_ring;
+	*desc = *addr;
+	I915_WRITE(SVM_PRQ_HEAD, offset + sizeof(*addr));
+	spin_unlock(&dev_priv->svm.lock);
+}
+
+static void ivq_write_inv_descriptor(struct drm_device *dev,
+				     struct inv_dsc *desc)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u8 *ivq_ring = dev_priv->svm.ivq_ring;
+	struct inv_dsc *addr;
+	int offset;
+
+	spin_lock(&dev_priv->svm.lock);
+	offset = I915_READ(SVM_IVQ_TAIL);
+	if (offset * sizeof(*addr) > PAGE_SIZE)
+		offset = 0;
+	ivq_ring += offset;
+	addr = (struct inv_dsc *)ivq_ring;
+	*addr = *desc;
+	I915_WRITE(SVM_IVQ_TAIL, offset + sizeof(*addr));
+	spin_unlock(&dev_priv->svm.lock);
+}
+
+static void ivq_write_ext_iotlb_inv_descriptor(struct drm_device *dev,
+					       struct ext_iotlb_inv_dsc *desc)
+{
+
+	ivq_write_inv_descriptor(dev, (struct inv_dsc *)desc);
+}
+
+static void ivq_write_resp_descriptor(struct drm_device *dev,
+				      struct page_group_response_dsc *desc)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u8 *ivq_ring = dev_priv->svm.ivq_ring;
+	struct page_group_response_dsc *addr;
+	int offset;
+
+	spin_lock(&dev_priv->svm.lock);
+	offset = I915_READ(SVM_IVQ_TAIL);
+	if (offset * sizeof(*addr) > PAGE_SIZE)
+		offset = 0;
+	ivq_ring += offset;
+	addr = (struct page_group_response_dsc *)ivq_ring;
+	*addr = *desc;
+	I915_WRITE(SVM_IVQ_TAIL, offset + sizeof(*addr));
+	spin_unlock(&dev_priv->svm.lock);
+}
+
+static void gpu_mm_segv(struct task_struct *tsk, unsigned long address,
+			int si_code)
+{
+	siginfo_t info;
+
+	/* Need specific signal info here */
+	info.si_signo	= SIGSEGV;
+	info.si_errno	= EIO;
+	info.si_code	= si_code;
+	info.si_addr	= (void __user *)address;
+
+	force_sig_info(SIGSEGV, &info, tsk);
+}
+
+/*
+ * Read the fault descriptor and handle the fault:
+ *   get PML4 from PASID
+ *   get mm struct
+ *   get the vma
+ *   verify the address is valid
+ *   call handle_mm_fault after taking the mm->mmap_sem
+ */
+void intel_gpu_fault_work(struct work_struct *work)
+{
+	struct i915_svm_state *svm = container_of(work, struct i915_svm_state,
+						  work);
+	struct drm_i915_private *dev_priv =
+		container_of(svm, struct drm_i915_private, svm);
+	struct drm_device *dev = dev_priv->dev;
+	struct intel_ringbuffer *ringbuf;
+	struct page_request_dsc desc;
+	struct page_group_response_dsc resp;
+	struct intel_context *ctx;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	u64 address;
+	int ret;
+
+	DRM_ERROR("PRQ updated, head 0x%08x, tail 0x%08x\n",
+		  I915_READ(SVM_PRQ_HEAD), I915_READ(SVM_PRQ_TAIL));
+	prq_read_descriptor(dev, &desc);
+	DRM_ERROR("page fault on addr 0x%016llx, PASID %d, srr %d\n",
+		  (u64)(desc.addr << PAGE_SHIFT), desc.pasid, desc.srr);
+
+	spin_lock(&dev_priv->svm.lock);
+	ctx = dev_priv->svm.pasid_ctx[desc.pasid];
+	tsk = ctx->tsk;
+	mm = tsk->mm;
+	address = desc.addr << PAGE_SHIFT;
+	ringbuf = ctx->engine[RCS].ringbuf;
+	spin_unlock(&dev_priv->svm.lock);
+
+	down_read_trylock(&mm->mmap_sem);
+	vma = find_extend_vma(mm, address);
+	if (!vma || address < vma->vm_start) {
+		DRM_ERROR("bad VMA or address out of range\n");
+		gpu_mm_segv(tsk, address, SEGV_MAPERR);
+		goto out_unlock; /* need to kill process */
+	}
+
+	ret = handle_mm_fault(mm, vma, address,
+			      desc.wr_req ? FAULT_FLAG_WRITE : 0);
+	if (ret & VM_FAULT_ERROR) {
+		gpu_mm_segv(tsk, address, SEGV_ACCERR); /* ? */
+		goto out_unlock;
+	}
+
+	if (ret & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
+
+	if (desc.srr)
+		resp.dsc_type = PAGE_STREAM_RESP_DSC;
+	else
+		resp.dsc_type = PAGE_GRP_RESP_DSC;
+	resp.pasid = desc.pasid;
+	resp.pasid_present = 1;
+	resp.requestor_id = PCI_DEVID(0, PCI_DEVFN(2,0));
+	resp.resp_code = RESP_CODE_SUCCESS;
+	resp.prg_index = desc.prg_index;
+	resp.private = desc.private;
+	ivq_write_resp_descriptor(dev, &resp);
+out_unlock:
+	up_read(&mm->mmap_sem);
+
+	/* FIXME: wait for page response to be serviced */
+
+	/* FIXME: queue context for re-submit */
+	/* execlists_context_queue(req); */
+}
+
+/*
+ * SVM with Intel GPUs
+ *
+ * On BDW and newer GPUs, the GPU can use x86 page tables to convert linear
+ * addresses to physical addresses for GPU activity.  This allows applications
+ * to simply malloc() buffers for use on the GPU (whether command buffers,
+ * state, source data, or destination data).
+ *
+ * We accomplish this sharing one of two ways for a given mm_struct:
+ *   1) copy the page tables and manage the mm_struct with mmu_notifier
+ *      callbacks
+ *   2) use the current->mm page tables directly, synchronizing with
+ *      TLB shootdowns and updates on the host with the mmu_notifiers
+ *
+ * In case (1) we can delay page table updates until GPU activity on a given
+ * page or range is complete to make debug on the application side easier.
+ * TLB shootdowns will occur only on the GPU after any outstanding activity
+ * has completed and page tables have been updated.
+ *
+ * In case (2) we must not block, since threads on the CPU will be concurrently
+ * accessing the data, and we don't want to delay them.  This may mean shooting
+ * down GPU activity currently in progress, either by preempting the current
+ * batch or by removing commands from a ringbuffer where we previously
+ * queued them.
+ *
+ * Note that in either of these cases, CPU threads changing the memory
+ * map out from under the GPU is highly likely to be a programming error,
+ * just as it is when a CPU thread modifies the mapping of a virtual
+ * address space in use by another thread.
+ *
+ * We track the mmu notifier bits using struct intel_mm_struct, which has
+ * a one to many relationship with intel_context (i.e. multiple contexts
+ * can exist and share the same address space).  Thus we refcount the
+ * struct and only destroy it when the final reference is dropped.
+ */
+
+/* Make sure GPU writes can't hit the mm that's about to go away */
+static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
+						   notifier);
+	struct drm_i915_private *dev_priv = ims->dev_priv;
+	struct drm_device *dev = dev_priv->dev;
+	struct intel_context *ctx;
+
+	/*
+	 * Wait for any outstanding activity and unbind the mm.  Since
+	 * each context has its own ring, we can simply wait for the ring
+	 * to idle before invalidating the PASID and flushing the TLB.
+	 */
+	mutex_lock(&dev->struct_mutex);
+	list_for_each_entry(ctx, &ims->context_list, mm_list) {
+		intel_ring_idle(ctx->engine[RCS].ringbuf->ring);
+	}
+
+	intel_iommu_tlb_flush(dev_priv->dev);
+	mutex_unlock(&dev->struct_mutex);
+}
+
+static void intel_flush_page_locked(struct drm_device *dev, int pasid,
+				    unsigned long address)
+{
+	struct ext_iotlb_inv_dsc dsc = { 0 };
+
+	dsc.dsc_type = EXT_IOTLB_INV_DSC;
+	dsc.g = EXT_IOTLB_INV_G_PASID_PAGE_SELECT;
+	dsc.pasid = pasid;
+	dsc.ih = 0;
+	dsc.addr = address;
+	dsc.am = 1;
+	ivq_write_ext_iotlb_inv_descriptor(dev, &dsc);
+}
+
+static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
+			     unsigned long address, pte_t pte)
+{
+	struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
+						   notifier);
+	struct drm_i915_private *dev_priv = ims->dev_priv;
+	struct drm_device *dev = dev_priv->dev;
+
+	struct intel_context *ctx;
+
+	mutex_lock(&dev->struct_mutex);
+	list_for_each_entry(ctx, &ims->context_list, mm_list)
+		intel_flush_page_locked(dev, ctx->pasid, address);
+	mutex_unlock(&dev->struct_mutex);
+}
+
+static void intel_invalidate_page(struct mmu_notifier *mn,
+				  struct mm_struct *mm,
+				  unsigned long address)
+{
+	struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
+						   notifier);
+	struct drm_i915_private *dev_priv = ims->dev_priv;
+	struct drm_device *dev = dev_priv->dev;
+	struct intel_context *ctx;
+
+	mutex_lock(&dev->struct_mutex);
+	list_for_each_entry(ctx, &ims->context_list, mm_list)
+		intel_flush_page_locked(dev, ctx->pasid, address);
+	mutex_unlock(&dev->struct_mutex);
+}
+
+/* Need to unmap this range and make sure it doesn't get re-faulted */
+static void intel_invalidate_range_start(struct mmu_notifier *mn,
+					 struct mm_struct *mm,
+					 unsigned long start, unsigned long end)
+{
+	struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
+						   notifier);
+	struct drm_i915_private *dev_priv = ims->dev_priv;
+	struct drm_device *dev = dev_priv->dev;
+
+	/* FIXME: invalidate page only */
+	intel_iommu_tlb_flush(dev);
+}
+
+/* Pages have been freed at this point */
+static void intel_invalidate_range_end(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start, unsigned long end)
+{
+	struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
+						   notifier);
+	struct drm_i915_private *dev_priv = ims->dev_priv;
+	struct drm_device *dev = dev_priv->dev;
+
+	/* FIXME: invalidate page only */
+	intel_iommu_tlb_flush(dev);
+}
+
+static const struct mmu_notifier_ops intel_mmuops = {
+	.release = intel_mm_release,
+	/* no clear_flush_young, we just share the x86 bits */
+	/* no test_young, we just share the x86 bits */
+	.change_pte = intel_change_pte,
+	.invalidate_page = intel_invalidate_page,
+	.invalidate_range_start = intel_invalidate_range_start,
+	.invalidate_range_end = intel_invalidate_range_end,
+};
+
+struct intel_mm_struct *intel_bind_mm(struct drm_device *dev,
+				      struct intel_context *ctx)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_mm_struct *ims;
+	struct mmu_notifier *mn;
+	int ret;
+
+	WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
+
+	mn = mmu_find_ops(current->mm, &intel_mmuops);
+	if (mn) {
+		ims = container_of(mn, struct intel_mm_struct, notifier);
+		kref_get(&ims->kref);
+		goto out;
+	}
+
+	ims = kzalloc(sizeof(*ims), GFP_KERNEL);
+	if (!ims) {
+		ret = -ENOMEM;
+		goto error;
+	}
+	INIT_LIST_HEAD(&ims->context_list);
+
+	ims->notifier.ops = &intel_mmuops;
+
+	ret = mmu_notifier_register(&ims->notifier, current->mm);
+	if (ret)
+		goto error;
+
+	kref_init(&ims->kref);
+	ims->dev_priv = dev->dev_private;
+
+out:
+	list_add(&ctx->mm_list, &ims->context_list);
+	return ims;
+error:
+	kfree(ims);
+	return ERR_PTR(ret);
+}
+
+static void intel_mm_free(struct kref *ims_ref)
+{
+	struct intel_mm_struct *ims =
+		container_of(ims_ref, struct intel_mm_struct, kref);
+
+	mmu_notifier_unregister(&ims->notifier, current->mm);
+	kfree(ims);
+}
+
+void intel_unbind_mm(struct intel_context *ctx)
+{
+	struct drm_i915_private *dev_priv = ctx->ims->dev_priv;
+
+	WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
+
+	list_del(&ctx->mm_list);
+	kref_put(&ctx->ims->kref, intel_mm_free);
+
+	return;
+}
+
+int intel_exec_mm_ioctl(struct drm_device *dev, void *data,
+			struct drm_file *file)
+{
+//	struct drm_i915_exec_mm *exec_mm = data;
+//	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	/* Load new context into context reg */
+	return 0;
+}
+
+/*
+ * The PASID table has 32 entries in the current config, rotate through
+ * them as needed.
+ */
+int intel_alloc_pasid(struct drm_device *dev, struct intel_context *ctx)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct pasid_table_entry *table;
+	int i;
+
+	WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
+
+	spin_lock(&dev_priv->svm.lock);
+	table = dev_priv->svm.pasid_table;
+
+	for (i = 0; i < PASID_COUNT; i++) {
+		if (!table[i].present)
+			goto found;
+	}
+
+	spin_unlock(&dev_priv->svm.lock);
+	return -1;
+
+found:
+	table[i].pml4 = __pa(current->mm->pgd) >> PAGE_SHIFT;
+	table[i].present = 1;
+
+	ctx->pasid = i;
+	dev_priv->svm.pasid_ctx[ctx->pasid] = NULL;
+	spin_unlock(&dev_priv->svm.lock);
+
+	intel_iommu_tlb_flush(dev);
+
+	return 0;
+}
+
+void intel_free_pasid(struct drm_device *dev, struct intel_context *ctx)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct pasid_table_entry *table;
+
+	WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
+
+	if (ctx->pasid >= PASID_COUNT)
+		return;
+
+	spin_lock(&dev_priv->svm.lock);
+	table = dev_priv->svm.pasid_table;
+	memset(&table[ctx->pasid], 0, sizeof(struct pasid_table_entry));
+	dev_priv->svm.pasid_ctx[ctx->pasid] = NULL;
+	ctx->pasid = -1;
+	spin_unlock(&dev_priv->svm.lock);
+
+	intel_iommu_tlb_flush(dev);
+}
+
+/*
+ * Each root table entry is 16 bytes wide.  In legacy mode, only
+ * the lower 64 bits are used:
+ *   Bits 38:12: context table pointer
+ *   Bit 0: present
+ *   all other bits reserved
+ * In extended mode (what we use for SVM):
+ *   Bits 102:76: upper context table pointer
+ *   Bit 64: upper present
+ *   Bits 38:12: lower context table pointer
+ *   Bit 0: lower present
+ *   all other bits reserved
+ *
+ * The context entries are 128 bit in legacy mode:
+ *   Bits 87:72: Domain ID
+ *   Bits 70:67: Available
+ *   Bits 66:64: Address width
+ *   Bits 38:12: Page table pointer
+ *   Bits 3:2: Translation type
+ *     00 - only untranslated DMA requests go through this table
+ *          translated and translation requests are blocked
+ *     01 - untranslated, translated, and translation requests supported
+ *     10 - untranslated requests are treated as pass through (HPA == GPA),
+ *          translated DMA requests and translation requests are blocked
+ *     11 - reserved
+ *   Bit 1: fault disable
+ *   Bit 0: Present
+ * and 256 bit in extended:
+ *   Bits 230:204: PASID state table pointer
+ *   Bits 166:140: PASID table pointer
+ *   Bits 131:128: PASID table size
+ *   Bits 127:96: Page table attribute (PAT)
+ *   Bit 92: SL64KPE
+ *   Bit 91: SLEE
+ *   Bit 90: ERE
+ *   Bit 89: SRE
+ *   Bit 88: SMEP
+ *   Bits 87:72: Domain ID
+ *   Bit 71: Extended memory type enable
+ *   Bit 70: cache disable (CD)
+ *   Bit 69: write protect (WP)
+ *   Bit 68: no execute enable (NXE)
+ *   Bit 67: page global enable (PGE)
+ *   Bits 66:64: address width
+ *   Bits 38:12: 2nd level (VT-d) page table pointer
+ *   Bit 11: PASID enable
+ *   Bit 10: Nesting enable
+ *   Bit 9: Page Request enable
+ *   Bit 8: Lazy-Invalidate enable
+ *   Bits 7:5: Extended Memory Type (VT-d)
+ *   Bits 4:2: Translation type
+ *     000 - Only Untranslated DMA requests are translated through this page
+ *           table. Translated DMA requests and Translation Requests are
+ *           blocked.  Untranslated requests-without-PASID are remapped using
+ *           the second-level page-table referenced through SLPTPTR field.
+ *           If PASIDE field is Set, Untranslated requests-with-PASID are
+ *           remapped using the PASID Table referenced through PASIDPTPTR
+ *           field. If PASIDE field is Clear, Untranslated requests-with-PASID
+ *           are blocked.  Translation requests (with or without PASID), and
+ *           Translated Requests are blocked.
+ *     001 - Un-translated and Translation requests without PASID supported
+ *           (and with PASID supported, if PASID Enable Set); Translate
+ *           requests bypass address translation.  Untranslated
+ *           requests-without-PASID and Translation requests-without-PASID are
+ *           remapped using the second level page-table referenced through
+ *           SLPTPTR field. If PASIDE field is Set, Untranslated
+ *           requests-with-PASID and Translation requests-with-PASID are
+ *           remapped using the PASID Table referenced through PASIDPTPTR
+ *           field. If PASIDE field is Clear, Untranslated requests-with-PASID,
+ *           and Translation requests-with-PASID, are blocked. Translated
+ *           requests bypass address translation.
+ *     010 - If Pass-through Supported (GT supports pass-through),
+ *           Un-translated requests without PASID bypass address translation;
+ *           All other requests (with or without PASID) blocked. Untranslated
+ *           requests-without-PASID bypass address translation and are
+ *           processed as passthrough. SLPTPTR field is ignored by hardware.
+ *           Untranslated requests-with-PASID, Translation requests (with or
+ *           without PASID), and Translated requests are blocked.
+ *     011 - Reserved.
+ *     100 - Un-translated requests without PASID bypass address translation;
+ *           Un-translated requests with PASID supported, if PASID Enable Set;
+ *           All other requests blocked. Untranslated requests-without-PASID
+ *           bypass address translation and are processed as passthrough.
+ *           SLPTPTR field is ignored by hardware. Untranslated
+ *           requests-with-PASID are remapped using the PASID Table referenced
+ *           through PASIDPTPTR field. Translation requests (with or without
+ *           PASID) and Translated requests are blocked.
+ *     101 - Un-translated and Translation requests without PASID bypass
+ *           address translation; Un-translated and Translation requests with
+ *           PASID supported, if PASID Enable Set; Translated requests bypass
+ *           address translation.  Untranslated requests-without-PASID bypass
+ *           address translation and are processed as passthrough. SLPTPTR
+ *           field is ignored by hardware.  Translation requests-without-PASID
+ *           are responded with Untranslated access only bit Set (U=1) along
+ *           with read and write permissions (R=W=1). SLPTPTR field is ignored
+ *           by hardware. Untranslated requests-with-PASID, and Translation
+ *           requests-with-PASID are remapped using the PASID Table referenced
+ *           through PASIDPTPTR field.  Translated requests bypass address
+ *           translation.
+ *     110 - Un-translated requests without PASID are blocked; Un-translated
+ *           requests with PASID supported, if PASID Enable Set; All other
+ *           requests blocked – Not applicable to GFX, GT should treat this as
+ *           reserved.
+ *     111 - Un-translated and Translation requests without PASID blocked;
+ *           Un-translated and Translation requests with PASID supported, if
+ *           PASID Enable Set; Translated requests bypass address translation.
+ *           Note: Not applicable to GFX, GT should treat this as reserved.
+ *   Bit 1: Fault disable
+ *   Bit 0: Present
+ *
+ * Page walks for graphics addresses can go through one or two levels of
+ * translation, depending on whether VT-d is enabled.
+ *
+ * If we're in driver mode (currently the only supported mode), we always
+ * use a single level of translation, meaning the second level page table
+ * pointer (if present) is ignored.
+ *
+ * The full walk starts at the root table, which indexes into the upper
+ * and lower context tables.  Those tables point to PASID mapping and state
+ * tables and potentially a second level page table for VT-d (which, as noted
+ * above, is unused currently).  The PASID mapping table points to a PML4
+ * (x86 compatible) page table, while the state table indicates other
+ * information about the PASID involved in the request, which ultimately comes
+ * from the execlist port submission of the context descriptor.
+ *
+ * To enable a shared CPU/GPU address space, we can use a couple of different
+ * translation types, either 101 or 01 w/o nesting.  The main requirement
+ * is that requests with PASID are translated through the page tables provided,
+ * potentially with nesting if we're running in a VT-d context (which we
+ * don't currently support).
+ */
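+
+/*
+ * Putting the above together for the gfx device (bus 0, devfn(2, 0)):
+ * root_table[0] -> lower context table -> context[16] ->
+ * pasid_table[pasid].pml4 -> the task's x86 page tables.
+ */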
+#define CONTEXT_OFFSET (PAGE_SIZE * 1)
+#define PASID_OFFSET (PAGE_SIZE * 2)
+#define PASID_STATE_OFFSET (PAGE_SIZE * 3)
+#define PRQ_OFFSET (PAGE_SIZE * 4)
+#define IVQ_OFFSET (PAGE_SIZE * 5)
+static void intel_init_svm_root_table(struct drm_device *dev,
+				      drm_dma_handle_t *tables)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct extended_root_table_entry *root_table;
+	struct extended_context_table_entry *context;
+	struct pasid_table_entry *pasid_table;
+	struct pasid_state_table_entry *pasid_state_table;
+	u64 *tmp;
+
+	root_table = tables->vaddr;
+	context = tables->vaddr + CONTEXT_OFFSET;
+	pasid_table = tables->vaddr + PASID_OFFSET;
+	pasid_state_table = tables->vaddr + PASID_STATE_OFFSET;
+
+	DRM_ERROR("programmed PASID table, vaddr %p, busaddr 0x%16llx\n",
+		  pasid_table, tables->busaddr + PASID_OFFSET);
+
+	/* Context entry for gfx device */
+	context[16].pat = 0x66666666;
+	context[16].ere = 1;
+	context[16].sre = 1;
+	context[16].smep = 1;
+	context[16].domain_id = 1;
+	context[16].addr_width = AGAW_48; /* full x86 walk */
+	context[16].pasid_en = 1;
+	context[16].nesting_en = 0; /* not yet */
+	context[16].pg_req_en = 1;
+	context[16].lazy_invalidate_en = 1;
+	context[16].ext_mem_type = EXTENDED_MTYPE_WB;
+	context[16].translation_type = EXTENDED_TTYPE_UT_TR_PASID_PT;
+	context[16].fault_disable = 0;
+	context[16].present = 1;
+	context[16].pasid_state_table_addr = (tables->busaddr + PASID_STATE_OFFSET) >> PAGE_SHIFT;
+	context[16].pasid_table_addr = (tables->busaddr + PASID_OFFSET) >>
+		PAGE_SHIFT;
+	context[16].pasid_table_size = 0; /* 2^(x+5) = 32 entries */
+
+	tmp = (u64 *)&context[16];
+	DRM_ERROR("root entry: 0x%016llx%016llx\n", tmp[1], tmp[0]);
+
+	DRM_ERROR("programmed context table, vaddr %p, busaddr 0x%16llx\n",
+		  context, tables->busaddr + CONTEXT_OFFSET);
+
+	/* Root table */
+	root_table[0].lo_ctx_addr = (tables->busaddr + CONTEXT_OFFSET) >>
+		PAGE_SHIFT;
+	root_table[0].lo_present = 1;
+	root_table[0].hi_present = 0;
+
+	tmp = (u64 *)&root_table[0];
+	DRM_ERROR("root entry: 0x%016llx%016llx\n", tmp[1], tmp[0]);
+
+	dev_priv->svm.root_table = root_table;
+	dev_priv->svm.context = context;
+	dev_priv->svm.pasid_table = pasid_table;
+	dev_priv->svm.pasid_state_table = pasid_state_table;
+	dev_priv->svm.prq_ring = tables->vaddr + PRQ_OFFSET;
+	dev_priv->svm.ivq_ring = tables->vaddr + IVQ_OFFSET;
+
+	/* Enable the page request queue */
+	I915_WRITE64(SVM_PRQA, tables->busaddr + PRQ_OFFSET);
+	I915_WRITE(SVM_PRQ_HEAD, 0);
+	I915_WRITE(SVM_PRQ_TAIL, 0);
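+	/*
+	 * Writing 0 here leaves PRE_IM clear, so page request events are
+	 * unmasked and can raise the GEN8_DE_MISC_SVM_PRQ interrupt
+	 * installed in i915_irq.c.
+	 */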
+	I915_WRITE(SVM_PRECTL, 0);
+
+	/* Set up the invalidation request queue */
+	I915_WRITE64(SVM_IQA, tables->busaddr + IVQ_OFFSET);
+	I915_WRITE(SVM_IVQ_HEAD, 0);
+	I915_WRITE(SVM_IVQ_TAIL, 0);
+	I915_WRITE(SVM_IECTL, 0);
+
+	I915_WRITE(SVM_GCMD, GCMD_QIE);
+	if (wait_for(I915_READ(SVM_GSTS) & GSTS_QIES, 500))
+		DRM_ERROR("timed out waiting for queued invalidation enable\n");
+
+	/* All set, program the root */
+	I915_WRITE(SVM_RTADDR, tables->busaddr | SVM_RTT_TYPE_EXT);
+
+	I915_WRITE(SVM_GCMD, GCMD_SRTP);
+	if (wait_for(I915_READ(SVM_GSTS) & GSTS_RTPS, 500))
+		DRM_ERROR("timed out waiting for root table to load\n");
+
+	DRM_ERROR("programmed SVM root, vaddr %p, busaddr 0x%16llx\n",
+		  tables->vaddr, tables->busaddr);
+
+	intel_iommu_tlb_flush(dev);
+}
+
+/*
+ * Probe for SVM capability.  If found:
+ *  - try to switch to driver mode
+ *  - set up root PASID table
+ *  - enable page fault and error handling interrupts
+ *  - allow SVM ioctls
+ */
+void intel_init_svm(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	drm_dma_handle_t *tables;
+	u32 dev_mode;
+	int num_tables = 6;
+
+	dev_mode = I915_READ(BDW_SVM_DEV_MODE_CNFG);
+	I915_WRITE(BDW_SVM_DEV_MODE_CNFG, dev_mode | BDW_SVM_MODE_DRIVER);
+	dev_mode = I915_READ(BDW_SVM_DEV_MODE_CNFG);
+#if defined(CONFIG_INTEL_IOMMU) || defined(IOMMU_SUPPORT)
+#error must disable IOMMU support
+#endif
+	if (!(dev_mode & BDW_SVM_MODE_DRIVER)) {
+		DRM_ERROR("driver mode not available, disabling SVM\n");
+		goto err;
+	}
+
+	tables = drm_pci_alloc(dev, PAGE_SIZE*num_tables, PAGE_SIZE);
+	if (!tables) {
+		DRM_ERROR("table alloc failed, disabling SVM\n");
+		goto err;
+	}
+
+	memset(tables->vaddr, 0, PAGE_SIZE*num_tables);
+
+	spin_lock_init(&dev_priv->svm.lock);
+
+	intel_init_svm_root_table(dev, tables);
+
+#if 0
+	I915_WRITE(SVM_GCMD, GCMD_TE);
+	if (wait_for(I915_READ(SVM_GSTS) & GSTS_TES, 500))
+		DRM_ERROR("timed out waiting for translation enable\n");
+#endif
+	INIT_WORK(&dev_priv->svm.work, intel_gpu_fault_work);
+
+	DRM_ERROR("SVM driver mode enabled\n");
+	dev_priv->svm.svm_available = true;
+	return;
+
+err:
+	dev_priv->svm.svm_available = false;
+	return;
+}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 40cbba4..1450491 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -217,6 +217,7 @@ enum {
 	FAULT_AND_STREAM,
 	FAULT_AND_CONTINUE /* Unsupported */
 };
+#define GEN8_CTX_FAULT_SHIFT 6
 #define GEN8_CTX_ID_SHIFT 32
 #define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
 
@@ -289,12 +290,21 @@ uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
 	WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
 
 	desc = GEN8_CTX_VALID;
-	desc |= GEN8_CTX_ADDRESSING_MODE(dev) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
-	if (IS_GEN8(ctx_obj->base.dev))
-		desc |= GEN8_CTX_L3LLC_COHERENT;
-	desc |= GEN8_CTX_PRIVILEGE;
-	desc |= lrca;
-	desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
+	if (ctx->is_svm) {
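+		/*
+		 * SVM contexts use the advanced context format: addresses
+		 * are translated through the PASID's x86 page tables, and
+		 * faults are reported in fault-and-stream mode so they can
+		 * be serviced from the page request queue.
+		 */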
+		desc |= ADVANCED_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT;
+		desc |= FAULT_AND_STREAM << GEN8_CTX_FAULT_SHIFT;
+		desc |= lrca;
+		desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
+	} else {
+		desc |= GEN8_CTX_ADDRESSING_MODE(dev) <<
+			GEN8_CTX_ADDRESSING_MODE_SHIFT;
+		if (IS_GEN8(ctx_obj->base.dev))
+			desc |= GEN8_CTX_L3LLC_COHERENT;
+		desc |= GEN8_CTX_PRIVILEGE;
+		desc |= lrca;
+		desc |= (u64)intel_execlists_ctx_id(ctx_obj) <<
+			GEN8_CTX_ID_SHIFT;
+	}
 
 	/* TODO: WaDisableLiteRestore when we start using semaphore
 	 * signalling between Command Streamers */
@@ -545,7 +555,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
 		   _MASKED_FIELD(0x07 << 8, ((u32)ring->next_context_status_buffer & 0x07) << 8));
 }
 
-static int execlists_context_queue(struct drm_i915_gem_request *request)
+int execlists_context_queue(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *ring = request->ring;
 	struct drm_i915_gem_request *cursor;
@@ -2273,31 +2283,40 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 	reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
 	reg_state[CTX_CTX_TIMESTAMP] = ring->mmio_base + 0x3a8;
 	reg_state[CTX_CTX_TIMESTAMP+1] = 0;
-	reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3);
-	reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3);
-	reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2);
-	reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2);
-	reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1);
-	reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1);
-	reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
-	reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
-
-	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
-		/* 64b PPGTT (48bit canonical)
-		 * PDP0_DESCRIPTOR contains the base address to PML4 and
-		 * other PDP Descriptors are ignored.
-		 */
-		ASSIGN_CTX_PML4(ppgtt, reg_state);
+
+	if (ctx->is_svm) {
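+		/*
+		 * For SVM contexts only the PDP0 slot is programmed, and
+		 * it carries the PASID rather than a page directory
+		 * address; translation comes from the PASID table set up
+		 * in i915_svm.c.
+		 */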
+		reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
+		reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
+		reg_state[CTX_PDP0_UDW+1] = 0;
+		reg_state[CTX_PDP0_LDW+1] = ctx->pasid;
 	} else {
-		/* 32b PPGTT
-		 * PDP*_DESCRIPTOR contains the base address of space supported.
-		 * With dynamic page allocation, PDPs may not be allocated at
-		 * this point. Point the unallocated PDPs to the scratch page
-		 */
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
-		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
+		reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3);
+		reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3);
+		reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2);
+		reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2);
+		reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1);
+		reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1);
+		reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
+		reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
+
+		if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
+			/* 64b PPGTT (48bit canonical)
+			 * PDP0_DESCRIPTOR contains the base address to PML4 and
+			 * other PDP Descriptors are ignored.
+			 */
+			ASSIGN_CTX_PML4(ppgtt, reg_state);
+		} else {
+			/* 32b PPGTT
+			 * PDP*_DESCRIPTOR contains the base address of space
+			 * supported. With dynamic page allocation, PDPs may
+			 * not be allocated at this point. Point the
+			 * unallocated PDPs to the scratch page
+			 */
+			ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
+			ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
+			ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
+			ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
+		}
 	}
 
 	if (ring->id == RCS) {
@@ -2327,6 +2346,12 @@ void intel_lr_context_free(struct intel_context *ctx)
 {
 	int i;
 
+	if (ctx->is_svm) {
+		intel_free_pasid(ctx->ims->dev_priv->dev, ctx);
+		intel_unbind_mm(ctx);
+		put_task_struct(ctx->tsk);
+	}
+
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state;
 
@@ -2480,6 +2505,37 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
 
 	}
 
+	if (ctx->is_svm) {
+		/* FIXME: just skip here, don't bail and trash the ctx */
+		if (ring->id != RCS) {
+			DRM_DEBUG_DRIVER("svm context only allowed on RCS\n");
+			ret = -EINVAL;
+			goto error_destroy_rbuf;
+		}
+
+		ret = intel_alloc_pasid(dev, ctx);
+		if (ret) {
+			DRM_ERROR("pasid alloc fail: %d\n", ret);
+			ret = -ENOSPC;
+			kfree(ring);
+			goto error_destroy_rbuf;
+		}
+
+		ctx->ims = intel_bind_mm(dev, ctx);
+		if (IS_ERR(ctx->ims)) {
+			intel_free_pasid(dev, ctx);
+			DRM_ERROR("bind mm call failed: %ld\n",
+				  PTR_ERR(ctx->ims));
+			ret = PTR_ERR(ctx->ims);
+			kfree(ring);
+			goto error_destroy_rbuf;
+		}
+
+		ctx->tsk = current;
+		get_task_struct(current);
+		ctx->ims->dev_priv->svm.pasid_ctx[ctx->pasid] = ctx;
+	}
+
 	ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf);
 	if (ret) {
 		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
@@ -2520,6 +2576,8 @@ error:
 	if (is_global_default_ctx)
 		intel_unpin_ringbuffer_obj(ringbuf);
 error_destroy_rbuf:
+	if (ctx->is_svm)
+		put_task_struct(current);
 	intel_destroy_ringbuffer_obj(ringbuf);
 error_free_rbuf:
 	kfree(ringbuf);
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 4cc54b3..dcaa65b 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -93,5 +93,6 @@ u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
 
 void intel_lrc_irq_handler(struct intel_engine_cs *ring);
 void intel_execlists_retire_requests(struct intel_engine_cs *ring);
+int execlists_context_queue(struct drm_i915_gem_request *request);
 
 #endif /* _INTEL_LRC_H_ */
-- 
1.9.1


