[Intel-gfx] [PATCH] drm/i915: Prevent negative relocation deltas from wrapping

Fri May 23 08:48:08 CEST 2014

From: Chris Wilson <chris at chris-wilson.co.uk>

This is pure evil. Userspace, I'm looking at you SNA, repacks batch
buffers on the fly after generation as they are being passed to the
kernel for execution. These batches also contain self-referenced
relocations as a single buffer encompasses the state commands, kernels,
vertices and sampler. During generation the buffers are placed at known
offsets within the full batch, and then the relocation deltas (as passed
to the kernel) are tweaked as the batch is repacked into a smaller buffer.
This means that userspace is passing negative relocations deltas, which
subsequently wrap to large values if the batch is at a low address. The
GPU hangs when it then tries to use the large value as a base for its
address offsets, rather than wrapping back to the real value (as one
would hope). As the GPU uses positive offsets from the base, we can
treat the relocation address as the minimum address read by the GPU.
For the upper bound, we trust that userspace will not read beyond the
end of the buffer.

So, how do we fix negative relocations from wrapping? We can either
check that every relocation looks valid when we write it, and then
position each object such that we prevent the offset wraparound, or we
just special-case the self-referential behaviour of SNA and force all
batches to be above 256k. Daniel prefers the latter approach.

This fixes a GPU hang when it tries to use an address (relocation +
offset) greater than the GTT size. The issue would occur quite easily
with full-ppgtt as each fd gets its own VM space, so low offsets would
often be handed out. However, with the rearrangement of the low GTT due
to capturing the BIOS framebuffer, it is already affecting kernels 3.15
onwards. I think only IVB+ is susceptible to this bug, but the workaround
should only kick in rarely, so it seems sensible to always apply it.

v3: Use a bias for batch buffers to prevent small negative delta relocations
from wrapping.

v4 from Daniel:
- s/BIAS/BATCH_OFFSET_BIAS/
- Extract eb_vma_misplaced/i915_vma_misplaced since the conditions
  were growing rather cumbersome.
- Add a comment to eb_get_batch explaining why we do this.
- Apply the batch offset bias everywhere but mention that we've only
  observed it on gen7 gpus.
- Drop PIN_OFFSET_FIX for now, that slipped in from a feature patch.

v5: Add static to eb_get_batch, spotted by 0-day tester.

Testcase: igt/gem_bad_reloc
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=78533
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk> (v3)
Cc: stable at vger.kernel.org
Signed-off-by: Daniel Vetter <daniel.vetter at ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.h            |  6 ++-
 drivers/gpu/drm/i915/i915_gem.c            | 46 +++++++++++++-----
 drivers/gpu/drm/i915/i915_gem_evict.c      |  9 ++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 76 ++++++++++++++++++++++--------
 drivers/gpu/drm/i915/i915_gem_gtt.c        |  4 +-
 5 files changed, 103 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ec5f6fb42ab3..388c028e223c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2189,10 +2189,12 @@ void i915_gem_vma_destroy(struct i915_vma *vma);
 #define PIN_MAPPABLE 0x1
 #define PIN_NONBLOCK 0x2
 #define PIN_GLOBAL 0x4
+#define PIN_OFFSET_BIAS 0x8
+#define PIN_OFFSET_MASK (~4095)
 int __must_check i915_gem_object_pin(struct drm_i915_gem_object *obj,
 				     struct i915_address_space *vm,
 				     uint32_t alignment,
-				     unsigned flags);
+				     uint64_t flags);
 int __must_check i915_vma_unbind(struct i915_vma *vma);
 int i915_gem_object_put_pages(struct drm_i915_gem_object *obj);
 void i915_gem_release_all_mmaps(struct drm_i915_private *dev_priv);
@@ -2445,6 +2447,8 @@ int __must_check i915_gem_evict_something(struct drm_device *dev,
 					  int min_size,
 					  unsigned alignment,
 					  unsigned cache_level,
+					  unsigned long start,
+					  unsigned long end,
 					  unsigned flags);
 int i915_gem_evict_vm(struct i915_address_space *vm, bool do_idle);
 int i915_gem_evict_everything(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b391f30f9985..3326770c9ed2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3326,12 +3326,14 @@ static struct i915_vma *
 i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
 			   struct i915_address_space *vm,
 			   unsigned alignment,
-			   unsigned flags)
+			   uint64_t flags)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 size, fence_size, fence_alignment, unfenced_alignment;
-	size_t gtt_max =
+	unsigned long start =
+		flags & PIN_OFFSET_BIAS ? flags & PIN_OFFSET_MASK : 0;
+	unsigned long end =
 		flags & PIN_MAPPABLE ? dev_priv->gtt.mappable_end : vm->total;
 	struct i915_vma *vma;
 	int ret;
@@ -3360,11 +3362,11 @@ i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
 	/* If the object is bigger than the entire aperture, reject it early
 	 * before evicting everything in a vain attempt to find space.
 	 */
-	if (obj->base.size > gtt_max) {
-		DRM_DEBUG("Attempting to bind an object larger than the aperture: object=%zd > %s aperture=%zu\n",
+	if (obj->base.size > end) {
+		DRM_DEBUG("Attempting to bind an object larger than the aperture: object=%zd > %s aperture=%lu\n",
 			  obj->base.size,
 			  flags & PIN_MAPPABLE ? "mappable" : "total",
-			  gtt_max);
+			  end);
 		return ERR_PTR(-E2BIG);
 	}
 
@@ -3381,12 +3383,15 @@ i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
 search_free:
 	ret = drm_mm_insert_node_in_range_generic(&vm->mm, &vma->node,
 						  size, alignment,
-						  obj->cache_level, 0, gtt_max,
+						  obj->cache_level,
+						  start, end,
 						  DRM_MM_SEARCH_DEFAULT,
 						  DRM_MM_CREATE_DEFAULT);
 	if (ret) {
 		ret = i915_gem_evict_something(dev, vm, size, alignment,
-					       obj->cache_level, flags);
+					       obj->cache_level,
+					       start, end,
+					       flags);
 		if (ret == 0)
 			goto search_free;
 
@@ -3946,11 +3951,30 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 	return ret;
 }
 
+static bool
+i915_vma_misplaced(struct i915_vma *vma, uint32_t alignment, uint64_t flags)
+{
+	struct drm_i915_gem_object *obj = vma->obj;
+
+	if (alignment &&
+	    vma->node.start & (alignment - 1))
+		return true;
+
+	if (flags & PIN_MAPPABLE && !obj->map_and_fenceable)
+		return true;
+
+	if (flags & PIN_OFFSET_BIAS &&
+	    vma->node.start < (flags & PIN_OFFSET_MASK))
+		return true;
+
+	return false;
+}
+
 int
 i915_gem_object_pin(struct drm_i915_gem_object *obj,
 		    struct i915_address_space *vm,
 		    uint32_t alignment,
-		    unsigned flags)
+		    uint64_t flags)
 {
 	struct i915_vma *vma;
 	int ret;
@@ -3963,15 +3987,13 @@ i915_gem_object_pin(struct drm_i915_gem_object *obj,
 		if (WARN_ON(vma->pin_count == DRM_I915_GEM_OBJECT_MAX_PIN_COUNT))
 			return -EBUSY;
 
-		if ((alignment &&
-		     vma->node.start & (alignment - 1)) ||
-		    (flags & PIN_MAPPABLE && !obj->map_and_fenceable)) {
+		if (i915_vma_misplaced(vma, alignment, flags)) {
 			WARN(vma->pin_count,
 			     "bo is already pinned with incorrect alignment:"
 			     " offset=%lx, req.alignment=%x, req.map_and_fenceable=%d,"
 			     " obj->map_and_fenceable=%d\n",
 			     i915_gem_obj_offset(obj, vm), alignment,
-			     flags & PIN_MAPPABLE,
+			     !!(flags & PIN_MAPPABLE),
 			     obj->map_and_fenceable);
 			ret = i915_vma_unbind(vma);
 			if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index 75fca63dc8c1..bbf4b12d842e 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -68,9 +68,9 @@ mark_free(struct i915_vma *vma, struct list_head *unwind)
 int
 i915_gem_evict_something(struct drm_device *dev, struct i915_address_space *vm,
 			 int min_size, unsigned alignment, unsigned cache_level,
+			 unsigned long start, unsigned long end,
 			 unsigned flags)
 {
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct list_head eviction_list, unwind_list;
 	struct i915_vma *vma;
 	int ret = 0;
@@ -102,11 +102,10 @@ i915_gem_evict_something(struct drm_device *dev, struct i915_address_space *vm,
 	 */
 
 	INIT_LIST_HEAD(&unwind_list);
-	if (flags & PIN_MAPPABLE) {
-		BUG_ON(!i915_is_ggtt(vm));
+	if (start != 0 || end != vm->total) {
 		drm_mm_init_scan_with_range(&vm->mm, min_size,
-					    alignment, cache_level, 0,
-					    dev_priv->gtt.mappable_end);
+					    alignment, cache_level,
+					    start, end);
 	} else
 		drm_mm_init_scan(&vm->mm, min_size, alignment, cache_level);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 2c9d9cbaf653..6651195edcad 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -35,6 +35,9 @@
 
 #define  __EXEC_OBJECT_HAS_PIN (1<<31)
 #define  __EXEC_OBJECT_HAS_FENCE (1<<30)
+#define  __EXEC_OBJECT_NEEDS_BIAS (1<<28)
+
+#define BATCH_OFFSET_BIAS (256*1024)
 
 struct eb_vmas {
 	struct list_head vmas;
@@ -545,7 +548,7 @@ i915_gem_execbuffer_reserve_vma(struct i915_vma *vma,
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
 	bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
 	bool need_fence;
-	unsigned flags;
+	uint64_t flags;
 	int ret;
 
 	flags = 0;
@@ -559,6 +562,8 @@ i915_gem_execbuffer_reserve_vma(struct i915_vma *vma,
 
 	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
 		flags |= PIN_GLOBAL;
+	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS)
+		flags |= BATCH_OFFSET_BIAS | PIN_OFFSET_BIAS;
 
 	ret = i915_gem_object_pin(obj, vma->vm, entry->alignment, flags);
 	if (ret)
@@ -592,6 +597,36 @@ i915_gem_execbuffer_reserve_vma(struct i915_vma *vma,
 	return 0;
 }
 
+static bool
+eb_vma_misplaced(struct i915_vma *vma, bool has_fenced_gpu_access)
+{
+	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
+	struct drm_i915_gem_object *obj = vma->obj;
+	bool need_fence, need_mappable;
+
+	need_fence =
+		has_fenced_gpu_access &&
+		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
+		obj->tiling_mode != I915_TILING_NONE;
+	need_mappable = need_fence || need_reloc_mappable(vma);
+
+	WARN_ON((need_mappable || need_fence) &&
+	       !i915_is_ggtt(vma->vm));
+
+	if (entry->alignment &&
+	    vma->node.start & (entry->alignment - 1))
+		return true;
+
+	if (need_mappable && !obj->map_and_fenceable)
+		return true;
+
+	if (entry->flags & __EXEC_OBJECT_NEEDS_BIAS &&
+	    vma->node.start < BATCH_OFFSET_BIAS)
+		return true;
+
+	return false;
+}
+
 static int
 i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 			    struct list_head *vmas,
@@ -653,26 +688,10 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 
 		/* Unbind any ill-fitting objects or pin. */
 		list_for_each_entry(vma, vmas, exec_list) {
-			struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
-			bool need_fence, need_mappable;
-
-			obj = vma->obj;
-
 			if (!drm_mm_node_allocated(&vma->node))
 				continue;
 
-			need_fence =
-				has_fenced_gpu_access &&
-				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
-				obj->tiling_mode != I915_TILING_NONE;
-			need_mappable = need_fence || need_reloc_mappable(vma);
-
-			WARN_ON((need_mappable || need_fence) &&
-			       !i915_is_ggtt(vma->vm));
-
-			if ((entry->alignment &&
-			     vma->node.start & (entry->alignment - 1)) ||
-			    (need_mappable && !obj->map_and_fenceable))
+			if (eb_vma_misplaced(vma, has_fenced_gpu_access))
 				ret = i915_vma_unbind(vma);
 			else
 				ret = i915_gem_execbuffer_reserve_vma(vma, ring, need_relocs);
@@ -999,6 +1018,25 @@ i915_reset_gen7_sol_offsets(struct drm_device *dev,
 	return 0;
 }
 
+static struct drm_i915_gem_object *
+eb_get_batch(struct eb_vmas *eb)
+{
+	struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_list);
+
+	/*
+	 * SNA is doing fancy tricks with compressing batch buffers, which leads
+	 * to negative relocation deltas. Usually that works out ok since the
+	 * relocate address is still positive, except when the batch is placed
+	 * very low in the GTT. Ensure this doesn't happen.
+	 *
+	 * Note that actual hangs have only been observed on gen7, but for
+	 * paranoia do it everywhere.
+	 */
+	vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
+
+	return vma->obj;
+}
+
 static int
 i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		       struct drm_file *file,
@@ -1153,7 +1191,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		goto err;
 
 	/* take note of the batch buffer before we might reorder the lists */
-	batch_obj = list_entry(eb->vmas.prev, struct i915_vma, exec_list)->obj;
+	batch_obj = eb_get_batch(eb);
 
 	/* Move the objects en-masse into the GTT, evicting if necessary. */
 	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 154b0f8bb88d..5deb22864c52 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1089,7 +1089,9 @@ alloc:
 	if (ret == -ENOSPC && !retried) {
 		ret = i915_gem_evict_something(dev, &dev_priv->gtt.base,
 					       GEN6_PD_SIZE, GEN6_PD_ALIGN,
-					       I915_CACHE_NONE, 0);
+					       I915_CACHE_NONE,
+					       0, dev_priv->gtt.base.total,
+					       0);
 		if (ret)
 			return ret;
 
-- 
1.9.2