[Intel-gfx] [PATCH 02/11] drm/i915: Avoid forcing relocations through the mappable GTT or CPU
Imre Deak
imre.deak at intel.com
Mon Jan 14 20:21:27 CET 2013
On Tue, 2013-01-08 at 10:53 +0000, Chris Wilson wrote:
> If the object lies outside of the mappable GTT aperture, do not force it
> through the CPU domain for relocations, but simply flush the writes as
> we perform them and then queue a chipset flush.
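Just to make sure I read it right, the relocation write after this patch boils
down to the following three paths (a condensed sketch only; the domain
transitions, variable declarations and error handling are omitted, and the
helpers are the ones used in the patch):

	reloc->offset += obj->gtt_offset;
	if (use_cpu_reloc(obj)) {
		/* Cacheable or CPU-domain object: plain CPU write through a
		 * kmap of the backing page, as before. */
		vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
		kunmap_atomic(vaddr);
	} else if (reloc->offset < dev_priv->mm.gtt_mappable_end) {
		/* Uncached object inside the mappable aperture: WC write
		 * through the GTT, as before. */
		reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
						      reloc->offset & PAGE_MASK);
		iowrite32(reloc->delta, reloc_page + offset_in_page(reloc->offset));
		io_mapping_unmap_atomic(reloc_page);
	} else {
		/* New: uncached object outside the mappable aperture: CPU
		 * write through a kmap, clflush of the touched cacheline,
		 * plus a pending CPU write domain so that move_to_gpu()
		 * queues a chipset flush before the batch runs. */
		vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
		drm_clflush_virt_range(vaddr + offset_in_page(reloc->offset), 4);
		*(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
		drm_clflush_virt_range(vaddr + offset_in_page(reloc->offset), 4);
		kunmap_atomic(vaddr);
		obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
	}
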
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 87 ++++++++++++++++------------
> 1 file changed, 51 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 163bb52..daa5409 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -33,6 +33,9 @@
> #include "intel_drv.h"
> #include <linux/dma_remapping.h>
>
> +#define __EXEC_OBJECT_HAS_PIN (1<<31)
> +#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> +
> struct eb_objects {
> int and;
> struct hlist_head buckets[0];
> @@ -95,10 +98,16 @@ eb_destroy(struct eb_objects *eb)
> static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
> {
> return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> - !obj->map_and_fenceable ||
> obj->cache_level != I915_CACHE_NONE);
> }
>
> +static inline struct page *
> +gtt_offset_to_page(struct drm_i915_gem_object *obj, u32 offset)
> +{
> + offset -= obj->gtt_space->start;
> + return i915_gem_object_get_page(obj, offset >> PAGE_SHIFT);
> +}
> +
> static int
> i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> struct eb_objects *eb,
> @@ -182,22 +191,20 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> return -EFAULT;
>
> reloc->delta += target_offset;
> + reloc->offset += obj->gtt_offset;
> if (use_cpu_reloc(obj)) {
> - uint32_t page_offset = reloc->offset & ~PAGE_MASK;
> char *vaddr;
>
> - ret = i915_gem_object_set_to_cpu_domain(obj, 1);
> + ret = i915_gem_object_set_to_cpu_domain(obj, true);
> if (ret)
> return ret;
>
> - vaddr = kmap_atomic(i915_gem_object_get_page(obj,
> - reloc->offset >> PAGE_SHIFT));
> - *(uint32_t *)(vaddr + page_offset) = reloc->delta;
> + vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> + *(uint32_t *)(vaddr + offset_in_page(reloc->offset)) = reloc->delta;
> kunmap_atomic(vaddr);
> } else {
> struct drm_i915_private *dev_priv = dev->dev_private;
> - uint32_t __iomem *reloc_entry;
> - void __iomem *reloc_page;
> + unsigned page_offset;
>
> ret = i915_gem_object_set_to_gtt_domain(obj, true);
> if (ret)
> @@ -208,13 +215,28 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
> return ret;
>
> /* Map the page containing the relocation we're going to perform. */
> - reloc->offset += obj->gtt_offset;
> - reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> - reloc->offset & PAGE_MASK);
> - reloc_entry = (uint32_t __iomem *)
> - (reloc_page + (reloc->offset & ~PAGE_MASK));
> - iowrite32(reloc->delta, reloc_entry);
> - io_mapping_unmap_atomic(reloc_page);
> + page_offset = offset_in_page(reloc->offset);
> +
> + if (reloc->offset < dev_priv->mm.gtt_mappable_end) {
> + void __iomem *reloc_page;
> +
> + reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
> + reloc->offset & PAGE_MASK);
> + iowrite32(reloc->delta, reloc_page + page_offset);
> + io_mapping_unmap_atomic(reloc_page);
> + } else {
> + char *vaddr;
> +
> + vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));
> +
> + drm_clflush_virt_range(vaddr + page_offset, 4);
> + *(uint32_t *)(vaddr + page_offset) = reloc->delta;
> + drm_clflush_virt_range(vaddr + page_offset, 4);
I've already discussed this with Chris to some degree, but I still think the
first of the two cache flushes here is redundant.
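That is, for this branch I'd expect something along these lines to be enough
(just a sketch; it assumes the i915_gem_object_set_to_gtt_domain() call above
has already taken care of any dirty CPU cachelines for this object):

	vaddr = kmap_atomic(gtt_offset_to_page(obj, reloc->offset));

	/* Write the relocation and flush only the dirtied cacheline. */
	*(uint32_t *)(vaddr + page_offset) = reloc->delta;
	drm_clflush_virt_range(vaddr + page_offset, 4);

	kunmap_atomic(vaddr);

	obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
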
> +
> + kunmap_atomic(vaddr);
> +
> + obj->base.pending_write_domain |= I915_GEM_DOMAIN_CPU;
> + }
> }
>
> /* and update the user's relocation entry */
> @@ -312,16 +334,6 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
> return ret;
> }
>
> -#define __EXEC_OBJECT_HAS_PIN (1<<31)
> -#define __EXEC_OBJECT_HAS_FENCE (1<<30)
> -
> -static int
> -need_reloc_mappable(struct drm_i915_gem_object *obj)
> -{
> - struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> - return entry->relocation_count && !use_cpu_reloc(obj);
> -}
> -
> static int
> i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
> struct intel_ring_buffer *ring)
> @@ -329,16 +341,15 @@ i915_gem_execbuffer_reserve_object(struct drm_i915_gem_object *obj,
> struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
> struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> bool has_fenced_gpu_access = INTEL_INFO(ring->dev)->gen < 4;
> - bool need_fence, need_mappable;
> + bool need_fence;
> int ret;
>
> need_fence =
> has_fenced_gpu_access &&
> entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
> obj->tiling_mode != I915_TILING_NONE;
> - need_mappable = need_fence || need_reloc_mappable(obj);
>
> - ret = i915_gem_object_pin(obj, entry->alignment, need_mappable, false);
> + ret = i915_gem_object_pin(obj, entry->alignment, need_fence, false);
> if (ret)
> return ret;
>
> @@ -401,7 +412,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
> INIT_LIST_HEAD(&ordered_objects);
> while (!list_empty(objects)) {
> struct drm_i915_gem_exec_object2 *entry;
> - bool need_fence, need_mappable;
> + bool need_fence;
>
> obj = list_first_entry(objects,
> struct drm_i915_gem_object,
> @@ -412,9 +423,8 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
> has_fenced_gpu_access &&
> entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
> obj->tiling_mode != I915_TILING_NONE;
> - need_mappable = need_fence || need_reloc_mappable(obj);
>
> - if (need_mappable)
> + if (need_fence)
> list_move(&obj->exec_list, &ordered_objects);
> else
> list_move_tail(&obj->exec_list, &ordered_objects);
> @@ -444,7 +454,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
> /* Unbind any ill-fitting objects or pin. */
> list_for_each_entry(obj, objects, exec_list) {
> struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
> - bool need_fence, need_mappable;
> + bool need_fence;
>
> if (!obj->gtt_space)
> continue;
> @@ -453,10 +463,9 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
> has_fenced_gpu_access &&
> entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
> obj->tiling_mode != I915_TILING_NONE;
> - need_mappable = need_fence || need_reloc_mappable(obj);
>
> if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
> - (need_mappable && !obj->map_and_fenceable))
> + (need_fence && !obj->map_and_fenceable))
> ret = i915_gem_object_unbind(obj);
> else
> ret = i915_gem_execbuffer_reserve_object(obj, ring);
> @@ -603,10 +612,16 @@ i915_gem_execbuffer_move_to_gpu(struct intel_ring_buffer *ring,
> if (ret)
> return ret;
>
> + flush_domains |= obj->base.write_domain;
> +
> if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
> i915_gem_clflush_object(obj);
>
> - flush_domains |= obj->base.write_domain;
Moving the flush_domains |= obj->base.write_domain line above the clflush
looks like an unnecessary change; i915_gem_clflush_object() doesn't modify
obj->base.write_domain, so the result should be the same either way.
> + /* Used as an internal marker during relocation processing */
> + if (obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS) {
> + flush_domains |= obj->base.pending_write_domain & ~I915_GEM_GPU_DOMAINS;
> + obj->base.pending_write_domain &= I915_GEM_GPU_DOMAINS;
> + }
> }
>
> if (flush_domains & I915_GEM_DOMAIN_CPU)
> @@ -923,7 +938,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> }
>
> /* Set the pending read domains for the batch buffer to COMMAND */
> - if (batch_obj->base.pending_write_domain) {
> + if (batch_obj->base.pending_write_domain & I915_GEM_GPU_DOMAINS) {
> DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
> ret = -EINVAL;
> goto err;