[Intel-gfx] [PATCH 1/2] drm/i915: Use Write-Through caching for the display plane on Iris
Ville Syrjälä
ville.syrjala at linux.intel.com
Fri Aug 2 16:51:49 CEST 2013
On Thu, Aug 01, 2013 at 06:39:54PM +0100, Chris Wilson wrote:
> Haswell GT3e has the unique feature of supporting Write-Through caching
> of objects within the eLLC/LLC. The purpose of this is to enable the display
> plane to remain coherent whilst objects lie resident in the eLLC/LLC - so
> that we, in theory, get the best of both worlds, perfect display and fast
> access.
>
> However, we still need to be careful as the CPU does not see the WT when
> accessing the cache. In particular, this means that we need to flush the
> cache lines after writing to an object through the CPU, and on
> transitioning from a cached state to WT.
>
> v2: Actually do the clflush on transition to WT, nagging by Ville.
> v3: Flush the CPU cache after writes into WT objects.
v4?
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Ville Syrjälä <ville.syrjala at linux.intel.com>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> ---
> drivers/gpu/drm/i915/i915_dma.c | 3 +++
> drivers/gpu/drm/i915/i915_drv.h | 4 +++-
> drivers/gpu/drm/i915/i915_gem.c | 34 +++++++++++++++++++++++-----------
> drivers/gpu/drm/i915/i915_gem_gtt.c | 11 ++++++++++-
> include/uapi/drm/i915_drm.h | 1 +
> 5 files changed, 40 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 8da0b3d..75989fc 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -976,6 +976,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
> case I915_PARAM_HAS_LLC:
> value = HAS_LLC(dev);
> break;
> + case I915_PARAM_HAS_WT:
> + value = HAS_WT(dev);
> + break;
> case I915_PARAM_HAS_ALIASING_PPGTT:
> value = dev_priv->mm.aliasing_ppgtt ? 1 : 0;
> break;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 34d2b9d..d27a82a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -452,6 +452,7 @@ enum i915_cache_level {
> I915_CACHE_NONE = 0,
> I915_CACHE_LLC,
> I915_CACHE_LLC_MLC, /* gen6+, in docs at least! */
> + I915_CACHE_WT,
> };
>
> typedef uint32_t gen6_gtt_pte_t;
> @@ -1344,7 +1345,7 @@ struct drm_i915_gem_object {
> unsigned int pending_fenced_gpu_access:1;
> unsigned int fenced_gpu_access:1;
>
> - unsigned int cache_level:2;
> + unsigned int cache_level:3;
>
> unsigned int has_aliasing_ppgtt_mapping:1;
> unsigned int has_global_gtt_mapping:1;
> @@ -1547,6 +1548,7 @@ struct drm_i915_file_private {
> #define HAS_BLT(dev) (INTEL_INFO(dev)->has_blt_ring)
> #define HAS_VEBOX(dev) (INTEL_INFO(dev)->has_vebox_ring)
> #define HAS_LLC(dev) (INTEL_INFO(dev)->has_llc)
> +#define HAS_WT(dev) (IS_HASWELL(dev) && ((struct drm_i915_private *)(dev)->dev_private)->ellc_size)
> #define I915_NEED_GFX_HWS(dev) (INTEL_INFO(dev)->need_gfx_hws)
>
> #define HAS_HW_CONTEXTS(dev) (INTEL_INFO(dev)->gen >= 5)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 99362f7..12ff60a 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -60,6 +60,16 @@ static long i915_gem_purge(struct drm_i915_private *dev_priv, long target);
> static void i915_gem_shrink_all(struct drm_i915_private *dev_priv);
> static void i915_gem_object_truncate(struct drm_i915_gem_object *obj);
>
> +static bool is_cpu_write_cached(enum i915_cache_level cl)
> +{
> + return cl == I915_CACHE_LLC || cl == I915_CACHE_LLC_MLC;
> +}
> +
> +static bool is_cpu_read_cached(enum i915_cache_level cl)
> +{
> + return cl != I915_CACHE_NONE;
> +}
> +
> static inline void i915_gem_object_fence_lost(struct drm_i915_gem_object *obj)
> {
> if (obj->tiling_mode)
> @@ -510,7 +520,7 @@ i915_gem_shmem_pread(struct drm_device *dev,
> * read domain and manually flush cachelines (if required). This
> * optimizes for the case when the gpu will dirty the data
> * anyway again before the next pread happens. */
> - if (obj->cache_level == I915_CACHE_NONE)
> + if (!is_cpu_read_cached(obj->cache_level))
Based on what we discussed on irc about GPU snooping even on UC
accesses, couldn't this even be:
!HAS_LLC(dev) && obj->cache_level == I915_CACHE_NONE
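Something like this hypothetical helper would capture that (just a
sketch, and only valid if the snooping assumption actually holds):

static bool cpu_read_needs_clflush(struct drm_device *dev,
				   enum i915_cache_level cl)
{
	/* Sketch: assumes GPU accesses snoop the CPU cache on LLC
	 * platforms even for UC objects, so the CPU never sees stale
	 * cachelines and the clflush before CPU reads is only needed
	 * on non-LLC hardware. */
	return !HAS_LLC(dev) && cl == I915_CACHE_NONE;
}

The same check would then also cover the partial cacheline invalidate
further down in shmem_pwrite.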
BTW I was toying around a bit w/ gem_cacheing. I modified it to map the
scratch bo just once and use DRM_I915_GEM_WAIT to wait for the GPU, so
that I could avoid the set_domain calls. Then I ran it w/ both a cached
and an uncached bo: it passes on IVB in both cases, while on ILK it
fails with the uncached bo but passes with the cached one. That result
appears to support the notion that GPU accesses to UC buffers snoop the
CPU cache on LLC platforms. But of course I didn't try any use cases
other than the BLT vs. CPU access that gem_cacheing exercises.
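Roughly, the wait I used instead of set_domain was along these lines
(a sketch from memory, not the exact diff; error handling omitted):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Wait for all rendering to the bo to complete without touching its
 * domains; timeout_ns == -1 means wait indefinitely. */
static int bo_wait_idle(int fd, uint32_t handle)
{
	struct drm_i915_gem_wait wait;

	memset(&wait, 0, sizeof wait);
	wait.bo_handle = handle;
	wait.timeout_ns = -1;

	return ioctl(fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
}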
> needs_clflush = 1;
> if (i915_gem_obj_ggtt_bound(obj)) {
> ret = i915_gem_object_set_to_gtt_domain(obj, false);
> @@ -827,7 +837,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
> * write domain and manually flush cachelines (if required). This
> * optimizes for the case when the gpu will use the data
> * right away and we therefore have to clflush anyway. */
> - if (obj->cache_level == I915_CACHE_NONE)
> + if (!is_cpu_write_cached(obj->cache_level))
> needs_clflush_after = 1;
This makes sense. Although if we were to separate scanout UC from other
UC uses, even this clflush could be avoided in the non-scanout cases
on LLC platforms.
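E.g. something like this; note that the scanout flag here is purely
hypothetical, nothing in this patch tracks it yet:

static bool cpu_write_needs_clflush(struct drm_device *dev,
				    struct drm_i915_gem_object *obj)
{
	/* Hypothetical: only scanout needs the data flushed all the
	 * way to memory, since the display engine doesn't snoop;
	 * other UC users on LLC platforms would still be snooped. */
	if (is_cpu_write_cached(obj->cache_level))
		return false;
	return !HAS_LLC(dev) || obj->scanout;
}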
> if (i915_gem_obj_ggtt_bound(obj)) {
> ret = i915_gem_object_set_to_gtt_domain(obj, true);
> @@ -837,8 +847,8 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
> }
> /* Same trick applies for invalidate partially written cachelines before
> * writing. */
> - if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)
> - && obj->cache_level == I915_CACHE_NONE)
> + if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU) &&
> + !is_cpu_read_cached(obj->cache_level))
Again, I think this could be:
!HAS_LLC(dev) && obj->cache_level == I915_CACHE_NONE
> needs_clflush_before = 1;
>
> ret = i915_gem_object_get_pages(obj);
> @@ -996,7 +1006,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
> goto out;
> }
>
> - if (obj->cache_level == I915_CACHE_NONE &&
> + if (!is_cpu_write_cached(obj->cache_level) &&
> obj->tiling_mode == I915_TILING_NONE &&
> obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
Might we want to skip the GTT path on LLC platforms completely?
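I.e. something like (sketch only):

	/* Take the GTT pwrite fast path only on non-LLC platforms;
	 * with an LLC the shmem path through the CPU cache should be
	 * at least as fast. */
	if (!HAS_LLC(dev) &&
	    !is_cpu_write_cached(obj->cache_level) &&
	    obj->tiling_mode == I915_TILING_NONE &&
	    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
		ret = i915_gem_gtt_pwrite_fast(dev, obj, args, file);
		/* on failure we'd still fall back to shmem pwrite below */
	}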
> ret = i915_gem_gtt_pwrite_fast(dev, obj, args, file);
> @@ -1432,7 +1442,7 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> trace_i915_gem_object_fault(obj, page_offset, true, write);
>
> /* Access to snoopable pages through the GTT is incoherent. */
> - if (obj->cache_level != I915_CACHE_NONE && !HAS_LLC(dev)) {
> + if (is_cpu_read_cached(obj->cache_level) && !HAS_LLC(dev)) {
> ret = -EINVAL;
> goto unlock;
> }
> @@ -3286,11 +3296,10 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj)
> * snooping behaviour occurs naturally as the result of our domain
> * tracking.
> */
> - if (obj->cache_level != I915_CACHE_NONE)
> + if (is_cpu_write_cached(obj->cache_level))
> return;
>
> trace_i915_gem_object_clflush(obj);
> -
> drm_clflush_sg(obj->pages);
> }
>
> @@ -3447,7 +3456,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> i915_gem_obj_ggtt_set_color(obj, cache_level);
> }
>
> - if (cache_level == I915_CACHE_NONE) {
> + if (is_cpu_write_cached(obj->cache_level) != is_cpu_write_cached(cache_level)) {
> u32 old_read_domains, old_write_domain;
>
> /* If we're coming from LLC cached, then we haven't
> @@ -3565,7 +3574,8 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
> * of uncaching, which would allow us to flush all the LLC-cached data
> * with that bit in the PTE to main memory with just one PIPE_CONTROL.
> */
> - ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
> + ret = i915_gem_object_set_cache_level(obj,
> + HAS_WT(obj->base.dev) ? I915_CACHE_WT : I915_CACHE_NONE);
> if (ret)
> return ret;
>
> @@ -3638,7 +3648,9 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
>
> /* Flush the CPU cache if it's still invalid. */
> if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) {
> - i915_gem_clflush_object(obj);
> + if (obj->cache_level == I915_CACHE_NONE &&
> + !HAS_LLC(obj->base.dev))
> + i915_gem_clflush_object(obj);
>
> obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
> }
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 0522d00..072a348 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -54,6 +54,7 @@
> (((bits) & 0x8) << (11 - 3)))
> #define HSW_WB_LLC_AGE0 HSW_CACHEABILITY_CONTROL(0x3)
> #define HSW_WB_ELLC_LLC_AGE0 HSW_CACHEABILITY_CONTROL(0xb)
> +#define HSW_WT_ELLC_LLC_AGE0 HSW_CACHEABILITY_CONTROL(0x6)
>
> static gen6_gtt_pte_t gen6_pte_encode(dma_addr_t addr,
> enum i915_cache_level level)
> @@ -116,8 +117,16 @@ static gen6_gtt_pte_t iris_pte_encode(dma_addr_t addr,
> gen6_gtt_pte_t pte = GEN6_PTE_VALID;
> pte |= HSW_PTE_ADDR_ENCODE(addr);
>
> - if (level != I915_CACHE_NONE)
> + switch (level) {
> + case I915_CACHE_NONE:
> + break;
> + case I915_CACHE_WT:
> + pte |= HSW_WT_ELLC_LLC_AGE0;
> + break;
> + default:
> pte |= HSW_WB_ELLC_LLC_AGE0;
> + break;
> + }
>
> return pte;
> }
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index e47cf00..e831292 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -338,6 +338,7 @@ typedef struct drm_i915_irq_wait {
> #define I915_PARAM_HAS_PINNED_BATCHES 24
> #define I915_PARAM_HAS_EXEC_NO_RELOC 25
> #define I915_PARAM_HAS_EXEC_HANDLE_LUT 26
> +#define I915_PARAM_HAS_WT 27
>
> typedef struct drm_i915_getparam {
> int param;
> --
> 1.8.4.rc0
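FWIW userspace could then probe for WT along these lines (sketch;
treats an unknown param on older kernels as 'no WT'):

#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int has_wt(int fd)
{
	struct drm_i915_getparam gp;
	int value = 0;

	memset(&gp, 0, sizeof gp);
	gp.param = I915_PARAM_HAS_WT;
	gp.value = &value;

	/* older kernels reject the param; report no WT in that case */
	if (ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
		return 0;

	return value;
}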
--
Ville Syrjälä
Intel OTC