[Intel-gfx] [PATCH v2 1/1] drm/i915: Add fallback inside memcpy_from_wc functions

Balasubramani Vivekanandan balasubramani.vivekanandan at intel.com
Wed Feb 9 11:02:29 UTC 2022


On 08.02.2022 11:11, Lucas De Marchi wrote:
> On Mon, Feb 07, 2022 at 09:43:08PM +0530, Balasubramani Vivekanandan wrote:
> > memcpy_from_wc functions can fail if SSE4.1 is not supported or the
> > supplied addresses are not 16-byte aligned. It was then upto to the
> > caller to use memcpy as fallback.
> > Now fallback to memcpy is implemented inside memcpy_from_wc functions
> > relieving the user from checking the return value of i915_memcpy_from_wc
> > and doing fallback.
> > 
> > When doing copying from io memory address memcpy_fromio should be used
> > as fallback. So a new function is added to the family of memcpy_to_wc
> > functions which should be used while copying from io memory.
> > 
> > This change is implemented also with an intention to perpare for porting
> > memcpy_from_wc code to ARM64. Since SSE4.1 is not valid for ARM,
> > accelerated reads will not be supported and the driver should rely on
> > fallback always.
> > So there would be few more places in the code where fallback should be
> > introduced. For e.g. GuC log relay is currently not using fallback since
> > a GPU supporting GuC submission will mostly have SSE4.1 enabled CPU.
> > This is no more valid with Discrete GPU and with enabling support for
> > ARM64.
> > With fallback moved inside memcpy_from_wc function, call sites would
> > look neat and fallback can be implemented in a uniform way.
> > 
> > Signed-off-by: Balasubramani Vivekanandan <balasubramani.vivekanandan at intel.com>
> > ---
> > drivers/gpu/drm/i915/gem/i915_gem_object.c |  5 +-
> > drivers/gpu/drm/i915/gt/selftest_reset.c   |  8 ++-
> > drivers/gpu/drm/i915/i915_gpu_error.c      |  9 ++-
> > drivers/gpu/drm/i915/i915_memcpy.c         | 78 ++++++++++++++++------
> > drivers/gpu/drm/i915/i915_memcpy.h         | 18 ++---
> > 5 files changed, 78 insertions(+), 40 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c
> > index e03e362d320b..e187c4bfb7e4 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
> > @@ -444,7 +444,7 @@ static void
> > i915_gem_object_read_from_page_iomap(struct drm_i915_gem_object *obj, u64 offset, void *dst, int size)
> > {
> > 	void __iomem *src_map;
> > -	void __iomem *src_ptr;
> > +	const void __iomem *src_ptr;
> > 	dma_addr_t dma = i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT);
> > 
> > 	src_map = io_mapping_map_wc(&obj->mm.region->iomap,
> > @@ -452,8 +452,7 @@ i915_gem_object_read_from_page_iomap(struct drm_i915_gem_object *obj, u64 offset
> > 				    PAGE_SIZE);
> > 
> > 	src_ptr = src_map + offset_in_page(offset);
> > -	if (!i915_memcpy_from_wc(dst, (void __force *)src_ptr, size))
> > -		memcpy_fromio(dst, src_ptr, size);
> > +	i915_io_memcpy_from_wc(dst, src_ptr, size);
> 
> nitpick, but maybe to align with the memcpy_fromio() API this would
> better be named i915_memcpy_fromio_wc()?

I too thought for a moment should I rename to i915_memcpy_fromio_wc()
but stayed with the current name, when preparing the patch.
I will rename it.

> 
> > 
> > 	io_mapping_unmap(src_map);
> > }
> > diff --git a/drivers/gpu/drm/i915/gt/selftest_reset.c b/drivers/gpu/drm/i915/gt/selftest_reset.c
> > index 37c38bdd5f47..64b8521a8b28 100644
> > --- a/drivers/gpu/drm/i915/gt/selftest_reset.c
> > +++ b/drivers/gpu/drm/i915/gt/selftest_reset.c
> > @@ -99,8 +99,10 @@ __igt_reset_stolen(struct intel_gt *gt,
> > 			memset_io(s, STACK_MAGIC, PAGE_SIZE);
> > 
> > 		in = (void __force *)s;
> > -		if (i915_memcpy_from_wc(tmp, in, PAGE_SIZE))
> > +		if (i915_can_memcpy_from_wc(tmp, in, PAGE_SIZE)) {
> > +			i915_io_memcpy_from_wc(tmp, in, PAGE_SIZE);
> > 			in = tmp;
> > +		}
> > 		crc[page] = crc32_le(0, in, PAGE_SIZE);
> > 
> > 		io_mapping_unmap(s);
> > @@ -135,8 +137,10 @@ __igt_reset_stolen(struct intel_gt *gt,
> > 				      PAGE_SIZE);
> > 
> > 		in = (void __force *)s;
> > -		if (i915_memcpy_from_wc(tmp, in, PAGE_SIZE))
> > +		if (i915_can_memcpy_from_wc(tmp, in, PAGE_SIZE)) {
> > +			i915_io_memcpy_from_wc(tmp, in, PAGE_SIZE);
> 
> but you removed __iomem above
Yeah, it is a mistake. I will change it. There is one more place in the
same file which needs correction.

Regards,
Bala
> 
> > 			in = tmp;
> > +		}
> > 		x = crc32_le(0, in, PAGE_SIZE);
> > 
> > 		if (x != crc[page] &&
> > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> > index 127ff56c8ce6..2c14a28cbbbb 100644
> > --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> > @@ -297,8 +297,10 @@ static int compress_page(struct i915_vma_compress *c,
> > 	struct z_stream_s *zstream = &c->zstream;
> > 
> > 	zstream->next_in = src;
> > -	if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
> > +	if (wc && c->tmp && i915_can_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) {
> > +		i915_io_memcpy_from_wc(c->tmp, (const void __iomem *)src, PAGE_SIZE);
> > 		zstream->next_in = c->tmp;
> > +	}
> > 	zstream->avail_in = PAGE_SIZE;
> > 
> > 	do {
> > @@ -397,8 +399,11 @@ static int compress_page(struct i915_vma_compress *c,
> > 	if (!ptr)
> > 		return -ENOMEM;
> > 
> > -	if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
> > +	if (wc)
> > +		i915_io_memcpy_from_wc(ptr, src, PAGE_SIZE);
> > +	else
> > 		memcpy(ptr, src, PAGE_SIZE);
> > +
> > 	list_add_tail(&virt_to_page(ptr)->lru, &dst->page_list);
> > 	cond_resched();
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
> > index 1b021a4902de..4d9fbf3b2614 100644
> > --- a/drivers/gpu/drm/i915/i915_memcpy.c
> > +++ b/drivers/gpu/drm/i915/i915_memcpy.c
> > @@ -24,15 +24,10 @@
> > 
> > #include <linux/kernel.h>
> > #include <asm/fpu/api.h>
> > +#include <linux/io.h>
> > 
> > #include "i915_memcpy.h"
> > 
> > -#if IS_ENABLED(CONFIG_DRM_I915_DEBUG)
> > -#define CI_BUG_ON(expr) BUG_ON(expr)
> > -#else
> > -#define CI_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr)
> > -#endif
> > -
> > static DEFINE_STATIC_KEY_FALSE(has_movntdqa);
> > 
> > static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
> > @@ -93,6 +88,26 @@ static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
> > 	kernel_fpu_end();
> > }
> > 
> > +/* The movntdqa instructions used for memcpy-from-wc require 16-byte alignment,
> > + * as well as SSE4.1 support. To check beforehand, pass in the parameters to
> > + * i915_can_memcpy_from_wc() - since we only care about the low 4 bits,
> > + * you only need to pass in the minor offsets, page-aligned pointers are
> > + * always valid.
> > + *
> > + * For just checking for SSE4.1, in the foreknowledge that the future use
> > + * will be correctly aligned, just use i915_has_memcpy_from_wc().
> > + */
> > +bool i915_can_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> > +{
> > +	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
> > +		return false;
> > +
> > +	if (static_branch_likely(&has_movntdqa))
> > +		return true;
> > +
> > +	return false;
> > +}
> > +
> > /**
> >  * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
> >  * @dst: destination pointer
> > @@ -104,24 +119,18 @@ static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
> >  * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
> >  * of 16.
> >  *
> > - * To test whether accelerated reads from WC are supported, use
> > - * i915_memcpy_from_wc(NULL, NULL, 0);
> > - *
> > - * Returns true if the copy was successful, false if the preconditions
> > - * are not met.
> > + * If the acccelerated read from WC is not possible fallback to memcpy
> >  */
> > -bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> > +void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> > {
> > -	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
> > -		return false;
> > -
> > -	if (static_branch_likely(&has_movntdqa)) {
> > +	if (i915_can_memcpy_from_wc(dst, src, len)) {
> > 		if (likely(len))
> > 			__memcpy_ntdqa(dst, src, len >> 4);
> > -		return true;
> > +		return;
> > 	}
> > 
> > -	return false;
> > +	/* Fallback */
> > +	memcpy(dst, src, len);
> > }
> > 
> > /**
> > @@ -134,12 +143,15 @@ bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> >  * @src to @dst using * non-temporal instructions where available, but
> >  * accepts that its arguments may not be aligned, but are valid for the
> >  * potential 16-byte read past the end.
> > + *
> > + * Fallback to memcpy if accelerated read is not supported
> >  */
> > void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> > {
> > 	unsigned long addr;
> > 
> > -	CI_BUG_ON(!i915_has_memcpy_from_wc());
> > +	if (!i915_has_memcpy_from_wc())
> > +		goto fallback;
> > 
> > 	addr = (unsigned long)src;
> > 	if (!IS_ALIGNED(addr, 16)) {
> > @@ -154,6 +166,34 @@ void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len
> > 
> > 	if (likely(len))
> > 		__memcpy_ntdqu(dst, src, DIV_ROUND_UP(len, 16));
> > +
> > +	return;
> > +
> > +fallback:
> > +	memcpy(dst, src, len);
> > +}
> > +
> > +/**
> > + * i915_io_memcpy_from_wc: perform an accelerated *aligned* read from WC
> > + * @dst: destination pointer
> > + * @src: source pointer
> > + * @len: how many bytes to copy
> > + *
> > + * To be used when the when copying from io memory.
> > + *
> > + * memcpy_fromio() is used as fallback otherewise no difference to
> > + * i915_memcpy_from_wc()
> > + */
> > +void i915_io_memcpy_from_wc(void *dst, const void __iomem *src, unsigned long len)
> > +{
> > +	if (i915_can_memcpy_from_wc(dst, (const void __force *)src, len)) {
> > +		if (likely(len))
> > +			__memcpy_ntdqa(dst, (const void __force *)src, len >> 4);
> > +		return;
> > +	}
> > +
> > +	/* Fallback */
> > +	memcpy_fromio(dst, src, len);
> > }
> > 
> > void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
> > diff --git a/drivers/gpu/drm/i915/i915_memcpy.h b/drivers/gpu/drm/i915/i915_memcpy.h
> > index 3df063a3293b..93ea9295e28c 100644
> > --- a/drivers/gpu/drm/i915/i915_memcpy.h
> > +++ b/drivers/gpu/drm/i915/i915_memcpy.h
> > @@ -12,23 +12,13 @@ struct drm_i915_private;
> > 
> > void i915_memcpy_init_early(struct drm_i915_private *i915);
> > 
> > -bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> > +void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> > void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> > +void i915_io_memcpy_from_wc(void *dst, const void __iomem *src, unsigned long len);
> > 
> > -/* The movntdqa instructions used for memcpy-from-wc require 16-byte alignment,
> > - * as well as SSE4.1 support. i915_memcpy_from_wc() will report if it cannot
> > - * perform the operation. To check beforehand, pass in the parameters to
> > - * to i915_can_memcpy_from_wc() - since we only care about the low 4 bits,
> > - * you only need to pass in the minor offsets, page-aligned pointers are
> > - * always valid.
> > - *
> > - * For just checking for SSE4.1, in the foreknowledge that the future use
> > - * will be correctly aligned, just use i915_has_memcpy_from_wc().
> > - */
> > -#define i915_can_memcpy_from_wc(dst, src, len) \
> > -	i915_memcpy_from_wc((void *)((unsigned long)(dst) | (unsigned long)(src) | (len)), NULL, 0)
> > +bool i915_can_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> > 
> > #define i915_has_memcpy_from_wc() \
> > -	i915_memcpy_from_wc(NULL, NULL, 0)
> > +	i915_can_memcpy_from_wc(NULL, NULL, 0)
> 
> I think the has vs can here is confusing. But a cleanup on that could be
> on top since it would just add noise to this patch.
> 
> I or someone else probably need a more careful review, but ack on the
> direction:
> 
> 
> 
> Acked-by: Lucas De Marchi <lucas.demarchi at intel.com>
> 
> 
> Lucas De Marchi


More information about the Intel-gfx mailing list