[Intel-gfx] [PATCH v2 1/1] drm/i915: Add fallback inside memcpy_from_wc functions

Lucas De Marchi lucas.demarchi at intel.com
Tue Feb 8 19:11:55 UTC 2022


On Mon, Feb 07, 2022 at 09:43:08PM +0530, Balasubramani Vivekanandan wrote:
>memcpy_from_wc functions can fail if SSE4.1 is not supported or the
>supplied addresses are not 16-byte aligned. It was then upto to the
>caller to use memcpy as fallback.
>Now fallback to memcpy is implemented inside memcpy_from_wc functions
>relieving the user from checking the return value of i915_memcpy_from_wc
>and doing fallback.
>
>When doing copying from io memory address memcpy_fromio should be used
>as fallback. So a new function is added to the family of memcpy_to_wc
>functions which should be used while copying from io memory.
>
>This change is implemented also with an intention to perpare for porting
>memcpy_from_wc code to ARM64. Since SSE4.1 is not valid for ARM,
>accelerated reads will not be supported and the driver should rely on
>fallback always.
>So there would be few more places in the code where fallback should be
>introduced. For e.g. GuC log relay is currently not using fallback since
>a GPU supporting GuC submission will mostly have SSE4.1 enabled CPU.
>This is no more valid with Discrete GPU and with enabling support for
>ARM64.
>With fallback moved inside memcpy_from_wc function, call sites would
>look neat and fallback can be implemented in a uniform way.
>
>Signed-off-by: Balasubramani Vivekanandan <balasubramani.vivekanandan at intel.com>
>---
> drivers/gpu/drm/i915/gem/i915_gem_object.c |  5 +-
> drivers/gpu/drm/i915/gt/selftest_reset.c   |  8 ++-
> drivers/gpu/drm/i915/i915_gpu_error.c      |  9 ++-
> drivers/gpu/drm/i915/i915_memcpy.c         | 78 ++++++++++++++++------
> drivers/gpu/drm/i915/i915_memcpy.h         | 18 ++---
> 5 files changed, 78 insertions(+), 40 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c
>index e03e362d320b..e187c4bfb7e4 100644
>--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
>+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
>@@ -444,7 +444,7 @@ static void
> i915_gem_object_read_from_page_iomap(struct drm_i915_gem_object *obj, u64 offset, void *dst, int size)
> {
> 	void __iomem *src_map;
>-	void __iomem *src_ptr;
>+	const void __iomem *src_ptr;
> 	dma_addr_t dma = i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT);
>
> 	src_map = io_mapping_map_wc(&obj->mm.region->iomap,
>@@ -452,8 +452,7 @@ i915_gem_object_read_from_page_iomap(struct drm_i915_gem_object *obj, u64 offset
> 				    PAGE_SIZE);
>
> 	src_ptr = src_map + offset_in_page(offset);
>-	if (!i915_memcpy_from_wc(dst, (void __force *)src_ptr, size))
>-		memcpy_fromio(dst, src_ptr, size);
>+	i915_io_memcpy_from_wc(dst, src_ptr, size);

nitpick, but maybe to align with the memcpy_fromio() API this would
better be named i915_memcpy_fromio_wc()?

>
> 	io_mapping_unmap(src_map);
> }
>diff --git a/drivers/gpu/drm/i915/gt/selftest_reset.c b/drivers/gpu/drm/i915/gt/selftest_reset.c
>index 37c38bdd5f47..64b8521a8b28 100644
>--- a/drivers/gpu/drm/i915/gt/selftest_reset.c
>+++ b/drivers/gpu/drm/i915/gt/selftest_reset.c
>@@ -99,8 +99,10 @@ __igt_reset_stolen(struct intel_gt *gt,
> 			memset_io(s, STACK_MAGIC, PAGE_SIZE);
>
> 		in = (void __force *)s;
>-		if (i915_memcpy_from_wc(tmp, in, PAGE_SIZE))
>+		if (i915_can_memcpy_from_wc(tmp, in, PAGE_SIZE)) {
>+			i915_io_memcpy_from_wc(tmp, in, PAGE_SIZE);
> 			in = tmp;
>+		}
> 		crc[page] = crc32_le(0, in, PAGE_SIZE);
>
> 		io_mapping_unmap(s);
>@@ -135,8 +137,10 @@ __igt_reset_stolen(struct intel_gt *gt,
> 				      PAGE_SIZE);
>
> 		in = (void __force *)s;
>-		if (i915_memcpy_from_wc(tmp, in, PAGE_SIZE))
>+		if (i915_can_memcpy_from_wc(tmp, in, PAGE_SIZE)) {
>+			i915_io_memcpy_from_wc(tmp, in, PAGE_SIZE);

but you removed __iomem above

> 			in = tmp;
>+		}
> 		x = crc32_le(0, in, PAGE_SIZE);
>
> 		if (x != crc[page] &&
>diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>index 127ff56c8ce6..2c14a28cbbbb 100644
>--- a/drivers/gpu/drm/i915/i915_gpu_error.c
>+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>@@ -297,8 +297,10 @@ static int compress_page(struct i915_vma_compress *c,
> 	struct z_stream_s *zstream = &c->zstream;
>
> 	zstream->next_in = src;
>-	if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
>+	if (wc && c->tmp && i915_can_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) {
>+		i915_io_memcpy_from_wc(c->tmp, (const void __iomem *)src, PAGE_SIZE);
> 		zstream->next_in = c->tmp;
>+	}
> 	zstream->avail_in = PAGE_SIZE;
>
> 	do {
>@@ -397,8 +399,11 @@ static int compress_page(struct i915_vma_compress *c,
> 	if (!ptr)
> 		return -ENOMEM;
>
>-	if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
>+	if (wc)
>+		i915_io_memcpy_from_wc(ptr, src, PAGE_SIZE);
>+	else
> 		memcpy(ptr, src, PAGE_SIZE);
>+
> 	list_add_tail(&virt_to_page(ptr)->lru, &dst->page_list);
> 	cond_resched();
>
>diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
>index 1b021a4902de..4d9fbf3b2614 100644
>--- a/drivers/gpu/drm/i915/i915_memcpy.c
>+++ b/drivers/gpu/drm/i915/i915_memcpy.c
>@@ -24,15 +24,10 @@
>
> #include <linux/kernel.h>
> #include <asm/fpu/api.h>
>+#include <linux/io.h>
>
> #include "i915_memcpy.h"
>
>-#if IS_ENABLED(CONFIG_DRM_I915_DEBUG)
>-#define CI_BUG_ON(expr) BUG_ON(expr)
>-#else
>-#define CI_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr)
>-#endif
>-
> static DEFINE_STATIC_KEY_FALSE(has_movntdqa);
>
> static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
>@@ -93,6 +88,26 @@ static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
> 	kernel_fpu_end();
> }
>
>+/* The movntdqa instructions used for memcpy-from-wc require 16-byte alignment,
>+ * as well as SSE4.1 support. To check beforehand, pass in the parameters to
>+ * i915_can_memcpy_from_wc() - since we only care about the low 4 bits,
>+ * you only need to pass in the minor offsets, page-aligned pointers are
>+ * always valid.
>+ *
>+ * For just checking for SSE4.1, in the foreknowledge that the future use
>+ * will be correctly aligned, just use i915_has_memcpy_from_wc().
>+ */
>+bool i915_can_memcpy_from_wc(void *dst, const void *src, unsigned long len)
>+{
>+	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
>+		return false;
>+
>+	if (static_branch_likely(&has_movntdqa))
>+		return true;
>+
>+	return false;
>+}
>+
> /**
>  * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
>  * @dst: destination pointer
>@@ -104,24 +119,18 @@ static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
>  * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
>  * of 16.
>  *
>- * To test whether accelerated reads from WC are supported, use
>- * i915_memcpy_from_wc(NULL, NULL, 0);
>- *
>- * Returns true if the copy was successful, false if the preconditions
>- * are not met.
>+ * If the acccelerated read from WC is not possible fallback to memcpy
>  */
>-bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
>+void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> {
>-	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
>-		return false;
>-
>-	if (static_branch_likely(&has_movntdqa)) {
>+	if (i915_can_memcpy_from_wc(dst, src, len)) {
> 		if (likely(len))
> 			__memcpy_ntdqa(dst, src, len >> 4);
>-		return true;
>+		return;
> 	}
>
>-	return false;
>+	/* Fallback */
>+	memcpy(dst, src, len);
> }
>
> /**
>@@ -134,12 +143,15 @@ bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
>  * @src to @dst using * non-temporal instructions where available, but
>  * accepts that its arguments may not be aligned, but are valid for the
>  * potential 16-byte read past the end.
>+ *
>+ * Fallback to memcpy if accelerated read is not supported
>  */
> void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> {
> 	unsigned long addr;
>
>-	CI_BUG_ON(!i915_has_memcpy_from_wc());
>+	if (!i915_has_memcpy_from_wc())
>+		goto fallback;
>
> 	addr = (unsigned long)src;
> 	if (!IS_ALIGNED(addr, 16)) {
>@@ -154,6 +166,34 @@ void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len
>
> 	if (likely(len))
> 		__memcpy_ntdqu(dst, src, DIV_ROUND_UP(len, 16));
>+
>+	return;
>+
>+fallback:
>+	memcpy(dst, src, len);
>+}
>+
>+/**
>+ * i915_io_memcpy_from_wc: perform an accelerated *aligned* read from WC
>+ * @dst: destination pointer
>+ * @src: source pointer
>+ * @len: how many bytes to copy
>+ *
>+ * To be used when the when copying from io memory.
>+ *
>+ * memcpy_fromio() is used as fallback otherewise no difference to
>+ * i915_memcpy_from_wc()
>+ */
>+void i915_io_memcpy_from_wc(void *dst, const void __iomem *src, unsigned long len)
>+{
>+	if (i915_can_memcpy_from_wc(dst, (const void __force *)src, len)) {
>+		if (likely(len))
>+			__memcpy_ntdqa(dst, (const void __force *)src, len >> 4);
>+		return;
>+	}
>+
>+	/* Fallback */
>+	memcpy_fromio(dst, src, len);
> }
>
> void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
>diff --git a/drivers/gpu/drm/i915/i915_memcpy.h b/drivers/gpu/drm/i915/i915_memcpy.h
>index 3df063a3293b..93ea9295e28c 100644
>--- a/drivers/gpu/drm/i915/i915_memcpy.h
>+++ b/drivers/gpu/drm/i915/i915_memcpy.h
>@@ -12,23 +12,13 @@ struct drm_i915_private;
>
> void i915_memcpy_init_early(struct drm_i915_private *i915);
>
>-bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
>+void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len);
>+void i915_io_memcpy_from_wc(void *dst, const void __iomem *src, unsigned long len);
>
>-/* The movntdqa instructions used for memcpy-from-wc require 16-byte alignment,
>- * as well as SSE4.1 support. i915_memcpy_from_wc() will report if it cannot
>- * perform the operation. To check beforehand, pass in the parameters to
>- * to i915_can_memcpy_from_wc() - since we only care about the low 4 bits,
>- * you only need to pass in the minor offsets, page-aligned pointers are
>- * always valid.
>- *
>- * For just checking for SSE4.1, in the foreknowledge that the future use
>- * will be correctly aligned, just use i915_has_memcpy_from_wc().
>- */
>-#define i915_can_memcpy_from_wc(dst, src, len) \
>-	i915_memcpy_from_wc((void *)((unsigned long)(dst) | (unsigned long)(src) | (len)), NULL, 0)
>+bool i915_can_memcpy_from_wc(void *dst, const void *src, unsigned long len);
>
> #define i915_has_memcpy_from_wc() \
>-	i915_memcpy_from_wc(NULL, NULL, 0)
>+	i915_can_memcpy_from_wc(NULL, NULL, 0)

I think the has vs can here is confusing. But a cleanup on that could be
on top since it would just add noise to this patch.

I or someone else probably need a more careful review, but ack on the
direction:



Acked-by: Lucas De Marchi <lucas.demarchi at intel.com>


Lucas De Marchi


More information about the Intel-gfx mailing list