[Intel-gfx] [PATCH v2 1/1] drm/i915: Add fallback inside memcpy_from_wc functions
Lucas De Marchi
lucas.demarchi at intel.com
Tue Feb 8 19:11:55 UTC 2022
On Mon, Feb 07, 2022 at 09:43:08PM +0530, Balasubramani Vivekanandan wrote:
>memcpy_from_wc functions can fail if SSE4.1 is not supported or the
>supplied addresses are not 16-byte aligned. It was then upto to the
>caller to use memcpy as fallback.
>Now fallback to memcpy is implemented inside memcpy_from_wc functions
>relieving the user from checking the return value of i915_memcpy_from_wc
>and doing fallback.
>
>When doing copying from io memory address memcpy_fromio should be used
>as fallback. So a new function is added to the family of memcpy_to_wc
>functions which should be used while copying from io memory.
>
>This change is implemented also with an intention to perpare for porting
>memcpy_from_wc code to ARM64. Since SSE4.1 is not valid for ARM,
>accelerated reads will not be supported and the driver should rely on
>fallback always.
>So there would be few more places in the code where fallback should be
>introduced. For e.g. GuC log relay is currently not using fallback since
>a GPU supporting GuC submission will mostly have SSE4.1 enabled CPU.
>This is no more valid with Discrete GPU and with enabling support for
>ARM64.
>With fallback moved inside memcpy_from_wc function, call sites would
>look neat and fallback can be implemented in a uniform way.
>
>Signed-off-by: Balasubramani Vivekanandan <balasubramani.vivekanandan at intel.com>
>---
> drivers/gpu/drm/i915/gem/i915_gem_object.c | 5 +-
> drivers/gpu/drm/i915/gt/selftest_reset.c | 8 ++-
> drivers/gpu/drm/i915/i915_gpu_error.c | 9 ++-
> drivers/gpu/drm/i915/i915_memcpy.c | 78 ++++++++++++++++------
> drivers/gpu/drm/i915/i915_memcpy.h | 18 ++---
> 5 files changed, 78 insertions(+), 40 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c
>index e03e362d320b..e187c4bfb7e4 100644
>--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
>+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
>@@ -444,7 +444,7 @@ static void
> i915_gem_object_read_from_page_iomap(struct drm_i915_gem_object *obj, u64 offset, void *dst, int size)
> {
> void __iomem *src_map;
>- void __iomem *src_ptr;
>+ const void __iomem *src_ptr;
> dma_addr_t dma = i915_gem_object_get_dma_address(obj, offset >> PAGE_SHIFT);
>
> src_map = io_mapping_map_wc(&obj->mm.region->iomap,
>@@ -452,8 +452,7 @@ i915_gem_object_read_from_page_iomap(struct drm_i915_gem_object *obj, u64 offset
> PAGE_SIZE);
>
> src_ptr = src_map + offset_in_page(offset);
>- if (!i915_memcpy_from_wc(dst, (void __force *)src_ptr, size))
>- memcpy_fromio(dst, src_ptr, size);
>+ i915_io_memcpy_from_wc(dst, src_ptr, size);
nitpick, but maybe to align with the memcpy_fromio() API this would
better be named i915_memcpy_fromio_wc()?
>
> io_mapping_unmap(src_map);
> }
>diff --git a/drivers/gpu/drm/i915/gt/selftest_reset.c b/drivers/gpu/drm/i915/gt/selftest_reset.c
>index 37c38bdd5f47..64b8521a8b28 100644
>--- a/drivers/gpu/drm/i915/gt/selftest_reset.c
>+++ b/drivers/gpu/drm/i915/gt/selftest_reset.c
>@@ -99,8 +99,10 @@ __igt_reset_stolen(struct intel_gt *gt,
> memset_io(s, STACK_MAGIC, PAGE_SIZE);
>
> in = (void __force *)s;
>- if (i915_memcpy_from_wc(tmp, in, PAGE_SIZE))
>+ if (i915_can_memcpy_from_wc(tmp, in, PAGE_SIZE)) {
>+ i915_io_memcpy_from_wc(tmp, in, PAGE_SIZE);
> in = tmp;
>+ }
> crc[page] = crc32_le(0, in, PAGE_SIZE);
>
> io_mapping_unmap(s);
>@@ -135,8 +137,10 @@ __igt_reset_stolen(struct intel_gt *gt,
> PAGE_SIZE);
>
> in = (void __force *)s;
>- if (i915_memcpy_from_wc(tmp, in, PAGE_SIZE))
>+ if (i915_can_memcpy_from_wc(tmp, in, PAGE_SIZE)) {
>+ i915_io_memcpy_from_wc(tmp, in, PAGE_SIZE);
but you removed __iomem above
> in = tmp;
>+ }
> x = crc32_le(0, in, PAGE_SIZE);
>
> if (x != crc[page] &&
>diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>index 127ff56c8ce6..2c14a28cbbbb 100644
>--- a/drivers/gpu/drm/i915/i915_gpu_error.c
>+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>@@ -297,8 +297,10 @@ static int compress_page(struct i915_vma_compress *c,
> struct z_stream_s *zstream = &c->zstream;
>
> zstream->next_in = src;
>- if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
>+ if (wc && c->tmp && i915_can_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) {
>+ i915_io_memcpy_from_wc(c->tmp, (const void __iomem *)src, PAGE_SIZE);
> zstream->next_in = c->tmp;
>+ }
> zstream->avail_in = PAGE_SIZE;
>
> do {
>@@ -397,8 +399,11 @@ static int compress_page(struct i915_vma_compress *c,
> if (!ptr)
> return -ENOMEM;
>
>- if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
>+ if (wc)
>+ i915_io_memcpy_from_wc(ptr, src, PAGE_SIZE);
>+ else
> memcpy(ptr, src, PAGE_SIZE);
>+
> list_add_tail(&virt_to_page(ptr)->lru, &dst->page_list);
> cond_resched();
>
>diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
>index 1b021a4902de..4d9fbf3b2614 100644
>--- a/drivers/gpu/drm/i915/i915_memcpy.c
>+++ b/drivers/gpu/drm/i915/i915_memcpy.c
>@@ -24,15 +24,10 @@
>
> #include <linux/kernel.h>
> #include <asm/fpu/api.h>
>+#include <linux/io.h>
>
> #include "i915_memcpy.h"
>
>-#if IS_ENABLED(CONFIG_DRM_I915_DEBUG)
>-#define CI_BUG_ON(expr) BUG_ON(expr)
>-#else
>-#define CI_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr)
>-#endif
>-
> static DEFINE_STATIC_KEY_FALSE(has_movntdqa);
>
> static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
>@@ -93,6 +88,26 @@ static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
> kernel_fpu_end();
> }
>
>+/* The movntdqa instructions used for memcpy-from-wc require 16-byte alignment,
>+ * as well as SSE4.1 support. To check beforehand, pass in the parameters to
>+ * i915_can_memcpy_from_wc() - since we only care about the low 4 bits,
>+ * you only need to pass in the minor offsets, page-aligned pointers are
>+ * always valid.
>+ *
>+ * For just checking for SSE4.1, in the foreknowledge that the future use
>+ * will be correctly aligned, just use i915_has_memcpy_from_wc().
>+ */
>+bool i915_can_memcpy_from_wc(void *dst, const void *src, unsigned long len)
>+{
>+ if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
>+ return false;
>+
>+ if (static_branch_likely(&has_movntdqa))
>+ return true;
>+
>+ return false;
>+}
>+
> /**
> * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
> * @dst: destination pointer
>@@ -104,24 +119,18 @@ static void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
> * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
> * of 16.
> *
>- * To test whether accelerated reads from WC are supported, use
>- * i915_memcpy_from_wc(NULL, NULL, 0);
>- *
>- * Returns true if the copy was successful, false if the preconditions
>- * are not met.
>+ * If the acccelerated read from WC is not possible fallback to memcpy
> */
>-bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
>+void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> {
>- if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
>- return false;
>-
>- if (static_branch_likely(&has_movntdqa)) {
>+ if (i915_can_memcpy_from_wc(dst, src, len)) {
> if (likely(len))
> __memcpy_ntdqa(dst, src, len >> 4);
>- return true;
>+ return;
> }
>
>- return false;
>+ /* Fallback */
>+ memcpy(dst, src, len);
> }
>
> /**
>@@ -134,12 +143,15 @@ bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> * @src to @dst using * non-temporal instructions where available, but
> * accepts that its arguments may not be aligned, but are valid for the
> * potential 16-byte read past the end.
>+ *
>+ * Fallback to memcpy if accelerated read is not supported
> */
> void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len)
> {
> unsigned long addr;
>
>- CI_BUG_ON(!i915_has_memcpy_from_wc());
>+ if (!i915_has_memcpy_from_wc())
>+ goto fallback;
>
> addr = (unsigned long)src;
> if (!IS_ALIGNED(addr, 16)) {
>@@ -154,6 +166,34 @@ void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len
>
> if (likely(len))
> __memcpy_ntdqu(dst, src, DIV_ROUND_UP(len, 16));
>+
>+ return;
>+
>+fallback:
>+ memcpy(dst, src, len);
>+}
>+
>+/**
>+ * i915_io_memcpy_from_wc: perform an accelerated *aligned* read from WC
>+ * @dst: destination pointer
>+ * @src: source pointer
>+ * @len: how many bytes to copy
>+ *
>+ * To be used when the when copying from io memory.
>+ *
>+ * memcpy_fromio() is used as fallback otherewise no difference to
>+ * i915_memcpy_from_wc()
>+ */
>+void i915_io_memcpy_from_wc(void *dst, const void __iomem *src, unsigned long len)
>+{
>+ if (i915_can_memcpy_from_wc(dst, (const void __force *)src, len)) {
>+ if (likely(len))
>+ __memcpy_ntdqa(dst, (const void __force *)src, len >> 4);
>+ return;
>+ }
>+
>+ /* Fallback */
>+ memcpy_fromio(dst, src, len);
> }
>
> void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
>diff --git a/drivers/gpu/drm/i915/i915_memcpy.h b/drivers/gpu/drm/i915/i915_memcpy.h
>index 3df063a3293b..93ea9295e28c 100644
>--- a/drivers/gpu/drm/i915/i915_memcpy.h
>+++ b/drivers/gpu/drm/i915/i915_memcpy.h
>@@ -12,23 +12,13 @@ struct drm_i915_private;
>
> void i915_memcpy_init_early(struct drm_i915_private *i915);
>
>-bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
>+void i915_memcpy_from_wc(void *dst, const void *src, unsigned long len);
> void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len);
>+void i915_io_memcpy_from_wc(void *dst, const void __iomem *src, unsigned long len);
>
>-/* The movntdqa instructions used for memcpy-from-wc require 16-byte alignment,
>- * as well as SSE4.1 support. i915_memcpy_from_wc() will report if it cannot
>- * perform the operation. To check beforehand, pass in the parameters to
>- * to i915_can_memcpy_from_wc() - since we only care about the low 4 bits,
>- * you only need to pass in the minor offsets, page-aligned pointers are
>- * always valid.
>- *
>- * For just checking for SSE4.1, in the foreknowledge that the future use
>- * will be correctly aligned, just use i915_has_memcpy_from_wc().
>- */
>-#define i915_can_memcpy_from_wc(dst, src, len) \
>- i915_memcpy_from_wc((void *)((unsigned long)(dst) | (unsigned long)(src) | (len)), NULL, 0)
>+bool i915_can_memcpy_from_wc(void *dst, const void *src, unsigned long len);
>
> #define i915_has_memcpy_from_wc() \
>- i915_memcpy_from_wc(NULL, NULL, 0)
>+ i915_can_memcpy_from_wc(NULL, NULL, 0)
I think the has vs can here is confusing. But a cleanup on that could be
on top since it would just add noise to this patch.
I or someone else probably need a more careful review, but ack on the
direction:
Acked-by: Lucas De Marchi <lucas.demarchi at intel.com>
Lucas De Marchi
More information about the dri-devel
mailing list