[Intel-gfx] [PATCH] drm/i915: More reasonable memcpy unroll in i915_gem_swizzle_page

Tvrtko Ursulin tursulin at ursulin.net
Mon Dec 19 09:19:46 UTC 2016


From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

For some reason GCC 6.2.1 here unrolls the memcpy calls to and from the
stack in per-byte fashion, and also repeatedly reloads the offset
constants. It looks horrible, like this for example:

      ...
     fdc:       48 b8 41 00 00 00 00    movabs rax,0xffff880000000041
     fe3:       88 ff ff
     fe6:       44 88 74 06 80          mov    BYTE PTR [rsi+rax*1-0x80],r14b
     feb:       48 b8 42 00 00 00 00    movabs rax,0xffff880000000042
     ff2:       88 ff ff
     ff5:       44 88 6c 06 80          mov    BYTE PTR [rsi+rax*1-0x80],r13b
     ffa:       48 b8 43 00 00 00 00    movabs rax,0xffff880000000043
    1001:       88 ff ff
    1004:       44 88 64 06 80          mov    BYTE PTR [rsi+rax*1-0x80],r12b
    1009:       48 b8 44 00 00 00 00    movabs rax,0xffff880000000044
    1010:       88 ff ff
    1013:       88 5c 06 80             mov    BYTE PTR [rsi+rax*1-0x80],bl
    1017:       48 b8 45 00 00 00 00    movabs rax,0xffff880000000045
    101e:       88 ff ff
    1021:       44 88 5c 06 80          mov    BYTE PTR [rsi+rax*1-0x80],r11b
    1026:       48 b8 46 00 00 00 00    movabs rax,0xffff880000000046
    102d:       88 ff ff
    1030:       44 88 54 06 80          mov    BYTE PTR [rsi+rax*1-0x80],r10b
    1035:       48 b8 47 00 00 00 00    movabs rax,0xffff880000000047
    103c:       88 ff ff
    103f:       44 88 4c 06 80          mov    BYTE PTR [rsi+rax*1-0x80],r9b
    1044:       0f b6 5d d0             movzx  ebx,BYTE PTR [rbp-0x30]
    1048:       48 b8 48 00 00 00 00    movabs rax,0xffff880000000048
    104f:       88 ff ff
    1052:       88 5c 06 80             mov    BYTE PTR [rsi+rax*1-0x80],bl
    1056:       48 b8 49 00 00 00 00    movabs rax,0xffff880000000049
    105d:       88 ff ff
    1060:       40 88 7c 06 80          mov    BYTE PTR [rsi+rax*1-0x80],dil
    1065:       0f b6 5d cf             movzx  ebx,BYTE PTR [rbp-0x31]
    1069:       48 b8 4a 00 00 00 00    movabs rax,0xffff88000000004a
    1070:       88 ff ff
    1073:       88 5c 06 80             mov    BYTE PTR [rsi+rax*1-0x80],bl
    1077:       0f b6 7d ce             movzx  edi,BYTE PTR [rbp-0x32]
    107b:       48 b8 4b 00 00 00 00    movabs rax,0xffff88000000004b
      ...

So change the code a bit, which makes it generate more reasonable
code like:
  ...
 bf1:   48 89 78 b8             mov    QWORD PTR [rax-0x48],rdi
 bf5:   4c 89 60 c0             mov    QWORD PTR [rax-0x40],r12
 bf9:   48 89 58 c8             mov    QWORD PTR [rax-0x38],rbx
 bfd:   4c 89 58 d0             mov    QWORD PTR [rax-0x30],r11
 c01:   4c 89 50 d8             mov    QWORD PTR [rax-0x28],r10
  ...

Which saves 2087 bytes of code.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
---
 drivers/gpu/drm/i915/i915_gem_fence_reg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index e03983973252..d665d2e74641 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -631,9 +631,9 @@ i915_gem_swizzle_page(struct page *page)
 	vaddr = kmap(page);
 
 	for (i = 0; i < PAGE_SIZE; i += 128) {
-		memcpy(temp, &vaddr[i], 64);
+		memcpy(&temp[0], &vaddr[i], 64);
 		memcpy(&vaddr[i], &vaddr[i + 64], 64);
-		memcpy(&vaddr[i + 64], temp, 64);
+		memcpy(&vaddr[i + 64], &temp[0], 64);
 	}
 
 	kunmap(page);
-- 
2.7.4



More information about the Intel-gfx mailing list