[Spice-devel] [PATCH 10/12] xddm/display/amd64: implement Save/RestoreFPU & fast_memcpy_*
Alon Levy
alevy at redhat.com
Wed Apr 30 07:40:34 PDT 2014
Some notes:
- Follows the Microsoft x64 (AMD64) calling convention.
- Assembled with 64-bit MASM (ml64.exe).
- Dropped in the next patches of this series because it turned out to be too slow.
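
As a reading aid, here is a rough C sketch of what fast_memcpy_aligned below does. It is not part of the patch: the function name and the use of SSE2 intrinsics are mine, purely illustrative, but the structure mirrors the assembly - prefetch ahead, copy 128-byte blocks with movdqa loads and movntdq non-temporal stores, then shrinking 64/32/16-byte blocks, a small byte tail, and a final sfence. Under the Microsoft x64 convention dest, src and len arrive in rcx, rdx and r8, which is what the register comments in x64.asm refer to.

#include <emmintrin.h>   /* SSE2: _mm_load_si128, _mm_stream_si128, _mm_sfence */
#include <stddef.h>
#include <string.h>

/* Illustrative C counterpart of fast_memcpy_aligned; both pointers are
 * assumed to be 16-byte aligned, as in the assembly version. */
static void fast_memcpy_aligned_c(void *dest, const void *src, size_t len)
{
    char *d = (char *)dest;
    const char *s = (const char *)src;

    if (len >= 128)
        _mm_prefetch(s, _MM_HINT_NTA);

    /* Main loop: 128 bytes per iteration, prefetching ahead of the loads
     * and writing with non-temporal stores that avoid polluting the cache. */
    while (len >= 128) {
        _mm_prefetch(s + 64, _MM_HINT_NTA);
        _mm_prefetch(s + 128, _MM_HINT_NTA);
        for (int i = 0; i < 128; i += 16) {
            __m128i x = _mm_load_si128((const __m128i *)(s + i));
            _mm_stream_si128((__m128i *)(d + i), x);
        }
        d += 128;
        s += 128;
        len -= 128;
    }

    /* Shrinking tail blocks: 64, 32, then 16 bytes, same load/stream pattern. */
    for (size_t step = 64; step >= 16; step /= 2) {
        if (len >= step) {
            for (size_t i = 0; i < step; i += 16) {
                __m128i x = _mm_load_si128((const __m128i *)(s + i));
                _mm_stream_si128((__m128i *)(d + i), x);
            }
            d += step;
            s += step;
            len -= step;
        }
    }

    /* 0..15 remaining bytes: the counterpart of the movsd/rep movsb tail. */
    memcpy(d, s, len);

    /* Make the non-temporal stores globally visible before returning. */
    _mm_sfence();
}

The unaligned variant differs only in that the source loads use movdqu; the non-temporal stores still require a 16-byte-aligned destination.
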
---
xddm/display/amd64/x64.asm | 236 +++++++++++++++++++++++++++++++++++++++++++++
xddm/display/res.c | 13 +--
2 files changed, 241 insertions(+), 8 deletions(-)
diff --git a/xddm/display/amd64/x64.asm b/xddm/display/amd64/x64.asm
index 36971d3..bb45d33 100644
--- a/xddm/display/amd64/x64.asm
+++ b/xddm/display/amd64/x64.asm
@@ -11,4 +11,240 @@ CheckAndSetSSE2 proc
ret
CheckAndSetSSE2 endp
+RestoreFPU proc
+; rcx PDev *pdev (unused)
+; rdx size_t aligned_addr - 16-byte-aligned save area
+ movdqa xmm0, [rdx]
+ movdqa xmm1, [rdx + 16]
+ movdqa xmm2, [rdx + 32]
+ movdqa xmm3, [rdx + 48]
+ ret
+RestoreFPU endp
+
+SaveFPU proc
+; rcx PDev *pdev (unused)
+; rdx size_t aligned_addr - 16-byte-aligned save area
+ movdqa [rdx], xmm0
+ movdqa [rdx + 16], xmm1
+ movdqa [rdx + 32], xmm2
+ movdqa [rdx + 48], xmm3
+ ret
+SaveFPU endp
+
+fast_memcpy_aligned proc
+; rcx void *dest
+; rdx const void *src
+; r8 size_t len
+ ; Save rsi and rdi (nonvolatile) in scratch r9/r10; safe, no calls are made below
+ mov r9, rsi
+ mov r10, rdi
+
+ mov rsi, rdx
+ mov rdi, rcx
+ mov rcx, r8
+
+ cmp rcx, 128
+ jb try_to_copy64
+
+ prefetchnta [rsi]
+ copy_128:
+ prefetchnta [rsi + 64]
+
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rsi + 16]
+ movdqa xmm2, [rsi + 32]
+ movdqa xmm3, [rsi + 48]
+
+ prefetchnta [rsi + 128]
+
+ movntdq [rdi], xmm0
+ movntdq [rdi + 16], xmm1
+ movntdq [rdi + 32], xmm2
+ movntdq [rdi + 48], xmm3
+
+ movdqa xmm0, [rsi + 64]
+ movdqa xmm1, [rsi + 80]
+ movdqa xmm2, [rsi + 96]
+ movdqa xmm3, [rsi + 112]
+
+ movntdq [rdi + 64], xmm0
+ movntdq [rdi + 80], xmm1
+ movntdq [rdi + 96], xmm2
+ movntdq [rdi + 112], xmm3
+
+ add rdi, 128
+ add rsi, 128
+ sub rcx, 128
+ cmp rcx, 128
+ jae copy_128
+
+ try_to_copy64:
+ cmp rcx, 64
+ jb try_to_copy32
+
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rsi + 16]
+ movdqa xmm2, [rsi + 32]
+ movdqa xmm3, [rsi + 48]
+
+ movntdq [rdi], xmm0
+ movntdq [rdi + 16], xmm1
+ movntdq [rdi + 32], xmm2
+ movntdq [rdi + 48], xmm3
+
+ add rdi, 64
+ add rsi, 64
+ sub rcx, 64
+ prefetchnta [rsi]
+
+ try_to_copy32:
+ cmp rcx, 32
+ jb try_to_copy16
+
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rsi + 16]
+ movntdq [rdi], xmm0
+ movntdq [rdi + 16], xmm1
+
+ add rdi, 32
+ add rsi, 32
+ sub rcx, 32
+
+ try_to_copy16:
+ cmp rcx, 16
+ jb try_to_copy4
+
+ movdqa xmm0, [rsi]
+ movntdq [rdi], xmm0
+
+ add rdi, 16
+ add rsi, 16
+ sub rcx, 16
+
+
+ try_to_copy4:
+ cmp rcx, 4
+ jb try_to_copy_1
+ movsd
+ sub rcx, 4
+ jmp try_to_copy4
+
+ try_to_copy_1:
+ rep movsb
+
+ sfence
+ ; Restore rsi and rdi
+ mov rsi, r9
+ mov rdi, r10
+ ret
+fast_memcpy_aligned endp
+
+fast_memcpy_unaligned proc
+; rcx void *dest
+; rdx const void *src
+; r8 size_t len
+ ; Save rsi and rdi (nonvolatile) in scratch r9/r10; safe, no calls are made below
+ mov r9, rsi
+ mov r10, rdi
+
+ mov rsi, rdx
+ mov rdi, rcx
+ mov rcx, r8
+
+ cmp rcx, 128
+ jb try_to_copy64
+
+ prefetchnta [rsi]
+ copy_128:
+ prefetchnta [rsi + 64]
+
+ movdqu xmm0, [rsi]
+ movdqu xmm1, [rsi + 16]
+ movdqu xmm2, [rsi + 32]
+ movdqu xmm3, [rsi + 48]
+
+ prefetchnta [rsi + 128]
+
+ movntdq [rdi], xmm0
+ movntdq [rdi + 16], xmm1
+ movntdq [rdi + 32], xmm2
+ movntdq [rdi + 48], xmm3
+
+ movdqu xmm0, [rsi + 64]
+ movdqu xmm1, [rsi + 80]
+ movdqu xmm2, [rsi + 96]
+ movdqu xmm3, [rsi + 112]
+
+ movntdq [rdi + 64], xmm0
+ movntdq [rdi + 80], xmm1
+ movntdq [rdi + 96], xmm2
+ movntdq [rdi + 112], xmm3
+
+ add rdi, 128
+ add rsi, 128
+ sub rcx, 128
+ cmp rcx, 128
+ jae copy_128
+
+ try_to_copy64:
+ cmp rcx, 64
+ jb try_to_copy32
+
+ movdqu xmm0, [rsi]
+ movdqu xmm1, [rsi + 16]
+ movdqu xmm2, [rsi + 32]
+ movdqu xmm3, [rsi + 48]
+
+ movntdq [rdi], xmm0
+ movntdq [rdi + 16], xmm1
+ movntdq [rdi + 32], xmm2
+ movntdq [rdi + 48], xmm3
+
+ add rdi, 64
+ add rsi, 64
+ sub rcx, 64
+ prefetchnta [rsi]
+
+ try_to_copy32:
+ cmp rcx, 32
+ jb try_to_copy16
+
+ movdqu xmm0, [rsi]
+ movdqu xmm1, [rsi + 16]
+ movntdq [rdi], xmm0
+ movntdq [rdi + 16], xmm1
+
+ add rdi, 32
+ add rsi, 32
+ sub rcx, 32
+
+ try_to_copy16:
+ cmp rcx, 16
+ jb try_to_copy4
+
+ movdqu xmm0, [rsi]
+ movntdq [rdi], xmm0
+
+ add rdi, 16
+ add rsi, 16
+ sub rcx, 16
+
+
+ try_to_copy4:
+ cmp rcx, 4
+ jb try_to_copy_1
+ movsd
+ sub rcx, 4
+ jmp try_to_copy4
+
+ try_to_copy_1:
+ rep movsb
+
+ sfence
+ ; restore rsi and rdi
+ mov rsi, r9
+ mov rdi, r10
+ ret
+fast_memcpy_unaligned endp
+
end
\ No newline at end of file
diff --git a/xddm/display/res.c b/xddm/display/res.c
index 9320dd1..60e9bcb 100644
--- a/xddm/display/res.c
+++ b/xddm/display/res.c
@@ -36,6 +36,11 @@
#include "devioctl.h"
#include "ntddvdeo.h"
+void SaveFPU(PDev *pdev, size_t aligned_addr);
+void RestoreFPU(PDev *pdev, size_t aligned_addr);
+void fast_memcpy_unaligned(void *dest, const void *src, size_t len);
+void fast_memcpy_aligned(void *dest, const void *src, size_t len);
+
static _inline QXLPHYSICAL PA(PDev *pdev, PVOID virt, UINT8 slot_id)
{
PMemSlot *p_slot = &pdev->mem_slots[slot_id];
@@ -1312,7 +1317,6 @@ static void __PutBytesAlign(PDev *pdev, QXLDataChunk **chunk_ptr, UINT8 **now_pt
NEW_DATA_CHUNK(page_counter, aligned_size);
cp_size = (int)MIN(end - now, size);
}
-#ifndef _WIN64
if (use_sse) {
offset = (size_t)now & SSE_MASK;
if (offset) {
@@ -1341,9 +1345,6 @@ static void __PutBytesAlign(PDev *pdev, QXLDataChunk **chunk_ptr, UINT8 **now_pt
} else {
RtlCopyMemory(now, src, cp_size);
}
-#else
- RtlCopyMemory(now, src, cp_size);
-#endif
src += cp_size;
now += cp_size;
chunk->data_size += cp_size;
@@ -1905,21 +1906,17 @@ static _inline Resource *GetBitmapImage(PDev *pdev, SURFOBJ *surf, XLATEOBJ *col
dest_end = (UINT8 *)image_res + alloc_size;
alloc_size = height * line_size;
-#ifndef _WIN64
if (have_sse2 && alloc_size >= 1024) {
use_sse = TRUE;
SaveFPU(pdev, FPUSave);
}
-#endif
for (; src != src_end; src -= surf->lDelta, alloc_size -= line_size) {
PutBytesAlign(pdev, &chunk, &dest, &dest_end, src, line_size,
&pdev->num_bits_pages, alloc_size, line_size, use_sse);
}
-#ifndef _WIN64
if (use_sse) {
RestoreFPU(pdev, FPUSave);
}
-#endif
GetPallette(pdev, &internal->image.bitmap, color_trans);
DEBUG_PRINT((pdev, 13, "%s: done\n", __FUNCTION__));
--
1.9.0