[Spice-devel] [PATCH 11/12] add benchmarks for memcpy (RtlCopyMemory) vs fast - fast is slower
Marc-André Lureau
mlureau at redhat.com
Fri May 2 04:21:00 PDT 2014
----- Original Message -----
> Two benchmarks:
> 1. standalone, actually uses LIBCMT's memcpy to compare.
> 2. Part of qxldd.dll (in display/res.c called from display/driver.c)
> so using the same implementation.
>
> Note: next commit removes benchmark code as well as fast_memcpy_* code
> and related SSE check and FPU save/restore.
Why not keep the benchmark code in a separate "benchmark" branch, instead of adding it in this commit and removing it again in the next one?
> Results for 32-bit Windows 7 show ~2 times better performance for
> the *system* memcpy.
> On 64-bit Windows 7 the system memcpy is ~2 times faster for copies of
> size < 128 KB, and approximately the same for 128 KB <= size <= 1 MB.
>
> More complete results:
> Note: for 64 bit with 262144 <= size <= 1048576 I got mixed results,
> depending on the order of performing the comparison - if memcpy was run
> first I got better results for fast_memcpy, by about 20%, for 64 bit
> only (for 32 bit memcpy/RtlCopyMemory was still faster).
>
> Windows 7 32 bit:
> size [bytes] our time/system time [percent]
> 1024 232
> 2048 352
> 4096 681
> 8192 303
> 16384 455
> 32768 403
> 65536 352
> 131072 232
> 262144 232
> 524288 152
> 1048576 177
>
> Windows 7 64 bit:
> size [bytes] our time/system time [percent]
> 1024 140
> 2048 204
> 4096 200
> 8192 198
> 16384 232
> 32768 173
> 65536 272
> 131072 177
> 262144 134
> 524288 115
> 1048576 115
> ---
> xddm/display/benchmark_memcpy.c | 152
> +++++++++++++++++++++++++++++++++
> xddm/display/driver.c | 4 +
> xddm/display/res.c | 57 +++++++++++++
> xddm/tests/benchmark_format_results.py | 38 +++++++++
> xddm/tests/build_benchmark.bat | 7 ++
> 5 files changed, 258 insertions(+)
> create mode 100644 xddm/display/benchmark_memcpy.c
> create mode 100644 xddm/tests/benchmark_format_results.py
> create mode 100644 xddm/tests/build_benchmark.bat
>
> diff --git a/xddm/display/benchmark_memcpy.c
> b/xddm/display/benchmark_memcpy.c
> new file mode 100644
> index 0000000..fa44577
> --- /dev/null
> +++ b/xddm/display/benchmark_memcpy.c
> @@ -0,0 +1,152 @@
> +#include <windows.h>
> +#include <stdio.h>
> +
> +#ifdef _WIN64
> +int have_sse2 = 0;
> +void fast_memcpy_aligned(void *dest, const void *src, size_t len);
> +#else
> +static _inline void fast_memcpy_aligned(void *dest, const void *src, size_t
> len)
> +{
> + _asm
> + {
> + mov ecx, len
> + mov esi, src
> + mov edi, dest
> +
> + cmp ecx, 128
> + jb try_to_copy64
> +
> + prefetchnta [esi]
> + copy_128:
> + prefetchnta [esi + 64]
> +
> + movdqa xmm0, [esi]
> + movdqa xmm1, [esi + 16]
> + movdqa xmm2, [esi + 32]
> + movdqa xmm3, [esi + 48]
> +
> + prefetchnta [esi + 128]
> +
> + movntdq [edi], xmm0
> + movntdq [edi + 16], xmm1
> + movntdq [edi + 32], xmm2
> + movntdq [edi + 48], xmm3
> +
> + movdqa xmm0, [esi + 64]
> + movdqa xmm1, [esi + 80]
> + movdqa xmm2, [esi + 96]
> + movdqa xmm3, [esi + 112]
> +
> + movntdq [edi + 64], xmm0
> + movntdq [edi + 80], xmm1
> + movntdq [edi + 96], xmm2
> + movntdq [edi + 112], xmm3
> +
> + add edi, 128
> + add esi, 128
> + sub ecx, 128
> + cmp ecx, 128
> + jae copy_128
> +
> + try_to_copy64:
> + cmp ecx, 64
> + jb try_to_copy32
> +
> + movdqa xmm0, [esi]
> + movdqa xmm1, [esi + 16]
> + movdqa xmm2, [esi + 32]
> + movdqa xmm3, [esi + 48]
> +
> + movntdq [edi], xmm0
> + movntdq [edi + 16], xmm1
> + movntdq [edi + 32], xmm2
> + movntdq [edi + 48], xmm3
> +
> + add edi, 64
> + add esi, 64
> + sub ecx, 64
> + prefetchnta [esi]
> +
> + try_to_copy32:
> + cmp ecx, 32
> + jb try_to_copy16
> +
> + movdqa xmm0, [esi]
> + movdqa xmm1, [esi + 16]
> + movntdq [edi], xmm0
> + movntdq [edi + 16], xmm1
> +
> + add edi, 32
> + add esi, 32
> + sub ecx, 32
> +
> + try_to_copy16:
> + cmp ecx, 16
> + jb try_to_copy4
> +
> + movdqa xmm0, [esi]
> + movntdq [edi], xmm0
> +
> + add edi, 16
> + add esi, 16
> + sub ecx, 16
> +
> +
> + try_to_copy4:
> + cmp ecx, 4
> + jb try_to_copy_1
> + movsd
> + sub ecx, 4
> + jmp try_to_copy4
> +
> + try_to_copy_1:
> + rep movsb
> +
> + sfence
> + }
> +}
> +#endif
> +
> +typedef unsigned long long uint64_t;
> +
> +uint64_t time_usecs(void)
> +{
> + SYSTEMTIME systime;
> + GetSystemTime(&systime);
> + return systime.wMilliseconds * 1000 + systime.wSecond * 1e6 +
> systime.wMinute * 60e6 + systime.wHour * 3600e6;
> +}
> +
> +int main(void)
> +{
> + int i;
> + unsigned char *src_unaligned;
> + unsigned char *dest_unaligned;
> + uint64_t start, total1, total2;
> + unsigned char *src = NULL;
> + unsigned char *dest = NULL;
> + size_t size = 1024;
> + size_t iter = 1024 * 1024;
> +
> + printf("fast_memcpy compared to memcpy (< 1.0 means memcpy is better)\n");
> + for (size = 1024; size < 1024*1024*2; size *= 2, iter /= 2) {
> + src_unaligned = malloc(size + 15);
> + dest_unaligned = malloc(size + 15);
> + src = (unsigned char *)((size_t)(src_unaligned + 15) & ~0xf);
> + dest = (unsigned char *)((size_t)(dest_unaligned + 15) & ~0xf);
> + start = time_usecs();
> + for (i = 0 ; i < iter ; ++i)
> + memcpy(dest, src, size);
> + total1 = time_usecs() - start;
> +
> + start = time_usecs();
> + for (i = 0 ; i < iter ; ++i)
> + fast_memcpy_aligned(dest, src, size);
> + total2 = time_usecs() - start;
> +
> + printf("%d: %f (%d, ", size, ((float)total1) / total2, total1);
> + printf("%d)\n", total2);
> + free(src_unaligned);
> + free(dest_unaligned);
> + }
> + return 0;
> +}
> diff --git a/xddm/display/driver.c b/xddm/display/driver.c
> index 5a3dbfa..bed1d58 100644
> --- a/xddm/display/driver.c
> +++ b/xddm/display/driver.c
> @@ -903,6 +903,8 @@ VOID EnableQXLPrimarySurface(PDev *pdev)
> pdev->surf_enable = TRUE;
> }
>
> +void benchmark_memcpy(PDev *pdev);
> +
> HSURF DrvEnableSurface(DHPDEV in_pdev)
> {
> PDev *pdev;
> @@ -941,6 +943,8 @@ HSURF DrvEnableSurface(DHPDEV in_pdev)
>
> EnableQXLPrimarySurface(pdev);
>
> + benchmark_memcpy(pdev);
> +
> DEBUG_PRINT((pdev, 1, "%s: 0x%lx exit\n", __FUNCTION__, pdev));
> return surf;
>
> diff --git a/xddm/display/res.c b/xddm/display/res.c
> index 60e9bcb..589218b 100644
> --- a/xddm/display/res.c
> +++ b/xddm/display/res.c
> @@ -1283,6 +1283,63 @@ static _inline void fast_memcpy_unaligment(void *dest,
> const void *src, size_t l
>
> #endif
>
> +uint64_t time_usecs(void)
> +{
> + ENG_TIME_FIELDS systime;
> + EngQueryLocalTime(&systime);
> + return (uint64_t)(systime.usMilliseconds * 1000 + systime.usSecond * 1e6 +
> + systime.usMinute * 60e6 + systime.usHour * 3600e6);
> +}
> +
> +void benchmark_memcpy(PDev *pdev)
> +{
> + size_t i;
> + unsigned char *src_unaligned;
> + unsigned char *dest_unaligned;
> + uint64_t start, total1, total2;
> + unsigned char *src = NULL;
> + unsigned char *dest = NULL;
> + size_t size = 1024;
> + size_t iter = 1024 * 1024;
> +
> + for (size = 1024; size < 1024*1024*2; size *= 2, iter /= 2) {
> + src_unaligned = EngAllocMem(0, size + 31, ALLOC_TAG);
> + dest_unaligned = EngAllocMem(0, size + 31, ALLOC_TAG);
> + src = (unsigned char *)((size_t)(src_unaligned + 31) & ~0x1f);
> + dest = (unsigned char *)((size_t)(dest_unaligned + 31) & ~0x1f);
> +
> + for (i = 0 ; i < size ; ++i)
> + src[i] = i;
> +
> + start = time_usecs();
> + for (i = 0 ; i < iter ; ++i) {
> + fast_memcpy_aligned(dest, src, size);
> + }
> + total2 = time_usecs() - start;
> +
> + {
> + int errors = 0;
> + for (i = 0 ; i < size ; ++i) {
> + if (dest[i] != src[i]) {
> + errors++;
> + }
> + }
> + if (errors > 0) {
> + DEBUG_PRINT((pdev, 1, "!!! copy errors %d !!!\n", errors));
> + }
> + }
> +
> + start = time_usecs();
> + for (i = 0 ; i < iter ; ++i)
> + memcpy(dest, src, size);
> + total1 = time_usecs() - start;
> +
> + DEBUG_PRINT((pdev, 1, "%d: %lld, %lld\n", size, total1, total2));
> + EngFreeMem(src_unaligned);
> + EngFreeMem(dest_unaligned);
> + }
> +}
> +
> #ifdef DBG
> #define PutBytesAlign __PutBytesAlign
> #define PutBytes(pdev, chunk, now, end, src, size, page_counter, alloc_size,
> use_sse)\
> diff --git a/xddm/tests/benchmark_format_results.py
> b/xddm/tests/benchmark_format_results.py
> new file mode 100644
> index 0000000..96d302b
> --- /dev/null
> +++ b/xddm/tests/benchmark_format_results.py
> @@ -0,0 +1,38 @@
> +import sys
> +
> +win7_32="""qxl/guest-0: 96463384453: qxldd: 1024: 47000, 109000
> +qxl/guest-0: 96591785177: qxldd: 2048: 31000, 109000
> +qxl/guest-0: 96722899152: qxldd: 4096: 16000, 109000
> +qxl/guest-0: 96851422238: qxldd: 8192: 31000, 94000
> +qxl/guest-0: 97013842048: qxldd: 16384: 31000, 141000
> +qxl/guest-0: 97167323122: qxldd: 32768: 31000, 125000
> +qxl/guest-0: 97316872306: qxldd: 65536: 31000, 109000
> +qxl/guest-0: 97465747407: qxldd: 131072: 47000, 109000
> +qxl/guest-0: 97624668249: qxldd: 262144: 47000, 109000
> +qxl/guest-0: 97785876639: qxldd: 524288: 62000, 94000
> +qxl/guest-0: 97953480643: qxldd: 1048576: 62000, 110000
> +"""
> +
> +win7_64="""
> +qxl/guest-0: 2278149101498: qxldd: 1024: 78000, 109000
> +qxl/guest-0: 2278288271327: qxldd: 2048: 46000, 94000
> +qxl/guest-0: 2278428135167: qxldd: 4096: 47000, 94000
> +qxl/guest-0: 2278575078269: qxldd: 8192: 47000, 93000
> +qxl/guest-0: 2278734906600: qxldd: 16384: 47000, 109000
> +qxl/guest-0: 2278896881683: qxldd: 32768: 63000, 109000
> +qxl/guest-0: 2279073699223: qxldd: 65536: 46000, 125000
> +qxl/guest-0: 2279250403663: qxldd: 131072: 62000, 110000
> +qxl/guest-0: 2279467314681: qxldd: 262144: 93000, 125000
> +qxl/guest-0: 2279693375414: qxldd: 524288: 109000, 125000
> +qxl/guest-0: 2279929972847: qxldd: 1048576: 109000, 125000
> +"""
> +
> +filt = lambda txt: filt2(filt1(txt))
> +filt2 = lambda data: [(s, system, ours*100.0/system) for t, s, system, ours
> in data]
> +filt1 = lambda txt: map(lambda a: (int(a[1][:-1]), int(a[3][:-1]),
> int(a[4][:-1]), int(a[5])), map(lambda l: l.strip().split(), [l for l in
> txt.split('\n') if l.strip() != '']))
> +display = lambda txt: sys.stdout.write('\n'.join('%10s %10s' % (a, '%3.0f' %
> c) for a, b, c in filt(txt))+'\n')
> +
> +print('size [bytes]'.ljust(18) + 'our time/system time [percent]')
> +display(win7_32)
> +print('size [bytes]'.ljust(18) + 'our time/system time [percent]')
> +display(win7_64)
> diff --git a/xddm/tests/build_benchmark.bat b/xddm/tests/build_benchmark.bat
> new file mode 100644
> index 0000000..a184249
> --- /dev/null
> +++ b/xddm/tests/build_benchmark.bat
> @@ -0,0 +1,7 @@
> +cl /Zi /nologo /c /I %CRT_INC_PATH% ..\display\benchmark_memcpy.c
> +if defined AMD64 (
> +ml64 /c /Zd ..\display\amd64\x64.asm
> +link /nologo /debug /libpath:%BASEDIR%\lib\crt\amd64\
> /libpath:%DDK_LIB_DEST%\amd64 x64.obj benchmark_memcpy.obj
> +) else (
> +link /nologo /debug /libpath:%BASEDIR%\lib\crt\i386\
> /libpath:%DDK_LIB_DEST%\i386 benchmark_memcpy.obj
> +)
> --
> 1.9.0
>
> _______________________________________________
> Spice-devel mailing list
> Spice-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/spice-devel
>
More information about the Spice-devel mailing list