[PATCH 35/36] x86: Benchmark the copy_user assembly routines for x86-64
Chris Wilson
chris at chris-wilson.co.uk
Wed May 31 20:28:01 UTC 2017
Generates results like the following (copy size in bytes, throughput in MiB/s for each routine), on Broadwell:
[ 1.084868] benchmark_copy_user(cold):
size unrolled string fast-string generic
[ 1.570211] 1 10 10 10 10
[ 2.056518] 2 21 20 20 21
[ 2.539453] 4 44 42 42 44
[ 3.021847] 6 67 63 63 66
[ 3.506297] 8 91 89 84 88
[ 3.988532] 12 135 129 126 134
[ 4.472073] 16 179 178 169 181
[ 4.953796] 24 268 267 255 269
[ 5.435589] 32 359 355 342 356
[ 5.916646] 48 540 534 514 536
[ 6.398404] 64 713 715 687 711
[ 6.890398] 96 718 716 579 717
[ 7.382498] 128 958 955 783 951
[ 7.878221] 192 1088 1100 865 1074
[ 8.391972] 256 1105 1117 979 1106
[ 8.905690] 384 1164 1168 1042 1164
[ 9.447616] 512 1264 1270 1237 1267
[ 9.969166] 768 1275 1292 1287 1287
[ 10.518342] 1024 1291 1347 1305 1330
[ 11.129231] 1536 1309 1404 1402 1401
[ 11.693534] 2048 1337 1398 1395 1396
[ 12.527074] 3072 1360 1403 1404 1404
[ 13.501876] 4096 1378 1424 1425 1423
[ 14.424982] 6144 1376 1441 1441 1441
[ 15.652224] 8192 1384 1439 1440 1440
[ 17.451471] 12288 1407 1560 1564 1480
[ 19.831951] 16384 1424 1589 1589 1514
[ 22.195824] 24576 1436 1534 1537 1536
[ 24.547366] 32768 1503 1617 1619 1553
[ 26.874961] 49152 1514 1663 1666 1667
[ 29.185502] 65536 1523 1689 1692 1692
[ 31.478786] 98304 1533 1723 1726 1724
[ 33.769143] 131072 1547 1733 1740 1651
[ 36.030720] 196608 1645 1661 1664 1664
[ 38.281924] 262144 1732 1666 1670 1671
[ 40.519900] 393216 1923 1724 1741 1742
[ 42.758308] 524288 1997 1748 1752 1753
[ 44.984908] 786432 2090 1767 1769 1768
[ 47.210658] 1048576 2177 1811 1802 1800
[ 47.210895] benchmark_copy_user(hot):
size unrolled string fast-string generic
[ 47.748956] 1 158 77 79 167
[ 48.286764] 2 306 154 158 326
[ 48.824514] 4 614 308 317 555
[ 49.362420] 6 926 462 476 764
[ 49.900384] 8 1344 298 635 1206
[ 50.438453] 12 1773 482 952 1485
[ 50.976273] 16 2797 602 1269 1863
[ 51.514170] 24 4020 903 1906 2630
[ 52.052022] 32 5055 1204 2540 3391
[ 52.589984] 48 6150 1806 3810 4338
[ 53.127804] 64 9564 2409 5082 8637
[ 53.665852] 96 13583 3612 6483 9265
[ 54.203970] 128 18108 4815 8434 13747
[ 54.742200] 192 21537 6258 12064 17893
[ 55.280208] 256 21579 7510 15355 20318
[ 55.819082] 384 21617 9598 21160 20757
[ 56.357612] 512 21631 11146 26086 20980
[ 56.896935] 768 21653 17864 34023 32405
[ 57.436693] 1024 21664 22289 40085 38411
[ 57.977938] 1536 21661 29575 48848 47189
[ 58.518943] 2048 21678 35479 54799 53237
[ 59.059616] 3072 21268 44170 62432 61048
[ 59.603317] 4096 21374 50171 67114 65959
[ 60.144534] 6144 21476 58358 72517 71611
[ 60.698266] 8192 21526 63527 75597 74818
[ 61.253098] 12288 21574 69707 78932 78379
[ 61.818717] 16384 20574 69146 76810 76085
[ 62.397996] 24576 16733 21608 21667 21665
[ 62.975917] 32768 16779 21667 21729 21723
[ 63.553200] 49152 16838 21660 21728 21753
[ 64.129602] 65536 16882 21698 21740 21797
[ 64.706081] 98304 16867 21711 21769 21746
[ 65.285680] 131072 16712 21629 21719 21721
[ 65.885188] 196608 11332 19835 19971 19974
[ 66.440236] 262144 11327 18223 18518 18516
[ 66.994009] 393216 11331 14680 14768 14768
[ 67.551185] 524288 11334 14652 14775 14775
[ 68.108775] 786432 11332 14600 14778 14777
[ 68.671271] 1048576 11337 14533 14775 14776
and on Broxton:
[ 0.691194] benchmark_copy_user(cold):
size unrolled string fast-string generic
[ 1.178069] 1 15 14 13 15
[ 1.660684] 2 30 29 29 31
[ 2.146557] 4 60 67 70 68
[ 2.631990] 6 103 105 105 103
[ 3.115012] 8 137 141 142 139
[ 3.597576] 12 209 200 214 205
[ 4.079667] 16 279 285 285 277
[ 4.561716] 24 418 428 428 418
[ 5.043600] 32 558 571 571 558
[ 5.525527] 48 838 857 857 837
[ 6.007451] 64 1116 1143 1134 1117
[ 6.498654] 96 999 999 999 991
[ 6.991405] 128 1346 1345 1344 1334
[ 7.474368] 192 1703 1449 1442 1702
[ 7.954307] 256 1686 1670 1685 1683
[ 8.449810] 384 1891 1890 1889 1891
[ 8.944714] 512 2642 2513 2513 2661
[ 9.447221] 768 4865 3534 3654 4666
[ 9.954514] 1024 5925 4213 4167 4181
[ 10.473064] 1536 5928 5435 5037 5723
[ 11.011117] 2048 5904 6841 5958 6924
[ 11.559081] 3072 8120 7868 8475 8082
[ 12.162886] 4096 7679 8956 8950 8863
[ 12.884126] 6144 5438 5441 5302 5195
[ 13.542026] 8192 5612 5687 5697 5761
[ 14.523619] 12288 5674 5720 5734 5728
[ 15.295379] 16384 5680 5696 5699 5686
[ 16.078191] 24576 5509 5446 5425 5448
[ 16.869596] 32768 5349 5293 5296 5306
[ 17.663901] 49152 5151 5267 5274 5289
[ 18.460441] 65536 5102 5269 5274 5271
[ 19.257257] 98304 5058 5296 5298 5299
[ 20.052725] 131072 5061 5333 5330 5329
[ 20.847417] 196608 5060 5336 5337 5332
[ 21.642442] 262144 5071 5349 5345 5346
[ 22.436005] 393216 5078 5359 5357 5357
[ 23.235470] 524288 5039 5341 5348 5343
[ 24.233473] 786432 3149 3399 3400 3398
[ 25.278839] 1048576 3015 3150 3148 3152
[ 25.278853] benchmark_copy_user(hot):
size unrolled string fast-string generic
[ 25.816821] 1 270 52 53 234
[ 26.354850] 2 364 106 109 273
[ 26.892838] 4 460 213 218 397
[ 27.430631] 6 486 305 312 468
[ 27.969025] 8 1250 253 437 972
[ 28.507017] 12 1009 332 625 972
[ 29.045381] 16 2059 514 897 1667
[ 29.584242] 24 2624 672 1071 2187
[ 30.122465] 32 3043 1014 1750 2592
[ 30.660963] 48 3620 1499 2561 3181
[ 31.199311] 64 7777 1971 3333 6364
[ 31.738052] 96 7499 2876 4772 6999
[ 32.275679] 128 9999 3733 6088 9654
[ 32.813284] 192 11052 5316 8400 10767
[ 33.352050] 256 11665 6746 10371 11427
[ 33.890995] 384 12351 9230 13549 12173
[ 34.429906] 512 12583 11312 16000 12042
[ 34.969950] 768 13022 14608 19535 19090
[ 35.509582] 1024 13254 17097 21961 21538
[ 36.052610] 1536 13492 20611 25073 24705
[ 36.595365] 2048 12725 22967 26987 26664
[ 37.142441] 3072 13123 25944 29215 28961
[ 37.683810] 4096 13331 27738 30472 30267
[ 38.240970] 6144 13491 28026 28953 29193
[ 38.797596] 8192 13315 25170 26071 25883
[ 39.372520] 12288 13130 26645 26731 28266
[ 39.963133] 16384 10452 17193 17407 17387
[ 40.578435] 24576 8455 14114 14243 14232
[ 41.216819] 32768 8083 13668 13756 13749
[ 41.881091] 49152 7773 13203 13261 13254
[ 42.533352] 65536 7661 12894 12938 12936
[ 43.220041] 98304 7640 12643 12675 12672
[ 43.843197] 131072 7639 12580 12604 12601
[ 44.468129] 196608 7639 12555 12570 12570
[ 45.093873] 262144 7641 12552 12563 12561
[ 45.718450] 393216 7642 12551 12560 12559
[ 46.373853] 524288 7296 11999 12084 12102
[ 47.020981] 786432 3091 3224 3225 3222
[ 47.686092] 1048576 3037 3159 3158 3158
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
arch/x86/lib/Makefile | 1 +
arch/x86/lib/bench_usercopy.c | 176 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 177 insertions(+)
create mode 100644 arch/x86/lib/bench_usercopy.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 34a74131a12c..5019e8b46ea3 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -27,6 +27,7 @@ lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
+obj-$(CONFIG_X86_64) += bench_usercopy.o
ifeq ($(CONFIG_X86_32),y)
obj-y += atomic64_32.o
diff --git a/arch/x86/lib/bench_usercopy.c b/arch/x86/lib/bench_usercopy.c
new file mode 100644
index 000000000000..b9a500b8cead
--- /dev/null
+++ b/arch/x86/lib/bench_usercopy.c
@@ -0,0 +1,176 @@
+#include <linux/random.h>
+#include <linux/sched/clock.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/sort.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+
+typedef unsigned long (*copy_fn)(void *, const void *, unsigned); /* invoked by the benchmark as func(to, from, size) */
+/* Flag bit: when set, the destination range is flushed from the cache before every copy. */
+#define CU_COLD 0x1
+
+/*
+ * Measure the throughput of a single copy routine.
+ *
+ * @to, @from: destination and source handed to @func on every call
+ * @size: number of bytes per copy
+ * @func: copy routine to time
+ * @flags: CU_COLD to clflush the destination range before every copy
+ *
+ * The loop without the copy (but including the cache flush when CU_COLD
+ * is set) is timed first; that overhead is subtracted from the result.
+ * Batches of maxloop + 1 copies are then repeated until the clock has
+ * advanced by at least 1 << 24 units (~16ms).
+ *
+ * Returns (bytes copied * 10^9 / elapsed clock units) >> 20, which is
+ * MiB/s assuming local_clock() counts nanoseconds. Returns 0 when @size
+ * is 0 or when the measured overhead swallows the whole measurement, as
+ * no meaningful figure can be computed in either case.
+ */
+static unsigned long
+__benchmark_copy_user(void *to, const void *from, unsigned size,
+		      copy_fn func, unsigned int flags)
+{
+	unsigned int maxloop;
+	unsigned long count = 0, loop;
+	u64 start, now, overhead, elapsed;
+	u64 speed;
+
+	/* Guard the division below; nothing to measure for empty copies. */
+	if (!size)
+		return 0;
+
+	/* Cap each batch at 64MiB worth of copies and at most 4096 calls. */
+	maxloop = min((1u << 26) / size, 4096u);
+
+	get_cpu();
+
+	/* Time one batch of everything except the copy itself. */
+	start = local_clock();
+	for (loop = 0; loop <= maxloop; loop++) {
+		if (flags & CU_COLD)
+			clflush_cache_range(to, size);
+		barrier();
+	}
+	now = local_clock();
+	overhead = now - start;
+
+	start = now;
+	do {
+		for (loop = 0; loop <= maxloop; loop++) {
+			if (flags & CU_COLD)
+				clflush_cache_range(to, size);
+			func(to, from, size);
+		}
+
+		count++;
+		now = local_clock();
+	} while (!((now - start) >> 24)); /* ~16ms */
+	put_cpu();
+
+	/*
+	 * local_clock() values are plain u64, not ktime_t, so subtract them
+	 * directly. Refuse to divide by zero or by a wrapped-around value
+	 * if the overhead estimate is not smaller than the measured time.
+	 */
+	elapsed = now - start;
+	if (elapsed <= count * overhead)
+		return 0;
+	elapsed -= count * overhead;
+
+	/* Total bytes copied, scaled so that the division yields bytes/s. */
+	speed = count;
+	speed *= maxloop + 1;
+	speed *= size;
+	speed *= 1000 * 1000 * 1000;
+	return div64_u64(speed, elapsed) >> 20;
+}
+
+/*
+ * Table of copy routines to benchmark, terminated by an all-zero entry
+ * (fn == NULL). An entry with a non-zero cpu_feature is only used when
+ * boot_cpu_has() reports that feature.
+ */
+static const struct copy_user {
+	copy_fn fn;
+	const char *name;
+	unsigned int cpu_feature;
+} copy_user_methods[] = {
+	{
+		.fn = copy_user_generic_unrolled,
+		.name = "unrolled",
+		.cpu_feature = 0,
+	},
+	{
+		.fn = copy_user_generic_string,
+		.name = "string",
+		.cpu_feature = X86_FEATURE_REP_GOOD,
+	},
+	{
+		.fn = copy_user_enhanced_fast_string,
+		.name = "fast-string",
+		.cpu_feature = X86_FEATURE_ERMS,
+	},
+	{
+		.fn = (copy_fn)raw_copy_to_user,
+		.name = "generic",
+		.cpu_feature = 0,
+	},
+	{ }
+};
+
+/*
+ * Print the table heading for one benchmark pass: "<name>(<mode>):" on
+ * the first line, then a "size" column followed by one column per copy
+ * routine that is usable on this CPU.
+ *
+ * @name: label printed before the parenthesised mode
+ * @mode: mode label printed inside the parentheses
+ */
+static void __header_methods(const char *name, const char *mode)
+{
+	const struct copy_user *m;
+	char buf[180];
+	int len;
+
+	/*
+	 * scnprintf() returns the number of characters actually stored, so
+	 * len can never run past the buffer and sizeof(buf) - len cannot
+	 * underflow if the line is ever truncated.
+	 */
+	len = scnprintf(buf, sizeof(buf), "%s(%s):\n%10s", name, mode, "size");
+	for (m = copy_user_methods; m->fn; m++) {
+		/* Skip routines whose required CPU feature is missing. */
+		if (m->cpu_feature && !boot_cpu_has(m->cpu_feature))
+			continue;
+
+		len += scnprintf(buf + len, sizeof(buf) - len,
+				 " %10s", m->name);
+	}
+	pr_info("%s\n", buf);
+}
+
+/*
+ * sort() comparison callback for unsigned long elements.
+ * Returns 1, -1 or 0 when *a is greater than, less than or equal to *b.
+ */
+static int ulong_cmp(const void *a, const void *b)
+{
+	const unsigned long lhs = *(const unsigned long *)a;
+	const unsigned long rhs = *(const unsigned long *)b;
+
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * sort() swap callback: exchange the two unsigned longs at @a and @b.
+ * @size is part of the callback signature and is not used here.
+ */
+static void ulong_swap(void *a, void *b, int size)
+{
+	unsigned long *lhs = a;
+	unsigned long *rhs = b;
+	const unsigned long saved = *lhs;
+
+	*lhs = *rhs;
+	*rhs = saved;
+}
+
+/*
+ * Benchmark every usable copy routine at one copy size and print a
+ * single table row: the size followed by the median result per routine.
+ *
+ * @to, @from: destination and source buffers passed to each routine
+ * @size: bytes per copy
+ * @flags: forwarded to __benchmark_copy_user() (CU_COLD or 0)
+ */
+static void __benchmark_methods(void *to, const void *from, int size,
+				unsigned int flags)
+{
+	const struct copy_user *m;
+	char buf[180];
+	int len;
+
+	/*
+	 * scnprintf() returns what was actually written, keeping len within
+	 * the buffer even if the row were ever truncated.
+	 */
+	len = scnprintf(buf, sizeof(buf), "%10d", size);
+
+	for (m = copy_user_methods; m->fn; m++) {
+		unsigned long speed[7];
+		int i;
+
+		/* Skip routines whose required CPU feature is missing. */
+		if (m->cpu_feature && !boot_cpu_has(m->cpu_feature))
+			continue;
+
+		/* Hot mode: one discarded run before the measured samples. */
+		if (!(flags & CU_COLD))
+			__benchmark_copy_user(to, from, size, m->fn, flags);
+		for (i = 0; i < ARRAY_SIZE(speed); i++) {
+			cond_resched();
+			speed[i] = __benchmark_copy_user(to, from, size,
+							 m->fn, flags);
+		}
+
+		/* Report the median sample rather than a hard-coded index. */
+		sort(speed, ARRAY_SIZE(speed), sizeof(speed[0]),
+		     ulong_cmp, ulong_swap);
+		len += scnprintf(buf + len, sizeof(buf) - len,
+				 " %10lu", speed[ARRAY_SIZE(speed) / 2]);
+	}
+
+	pr_info("%s\n", buf);
+}
+
+/*
+ * Run the whole benchmark: for each mode ("cold", then "hot") print a
+ * heading and one row per copy size. Sizes are every power of two from
+ * 1 byte to 1MiB, with the 1.5x intermediate step (6, 12, 24, ...)
+ * inserted before each power of two from 8 upwards.
+ *
+ * A single 2MiB allocation provides both buffers: the source is the
+ * first half, the destination the second half.
+ *
+ * Returns 0 on success or -ENOMEM if the buffer cannot be allocated.
+ */
+static int benchmark_copy_user(void)
+{
+	const unsigned int maxsz = 1 << 20;
+	const struct {
+		const char *name;
+		unsigned int flags;
+	} modes[] = {
+		{ "cold", CU_COLD },
+		{ "hot", 0 },
+		{ },
+	}, *m;
+	unsigned int order;
+	unsigned int i;
+	void *from, *to;
+	u32 prng;
+
+	from = kmalloc(2*maxsz, GFP_KERNEL);
+	if (!from)
+		return -ENOMEM;
+	to = from + maxsz;
+
+	/*
+	 * Fill the source half with a reproducible pseudo-random pattern.
+	 * The bound is exclusive: the source holds exactly
+	 * maxsz / sizeof(u32) words, one more would land in the
+	 * destination half.
+	 */
+	prng = 0;
+	for (i = 0; i < maxsz/sizeof(u32); i++) {
+		prng = next_pseudo_random32(prng);
+		*((u32 *)from + i) = prng;
+	}
+
+	for (m = modes; m->name; m++) {
+		__header_methods(__func__, m->name);
+		for (order = 0; order <= ilog2(maxsz); order++) {
+			/* Intermediate size: 3/4 of the upcoming power of two. */
+			if (order > 2) {
+				__benchmark_methods(to, from, 3 << (order - 2),
+						    m->flags);
+			}
+
+			__benchmark_methods(to, from, 1 << order, m->flags);
+		}
+	}
+
+	kfree(from);
+	return 0;
+}
+late_initcall(benchmark_copy_user); /* register the benchmark as a late initcall; terminate the statement explicitly */
--
2.11.0
More information about the Intel-gfx-trybot
mailing list