[PATCH 35/36] x86: Benchmark the copy_user assembly routines for x86-64

Chris Wilson chris at chris-wilson.co.uk
Wed May 31 20:28:01 UTC 2017


Generates results like, on Broadwell:

[    1.084868] benchmark_copy_user(cold):
                     size   unrolled     string fast-string    generic
[    1.570211]          1         10         10         10         10
[    2.056518]          2         21         20         20         21
[    2.539453]          4         44         42         42         44
[    3.021847]          6         67         63         63         66
[    3.506297]          8         91         89         84         88
[    3.988532]         12        135        129        126        134
[    4.472073]         16        179        178        169        181
[    4.953796]         24        268        267        255        269
[    5.435589]         32        359        355        342        356
[    5.916646]         48        540        534        514        536
[    6.398404]         64        713        715        687        711
[    6.890398]         96        718        716        579        717
[    7.382498]        128        958        955        783        951
[    7.878221]        192       1088       1100        865       1074
[    8.391972]        256       1105       1117        979       1106
[    8.905690]        384       1164       1168       1042       1164
[    9.447616]        512       1264       1270       1237       1267
[    9.969166]        768       1275       1292       1287       1287
[   10.518342]       1024       1291       1347       1305       1330
[   11.129231]       1536       1309       1404       1402       1401
[   11.693534]       2048       1337       1398       1395       1396
[   12.527074]       3072       1360       1403       1404       1404
[   13.501876]       4096       1378       1424       1425       1423
[   14.424982]       6144       1376       1441       1441       1441
[   15.652224]       8192       1384       1439       1440       1440
[   17.451471]      12288       1407       1560       1564       1480
[   19.831951]      16384       1424       1589       1589       1514
[   22.195824]      24576       1436       1534       1537       1536
[   24.547366]      32768       1503       1617       1619       1553
[   26.874961]      49152       1514       1663       1666       1667
[   29.185502]      65536       1523       1689       1692       1692
[   31.478786]      98304       1533       1723       1726       1724
[   33.769143]     131072       1547       1733       1740       1651
[   36.030720]     196608       1645       1661       1664       1664
[   38.281924]     262144       1732       1666       1670       1671
[   40.519900]     393216       1923       1724       1741       1742
[   42.758308]     524288       1997       1748       1752       1753
[   44.984908]     786432       2090       1767       1769       1768
[   47.210658]    1048576       2177       1811       1802       1800
[   47.210895] benchmark_copy_user(hot):
                     size   unrolled     string fast-string    generic
[   47.748956]          1        158         77         79        167
[   48.286764]          2        306        154        158        326
[   48.824514]          4        614        308        317        555
[   49.362420]          6        926        462        476        764
[   49.900384]          8       1344        298        635       1206
[   50.438453]         12       1773        482        952       1485
[   50.976273]         16       2797        602       1269       1863
[   51.514170]         24       4020        903       1906       2630
[   52.052022]         32       5055       1204       2540       3391
[   52.589984]         48       6150       1806       3810       4338
[   53.127804]         64       9564       2409       5082       8637
[   53.665852]         96      13583       3612       6483       9265
[   54.203970]        128      18108       4815       8434      13747
[   54.742200]        192      21537       6258      12064      17893
[   55.280208]        256      21579       7510      15355      20318
[   55.819082]        384      21617       9598      21160      20757
[   56.357612]        512      21631      11146      26086      20980
[   56.896935]        768      21653      17864      34023      32405
[   57.436693]       1024      21664      22289      40085      38411
[   57.977938]       1536      21661      29575      48848      47189
[   58.518943]       2048      21678      35479      54799      53237
[   59.059616]       3072      21268      44170      62432      61048
[   59.603317]       4096      21374      50171      67114      65959
[   60.144534]       6144      21476      58358      72517      71611
[   60.698266]       8192      21526      63527      75597      74818
[   61.253098]      12288      21574      69707      78932      78379
[   61.818717]      16384      20574      69146      76810      76085
[   62.397996]      24576      16733      21608      21667      21665
[   62.975917]      32768      16779      21667      21729      21723
[   63.553200]      49152      16838      21660      21728      21753
[   64.129602]      65536      16882      21698      21740      21797
[   64.706081]      98304      16867      21711      21769      21746
[   65.285680]     131072      16712      21629      21719      21721
[   65.885188]     196608      11332      19835      19971      19974
[   66.440236]     262144      11327      18223      18518      18516
[   66.994009]     393216      11331      14680      14768      14768
[   67.551185]     524288      11334      14652      14775      14775
[   68.108775]     786432      11332      14600      14778      14777
[   68.671271]    1048576      11337      14533      14775      14776

and on Broxton:

[    0.691194] benchmark_copy_user(cold):
                     size   unrolled     string fast-string    generic
[    1.178069]          1         15         14         13         15
[    1.660684]          2         30         29         29         31
[    2.146557]          4         60         67         70         68
[    2.631990]          6        103        105        105        103
[    3.115012]          8        137        141        142        139
[    3.597576]         12        209        200        214        205
[    4.079667]         16        279        285        285        277
[    4.561716]         24        418        428        428        418
[    5.043600]         32        558        571        571        558
[    5.525527]         48        838        857        857        837
[    6.007451]         64       1116       1143       1134       1117
[    6.498654]         96        999        999        999        991
[    6.991405]        128       1346       1345       1344       1334
[    7.474368]        192       1703       1449       1442       1702
[    7.954307]        256       1686       1670       1685       1683
[    8.449810]        384       1891       1890       1889       1891
[    8.944714]        512       2642       2513       2513       2661
[    9.447221]        768       4865       3534       3654       4666
[    9.954514]       1024       5925       4213       4167       4181
[   10.473064]       1536       5928       5435       5037       5723
[   11.011117]       2048       5904       6841       5958       6924
[   11.559081]       3072       8120       7868       8475       8082
[   12.162886]       4096       7679       8956       8950       8863
[   12.884126]       6144       5438       5441       5302       5195
[   13.542026]       8192       5612       5687       5697       5761
[   14.523619]      12288       5674       5720       5734       5728
[   15.295379]      16384       5680       5696       5699       5686
[   16.078191]      24576       5509       5446       5425       5448
[   16.869596]      32768       5349       5293       5296       5306
[   17.663901]      49152       5151       5267       5274       5289
[   18.460441]      65536       5102       5269       5274       5271
[   19.257257]      98304       5058       5296       5298       5299
[   20.052725]     131072       5061       5333       5330       5329
[   20.847417]     196608       5060       5336       5337       5332
[   21.642442]     262144       5071       5349       5345       5346
[   22.436005]     393216       5078       5359       5357       5357
[   23.235470]     524288       5039       5341       5348       5343
[   24.233473]     786432       3149       3399       3400       3398
[   25.278839]    1048576       3015       3150       3148       3152
[   25.278853] benchmark_copy_user(hot):
                     size   unrolled     string fast-string    generic
[   25.816821]          1        270         52         53        234
[   26.354850]          2        364        106        109        273
[   26.892838]          4        460        213        218        397
[   27.430631]          6        486        305        312        468
[   27.969025]          8       1250        253        437        972
[   28.507017]         12       1009        332        625        972
[   29.045381]         16       2059        514        897       1667
[   29.584242]         24       2624        672       1071       2187
[   30.122465]         32       3043       1014       1750       2592
[   30.660963]         48       3620       1499       2561       3181
[   31.199311]         64       7777       1971       3333       6364
[   31.738052]         96       7499       2876       4772       6999
[   32.275679]        128       9999       3733       6088       9654
[   32.813284]        192      11052       5316       8400      10767
[   33.352050]        256      11665       6746      10371      11427
[   33.890995]        384      12351       9230      13549      12173
[   34.429906]        512      12583      11312      16000      12042
[   34.969950]        768      13022      14608      19535      19090
[   35.509582]       1024      13254      17097      21961      21538
[   36.052610]       1536      13492      20611      25073      24705
[   36.595365]       2048      12725      22967      26987      26664
[   37.142441]       3072      13123      25944      29215      28961
[   37.683810]       4096      13331      27738      30472      30267
[   38.240970]       6144      13491      28026      28953      29193
[   38.797596]       8192      13315      25170      26071      25883
[   39.372520]      12288      13130      26645      26731      28266
[   39.963133]      16384      10452      17193      17407      17387
[   40.578435]      24576       8455      14114      14243      14232
[   41.216819]      32768       8083      13668      13756      13749
[   41.881091]      49152       7773      13203      13261      13254
[   42.533352]      65536       7661      12894      12938      12936
[   43.220041]      98304       7640      12643      12675      12672
[   43.843197]     131072       7639      12580      12604      12601
[   44.468129]     196608       7639      12555      12570      12570
[   45.093873]     262144       7641      12552      12563      12561
[   45.718450]     393216       7642      12551      12560      12559
[   46.373853]     524288       7296      11999      12084      12102
[   47.020981]     786432       3091       3224       3225       3222
[   47.686092]    1048576       3037       3159       3158       3158

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 arch/x86/lib/Makefile         |   1 +
 arch/x86/lib/bench_usercopy.c | 176 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 177 insertions(+)
 create mode 100644 arch/x86/lib/bench_usercopy.c

diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 34a74131a12c..5019e8b46ea3 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -27,6 +27,7 @@ lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
+obj-$(CONFIG_X86_64) += bench_usercopy.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/arch/x86/lib/bench_usercopy.c b/arch/x86/lib/bench_usercopy.c
new file mode 100644
index 000000000000..b9a500b8cead
--- /dev/null
+++ b/arch/x86/lib/bench_usercopy.c
@@ -0,0 +1,176 @@
+#include <linux/random.h>
+#include <linux/sched/clock.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/sort.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+
+/*
+ * Common signature of the copy routines under test: (to, from, size),
+ * returning an unsigned long. The benchmark ignores the return value.
+ */
+typedef unsigned long (*copy_fn)(void *, const void *, unsigned);
+
+/* Benchmark flag: clflush the destination range before every copy. */
+#define CU_COLD 0x1
+
+/*
+ * Time repeated calls of @func copying @size bytes from @from to @to and
+ * return the throughput, computed as (bytes * 10^9 / elapsed) >> 20, i.e.
+ * MiB/s assuming local_clock() counts nanoseconds.
+ *
+ * With CU_COLD set in @flags the destination range is clflushed before
+ * every copy. The cost of one pass of that flush loop (without the copy)
+ * is sampled up front and subtracted from the measured time.
+ *
+ * Returns 0 for a zero @size, as there is nothing to measure.
+ */
+static unsigned long
+__benchmark_copy_user(void *to, const void *from, unsigned size,
+		      copy_fn func, unsigned int flags)
+{
+	unsigned long count = 0, loop;
+	u64 start, now, overhead, elapsed;
+	unsigned int maxloop;
+	u64 speed;
+
+	/* Sizing the inner loop below would otherwise divide by zero. */
+	if (!size)
+		return 0;
+
+	/* One inner pass copies at most 64MiB and runs at most 4097 times. */
+	maxloop = min((1u << 26) / size, 4096u);
+
+	/* The whole measurement is bracketed by get_cpu()/put_cpu(). */
+	get_cpu();
+
+	/* Sample the cost of one inner pass without the copy itself. */
+	start = local_clock();
+	for (loop = 0; loop <= maxloop; loop++) {
+		if (flags & CU_COLD)
+			clflush_cache_range(to, size);
+		barrier();
+	}
+	now = local_clock();
+	overhead = now - start;
+
+	/* Repeat whole inner passes until enough time has accumulated. */
+	start = now;
+	do {
+		for (loop = 0; loop <= maxloop; loop++) {
+			if (flags & CU_COLD)
+				clflush_cache_range(to, size);
+			func(to, from, size);
+		}
+
+		count++;
+		now = local_clock();
+	} while (!((now - start) >> 24)); /* ~16ms */
+	put_cpu();
+
+	/*
+	 * The loop above only exits once elapsed >= 1 << 24, so elapsed is
+	 * never zero. Only subtract the overhead estimate when it leaves a
+	 * positive remainder; otherwise the subtraction would wrap (or hit
+	 * zero) and the division below would be meaningless.
+	 */
+	elapsed = now - start;
+	if (count * overhead < elapsed)
+		elapsed -= count * overhead;
+
+	/* Total bytes copied, scaled so that the quotient is bytes/second. */
+	speed = count;
+	speed *= maxloop + 1;
+	speed *= size;
+	speed *= 1000 * 1000 * 1000;
+	return div64_u64(speed, elapsed) >> 20;
+}
+
+/*
+ * Table of copy routines to benchmark, ended by an all-zero sentinel.
+ * A non-zero cpu_feature is the feature bit that boot_cpu_has() must
+ * report before the routine is used; zero means no feature check.
+ */
+static const struct copy_user {
+	copy_fn fn;
+	const char *name;
+	unsigned int cpu_feature;
+} copy_user_methods[] = {
+	{
+		.fn = copy_user_generic_unrolled,
+		.name = "unrolled",
+		.cpu_feature = 0,
+	},
+	{
+		.fn = copy_user_generic_string,
+		.name = "string",
+		.cpu_feature = X86_FEATURE_REP_GOOD,
+	},
+	{
+		.fn = copy_user_enhanced_fast_string,
+		.name = "fast-string",
+		.cpu_feature = X86_FEATURE_ERMS,
+	},
+	{
+		.fn = (copy_fn)raw_copy_to_user,
+		.name = "generic",
+		.cpu_feature = 0,
+	},
+	{ /* sentinel: a NULL fn ends iteration */ }
+};
+
+/*
+ * Print the banner "<name>(<mode>):" followed by a row of column headings:
+ * "size" and then the name of every copy routine whose required CPU
+ * feature (if any) is present.
+ */
+static void __header_methods(const char *name, const char *mode)
+{
+	const struct copy_user *m;
+	char buf[180];
+	int len;
+
+	/*
+	 * scnprintf() returns the number of characters actually stored, so
+	 * len can never run past the buffer and sizeof(buf) - len cannot
+	 * wrap around. snprintf() instead returns the would-be length on
+	 * truncation, which would break the accumulation below.
+	 */
+	len = scnprintf(buf, sizeof(buf), "%s(%s):\n%10s", name, mode, "size");
+	for (m = copy_user_methods; m->fn; m++) {
+		/* Skip routines that need a CPU feature we do not have. */
+		if (m->cpu_feature && !boot_cpu_has(m->cpu_feature))
+			continue;
+
+		len += scnprintf(buf + len, sizeof(buf) - len,
+				 " %10s", m->name);
+	}
+	pr_info("%s\n", buf);
+}
+
+/*
+ * Comparator passed to sort(): orders two unsigned long values ascending.
+ * Returns 1 if *a > *b, -1 if *a < *b and 0 if they are equal.
+ */
+static int ulong_cmp(const void *a, const void *b)
+{
+	const unsigned long lhs = *(const unsigned long *)a;
+	const unsigned long rhs = *(const unsigned long *)b;
+
+	/* Difference of two 0/1 comparisons: no subtraction of the values. */
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Swap callback passed to sort(): exchanges the two unsigned long values
+ * that @a and @b point at. @size is not used.
+ */
+static void ulong_swap(void *a, void *b, int size)
+{
+	unsigned long *lhs = a;
+	unsigned long *rhs = b;
+	const unsigned long saved = *rhs;
+
+	*rhs = *lhs;
+	*lhs = saved;
+}
+
+/*
+ * Benchmark every usable copy routine for one @size and print a single
+ * row: the size followed by the median throughput of each routine.
+ * @flags is handed through to __benchmark_copy_user() (e.g. CU_COLD).
+ */
+static void __benchmark_methods(void *to, const void *from, int size,
+				unsigned int flags)
+{
+	const struct copy_user *m;
+	char buf[180];
+	int len;
+
+	/*
+	 * scnprintf() returns what was actually stored, keeping len within
+	 * the buffer so that sizeof(buf) - len below can never wrap around.
+	 */
+	len = scnprintf(buf, sizeof(buf), "%10d", size);
+
+	for (m = copy_user_methods; m->fn; m++) {
+		unsigned long speed[7];
+		int i;
+
+		/* Skip routines that need a CPU feature we do not have. */
+		if (m->cpu_feature && !boot_cpu_has(m->cpu_feature))
+			continue;
+
+		/*
+		 * Hot runs get one extra pass whose result is discarded,
+		 * presumably to warm the caches before sampling.
+		 */
+		if (!(flags & CU_COLD))
+			__benchmark_copy_user(to, from, size, m->fn, flags);
+		for (i = 0; i < ARRAY_SIZE(speed); i++) {
+			cond_resched();
+			speed[i] = __benchmark_copy_user(to, from, size,
+							 m->fn, flags);
+		}
+
+		/* Report the median of the sorted samples. */
+		sort(speed, ARRAY_SIZE(speed), sizeof(speed[0]),
+		     ulong_cmp, ulong_swap);
+		len += scnprintf(buf + len, sizeof(buf) - len,
+				 " %10lu", speed[ARRAY_SIZE(speed) / 2]);
+	}
+
+	pr_info("%s\n", buf);
+}
+
+/*
+ * Initcall entry point: allocate a source and a destination buffer of
+ * 1MiB each, fill the source with pseudo-random data and print one
+ * table of results per mode ("cold", then "hot").
+ *
+ * Sizes run over the powers of two from 1 to 1MiB, with the intermediate
+ * size 3 << (order - 2) (6, 12, 24, ...) inserted before each power of
+ * two from 8 upwards.
+ *
+ * Returns 0 on success or -ENOMEM if the buffer cannot be allocated.
+ */
+static int benchmark_copy_user(void)
+{
+	const unsigned int maxsz = 1 << 20;
+	const struct {
+		const char *name;
+		unsigned int flags;
+	} modes[] = {
+		{ "cold", CU_COLD },
+		{ "hot", 0 },
+		{ },
+	}, *m;
+	unsigned int order, i;
+	void *from, *to;
+	u32 prng;
+
+	/* One allocation, split in half: source first, destination second. */
+	from = kmalloc(2*maxsz, GFP_KERNEL);
+	if (!from)
+		return -ENOMEM;
+	to = from + maxsz;
+
+	/*
+	 * Fill exactly maxsz bytes of the source. The bound is exclusive so
+	 * that no word is written past the source half of the allocation.
+	 */
+	prng = 0;
+	for (i = 0; i < maxsz / sizeof(u32); i++) {
+		prng = next_pseudo_random32(prng);
+		((u32 *)from)[i] = prng;
+	}
+
+	for (m = modes; m->name; m++) {
+		__header_methods(__func__, m->name);
+		for (order = 0; order <= ilog2(maxsz); order++) {
+			/* Midpoint between the previous power of two and this one. */
+			if (order > 2) {
+				__benchmark_methods(to, from, 3 << (order - 2),
+						    m->flags);
+			}
+
+			__benchmark_methods(to, from, 1 << order, m->flags);
+		}
+	}
+
+	kfree(from);
+	return 0;
+}
+late_initcall(benchmark_copy_user);
-- 
2.11.0



More information about the Intel-gfx-trybot mailing list