[Piglit] [PATCH 5/8] arb_shader_image_load_store: add additional coherency test

Thu Apr 7 01:10:49 UTC 2016

From: Nicolai Hähnle <nicolai.haehnle at amd.com>

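The existing coherency test isn't a good match for the AMD GCN execution
model: it cannot demonstrate failures caused by the separate, non-coherent
L1 caches used by different CUs.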
---
 .../execution/coherency-extra.shader_test          | 90 ++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test

diff --git a/tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test b/tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test
new file mode 100644
index 0000000..f718cd2
--- /dev/null
+++ b/tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test
@@ -0,0 +1,90 @@
+# Additional coherency test that can demonstrate failures in an incorrect
+# coherency implementation for AMD GCN, unlike arb_shader_image_load_store-coherency.
+#
+# The real problem with coherency in AMD GCN is separate, non-coherent L1
+# caches, i.e. when a shader execution writes to an image in a CU that uses
+# one L1 cache, and a different shader execution reads from the image
+# in a CU with a different L1 cache.
+#
+# This test uses atomic accesses to a control texture to select the very first
+# fragment shader thread as a writer thread which keeps changing a data
+# texture in a tight loop. All other threads become reader threads which
+# report success if they see two different values of the same texture.
+#
+# This test can produce a false negative (false failure) in two cases:
+#  1) The timeout value ITERS is too low,
+#  2) There is no (or insufficient) parallelism in the implementation, and
+#     therefore the writer thread must finish before most of the reader threads
+#     get a chance to run.
+#
+
+[require]
+GL >= 3.3
+GLSL >= 3.30
+GL_ARB_shader_image_load_store
+SIZE 256 256
+
+[vertex shader passthrough]
+
+[fragment shader]
+#version 330
+#extension GL_ARB_shader_image_load_store: enable
+
+// Change this to 0 to get a control test that should fail on hardware
+// without coherent L1 caches.
+//
+// Need volatile instead of just coherent to prevent overly smart compilers
+// from moving the imageLoad/imageStore out of the loop.
+#if 1
+volatile
+#endif
+layout(r32i) uniform iimage2D tex; // data image: texel (0, 0) is stored by the writer and polled by the readers
+volatile layout(r32i) uniform iimage2D ctrl; // control image: (0, 0) hands out thread ids, (0, 1) counts check-ins
+out vec4 outcolor; // green when the thread finished, red when it timed out
+
+// Add a timeout so that an incorrect coherency implementation doesn't hang
+// the GPU. If this timeout is too low, you can get false negative results
+// because the writer thread quits before all reader threads have
+// executed.
+#define ITERS 100000
+
+void main() // One invocation becomes the writer; every other one is a reader.
+{
+	int id = imageAtomicAdd(ctrl, ivec2(0, 0), 1); // value before the add; 0 selects the writer
+	int orig = imageLoad(tex, ivec2(0, 0)).x; // baseline the readers compare against
+	bool done = false;
+
+	outcolor = vec4(0.0, 0.0, 0.0, 1.0); // opaque black; one channel is set at the end
+
+	for (int iter = 0; iter < ITERS && !done; ++iter) { // ITERS bounds the spin so a failure cannot hang
+		if (id == 0) {
+			imageStore(tex, ivec2(0, 0), ivec4(iter)); // writer: publish a new value each pass
+			if (imageLoad(ctrl, ivec2(0, 1)).x >= 256 * 256) // 256 * 256 matches SIZE 256 256; presumably one check-in per fragment
+				done = true;
+		} else {
+			int current = imageLoad(tex, ivec2(0, 0)).x; // reader: re-read the data texel
+			if (current != orig) // saw a second value, so the writer's store became visible
+				done = true;
+		}
+
+		if (done || (id == 0 && iter == 0)) // check in when finished; the writer also checks in on its first pass
+			imageAtomicAdd(ctrl, ivec2(0, 1), 1);
+	}
+
+	if (done)
+		outcolor.y = 1.0; // green: finished before the timeout
+	else
+		outcolor.x = 1.0; // red: ran out of iterations
+}
+
+[test]
+texture integer 0 (1, 2) (0, 0) GL_R32I
+image texture 0 GL_R32I
+texture integer 1 (1, 1) (0, 0) GL_R32I
+image texture 1 GL_R32I
+
+uniform int ctrl 0
+uniform int tex 1
+draw rect -1 -1 2 2
+
+probe all rgba 0.0 1.0 0.0 1.0
-- 
2.5.0