[Intel-gfx] [PATCH i-g-t 2/3] gem_render_copy: Add functions for performance testing

Mon Oct 5 04:42:24 PDT 2015

gen8_render_writefunc does only constant writes to the
framebuffer, no texture reads. Used for testing write bandwidth.

gen8_render_readfunc does only reads from the sampler and
discards the result. Used for testing sampler read bandwidth.

Alpha blend tests and support for more Gens still to come.

Signed-off-by: Antti Koskipaa <antti.koskipaa at linux.intel.com>
---
 lib/gen8_render.h       |  3 ++
 lib/intel_batchbuffer.c | 40 +++++++++++++++++++++++++++
 lib/intel_batchbuffer.h |  2 ++
 lib/rendercopy.h        | 12 ++++++++
 lib/rendercopy_gen8.c   | 53 +++++++++++++++++++++++++++++++----
 shaders/ps/discard.g7a  | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 shaders/ps/fill.g7a     |  6 ++++
 7 files changed, 184 insertions(+), 5 deletions(-)
 create mode 100644 shaders/ps/discard.g7a
 create mode 100644 shaders/ps/fill.g7a

diff --git a/lib/gen8_render.h b/lib/gen8_render.h
index ba3f9f2..610a457 100644
--- a/lib/gen8_render.h
+++ b/lib/gen8_render.h
@@ -60,6 +60,9 @@
 #define GEN8_3DSTATE_WM_DEPTH_STENCIL		GEN6_3D(3, 0, 0x4e)
 #define GEN8_3DSTATE_PS_EXTRA			GEN6_3D(3,0, 0x4f)
 # define GEN8_PSX_PIXEL_SHADER_VALID			(1 << 31)
+# define GEN8_PSX_DONT_WRITE_RT				(1 << 30)
+# define GEN8_PSX_OMASK_PRESENT				(1 << 29)
+# define GEN8_PSX_KILLS_PIXEL				(1 << 28)
 # define GEN8_PSX_ATTRIBUTE_ENABLE			(1 << 8)
 
 #define GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC	GEN6_3D(3, 0, 0x23)
diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
index 692521f..e3cf622 100644
--- a/lib/intel_batchbuffer.c
+++ b/lib/intel_batchbuffer.c
@@ -748,6 +748,46 @@ igt_render_copyfunc_t igt_get_render_copyfunc(int devid)
 }
 
 /**
+ * igt_get_render_writefunc:
+ * @devid: pci device id
+ *
+ * Returns:
+ *
+ * The platform-specific render write function pointer for the device
+ * specified with @devid. Will return NULL when no render write function is
+ * implemented.
+ */
+igt_render_copyfunc_t igt_get_render_writefunc(int devid)
+{
+	igt_render_copyfunc_t copy = NULL;
+
+	if (IS_GEN8(devid))
+		copy = gen8_render_writefunc;
+
+	return copy;
+}
+
+/**
+ * igt_get_render_readfunc:
+ * @devid: pci device id
+ *
+ * Returns:
+ *
+ * The platform-specific render read function pointer for the device
+ * specified with @devid. Will return NULL when no render read function is
+ * implemented.
+ */
+igt_render_copyfunc_t igt_get_render_readfunc(int devid)
+{
+	igt_render_copyfunc_t copy = NULL;
+
+	if (IS_GEN8(devid))
+		copy = gen8_render_readfunc;
+
+	return copy;
+}
+
+/**
  * igt_get_media_fillfunc:
  * @devid: pci device id
  *
diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
index 869747d..7d8f990 100644
--- a/lib/intel_batchbuffer.h
+++ b/lib/intel_batchbuffer.h
@@ -274,6 +274,8 @@ typedef void (*igt_render_copyfunc_t)(struct intel_batchbuffer *batch,
 				      struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
 
 igt_render_copyfunc_t igt_get_render_copyfunc(int devid);
+igt_render_copyfunc_t igt_get_render_writefunc(int devid);
+igt_render_copyfunc_t igt_get_render_readfunc(int devid);
 
 /**
  * igt_fillfunc_t:
diff --git a/lib/rendercopy.h b/lib/rendercopy.h
index fdc3cab..f4ec74b 100644
--- a/lib/rendercopy.h
+++ b/lib/rendercopy.h
@@ -53,3 +53,15 @@ void gen2_render_copyfunc(struct intel_batchbuffer *batch,
 			  struct igt_buf *src, unsigned src_x, unsigned src_y,
 			  unsigned width, unsigned height,
 			  struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
+
+void gen8_render_writefunc(struct intel_batchbuffer *batch,
+			   drm_intel_context *context,
+			   struct igt_buf *src, unsigned src_x, unsigned src_y,
+			   unsigned width, unsigned height,
+			   struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
+
+void gen8_render_readfunc(struct intel_batchbuffer *batch,
+			   drm_intel_context *context,
+			   struct igt_buf *src, unsigned src_x, unsigned src_y,
+			   unsigned width, unsigned height,
+			   struct igt_buf *dst, unsigned dst_x, unsigned dst_y);
diff --git a/lib/rendercopy_gen8.c b/lib/rendercopy_gen8.c
index 4a9a283..b243ed7 100644
--- a/lib/rendercopy_gen8.c
+++ b/lib/rendercopy_gen8.c
@@ -71,6 +71,28 @@ static const uint32_t ps_kernel_copy[][4] = {
 #endif
 };
 
+/* see shaders/ps/discard.g7a */
+static const uint32_t ps_kernel_read[][4] = {
+   { 0x0060005a, 0x21403ae8, 0x3a0000c0, 0x008d0040 },
+   { 0x0060005a, 0x21603ae8, 0x3a0000c0, 0x008d0080 },
+   { 0x0060005a, 0x21803ae8, 0x3a0000d0, 0x008d0040 },
+   { 0x0060005a, 0x21a03ae8, 0x3a0000d0, 0x008d0080 },
+   { 0x02800031, 0x2e4022e8, 0x0e000140, 0x08840001 },
+   { 0x00000001, 0x2e020e08, 0x08000000, 0x00000000 },
+   { 0x00000001, 0x2e000e08, 0x08000000, 0x00000000 },
+   { 0x00000001, 0x2e010e08, 0x08000000, 0x00000000 },
+   { 0x05800031, 0x200022e0, 0x0e000e00, 0x920b1000 },
+};
+
+/* see shaders/ps/fill.g7a */
+static const uint32_t ps_kernel_write[][4] = {
+   { 0x00800001, 0x2e003ee8, 0x38000000, 0x3f800000 },
+   { 0x00800001, 0x2e403ee8, 0x38000000, 0x3f7d70a4 },
+   { 0x00800001, 0x2e803ee8, 0x38000000, 0x3f000000 },
+   { 0x00800001, 0x2ec03ee8, 0x38000000, 0x3dcccccd },
+   { 0x05800031, 0x200022e0, 0x0e000e00, 0x90031000 },
+};
+
 /* AUB annotation support */
 #define MAX_ANNOTATIONS	33
 struct annotations_context {
@@ -779,7 +801,7 @@ gen8_emit_sf(struct intel_batchbuffer *batch)
 }
 
 static void
-gen8_emit_ps(struct intel_batchbuffer *batch, uint32_t kernel) {
+gen8_emit_ps(struct intel_batchbuffer *batch, uint32_t kernel, bool kills_pixel) {
 	const int max_threads = 63;
 
 	OUT_BATCH(GEN6_3DSTATE_WM | (2 - 2));
@@ -819,7 +841,8 @@ gen8_emit_ps(struct intel_batchbuffer *batch, uint32_t kernel) {
 	OUT_BATCH(GEN8_PS_BLEND_HAS_WRITEABLE_RT);
 
 	OUT_BATCH(GEN8_3DSTATE_PS_EXTRA | (2 - 2));
-	OUT_BATCH(GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE);
+	OUT_BATCH(GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE |
+		  (kills_pixel ? (GEN8_PSX_KILLS_PIXEL | GEN8_PSX_DONT_WRITE_RT) : 0));
 }
 
 static void
@@ -925,7 +948,7 @@ static void _gen8_render_func(struct intel_batchbuffer *batch,
 			  struct igt_buf *src, unsigned src_x, unsigned src_y,
 			  unsigned width, unsigned height,
 			  struct igt_buf *dst, unsigned dst_x, unsigned dst_y,
-			  const uint32_t ps_kernel[][4], int kernel_size)
+			  const uint32_t ps_kernel[][4], int kernel_size, bool discard)
 {
 	struct annotations_context aub_annotations;
 	uint32_t ps_sampler_state, ps_kernel_off, ps_binding_table;
@@ -1001,7 +1024,7 @@ static void _gen8_render_func(struct intel_batchbuffer *batch,
 	OUT_BATCH(GEN7_3DSTATE_SAMPLER_STATE_POINTERS_PS);
 	OUT_BATCH(ps_sampler_state);
 
-	gen8_emit_ps(batch, ps_kernel_off);
+	gen8_emit_ps(batch, ps_kernel_off, discard);
 
 	OUT_BATCH(GEN6_3DSTATE_SCISSOR_STATE_POINTERS);
 	OUT_BATCH(scissor_state);
@@ -1039,5 +1062,25 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch,
 				 struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
 {
 	_gen8_render_func(batch, context, src, src_x, src_y, width, height,
-			  dst, dst_x, dst_y, ps_kernel_copy, sizeof(ps_kernel_copy));
+			  dst, dst_x, dst_y, ps_kernel_copy, sizeof(ps_kernel_copy), false);
+}
+
+void gen8_render_writefunc(struct intel_batchbuffer *batch,
+			   drm_intel_context *context,
+			   struct igt_buf *src, unsigned src_x, unsigned src_y,
+			   unsigned width, unsigned height,
+			   struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
+{
+	_gen8_render_func(batch, context, src, src_x, src_y, width, height,
+			  dst, dst_x, dst_y, ps_kernel_write, sizeof(ps_kernel_write), false);
+}
+
+void gen8_render_readfunc(struct intel_batchbuffer *batch,
+			   drm_intel_context *context,
+			   struct igt_buf *src, unsigned src_x, unsigned src_y,
+			   unsigned width, unsigned height,
+			   struct igt_buf *dst, unsigned dst_x, unsigned dst_y)
+{
+	_gen8_render_func(batch, context, src, src_x, src_y, width, height,
+			  dst, dst_x, dst_y, ps_kernel_read, sizeof(ps_kernel_read), true);
 }
diff --git a/shaders/ps/discard.g7a b/shaders/ps/discard.g7a
new file mode 100644
index 0000000..eafbd60
--- /dev/null
+++ b/shaders/ps/discard.g7a
@@ -0,0 +1,73 @@
+/* Assemble with  ".../intel-gen4asm/src/intel-gen4asm -g 7" */
+
+
+/* Move pixels into g10-g13. The pixel shader does not load what you want. It
+ * loads the input data for a plane function to calculate what you want. The
+ * following is boiler plate code to move our normalized texture coordinates
+ * (u,v) into g10-g13. It does this 4 subspans (16 pixels) at a time.
+ *
+ * This should do the same thing, but it doesn't work for some reason.
+ *   pln(16) g10 g6<0,1,0>F g2<8,8,1>F	{ align1 };
+ *   pln(16) g12 g6.16<1>F g2<8,8,1>F	{ align1 };
+ */
+/* U */
+pln (8) g10<1>F g6.0<0,1,0>F g2.0<8,8,1>F { align1 }; /* pixel 0-7 */
+pln (8) g11<1>F g6.0<0,1,0>F g4.0<8,8,1>F { align1 }; /* pixel 8-15 */
+/* V */
+pln (8) g12<1>F g6.16<0,1,0> g2.0<8,8,1>F { align1 }; /* pixel 0-7 */
+pln (8) g13<1>F g6.16<0,1,0> g4.0<8,8,1>F { align1 }; /* pixel 8-15 */
+
+
+/* Next we want the sampler to fetch the src texture (i.e. src buffer). This is
+ * done with a pretty simple send message. The output goes to g114, above the
+ * message header at g112 that our final send message starts from.
+ * In intel-gen4asm, we should end up parsed by the following rule:
+ *   predicate SEND execsize dst sendleadreg sndopr directsrcoperand instoptions
+ *
+ * Send message descriptor:
+ * 28:25 = message len = 4 // our 4 registers have 16 pixels
+ * 24:20 = response len = 8 // Each pixel is RGBA32, so we need 8 registers
+ * 19:19 = header present = 0
+ * 18:17 = SIMD16 = 2
+ * 16:12 = TYPE = 0  (regular sample)
+ * 11:08 = Sampler index = ignored/0
+ * 7:0 = binding table index = src = 1
+ * 0x8840001
+ *
+ * Send message extra descriptor
+ * 5:5 = End of Thread = 0
+ * 3:0 = Target Function ID = SFID_SAMPLER (2)
+ * 0x2
+ */
+
+send(16) g114 g10 0x2 0x8840001 { align1 };
+
+/* Next discard the result. This is done by using a send message to the pixel
+ * data port with all the output masks set to 0. These are in the message header,
+ * in dword g112.2.
+ */
+mov(1) g112.2<1>UD 0x00000000 { align1 };
+
+/* Set pixel offsets in the header to 0 */
+mov(1) g112.0<1>UD 0 { align1 };
+mov(1) g112.1<1>UD 0 { align1 };
+
+/* Send message descriptor:
+ * 28:25 = message len = 12 // 16 pixels RGBA32 + header (NOTE(review): 0x120B1000 encodes 9 here - confirm)
+ * 24:20 = response len = 0
+ * 19:19 = header present = 1
+ * 17:14 = message type = Render Target Write (12)
+ * 12:12 = Last Render Target Select = 1
+ * 10:08 = Message Type = SIMD16 (0)
+ * 07:00 = Binding Table Index = dest = 0
+ * 0x120B1000
+ *
+ * Send message extra descriptor
+ * 5:5 = End of Thread = 1
+ * 3:0 = Target Function ID = SFID_DP_RC (5)
+ * 0x25
+ */
+
+send(16) null g112  0x25 0x120B1000 { align1, EOT };
+
+/* vim: set ft=c ts=4 sw=2 tw=80 et: */
diff --git a/shaders/ps/fill.g7a b/shaders/ps/fill.g7a
new file mode 100644
index 0000000..89f130f
--- /dev/null
+++ b/shaders/ps/fill.g7a
@@ -0,0 +1,6 @@
+mov (16) g112<1>F 1.0F { align1 };
+mov (16) g114<1>F 0.99F { align1 };
+mov (16) g116<1>F 0.5F { align1 };
+mov (16) g118<1>F 0.1F { align1 };
+send (16) null g112 0x25 0x10031000 { align1, EOT };
+/* <8,8,1>F render RT write SIMD16 LastRT Surface = 0 mlen 8 rlen 0 { align1 1H EOT }; */
-- 
2.3.6