[Intel-gfx] [PATCH 2/3] lib: Add GPGPU fill

Wed Dec 3 03:11:18 PST 2014

This is to add fill operation using GPGPU pipeline which is similar to
current media fill. This can be used to simply verify GPGPU pipeline
and help to enable it on newer HW, currently it works on Gen7 only and
will add support on later platform.

Now this sets very simply thread group dispatch for one thread per
thread group on SIMD16 dispatch. So the fill shader just uses thread
group ID for buffer offset.

v2: No new fill func typedef but adapt to igt_fillfunc_t.

Signed-off-by: Zhenyu Wang <zhenyuw at linux.intel.com>
---
 lib/gen7_media.h             |   2 +
 lib/intel_batchbuffer.c      |  19 +++++
 lib/intel_batchbuffer.h      |   7 +-
 lib/media_fill.h             |   7 ++
 lib/media_fill_gen7.c        | 161 +++++++++++++++++++++++++++++++++++++++++--
 shaders/gpgpu/README         |   4 ++
 shaders/gpgpu/gpgpu_fill.gxa |  51 ++++++++++++++
 7 files changed, 244 insertions(+), 7 deletions(-)
 create mode 100644 shaders/gpgpu/README
 create mode 100644 shaders/gpgpu/gpgpu_fill.gxa

diff --git a/lib/gen7_media.h b/lib/gen7_media.h
index d5f9921..91294d2 100644
--- a/lib/gen7_media.h
+++ b/lib/gen7_media.h
@@ -179,6 +179,7 @@
 #define GEN7_PIPELINE_SELECT			GFXPIPE(1, 1, 4)
 # define PIPELINE_SELECT_3D			(0 << 0)
 # define PIPELINE_SELECT_MEDIA			(1 << 0)
+# define PIPELINE_SELECT_GPGPU			(2 << 0)
 
 #define GEN7_STATE_BASE_ADDRESS			GFXPIPE(0, 1, 1)
 # define BASE_ADDRESS_MODIFY			(1 << 0)
@@ -187,6 +188,7 @@
 #define GEN7_MEDIA_CURBE_LOAD			GFXPIPE(2, 0, 1)
 #define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD	GFXPIPE(2, 0, 2)
 #define GEN7_MEDIA_OBJECT			GFXPIPE(2, 1, 0)
+#define GEN7_GPGPU_WALKER                       GFXPIPE(2, 1, 5)
 
 struct gen7_interface_descriptor_data
 {
diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
index 4b3a5b8..c70f6d8 100644
--- a/lib/intel_batchbuffer.c
+++ b/lib/intel_batchbuffer.c
@@ -511,3 +511,22 @@ igt_fillfunc_t igt_get_media_fillfunc(int devid)
 
 	return fill;
 }
+
+/**
+ * igt_get_gpgpu_fillfunc:
+ * @devid: pci device id
+ *
+ * Returns:
+ *
+ * The platform-specific gpgpu fill function pointer for the device specified
+ * with @devid. Will return NULL when no gpgpu fill function is implemented.
+ */
+igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid)
+{
+	igt_fillfunc_t fill = NULL;
+
+	if (IS_GEN7(devid))
+		fill = gen7_gpgpu_fillfunc;
+
+	return fill;
+}
diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
index f0e21ea..12f7be1 100644
--- a/lib/intel_batchbuffer.h
+++ b/lib/intel_batchbuffer.h
@@ -250,11 +250,11 @@ igt_render_copyfunc_t igt_get_render_copyfunc(int devid);
  * @color: fill color to use
  *
  * This is the type of the per-platform fill functions using media
- * pipeline. The platform-specific implementation can be obtained
- * by calling igt_get_media_fillfunc().
+ * or gpgpu pipeline. The platform-specific implementation can be obtained
+ * by calling igt_get_media_fillfunc() or igt_get_gpgpu_fillfunc().
  *
  * A fill function will emit a batchbuffer to the kernel which executes
- * the specified blit fill operation using the media engine.
+ * the specified blit fill operation using the media/gpgpu engine.
  */
 typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch,
 			       struct igt_buf *dst,
@@ -263,5 +263,6 @@ typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch,
 			       uint8_t color);
 
 igt_fillfunc_t igt_get_media_fillfunc(int devid);
+igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid);
 
 #endif
diff --git a/lib/media_fill.h b/lib/media_fill.h
index 226489c..2a30055 100644
--- a/lib/media_fill.h
+++ b/lib/media_fill.h
@@ -32,4 +32,11 @@ gen9_media_fillfunc(struct intel_batchbuffer *batch,
                 unsigned width, unsigned height,
                 uint8_t color);
 
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+		    struct igt_buf *dst,
+		    unsigned x, unsigned y,
+		    unsigned width, unsigned height,
+		    uint8_t color);
+
 #endif /* RENDE_MEDIA_FILL_H */
diff --git a/lib/media_fill_gen7.c b/lib/media_fill_gen7.c
index 5a23b7d..7113fda 100644
--- a/lib/media_fill_gen7.c
+++ b/lib/media_fill_gen7.c
@@ -8,7 +8,6 @@
 
 #include <assert.h>
 
-
 static const uint32_t media_kernel[][4] = {
 	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
 	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
@@ -23,6 +22,23 @@ static const uint32_t media_kernel[][4] = {
 	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
 };
 
+/* shaders/gpgpu/gpgpu_fill.gxa */
+static const uint32_t gpgpu_kernel[][4] = {
+	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
+	{ 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
+	{ 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
+	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
+	{ 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
+	{ 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
+	{ 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x20e00021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x21200021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x21600021, 0x00000020, 0x00000000 },
+	{ 0x05800031, 0x24001ca8, 0x00000080, 0x120a8000 },
+	{ 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
+	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
+};
+
 static uint32_t
 batch_used(struct intel_batchbuffer *batch)
 {
@@ -160,14 +176,15 @@ gen7_fill_media_kernel(struct intel_batchbuffer *batch,
 }
 
 static uint32_t
-gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
+gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
+			       const uint32_t kernel[][4], size_t size)
 {
 	struct gen7_interface_descriptor_data *idd;
 	uint32_t offset;
 	uint32_t binding_table_offset, kernel_offset;
 
 	binding_table_offset = gen7_fill_binding_table(batch, dst);
-	kernel_offset = gen7_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
+	kernel_offset = gen7_fill_media_kernel(batch, kernel, size);
 
 	idd = batch_alloc(batch, sizeof(*idd), 64);
 	offset = batch_offset(batch, idd);
@@ -329,7 +346,9 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
 	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
 
 	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
-	interface_descriptor = gen7_fill_interface_descriptor(batch, dst);
+	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+							      media_kernel,
+							      sizeof(media_kernel));
 	igt_assert(batch->ptr < &batch->buffer[4095]);
 
 	/* media pipeline */
@@ -353,3 +372,137 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
 	gen7_render_flush(batch, batch_end);
 	intel_batchbuffer_reset(batch);
 }
+
+static void
+gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
+{
+	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
+
+	/* scratch buffer */
+	OUT_BATCH(0);
+
+	/* number of threads & urb entries */
+	OUT_BATCH(1 << 16 | /* max num of threads */
+		  0 << 8 | /* num of URB entry */
+		  1 << 2); /* GPGPU mode */
+
+	OUT_BATCH(0);
+
+	/* urb entry size & curbe size */
+	OUT_BATCH(0 << 16 | 	/* URB entry size in 256 bits unit */
+		  1);		/* CURBE entry size in 256 bits unit */
+
+	/* scoreboard */
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+}
+
+static void
+gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
+		     unsigned x, unsigned y,
+		     unsigned width, unsigned height)
+{
+	uint32_t x_dim, y_dim, tmp, right_mask;
+
+	/*
+	 * Simply do SIMD16 based dispatch, so every thread uses
+	 * SIMD16 channels.
+	 *
+	 * Define our own thread group size, e.g 16x1 for every group, then
+	 * will have 1 thread each group in SIMD16 dispatch. So thread
+	 * width/height/depth are all 1.
+	 *
+	 * Then thread group X = width / 16 (aligned to 16)
+	 * thread group Y = height;
+	 */
+	x_dim = (width + 15) / 16;
+	y_dim = height;
+
+	tmp = width & 15;
+	if (tmp == 0)
+		right_mask = (1 << 16) - 1;
+	else
+		right_mask = (1 << tmp) - 1;
+
+	OUT_BATCH(GEN7_GPGPU_WALKER | 9);
+
+	/* interface descriptor offset */
+	OUT_BATCH(0);
+
+	/* SIMD size, thread w/h/d */
+	OUT_BATCH(1 << 30 | /* SIMD16 */
+		  0 << 16 | /* depth:1 */
+		  0 << 8 | /* height:1 */
+		  0); /* width:1 */
+
+	/* thread group X */
+	OUT_BATCH(0);
+	OUT_BATCH(x_dim);
+
+	/* thread group Y */
+	OUT_BATCH(0);
+	OUT_BATCH(y_dim);
+
+	/* thread group Z */
+	OUT_BATCH(0);
+	OUT_BATCH(1);
+
+	/* right mask */
+	OUT_BATCH(right_mask);
+
+	/* bottom mask, height 1, always 0xffffffff */
+	OUT_BATCH(0xffffffff);
+}
+
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+		    struct igt_buf *dst,
+		    unsigned x, unsigned y,
+		    unsigned width, unsigned height,
+		    uint8_t color)
+{
+	uint32_t curbe_buffer, interface_descriptor;
+	uint32_t batch_end;
+
+	intel_batchbuffer_flush(batch);
+
+	/* setup states */
+	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+
+	/*
+	 * const buffer needs to fill for every thread, but as we have just 1 thread
+	 * per every group, so need only one curbe data.
+	 *
+	 * For each thread, just use thread group ID for buffer offset.
+	 */
+	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
+
+	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+							      gpgpu_kernel,
+							      sizeof(gpgpu_kernel));
+	igt_assert(batch->ptr < &batch->buffer[4095]);
+
+	batch->ptr = batch->buffer;
+
+	/* GPGPU pipeline */
+	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
+
+	gen7_emit_state_base_address(batch);
+
+	gen7_emit_vfe_state_gpgpu(batch);
+
+	gen7_emit_curbe_load(batch, curbe_buffer);
+
+	gen7_emit_interface_descriptor_load(batch, interface_descriptor);
+
+	gen7_emit_gpgpu_walk(batch, x, y, width, height);
+
+	OUT_BATCH(MI_BATCH_BUFFER_END);
+
+	batch_end = batch_align(batch, 8);
+	igt_assert(batch_end < BATCH_STATE_SPLIT);
+
+	gen7_render_flush(batch, batch_end);
+	intel_batchbuffer_reset(batch);
+}
diff --git a/shaders/gpgpu/README b/shaders/gpgpu/README
new file mode 100644
index 0000000..3bf328a
--- /dev/null
+++ b/shaders/gpgpu/README
@@ -0,0 +1,4 @@
+
+Commands used to generate the shader on gen7
+$> m4 gpgpu_fill.gxa > gpgpu_fill.gxm
+$> intel-gen4asm -g 7 -o <output> gpgpu_fill.gxm
diff --git a/shaders/gpgpu/gpgpu_fill.gxa b/shaders/gpgpu/gpgpu_fill.gxa
new file mode 100644
index 0000000..fc309f3
--- /dev/null
+++ b/shaders/gpgpu/gpgpu_fill.gxa
@@ -0,0 +1,51 @@
+/*
+ * Registers
+ * g0 -- header
+ * g1 -- constant
+ * g2 -- calculate X/Y offset
+ * g4-g12 payload for write message
+ */
+define(`ORIG',          `g2.0<2,2,1>UD')
+define(`ORIG_X',        `g2.0<1>UD')
+define(`ORIG_Y',        `g2.4<1>UD')
+define(`COLOR',         `g1.0')
+define(`COLORUB',       `COLOR<0,1,0>UB')
+define(`COLORUD',       `COLOR<0,1,0>UD')
+define(`X',             `g0.4<0,1,0>UD')
+define(`Y',             `g0.24<0,1,0>UD')
+
+mov(4)  COLOR<1>UB      COLORUB         {align1};
+
+/* WRITE */
+/* count thread group ID for X/Y offset */
+mul(1)  ORIG_X          X        0x10UD {align1};
+mov(1)  ORIG_Y          Y               {align1};
+mov(8)  g4.0<1>UD       g0.0<8,8,1>UD   {align1};
+mov(2)  g4.0<1>UD       ORIG            {align1};
+/* Normal mode: for block height 1 row and block width 16 bytes */
+mov(1)  g4.8<1>UD       0x0000000fUD    {align1};
+
+mov(16) g5.0<1>UD       COLORUD         {align1 compr};
+mov(16) g7.0<1>UD       COLORUD         {align1 compr};
+mov(16) g9.0<1>UD       COLORUD         {align1 compr};
+mov(16) g11.0<1>UD      COLORUD         {align1 compr};
+
+/*
+ * comment out the following instruction on Gen7
+ * write(0, 0, 10, 12)
+ *   10: media_block_write
+ *   12: data cache data port 1
+ */
+send(16) 4 acc0<1>UW null write(0, 0, 10, 12) mlen 9 rlen 0 {align1};
+
+/*
+ * uncomment the following instruction on Gen7
+ * write(0, 0, 10, 0)
+ *   10: media_block_write
+ *    0: reander cache data port
+ */
+/* send(16) 4 acc0<1>UW null write(0, 0, 10, 0) mlen 9 rlen 0 {align1}; */
+
+/* EOT */
+mov(8)  g112.0<1>UD       g0.0<8,8,1>UD   {align1};
+send(16) 112 null<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
-- 
2.1.3