[Intel-gfx] [PATCH 1/2] lib: Add GPGPU fill

Zhenyu Wang zhenyuw at linux.intel.com
Tue Dec 2 03:59:27 PST 2014


This adds a fill operation using the GPGPU pipeline, similar to the
current media fill. It can be used to simply verify the GPGPU pipeline
and to help enable it on newer HW. Currently it works on Gen7 only;
support for later platforms will be added.

For now this sets up a very simple thread group dispatch: one thread per
thread group, using SIMD16 dispatch. So the fill shader just uses the
thread group ID for the buffer offset.

Signed-off-by: Zhenyu Wang <zhenyuw at linux.intel.com>
---
 lib/gen7_media.h             |   2 +
 lib/intel_batchbuffer.c      |  19 +++++
 lib/intel_batchbuffer.h      |  25 +++++++
 lib/media_fill.h             |   7 ++
 lib/media_fill_gen7.c        | 161 +++++++++++++++++++++++++++++++++++++++++--
 shaders/gpgpu/README         |   4 ++
 shaders/gpgpu/gpgpu_fill.gxa |  51 ++++++++++++++
 7 files changed, 265 insertions(+), 4 deletions(-)
 create mode 100644 shaders/gpgpu/README
 create mode 100644 shaders/gpgpu/gpgpu_fill.gxa

diff --git a/lib/gen7_media.h b/lib/gen7_media.h
index d5f9921..91294d2 100644
--- a/lib/gen7_media.h
+++ b/lib/gen7_media.h
@@ -179,6 +179,7 @@
 #define GEN7_PIPELINE_SELECT			GFXPIPE(1, 1, 4)
 # define PIPELINE_SELECT_3D			(0 << 0)
 # define PIPELINE_SELECT_MEDIA			(1 << 0)
+# define PIPELINE_SELECT_GPGPU			(2 << 0)
 
 #define GEN7_STATE_BASE_ADDRESS			GFXPIPE(0, 1, 1)
 # define BASE_ADDRESS_MODIFY			(1 << 0)
@@ -187,6 +188,7 @@
 #define GEN7_MEDIA_CURBE_LOAD			GFXPIPE(2, 0, 1)
 #define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD	GFXPIPE(2, 0, 2)
 #define GEN7_MEDIA_OBJECT			GFXPIPE(2, 1, 0)
+#define GEN7_GPGPU_WALKER                       GFXPIPE(2, 1, 5)
 
 struct gen7_interface_descriptor_data
 {
diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
index 30ef2cf..18b0ef3 100644
--- a/lib/intel_batchbuffer.c
+++ b/lib/intel_batchbuffer.c
@@ -511,3 +511,22 @@ igt_media_fillfunc_t igt_get_media_fillfunc(int devid)
 
 	return fill;
 }
+
+/**
+ * igt_get_gpgpu_fillfunc:
+ * @devid: pci device id
+ *
+ * Returns:
+ *
+ * The platform-specific GPGPU fill function pointer for the device specified
+ * with @devid. Will return NULL when no GPGPU fill function is implemented.
+ */
+igt_gpgpu_fillfunc_t igt_get_gpgpu_fillfunc(int devid)
+{
+	igt_gpgpu_fillfunc_t fill = NULL;
+
+	if (IS_GEN7(devid))
+		fill = gen7_gpgpu_fillfunc;
+
+	return fill;
+}
diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
index 0ec6601..b5d697f 100644
--- a/lib/intel_batchbuffer.h
+++ b/lib/intel_batchbuffer.h
@@ -264,4 +264,29 @@ typedef void (*igt_media_fillfunc_t)(struct intel_batchbuffer *batch,
 
 igt_media_fillfunc_t igt_get_media_fillfunc(int devid);
 
+/**
+ * igt_gpgpu_fillfunc_t:
+ * @batch: batchbuffer object
+ * @dst: destination i-g-t buffer object
+ * @x: destination pixel x-coordinate
+ * @y: destination pixel y-coordinate
+ * @width: width of the filled rectangle
+ * @height: height of the filled rectangle
+ * @color: fill color to use
+ *
+ * This is the type of the per-platform GPGPU fill functions. The
+ * platform-specific implementation can be obtained by calling
+ * igt_get_gpgpu_fillfunc().
+ *
+ * A GPGPU fill function will emit a batchbuffer to the kernel which executes
+ * the specified fill operation using the GPGPU pipeline.
+ */
+typedef void (*igt_gpgpu_fillfunc_t)(struct intel_batchbuffer *batch,
+				     struct igt_buf *dst,
+				     unsigned x, unsigned y,
+				     unsigned width, unsigned height,
+				     uint8_t color);
+
+igt_gpgpu_fillfunc_t igt_get_gpgpu_fillfunc(int devid);
+
 #endif
diff --git a/lib/media_fill.h b/lib/media_fill.h
index 226489c..2a30055 100644
--- a/lib/media_fill.h
+++ b/lib/media_fill.h
@@ -32,4 +32,11 @@ gen9_media_fillfunc(struct intel_batchbuffer *batch,
                 unsigned width, unsigned height,
                 uint8_t color);
 
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+		    struct igt_buf *dst,
+		    unsigned x, unsigned y,
+		    unsigned width, unsigned height,
+		    uint8_t color);
+
 #endif /* RENDE_MEDIA_FILL_H */
diff --git a/lib/media_fill_gen7.c b/lib/media_fill_gen7.c
index 5a23b7d..7113fda 100644
--- a/lib/media_fill_gen7.c
+++ b/lib/media_fill_gen7.c
@@ -8,7 +8,6 @@
 
 #include <assert.h>
 
-
 static const uint32_t media_kernel[][4] = {
 	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
 	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
@@ -23,6 +22,23 @@ static const uint32_t media_kernel[][4] = {
 	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
 };
 
+/* shaders/gpgpu/gpgpu_fill.gxa */
+static const uint32_t gpgpu_kernel[][4] = {
+	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
+	{ 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
+	{ 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
+	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
+	{ 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
+	{ 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
+	{ 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x20e00021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x21200021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x21600021, 0x00000020, 0x00000000 },
+	{ 0x05800031, 0x24001ca8, 0x00000080, 0x120a8000 },
+	{ 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
+	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
+};
+
 static uint32_t
 batch_used(struct intel_batchbuffer *batch)
 {
@@ -160,14 +176,15 @@ gen7_fill_media_kernel(struct intel_batchbuffer *batch,
 }
 
 static uint32_t
-gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
+gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
+			       const uint32_t kernel[][4], size_t size)
 {
 	struct gen7_interface_descriptor_data *idd;
 	uint32_t offset;
 	uint32_t binding_table_offset, kernel_offset;
 
 	binding_table_offset = gen7_fill_binding_table(batch, dst);
-	kernel_offset = gen7_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
+	kernel_offset = gen7_fill_media_kernel(batch, kernel, size);
 
 	idd = batch_alloc(batch, sizeof(*idd), 64);
 	offset = batch_offset(batch, idd);
@@ -329,7 +346,9 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
 	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
 
 	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
-	interface_descriptor = gen7_fill_interface_descriptor(batch, dst);
+	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+							      media_kernel,
+							      sizeof(media_kernel));
 	igt_assert(batch->ptr < &batch->buffer[4095]);
 
 	/* media pipeline */
@@ -353,3 +372,137 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
 	gen7_render_flush(batch, batch_end);
 	intel_batchbuffer_reset(batch);
 }
+
+static void
+gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
+{
+	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
+
+	/* scratch buffer */
+	OUT_BATCH(0);
+
+	/* number of threads & urb entries */
+	OUT_BATCH(1 << 16 | /* max num of threads */
+		  0 << 8 | /* num of URB entry */
+		  1 << 2); /* GPGPU mode */
+
+	OUT_BATCH(0);
+
+	/* urb entry size & curbe size */
+	OUT_BATCH(0 << 16 | 	/* URB entry size in 256 bits unit */
+		  1);		/* CURBE entry size in 256 bits unit */
+
+	/* scoreboard */
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+}
+
+static void
+gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
+		     unsigned x, unsigned y,
+		     unsigned width, unsigned height)
+{
+	uint32_t x_dim, y_dim, tmp, right_mask;
+
+	/*
+	 * Simply do SIMD16 based dispatch, so every thread uses
+	 * SIMD16 channels.
+	 *
+	 * Define our own thread group size, e.g. 16x1 for every group, so
+	 * each group has 1 thread in SIMD16 dispatch. So thread
+	 * width/height/depth are all 1.
+	 *
+	 * Then thread group X = width / 16 (rounded up),
+	 * thread group Y = height.
+	 */
+	x_dim = (width + 15) / 16;
+	y_dim = height;
+
+	tmp = width & 15;
+	if (tmp == 0)
+		right_mask = (1 << 16) - 1;
+	else
+		right_mask = (1 << tmp) - 1;
+
+	OUT_BATCH(GEN7_GPGPU_WALKER | 9);
+
+	/* interface descriptor offset */
+	OUT_BATCH(0);
+
+	/* SIMD size, thread w/h/d */
+	OUT_BATCH(1 << 30 | /* SIMD16 */
+		  0 << 16 | /* depth:1 */
+		  0 << 8 | /* height:1 */
+		  0); /* width:1 */
+
+	/* thread group X */
+	OUT_BATCH(0);
+	OUT_BATCH(x_dim);
+
+	/* thread group Y */
+	OUT_BATCH(0);
+	OUT_BATCH(y_dim);
+
+	/* thread group Z */
+	OUT_BATCH(0);
+	OUT_BATCH(1);
+
+	/* right mask */
+	OUT_BATCH(right_mask);
+
+	/* bottom mask, height 1, always 0xffffffff */
+	OUT_BATCH(0xffffffff);
+}
+
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+		    struct igt_buf *dst,
+		    unsigned x, unsigned y,
+		    unsigned width, unsigned height,
+		    uint8_t color)
+{
+	uint32_t curbe_buffer, interface_descriptor;
+	uint32_t batch_end;
+
+	intel_batchbuffer_flush(batch);
+
+	/* setup states */
+	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+
+	/*
+	 * The const buffer needs to be filled for every thread, but as we have
+	 * just 1 thread per group, only one set of curbe data is needed.
+	 *
+	 * For each thread, just use thread group ID for buffer offset.
+	 */
+	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
+
+	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+							      gpgpu_kernel,
+							      sizeof(gpgpu_kernel));
+	igt_assert(batch->ptr < &batch->buffer[4095]);
+
+	batch->ptr = batch->buffer;
+
+	/* GPGPU pipeline */
+	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
+
+	gen7_emit_state_base_address(batch);
+
+	gen7_emit_vfe_state_gpgpu(batch);
+
+	gen7_emit_curbe_load(batch, curbe_buffer);
+
+	gen7_emit_interface_descriptor_load(batch, interface_descriptor);
+
+	gen7_emit_gpgpu_walk(batch, x, y, width, height);
+
+	OUT_BATCH(MI_BATCH_BUFFER_END);
+
+	batch_end = batch_align(batch, 8);
+	igt_assert(batch_end < BATCH_STATE_SPLIT);
+
+	gen7_render_flush(batch, batch_end);
+	intel_batchbuffer_reset(batch);
+}
diff --git a/shaders/gpgpu/README b/shaders/gpgpu/README
new file mode 100644
index 0000000..3bf328a
--- /dev/null
+++ b/shaders/gpgpu/README
@@ -0,0 +1,4 @@
+
+Commands used to generate the shader on gen7
+$> m4 gpgpu_fill.gxa > gpgpu_fill.gxm
+$> intel-gen4asm -g 7 -o <output> gpgpu_fill.gxm
diff --git a/shaders/gpgpu/gpgpu_fill.gxa b/shaders/gpgpu/gpgpu_fill.gxa
new file mode 100644
index 0000000..fc309f3
--- /dev/null
+++ b/shaders/gpgpu/gpgpu_fill.gxa
@@ -0,0 +1,51 @@
+/*
+ * Registers
+ * g0 -- header
+ * g1 -- constant
+ * g2 -- calculate X/Y offset
+ * g4-g12 payload for write message
+ */
+define(`ORIG',          `g2.0<2,2,1>UD')
+define(`ORIG_X',        `g2.0<1>UD')
+define(`ORIG_Y',        `g2.4<1>UD')
+define(`COLOR',         `g1.0')
+define(`COLORUB',       `COLOR<0,1,0>UB')
+define(`COLORUD',       `COLOR<0,1,0>UD')
+define(`X',             `g0.4<0,1,0>UD')
+define(`Y',             `g0.24<0,1,0>UD')
+
+mov(4)  COLOR<1>UB      COLORUB         {align1};
+
+/* WRITE */
+/* compute X/Y offset from the thread group ID */
+mul(1)  ORIG_X          X        0x10UD {align1};
+mov(1)  ORIG_Y          Y               {align1};
+mov(8)  g4.0<1>UD       g0.0<8,8,1>UD   {align1};
+mov(2)  g4.0<1>UD       ORIG            {align1};
+/* Normal mode: for block height 1 row and block width 16 bytes */
+mov(1)  g4.8<1>UD       0x0000000fUD    {align1};
+
+mov(16) g5.0<1>UD       COLORUD         {align1 compr};
+mov(16) g7.0<1>UD       COLORUD         {align1 compr};
+mov(16) g9.0<1>UD       COLORUD         {align1 compr};
+mov(16) g11.0<1>UD      COLORUD         {align1 compr};
+
+/*
+ * comment out the following instruction on Gen7
+ * write(0, 0, 10, 12)
+ *   10: media_block_write
+ *   12: data cache data port 1
+ */
+send(16) 4 acc0<1>UW null write(0, 0, 10, 12) mlen 9 rlen 0 {align1};
+
+/*
+ * uncomment the following instruction on Gen7
+ * write(0, 0, 10, 0)
+ *   10: media_block_write
+ *    0: render cache data port
+ */
+/* send(16) 4 acc0<1>UW null write(0, 0, 10, 0) mlen 9 rlen 0 {align1}; */
+
+/* EOT */
+mov(8)  g112.0<1>UD       g0.0<8,8,1>UD   {align1};
+send(16) 112 null<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
-- 
2.1.3




More information about the Intel-gfx mailing list