[Intel-gfx] [PATCH] igt/gem_trtt: Exercise the TRTT hardware
Chris Wilson
chris at chris-wilson.co.uk
Mon Jan 11 04:32:08 PST 2016
On Sat, Jan 09, 2016 at 05:01:30PM +0530, akash.goel at intel.com wrote:
> +static void* mmap_bo(int fd, uint32_t handle, uint64_t size)
> +{
> + uint32_t *ptr = gem_mmap__cpu(fd, handle, 0, size, PROT_READ);
> + gem_set_domain(fd, handle, I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
read-only mapping, but set to the cpu write domain? Seems inconsistent
(see the sketch after the function).
> + return ptr;
> +}
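Either map with PROT_WRITE or drop the write domain. A consistent
read-only variant would look something like this (untested sketch):

static void *mmap_bo(int fd, uint32_t handle, uint64_t size)
{
        uint32_t *ptr = gem_mmap__cpu(fd, handle, 0, size, PROT_READ);

        /* we only ever read back through this mapping, so no write domain */
        gem_set_domain(fd, handle, I915_GEM_DOMAIN_CPU, 0);

        return ptr;
}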
> +/* gem_store_data_svm
> + * populate batch buffer with MI_STORE_DWORD_IMM command
> + * @fd: drm file descriptor
> + * @cmd_buf: batch buffer
> + * @dw_offset: write offset in batch buffer
> + * @vaddr: destination Virtual address
> + * @data: data to be stored at the destination
> + * @end: whether to end batch buffer or not
> +*/
> +static int gem_store_data_svm(int fd, uint32_t *cmd_buf, uint32_t dw_offset,
> + uint64_t vaddr, uint32_t data, bool end)
Urm, what?
Just call this what it is,
emit_store_dword(cs, offset, vaddr, value);
Don't pass in bool end, since it is going to be used exactly once and just
confuses all the other callers; see the sketch after the function body.
> +{
> + cmd_buf[dw_offset++] = MI_STORE_DWORD_IMM;
> + cmd_buf[dw_offset++] = vaddr & 0xFFFFFFFC;
> + cmd_buf[dw_offset++] = (vaddr >> 32) & 0xFFFF; /* bits 32:47 */
> +
> + cmd_buf[dw_offset++] = data;
Interesting use of whitespace.
> + if (end) {
> + cmd_buf[dw_offset++] = MI_BATCH_BUFFER_END;
> + cmd_buf[dw_offset++] = 0;
> + }
> +
> + return dw_offset;
> +}
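Untested sketch of that helper, with the terminating MI_BATCH_BUFFER_END
left to the one caller that needs it:

static int emit_store_dword(uint32_t *cs, int offset,
                            uint64_t vaddr, uint32_t value)
{
        /* MI_STORE_DWORD_IMM with a 48-bit virtual address */
        cs[offset++] = MI_STORE_DWORD_IMM;
        cs[offset++] = vaddr & 0xfffffffc;
        cs[offset++] = (vaddr >> 32) & 0xffff; /* bits 32:47 */
        cs[offset++] = value;

        return offset;
}

and the caller then finishes the batch once with
cs[offset++] = MI_BATCH_BUFFER_END;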
> +
> +/* setup_execbuffer
> + * helper for buffer execution
> + * @execbuf - pointer to execbuffer
> + * @exec_object - pointer to exec object2 struct
> + * @ring - ring to be used
> + * @buffer_count - how many buffers to submit
> + * @batch_length - length of batch buffer
> +*/
> +static void setup_execbuffer(struct drm_i915_gem_execbuffer2 *execbuf,
> + struct drm_i915_gem_exec_object2 *exec_object,
> + int ring, int buffer_count, int batch_length)
> +{
How about memset(execbuf, 0, sizeof(*execbuf)); and then only filling in
the non-zero fields? See the sketch after the function.
> + execbuf->buffers_ptr = (unsigned long)exec_object;
> + execbuf->buffer_count = buffer_count;
> + execbuf->batch_start_offset = 0;
> + execbuf->batch_len = batch_length;
> + execbuf->cliprects_ptr = 0;
> + execbuf->num_cliprects = 0;
> + execbuf->DR1 = 0;
> + execbuf->DR4 = 0;
> + execbuf->flags = ring;
> + i915_execbuffer2_set_context_id(*execbuf, 0);
> + execbuf->rsvd2 = 0;
> +}
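i.e. (untested sketch, only the non-zero fields survive):

static void setup_execbuffer(struct drm_i915_gem_execbuffer2 *execbuf,
                             struct drm_i915_gem_exec_object2 *exec_object,
                             int ring, int buffer_count, int batch_length)
{
        memset(execbuf, 0, sizeof(*execbuf));
        execbuf->buffers_ptr = (uintptr_t)exec_object;
        execbuf->buffer_count = buffer_count;
        execbuf->batch_len = batch_length;
        execbuf->flags = ring;
}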
> +
> +/* submit_and_sync
> + * Helper function for exec and sync functions
> + * @fd - drm fd
> + * @execbuf - pointer to execbuffer
> + * @batch_buf_handle - batch buffer handle
> +*/
> +static void submit_and_sync(int fd, struct drm_i915_gem_execbuffer2 *execbuf,
> + uint32_t batch_buf_handle)
> +{
> + gem_execbuf(fd, execbuf);
> + gem_sync(fd, batch_buf_handle);
The only caller of this also does its own sync. This seems irrelevant
and serves as a bad example.
> +}
> +
> +struct local_i915_gem_context_trtt_param {
> + uint64_t l3_table_address;
> + uint32_t invd_tile_val;
> + uint32_t null_tile_val;
> +};
> +
> +/* send_trtt_params
> + * Helper function to request KMD to enable TRTT
> + * @fd - drm fd
> + * @ctx_id - id of the context for which TRTT is to be enabled
> + * @l3_table_address - GFX address of the L3 table
> +*/
> +static void send_trtt_params(int fd, uint32_t ctx_id, uint64_t l3_table_address)
It is not a socket, pipe, or other transport medium. Just setup_trtt().
> +#define TABLE_SIZE 0x1000
> +#define TILE_SIZE 0x10000
> +
> +#define FIRST_TILE_ADDRESS 0xF00000000000
> +#define LAST_TILE_ADDRESS 0xFFFFFFFF0000
> +
> +#define BO_ALLOC_AND_SETUP(fd, bo_size, bo_handle, bo_offset, idx) \
> + bo_handle = gem_create(fd, bo_size); \
> + bo_offset = current_ppgtt_offset; \
> + setup_exec_obj(&exec_object2[idx], bo_handle, EXEC_OBJECT_PINNED, bo_offset); \
> + current_ppgtt_offset += bo_size;
Function!
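Untested sketch of such a function (name made up), passing the running
ppgtt offset around explicitly rather than hiding it in a macro:

static uint32_t bo_alloc_setup(int fd, uint64_t size,
                               struct drm_i915_gem_exec_object2 *obj,
                               uint64_t *ppgtt_offset, uint64_t *bo_offset)
{
        uint32_t handle = gem_create(fd, size);

        /* record the pinned offset, then advance the simple allocator */
        *bo_offset = *ppgtt_offset;
        setup_exec_obj(obj, handle, EXEC_OBJECT_PINNED, *bo_offset);
        *ppgtt_offset += size;

        return handle;
}

so that each caller becomes a single line, e.g.

l3_tbl_handle = bo_alloc_setup(fd, TABLE_SIZE, &exec_object2[0],
                               &current_ppgtt_offset, &l3_tbl_offset);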
> +
> +/* basic test
> + * This test will add a series of MI_STORE_ commands, first to update the
> + * TR-TT table entries and then to update the data buffers using the TR-TT VA,
> + * exercising the table programming done previously
> +*/
> +static void gem_basic_trtt_use(void)
> +{
> + int fd;
> + int ring, len = 0;
> + uint32_t *ptr;
> + struct drm_i915_gem_execbuffer2 execbuf;
> + struct drm_i915_gem_exec_object2 exec_object2[8];
> + uint32_t batch_buffer[BO_SIZE];
> +
> + uint32_t l3_tbl_handle, l2_tbl1_handle, l2_tbl2_handle;
> + uint32_t l1_tbl1_handle, l1_tbl2_handle, batch_buf_handle;
> + uint32_t buffer1_handle, buffer2_handle;
> +
> + uint64_t l3_tbl_offset, l2_tbl1_offset, l2_tbl2_offset;
> + uint64_t l1_tbl1_offset, l1_tbl2_offset;
> + uint64_t buffer1_offset, buffer2_offset;
> +
> + uint32_t data;
> + uint64_t address, current_ppgtt_offset = 0x10000;
> +
> + fd = drm_open_driver(DRIVER_INTEL);
> + igt_require(uses_full_ppgtt(fd, FULL_48_BIT_PPGTT));
> + igt_require(has_softpin_support(fd));
> + igt_require(has_trtt_support(fd));
> +
> + /* Allocate a L3 table BO */
> + BO_ALLOC_AND_SETUP(fd, TABLE_SIZE, l3_tbl_handle, l3_tbl_offset, 0);
> +
> + /* Allocate two L2 table BOs */
> + BO_ALLOC_AND_SETUP(fd, TABLE_SIZE, l2_tbl1_handle, l2_tbl1_offset, 1);
> + BO_ALLOC_AND_SETUP(fd, TABLE_SIZE, l2_tbl2_handle, l2_tbl2_offset, 2);
> +
> + /* Allocate two L1 table BOs */
> + BO_ALLOC_AND_SETUP(fd, TABLE_SIZE, l1_tbl1_handle, l1_tbl1_offset, 3);
> + BO_ALLOC_AND_SETUP(fd, TABLE_SIZE, l1_tbl2_handle, l1_tbl2_offset, 4);
> +
> + /* Align the PPGTT offsets for the 2 data buffers to next 64 KB boundary */
> + current_ppgtt_offset = ALIGN(current_ppgtt_offset, TILE_SIZE);
> +
> + /* Allocate two Data buffer BOs */
> + BO_ALLOC_AND_SETUP(fd, TILE_SIZE, buffer1_handle, buffer1_offset, 5);
> + BO_ALLOC_AND_SETUP(fd, TILE_SIZE, buffer2_handle, buffer2_offset, 6);
> +
> + /* Finally allocate Batch buffer BO */
> + batch_buf_handle = gem_create(fd, BO_SIZE);
> + setup_exec_obj(&exec_object2[7], batch_buf_handle, 0, 0);
Scary jump from idx to 7.
Why not just pin this as well to reduce the code complexity? Afterwards
setup_exec_obj() can allocate an object all by itself.
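e.g., reusing the allocation helper sketched above (batch_offset being a
new local), the batch gets created and pinned like every other object:

batch_buf_handle = bo_alloc_setup(fd, BO_SIZE, &exec_object2[7],
                                  &current_ppgtt_offset, &batch_offset);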
> +
> + /* Add commands to update the two L3 table entries to point them to the L2 tables*/
> + address = l3_tbl_offset;
> + data = l2_tbl1_offset;
> + len = gem_store_data_svm(fd, batch_buffer, len, address, data, false);
> +
> + address = l3_tbl_offset + 511*sizeof(uint64_t);
> + data = l2_tbl2_offset;
> + len = gem_store_data_svm(fd, batch_buffer, len, address, data, false);
> +
> + /* Add commands to update an entry of 2 L2 tables to point them to the L1 tables*/
> + address = l2_tbl1_offset;
> + data = l1_tbl1_offset;
> + len = gem_store_data_svm(fd, batch_buffer, len, address, data, false);
> +
> + address = l2_tbl2_offset + 511*sizeof(uint64_t);
> + data = l1_tbl2_offset;
> + len = gem_store_data_svm(fd, batch_buffer, len, address, data, false);
> +
> + /* Add commands to update an entry of 2 L1 tables to point them to the data buffers*/
> + address = l1_tbl1_offset;
> + data = buffer1_offset >> 16;
> + len = gem_store_data_svm(fd, batch_buffer, len, address, data, false);
> +
> + address = l1_tbl2_offset + 1023*sizeof(uint32_t);
> + data = buffer2_offset >> 16;
> + len = gem_store_data_svm(fd, batch_buffer, len, address, data, false);
> +
> + /* Add commands to update the 2 data buffers, using their TRTT VA */
> + data = 0x12345678;
> + len = gem_store_data_svm(fd, batch_buffer, len, FIRST_TILE_ADDRESS, data, false);
> + len = gem_store_data_svm(fd, batch_buffer, len, LAST_TILE_ADDRESS, data, true);
> +
> + gem_write(fd, batch_buf_handle, 0, batch_buffer, len*4);
Or for even shorter code: batch_buffer = gem_mmap__cpu(exec_object[batch].handle);
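i.e. an untested sketch of building the commands in place instead of a
local array plus gem_write():

uint32_t *batch;

batch = gem_mmap__cpu(fd, batch_buf_handle, 0, BO_SIZE, PROT_WRITE);
gem_set_domain(fd, batch_buf_handle,
               I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);

/* emit the stores directly into batch[], then terminate and unmap */
batch[len++] = MI_BATCH_BUFFER_END;
munmap(batch, BO_SIZE);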
> +
> + /* Request KMD to setup the TR-TT */
> + send_trtt_params(fd, 0, l3_tbl_offset);
> +
> + ring = I915_EXEC_RENDER;
> + setup_execbuffer(&execbuf, exec_object2, ring, 8, len*4);
> +
> + /* submit command buffer */
> + submit_and_sync(fd, &execbuf, batch_buf_handle);
> +
> + /* read the 2 data buffers to check for the value written by the GPU */
> + ptr = mmap_bo(fd, buffer1_handle, TILE_SIZE);
> + igt_fail_on_f(ptr[0] != data,
> + "\nCPU read does not match GPU write,\
> + expected: 0x%x, got: 0x%x\n",
> + data, ptr[0]);
> +
> + ptr = mmap_bo(fd, buffer2_handle, TILE_SIZE);
> + igt_fail_on_f(ptr[0] != data,
> + "\nCPU read does not match GPU write,\
> + expected: 0x%x, got: 0x%x\n",
> + data, ptr[0]);
> +
> + gem_close(fd, l3_tbl_handle);
> + gem_close(fd, l2_tbl1_handle);
> + gem_close(fd, l2_tbl2_handle);
> + gem_close(fd, l1_tbl1_handle);
> + gem_close(fd, l1_tbl2_handle);
> + gem_close(fd, buffer1_handle);
> + gem_close(fd, buffer2_handle);
> + gem_close(fd, batch_buf_handle);
> + close(fd);
> +}
> +
> +int main(int argc, char* argv[])
> +{
> + igt_subtest_init(argc, argv);
Together these are igt_main, and then you can also drop igt_exit; see the
sketch after the function.
> + igt_skip_on_simulation();
I think you want this on simulation as well, at least "basic".
> +
> + /* test needs 48 PPGTT & Soft Pin support */
> + igt_subtest("basic") {
> + gem_basic_trtt_use();
> + }
> +
> + igt_exit();
> +}
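i.e. (untested sketch, keeping "basic" runnable in simulation as well):

igt_main
{
        /* 48b PPGTT, softpin and TRTT support are checked inside the test */
        igt_subtest("basic")
                gem_basic_trtt_use();
}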
> +
> --
> 1.9.2
>
--
Chris Wilson, Intel Open Source Technology Centre