[igt-dev] [PATCH i-g-t v7 2/2] tests/kms_ccs: CCS Clear Color test

Mon Nov 2 17:27:05 UTC 2020

On Fri, Oct 23, 2020 at 04:05:26PM +0300, Mika Kahola wrote:
> The patch proposes a method to test CCS with clear color
> capability.
> 
> The test paints a solid color on primary fb and a small sprite fb.
> These are cleared with fast clear feature. A crc is captured and
> compared against the reference.
> 
> v2: Modify _gen9_render_copyfunc to support fast clear (Matt)
>     Enable fast clear bit on 3D sequence (Matt)
>     Add helper function to figure out clear color modifier (Matt)
> v3: Remove unrelated line additions/removes
> v4: Fast clear with color (Imre)
> v5: Write raw 32-bit color values to register (Imre)
>     Require 32-bit color format
> v6: Rebase to use batchbuffer without libdrm dependency
> v7: Enable clear color (Nanley)
> 
> Signed-off-by: Mika Kahola <mika.kahola at intel.com>
> ---
>  lib/gen8_render.h       |  1 +
>  lib/gen9_render.h       |  6 ++-
>  lib/intel_batchbuffer.c | 10 +++++
>  lib/intel_batchbuffer.h |  6 +++
>  lib/rendercopy.h        |  4 ++
>  lib/rendercopy_gen9.c   | 79 ++++++++++++++++++++++++++++------
>  tests/kms_ccs.c         | 94 ++++++++++++++++++++++++++++++++++++++---
>  7 files changed, 181 insertions(+), 19 deletions(-)
> 
> diff --git a/lib/gen8_render.h b/lib/gen8_render.h
> index 31dc01bc..1b0f527e 100644
> --- a/lib/gen8_render.h
> +++ b/lib/gen8_render.h
> @@ -26,6 +26,7 @@
>  
>  # define GEN8_VS_FLOATING_POINT_MODE_ALTERNATE          (1 << 16)
>  
> +#define GEN8_3DSTATE_FAST_CLEAR_ENABLE		(1 << 8)
>  #define GEN8_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP	\
>  						GEN4_3D(3, 0, 0x21)
>  #define GEN8_3DSTATE_PS_BLEND			GEN4_3D(3, 0, 0x4d)
> diff --git a/lib/gen9_render.h b/lib/gen9_render.h
> index 6274e902..b4cdf18a 100644
> --- a/lib/gen9_render.h
> +++ b/lib/gen9_render.h
> @@ -131,7 +131,11 @@ struct gen9_surface_state {
>  	} ss10;
>  
>  	struct {
> -		uint32_t aux_base_addr_hi;
> +		uint32_t aux_base_addr_hi:20;
> +		uint32_t procedual_texture:1;
> +		uint32_t clearvalue_addr_enable:1;
> +		uint32_t quilt_height:5;
> +		uint32_t quilt_width:5;
>  	} ss11;

These are in ss10.

>  
>  	/* register can be used for either
> diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
> index fc73495c..905f69ff 100644
> --- a/lib/intel_batchbuffer.c
> +++ b/lib/intel_batchbuffer.c
> @@ -1096,6 +1096,16 @@ igt_vebox_copyfunc_t igt_get_vebox_copyfunc(int devid)
>  	return copy;
>  }
>  
> +igt_render_clearfunc_t igt_get_render_clearfunc(int devid)
> +{
> +	igt_render_clearfunc_t clear = NULL;
> +
> +	if (IS_GEN12(devid))
> +		clear = gen12_render_clearfunc;
> +
> +	return clear;
> +}
> +
>  /**
>   * igt_get_media_fillfunc:
>   * @devid: pci device id
> diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
> index ab1b0c28..4f0b33ee 100644
> --- a/lib/intel_batchbuffer.h
> +++ b/lib/intel_batchbuffer.h
> @@ -374,6 +374,12 @@ typedef void (*igt_vebox_copyfunc_t)(struct intel_bb *ibb,
>  
>  igt_vebox_copyfunc_t igt_get_vebox_copyfunc(int devid);
>  
> +typedef void (*igt_render_clearfunc_t)(struct intel_bb *ibb,
> +				       struct intel_buf *dst, unsigned int dst_x, unsigned int dst_y,
> +				       unsigned int width, unsigned int height,
> +				       uint32_t r, uint32_t g, uint32_t b);

The clear value needs to be in float format, so let's pass that in a float[4] array.

> +igt_render_clearfunc_t igt_get_render_clearfunc(int devid);
> +
>  /**
>   * igt_fillfunc_t:
>   * @i915: drm fd
> diff --git a/lib/rendercopy.h b/lib/rendercopy.h
> index 7d5f0802..394fc94e 100644
> --- a/lib/rendercopy.h
> +++ b/lib/rendercopy.h
> @@ -23,6 +23,10 @@ static inline void emit_vertex_normalized(struct intel_bb *ibb,
>  	intel_bb_out(ibb, u.ui);
>  }
>  
> +void gen12_render_clearfunc(struct intel_bb *ibb,
> +                            struct intel_buf *dst, unsigned int dst_x, unsigned int dst_y,
> +                            unsigned int width, unsigned int height,
> +			    uint32_t r, uint32_t g, uint32_t b);
>  void gen12_render_copyfunc(struct intel_bb *ibb,
>  			   struct intel_buf *src, uint32_t src_x, uint32_t src_y,
>  			   uint32_t width, uint32_t height,
> diff --git a/lib/rendercopy_gen9.c b/lib/rendercopy_gen9.c
> index ef6855c9..fb6f6ba3 100644
> --- a/lib/rendercopy_gen9.c
> +++ b/lib/rendercopy_gen9.c
> @@ -117,7 +117,7 @@ static const uint32_t gen12_render_copy[][4] = {
>  
>  /* Mostly copy+paste from gen6, except height, width, pitch moved */
>  static uint32_t
> -gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst) {
> +gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst, bool fast_clear) {

No need for fast_clear, we need to program the color params whenever
buf->cc.offset is set (and that's not only for fast clear).

>  	struct gen9_surface_state *ss;
>  	uint32_t write_domain, read_domain;
>  	uint64_t address;
> @@ -190,6 +190,9 @@ gen8_bind_buf(struct intel_bb *ibb, const struct intel_buf *buf, int is_dst) {
>  							   buf->addr.offset);
>  		ss->ss10.aux_base_addr = (address + buf->ccs[0].offset);
>  		ss->ss11.aux_base_addr_hi = (address + buf->ccs[0].offset) >> 32;
> +
> +		if (fast_clear)
> +			ss->ss11.clearvalue_addr_enable = 1;
>  	}
>  
>  	if (buf->cc.offset) {

Let's move this whole cc.offset address programming under the
I915_COMPRESSION_RENDER branch, where this is actually relevant.

> @@ -217,8 +220,10 @@ gen8_bind_surfaces(struct intel_bb *ibb,
>  	binding_table = intel_bb_ptr_align(ibb, 32);
>  	binding_table_offset = intel_bb_ptr_add_return_prev_offset(ibb, 32);
>  
> -	binding_table[0] = gen8_bind_buf(ibb, dst, 1);
> -	binding_table[1] = gen8_bind_buf(ibb, src, 0);
> +	binding_table[0] = gen8_bind_buf(ibb, dst, 1, 1);
> +
> +	if (src != NULL)
> +		binding_table[1] = gen8_bind_buf(ibb, src, 1, 0);
>  
>  	return binding_table_offset;
>  }
> @@ -274,16 +279,25 @@ gen7_fill_vertex_buffer_data(struct intel_bb *ibb,
>  	offset = intel_bb_offset(ibb);
>  
>  	emit_vertex_2s(ibb, dst_x + width, dst_y + height);
> -	emit_vertex_normalized(ibb, src_x + width, intel_buf_width(src));
> -	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
> +
> +	if (src != NULL) {
> +		emit_vertex_normalized(ibb, src_x + width, intel_buf_width(src));
> +		emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
> +	}

For the !src case you also need to emit the two source vertex elements
to keep the VUE format we defined in gen6_emit_vertex_elements().

>  
>  	emit_vertex_2s(ibb, dst_x, dst_y + height);
> -	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
> -	emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
> +
> +	if (src != NULL) {
> +		emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
> +		emit_vertex_normalized(ibb, src_y + height, intel_buf_height(src));
> +	}
>  
>  	emit_vertex_2s(ibb, dst_x, dst_y);
> -	emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
> -	emit_vertex_normalized(ibb, src_y, intel_buf_height(src));
> +
> +	if (src != NULL) {
> +		emit_vertex_normalized(ibb, src_x, intel_buf_width(src));
> +		emit_vertex_normalized(ibb, src_y, intel_buf_height(src));
> +	}
>  
>  	return offset;
>  }
> @@ -729,7 +743,7 @@ gen8_emit_sf(struct intel_bb *ibb)
>  }
>  
>  static void
> -gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel) {
> +gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel, bool fast_clear) {
>  	const int max_threads = 63;
>  
>  	intel_bb_out(ibb, GEN6_3DSTATE_WM | (2 - 2));
> @@ -757,6 +771,10 @@ gen8_emit_ps(struct intel_bb *ibb, uint32_t kernel) {
>  		     2 << GEN6_3DSTATE_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT);

There is only 1 table entry in case of a fast clear, and sampler count
should be also set to 0.

>  	intel_bb_out(ibb, 0); /* scratch space stuff */
>  	intel_bb_out(ibb, 0); /* scratch hi */
> +
> +	if (fast_clear)
> +		intel_bb_out(ibb, GEN8_3DSTATE_FAST_CLEAR_ENABLE);

This flag is set in the next dword.

> +
>  	intel_bb_out(ibb, (max_threads - 1) << GEN8_3DSTATE_PS_MAX_THREADS_SHIFT |
>  		     GEN6_3DSTATE_WM_16_DISPATCH_ENABLE);
>  	intel_bb_out(ibb, 6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT);
>
> @@ -890,13 +908,20 @@ void _gen9_render_copyfunc(struct intel_bb *ibb,

Would make sense to rename this to stg like gen9_render_op() as it can
be either a copy or a fast clear now.

>  	uint32_t scissor_state;
>  	uint32_t vertex_buffer;
>  	uint32_t aux_pgtable_state;
> +	bool fast_clear = src != NULL ? false : true;

It's just
	bool fast_clear = src != NULL;

>  
> -	igt_assert(src->bpp == dst->bpp);
> +	if (src != NULL)
> +		igt_assert(src->bpp == dst->bpp);
> +
> +	if (!fast_clear)
> +		igt_assert(src->bpp == dst->bpp);

Redundant check.

>  
>  	intel_bb_flush_render(ibb);
>  
>  	intel_bb_add_intel_buf(ibb, dst, true);
> -	intel_bb_add_intel_buf(ibb, src, false);
> +
> +	if (!fast_clear)
> +		intel_bb_add_intel_buf(ibb, src, false);
>  
>  	intel_bb_ptr_set(ibb, BATCH_STATE_SPLIT);
>  
> @@ -949,11 +974,13 @@ void _gen9_render_copyfunc(struct intel_bb *ibb,
>  	intel_bb_out(ibb, 0);
>  	intel_bb_out(ibb, 0);
>  
> +	gen8_emit_ps(ibb, ps_kernel_off, fast_clear);
> +
>  	gen7_emit_clip(ibb);
>  
>  	gen8_emit_sf(ibb);
>  
> -	gen8_emit_ps(ibb, ps_kernel_off);
> +	gen8_emit_ps(ibb, ps_kernel_off, fast_clear);
>  
>  	intel_bb_out(ibb, GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS);
>  	intel_bb_out(ibb, ps_binding_table);
> @@ -1027,3 +1054,29 @@ void gen12_render_copyfunc(struct intel_bb *ibb,
>  
>  	gen12_aux_pgtable_cleanup(ibb, &pgtable_info);
>  }

Before returning we also need PIPE_CONTROL(render_target_cache_flush,
l3_fabric_flush and depth_stall) see BSpec/47112.

> +
> +void gen12_render_clearfunc(struct intel_bb *ibb,
> +			    struct intel_buf *dst,
> +			    unsigned int dst_x, unsigned int dst_y,
> +			    unsigned int width, unsigned int height,
> +			    uint32_t r, uint32_t g, uint32_t b)
> +{
> +	struct aux_pgtable_info pgtable_info = { };
> +
> +	gen12_aux_pgtable_init(&pgtable_info, ibb, NULL, dst);

The above expects both src and dst to be set, so you need to add support
for the dst only case there.

> +
> +	/* BSpec 21136 */
> +	intel_bb_ptr_set(ibb, dst->cc.offset);

The above sets a pointer inside the ibb, but cc.offset is an offset in
dst. Also these emitted values would be overwritten by the following
batchbuffer initing in _gen9_render_copyfunc() and it's missing the
reloc info.

Let's pass the float[4] clear value to _gen9_render_copyfunc() and do
an MI_STORE_DWORD_IMM+reloc emit for each of the clear value members
after gen12_emit_aux_pgtable_state() in _gen9_render_copyfunc().

> +	intel_bb_out(ibb, r);
> +	intel_bb_out(ibb, b);
> +	intel_bb_out(ibb, g);
> +	intel_bb_out(ibb, 0xffffffff);
> +
> +	_gen9_render_copyfunc(ibb, NULL, 0, 0,
> +			  width, height, dst, dst_x, dst_y,
> +			  pgtable_info.pgtable_buf,
> +			  gen12_render_copy,
> +			  sizeof(gen12_render_copy));
> +
> +	gen12_aux_pgtable_cleanup(ibb, &pgtable_info);
> +}
> diff --git a/tests/kms_ccs.c b/tests/kms_ccs.c
> index 53abecce..fac1ed4e 100644
> --- a/tests/kms_ccs.c
> +++ b/tests/kms_ccs.c
> @@ -50,10 +50,12 @@ enum test_fb_flags {
>  	FB_MISALIGN_AUX_STRIDE		= 1 << 2,
>  	FB_SMALL_AUX_STRIDE		= 1 << 3,
>  	FB_ZERO_AUX_STRIDE		= 1 << 4,
> +	FB_CLEAR_COLOR			= 1 << 5,

No need for a new flag, we can pick the fast clear path based on the
RC-CC modifier.

>  };
>  
>  typedef struct {
>  	int drm_fd;
> +	int devid;
>  	igt_display_t display;
>  	igt_output_t *output;
>  	enum pipe pipe;
> @@ -62,6 +64,11 @@ typedef struct {
>  	igt_pipe_crc_t *pipe_crc;
>  	uint32_t format;
>  	uint64_t ccs_modifier;
> +	struct igt_fb primary_fb;
> +	igt_plane_t *primary;
> +	igt_render_clearfunc_t fast_clear;
> +	struct buf_ops *bops;
> +	struct intel_bb *ibb;

Th above don't need to be cached here, we'll do a fast clear at a single
spot, so can get all these values there locally.

>  } data_t;
>  
>  static const struct {
> @@ -120,6 +127,16 @@ static void addfb_init(struct igt_fb *fb, struct drm_mode_fb_cmd2 *f)
>  	}
>  }
>  
> +static bool is_ccs_cc_modifier(uint64_t modifier)
> +{
> +	switch (modifier) {
> +	case LOCAL_I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}
> +
>  /*
>   * The CCS planes of compressed framebuffers contain non-zero bytes if the
>   * engine compressed effectively the framebuffer. The actual encoding of these
> @@ -290,6 +307,32 @@ static igt_plane_t *compatible_main_plane(data_t *data)
>  	return igt_output_get_plane_type(data->output, DRM_PLANE_TYPE_PRIMARY);
>  }
>  
> +static struct intel_buf *init_buf(data_t *data, const struct igt_fb *fb, const char *buf_name)
> +{
> +	struct intel_buf *buf;
> +	uint32_t name, handle, tiling, stride, width, height, bpp, size;
> +
> +	igt_assert_eq(fb->offsets[0], 0);
> +	tiling = igt_fb_mod_to_tiling(fb->modifier);
> +	stride = fb->strides[0];
> +	bpp = fb->plane_bpp[0];
> +	size = fb->size;
> +	width = stride / (bpp / 8);
> +	height = size / stride;
> +	name = gem_flink(data->drm_fd, fb->gem_handle);
> +	handle = gem_open(data->drm_fd, name);
> +	buf = intel_buf_create_using_handle(data->bops, handle, width, height, bpp, 0, tiling, 0);

This won't setup any of the compression state, so instead of init_buf()
let's instead add a fast_clear_fb() func here and call a new exported

	struct intel_buf *
	igt_fb_create_intel_buf(int drm_fd, struct buf_ops *,
				const struct igt_fb *, const char *name);

function from lib/igt_fb.c, which calls lib/igt_fb.c/crate_buf() internally.

> +        intel_buf_set_name(buf, buf_name);
> +        intel_buf_set_ownership(buf, true);
> +
> +        return buf;
> +}
> +
> +static void fini_buf(struct intel_buf *buf)
> +{
> +	intel_buf_destroy(buf);
> +}
> +
>  static bool try_config(data_t *data, enum test_fb_flags fb_flags,
>  		       igt_crc_t *crc)
>  {
> @@ -349,6 +392,37 @@ static bool try_config(data_t *data, enum test_fb_flags fb_flags,
>  	if (data->flags & TEST_BAD_ROTATION_90)
>  		igt_plane_set_rotation(primary, IGT_ROTATION_90);
>  
> +	if (fb_flags & FB_CLEAR_COLOR) {
> +		struct intel_buf *dst;
> +
> +		/* require 32-bit bpp for a fast clear test */
> +		igt_skip_on(data->primary_fb.plane_bpp[0] != 32);

This should be just a

	if (!ccs_cc_modifier && format != XRGB8888)
		return false;

early return to avoid the overhead up to this point in the func.

> +
> +		data->ibb = intel_bb_create(data->drm_fd, 4096);
> +		data->bops = buf_ops_create(data->drm_fd);
> +
> +		dst = init_buf(data, &data->primary_fb, "fast clear dst");
> +		gem_set_domain(data->drm_fd, data->primary_fb.gem_handle,
> +			       I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
> +
> +		/*
> +		 * We expect the kernel to limit the max fb
> +		 * size/stride to something that can still
> +		 * rendered with the blitter/render engine.
> +		 */
> +		data->fast_clear(data->ibb, dst, 0, 0,
> +				 data->primary_fb.width,
> +				 data->primary_fb.height,
> +				 colors[0].r*UINT32_MAX,
> +				 colors[0].g*UINT32_MAX,
> +				 colors[0].b*UINT32_MAX);
> +
> +		fini_buf(dst);
> +		intel_bb_reset(data->ibb, true);

All the above belongs to generate_fb() as an alternative path for the
cairo fill there (in the !TEST_BAD_PIXEL_FORMAT branch for the
RC-CC modifier case).

> +
> +		return true;

Stray return.

> +	}
> +
>  	ret = igt_display_try_commit2(display, commit);
>  	if (data->flags & TEST_BAD_ROTATION_90) {
>  		igt_assert_eq(ret, -EINVAL);
> @@ -386,10 +460,18 @@ static int test_ccs(data_t *data)
>  	if (data->flags & TEST_CRC) {
>  		data->pipe_crc = igt_pipe_crc_new(data->drm_fd, data->pipe, INTEL_PIPE_CRC_SOURCE_AUTO);
>  
> -		if (try_config(data, fb_flags | FB_COMPRESSED, &ref_crc) &&
> -		    try_config(data, fb_flags, &crc)) {
> -			igt_assert_crc_equal(&crc, &ref_crc);
> -			valid_tests++;
> +		if (is_ccs_cc_modifier(data->ccs_modifier)) {
> +			if (try_config(data, fb_flags | FB_COMPRESSED, &ref_crc) &&
> +			    try_config(data, fb_flags | FB_COMPRESSED | FB_CLEAR_COLOR, &crc)) {
> +				igt_assert_crc_equal(&crc, &ref_crc);
> +				valid_tests++;
> +			}
> +		} else {
> +			if (try_config(data, fb_flags | FB_COMPRESSED, &ref_crc) &&
> +			    try_config(data, fb_flags, &crc)) {
> +				igt_assert_crc_equal(&crc, &ref_crc);
> +				valid_tests++;
> +			}

We can keep this function as-is, and just depend on the RC-CC modifier
the pick the fast clear path in try_config().

Please also add a check_ccs_cc_plane() which will check if the
fast_clear func() programmed the clear values properly and that the
render engine generated the native surface format value 16 bytes above
the dst.cc_offset area. The check should be something like

	uint32_t *cc_val = cc_map;

	igt_assert(color->r == cc_map[0] &&
	           color->g == cc_map[1] &&
		   color->b == cc_map[2]);

	native_color = (uint8_t)(color->r * 0xff) << 16 |
		       (uint8_t)(color->g * 0xff) << 8 |
		       (uint8_t)(color->b * 0xff);

	igt_assert(native_color == cc_map[4]);

>  		}
>  
>  		igt_pipe_crc_free(data->pipe_crc);
> @@ -471,11 +553,13 @@ igt_main_args("c", NULL, help_str, opt_handler, NULL)
>  	igt_fixture {
>  		data.drm_fd = drm_open_driver_master(DRIVER_INTEL);
>  
> -		igt_require(intel_gen(intel_get_drm_devid(data.drm_fd)) >= 9);
> +		data.devid = intel_gen(intel_get_drm_devid(data.drm_fd));
> +		igt_require(data.devid >= 9);
>  		kmstest_set_vt_graphics_mode();
>  		igt_require_pipe_crc(data.drm_fd);
>  
>  		igt_display_require(&data.display, data.drm_fd);
> +		data.fast_clear = igt_get_render_clearfunc(data.devid);

No need to cache this in data, we can use it locally at the single spot
it's needed.

>  	}
>  
>  	for_each_pipe_static(pipe) {
> -- 
> 2.25.1
>