[Mesa-dev] [PATCH 5/5] radeonsi: implement GL_KHR_blend_equation_advanced

Marek Olšák maraeo at gmail.com
Fri Mar 23 19:36:03 UTC 2018


From: Marek Olšák <marek.olsak at amd.com>

MSAA is supported using sample shading. Layered rendering and all texture
targets are also supported.
---
 docs/features.txt                                 |  2 +-
 docs/relnotes/18.1.0.html                         |  1 +
 src/gallium/drivers/radeonsi/si_blit.c            |  8 +++
 src/gallium/drivers/radeonsi/si_descriptors.c     | 87 +++++++++++++++++++++--
 src/gallium/drivers/radeonsi/si_get.c             |  2 +-
 src/gallium/drivers/radeonsi/si_pipe.h            |  9 +++
 src/gallium/drivers/radeonsi/si_shader.c          |  4 +-
 src/gallium/drivers/radeonsi/si_shader.h          |  3 +
 src/gallium/drivers/radeonsi/si_shader_internal.h |  1 +
 src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c | 59 +++++++++++++++
 src/gallium/drivers/radeonsi/si_state.c           | 19 +++--
 src/gallium/drivers/radeonsi/si_state.h           |  8 +++
 src/gallium/drivers/radeonsi/si_state_binning.c   |  2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c   | 18 +++++
 14 files changed, 205 insertions(+), 18 deletions(-)

diff --git a/docs/features.txt b/docs/features.txt
index 5eae34bf0df..d579d245eb4 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -262,21 +262,21 @@ GLES3.1, GLSL ES 3.1 -- all DONE: i965/hsw+, nvc0, r600, radeonsi
 
   Additional functionality not covered above:
       glMemoryBarrierByRegion                           DONE
       glGetTexLevelParameter[fi]v - needs updates       DONE
       glGetBooleani_v - restrict to GLES enums
       gl_HelperInvocation support                       DONE (i965, r600)
 
 GLES3.2, GLSL ES 3.2 -- all DONE: i965/gen9+
 
   GL_EXT_color_buffer_float                             DONE (all drivers)
-  GL_KHR_blend_equation_advanced                        DONE (i965, nvc0)
+  GL_KHR_blend_equation_advanced                        DONE (i965, nvc0, radeonsi)
   GL_KHR_debug                                          DONE (all drivers)
   GL_KHR_robustness                                     DONE (i965, nvc0, radeonsi)
   GL_KHR_texture_compression_astc_ldr                   DONE (freedreno, i965/gen9+)
   GL_OES_copy_image                                     DONE (all drivers)
   GL_OES_draw_buffers_indexed                           DONE (all drivers that support GL_ARB_draw_buffers_blend)
   GL_OES_draw_elements_base_vertex                      DONE (all drivers)
   GL_OES_geometry_shader                                DONE (i965/hsw+, nvc0, radeonsi)
   GL_OES_gpu_shader5                                    DONE (all drivers that support GL_ARB_gpu_shader5)
   GL_OES_primitive_bounding_box                         DONE (i965/gen7+, nvc0, radeonsi)
   GL_OES_sample_shading                                 DONE (i965, nvc0, r600, radeonsi)
diff --git a/docs/relnotes/18.1.0.html b/docs/relnotes/18.1.0.html
index 3e119078731..a89861d2bda 100644
--- a/docs/relnotes/18.1.0.html
+++ b/docs/relnotes/18.1.0.html
@@ -43,20 +43,21 @@ TBD.
 Note: some of the new features are only available with certain drivers.
 </p>
 
 <ul>
 <li>OpenGL 3.1 with ARB_compatibility on nv50, nvc0, r600, radeonsi, softpipe, llvmpipe, svga</li>
 <li>GL_ARB_bindless_texture on nvc0/maxwell+</li>
 <li>GL_EXT_semaphore on radeonsi</li>
 <li>GL_EXT_semaphore_fd on radeonsi</li>
 <li>GL_EXT_shader_framebuffer_fetch on i965 on desktop GL (GLES was already supported)</li>
 <li>GL_EXT_shader_framebuffer_fetch_non_coherent on i965</li>
+<li>GL_KHR_blend_equation_advanced on radeonsi</li>
 <li>Disk shader cache support for i965 enabled by default</li>
 </ul>
 
 <h2>Bug fixes</h2>
 
 <ul>
 TBD
 </ul>
 
 <h2>Changes</h2>
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index f1c4f6d1e72..d9d489825f8 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -799,20 +799,28 @@ void si_decompress_textures(struct si_context *sctx, unsigned shader_mask)
 			si_decompress_resident_textures(sctx);
 		if (sctx->uses_bindless_images)
 			si_decompress_resident_images(sctx);
 	} else if (shader_mask & (1 << PIPE_SHADER_COMPUTE)) {
 		if (sctx->cs_shader_state.program->uses_bindless_samplers)
 			si_decompress_resident_textures(sctx);
 		if (sctx->cs_shader_state.program->uses_bindless_images)
 			si_decompress_resident_images(sctx);
 	}
 
+	if (sctx->ps_uses_fbfetch) {
+		struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
+		si_decompress_color_texture(sctx,
+					    (struct r600_texture*)cb0->texture,
+					    cb0->u.tex.first_layer,
+					    cb0->u.tex.last_layer);
+	}
+
 	si_check_render_feedback(sctx);
 }
 
 /* Helper for decompressing a portion of a color or depth resource before
  * blitting if any decompression is needed.
  * The driver doesn't decompress resources automatically while u_blitter is
  * rendering. */
 static void si_decompress_subresource(struct pipe_context *ctx,
 				      struct pipe_resource *tex,
 				      unsigned planes, unsigned level,
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 07d1420d8ee..38befa445d2 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -680,21 +680,21 @@ si_mark_image_range_valid(const struct pipe_image_view *view)
 	assert(res && res->b.b.target == PIPE_BUFFER);
 
 	util_range_add(&res->valid_buffer_range,
 		       view->u.buf.offset,
 		       view->u.buf.offset + view->u.buf.size);
 }
 
 static void si_set_shader_image_desc(struct si_context *ctx,
 				     const struct pipe_image_view *view,
 				     bool skip_decompress,
-				     uint32_t *desc)
+				     uint32_t *desc, uint32_t *fmask_desc)
 {
 	struct si_screen *screen = ctx->screen;
 	struct r600_resource *res;
 
 	res = (struct r600_resource *)view->resource;
 
 	if (res->b.b.target == PIPE_BUFFER) {
 		if (view->access & PIPE_IMAGE_ACCESS_WRITE)
 			si_mark_image_range_valid(view);
 
@@ -708,27 +708,28 @@ static void si_set_shader_image_desc(struct si_context *ctx,
 		struct r600_texture *tex = (struct r600_texture *)res;
 		unsigned level = view->u.tex.level;
 		unsigned width, height, depth, hw_level;
 		bool uses_dcc = vi_dcc_enabled(tex, level);
 		unsigned access = view->access;
 
 		/* Clear the write flag when writes can't occur.
 		 * Note that DCC_DECOMPRESS for MSAA doesn't work in some cases,
 		 * so we don't wanna trigger it.
 		 */
-		if (tex->is_depth || tex->resource.b.b.nr_samples >= 2) {
+		if (tex->is_depth ||
+		    (!fmask_desc && tex->fmask.size != 0)) {
 			assert(!"Z/S and MSAA image stores are not supported");
 			access &= ~PIPE_IMAGE_ACCESS_WRITE;
 		}
 
 		assert(!tex->is_depth);
-		assert(tex->fmask.size == 0);
+		assert(fmask_desc || tex->fmask.size == 0);
 
 		if (uses_dcc && !skip_decompress &&
 		    (view->access & PIPE_IMAGE_ACCESS_WRITE ||
 		     !vi_dcc_formats_compatible(res->b.b.format, view->format))) {
 			/* If DCC can't be disabled, at least decompress it.
 			 * The decompression is relatively cheap if the surface
 			 * has been decompressed already.
 			 */
 			if (!si_texture_disable_dcc(&ctx->b, tex))
 				ctx->b.decompress_dcc(&ctx->b.b, tex);
@@ -755,21 +756,21 @@ static void si_set_shader_image_desc(struct si_context *ctx,
 			hw_level = 0;
 		}
 
 		si_make_texture_descriptor(screen, tex,
 					   false, res->b.b.target,
 					   view->format, swizzle,
 					   hw_level, hw_level,
 					   view->u.tex.first_layer,
 					   view->u.tex.last_layer,
 					   width, height, depth,
-					   desc, NULL);
+					   desc, fmask_desc);
 		si_set_mutable_tex_desc_fields(screen, tex,
 					       &tex->surface.u.legacy.level[level],
 					       level, level,
 					       util_format_get_blockwidth(view->format),
 					       false, desc);
 	}
 }
 
 static void si_set_shader_image(struct si_context *ctx,
 				unsigned shader,
@@ -785,21 +786,21 @@ static void si_set_shader_image(struct si_context *ctx,
 	if (!view || !view->resource) {
 		si_disable_shader_image(ctx, shader, slot);
 		return;
 	}
 
 	res = (struct r600_resource *)view->resource;
 
 	if (&images->views[slot] != view)
 		util_copy_image_view(&images->views[slot], view);
 
-	si_set_shader_image_desc(ctx, view, skip_decompress, desc);
+	si_set_shader_image_desc(ctx, view, skip_decompress, desc, NULL);
 
 	if (res->b.b.target == PIPE_BUFFER) {
 		images->needs_color_decompress_mask &= ~(1 << slot);
 		res->bind_history |= PIPE_BIND_SHADER_IMAGE;
 	} else {
 		struct r600_texture *tex = (struct r600_texture *)res;
 		unsigned level = view->u.tex.level;
 
 		if (color_needs_decompression(tex)) {
 			images->needs_color_decompress_mask |= 1 << slot;
@@ -863,20 +864,91 @@ si_images_update_needs_color_decompress_mask(struct si_images *images)
 
 			if (color_needs_decompression(rtex)) {
 				images->needs_color_decompress_mask |= 1 << i;
 			} else {
 				images->needs_color_decompress_mask &= ~(1 << i);
 			}
 		}
 	}
 }
 
+void si_update_ps_colorbuf0_slot(struct si_context *sctx)
+{
+	struct si_buffer_resources *buffers = &sctx->rw_buffers;
+	struct si_descriptors *descs = &sctx->descriptors[SI_DESCS_RW_BUFFERS];
+	unsigned slot = SI_PS_IMAGE_COLORBUF0;
+	struct pipe_surface *surf = NULL;
+
+	/* si_texture_disable_dcc can get us here again. */
+	if (sctx->blitter->running)
+		return;
+
+	/* See whether FBFETCH is used and color buffer 0 is set. */
+	if (sctx->ps_shader.cso &&
+	    sctx->ps_shader.cso->info.opcode_count[TGSI_OPCODE_FBFETCH] &&
+	    sctx->framebuffer.state.nr_cbufs &&
+	    sctx->framebuffer.state.cbufs[0])
+		surf = sctx->framebuffer.state.cbufs[0];
+
+	/* Return if FBFETCH transitions from disabled to disabled. */
+	if (!buffers->buffers[slot] && !surf)
+		return;
+
+	sctx->ps_uses_fbfetch = surf != NULL;
+	si_update_ps_iter_samples(sctx);
+
+	if (surf) {
+		struct r600_texture *tex = (struct r600_texture*)surf->texture;
+		struct pipe_image_view view;
+
+		assert(tex);
+		assert(!tex->is_depth);
+
+		/* Disable DCC, because the texture is used as both a sampler
+		 * and color buffer.
+		 */
+		si_texture_disable_dcc(&sctx->b, tex);
+
+		if (tex->resource.b.b.nr_samples <= 1 && tex->cmask_buffer) {
+			/* Disable CMASK. */
+			assert(tex->cmask_buffer != &tex->resource);
+			si_eliminate_fast_color_clear(&sctx->b, tex);
+			si_texture_discard_cmask(sctx->screen, tex);
+		}
+
+		view.resource = surf->texture;
+		view.format = surf->format;
+		view.access = PIPE_IMAGE_ACCESS_READ;
+		view.u.tex.first_layer = surf->u.tex.first_layer;
+		view.u.tex.last_layer = surf->u.tex.last_layer;
+		view.u.tex.level = surf->u.tex.level;
+
+		/* Set the descriptor. */
+		uint32_t *desc = descs->list + slot*4;
+		memset(desc, 0, 16 * 4);
+		si_set_shader_image_desc(sctx, &view, true, desc, desc + 8);
+
+		pipe_resource_reference(&buffers->buffers[slot], &tex->resource.b.b);
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  &tex->resource, RADEON_USAGE_READ,
+					  RADEON_PRIO_SHADER_RW_IMAGE);
+		buffers->enabled_mask |= 1u << slot;
+	} else {
+		/* Clear the descriptor. */
+		memset(descs->list + slot*4, 0, 8*4);
+		pipe_resource_reference(&buffers->buffers[slot], NULL);
+		buffers->enabled_mask &= ~(1u << slot);
+	}
+
+	sctx->descriptors_dirty |= 1u << SI_DESCS_RW_BUFFERS;
+}
+
 /* SAMPLER STATES */
 
 static void si_bind_sampler_states(struct pipe_context *ctx,
                                    enum pipe_shader_type shader,
                                    unsigned start, unsigned count, void **states)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_samplers *samplers = &sctx->samplers[shader];
 	struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, shader);
 	struct si_sampler_state **sstates = (struct si_sampler_state**)states;
@@ -1848,21 +1920,21 @@ static void si_update_bindless_image_descriptor(struct si_context *sctx,
 	unsigned desc_slot_offset = img_handle->desc_slot * 16;
 	struct pipe_image_view *view = &img_handle->view;
 	uint32_t desc_list[8];
 
 	if (view->resource->target == PIPE_BUFFER)
 		return;
 
 	memcpy(desc_list, desc->list + desc_slot_offset,
 	       sizeof(desc_list));
 	si_set_shader_image_desc(sctx, view, true,
-				 desc->list + desc_slot_offset);
+				 desc->list + desc_slot_offset, NULL);
 
 	if (memcmp(desc_list, desc->list + desc_slot_offset,
 		   sizeof(desc_list))) {
 		img_handle->desc_dirty = true;
 		sctx->bindless_descriptors_dirty = true;
 	}
 }
 
 static void si_update_all_resident_texture_descriptors(struct si_context *sctx)
 {
@@ -1914,20 +1986,21 @@ void si_update_all_texture_descriptors(struct si_context *sctx)
 				continue;
 
 			si_set_sampler_view(sctx, shader, i,
 					    samplers->views[i], true);
 		}
 
 		si_update_shader_needs_decompress_mask(sctx, shader);
 	}
 
 	si_update_all_resident_texture_descriptors(sctx);
+	si_update_ps_colorbuf0_slot(sctx);
 }
 
 /* SHADER USER DATA */
 
 static void si_mark_shader_pointers_dirty(struct si_context *sctx,
 					  unsigned shader)
 {
 	sctx->shader_pointers_dirty |=
 		u_bit_consecutive(SI_DESCS_FIRST_SHADER + shader * SI_NUM_SHADER_DESCS,
 				  SI_NUM_SHADER_DESCS);
@@ -2453,21 +2526,21 @@ static uint64_t si_create_image_handle(struct pipe_context *ctx,
 	if (!view || !view->resource)
 		return 0;
 
 	img_handle = CALLOC_STRUCT(si_image_handle);
 	if (!img_handle)
 		return 0;
 
 	memset(desc_list, 0, sizeof(desc_list));
 	si_init_descriptor_list(&desc_list[0], 8, 1, null_image_descriptor);
 
-	si_set_shader_image_desc(sctx, view, false, &desc_list[0]);
+	si_set_shader_image_desc(sctx, view, false, &desc_list[0], NULL);
 
 	img_handle->desc_slot = si_create_bindless_descriptor(sctx, desc_list,
 							      sizeof(desc_list));
 	if (!img_handle->desc_slot) {
 		FREE(img_handle);
 		return 0;
 	}
 
 	handle = img_handle->desc_slot;
 
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index b4ca5bea943..327d8a28c38 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -183,20 +183,21 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_MEMOBJ:
 	case PIPE_CAP_LOAD_CONSTBUF:
 	case PIPE_CAP_INT64:
 	case PIPE_CAP_INT64_DIVMOD:
 	case PIPE_CAP_TGSI_CLOCK:
 	case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
 	case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
 	case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
 	case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
 	case PIPE_CAP_TGSI_VOTE:
+	case PIPE_CAP_TGSI_FS_FBFETCH:
 		return 1;
 
 	case PIPE_CAP_TGSI_BALLOT:
 		return HAVE_LLVM >= 0x0500;
 
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 		return !SI_BIG_ENDIAN && sscreen->info.has_userptr;
 
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 		return (sscreen->info.drm_major == 2 &&
@@ -259,21 +260,20 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	/* Unsupported features. */
 	case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
 	case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_VERTEXID_NOBASE:
 	case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES:
 	case PIPE_CAP_MAX_WINDOW_RECTANGLES:
-	case PIPE_CAP_TGSI_FS_FBFETCH:
 	case PIPE_CAP_TGSI_MUL_ZERO_WINS:
 	case PIPE_CAP_UMA:
 	case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE:
 	case PIPE_CAP_POST_DEPTH_COVERAGE:
 	case PIPE_CAP_TILE_RASTER_ORDER:
 	case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
 	case PIPE_CAP_CONTEXT_PRIORITY_MASK:
 		return 0;
 
 	case PIPE_CAP_FENCE_SIGNAL:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index dbb04ed7e45..e3d45ef6c3b 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -525,20 +525,21 @@ struct si_context {
 	unsigned			num_vs_blit_sgprs;
 	uint32_t			vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD];
 
 	/* Vertex and index buffers. */
 	bool				vertex_buffers_dirty;
 	bool				vertex_buffer_pointer_dirty;
 	struct pipe_vertex_buffer	vertex_buffer[SI_NUM_VERTEX_BUFFERS];
 
 	/* MSAA config state. */
 	int				ps_iter_samples;
+	bool				ps_uses_fbfetch;
 	bool				smoothing_enabled;
 
 	/* DB render state. */
 	unsigned		ps_db_shader_control;
 	unsigned		dbcb_copy_sample;
 	bool			dbcb_depth_copy_enabled:1;
 	bool			dbcb_stencil_copy_enabled:1;
 	bool			db_flush_depth_inplace:1;
 	bool			db_flush_stencil_inplace:1;
 	bool			db_depth_clear:1;
@@ -924,11 +925,19 @@ si_htile_enabled(struct r600_texture *tex, unsigned level)
 	return tex->htile_offset && level == 0;
 }
 
 static inline bool
 vi_tc_compat_htile_enabled(struct r600_texture *tex, unsigned level)
 {
 	assert(!tex->tc_compatible_htile || tex->htile_offset);
 	return tex->tc_compatible_htile && level == 0;
 }
 
+static inline unsigned si_get_ps_iter_samples(struct si_context *sctx)
+{
+	if (sctx->ps_uses_fbfetch)
+		return sctx->framebuffer.nr_samples;
+
+	return sctx->ps_iter_samples;
+}
+
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 08b071e810b..ae98e102eae 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1919,21 +1919,21 @@ void si_llvm_load_input_fs(
 
 static void declare_input_fs(
 	struct si_shader_context *ctx,
 	unsigned input_index,
 	const struct tgsi_full_declaration *decl,
 	LLVMValueRef out[4])
 {
 	si_llvm_load_input_fs(ctx, input_index, out);
 }
 
-static LLVMValueRef get_sample_id(struct si_shader_context *ctx)
+LLVMValueRef si_get_sample_id(struct si_shader_context *ctx)
 {
 	return si_unpack_param(ctx, SI_PARAM_ANCILLARY, 8, 4);
 }
 
 static LLVMValueRef get_base_vertex(struct ac_shader_abi *abi)
 {
 	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 
 	/* For non-indexed draws, the base vertex set by the driver
 	 * (for direct draws) or the CP (for indirect draws) is the
@@ -2142,21 +2142,21 @@ void si_load_system_value(struct si_shader_context *ctx,
 		};
 		value = lp_build_gather_values(&ctx->gallivm, pos, 4);
 		break;
 	}
 
 	case TGSI_SEMANTIC_FACE:
 		value = ctx->abi.front_face;
 		break;
 
 	case TGSI_SEMANTIC_SAMPLEID:
-		value = get_sample_id(ctx);
+		value = si_get_sample_id(ctx);
 		break;
 
 	case TGSI_SEMANTIC_SAMPLEPOS: {
 		LLVMValueRef pos[4] = {
 			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
 			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
 			LLVMConstReal(ctx->f32, 0),
 			LLVMConstReal(ctx->f32, 0)
 		};
 		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index f58978989d4..f598b762e1d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -534,20 +534,23 @@ struct si_shader_key {
 	struct {
 		/* One byte for every input: SI_FIX_FETCH_* enums. */
 		uint8_t		vs_fix_fetch[SI_MAX_ATTRIBS];
 
 		union {
 			uint64_t	ff_tcs_inputs_to_copy; /* for fixed-func TCS */
 			/* When PS needs PrimID and GS is disabled. */
 			unsigned	vs_export_prim_id:1;
 			struct {
 				unsigned interpolate_at_sample_force_center:1;
+				unsigned fbfetch_msaa;
+				unsigned fbfetch_is_1D;
+				unsigned fbfetch_layered;
 			} ps;
 		} u;
 	} mono;
 
 	/* Optimization flags for asynchronous compilation only. */
 	struct {
 		/* For HW VS (it can be VS, TES, GS) */
 		uint64_t	kill_outputs; /* "get_unique_index" bits */
 		unsigned	clip_disable:1;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 1bd52722413..941c6fc736d 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -293,20 +293,21 @@ void si_llvm_emit_store(struct lp_build_tgsi_context *bld_base,
 #define NOOP_WAITCNT 0xf7f
 #define LGKM_CNT 0x07f
 #define VM_CNT 0xf70
 
 LLVMValueRef si_get_indirect_index(struct si_shader_context *ctx,
 				   const struct tgsi_ind_register *ind,
 				   unsigned addr_mul, int rel_index);
 LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
 					   const struct tgsi_ind_register *ind,
 					   int rel_index, unsigned num);
+LLVMValueRef si_get_sample_id(struct si_shader_context *ctx);
 
 void si_shader_context_init_alu(struct lp_build_tgsi_context *bld_base);
 void si_shader_context_init_mem(struct si_shader_context *ctx);
 
 LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx,
 				  LLVMValueRef list, LLVMValueRef index,
 				  enum ac_descriptor_type type);
 LLVMValueRef si_load_image_desc(struct si_shader_context *ctx,
 				LLVMValueRef list, LLVMValueRef index,
 				enum ac_descriptor_type desc_type, bool dcc_off);
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
index f5fa18fd38a..b90edddf2ef 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
@@ -1866,20 +1866,77 @@ static void si_llvm_emit_txqs(
 	samples = LLVMBuildLShr(ctx->ac.builder, samples,
 				LLVMConstInt(ctx->i32, 16, 0), "");
 	samples = LLVMBuildAnd(ctx->ac.builder, samples,
 			       LLVMConstInt(ctx->i32, 0xf, 0), "");
 	samples = LLVMBuildShl(ctx->ac.builder, ctx->i32_1,
 			       samples, "");
 
 	emit_data->output[emit_data->chan] = samples;
 }
 
+static void si_llvm_emit_fbfetch(const struct lp_build_tgsi_action *action,
+				 struct lp_build_tgsi_context *bld_base,
+				 struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct ac_image_args args = {};
+	LLVMValueRef ptr, image, fmask, addr_vec;
+
+	/* Ignore src0, because KHR_blend_equation_advanced disallows multiple
+	 * render targets.
+	 */
+
+	/* Load the image descriptor. */
+	STATIC_ASSERT(SI_PS_IMAGE_COLORBUF0 % 2 == 0);
+	ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
+	ptr = LLVMBuildPointerCast(ctx->ac.builder, ptr,
+				   ac_array_in_const32_addr_space(ctx->v8i32), "");
+	image = ac_build_load_to_sgpr(&ctx->ac, ptr,
+			LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0 / 2, 0));
+
+	LLVMValueRef addr[4];
+	unsigned chan = 0;
+
+	addr[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 0, 16);
+
+	if (!ctx->shader->key.mono.u.ps.fbfetch_is_1D)
+		addr[chan++] = si_unpack_param(ctx, SI_PARAM_POS_FIXED_PT, 16, 16);
+
+	/* Get the current render target layer index. */
+	if (ctx->shader->key.mono.u.ps.fbfetch_layered)
+		addr[chan++] = si_unpack_param(ctx, SI_PARAM_ANCILLARY, 16, 11);
+
+	if (ctx->shader->key.mono.u.ps.fbfetch_msaa)
+		addr[chan++] = si_get_sample_id(ctx);
+
+	while (chan < 4)
+		addr[chan++] = LLVMGetUndef(ctx->i32);
+
+	if (ctx->shader->key.mono.u.ps.fbfetch_msaa) {
+		fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
+			LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
+
+		ac_apply_fmask_to_sample(&ctx->ac, fmask, addr, false);
+	}
+
+	addr_vec = ac_build_gather_values(&ctx->ac, addr, ARRAY_SIZE(addr));
+
+	args.opcode = ac_image_load;
+	args.resource = image;
+	args.addr = addr_vec;
+	args.dmask = 0xf;
+	args.da = ctx->shader->key.mono.u.ps.fbfetch_layered;
+
+	emit_data->output[emit_data->chan] =
+		ac_build_image_opcode(&ctx->ac, &args);
+}
+
 static const struct lp_build_tgsi_action tex_action = {
 	.fetch_args = tex_fetch_args,
 	.emit = build_tex_intrinsic,
 };
 
 /**
  * Setup actions for TGSI memory opcode, including texture opcodes.
  */
 void si_shader_context_init_mem(struct si_shader_context *ctx)
 {
@@ -1898,20 +1955,22 @@ void si_shader_context_init_mem(struct si_shader_context *ctx)
 	bld_base->op_actions[TGSI_OPCODE_TXF_LZ] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXL] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXL2] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXP] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = txq_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
 	bld_base->op_actions[TGSI_OPCODE_TG4] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
 
+	bld_base->op_actions[TGSI_OPCODE_FBFETCH].emit = si_llvm_emit_fbfetch;
+
 	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
 	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
 	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
 
 	tmpl.fetch_args = atomic_fetch_args;
 	tmpl.emit = atomic_emit;
 	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index b92ec03a054..16ad1d4f823 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2891,20 +2891,21 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 		if (!surf->depth_initialized) {
 			si_init_depth_surface(sctx, surf);
 		}
 
 		if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level))
 			sctx->framebuffer.DB_has_shader_readable_metadata = true;
 
 		si_context_add_resource_size(ctx, surf->base.texture);
 	}
 
+	si_update_ps_colorbuf0_slot(sctx);
 	si_update_poly_offset_state(sctx);
 	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
 	si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
 
 	if (sctx->screen->dpbb_allowed)
 		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
 
 	if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
 		si_mark_atom_dirty(sctx, &sctx->msaa_config);
 
@@ -3353,40 +3354,41 @@ static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
 	if (setup_samples > 1) {
 		/* distance from the pixel center, indexed by log2(nr_samples) */
 		static unsigned max_dist[] = {
 			0, /* unused */
 			4, /* 2x MSAA */
 			6, /* 4x MSAA */
 			7, /* 8x MSAA */
 			8, /* 16x MSAA */
 		};
 		unsigned log_samples = util_logbase2(setup_samples);
+		unsigned ps_iter_samples = si_get_ps_iter_samples(sctx);
 		unsigned log_ps_iter_samples =
-			util_logbase2(util_next_power_of_two(sctx->ps_iter_samples));
+			util_logbase2(util_next_power_of_two(ps_iter_samples));
 
 		radeon_set_context_reg_seq(cs, R_028BDC_PA_SC_LINE_CNTL, 2);
 		radeon_emit(cs, sc_line_cntl |
 			    S_028BDC_EXPAND_LINE_WIDTH(1)); /* CM_R_028BDC_PA_SC_LINE_CNTL */
 		radeon_emit(cs, S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
 			    S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
 			    S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples)); /* CM_R_028BE0_PA_SC_AA_CONFIG */
 
 		if (sctx->framebuffer.nr_samples > 1) {
 			radeon_set_context_reg(cs, R_028804_DB_EQAA,
 					       S_028804_MAX_ANCHOR_SAMPLES(log_samples) |
 					       S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
 					       S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
 					       S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples) |
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
 			radeon_set_context_reg(cs, R_028A4C_PA_SC_MODE_CNTL_1,
-					       S_028A4C_PS_ITER_SAMPLE(sctx->ps_iter_samples > 1) |
+					       S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
 					       sc_mode_cntl_1);
 		} else if (sctx->smoothing_enabled) {
 			radeon_set_context_reg(cs, R_028804_DB_EQAA,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) |
 					       S_028804_OVERRASTERIZATION_AMOUNT(log_samples));
 			radeon_set_context_reg(cs, R_028A4C_PA_SC_MODE_CNTL_1,
 					       sc_mode_cntl_1);
 		}
 	} else {
@@ -3401,34 +3403,39 @@ static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
 				       sc_mode_cntl_1);
 	}
 
 	/* GFX9: Flush DFSM when the AA mode changes. */
 	if (sctx->screen->dfsm_allowed) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
 	}
 }
 
+void si_update_ps_iter_samples(struct si_context *sctx)
+{
+	if (sctx->framebuffer.nr_samples > 1)
+		si_mark_atom_dirty(sctx, &sctx->msaa_config);
+	if (sctx->screen->dpbb_allowed)
+		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+}
+
 static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
 	if (sctx->ps_iter_samples == min_samples)
 		return;
 
 	sctx->ps_iter_samples = min_samples;
 	sctx->do_update_shaders = true;
 
-	if (sctx->framebuffer.nr_samples > 1)
-		si_mark_atom_dirty(sctx, &sctx->msaa_config);
-	if (sctx->screen->dpbb_allowed)
-		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+	si_update_ps_iter_samples(sctx);
 }
 
 /*
  * Samplers
  */
 
 /**
  * Build the sampler view descriptor for a buffer texture.
  * @param state 256-bit descriptor; only the high 128 bits are filled in
  */
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 02659a7a4f3..37887853388 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -210,20 +210,26 @@ enum {
 	SI_VS_STREAMOUT_BUF1,
 	SI_VS_STREAMOUT_BUF2,
 	SI_VS_STREAMOUT_BUF3,
 
 	SI_HS_CONST_DEFAULT_TESS_LEVELS,
 	SI_VS_CONST_INSTANCE_DIVISORS,
 	SI_VS_CONST_CLIP_PLANES,
 	SI_PS_CONST_POLY_STIPPLE,
 	SI_PS_CONST_SAMPLE_POSITIONS,
 
+	/* Image descriptor of color buffer 0 for KHR_blend_equation_advanced. */
+	SI_PS_IMAGE_COLORBUF0,
+	SI_PS_IMAGE_COLORBUF0_HI,
+	SI_PS_IMAGE_COLORBUF0_FMASK,
+	SI_PS_IMAGE_COLORBUF0_FMASK_HI,
+
 	SI_NUM_RW_BUFFERS,
 };
 
 /* Indices into sctx->descriptors, laid out so that gfx and compute pipelines
  * are contiguous:
  *
  *  0 - rw buffers
  *  1 - vertex const and shader buffers
  *  2 - vertex samplers and images
  *  3 - fragment const and shader buffer
@@ -317,20 +323,21 @@ struct si_buffer_resources {
 				  si_pm4_block_idx(member)); \
 	} while(0)
 
 /* si_descriptors.c */
 void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
 				    struct r600_texture *tex,
 				    const struct legacy_surf_level *base_level_info,
 				    unsigned base_level, unsigned first_level,
 				    unsigned block_width, bool is_stencil,
 				    uint32_t *state);
+void si_update_ps_colorbuf0_slot(struct si_context *sctx);
 void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader,
 				 uint slot, struct pipe_constant_buffer *cbuf);
 void si_get_shader_buffers(struct si_context *sctx,
 			   enum pipe_shader_type shader,
 			   uint start_slot, uint count,
 			   struct pipe_shader_buffer *sbuf);
 void si_set_ring_buffer(struct pipe_context *ctx, uint slot,
 			struct pipe_resource *buffer,
 			unsigned stride, unsigned num_records,
 			bool add_tid, bool swizzle,
@@ -388,20 +395,21 @@ si_make_texture_descriptor(struct si_screen *screen,
 			   unsigned width, unsigned height, unsigned depth,
 			   uint32_t *state,
 			   uint32_t *fmask_state);
 struct pipe_sampler_view *
 si_create_sampler_view_custom(struct pipe_context *ctx,
 			      struct pipe_resource *texture,
 			      const struct pipe_sampler_view *state,
 			      unsigned width0, unsigned height0,
 			      unsigned force_level);
 void si_update_fb_dirtiness_after_rendering(struct si_context *sctx);
+void si_update_ps_iter_samples(struct si_context *sctx);
 
 /* si_state_binning.c */
 void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state);
 
 /* si_state_shaders.c */
 bool si_update_shaders(struct si_context *sctx);
 void si_init_shader_functions(struct si_context *sctx);
 bool si_init_shader_cache(struct si_screen *sscreen);
 void si_destroy_shader_cache(struct si_screen *sscreen);
 void si_get_active_slot_masks(const struct tgsi_shader_info *info,
diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c
index 686701d718f..7138f7a03aa 100644
--- a/src/gallium/drivers/radeonsi/si_state_binning.c
+++ b/src/gallium/drivers/radeonsi/si_state_binning.c
@@ -75,21 +75,21 @@ static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 		if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
 			continue;
 
 		struct r600_texture *rtex =
 			(struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
 		sum += rtex->surface.bpe;
 	}
 
 	/* Multiply the sum by some function of the number of samples. */
 	if (nr_samples >= 2) {
-		if (sctx->ps_iter_samples >= 2)
+		if (si_get_ps_iter_samples(sctx) >= 2)
 			sum *= nr_samples;
 		else
 			sum *= 2;
 	}
 
 	static const si_bin_size_subtable table[] = {
 		{
 			/* One RB / SE */
 			{
 				/* One shader engine */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 8fe4c04ae79..d7742eafb04 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1453,20 +1453,37 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 					sel->info.uses_linear_center +
 					sel->info.uses_linear_centroid +
 					sel->info.uses_linear_sample > 1;
 
 				if (sel->info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE])
 					key->mono.u.ps.interpolate_at_sample_force_center = 1;
 			}
 		}
 
 		key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
+
+		/* ps_uses_fbfetch is true only if color buffer 0 is bound. */
+		if (sctx->ps_uses_fbfetch) {
+			struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
+			struct pipe_resource *tex = cb0->texture;
+
+			/* 1D textures are allocated and used as 2D on GFX9. */
+			key->mono.u.ps.fbfetch_msaa = sctx->framebuffer.nr_samples > 1;
+			key->mono.u.ps.fbfetch_is_1D = sctx->b.chip_class != GFX9 &&
+						       (tex->target == PIPE_TEXTURE_1D ||
+							tex->target == PIPE_TEXTURE_1D_ARRAY);
+			key->mono.u.ps.fbfetch_layered = tex->target == PIPE_TEXTURE_1D_ARRAY ||
+							 tex->target == PIPE_TEXTURE_2D_ARRAY ||
+							 tex->target == PIPE_TEXTURE_CUBE ||
+							 tex->target == PIPE_TEXTURE_CUBE_ARRAY ||
+							 tex->target == PIPE_TEXTURE_3D;
+		}
 		break;
 	}
 	default:
 		assert(0);
 	}
 
 	if (unlikely(sctx->screen->debug_flags & DBG(NO_OPT_VARIANT)))
 		memset(&key->opt, 0, sizeof(key->opt));
 }
 
@@ -2419,20 +2436,21 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 			si_mark_atom_dirty(sctx, &sctx->cb_render_state);
 
 		if (sctx->screen->has_out_of_order_rast &&
 		    (!old_sel ||
 		     old_sel->info.writes_memory != sel->info.writes_memory ||
 		     old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] !=
 		     sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]))
 			si_mark_atom_dirty(sctx, &sctx->msaa_config);
 	}
 	si_set_active_descriptors_for_shader(sctx, sel);
+	si_update_ps_colorbuf0_slot(sctx);
 }
 
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
 	if (shader->is_optimized) {
 		util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
 				    &shader->ready);
 	}
 
 	util_queue_fence_destroy(&shader->ready);
-- 
2.15.1



More information about the mesa-dev mailing list