[Mesa-dev] [PATCH 3/3] i965/fs: Estimate maximum sampler message execution size more accurately.
Kenneth Graunke
kenneth at whitecape.org
Sun Aug 14 13:02:26 UTC 2016
On Friday, August 12, 2016 10:06:29 PM PDT Francisco Jerez wrote:
> The current logic used to determine the execution size of sampler
> messages was based on special-casing several argument and opcode
> combinations, which unsurprisingly missed the possibility that some
> messages could exceed the payload size limit or not depending on the
> number of coordinate components present. In particular:
>
> - The TXL, TXB and TEX messages (the latter on non-FS stages only)
> would attempt to use SIMD16 on Gen7+ hardware even if a shadow
> reference was present and the texture was a cubemap array, causing
> it to overflow the maximum supported sampler payload size and
> crash.
>
> - The TG4_OFFSET message with shadow comparison was falling back to
> SIMD8 regardless of the number of coordinate components, which is
> unnecessary when two coordinates or less are present.
>
> Both cases have been handled incorrectly ever since cubemap arrays and
> texture gather were respectively enabled (the current logic used by
> the SIMD lowering pass is almost unchanged from the previous no16
> fall-back logic used pre-SIMD lowering times).
>
> Fixes the following GL4.5 conformance test on Gen7-8 (the bug also
> affects Gen9+ in principle, but SKL passes the test by luck because it
> manages to use the TXL_LZ message instead of TXL):
>
> GL45-CTS.texture_cube_map_array.sampling
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=97267
> ---
> src/mesa/drivers/dri/i965/brw_fs.cpp | 108 ++++++++++++++++++++++++-----------
> 1 file changed, 74 insertions(+), 34 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 1842d56..b816d24 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -4687,6 +4687,68 @@ get_fpu_lowered_simd_width(const struct brw_device_info *devinfo,
> }
>
> /**
> + * Get the maximum allowed SIMD width for instruction \p inst accounting for
> + * various payload size restrictions that apply to sampler message
> + * instructions.
> + *
> + * This is only intended to provide a maximum theoretical bound for the
> + * execution size of the message based on the number of argument components
> + * alone, which in most cases will determine whether the SIMD8 or SIMD16
> + * variant of the message can be used, though some messages may have
> + * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
> + * the message length to determine the exact SIMD width and argument count,
> + * which makes a number of sampler message combinations impossible to
> + * represent).
> + */
> +static unsigned
> +get_sampler_lowered_simd_width(const struct brw_device_info *devinfo,
> + const fs_inst *inst)
> +{
> + /* Calculate the number of coordinate components that have to be present
> + * assuming that additional arguments follow the texel coordinates in the
> + * message payload. On Gen7+ there is no need for padding, on Gen5-6 we
> + * need to pad to four or three components depending on the message, on
> + * Gen4 we need to pad to at most three components.
> + */
> + const unsigned req_coord_components =
> + (devinfo->gen >= 7 ||
> + !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
> + (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
> + inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
> + 3;
> +
> + /* On Gen9+ the LOD argument is for free if we're able to use the LZ
> + * variant of the TXL or TXF message.
> + */
> + const bool implicit_lod = devinfo->gen >= 9 &&
> + (inst->opcode == SHADER_OPCODE_TXL ||
> + inst->opcode == SHADER_OPCODE_TXF) &&
> + inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
> +
> + /* Calculate the total number of argument components that need to be passed
> + * to the sampler unit.
> + */
> + const unsigned num_payload_components =
> + MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
> + req_coord_components) +
> + inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
> + (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
> + inst->components_read(TEX_LOGICAL_SRC_LOD2) +
> + inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
> + (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
> + inst->components_read(TEX_LOGICAL_SRC_OFFSET_VALUE) : 0) +
> + inst->components_read(TEX_LOGICAL_SRC_MCS);
> +
> + /*
Bonus line here ^ can be dropped.
> + * SIMD16 messages with more than five arguments exceed the maximum message
> + * size supported by the sampler, regardless of whether a header is
> + * provided or not.
> + */
> + return MIN2(inst->exec_size,
> + num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
Why not do:
const bool simd16_exceeds_max_mlen =
2 * num_payload_components > MAX_SAMPLER_MESSAGE_SIZE;
return MIN2(inst->exec_size, simd16_exceeds_max_mlen ? 8 : 16);
I find the left-hand multiply a little more obvious than the right-hand
divide, because logically each component is occupying two registers in
SIMD16 mode, and there's a max message length of 11.
> +}
> +
> +/**
> * Get the closest native SIMD width supported by the hardware for instruction
> * \p inst. The instruction will be left untouched by
> * fs_visitor::lower_simd_width() if the returned value is equal to the
> @@ -4861,31 +4923,25 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
> case SHADER_OPCODE_LOD_LOGICAL:
> case SHADER_OPCODE_TG4_LOGICAL:
> case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
> - return MIN2(16, inst->exec_size);
> + case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
> + case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
> + return get_sampler_lowered_simd_width(devinfo, inst);
>
> case SHADER_OPCODE_TXD_LOGICAL:
> /* TXD is unsupported in SIMD16 mode. */
> return 8;
>
> - case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
> - /* gather4_po_c is unsupported in SIMD16 mode. */
> - const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
> - return (shadow_c.file != BAD_FILE ? 8 : MIN2(16, inst->exec_size));
> - }
> case SHADER_OPCODE_TXL_LOGICAL:
> - case FS_OPCODE_TXB_LOGICAL: {
> - /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
> - * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
> - * mode because the message exceeds the maximum length of 11.
> + case FS_OPCODE_TXB_LOGICAL:
> + /* Only one execution size is representable pre-ILK depending on whether
> + * the shadow reference argument is present.
> */
> - const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
> - if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
> - return 16;
> - else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
> - return 8;
> + if (devinfo->gen == 4)
> + return (inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ?
> + 16 : 8);
If you drop the unnecessary parenthesis, this will fit on one line.
Otherwise, our coding style would be to use braces.
> else
> - return MIN2(16, inst->exec_size);
> - }
> + return get_sampler_lowered_simd_width(devinfo, inst);
> +
> case SHADER_OPCODE_TXF_LOGICAL:
> case SHADER_OPCODE_TXS_LOGICAL:
> /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
> @@ -4894,23 +4950,7 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
> if (devinfo->gen == 4)
> return 16;
> else
> - return MIN2(16, inst->exec_size);
> -
> - case SHADER_OPCODE_TXF_CMS_W_LOGICAL: {
> - /* This opcode can take up to 6 arguments which means that in some
> - * circumstances it can end up with a message that is too long in SIMD16
> - * mode.
> - */
> - const unsigned coord_components =
> - inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
> - /* First three arguments are the sample index and the two arguments for
> - * the MCS data.
> - */
> - if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE)
> - return 8;
> - else
> - return MIN2(16, inst->exec_size);
> - }
> + return get_sampler_lowered_simd_width(devinfo, inst);
>
> case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
> case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
>
Thanks for taking care of this!
Series is:
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 819 bytes
Desc: This is a digitally signed message part.
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20160814/91fcb235/attachment.sig>
More information about the mesa-dev
mailing list