[Mesa-dev] [PATCH 1/2] i965/blorp: Add bilinear filtering of samples for multisample scaled blits
Paul Berry
stereotype441 at gmail.com
Wed Jun 5 11:53:25 PDT 2013
On 4 June 2013 18:20, Anuj Phogat <anuj.phogat at gmail.com> wrote:
> Current implementation of ext_framebuffer_multisample_blit_scaled in
> i965/blorp uses nearest filtering for multisample scaled blits. Using
> nearest filtering produces blocky artifacts and negates the benefits
> of MSAA. That is the reason why extension was not enabled on i965.
>
> This patch implements the bilinear filtering of samples in blorp
> engine. Images generated with this patch are free from blocky
> artifacts and show big improvement in visual quality.
>
This is definitely a quality improvement over our previous implementation.
I still have some concerns about whether we want this to be our final
answer, though. In particular, it seems to me that to avoid blocky
artifacts, sampling at coordinates (x, y) should produce nearly the same
result as sampling at (x+dx, y+dy), where |dx| and |dy| are much less than
1. Normal bilinear filtering has this property. But the algorithm in this
patch doesn't, since for example sampling at (0.99999, 1.5) of a 4x MSAA
surface gets you a blend of samples 1 and 3 from (0, 1), and sampling at
(1.00000, 1.5) gets you a blend of samples 0 and 2 from (1, 1).
I'm hoping you, Ian, and I can find a time to discuss it in person
tomorrow. It seems like this sort of algorithm design really benefits from
a whiteboard.
The comments that follow assume we decide to stick with the algorithm as is.
>
> Observed no piglit and gles3 regressions.
> ---
> src/mesa/drivers/dri/i965/brw_blorp.h | 3 +
> src/mesa/drivers/dri/i965/brw_blorp_blit.cpp | 157
> ++++++++++++++++++++++++++-
> 2 files changed, 155 insertions(+), 5 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h
> b/src/mesa/drivers/dri/i965/brw_blorp.h
> index 51b23db..a40324b 100644
> --- a/src/mesa/drivers/dri/i965/brw_blorp.h
> +++ b/src/mesa/drivers/dri/i965/brw_blorp.h
> @@ -305,6 +305,9 @@ struct brw_blorp_blit_prog_key
> * than one sample per pixel.
> */
> bool persample_msaa_dispatch;
> +
> + /* True for scaled blitting. */
> + bool blit_scaled;
> };
>
> class brw_blorp_blit_params : public brw_blorp_params
> diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
> b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
> index a6b2bbf..46aa6d4 100644
> --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
> @@ -619,7 +619,8 @@ private:
> void kill_if_outside_dst_rect();
> void translate_dst_to_src();
> void single_to_blend();
> - void manual_blend(unsigned num_samples);
> + void manual_blend_average(unsigned num_samples);
> + void manual_blend_linear(unsigned num_samples);
> void sample(struct brw_reg dst);
> void texel_fetch(struct brw_reg dst);
> void mcs_fetch();
> @@ -630,7 +631,7 @@ private:
> /**
> * Base-2 logarithm of the maximum number of samples that can be
> blended.
> */
> - static const unsigned LOG2_MAX_BLEND_SAMPLES = 3;
> + static const unsigned LOG2_MAX_BLEND_SAMPLES = 7;
>
This is misleading now, since it looks like it's saying that the maximum
number of samples that can be blended is 2^7 == 128. What's really
happening is that you're adding an algorithm that requires n temporary
storage locations (where n is the number of samples), whereas the old
algorithm required only log2(n)+1 temporary storage locations. Let's just
rip this constant out and declare texture_data to have size 8.
>
> void *mem_ctx;
> struct brw_context *brw;
> @@ -673,6 +674,9 @@ private:
> */
> struct brw_reg y_coords[2];
>
> + struct brw_reg x_coord_fract;
> + struct brw_reg y_coord_fract;
> +
> /* Which element of x_coords and y_coords is currently in use.
> */
> int xy_coord_index;
> @@ -811,15 +815,17 @@ brw_blorp_blit_program::compile(struct brw_context
> *brw,
> * that we want to texture from. Exception: if we are blending, then
> S is
> * irrelevant, because we are going to fetch all samples.
> */
> - if (key->blend) {
> + if (key->blend && !key->blit_scaled) {
> if (brw->intel.gen == 6) {
> /* Gen6 hardware an automatically blend using the SAMPLE message
> */
> single_to_blend();
> sample(texture_data[0]);
> } else {
> /* Gen7+ hardware doesn't automaticaly blend. */
> - manual_blend(key->src_samples);
> + manual_blend_average(key->src_samples);
> }
> + } else if(key->blend && key->blit_scaled) {
> + manual_blend_linear(key->src_samples);
> } else {
> /* We aren't blending, which means we just want to fetch a single
> sample
> * from the source surface. The address that we want to fetch from
> is
> @@ -910,6 +916,13 @@ brw_blorp_blit_program::alloc_regs()
> = retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD);
> reg += 2;
> }
> + this->x_coord_fract
> + = vec8(retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_F));
> + reg += 2;
> + this->y_coord_fract
> + = vec8(retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_F));
> + reg += 2;
> +
> this->xy_coord_index = 0;
> this->sample_index
> = retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD);
> @@ -1365,6 +1378,10 @@ brw_blorp_blit_program::translate_dst_to_src()
> brw_MUL(&func, Y_f, Yp_f, y_transform.multiplier);
> brw_ADD(&func, X_f, X_f, x_transform.offset);
> brw_ADD(&func, Y_f, Y_f, y_transform.offset);
> + if (key->blit_scaled && key->blend) {
> + brw_FRC(&func, x_coord_fract, X_f);
> + brw_FRC(&func, y_coord_fract, Y_f);
> + }
> /* Round the float coordinates down to nearest integer by moving to
> * UD registers.
> */
> @@ -1415,7 +1432,7 @@ inline int count_trailing_one_bits(unsigned value)
>
>
> void
> -brw_blorp_blit_program::manual_blend(unsigned num_samples)
> +brw_blorp_blit_program::manual_blend_average(unsigned num_samples)
> {
> if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
> mcs_fetch();
> @@ -1520,6 +1537,131 @@ brw_blorp_blit_program::manual_blend(unsigned
> num_samples)
> brw_ENDIF(&func);
> }
>
> +void
> +brw_blorp_blit_program::manual_blend_linear(unsigned num_samples)
> +{
>
+ if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
> + mcs_fetch();
> +
> + /* e.g. for 4x MSAA:
> + * result = lrp(lrp(sample[0] + sample[1]) + lrp(sample[2] +
> sample[3]))
> + *
> + * We do this computation by performing the following operations:
> + *
> + * In case of 4x MSAA:
> + * - Compute samples 0 to 3
> + * - linearly interpolate samples 0 and 1 in X
> + * - linearly interpolate samples 2 and 3 in X
> + * - linearly interpolate the results of last two operations in Y
>
+ *
> + * In case of 8x MSAA, we take the average of two samples in each
> quarter
> + * of the pixel to generate 4 sample values. These 4 sample values are
> used
> + * in a way similar to 4x MSAA:
> + * - Compute samples 0 to 7
> + * - Compute avearge of samples 0 and 7 and leave the result in sample
> 0
> + * - Compute average of samples 1 and 3 and leave the result in sample
> 1
> + * - Compute average of samples 2 and 6 and leave the result in sample
> 2
> + * - Compute average of samples 4 and 5 and leave the result in sample
> 3
> + * - linearly interpolate samples 0 and 1 in X
> + * - linearly interpolate samples 2 and 3 in X
> + * - linearly interpolate the results of last two operations in Y
> + *
> + */
> + for (unsigned i = 0; i < num_samples; ++i) {
> + assert(i < ARRAY_SIZE(texture_data));
> + if (i == 0) {
> + s_is_zero = true;
> + } else {
> + s_is_zero = false;
> + brw_MOV(&func, vec16(S), brw_imm_ud(i));
> + }
> + texel_fetch(texture_data[i]);
> +
> + if (i == 0 && key->tex_layout == INTEL_MSAA_LAYOUT_CMS) {
> + /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control
> Surface)
> + * suggests an optimization:
> + *
> + * "A simple optimization with probable large return in
> + * performance is to compare the MCS value to zero
> (indicating
> + * all samples are on sample slice 0), and sample only from
> + * sample slice 0 using ld2dss if MCS is zero."
> + *
> + * Note that in the case where the MCS value is zero, sampling
> from
> + * sample slice 0 using ld2dss and sampling from sample 0 using
> + * ld2dms are equivalent (since all samples are on sample slice
> 0).
> + * Since we have already sampled from sample 0, all we need to
> do is
> + * skip the remaining fetches and averaging if MCS is zero.
> + */
> + brw_CMP(&func, vec16(brw_null_reg()), BRW_CONDITIONAL_NZ,
> + mcs_data, brw_imm_ud(0));
> + brw_IF(&func, BRW_EXECUTE_16);
> + }
> + }
> +
> +#define SAMPLE(x, y) offset(texture_data[x], y)
> +#define COMBINE(brw_op, dst, src0, src1)\
> + for (int k = 0; k < 4; ++k)\
> + brw_op(&func, dst, src0, src1);
> +
> + if (num_samples == 8) {
> + COMBINE(brw_ADD, SAMPLE(0, 2*k),
> + vec8(SAMPLE(0, 2*k)),
> + vec8(SAMPLE(7, 2*k)));
> + COMBINE(brw_MUL, SAMPLE(0, 2*k),
> + vec8(SAMPLE(0, 2*k)),
> + brw_imm_f(0.5));
> + COMBINE(brw_ADD, SAMPLE(1, 2*k),
> + vec8(SAMPLE(1, 2*k)),
> + vec8(SAMPLE(3, 2*k)));
> + COMBINE(brw_MUL, SAMPLE(1, 2*k),
> + vec8(SAMPLE(1, 2*k)),
> + brw_imm_f(0.5));
> + COMBINE(brw_ADD, SAMPLE(2, 2*k),
> + vec8(SAMPLE(2, 2*k)),
> + vec8(SAMPLE(6, 2*k)));
> + COMBINE(brw_MUL, SAMPLE(2, 2*k),
> + vec8(SAMPLE(2, 2*k)),
> + brw_imm_f(0.5));
> + COMBINE(brw_ADD, SAMPLE(3, 2*k),
> + vec8(SAMPLE(4, 2*k)),
> + vec8(SAMPLE(5, 2*k)));
> + COMBINE(brw_MUL, SAMPLE(3, 2*k),
> + vec8(SAMPLE(3, 2*k)),
> + brw_imm_f(0.5));
>
It would be faster to drop the MUL(..., 0.5) parts and just multiply by 0.5
at the end.
>
> +
> + /* Shuffle sample registers stored in texture_data array to match
> with
> + * the layout of samples in 4x MSAA.
> + */
> + t1 = texture_data[0];
> + t2 = texture_data[1];
> + texture_data[0] = texture_data[3];
> + texture_data[1] = texture_data[2];
> + texture_data[2] = t1;
> + texture_data[3] = t2;
>
I see what you're doing, and I see why it will work, but it makes me feel
really jittery to see the texture_data array get modified like this--the
array is supposed to get initialized by alloc_regs() and stay constant
thereafter. I believe we can get the equivalent effect by carefully
rewriting the code above to this:
COMBINE(brw_ADD, SAMPLE(3, 2*k),
vec8(SAMPLE(1, 2*k)),
vec8(SAMPLE(3, 2*k)));
COMBINE(brw_ADD, SAMPLE(1, 2*k),
vec8(SAMPLE(2, 2*k)),
vec8(SAMPLE(6, 2*k)));
COMBINE(brw_ADD, SAMPLE(2, 2*k),
vec8(SAMPLE(0, 2*k)),
vec8(SAMPLE(7, 2*k)));
COMBINE(brw_ADD, SAMPLE(0, 2*k),
vec8(SAMPLE(4, 2*k)),
vec8(SAMPLE(5, 2*k)));
> + }
> +
> + brw_set_access_mode(&func, BRW_ALIGN_16);
> + for (int index = 3; index > 0; ) {
> + for (int k = 0; k < 8; ++k)
> + brw_LRP(&func,
> + vec8(SAMPLE(index - 1, k)),
> + offset(x_coord_fract, k & 1),
> + SAMPLE(index, k),
> + SAMPLE(index - 1, k));
> + index -= 2;
> + }
> + for (int k = 0; k < 8; ++k)
> + brw_LRP(&func,
> + vec8(SAMPLE(0, k)),
> + offset(y_coord_fract, k & 1),
> + vec8(SAMPLE(2, k)),
> + vec8(SAMPLE(0, k)));
> + brw_set_access_mode(&func, BRW_ALIGN_1);
> +#undef SAMPLE
> + if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
> + brw_ENDIF(&func);
> +}
> +
> /**
> * Emit code to look up a value in the texture using the SAMPLE message
> (which
> * does blending of MSAA surfaces).
> @@ -1869,6 +2011,11 @@ brw_blorp_blit_params::brw_blorp_blit_params(struct
> brw_context *brw,
> wm_prog_key.persample_msaa_dispatch = true;
> }
>
> + /* Scaled blitting or not. */
> + wm_prog_key.blit_scaled =
> + ((dst_x1 - dst_x0) == (src_x1 - src_x0) &&
> + (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true;
> +
> /* The render path must be configured to use the same number of
> samples as
> * the destination buffer.
> */
> --
> 1.8.1.4
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/mesa-dev/attachments/20130605/94f5f8c7/attachment-0001.html>
More information about the mesa-dev
mailing list