[Intel-gfx] [Mesa-dev] [PATCH] intel: Fix stencil buffer to be W tiled

Chad Versace chad at chad-versace.us
Mon Jul 18 22:54:46 CEST 2011


On 07/18/2011 11:49 AM, Ian Romanick wrote:
> On 07/18/2011 12:55 AM, Chad Versace wrote:
>> Until now, the stencil buffer was allocated as a Y tiled buffer, because
>> in several locations the PRM states that it is. However, it is actually
>> W tiled. From the PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section
>> 4.5.2.1 W-Major Format:
>>     W-Major Tile Format is used for separate stencil.
> 
>> The GTT is incapable of W fencing, so we allocate the stencil buffer with
>> I915_TILING_NONE and decode the tile's layout in software.
> 
>> This fix touches the following portions of code:
>>     - In intel_allocate_renderbuffer_storage(), allocate the stencil
>>       buffer with I915_TILING_NONE.
>>     - In intel_verify_dri2_has_hiz(), verify that the stencil buffer is
>>       not tiled.
>>     - In the stencil buffer's span functions, the tile's layout must be
>>       decoded in software.
> 
>> This commit mutually depends on the xf86-video-intel commit
>>     dri: Do not tile stencil buffer
>>     Author: Chad Versace <chad at chad-versace.us>
>>     Date:   Mon Jul 18 00:38:00 2011 -0700
> 
>> On Gen6 with separate stencil enabled, fixes the following Piglit tests:
>>     bugs/fdo23670-drawpix_stencil
>>     general/stencil-drawpixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX16-copypixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX16-drawpixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX16-readpixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX1-copypixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX1-drawpixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX1-readpixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX4-copypixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX4-drawpixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX4-readpixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX8-copypixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX8-drawpixels
>>     spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX8-readpixels
>>     spec/EXT_packed_depth_stencil/fbo-stencil-GL_DEPTH24_STENCIL8-copypixels
>>     spec/EXT_packed_depth_stencil/fbo-stencil-GL_DEPTH24_STENCIL8-readpixels
>>     spec/EXT_packed_depth_stencil/readpixels-24_8
> 
>> Note: This is a candidate for the 7.11 branch.
> 
>> CC: Eric Anholt <eric at anholt.net>
>> CC: Kenneth Graunke <kenneth at whitecape.org>
>> Signed-off-by: Chad Versace <chad at chad-versace.us>
>> ---
>>  src/mesa/drivers/dri/intel/intel_clear.c   |    6 ++
>>  src/mesa/drivers/dri/intel/intel_context.c |    9 ++-
>>  src/mesa/drivers/dri/intel/intel_fbo.c     |   13 +++--
>>  src/mesa/drivers/dri/intel/intel_screen.h  |    9 ++-
>>  src/mesa/drivers/dri/intel/intel_span.c    |   85 ++++++++++++++++++++--------
>>  5 files changed, 89 insertions(+), 33 deletions(-)
> 
>> diff --git a/src/mesa/drivers/dri/intel/intel_clear.c b/src/mesa/drivers/dri/intel/intel_clear.c
>> index dfca03c..5ab9873 100644
>> --- a/src/mesa/drivers/dri/intel/intel_clear.c
>> +++ b/src/mesa/drivers/dri/intel/intel_clear.c
>> @@ -143,6 +143,12 @@ intelClear(struct gl_context *ctx, GLbitfield mask)
>>  	     */
>>              tri_mask |= BUFFER_BIT_STENCIL;
>>           }
>> +	 else if (intel->has_separate_stencil &&
>> +	       stencilRegion->tiling == I915_TILING_NONE) {
>> +	    /* The stencil buffer is actually W tiled, which the hardware
>> +	     * cannot blit to. */
>> +	    tri_mask |= BUFFER_BIT_STENCIL;
>> +	 }
>>           else {
>>              /* clearing all stencil bits, use blitting */
>>              blit_mask |= BUFFER_BIT_STENCIL;
>> diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
>> index 2ba1363..fe8be08 100644
>> --- a/src/mesa/drivers/dri/intel/intel_context.c
>> +++ b/src/mesa/drivers/dri/intel/intel_context.c
>> @@ -1439,7 +1439,12 @@ intel_verify_dri2_has_hiz(struct intel_context *intel,
>>        assert(stencil_rb->Base.Format == MESA_FORMAT_S8);
>>        assert(depth_rb && depth_rb->Base.Format == MESA_FORMAT_X8_Z24);
> 
>> -      if (stencil_rb->region->tiling == I915_TILING_Y) {
>> +      if (stencil_rb->region->tiling == I915_TILING_NONE) {
>> +	 /*
>> +	  * The stencil buffer is actually W tiled. The region's tiling is
>> +	  * I915_TILING_NONE, however, because the GTT is incapable of W
>> +	  * fencing.
>> +	  */
>>  	 intel->intelScreen->dri2_has_hiz = INTEL_DRI2_HAS_HIZ_TRUE;
>>  	 return;
>>        } else {
>> @@ -1527,7 +1532,7 @@ intel_verify_dri2_has_hiz(struct intel_context *intel,
>>         * Presently, however, no verification or clean up is necessary, and
>>         * execution should not reach here. If the framebuffer still has a hiz
>>         * region, then we have already set dri2_has_hiz to true after
>> -       * confirming above that the stencil buffer is Y tiled.
>> +       * confirming above that the stencil buffer is W tiled.
>>         */
>>        assert(0);
>>     }
>> diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
>> index 1669af2..507cc33 100644
>> --- a/src/mesa/drivers/dri/intel/intel_fbo.c
>> +++ b/src/mesa/drivers/dri/intel/intel_fbo.c
>> @@ -173,6 +173,9 @@ intel_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffer
> 
>>     if (irb->Base.Format == MESA_FORMAT_S8) {
>>        /*
>> +       * The stencil buffer is W tiled. However, we request from the kernel a
>> +       * non-tiled buffer because the GTT is incapable of W fencing.
>> +       *
>>         * The stencil buffer has quirky pitch requirements.  From Vol 2a,
>>         * 11.5.6.2.1 3DSTATE_STENCIL_BUFFER, field "Surface Pitch":
>>         *    The pitch must be set to 2x the value computed based on width, as
>> @@ -180,14 +183,14 @@ intel_alloc_renderbuffer_storage(struct gl_context * ctx, struct gl_renderbuffer
>>         * To accomplish this, we resort to the nasty hack of doubling the drm
>>         * region's cpp and halving its height.
>>         *
>> -       * If we neglect to double the pitch, then drm_intel_gem_bo_map_gtt()
>> -       * maps the memory incorrectly.
>> +       * If we neglect to double the pitch, then render corruption occurs.
>>         */
>>        irb->region = intel_region_alloc(intel->intelScreen,
>> -				       I915_TILING_Y,
>> +				       I915_TILING_NONE,
>>  				       cpp * 2,
>> -				       width,
>> -				       height / 2,
>> +				       ALIGN(width, 64),
>> +				       /* XXX: Maybe align to 128? */
>> +				       ALIGN(height / 2, 64),
>>  				       GL_TRUE);
>>        if (!irb->region)
>>  	return false;
>> diff --git a/src/mesa/drivers/dri/intel/intel_screen.h b/src/mesa/drivers/dri/intel/intel_screen.h
>> index b2013af..9dd6a52 100644
>> --- a/src/mesa/drivers/dri/intel/intel_screen.h
>> +++ b/src/mesa/drivers/dri/intel/intel_screen.h
>> @@ -63,9 +63,12 @@
>>   * x8_z24 and s8).
>>   *
>>   * Eventually, intel_update_renderbuffers() makes a DRI2 request for
>> - * DRI2BufferStencil and DRI2BufferHiz. If the returned buffers are Y tiled,
>> - * then we joyfully set intel_screen.dri2_has_hiz to true and continue as if
>> - * nothing happend.
>> + * DRI2BufferStencil and DRI2BufferHiz. If the stencil buffer's tiling is
>> + * I915_TILING_NONE [1], then we joyfully set intel_screen.dri2_has_hiz to
>> + * true and continue as if nothing happend.
>> + *
>> + * [1] The stencil buffer is actually W tiled. However, we request from the
>> + *     kernel a non-tiled buffer because the GTT is incapable of W fencing.
>>   *
>>   * If the buffers are X tiled, however, the handshake has failed and we must
>>   * clean up.
>> diff --git a/src/mesa/drivers/dri/intel/intel_span.c b/src/mesa/drivers/dri/intel/intel_span.c
>> index 153803f..d306432 100644
>> --- a/src/mesa/drivers/dri/intel/intel_span.c
>> +++ b/src/mesa/drivers/dri/intel/intel_span.c
>> @@ -131,38 +131,77 @@ intel_set_span_functions(struct intel_context *intel,
>>     int miny = 0;							\
>>     int maxx = rb->Width;						\
>>     int maxy = rb->Height;						\
>> -   int stride = rb->RowStride;						\
>> -   uint8_t *buf = rb->Data;						\
>> +									\
>> +   /*									\
>> +    * Here we ignore rb->Data and rb->RowStride as set by		\
>> +    * intelSpanRenderStart. Since intel_offset_S8 decodes the W tile	\
>> +    * manually, the region's *real* base address and stride is		\
>> +    * required.								\
>> +    */									\
>> +   struct intel_renderbuffer *irb = intel_renderbuffer(rb);		\
>> +   uint8_t *buf = irb->region->buffer->virtual;				\
>> +   unsigned stride = irb->region->pitch;				\
>> +   unsigned height = 2 * irb->region->height;				\
>> +   bool flip = rb->Name == 0;						\
> 
>> -/* Don't flip y. */
>>  #undef Y_FLIP
>> -#define Y_FLIP(y) y
>> +#define Y_FLIP(y) ((1 - flip) * y + flip * (height - 1 - y))
> 
>>  /**
>>   * \brief Get pointer offset into stencil buffer.
>>   *
>> - * The stencil buffer interleaves two rows into one. Yay for crazy hardware.
>> - * The table below demonstrates how the pointer arithmetic behaves for a buffer
>> - * with positive stride (s=stride).
>> - *
>> - *     x    | y     | byte offset
>> - *     --------------------------
>> - *     0    | 0     | 0
>> - *     0    | 1     | 1
>> - *     1    | 0     | 2
>> - *     1    | 1     | 3
>> - *     ...  | ...   | ...
>> - *     0    | 2     | s
>> - *     0    | 3     | s + 1
>> - *     1    | 2     | s + 2
>> - *     1    | 3     | s + 3 
>> - *
>> + * The stencil buffer is W tiled. Since the GTT is incapable of W fencing, we
>> + * must decode the tile's layout in software.
>>   *
>> + * See
>> + *   - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.2.1 W-Major Tile
>> + *     Format.
>> + *   - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.3 Tiling Algorithm
>>   */
>> -static inline intptr_t
>> -intel_offset_S8(int stride, GLint x, GLint y)
>> +static inline uintptr_t
> 
> Why the type change?

Two reasons.

1. I redeclared the parameters as unsigned so to generate better code.
Since x, y, and stride are unsigned, the division and modulo operators
generate shift and bit-logic instructions rather than the slower arithmetic
instructions.

Below is a comparison of x % 64, signed versus unsigned, compiled with gcc -O3.
In f, the % generates 5 instructions. In g, only one instruction.

int f(int x) { return x % 64; }

f:
	movl	%edi, %edx
	sarl	$31, %edx
	shrl	$26, %edx
	leal	(%rdi,%rdx), %eax
	andl	$63, %eax
	subl	%edx, %eax
	ret

unsigned g(unsigned x) { return x % 64);

g:
	movl	%edi, %eax
	andl	$63, %eax
	ret

2. I redeclared the return type as unsigned to make it explicit that it does
not return a negative offset.

>> +intel_offset_S8(uint32_t stride, uint32_t x, uint32_t y)
>>  {
>> -   return 2 * ((y / 2) * stride + x) + y % 2;
>> +   uint32_t tile_size = 4096;
>> +   uint32_t tile_width = 64;
>> +   uint32_t tile_height = 64;
>> +   uint32_t row_size = 64 * stride;
>> +
>> +   uint32_t tile_x = x / tile_width;
>> +   uint32_t tile_y = y / tile_height;
>> +
>> +   /* The byte's address relative to the tile's base addres. */
>> +   uint32_t byte_x = x % tile_width;
>> +   uint32_t byte_y = y % tile_height;
>> +
>> +   uintptr_t u = tile_y * row_size
>> +               + tile_x * tile_size
>> +               + 512 * (byte_x / 8)
>> +               +  64 * (byte_y / 8)
>> +               +  32 * ((byte_y / 4) % 2)
>> +               +  16 * ((byte_x / 4) % 2)
>> +               +   8 * ((byte_y / 2) % 2)
>> +               +   4 * ((byte_x / 2) % 2)
>> +               +   2 * (byte_y % 2)
>> +               +   1 * (byte_x % 2);
>> +
>> +   /*
>> +    * Errata for Gen5:
>> +    *
>> +    * An additional offset is needed which is not documented in the PRM.
>> +    *
>> +    * if ((byte_x / 8) % 2 == 1) {
>> +    *    if ((byte_y / 8) % 2) == 0) {
>> +    *       u += 64;
>> +    *    } else {
>> +    *       u -= 64;
>> +    *    }
>> +    * }
>> +    *
>> +    * The offset is expressed more tersely as
>> +    * u += ((int) x & 0x8) * (8 - (((int) y & 0x8) << 1));
>> +    */
>> +
>> +   return u;
>>  }
> 
>>  #define WRITE_STENCIL(x, y, src)  buf[intel_offset_S8(stride, x, y)] = src;

-- 
Chad Versace
chad at chad-versace.us

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 900 bytes
Desc: OpenPGP digital signature
URL: <http://lists.freedesktop.org/archives/intel-gfx/attachments/20110718/d7581283/attachment.sig>


More information about the Intel-gfx mailing list