[Mesa-dev] [PATCH 2/9] ac: change legacy_surf_level::slice_size to dword units

Thu Nov 23 17:29:13 UTC 2017

On Thu, Nov 23, 2017 at 9:18 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> On 22.11.2017 22:33, Marek Olšák wrote:
>>
>> On Wed, Nov 22, 2017 at 7:46 PM, Nicolai Hähnle <nhaehnle at gmail.com
>> <mailto:nhaehnle at gmail.com>> wrote:
>>  > On 21.11.2017 18:30, Marek Olšák wrote:
>>  >>
>>  >> From: Marek Olšák <marek.olsak at amd.com <mailto:marek.olsak at amd.com>>
>>
>>  >>
>>  >> The next commit will reduce the size even more.
>>  >> ---
>>  >>   src/amd/common/ac_surface.c                        |  2 +-
>>  >>   src/amd/common/ac_surface.h                        |  2 +-
>>  >>   src/amd/vulkan/radv_image.c                        |  8 ++++----
>>  >>   src/gallium/drivers/r600/evergreen_state.c         |  8 ++++----
>>  >>   src/gallium/drivers/r600/r600_state.c              |  8 ++++----
>>  >>   src/gallium/drivers/r600/r600_texture.c            | 14
>> +++++++-------
>>  >>   src/gallium/drivers/r600/radeon_uvd.c              |  2 +-
>>  >>   src/gallium/drivers/radeon/r600_texture.c          | 14
>> +++++++-------
>>  >>   src/gallium/drivers/radeon/radeon_uvd.c            |  2 +-
>>  >>   src/gallium/drivers/radeonsi/cik_sdma.c            |  4 ++--
>>  >>   src/gallium/drivers/radeonsi/si_dma.c              |  8 ++++----
>>  >>   src/gallium/winsys/radeon/drm/radeon_drm_surface.c |  4 ++--
>>  >>   12 files changed, 38 insertions(+), 38 deletions(-)
>>  >>
>>  >> diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c
>>  >> index f7600a3..2b6c3fb 100644
>>  >> --- a/src/amd/common/ac_surface.c
>>  >> +++ b/src/amd/common/ac_surface.c
>>  >> @@ -297,21 +297,21 @@ static int gfx6_compute_level(ADDR_HANDLE
>> addrlib,
>>  >>         ret = AddrComputeSurfaceInfo(addrlib,
>>  >>                                      AddrSurfInfoIn,
>>  >>                                      AddrSurfInfoOut);
>>  >>         if (ret != ADDR_OK) {
>>  >>                 return ret;
>>  >>         }
>>  >>         surf_level = is_stencil ? &surf->u.legacy.stencil_level[level]
>> :
>>  >> &surf->u.legacy.level[level];
>>  >>         surf_level->offset = align64(surf->surf_size,
>>  >> AddrSurfInfoOut->baseAlign);
>>  >> -       surf_level->slice_size = AddrSurfInfoOut->sliceSize;
>>  >> +       surf_level->slice_size_dw = AddrSurfInfoOut->sliceSize / 4;
>>  >>         surf_level->nblk_x = AddrSurfInfoOut->pitch;
>>  >>         surf_level->nblk_y = AddrSurfInfoOut->height;
>>  >>         switch (AddrSurfInfoOut->tileMode) {
>>  >>         case ADDR_TM_LINEAR_ALIGNED:
>>  >>                 surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
>>  >>                 break;
>>  >>         case ADDR_TM_1D_TILED_THIN1:
>>  >>                 surf_level->mode = RADEON_SURF_MODE_1D;
>>  >>                 break;
>>  >> diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h
>>  >> index 1dc95cd..ec89f6b 100644
>>  >> --- a/src/amd/common/ac_surface.h
>>  >> +++ b/src/amd/common/ac_surface.h
>>  >> @@ -64,21 +64,21 @@ enum radeon_micro_mode {
>>  >>   /* bits 19 and 20 are reserved for libdrm_radeon, don't use them */
>>  >>   #define RADEON_SURF_FMASK                       (1 << 21)
>>  >>   #define RADEON_SURF_DISABLE_DCC                 (1 << 22)
>>  >>   #define RADEON_SURF_TC_COMPATIBLE_HTILE         (1 << 23)
>>  >>   #define RADEON_SURF_IMPORTED                    (1 << 24)
>>  >>   #define RADEON_SURF_OPTIMIZE_FOR_SPACE          (1 << 25)
>>  >>   #define RADEON_SURF_SHAREABLE                   (1 << 26)
>>  >>     struct legacy_surf_level {
>>  >>       uint64_t                    offset;
>>  >> -    uint64_t                    slice_size;
>>  >> +    uint32_t                    slice_size_dw; /* in dwords; max =
>> 4GB /
>>  >> 4. */
>>  >>       uint32_t                    dcc_offset; /* relative offset
>> within
>>  >> DCC mip tree */
>>  >>       uint32_t                    dcc_fast_clear_size;
>>  >>       uint16_t                    nblk_x;
>>  >>       uint16_t                    nblk_y;
>>  >>       enum radeon_surf_mode       mode;
>>  >>   };
>>  >>     struct legacy_surf_layout {
>>  >>       unsigned                    bankw:4;  /* max 8 */
>>  >>       unsigned                    bankh:4;  /* max 8 */
>>  >> diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c
>>  >> index b532aa9..fb7bbde 100644
>>  >> --- a/src/amd/vulkan/radv_image.c
>>  >> +++ b/src/amd/vulkan/radv_image.c
>>  >> @@ -1149,25 +1149,25 @@ void radv_GetImageSubresourceLayout(
>>  >>         if (device->physical_device->rad_info.chip_class >= GFX9) {
>>  >>                 pLayout->offset = surface->u.gfx9.offset[level] +
>>  >> surface->u.gfx9.surf_slice_size * layer;
>>  >>                 pLayout->rowPitch = surface->u.gfx9.surf_pitch *
>>  >> surface->bpe;
>>  >>                 pLayout->arrayPitch = surface->u.gfx9.surf_slice_size;
>>  >>                 pLayout->depthPitch = surface->u.gfx9.surf_slice_size;
>>  >>                 pLayout->size = surface->u.gfx9.surf_slice_size;
>>  >>                 if (image->type == VK_IMAGE_TYPE_3D)
>>  >>                         pLayout->size *= u_minify(image->info.depth,
>>  >> level);
>>  >>         } else {
>>  >> -               pLayout->offset =
>> surface->u.legacy.level[level].offset +
>>  >> surface->u.legacy.level[level].slice_size * layer;
>>  >> +               pLayout->offset =
>> surface->u.legacy.level[level].offset +
>>  >> surface->u.legacy.level[level].slice_size_dw * 4 * layer;
>>  >
>>  >
>>  > I believe the maximum slice size in bytes is (with an RGBA32 texture)
>>  >
>>  > 16384 * 16384 * 16 = 2^14 * 2^14 * 2^4 = 2^32
>>  >
>>  > The problem with this code is that the multiplication is now performed
>> as
>>  > uint32_t and can therefore wrap-around. So an explicit cast to 64-bits
>> is
>>  > required.
>>  >
>>  > In practice, I guess this rather becomes an issue with smaller slice
>> sizes
>>  > but larger layer indices. We really need test case to exercise >= 4 GB
>>  > textures...
>>
>> This should do it:
>>
>> diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h
>> index f18548f..fa17b34 100644
>> --- a/src/amd/common/ac_surface.h
>> +++ b/src/amd/common/ac_surface.h
>> @@ -71,7 +71,8 @@ enum radeon_micro_mode {
>>
>>   struct legacy_surf_level {
>>       uint64_t                    offset;
>> -    uint32_t                    slice_size_dw; /* in dwords; max = 4GB /
>> 4. */
>> +    /* Declare 32 bits of uint64_t, so that multiplication results in 64
>> bits. */
>> +    uint64_t                    slice_size_dw:32; /* in dwords; max = 4GB
>> / 4. */
>>       uint32_t                    dcc_offset; /* relative offset within
>> DCC mip tree */
>>       uint32_t                    dcc_fast_clear_size;
>>       unsigned                    nblk_x:15;
>
>
> Congratulations, you found a compiler bug! :)
>
> I like this idea very much; however:
>
> nha at capella:~/tmp$ cat foo.c
> #include <stdint.h>
>
> struct s {
>     uint64_t foo:32;
> };
>
> uint64_t foo(uint32_t x, struct s* p)
> {
>     return x * p->foo;
> }
>
> uint64_t foo2(uint32_t x, struct s* p)
> {
>     return x * (uint64_t)p->foo;
> }
> nha at capella:~/tmp$ gcc -Wall -Wextra -O2 -S foo.c
> nha at capella:~/tmp$ cat foo.s
>         .file   "foo.c"
>         .text
>         .p2align 4,,15
>         .globl  foo
>         .type   foo, @function
> foo:
> .LFB0:
>         .cfi_startproc
>         movl    %edi, %eax
>         imull   (%rsi), %eax
>         ret
>         .cfi_endproc
> .LFE0:
>         .size   foo, .-foo
>         .p2align 4,,15
>         .globl  foo2
>         .type   foo2, @function
> foo2:
> .LFB1:
>         .cfi_startproc
>         movl    (%rsi), %eax
>         movl    %edi, %edi
>         imulq   %rdi, %rax
>         ret
>         .cfi_endproc
> .LFE1:
>         .size   foo2, .-foo2
>         .ident  "GCC: (Ubuntu 6.3.0-12ubuntu2) 6.3.0 20170406"
>         .section        .note.GNU-stack,"", at progbits
>
> This is with gcc-7.2.
>
> Some more observations:
>
> - Clang 5.0 produces the same code
> - gcc produces entirely ridiculous code when the bitfield has 33 or more
> bits (it ends up producing what is essentially a 33-bit wide multiplication)
> - Clang actually produces the expected code when the bitfield has 33 or more
> bits
> - gcc produces the expected code when compiling it as C++
>
> Could you please file bugs against both gcc and clang for this?
>
> I mean, it's entirely possible that the C standard is crazy enough that it
> says this behavior is technically correct -- though I doubt it given the
> behavior for 33 bits and the inconsistency between the compilers for C++. In
> any case, *if* treating the value as less than 64 bits is correct, they
> should really produce a warning since the resulting code is dangerously
> different from what you'd expect.

I guess common sense doesn't always work with compilers and/or
standards. I don't plan to file compiler bugs because I don't really
know what the C standard says in this case.

Marek