[Mesa-dev] [Bug 108949] RADV: Subgroup codegen is sub-optimal

bugzilla-daemon at freedesktop.org bugzilla-daemon at freedesktop.org
Wed Dec 5 11:20:29 UTC 2018


https://bugs.freedesktop.org/show_bug.cgi?id=108949

            Bug ID: 108949
           Summary: RADV: Subgroup codegen is sub-optimal
           Product: Mesa
           Version: 18.2
          Hardware: Other
                OS: All
            Status: NEW
          Severity: normal
          Priority: medium
         Component: Drivers/Vulkan/radeon
          Assignee: mesa-dev at lists.freedesktop.org
          Reporter: maister at archlinux.us
        QA Contact: mesa-dev at lists.freedesktop.org

I have some shader code using subgroup operations for which RADV generates
suboptimal code: I expect more use of SGPRs and scalar loads, but I see a lot
of VGPRs and vector loads being used instead. As a result, the code-gen is
worse than AMDVLK's and much worse than that of AMD's Windows driver. I filed a
similar issue against AMDVLK here:
https://github.com/GPUOpen-Drivers/AMDVLK/issues/68.
On a more useful, more complicated test, using subgroup operations gives me 0%
uplift on RADV, 5% on AMDVLK and 15% on Windows. The GPU is an RX 470
(Polaris), and the Mesa version is 18.2.5.

With a trivial test:
https://github.com/Themaister/Granite/blob/master/tests/assets/shaders/subgroup.comp
I expect the subgroupBroadcastFirst(subgroupOr) to result in all loads being
scalar loads, but in the loop I get vector buffer loads instead:

BB629_1:
        s_load_dwordx4 s[8:11], s[0:1], 0x0                                  ;
C00A0200 00000000
        s_ff1_i32_b32 s3, s2                                                 ;
BE831002
        v_mul_u32_u24_e64 v7, s3, 48                                         ;
D1080007 00016003
        v_or_b32_e32 v5, 4, v7                                               ;
280A0E84
        v_mad_u32_u24 v10, s3, 48, 20                                        ;
D1C3000A 02516003
        v_mad_u32_u24 v8, s3, 48, 16                                         ;
D1C30008 02416003
        s_waitcnt lgkmcnt(0)                                                 ;
BF8C007F
*       buffer_load_dwordx2 v[5:6], v5, s[8:11], 0 offen                     ;
E0541000 80020505
*       buffer_load_dword v10, v10, s[8:11], 0 offen                         ;
E0501000 80020A0A
*       buffer_load_dword v14, v7, s[8:11], 0 offen                          ;
E0501000 80020E07
*       buffer_load_dword v8, v8, s[8:11], 0 offen                           ;
E0501000 80020808
        v_mad_u32_u24 v11, s3, 48, 24                                        ;
D1C3000B 02616003
        v_or_b32_e32 v7, 12, v7                                              ;
280E0E8C
        v_mad_u32_u24 v12, s3, 48, 28                                        ;
D1C3000C 02716003
*       buffer_load_dword v7, v7, s[8:11], 0 offen                           ;
E0501000 80020707
        v_mad_u32_u24 v9, s3, 48, 32                                         ;
D1C30009 02816003
        buffer_load_dword v11, v11, s[8:11], 0 offen                         ;
E0501000 80020B0B
        v_mad_u32_u24 v13, s3, 48, 36                                        ;
D1C3000D 02916003
*       buffer_load_dword v12, v12, s[8:11], 0 offen                         ;
E0501000 80020C0C
*       buffer_load_dword v9, v9, s[8:11], 0 offen                           ;
E0501000 80020909
...

whereas the Windows driver's codegen is:

label_0028:
  s_cmp_eq_i32  s0, 0                                   // 0000000000A0:
BF008000
  s_cbranch_scc1  label_0052                            // 0000000000A4:
BF850028
  s_and_b32     s1, s3, 0x0000ffff                      // 0000000000A8:
8601FF03 0000FFFF
  s_ff1_i32_b32  s4, s0                                 // 0000000000B0:
BE841000
  s_andn2_b32   s1, s1, 0x3fff0000                      // 0000000000B4:
8901FF01 3FFF0000
  s_mul_i32     s5, s4, 48                              // 0000000000BC:
9205B004
  s_mov_b32     s12, s2                                 // 0000000000C0:
BE8C0002
  s_mov_b32     s13, s1                                 // 0000000000C4:
BE8D0001
  s_movk_i32    s14, 0xffff                             // 0000000000C8:
B00EFFFF
  s_mov_b32     s15, 0x00024fac                         // 0000000000CC:
BE8F00FF 00024FAC
  s_buffer_load_dwordx8  s[16:23], s[12:15], s5         // 0000000000D4:
C02C0406 00000005
  s_add_u32     s1, s5, 32                              // 0000000000DC:
8001A005
  s_buffer_load_dwordx4  s[12:15], s[12:15], s1         // 0000000000E0:
C0280306 00000001
  s_lshl_b32    s1, 1, s4                               // 0000000000E8:
8E010481
  s_xor_b32     s0, s0, s1                              // 0000000000EC:
88000100
  s_waitcnt     vmcnt(0) & lgkmcnt(0)                   // 0000000000F0:
BF8C0070
...

The subgroupOr is also implemented strangely: RADV generates code similar to
AMDVLK's, i.e. this:

        v_mov_b32_dpp v7, v7  quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf ;
7E0E02FA FF00B107
        v_or_b32_e32 v5, v5, v7                                              ;
280A0F05
        v_mov_b32_e32 v7, v5                                                 ;
7E0E0305
        s_nop 1                                                              ;
BF800001
        v_mov_b32_dpp v7, v7  quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf ;
7E0E02FA FF004E07
        v_or_b32_e32 v5, v5, v7                                              ;
280A0F05
        v_mov_b32_e32 v7, 0                                                  ;
7E0E0280
        s_nop 1                                                              ;
BF800001
        v_mov_b32_dpp v7, v5  row_half_mirror row_mask:0xf bank_mask:0xf     ;
7E0E02FA FF014105
        v_or_b32_e32 v5, v5, v7                                              ;
280A0F05
        v_mov_b32_e32 v7, 0                                                  ;
7E0E0280
        s_nop 1                                                              ;
BF800001
        v_mov_b32_dpp v7, v5  row_mirror row_mask:0xf bank_mask:0xf          ;
7E0E02FA FF014005
        v_or_b32_e32 v5, v5, v7                                              ;
280A0F05
        v_mov_b32_e32 v7, 0                                                  ;
7E0E0280
        s_nop 1                                                              ;
BF800001
        v_mov_b32_dpp v7, v5  row_bcast:15 row_mask:0xa bank_mask:0xf        ;
7E0E02FA AF014205
        v_or_b32_e32 v5, v5, v7                                              ;
280A0F05
        s_nop 1                                                              ;
BF800001
        v_mov_b32_dpp v6, v5  row_bcast:31 row_mask:0xc bank_mask:0xf        ;
7E0C02FA CF014305
        v_or_b32_e32 v5, v5, v6                                              ;
280A0D05
        v_readlane_b32 s8, v5, 63                                            ;
D2890008 00017F05

instead of:
  v_or_b32      v13, v13, v13 row_shr:1                 // 000000000048:
281A1AFA FF01110D
  s_nop         0x0001                                  // 000000000050:
BF800001
  v_or_b32      v13, v13, v13 row_shr:2                 // 000000000054:
281A1AFA FF01120D
  s_nop         0x0001                                  // 00000000005C:
BF800001
  v_or_b32      v13, v13, v13 row_shr:4                 // 000000000060:
281A1AFA FF01140D
  s_nop         0x0001                                  // 000000000068:
BF800001
  v_or_b32      v13, v13, v13 row_shr:8                 // 00000000006C:
281A1AFA FF01180D
  s_nop         0x0001                                  // 000000000074:
BF800001
  v_or_b32      v13, v13, v13 row_bcast:15 row_mask:0xa // 000000000078:
281A1AFA AF01420D
  s_nop         0x0001                                  // 000000000080:
BF800001
  v_or_b32      v13, v13, v13 row_bcast:31 row_mask:0xc // 000000000084:
281A1AFA CF01430D
  s_mov_b64     exec, s[0:1]                            // 00000000008C:
BEFE0100
  v_readlane_b32  s0, v13, 63                           // 000000000090:
D2890000 00017F0D

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20181205/27db0a20/attachment-0001.html>


More information about the mesa-dev mailing list