[Mesa-dev] [Bug 108949] RADV: Subgroup codegen is sub-optimal
bugzilla-daemon at freedesktop.org
bugzilla-daemon at freedesktop.org
Wed Dec 5 11:20:29 UTC 2018
https://bugs.freedesktop.org/show_bug.cgi?id=108949
Bug ID: 108949
Summary: RADV: Subgroup codegen is sub-optimal
Product: Mesa
Version: 18.2
Hardware: Other
OS: All
Status: NEW
Severity: normal
Priority: medium
Component: Drivers/Vulkan/radeon
Assignee: mesa-dev at lists.freedesktop.org
Reporter: maister at archlinux.us
QA Contact: mesa-dev at lists.freedesktop.org
I have some shader code using subgroup operations for which RADV generates
suboptimal code: I expect more use of SGPRs and scalar loads, but I see a lot
of VGPRs and vector loads being used instead. As a result, the code-gen is
worse than AMDVLK's and much worse than that of AMD's Windows driver. I filed
a similar issue against AMDVLK here:
https://github.com/GPUOpen-Drivers/AMDVLK/issues/68.
On a more realistic, more complicated test, I get a 0% uplift from using
subgroup operations on RADV, 5% on AMDVLK and 15% on Windows. The GPU is an
RX 470 (Polaris). The Mesa version is 18.2.5.
With a trivial test:
https://github.com/Themaister/Granite/blob/master/tests/assets/shaders/subgroup.comp
I expect the subgroupBroadcastFirst(subgroupOr(...)) pattern to result in all
of the loads being scalar loads, but instead I get this in the loop:
BB629_1:
s_load_dwordx4 s[8:11], s[0:1], 0x0 ;
C00A0200 00000000
s_ff1_i32_b32 s3, s2 ;
BE831002
v_mul_u32_u24_e64 v7, s3, 48 ;
D1080007 00016003
v_or_b32_e32 v5, 4, v7 ;
280A0E84
v_mad_u32_u24 v10, s3, 48, 20 ;
D1C3000A 02516003
v_mad_u32_u24 v8, s3, 48, 16 ;
D1C30008 02416003
s_waitcnt lgkmcnt(0) ;
BF8C007F
* buffer_load_dwordx2 v[5:6], v5, s[8:11], 0 offen ;
E0541000 80020505
* buffer_load_dword v10, v10, s[8:11], 0 offen ;
E0501000 80020A0A
* buffer_load_dword v14, v7, s[8:11], 0 offen ;
E0501000 80020E07
* buffer_load_dword v8, v8, s[8:11], 0 offen ;
E0501000 80020808
v_mad_u32_u24 v11, s3, 48, 24 ;
D1C3000B 02616003
v_or_b32_e32 v7, 12, v7 ;
280E0E8C
v_mad_u32_u24 v12, s3, 48, 28 ;
D1C3000C 02716003
* buffer_load_dword v7, v7, s[8:11], 0 offen ;
E0501000 80020707
v_mad_u32_u24 v9, s3, 48, 32 ;
D1C30009 02816003
buffer_load_dword v11, v11, s[8:11], 0 offen ;
E0501000 80020B0B
v_mad_u32_u24 v13, s3, 48, 36 ;
D1C3000D 02916003
* buffer_load_dword v12, v12, s[8:11], 0 offen ;
E0501000 80020C0C
* buffer_load_dword v9, v9, s[8:11], 0 offen ;
E0501000 80020909
...
where Windows codegen is:
label_0028:
s_cmp_eq_i32 s0, 0 // 0000000000A0:
BF008000
s_cbranch_scc1 label_0052 // 0000000000A4:
BF850028
s_and_b32 s1, s3, 0x0000ffff // 0000000000A8:
8601FF03 0000FFFF
s_ff1_i32_b32 s4, s0 // 0000000000B0:
BE841000
s_andn2_b32 s1, s1, 0x3fff0000 // 0000000000B4:
8901FF01 3FFF0000
s_mul_i32 s5, s4, 48 // 0000000000BC:
9205B004
s_mov_b32 s12, s2 // 0000000000C0:
BE8C0002
s_mov_b32 s13, s1 // 0000000000C4:
BE8D0001
s_movk_i32 s14, 0xffff // 0000000000C8:
B00EFFFF
s_mov_b32 s15, 0x00024fac // 0000000000CC:
BE8F00FF 00024FAC
s_buffer_load_dwordx8 s[16:23], s[12:15], s5 // 0000000000D4:
C02C0406 00000005
s_add_u32 s1, s5, 32 // 0000000000DC:
8001A005
s_buffer_load_dwordx4 s[12:15], s[12:15], s1 // 0000000000E0:
C0280306 00000001
s_lshl_b32 s1, 1, s4 // 0000000000E8:
8E010481
s_xor_b32 s0, s0, s1 // 0000000000EC:
88000100
s_waitcnt vmcnt(0) & lgkmcnt(0) // 0000000000F0:
BF8C0070
...
The subgroupOr is also implemented strangely: RADV generates code similar to
AMDVLK's, i.e. this:
v_mov_b32_dpp v7, v7 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf ;
7E0E02FA FF00B107
v_or_b32_e32 v5, v5, v7 ;
280A0F05
v_mov_b32_e32 v7, v5 ;
7E0E0305
s_nop 1 ;
BF800001
v_mov_b32_dpp v7, v7 quad_perm:[2,3,0,1] row_mask:0xf bank_mask:0xf ;
7E0E02FA FF004E07
v_or_b32_e32 v5, v5, v7 ;
280A0F05
v_mov_b32_e32 v7, 0 ;
7E0E0280
s_nop 1 ;
BF800001
v_mov_b32_dpp v7, v5 row_half_mirror row_mask:0xf bank_mask:0xf ;
7E0E02FA FF014105
v_or_b32_e32 v5, v5, v7 ;
280A0F05
v_mov_b32_e32 v7, 0 ;
7E0E0280
s_nop 1 ;
BF800001
v_mov_b32_dpp v7, v5 row_mirror row_mask:0xf bank_mask:0xf ;
7E0E02FA FF014005
v_or_b32_e32 v5, v5, v7 ;
280A0F05
v_mov_b32_e32 v7, 0 ;
7E0E0280
s_nop 1 ;
BF800001
v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf ;
7E0E02FA AF014205
v_or_b32_e32 v5, v5, v7 ;
280A0F05
s_nop 1 ;
BF800001
v_mov_b32_dpp v6, v5 row_bcast:31 row_mask:0xc bank_mask:0xf ;
7E0C02FA CF014305
v_or_b32_e32 v5, v5, v6 ;
280A0D05
v_readlane_b32 s8, v5, 63 ;
D2890008 00017F05
instead of:
v_or_b32 v13, v13, v13 row_shr:1 // 000000000048:
281A1AFA FF01110D
s_nop 0x0001 // 000000000050:
BF800001
v_or_b32 v13, v13, v13 row_shr:2 // 000000000054:
281A1AFA FF01120D
s_nop 0x0001 // 00000000005C:
BF800001
v_or_b32 v13, v13, v13 row_shr:4 // 000000000060:
281A1AFA FF01140D
s_nop 0x0001 // 000000000068:
BF800001
v_or_b32 v13, v13, v13 row_shr:8 // 00000000006C:
281A1AFA FF01180D
s_nop 0x0001 // 000000000074:
BF800001
v_or_b32 v13, v13, v13 row_bcast:15 row_mask:0xa // 000000000078:
281A1AFA AF01420D
s_nop 0x0001 // 000000000080:
BF800001
v_or_b32 v13, v13, v13 row_bcast:31 row_mask:0xc // 000000000084:
281A1AFA CF01430D
s_mov_b64 exec, s[0:1] // 00000000008C:
BEFE0100
v_readlane_b32 s0, v13, 63 // 000000000090:
D2890000 00017F0D
--
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20181205/27db0a20/attachment-0001.html>
More information about the mesa-dev
mailing list