[Mesa-dev] Mesa (master): Revert "radeon/llvm: Use alloca instructions for larger arrays"

Fri Jul 22 10:08:03 UTC 2016

On 21.07.2016 18:17, Matt Arsenault wrote:
>> On Jul 21, 2016, at 01:03, Michel Dänzer <michel at daenzer.net
>> <mailto:michel at daenzer.net>> wrote:
>>
>> On 21.07.2016 00:04, Michel Dänzer wrote:
>>> On 15.07.2016 05:15, Marek Olšák wrote:
>>>> Module: Mesa
>>>> Branch: master
>>>> Commit: f84e9d749fbb6da73a60fb70e6725db773c9b8f8
>>>> URL:
>>>>    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f84e9d749fbb6da73a60fb70e6725db773c9b8f8
>>>>
>>>> Author: Marek Olšák <marek.olsak at amd.com <mailto:marek.olsak at amd.com>>
>>>> Date:   Thu Jul 14 22:07:46 2016 +0200
>>>>
>>>> Revert "radeon/llvm: Use alloca instructions for larger arrays"
>>>>
>>>> This reverts commit 513fccdfb68e6a71180e21827f071617c93fd09b.
>>>>
>>>> Bioshock Infinite hangs with that.
>>>
>>> Unfortunately, this change caused the piglit test
>>> shaders@glsl-fs-vec4-indexing-temp-dst-in-loop (and possibly others) to
>>> hang my Kaveri. Any ideas for how we can get out of this conundrum?
>>
>> The hang was introduced by LLVM SVN r275934 ("AMDGPU: Expand register
>> indexing pseudos in custom inserter"). The good/bad (without/with
>> r275934) shader dumps and the GALLIUM_DDEBUG=800 dump corresponding to
>> the hang are attached.
>>
>>
>> BTW, even with Marek's change above reverted, I still see some piglit
>> regressions compared to last week, but I'm not sure if those are all
>> related to the same LLVM change.
>>
>>
>> -- 
>> Earthling Michel Dänzer               |
>>               http://www.amd.com <http://www.amd.com/>
>> Libre software enthusiast             |             Mesa and X developer
>> <glsl-fs-vec4-indexing-temp-dst-in-loop.bad><glsl-fs-vec4-indexing-temp-dst-in-loop.good><shader_runner_3339_00000000.txt>
> 
> This fixes the verifier error in it: https://reviews.llvm.org/D22616

This seems to fix the hang, thanks!

> This fixes another issue which may be
> related: https://reviews.llvm.org/D22556

Even with that applied as well, there are still piglit regressions
compared to early last week, see the attached dumps (look for "LLVM
triggered Diagnostic Handler:").

-- 
Earthling Michel Dänzer               |               http://www.amd.com
Libre software enthusiast             |             Mesa and X developer
-------------- next part --------------
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], COLOR, COLOR
DCL OUT[0], COLOR
  0: MOV OUT[0], IN[0]
  1: END
radeonsi: Compiling shader 1
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32, float, float, float, float) #0 {
main_body:
  %27 = bitcast float %5 to i32
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}

attributes #0 = { "InitialPSInputAddr"="36983" }

VERT
PROPERTY NEXT_SHADER FRAG
DCL IN[0]
DCL IN[1]
DCL OUT[0], POSITION
DCL OUT[1], COLOR
DCL CONST[0..3]
DCL TEMP[0]
  0: MUL TEMP[0], IN[0].xxxx, CONST[0]
  1: MAD TEMP[0], IN[0].yyyy, CONST[1], TEMP[0]
  2: MAD TEMP[0], IN[0].zzzz, CONST[2], TEMP[0]
  3: MAD OUT[0], IN[0].wwww, CONST[3], TEMP[0]
  4: MOV OUT[1], IN[1]
  5: END
radeonsi: Compiling shader 2
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_vs <{ float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32) {
main_body:
  %15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %16 = load <16 x i8>, <16 x i8> addrspace(2)* %15, align 16, !invariant.load !0
  %17 = call float @llvm.SI.load.const(<16 x i8> %16, i32 0)
  %18 = call float @llvm.SI.load.const(<16 x i8> %16, i32 4)
  %19 = call float @llvm.SI.load.const(<16 x i8> %16, i32 8)
  %20 = call float @llvm.SI.load.const(<16 x i8> %16, i32 12)
  %21 = call float @llvm.SI.load.const(<16 x i8> %16, i32 16)
  %22 = call float @llvm.SI.load.const(<16 x i8> %16, i32 20)
  %23 = call float @llvm.SI.load.const(<16 x i8> %16, i32 24)
  %24 = call float @llvm.SI.load.const(<16 x i8> %16, i32 28)
  %25 = call float @llvm.SI.load.const(<16 x i8> %16, i32 32)
  %26 = call float @llvm.SI.load.const(<16 x i8> %16, i32 36)
  %27 = call float @llvm.SI.load.const(<16 x i8> %16, i32 40)
  %28 = call float @llvm.SI.load.const(<16 x i8> %16, i32 44)
  %29 = call float @llvm.SI.load.const(<16 x i8> %16, i32 48)
  %30 = call float @llvm.SI.load.const(<16 x i8> %16, i32 52)
  %31 = call float @llvm.SI.load.const(<16 x i8> %16, i32 56)
  %32 = call float @llvm.SI.load.const(<16 x i8> %16, i32 60)
  %33 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %5, i64 0, i64 0, !amdgpu.uniform !0
  %34 = load <16 x i8>, <16 x i8> addrspace(2)* %33, align 16, !invariant.load !0
  %35 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %34, i32 0, i32 %13)
  %36 = extractelement <4 x float> %35, i32 0
  %37 = extractelement <4 x float> %35, i32 1
  %38 = extractelement <4 x float> %35, i32 2
  %39 = extractelement <4 x float> %35, i32 3
  %40 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %5, i64 0, i64 1, !amdgpu.uniform !0
  %41 = load <16 x i8>, <16 x i8> addrspace(2)* %40, align 16, !invariant.load !0
  %42 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %41, i32 0, i32 %14)
  %43 = extractelement <4 x float> %42, i32 0
  %44 = extractelement <4 x float> %42, i32 1
  %45 = extractelement <4 x float> %42, i32 2
  %46 = extractelement <4 x float> %42, i32 3
  %47 = fmul float %36, %17
  %48 = fmul float %36, %18
  %49 = fmul float %36, %19
  %50 = fmul float %36, %20
  %51 = fmul float %37, %21
  %52 = fadd float %51, %47
  %53 = fmul float %37, %22
  %54 = fadd float %53, %48
  %55 = fmul float %37, %23
  %56 = fadd float %55, %49
  %57 = fmul float %37, %24
  %58 = fadd float %57, %50
  %59 = fmul float %38, %25
  %60 = fadd float %59, %52
  %61 = fmul float %38, %26
  %62 = fadd float %61, %54
  %63 = fmul float %38, %27
  %64 = fadd float %63, %56
  %65 = fmul float %38, %28
  %66 = fadd float %65, %58
  %67 = fmul float %39, %29
  %68 = fadd float %67, %60
  %69 = fmul float %39, %30
  %70 = fadd float %69, %62
  %71 = fmul float %39, %31
  %72 = fadd float %71, %64
  %73 = fmul float %39, %32
  %74 = fadd float %73, %66
  %75 = and i32 %8, 1
  %76 = icmp eq i32 %75, 0
  br i1 %76, label %endif-block, label %if-true-block

if-true-block:                                    ; preds = %main_body
  %77 = call float @llvm.AMDGPU.clamp.(float %43, float 0.000000e+00, float 1.000000e+00)
  %78 = call float @llvm.AMDGPU.clamp.(float %44, float 0.000000e+00, float 1.000000e+00)
  %79 = call float @llvm.AMDGPU.clamp.(float %45, float 0.000000e+00, float 1.000000e+00)
  %80 = call float @llvm.AMDGPU.clamp.(float %46, float 0.000000e+00, float 1.000000e+00)
  br label %endif-block

endif-block:                                      ; preds = %main_body, %if-true-block
  %.06 = phi float [ %77, %if-true-block ], [ %43, %main_body ]
  %.05 = phi float [ %78, %if-true-block ], [ %44, %main_body ]
  %.04 = phi float [ %79, %if-true-block ], [ %45, %main_body ]
  %.0 = phi float [ %80, %if-true-block ], [ %46, %main_body ]
  %81 = bitcast i32 %11 to float
  %82 = insertvalue <{ float, float, float }> undef, float %81, 2
  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %.06, float %.05, float %.04, float %.0)
  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %68, float %70, float %72, float %74)
  ret <{ float, float, float }> %82
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #0

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0

; Function Attrs: nounwind readnone
declare float @llvm.AMDGPU.clamp.(float, float, float) #0

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

!0 = !{}

VERT
PROPERTY NEXT_SHADER FRAG
DCL IN[0]
DCL OUT[0], POSITION
DCL OUT[1], COLOR
DCL CONST[0..5]
DCL TEMP[0..3], ARRAY(1), LOCAL
DCL TEMP[4..5], LOCAL
DCL ADDR[0]
IMM[0] FLT32 {    0.0000,     1.0000,     0.0000,     0.0000}
  0: MOV TEMP[3], IMM[0].xyxx
  1: UARL ADDR[0].x, CONST[0].xxxx
  2: UARL ADDR[0].x, CONST[0].xxxx
  3: MOV TEMP[4], TEMP[ADDR[0].x](1)
  4: UARL ADDR[0].x, CONST[1].xxxx
  5: MOV TEMP[ADDR[0].x](1), TEMP[4]
  6: MOV TEMP[4], TEMP[2]
  7: MUL TEMP[5], CONST[2], IN[0].xxxx
  8: MAD TEMP[5], CONST[3], IN[0].yyyy, TEMP[5]
  9: MAD TEMP[5], CONST[4], IN[0].zzzz, TEMP[5]
 10: MAD TEMP[5], CONST[5], IN[0].wwww, TEMP[5]
 11: MOV OUT[0], TEMP[5]
 12: MOV OUT[1], TEMP[4]
 13: END
radeonsi: Compiling shader 3
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_vs <{ float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32) {
main_body:
  %14 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %15 = load <16 x i8>, <16 x i8> addrspace(2)* %14, align 16, !invariant.load !0
  %16 = call float @llvm.SI.load.const(<16 x i8> %15, i32 0)
  %17 = call float @llvm.SI.load.const(<16 x i8> %15, i32 16)
  %18 = call float @llvm.SI.load.const(<16 x i8> %15, i32 32)
  %19 = call float @llvm.SI.load.const(<16 x i8> %15, i32 36)
  %20 = call float @llvm.SI.load.const(<16 x i8> %15, i32 40)
  %21 = call float @llvm.SI.load.const(<16 x i8> %15, i32 44)
  %22 = call float @llvm.SI.load.const(<16 x i8> %15, i32 48)
  %23 = call float @llvm.SI.load.const(<16 x i8> %15, i32 52)
  %24 = call float @llvm.SI.load.const(<16 x i8> %15, i32 56)
  %25 = call float @llvm.SI.load.const(<16 x i8> %15, i32 60)
  %26 = call float @llvm.SI.load.const(<16 x i8> %15, i32 64)
  %27 = call float @llvm.SI.load.const(<16 x i8> %15, i32 68)
  %28 = call float @llvm.SI.load.const(<16 x i8> %15, i32 72)
  %29 = call float @llvm.SI.load.const(<16 x i8> %15, i32 76)
  %30 = call float @llvm.SI.load.const(<16 x i8> %15, i32 80)
  %31 = call float @llvm.SI.load.const(<16 x i8> %15, i32 84)
  %32 = call float @llvm.SI.load.const(<16 x i8> %15, i32 88)
  %33 = call float @llvm.SI.load.const(<16 x i8> %15, i32 92)
  %34 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %5, i64 0, i64 0, !amdgpu.uniform !0
  %35 = load <16 x i8>, <16 x i8> addrspace(2)* %34, align 16, !invariant.load !0
  %36 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %35, i32 0, i32 %13)
  %37 = extractelement <4 x float> %36, i32 0
  %38 = extractelement <4 x float> %36, i32 1
  %39 = extractelement <4 x float> %36, i32 2
  %40 = extractelement <4 x float> %36, i32 3
  %41 = bitcast float %16 to i32
  %42 = extractelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, i32 %41
  %43 = extractelement <4 x float> <float undef, float undef, float undef, float 1.000000e+00>, i32 %41
  %44 = extractelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, i32 %41
  %45 = extractelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, i32 %41
  %46 = bitcast float %17 to i32
  %47 = insertelement <4 x float> undef, float %42, i32 %46
  %48 = extractelement <4 x float> %47, i32 2
  %49 = insertelement <4 x float> undef, float %43, i32 %46
  %50 = extractelement <4 x float> %49, i32 2
  %51 = insertelement <4 x float> undef, float %44, i32 %46
  %52 = extractelement <4 x float> %51, i32 2
  %53 = insertelement <4 x float> undef, float %45, i32 %46
  %54 = extractelement <4 x float> %53, i32 2
  %55 = fmul float %18, %37
  %56 = fmul float %19, %37
  %57 = fmul float %20, %37
  %58 = fmul float %21, %37
  %59 = fmul float %22, %38
  %60 = fadd float %59, %55
  %61 = fmul float %23, %38
  %62 = fadd float %61, %56
  %63 = fmul float %24, %38
  %64 = fadd float %63, %57
  %65 = fmul float %25, %38
  %66 = fadd float %65, %58
  %67 = fmul float %26, %39
  %68 = fadd float %67, %60
  %69 = fmul float %27, %39
  %70 = fadd float %69, %62
  %71 = fmul float %28, %39
  %72 = fadd float %71, %64
  %73 = fmul float %29, %39
  %74 = fadd float %73, %66
  %75 = fmul float %30, %40
  %76 = fadd float %75, %68
  %77 = fmul float %31, %40
  %78 = fadd float %77, %70
  %79 = fmul float %32, %40
  %80 = fadd float %79, %72
  %81 = fmul float %33, %40
  %82 = fadd float %81, %74
  %83 = and i32 %8, 1
  %84 = icmp eq i32 %83, 0
  br i1 %84, label %endif-block, label %if-true-block

if-true-block:                                    ; preds = %main_body
  %85 = call float @llvm.AMDGPU.clamp.(float %48, float 0.000000e+00, float 1.000000e+00)
  %86 = call float @llvm.AMDGPU.clamp.(float %50, float 0.000000e+00, float 1.000000e+00)
  %87 = call float @llvm.AMDGPU.clamp.(float %52, float 0.000000e+00, float 1.000000e+00)
  %88 = call float @llvm.AMDGPU.clamp.(float %54, float 0.000000e+00, float 1.000000e+00)
  br label %endif-block

endif-block:                                      ; preds = %main_body, %if-true-block
  %.026 = phi float [ %88, %if-true-block ], [ %54, %main_body ]
  %.025 = phi float [ %87, %if-true-block ], [ %52, %main_body ]
  %.024 = phi float [ %86, %if-true-block ], [ %50, %main_body ]
  %.0 = phi float [ %85, %if-true-block ], [ %48, %main_body ]
  %89 = bitcast i32 %11 to float
  %90 = insertvalue <{ float, float, float }> undef, float %89, 2
  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %.0, float %.024, float %.025, float %.026)
  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %76, float %78, float %80, float %82)
  ret <{ float, float, float }> %90
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #0

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0

; Function Attrs: nounwind readnone
declare float @llvm.AMDGPU.clamp.(float, float, float) #0

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

!0 = !{}

LLVM triggered Diagnostic Handler: Illegal instruction detected: src0 should be subreg of implicit vector use
  V_MOVRELD_B32_e32 %VGPR0<undef>, %VGPR0<kill>, %M0<imp-use>, %EXEC<imp-use>, %VGPR7_VGPR8_VGPR9_VGPR10<imp-def,tied5>, %VGPR7_VGPR8_VGPR9_VGPR10<imp-use,undef,tied4>
LLVM triggered Diagnostic Handler: Illegal instruction detected: src0 should be subreg of implicit vector use
  V_MOVRELD_B32_e32 %VGPR0<undef>, %VGPR1<kill>, %M0<imp-use,kill>, %EXEC<imp-use>, %VGPR10_VGPR11_VGPR12_VGPR13<imp-def,tied5>, %VGPR10_VGPR11_VGPR12_VGPR13<imp-use,undef,tied4>
LLVM failed to compile shader
radeonsi: can't compile a main shader part
FRAG
PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
  0: MOV OUT[0], IN[0]
  1: END
radeonsi: Compiling shader 4
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  %23 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %6)
  %24 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %6)
  %25 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %6)
  %26 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %6)
  %27 = bitcast float %5 to i32
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #1

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }

VERT
PROPERTY NEXT_SHADER FRAG
DCL IN[0]
DCL IN[1]
DCL OUT[0], POSITION
DCL OUT[1], GENERIC[0]
  0: MOV OUT[0], IN[0]
  1: MOV OUT[1], IN[1]
  2: END
radeonsi: Compiling shader 5
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_vs <{ float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32) {
main_body:
  %15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %5, i64 0, i64 0, !amdgpu.uniform !0
  %16 = load <16 x i8>, <16 x i8> addrspace(2)* %15, align 16, !invariant.load !0
  %17 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %16, i32 0, i32 %13)
  %18 = extractelement <4 x float> %17, i32 0
  %19 = extractelement <4 x float> %17, i32 1
  %20 = extractelement <4 x float> %17, i32 2
  %21 = extractelement <4 x float> %17, i32 3
  %22 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %5, i64 0, i64 1, !amdgpu.uniform !0
  %23 = load <16 x i8>, <16 x i8> addrspace(2)* %22, align 16, !invariant.load !0
  %24 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %23, i32 0, i32 %14)
  %25 = extractelement <4 x float> %24, i32 0
  %26 = extractelement <4 x float> %24, i32 1
  %27 = extractelement <4 x float> %24, i32 2
  %28 = extractelement <4 x float> %24, i32 3
  %29 = bitcast i32 %11 to float
  %30 = insertvalue <{ float, float, float }> undef, float %29, 2
  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %25, float %26, float %27, float %28)
  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %18, float %19, float %20, float %21)
  ret <{ float, float, float }> %30
}

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

!0 = !{}

radeonsi: Compiling shader 6
Vertex Shader Prolog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_vs <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> @main(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32) {
main_body:
  %19 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> undef, i32 %0, 0
  %20 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %19, i32 %1, 1
  %21 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %20, i32 %2, 2
  %22 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %21, i32 %3, 3
  %23 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %22, i32 %4, 4
  %24 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %23, i32 %5, 5
  %25 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %24, i32 %6, 6
  %26 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %25, i32 %7, 7
  %27 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %26, i32 %8, 8
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %27, i32 %9, 9
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %28, i32 %10, 10
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %29, i32 %11, 11
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %30, i32 %12, 12
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %31, i32 %13, 13
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %32, i32 %14, 14
  %34 = bitcast i32 %15 to float
  %35 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %33, float %34, 15
  %36 = bitcast i32 %16 to float
  %37 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %35, float %36, 16
  %38 = bitcast i32 %17 to float
  %39 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %37, float %38, 17
  %40 = bitcast i32 %18 to float
  %41 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %39, float %40, 18
  %42 = add i32 %15, %12
  %43 = bitcast i32 %42 to float
  %44 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %41, float %43, 19
  %45 = add i32 %15, %12
  %46 = bitcast i32 %45 to float
  %47 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %44, float %46, 20
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %47
}

radeonsi: Compiling shader 7
Vertex Shader Epilog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_vs void @main() {
main_body:
  ret void
}

Vertex Shader as VS:
Shader prolog disassembly:
	v_add_i32_e32 v4, vcc, s12, v0 ; 4A08000C
	v_mov_b32_e32 v5, v4           ; 7E0A0304
Shader main disassembly:
	s_load_dwordx4 s[0:3], s[10:11], 0x0                  ; C0800B00
	s_load_dwordx4 s[4:7], s[10:11], 0x4                  ; C0820B04
	s_waitcnt lgkmcnt(0)                                  ; BF8C007F
	buffer_load_format_xyzw v[6:9], v4, s[0:3], 0 idxen   ; E00C2000 80000604
	buffer_load_format_xyzw v[10:13], v5, s[4:7], 0 idxen ; E00C2000 80010A05
	s_waitcnt vmcnt(0)                                    ; BF8C0F70
	exp 15, 32, 0, 0, 0, v10, v11, v12, v13               ; F800020F 0D0C0B0A
	exp 15, 12, 0, 1, 0, v6, v7, v8, v9                   ; F80008CF 09080706
	s_waitcnt expcnt(0)                                   ; BF8C0F0F
Shader epilog disassembly:
	s_endpgm ; BF810000

*** SHADER STATS ***
SGPRS: 24
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 64 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
radeonsi: Compiling shader 8
Fragment Shader Epilog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps void @main(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #0 {
main_body:
  %20 = call i32 @llvm.SI.packf16(float %6, float %7)
  %21 = bitcast i32 %20 to float
  %22 = call i32 @llvm.SI.packf16(float %8, float %9)
  %23 = bitcast i32 %22 to float
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %21, float %23, float undef, float undef)
  ret void
}

; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #2

attributes #0 = { "InitialPSInputAddr"="16777215" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }

Pixel Shader:
Shader main disassembly:
	s_mov_b32 m0, s11                   ; BEFC030B
	v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
	v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
	v_interp_mov_f32 v2, P0, 2, 0, [m0] ; C80A0202
	v_interp_mov_f32 v3, P0, 3, 0, [m0] ; C80E0302
Shader epilog disassembly:
	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
	v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
	exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
	s_endpgm                           ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA  = 0x0020
*** SHADER STATS ***
SGPRS: 16
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
SHADER KEY
  instance_divisors = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
  as_es = 0
  as_ls = 0
  export_prim_id = 0
VERT
PROPERTY NEXT_SHADER FRAG
DCL IN[0]
DCL OUT[0], POSITION
DCL OUT[1], COLOR
DCL CONST[0..5]
DCL TEMP[0..3], ARRAY(1), LOCAL
DCL TEMP[4..5], LOCAL
DCL ADDR[0]
IMM[0] FLT32 {    0.0000,     1.0000,     0.0000,     0.0000}
  0: MOV TEMP[3], IMM[0].xyxx
  1: UARL ADDR[0].x, CONST[0].xxxx
  2: UARL ADDR[0].x, CONST[0].xxxx
  3: MOV TEMP[4], TEMP[ADDR[0].x](1)
  4: UARL ADDR[0].x, CONST[1].xxxx
  5: MOV TEMP[ADDR[0].x](1), TEMP[4]
  6: MOV TEMP[4], TEMP[2]
  7: MUL TEMP[5], CONST[2], IN[0].xxxx
  8: MAD TEMP[5], CONST[3], IN[0].yyyy, TEMP[5]
  9: MAD TEMP[5], CONST[4], IN[0].zzzz, TEMP[5]
 10: MAD TEMP[5], CONST[5], IN[0].wwww, TEMP[5]
 11: MOV OUT[0], TEMP[5]
 12: MOV OUT[1], TEMP[4]
 13: END
radeonsi: Compiling shader 9
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_vs void @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32) {
main_body:
  %13 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %1, i64 0, i64 0, !amdgpu.uniform !0
  %14 = load <16 x i8>, <16 x i8> addrspace(2)* %13, align 16, !invariant.load !0
  %15 = call float @llvm.SI.load.const(<16 x i8> %14, i32 0)
  %16 = call float @llvm.SI.load.const(<16 x i8> %14, i32 16)
  %17 = call float @llvm.SI.load.const(<16 x i8> %14, i32 32)
  %18 = call float @llvm.SI.load.const(<16 x i8> %14, i32 36)
  %19 = call float @llvm.SI.load.const(<16 x i8> %14, i32 40)
  %20 = call float @llvm.SI.load.const(<16 x i8> %14, i32 44)
  %21 = call float @llvm.SI.load.const(<16 x i8> %14, i32 48)
  %22 = call float @llvm.SI.load.const(<16 x i8> %14, i32 52)
  %23 = call float @llvm.SI.load.const(<16 x i8> %14, i32 56)
  %24 = call float @llvm.SI.load.const(<16 x i8> %14, i32 60)
  %25 = call float @llvm.SI.load.const(<16 x i8> %14, i32 64)
  %26 = call float @llvm.SI.load.const(<16 x i8> %14, i32 68)
  %27 = call float @llvm.SI.load.const(<16 x i8> %14, i32 72)
  %28 = call float @llvm.SI.load.const(<16 x i8> %14, i32 76)
  %29 = call float @llvm.SI.load.const(<16 x i8> %14, i32 80)
  %30 = call float @llvm.SI.load.const(<16 x i8> %14, i32 84)
  %31 = call float @llvm.SI.load.const(<16 x i8> %14, i32 88)
  %32 = call float @llvm.SI.load.const(<16 x i8> %14, i32 92)
  %33 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %5, i64 0, i64 0, !amdgpu.uniform !0
  %34 = load <16 x i8>, <16 x i8> addrspace(2)* %33, align 16, !invariant.load !0
  %35 = add i32 %6, %9
  %36 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %34, i32 0, i32 %35)
  %37 = extractelement <4 x float> %36, i32 0
  %38 = extractelement <4 x float> %36, i32 1
  %39 = extractelement <4 x float> %36, i32 2
  %40 = extractelement <4 x float> %36, i32 3
  %41 = bitcast float %15 to i32
  %42 = extractelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, i32 %41
  %43 = extractelement <4 x float> <float undef, float undef, float undef, float 1.000000e+00>, i32 %41
  %44 = extractelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, i32 %41
  %45 = extractelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, i32 %41
  %46 = bitcast float %16 to i32
  %47 = insertelement <4 x float> undef, float %42, i32 %46
  %48 = extractelement <4 x float> %47, i32 2
  %49 = insertelement <4 x float> undef, float %43, i32 %46
  %50 = extractelement <4 x float> %49, i32 2
  %51 = insertelement <4 x float> undef, float %44, i32 %46
  %52 = extractelement <4 x float> %51, i32 2
  %53 = insertelement <4 x float> undef, float %45, i32 %46
  %54 = extractelement <4 x float> %53, i32 2
  %55 = fmul float %17, %37
  %56 = fmul float %18, %37
  %57 = fmul float %19, %37
  %58 = fmul float %20, %37
  %59 = fmul float %21, %38
  %60 = fadd float %59, %55
  %61 = fmul float %22, %38
  %62 = fadd float %61, %56
  %63 = fmul float %23, %38
  %64 = fadd float %63, %57
  %65 = fmul float %24, %38
  %66 = fadd float %65, %58
  %67 = fmul float %25, %39
  %68 = fadd float %67, %60
  %69 = fmul float %26, %39
  %70 = fadd float %69, %62
  %71 = fmul float %27, %39
  %72 = fadd float %71, %64
  %73 = fmul float %28, %39
  %74 = fadd float %73, %66
  %75 = fmul float %29, %40
  %76 = fadd float %75, %68
  %77 = fmul float %30, %40
  %78 = fadd float %77, %70
  %79 = fmul float %31, %40
  %80 = fadd float %79, %72
  %81 = fmul float %32, %40
  %82 = fadd float %81, %74
  %83 = and i32 %8, 1
  %84 = icmp eq i32 %83, 0
  br i1 %84, label %endif-block, label %if-true-block

if-true-block:                                    ; preds = %main_body
  %85 = call float @llvm.AMDGPU.clamp.(float %48, float 0.000000e+00, float 1.000000e+00)
  %86 = call float @llvm.AMDGPU.clamp.(float %50, float 0.000000e+00, float 1.000000e+00)
  %87 = call float @llvm.AMDGPU.clamp.(float %52, float 0.000000e+00, float 1.000000e+00)
  %88 = call float @llvm.AMDGPU.clamp.(float %54, float 0.000000e+00, float 1.000000e+00)
  br label %endif-block

endif-block:                                      ; preds = %main_body, %if-true-block
  %.026 = phi float [ %88, %if-true-block ], [ %54, %main_body ]
  %.025 = phi float [ %87, %if-true-block ], [ %52, %main_body ]
  %.024 = phi float [ %86, %if-true-block ], [ %50, %main_body ]
  %.0 = phi float [ %85, %if-true-block ], [ %48, %main_body ]
  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %.0, float %.024, float %.025, float %.026)
  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %76, float %78, float %80, float %82)
  ret void
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.load.const(<16 x i8>, i32) #0

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0

; Function Attrs: nounwind readnone
declare float @llvm.AMDGPU.clamp.(float, float, float) #0

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

!0 = !{}

LLVM triggered Diagnostic Handler: Illegal instruction detected: src0 should be subreg of implicit vector use
  V_MOVRELD_B32_e32 %VGPR0<undef>, %VGPR4, %M0<imp-use>, %EXEC<imp-use>, %VGPR4_VGPR5_VGPR6_VGPR7<imp-def,tied5>, %VGPR4_VGPR5_VGPR6_VGPR7<imp-use,undef,tied4>
LLVM triggered Diagnostic Handler: Illegal instruction detected: src0 should be subreg of implicit vector use
  V_MOVRELD_B32_e32 %VGPR0<undef>, %VGPR8, %M0<imp-use,kill>, %EXEC<imp-use>, %VGPR7_VGPR8_VGPR9_VGPR10<imp-def,tied5>, %VGPR7_VGPR8_VGPR9_VGPR10<imp-use,undef,tied4>
LLVM failed to compile shader
EE ../../../../../src/gallium/drivers/radeonsi/si_state_shaders.c:1041 si_shader_select_with_key - Failed to build shader variant (type=0) 1
FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
  0: TEX OUT[0], IN[0], SAMP[0], 2D
  1: END
radeonsi: Compiling shader 10
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  %23 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 0, !amdgpu.uniform !0
  %24 = load <8 x i32>, <8 x i32> addrspace(2)* %23, align 32, !invariant.load !0
  %25 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
  %26 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %25, i64 0, i64 3, !amdgpu.uniform !0
  %27 = load <4 x i32>, <4 x i32> addrspace(2)* %26, align 16, !invariant.load !0
  %28 = extractelement <8 x i32> %24, i32 7
  %29 = extractelement <4 x i32> %27, i32 0
  %30 = and i32 %29, %28
  %31 = insertelement <4 x i32> %27, i32 %30, i32 0
  %32 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %12)
  %33 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %12)
  %34 = bitcast float %32 to i32
  %35 = bitcast float %33 to i32
  %36 = insertelement <2 x i32> undef, i32 %34, i32 0
  %37 = insertelement <2 x i32> %36, i32 %35, i32 1
  %38 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %37, <8 x i32> %24, <4 x i32> %31, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %39 = extractelement <4 x float> %38, i32 0
  %40 = extractelement <4 x float> %38, i32 1
  %41 = extractelement <4 x float> %38, i32 2
  %42 = extractelement <4 x float> %38, i32 3
  %43 = bitcast float %5 to i32
  %44 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %43, 10
  %45 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %44, float %39, 11
  %46 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %45, float %40, 12
  %47 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %46, float %41, 13
  %48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %47, float %42, 14
  %49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %48, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %49
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }

!0 = !{}

Pixel Shader:
Shader main disassembly:
	s_wqm_b64 exec, exec                                    ; BEFE0A7E
	s_load_dwordx8 s[12:19], s[4:5], 0x0                    ; C0C60500
	s_load_dwordx4 s[0:3], s[4:5], 0xc                      ; C080050C
	s_mov_b32 m0, s11                                       ; BEFC030B
	v_interp_p1_f32 v0, v8, 0, 0, [m0]                      ; C8000008
	v_interp_p2_f32 v0, [v0], v9, 0, 0, [m0]                ; C8010009
	v_interp_p1_f32 v1, v8, 1, 0, [m0]                      ; C8040108
	s_waitcnt lgkmcnt(0)                                    ; BF8C007F
	s_and_b32 s0, s0, s19                                   ; 87001300
	v_interp_p2_f32 v1, [v1], v9, 1, 0, [m0]                ; C8050109
	image_sample v[0:3], v[0:1], s[12:19], s[0:3] dmask:0xf ; F0800F00 00030000
	s_waitcnt vmcnt(0)                                      ; BF8C0F70
Shader epilog disassembly:
	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
	v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
	exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
	s_endpgm                           ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA  = 0x0020
*** SHADER STATS ***
SGPRS: 24
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 72 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
-------------- next part --------------
VERT
PROPERTY NEXT_SHADER GEOM
DCL IN[0]
DCL OUT[0], GENERIC[0]
  0: MOV OUT[0], IN[0]
  1: END
radeonsi: Compiling shader 1
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_vs void @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32) {
main_body:
  %14 = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %0, i64 0, i64 2, !amdgpu.uniform !0
  %15 = load <16 x i8>, <16 x i8> addrspace(2)* %14, align 16, !invariant.load !0
  %16 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %5, i64 0, i64 0, !amdgpu.uniform !0
  %17 = load <16 x i8>, <16 x i8> addrspace(2)* %16, align 16, !invariant.load !0
  %18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %17, i32 0, i32 %13)
  %bc = bitcast <4 x float> %18 to <4 x i32>
  %19 = extractelement <4 x i32> %bc, i32 0
  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %15, i32 %19, i32 1, i32 undef, i32 %8, i32 64, i32 4, i32 4, i32 0, i32 0, i32 1, i32 1, i32 0)
  %bc1 = bitcast <4 x float> %18 to <4 x i32>
  %20 = extractelement <4 x i32> %bc1, i32 1
  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %15, i32 %20, i32 1, i32 undef, i32 %8, i32 68, i32 4, i32 4, i32 0, i32 0, i32 1, i32 1, i32 0)
  %bc2 = bitcast <4 x float> %18 to <4 x i32>
  %21 = extractelement <4 x i32> %bc2, i32 2
  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %15, i32 %21, i32 1, i32 undef, i32 %8, i32 72, i32 4, i32 4, i32 0, i32 0, i32 1, i32 1, i32 0)
  %bc3 = bitcast <4 x float> %18 to <4 x i32>
  %22 = extractelement <4 x i32> %bc3, i32 3
  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %15, i32 %22, i32 1, i32 undef, i32 %8, i32 76, i32 4, i32 4, i32 0, i32 0, i32 1, i32 1, i32 0)
  ret void
}

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0

; Function Attrs: nounwind
declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #1

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

!0 = !{}

FRAG
DCL IN[0], GENERIC[0], PERSPECTIVE
DCL OUT[0], COLOR
  0: MOV OUT[0], IN[0]
  1: END
radeonsi: Compiling shader 2
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  %23 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %8)
  %24 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %8)
  %25 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %6, <2 x i32> %8)
  %26 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %6, <2 x i32> %8)
  %27 = bitcast float %5 to i32
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }

radeonsi: Compiling shader 4
Vertex Shader Prolog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_vs <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> @main(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32) {
main_body:
  %19 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> undef, i32 %0, 0
  %20 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %19, i32 %1, 1
  %21 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %20, i32 %2, 2
  %22 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %21, i32 %3, 3
  %23 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %22, i32 %4, 4
  %24 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %23, i32 %5, 5
  %25 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %24, i32 %6, 6
  %26 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %25, i32 %7, 7
  %27 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %26, i32 %8, 8
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %27, i32 %9, 9
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %28, i32 %10, 10
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %29, i32 %11, 11
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %30, i32 %12, 12
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %31, i32 %13, 13
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %32, i32 %14, 14
  %34 = bitcast i32 %15 to float
  %35 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %33, float %34, 15
  %36 = bitcast i32 %16 to float
  %37 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %35, float %36, 16
  %38 = bitcast i32 %17 to float
  %39 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %37, float %38, 17
  %40 = bitcast i32 %18 to float
  %41 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %39, float %40, 18
  %42 = add i32 %15, %12
  %43 = bitcast i32 %42 to float
  %44 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %41, float %43, 19
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float }> %44
}

LLVM triggered Diagnostic Handler: Illegal instruction detected: missing implicit register operands
  %VGPR4<def> = V_MOVRELS_B32_e32 %VGPR5<undef>, %M0<imp-use>, %EXEC<imp-use>, %VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11<imp-use>, %VGPR4_VGPR5<imp-def>, %VGPR6_VGPR7<imp-def>, %VGPR8_VGPR9<imp-def>, %VGPR10_VGPR11<imp-def>
LLVM triggered Diagnostic Handler: Illegal instruction detected: missing implicit register operands
  %VGPR5<def> = V_MOVRELS_B32_e32 %VGPR13<undef>, %M0<imp-use>, %EXEC<imp-use>, %VGPR12_VGPR13_VGPR14_VGPR15_VGPR16_VGPR17_VGPR18_VGPR19<imp-use>, %VGPR12_VGPR13<imp-def>
LLVM failed to compile shader
EE ../../../../../src/gallium/drivers/radeonsi/si_state_shaders.c:1041 si_shader_select_with_key - Failed to build shader variant (type=2) 1
radeonsi: can't create a monolithic shader

Vertex Shader as ES:
Shader prolog disassembly:
	v_add_i32_e32 v4, vcc, s12, v0 ; 4A08000C
Shader main disassembly:
	s_load_dwordx4 s[4:7], s[10:11], 0x0                                       ; C0820B00
	s_load_dwordx4 s[0:3], s[0:1], 0x8                                         ; C0800108
	s_waitcnt lgkmcnt(0)                                                       ; BF8C007F
	buffer_load_format_xyzw v[0:3], v4, s[4:7], 0 idxen                        ; E00C2000 80010004
	s_waitcnt vmcnt(0)                                                         ; BF8C0F70
	tbuffer_store_format_x v0, 0x40, 0, 0, -1, 0, 4, 4, v0, s[0:3], -1, 0, s14 ; EA244040 0E400000
	tbuffer_store_format_x v1, 0x44, 0, 0, -1, 0, 4, 4, v0, s[0:3], -1, 0, s14 ; EA244044 0E400100
	tbuffer_store_format_x v2, 0x48, 0, 0, -1, 0, 4, 4, v0, s[0:3], -1, 0, s14 ; EA244048 0E400200
	tbuffer_store_format_x v3, 0x4c, 0, 0, -1, 0, 4, 4, v0, s[0:3], -1, 0, s14 ; EA24404C 0E400300
	s_endpgm                                                                   ; BF810000

*** SHADER STATS ***
SGPRS: 24
VGPRS: 8
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 64 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
LLVM triggered Diagnostic Handler: Illegal instruction detected: missing implicit register operands
  %VGPR4<def> = V_MOVRELS_B32_e32 %VGPR5<undef>, %M0<imp-use>, %EXEC<imp-use>, %VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11<imp-use>, %VGPR4_VGPR5<imp-def>, %VGPR6_VGPR7<imp-def>, %VGPR8_VGPR9<imp-def>, %VGPR10_VGPR11<imp-def>
LLVM triggered Diagnostic Handler: Illegal instruction detected: missing implicit register operands
  %VGPR5<def> = V_MOVRELS_B32_e32 %VGPR13<undef>, %M0<imp-use>, %EXEC<imp-use>, %VGPR12_VGPR13_VGPR14_VGPR15_VGPR16_VGPR17_VGPR18_VGPR19<imp-use>, %VGPR12_VGPR13<imp-def>
LLVM failed to compile shader
EE ../../../../../src/gallium/drivers/radeonsi/si_state_shaders.c:1041 si_shader_select_with_key - Failed to build shader variant (type=2) 1
FRAG
DCL IN[0], GENERIC[0], CONSTANT
DCL OUT[0], COLOR
  0: MOV OUT[0], IN[0]
  1: END
radeonsi: Compiling shader 6
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  %23 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %6)
  %24 = call float @llvm.SI.fs.constant(i32 1, i32 0, i32 %6)
  %25 = call float @llvm.SI.fs.constant(i32 2, i32 0, i32 %6)
  %26 = call float @llvm.SI.fs.constant(i32 3, i32 0, i32 %6)
  %27 = bitcast float %5 to i32
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %27, 10
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %28, float %23, 11
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %29, float %24, 12
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %30, float %25, 13
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %31, float %26, 14
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %32, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %33
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.constant(i32, i32, i32) #1

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }

VERT
PROPERTY NEXT_SHADER FRAG
DCL IN[0]
DCL IN[1]
DCL OUT[0], POSITION
DCL OUT[1], GENERIC[0]
  0: MOV OUT[0], IN[0]
  1: MOV OUT[1], IN[1]
  2: END
radeonsi: Compiling shader 7
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_vs <{ float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32) {
main_body:
  %15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %5, i64 0, i64 0, !amdgpu.uniform !0
  %16 = load <16 x i8>, <16 x i8> addrspace(2)* %15, align 16, !invariant.load !0
  %17 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %16, i32 0, i32 %13)
  %18 = extractelement <4 x float> %17, i32 0
  %19 = extractelement <4 x float> %17, i32 1
  %20 = extractelement <4 x float> %17, i32 2
  %21 = extractelement <4 x float> %17, i32 3
  %22 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %5, i64 0, i64 1, !amdgpu.uniform !0
  %23 = load <16 x i8>, <16 x i8> addrspace(2)* %22, align 16, !invariant.load !0
  %24 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %23, i32 0, i32 %14)
  %25 = extractelement <4 x float> %24, i32 0
  %26 = extractelement <4 x float> %24, i32 1
  %27 = extractelement <4 x float> %24, i32 2
  %28 = extractelement <4 x float> %24, i32 3
  %29 = bitcast i32 %11 to float
  %30 = insertvalue <{ float, float, float }> undef, float %29, 2
  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %25, float %26, float %27, float %28)
  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %18, float %19, float %20, float %21)
  ret <{ float, float, float }> %30
}

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #1

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

!0 = !{}

radeonsi: Compiling shader 8
Vertex Shader Prolog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

define amdgpu_vs <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> @main(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32, i32, i32, i32) {
main_body:
  %19 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> undef, i32 %0, 0
  %20 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %19, i32 %1, 1
  %21 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %20, i32 %2, 2
  %22 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %21, i32 %3, 3
  %23 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %22, i32 %4, 4
  %24 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %23, i32 %5, 5
  %25 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %24, i32 %6, 6
  %26 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %25, i32 %7, 7
  %27 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %26, i32 %8, 8
  %28 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %27, i32 %9, 9
  %29 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %28, i32 %10, 10
  %30 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %29, i32 %11, 11
  %31 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %30, i32 %12, 12
  %32 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %31, i32 %13, 13
  %33 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %32, i32 %14, 14
  %34 = bitcast i32 %15 to float
  %35 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %33, float %34, 15
  %36 = bitcast i32 %16 to float
  %37 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %35, float %36, 16
  %38 = bitcast i32 %17 to float
  %39 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %37, float %38, 17
  %40 = bitcast i32 %18 to float
  %41 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %39, float %40, 18
  %42 = add i32 %15, %12
  %43 = bitcast i32 %42 to float
  %44 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %41, float %43, 19
  %45 = add i32 %15, %12
  %46 = bitcast i32 %45 to float
  %47 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %44, float %46, 20
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float }> %47
}

radeonsi: Compiling shader 9
Vertex Shader Epilog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

; Vertex shader epilog part as dumped by radeonsi: takes no arguments and
; its only instruction is the return, so it performs no work of its own.
define amdgpu_vs void @main() {
main_body:
  ret void
}

Vertex Shader as VS:
Shader prolog disassembly:
	v_add_i32_e32 v4, vcc, s12, v0 ; 4A08000C
	v_mov_b32_e32 v5, v4           ; 7E0A0304
Shader main disassembly:
	s_load_dwordx4 s[0:3], s[10:11], 0x0                  ; C0800B00
	s_load_dwordx4 s[4:7], s[10:11], 0x4                  ; C0820B04
	s_waitcnt lgkmcnt(0)                                  ; BF8C007F
	buffer_load_format_xyzw v[6:9], v4, s[0:3], 0 idxen   ; E00C2000 80000604
	buffer_load_format_xyzw v[10:13], v5, s[4:7], 0 idxen ; E00C2000 80010A05
	s_waitcnt vmcnt(0)                                    ; BF8C0F70
	exp 15, 32, 0, 0, 0, v10, v11, v12, v13               ; F800020F 0D0C0B0A
	exp 15, 12, 0, 1, 0, v6, v7, v8, v9                   ; F80008CF 09080706
	s_waitcnt expcnt(0)                                   ; BF8C0F0F
Shader epilog disassembly:
	s_endpgm ; BF810000

*** SHADER STATS ***
SGPRS: 24
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 64 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
radeonsi: Compiling shader 10
Fragment Shader Epilog LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

; Fragment shader epilog part: packs four float inputs into two f16 pairs
; and issues a single export.
;
; Arguments are unnamed, so they are numbered %0..%19: five `i64 inreg`
; (%0-%4), one `float inreg` (%5) and fourteen plain floats (%6-%19).
; Only %6, %7, %8 and %9 are read by this body.
define amdgpu_ps void @main(i64 inreg, i64 inreg, i64 inreg, i64 inreg, i64 inreg, float inreg, float, float, float, float, float, float, float, float, float, float, float, float, float, float) #0 {
main_body:
  ; Pack (%6, %7) into one i32; the bitcast only retypes the bits because
  ; llvm.SI.export is declared with float data operands.
  %20 = call i32 @llvm.SI.packf16(float %6, float %7)
  %21 = bitcast i32 %20 to float
  ; Same for (%8, %9).
  %22 = call i32 @llvm.SI.packf16(float %8, float %9)
  %23 = bitcast i32 %22 to float
  ; Export the two packed values; the last two data operands are undef.
  ; NOTE(review): the five leading i32 operands are presumably
  ; (en, vm, done, tgt, compr) as in the legacy llvm.SI.export intrinsic,
  ; i.e. all channels enabled, compressed, final export - confirm against
  ; the LLVM revision that produced this dump.
  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %21, float %23, float undef, float undef)
  ret void
}

; Function Attrs: nounwind readnone
declare i32 @llvm.SI.packf16(float, float) #1

; Function Attrs: nounwind
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #2

attributes #0 = { "InitialPSInputAddr"="16777215" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }

Pixel Shader:
Shader main disassembly:
	s_mov_b32 m0, s11                   ; BEFC030B
	v_interp_mov_f32 v0, P0, 0, 0, [m0] ; C8020002
	v_interp_mov_f32 v1, P0, 1, 0, [m0] ; C8060102
	v_interp_mov_f32 v2, P0, 2, 0, [m0] ; C80A0202
	v_interp_mov_f32 v3, P0, 3, 0, [m0] ; C80E0302
Shader epilog disassembly:
	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
	v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
	exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
	s_endpgm                           ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA  = 0x0020
*** SHADER STATS ***
SGPRS: 16
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 40 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************
FRAG
DCL IN[0], GENERIC[0], LINEAR
DCL OUT[0], COLOR
DCL SAMP[0]
DCL SVIEW[0], 2D, FLOAT
  0: TEX OUT[0], IN[0], SAMP[0], 2D
  1: END
radeonsi: Compiling shader 11
TGSI shader LLVM IR:

; ModuleID = 'tgsi'
source_filename = "tgsi"
target triple = "amdgcn--"

; Main part of a pixel shader: interpolates two input channels, performs one
; image sample with them as coordinates, and returns the four sampled
; components in a packed struct for a later shader part to consume.
; Presumably this is the translation of the TGSI program printed just before
; "Compiling shader 11" in the log (a single 2D TEX) - confirm.
;
; Arguments are unnamed and numbered %0..%22. The ones read here are:
;   %2  - pointer to [32 x <8 x i32>] in addrspace(2), source of both loads
;   %5  - float inreg, forwarded unchanged into return slot 10
;   %6  - i32 inreg, passed as the third operand of llvm.SI.fs.interp
;   %12 - <2 x i32>, passed as the fourth operand of llvm.SI.fs.interp
;   %21 - float, forwarded unchanged into return slot 24
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615), [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
  ; Load element 0 of the <8 x i32> array behind %2; it is later passed as
  ; the <8 x i32> operand of the sample call.
  %23 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %2, i64 0, i64 0, !amdgpu.uniform !0
  %24 = load <8 x i32>, <8 x i32> addrspace(2)* %23, align 32, !invariant.load !0
  ; Reinterpret the same array as <4 x i32> elements and load index 3
  ; (byte offset 3 * 16 = 48 from %2).
  %25 = bitcast [32 x <8 x i32>] addrspace(2)* %2 to [0 x <4 x i32>] addrspace(2)*
  %26 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(2)* %25, i64 0, i64 3, !amdgpu.uniform !0
  %27 = load <4 x i32>, <4 x i32> addrspace(2)* %26, align 16, !invariant.load !0
  ; Replace dword 0 of the <4 x i32> value with (dword 0 AND dword 7 of the
  ; <8 x i32> value).
  ; NOTE(review): the reason for this masking is not visible in this dump.
  %28 = extractelement <8 x i32> %24, i32 7
  %29 = extractelement <4 x i32> %27, i32 0
  %30 = and i32 %29, %28
  %31 = insertelement <4 x i32> %27, i32 %30, i32 0
  ; Two interpolation calls that differ only in the first operand (0 and 1),
  ; presumably the channel index of attribute 0 - confirm against the
  ; llvm.SI.fs.interp definition.
  %32 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %6, <2 x i32> %12)
  %33 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %6, <2 x i32> %12)
  ; Retype the two floats as i32 and pack them into the <2 x i32> address
  ; vector expected by the sample intrinsic.
  %34 = bitcast float %32 to i32
  %35 = bitcast float %33 to i32
  %36 = insertelement <2 x i32> undef, i32 %34, i32 0
  %37 = insertelement <2 x i32> %36, i32 %35, i32 1
  ; Sample using the loaded <8 x i32> and the patched <4 x i32> values.
  ; NOTE(review): the i32 15 operand presumably is the channel mask (all four
  ; components); the remaining flag operands are all zero.
  %38 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %37, <8 x i32> %24, <4 x i32> %31, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ; Split the result into its four scalar components.
  %39 = extractelement <4 x float> %38, i32 0
  %40 = extractelement <4 x float> %38, i32 1
  %41 = extractelement <4 x float> %38, i32 2
  %42 = extractelement <4 x float> %38, i32 3
  ; Build the return struct starting from undef: slot 10 gets the bits of %5,
  ; slots 11-14 get the sampled components, slot 24 gets %21. All other slots
  ; (0-9 and 15-23) are left undef.
  %43 = bitcast float %5 to i32
  %44 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %43, 10
  %45 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %44, float %39, 11
  %46 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %45, float %40, 12
  %47 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %46, float %41, 13
  %48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %47, float %42, 14
  %49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %48, float %21, 24
  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %49
}

; Function Attrs: nounwind readnone
declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1

; Function Attrs: nounwind readnone
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1

attributes #0 = { "InitialPSInputAddr"="36983" }
attributes #1 = { nounwind readnone }

!0 = !{}

Pixel Shader:
Shader main disassembly:
	s_wqm_b64 exec, exec                                    ; BEFE0A7E
	s_load_dwordx8 s[12:19], s[4:5], 0x0                    ; C0C60500
	s_load_dwordx4 s[0:3], s[4:5], 0xc                      ; C080050C
	s_mov_b32 m0, s11                                       ; BEFC030B
	v_interp_p1_f32 v0, v8, 0, 0, [m0]                      ; C8000008
	v_interp_p2_f32 v0, [v0], v9, 0, 0, [m0]                ; C8010009
	v_interp_p1_f32 v1, v8, 1, 0, [m0]                      ; C8040108
	s_waitcnt lgkmcnt(0)                                    ; BF8C007F
	s_and_b32 s0, s0, s19                                   ; 87001300
	v_interp_p2_f32 v1, [v1], v9, 1, 0, [m0]                ; C8050109
	image_sample v[0:3], v[0:1], s[12:19], s[0:3] dmask:0xf ; F0800F00 00030000
	s_waitcnt vmcnt(0)                                      ; BF8C0F70
Shader epilog disassembly:
	v_cvt_pkrtz_f16_f32_e32 v0, v0, v1 ; 5E000300
	v_cvt_pkrtz_f16_f32_e32 v1, v2, v3 ; 5E020702
	exp 15, 0, 1, 1, 1, v0, v1, v0, v0 ; F8001C0F 00000100
	s_endpgm                           ; BF810000

*** SHADER CONFIG ***
SPI_PS_INPUT_ADDR = 0xd077
SPI_PS_INPUT_ENA  = 0x0020
*** SHADER STATS ***
SGPRS: 24
VGPRS: 16
Spilled SGPRs: 0
Spilled VGPRs: 0
Code Size: 72 bytes
LDS: 0 blocks
Scratch: 0 bytes per wave
Max Waves: 10
********************