[PATCH] drm/amdgpu/gfx: Improvement on EDC GPR workarounds
Yong Zhao
yong.zhao at amd.com
Tue Dec 3 21:42:39 UTC 2019
Not sure whether we should add the issue ticket info here.
Reviewed-by: Yong Zhao <Yong.Zhao at amd.com>
On 2019-12-03 3:45 p.m., James Zhu wrote:
> SPI limits the total CS waves in flight per SE to no more than 32 * num_cu,
> and we need to stuff 40 waves on a CU to completely clean the SGPRs. This is
> accomplished in the workaround (WR) by cleaning the SE in two steps, half of
> the CUs per step.
>
> Signed-off-by: James Zhu <James.Zhu at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 83 ++++++++++++++++++++++++++---------
> 1 file changed, 63 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 6a251a3..147c08f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -3938,24 +3938,37 @@ static const struct soc15_reg_entry vgpr_init_regs[] = {
> { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE1), 0xffffffff },
> { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xffffffff },
> { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE3), 0xffffffff },
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_RESOURCE_LIMITS), 0x1000000 }, /* CU_GROUP_COUNT=1 */
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_X), 256*2 },
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Y), 1 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_RESOURCE_LIMITS), 0x0000000 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_X), 0x40 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Y), 4 },
> { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Z), 1 },
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0x100007f }, /* VGPRS=15 (256 logical VGPRs, SGPRS=1 (16 SGPRs, BULKY=1 */
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0x3f },
> { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC2), 0x400000 }, /* 64KB LDS */
> };
>
> -static const struct soc15_reg_entry sgpr_init_regs[] = {
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE0), 0xffffffff },
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE1), 0xffffffff },
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE2), 0xffffffff },
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE3), 0xffffffff },
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_RESOURCE_LIMITS), 0x1000000 }, /* CU_GROUP_COUNT=1 */
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_X), 256*2 },
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Y), 1 },
> +static const struct soc15_reg_entry sgpr1_init_regs[] = {
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE0), 0x000000ff },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE1), 0x000000ff },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE2), 0x000000ff },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE3), 0x000000ff },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_RESOURCE_LIMITS), 0x0000000 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_X), 0x40 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Y), 8 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Z), 1 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0x240 }, /* (80 GPRS) */
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC2), 0x0 },
> +};
> +
> +static const struct soc15_reg_entry sgpr2_init_regs[] = {
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE0), 0x0000ff00 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE1), 0x0000ff00 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE2), 0x0000ff00 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_STATIC_THREAD_MGMT_SE3), 0x0000ff00 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_RESOURCE_LIMITS), 0x0000000 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_X), 0x40 },
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Y), 8 },
> { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_NUM_THREAD_Z), 1 },
> - { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0x340 }, /* SGPRS=13 (112 GPRS) */
> + { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC1), 0x240 }, /* (80 GPRS) */
> { SOC15_REG_ENTRY(GC, 0, mmCOMPUTE_PGM_RSRC2), 0x0 },
> };
>
> @@ -4065,7 +4078,9 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> total_size =
> ((ARRAY_SIZE(vgpr_init_regs) * 3) + 4 + 5 + 2) * 4;
> total_size +=
> - ((ARRAY_SIZE(sgpr_init_regs) * 3) + 4 + 5 + 2) * 4;
> + ((ARRAY_SIZE(sgpr1_init_regs) * 3) + 4 + 5 + 2) * 4;
> + total_size +=
> + ((ARRAY_SIZE(sgpr2_init_regs) * 3) + 4 + 5 + 2) * 4;
> total_size = ALIGN(total_size, 256);
> vgpr_offset = total_size;
> total_size += ALIGN(sizeof(vgpr_init_compute_shader), 256);
> @@ -4108,7 +4123,35 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
> /* write dispatch packet */
> ib.ptr[ib.length_dw++] = PACKET3(PACKET3_DISPATCH_DIRECT, 3);
> - ib.ptr[ib.length_dw++] = 256; /* x */
> + ib.ptr[ib.length_dw++] = 0x40*2; /* x */
> + ib.ptr[ib.length_dw++] = 1; /* y */
> + ib.ptr[ib.length_dw++] = 1; /* z */
> + ib.ptr[ib.length_dw++] =
> + REG_SET_FIELD(0, COMPUTE_DISPATCH_INITIATOR, COMPUTE_SHADER_EN, 1);
> +
> + /* write CS partial flush packet */
> + ib.ptr[ib.length_dw++] = PACKET3(PACKET3_EVENT_WRITE, 0);
> + ib.ptr[ib.length_dw++] = EVENT_TYPE(7) | EVENT_INDEX(4);
> +
> + /* SGPR1 */
> + /* write the register state for the compute dispatch */
> + for (i = 0; i < ARRAY_SIZE(sgpr1_init_regs); i++) {
> + ib.ptr[ib.length_dw++] = PACKET3(PACKET3_SET_SH_REG, 1);
> + ib.ptr[ib.length_dw++] = SOC15_REG_ENTRY_OFFSET(sgpr1_init_regs[i])
> + - PACKET3_SET_SH_REG_START;
> + ib.ptr[ib.length_dw++] = sgpr1_init_regs[i].reg_value;
> + }
> + /* write the shader start address: mmCOMPUTE_PGM_LO, mmCOMPUTE_PGM_HI */
> + gpu_addr = (ib.gpu_addr + (u64)sgpr_offset) >> 8;
> + ib.ptr[ib.length_dw++] = PACKET3(PACKET3_SET_SH_REG, 2);
> + ib.ptr[ib.length_dw++] = SOC15_REG_OFFSET(GC, 0, mmCOMPUTE_PGM_LO)
> + - PACKET3_SET_SH_REG_START;
> + ib.ptr[ib.length_dw++] = lower_32_bits(gpu_addr);
> + ib.ptr[ib.length_dw++] = upper_32_bits(gpu_addr);
> +
> + /* write dispatch packet */
> + ib.ptr[ib.length_dw++] = PACKET3(PACKET3_DISPATCH_DIRECT, 3);
> + ib.ptr[ib.length_dw++] = 0xA0*2; /* x */
> ib.ptr[ib.length_dw++] = 1; /* y */
> ib.ptr[ib.length_dw++] = 1; /* z */
> ib.ptr[ib.length_dw++] =
> @@ -4118,13 +4161,13 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> ib.ptr[ib.length_dw++] = PACKET3(PACKET3_EVENT_WRITE, 0);
> ib.ptr[ib.length_dw++] = EVENT_TYPE(7) | EVENT_INDEX(4);
>
> - /* SGPR */
> + /* SGPR2 */
> /* write the register state for the compute dispatch */
> - for (i = 0; i < ARRAY_SIZE(sgpr_init_regs); i++) {
> + for (i = 0; i < ARRAY_SIZE(sgpr2_init_regs); i++) {
> ib.ptr[ib.length_dw++] = PACKET3(PACKET3_SET_SH_REG, 1);
> - ib.ptr[ib.length_dw++] = SOC15_REG_ENTRY_OFFSET(sgpr_init_regs[i])
> + ib.ptr[ib.length_dw++] = SOC15_REG_ENTRY_OFFSET(sgpr2_init_regs[i])
> - PACKET3_SET_SH_REG_START;
> - ib.ptr[ib.length_dw++] = sgpr_init_regs[i].reg_value;
> + ib.ptr[ib.length_dw++] = sgpr2_init_regs[i].reg_value;
> }
> /* write the shader start address: mmCOMPUTE_PGM_LO, mmCOMPUTE_PGM_HI */
> gpu_addr = (ib.gpu_addr + (u64)sgpr_offset) >> 8;
> @@ -4136,7 +4179,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
> /* write dispatch packet */
> ib.ptr[ib.length_dw++] = PACKET3(PACKET3_DISPATCH_DIRECT, 3);
> - ib.ptr[ib.length_dw++] = 256; /* x */
> + ib.ptr[ib.length_dw++] = 0xA0*2; /* x */
> ib.ptr[ib.length_dw++] = 1; /* y */
> ib.ptr[ib.length_dw++] = 1; /* z */
> ib.ptr[ib.length_dw++] =
More information about the amd-gfx
mailing list