[PATCH] drm/amdgpu: fix the gfx hang while use per-ib secure flag

Wed Feb 26 16:26:12 UTC 2020

On 2020-02-25 18:40, Luben Tuikov wrote:
> On 2020-02-25 8:57 a.m., Huang Rui wrote:
>> Since 6643ba1 frame control packets are only issued in presence of secure IB(s).
>> This causes hangs on some hardware (eg: Raven1). This patch restores the
>> unconditionnal frame control packets issuing, that's to keep the per-IB logic
>> regarding the secure flag.
>>
>> Fixes: 6643ba1 drm/amdgpu: Move to a per-IB secure flag (TMZ)
>>
>> Reported-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
>> Signed-off-by: Huang Rui <ray.huang at amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c   | 41 +++++++++++++++++++-------------
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  5 ++--
>>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 15 ++++++------
>>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 13 +++++-----
>>  4 files changed, 43 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> index 4b2342d..9713a7d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
>> @@ -131,7 +131,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>  	uint64_t fence_ctx;
>>  	uint32_t status = 0, alloc_size;
>>  	unsigned fence_flags = 0;
>> -	bool secure;
>> +	int secure = -1;
> 
> Don't initialize them far away from where they're being
> used. That's a very bad habit.
> This function is pretty long.
> Initialize "secure" next to where it is being used--right
> above the for-loop so that it is clear how it's used.
> 
>>  
>>  	unsigned i;
>>  	int r = 0;
>> @@ -216,7 +216,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>  		amdgpu_ring_emit_cntxcntl(ring, status);
>>  	}
>>  
>> -	secure = false;
> 
> Set "secure" here, close to where it is used.
> This makes the code modular, so that it is easy
> to see what is happening _locally_. Another
> advantage is that should ever this code be pulled
> out into its own function, we don't have to chase
> variables around.
> 
>>  	for (i = 0; i < num_ibs; ++i) {
>>  		ib = &ibs[i];
>>  
>> @@ -228,27 +227,37 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>>  		    !amdgpu_sriov_vf(adev)) /* for SRIOV preemption, Preamble CE ib must be inserted anyway */
>>  			continue;
>>  
>> -		/* If this IB is TMZ, add frame TMZ start packet,
>> -		 * else, turn off TMZ.
>> -		 */
>> -		if (ib->flags & AMDGPU_IB_FLAGS_SECURE && ring->funcs->emit_tmz) {
>> -			if (!secure) {
>> -				secure = true;
>> -				amdgpu_ring_emit_tmz(ring, true);
>> +		if (job && ring->funcs->emit_frame_cntl) {

Also note that the value of this conditional is static to the body of the loop,
i.e. invariant to the loop, and it can be pulled out of the loop.
Each loop iteration doesn't change the value of this conditional. This would 
create two tight loops. This is a formulaic optimization method which generates
slightly longer code, but more streamlined.

I'm ambivalent as to whether you implement those optimizations.

Regards,
Luben

>> +			if (secure == -1) {
>> +				if (ib->flags & AMDGPU_IB_FLAGS_SECURE) {
>> +					secure = 1;
>> +					amdgpu_ring_emit_frame_cntl(ring, true, true);
>> +				} else {
>> +					secure = 0;
>> +					amdgpu_ring_emit_frame_cntl(ring, true, false);
>> +				}
> 
> So the above is executed only once. At the very first
> iteration, since "secure" is set to -1. You can pull
> that out of the loop and have a more straightforward
> body of the loop.
> 
> Regards,
> Luben
> 
>> +			} else {
>> +				if (secure == 1 &&
>> +				    !(ib->flags & AMDGPU_IB_FLAGS_SECURE)) {
>> +					secure = 0;
>> +					amdgpu_ring_emit_frame_cntl(ring, false, true);
>> +					amdgpu_ring_emit_frame_cntl(ring, true, false);
>> +				} else if (secure == 0 &&
>> +					   ib->flags & AMDGPU_IB_FLAGS_SECURE) {
>> +					secure = 1;
>> +					amdgpu_ring_emit_frame_cntl(ring, false, false);
>> +					amdgpu_ring_emit_frame_cntl(ring, true, true);
>> +				}
>>  			}
>> -		} else if (secure) {
>> -			secure = false;
>> -			amdgpu_ring_emit_tmz(ring, false);
>>  		}
>>  
>>  		amdgpu_ring_emit_ib(ring, job, ib, status);
>>  		status &= ~AMDGPU_HAVE_CTX_SWITCH;
>>  	}
>>  
>> -	if (secure) {
>> -		secure = false;
>> -		amdgpu_ring_emit_tmz(ring, false);
>> -	}
>> +	if (job && ring->funcs->emit_frame_cntl)
>> +		amdgpu_ring_emit_frame_cntl(ring, false,
>> +					    (secure == 1) ? true : false);
>>  
>>  #ifdef CONFIG_X86_64
>>  	if (!(adev->flags & AMD_IS_APU))
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 24caff0..4d019d6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -166,7 +166,8 @@ struct amdgpu_ring_funcs {
>>  	void (*emit_reg_write_reg_wait)(struct amdgpu_ring *ring,
>>  					uint32_t reg0, uint32_t reg1,
>>  					uint32_t ref, uint32_t mask);
>> -	void (*emit_tmz)(struct amdgpu_ring *ring, bool start);
>> +	void (*emit_frame_cntl)(struct amdgpu_ring *ring, bool start,
>> +				bool secure);
>>  	/* priority functions */
>>  	void (*set_priority) (struct amdgpu_ring *ring,
>>  			      enum drm_sched_priority priority);
>> @@ -247,7 +248,7 @@ struct amdgpu_ring {
>>  #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>  #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
>> -#define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>> +#define amdgpu_ring_emit_frame_cntl(r, b, s) (r)->funcs->emit_frame_cntl((r), (b), (s))
>>  #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>  #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>  #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 7b61583..748ac35 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -256,7 +256,7 @@ static int gfx_v10_0_rlc_backdoor_autoload_enable(struct amdgpu_device *adev);
>>  static int gfx_v10_0_wait_for_rlc_autoload_complete(struct amdgpu_device *adev);
>>  static void gfx_v10_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume);
>>  static void gfx_v10_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume);
>> -static void gfx_v10_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start);
>> +static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start, bool secure);
>>  
>>  static void gfx10_kiq_set_resources(struct amdgpu_ring *kiq_ring, uint64_t queue_mask)
>>  {
>> @@ -4729,12 +4729,13 @@ static void gfx_v10_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume)
>>  					   sizeof(de_payload) >> 2);
>>  }
>>  
>> -static void gfx_v10_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start)
>> +static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>> +				    bool secure)
>>  {
>> -	if (amdgpu_is_tmz(ring->adev)) {
>> -		amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0));
>> -		amdgpu_ring_write(ring, FRAME_TMZ | FRAME_CMD(start ? 0 : 1));
>> -	}
>> +	uint32_t v = secure ? FRAME_TMZ : 0;
>> +
>> +	amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0));
>> +	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>  }
>>  
>>  static void gfx_v10_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
>> @@ -5188,7 +5189,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>  	.init_cond_exec = gfx_v10_0_ring_emit_init_cond_exec,
>>  	.patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
>>  	.preempt_ib = gfx_v10_0_ring_preempt_ib,
>> -	.emit_tmz = gfx_v10_0_ring_emit_tmz,
>> +	.emit_frame_cntl = gfx_v10_0_ring_emit_frame_cntl,
>>  	.emit_wreg = gfx_v10_0_ring_emit_wreg,
>>  	.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>>  	.emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 1c7a16b..fbde712 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -5230,12 +5230,13 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
>>  	amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2);
>>  }
>>  
>> -static void gfx_v9_0_ring_emit_tmz(struct amdgpu_ring *ring, bool start)
>> +static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
>> +				   bool secure)
>>  {
>> -	if (amdgpu_is_tmz(ring->adev)) {
>> -		amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0));
>> -		amdgpu_ring_write(ring, FRAME_TMZ | FRAME_CMD(start ? 0 : 1));
>> -	}
>> +	uint32_t v = secure ? FRAME_TMZ : 0;
>> +
>> +	amdgpu_ring_write(ring, PACKET3(PACKET3_FRAME_CONTROL, 0));
>> +	amdgpu_ring_write(ring, v | FRAME_CMD(start ? 0 : 1));
>>  }
>>  
>>  static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
>> @@ -6477,7 +6478,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>  	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
>>  	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>>  	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>> -	.emit_tmz = gfx_v9_0_ring_emit_tmz,
>> +	.emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
>>  	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>>  	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>  	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>>
>