[PATCH 06/22] drm/amdgpu: don't use MM idle_work for SRIOV

Christian König ckoenig.leichtzumerken at gmail.com
Mon Feb 26 09:57:38 UTC 2018


Am 26.02.2018 um 06:18 schrieb Monk Liu:
> two reason for this patch:
> 1) SRIOV doesn't give VF cg/pg feature so the idle_work
> is useless

That is a good reason.

> 2) VCE's idle work would cause "KMD reload" test failed and
> trigger KERNEL PANIC, this is because the idle work is triggered
> after VCE ib test and KMD removed right after loaded, which
> can hit page fault in TIMER callback.

That isn't. In this case we need to fix the KMD reload order and not 
band aid here.

Additional to that the patch is way to intrusive, but see below.

>
> ISSUE resovled:
>
> SWDEV-143244:
>
> [  275.021338] RAX: 6f150001c3808df1 RBX: ffff95667fd91e70 RCX: ffff9566586878b8
> [  275.023000] RDX: ffff95667fd83ee8 RSI: ffff95667fd91e28 RDI: ffff95667fd83ef0
> [  275.024660] RBP: ffff95667fd83f58 R08: ffff95667fd92238 R09: ffff95667fd83ef0
> [  275.026303] R10: 0000000000000000 R11: 0000000000000070 R12: ffff95667fd91e00
> [  275.027951] R13: ffff95667fd83ee8 R14: 0000000000000000 R15: ffff9566586878b8
> [  275.029600] FS:  0000000000000000(0000) GS:ffff95667fd80000(0000) knlGS:0000000000000000
> [  275.031896] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  275.033367] CR2: 0000560972e27300 CR3: 000000011fe09000 CR4: 00000000001406e0
> [  275.035095] Call Trace:
> [  275.036032]  <IRQ>
> [  275.036909]  ? ktime_get+0x3b/0xa0
> [  275.037997]  ? native_apic_msr_write+0x2b/0x30
> [  275.039259]  ? lapic_next_event+0x1d/0x30
> [  275.040461]  ? clockevents_program_event+0x7d/0xf0
> [  275.041723]  __do_softirq+0xed/0x278
> [  275.042767]  irq_exit+0xf3/0x100
> [  275.043740]  smp_apic_timer_interrupt+0x3d/0x50
> [  275.044917]  apic_timer_interrupt+0x93/0xa0
> [  275.046041] RIP: 0010:native_safe_halt+0x6/0x10
> [  275.047212] RSP: 0018:ffffaeac8069bea0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10
> [  275.049330] RAX: 0000000000080000 RBX: ffff95667a6b1e00 RCX: 0000000000000000
> [  275.052920] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> [  275.054560] RBP: ffffaeac8069bea0 R08: 0000000000000006 R09: 0000000000000001
> [  275.056148] R10: 000000000001fee5 R11: 0000000000000000 R12: 0000000000000003
> [  275.057751] R13: ffff95667a6b1e00 R14: 0000000000000000 R15: 0000000000000000
> [  275.059340]  </IRQ>
> [  275.060138]  default_idle+0x1e/0x100
> [  275.061183]  arch_cpu_idle+0xf/0x20
> [  275.062184]  default_idle_call+0x23/0x30
> [  275.063256]  do_idle+0x16a/0x1d0
> [  275.064223]  cpu_startup_entry+0x1d/0x20
> [  275.065282]  start_secondary+0xfd/0x120
> [  275.066276]  secondary_startup_64+0x9f/0x9f
> [  275.067300] Code: 4c 8d 6c c5 90 49 8b 45 00 48 85 c0 74 e1 4d 8b 7d 00 4d 89 7c 24 08 0f 1f 44 00 00 49 8b 07 49 8b 57 08 48 85 c0 48 89 02 74 04 <48> 89 50 08 41 f6 47 2a 20 48 b8 00 02 00 00 00 00
> [  275.071284] RIP: run_timer_softirq+0x138/0x440 RSP: ffff95667fd83ee0
> [  275.072572] ---[ end trace a5789e583cb451a7 ]---
> [  275.073626] Kernel panic - not syncing: Fatal exception in interrupt
> [  275.075001] Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
> [  275.077381] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
> [  275.078753] sched: Unexpected reschedule of offline CPU#0!
> [  275.079918] ------------[ cut here ]------------
> [  275.082724] WARNING: CPU: 3 PID: 0 at arch/x86/kernel/smp.c:128 native_smp_send_reschedule+0x3c/0x40
> [  275.084856] Modules linked in: amdgpu(OE-) chash ttm drm_kms_helper drm i2c_algo_bit fb_sys_fops syscopyarea sysfillrect sysimgblt snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hw]
> [  275.094230] CPU: 3 PID: 0 Comm: swapper/3 Tainted: G      D W  OE   4.13.0-sriov-17.50 #1
> [  275.096303] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
> [  275.098575] task: ffff95667a6b1e00 task.stack: ffffaeac80698000
> [  275.099883] RIP: 0010:native_smp_send_reschedule+0x3c/0x40
> [  275.101168] RSP: 0018:ffff95667fd83b08 EFLAGS: 00010096
> [  275.102381] RAX: 000000000000002e RBX: ffff9566791c9e00 RCX: 000000000000083f
> [  275.103846] RDX: 0000000000000000 RSI: 00000000000000f6 RDI: 000000000000083f
>
> Change-Id: I6dd7ea48d23b0fee74ecb9e93b53bfe36b0e8164
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 14 +++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 11 +++++++----
>   2 files changed, 16 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> index 9cd5517..caba610 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> @@ -123,7 +123,8 @@ int amdgpu_uvd_sw_init(struct amdgpu_device *adev)
>   	unsigned version_major, version_minor, family_id;
>   	int i, r;
>   
> -	INIT_DELAYED_WORK(&adev->uvd.idle_work, amdgpu_uvd_idle_work_handler);
> +	if (!amdgpu_sriov_vf(adev))
> +		INIT_DELAYED_WORK(&adev->uvd.idle_work, amdgpu_uvd_idle_work_handler);
>   
>   	switch (adev->asic_type) {
>   #ifdef CONFIG_DRM_AMDGPU_CIK
> @@ -297,7 +298,8 @@ int amdgpu_uvd_suspend(struct amdgpu_device *adev)
>   	if (adev->uvd.vcpu_bo == NULL)
>   		return 0;
>   
> -	cancel_delayed_work_sync(&adev->uvd.idle_work);
> +	if (!amdgpu_sriov_vf(adev))
> +		cancel_delayed_work_sync(&adev->uvd.idle_work);
>   
>   	for (i = 0; i < adev->uvd.max_handles; ++i)
>   		if (atomic_read(&adev->uvd.handles[i]))
> @@ -1117,7 +1119,7 @@ static void amdgpu_uvd_idle_work_handler(struct work_struct *work)
>   	unsigned fences = amdgpu_fence_count_emitted(&adev->uvd.ring);
>   
>   	if (amdgpu_sriov_vf(adev))
> -		return;
> +		BUG();
>   
>   	if (fences == 0) {
>   		if (adev->pm.dpm_enabled) {
> @@ -1138,11 +1140,12 @@ static void amdgpu_uvd_idle_work_handler(struct work_struct *work)
>   void amdgpu_uvd_ring_begin_use(struct amdgpu_ring *ring)
>   {
>   	struct amdgpu_device *adev = ring->adev;
> -	bool set_clocks = !cancel_delayed_work_sync(&adev->uvd.idle_work);
> +	bool set_clocks;
>   
>   	if (amdgpu_sriov_vf(adev))
>   		return;
>   
> +	set_clocks = !cancel_delayed_work_sync(&adev->uvd.idle_work);
>   	if (set_clocks) {
>   		if (adev->pm.dpm_enabled) {
>   			amdgpu_dpm_enable_uvd(adev, true);
> @@ -1158,7 +1161,8 @@ void amdgpu_uvd_ring_begin_use(struct amdgpu_ring *ring)
>   
>   void amdgpu_uvd_ring_end_use(struct amdgpu_ring *ring)
>   {
> -	schedule_delayed_work(&ring->adev->uvd.idle_work, UVD_IDLE_TIMEOUT);
> +	if (!amdgpu_sriov_vf(ring->adev))
> +		schedule_delayed_work(&ring->adev->uvd.idle_work, UVD_IDLE_TIMEOUT);
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> index d274ae5..0877fc39 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> @@ -187,7 +187,8 @@ int amdgpu_vce_sw_init(struct amdgpu_device *adev, unsigned long size)
>   		adev->vce.filp[i] = NULL;
>   	}
>   
> -	INIT_DELAYED_WORK(&adev->vce.idle_work, amdgpu_vce_idle_work_handler);
> +	if (!amdgpu_sriov_vf(adev))
> +		INIT_DELAYED_WORK(&adev->vce.idle_work, amdgpu_vce_idle_work_handler);
>   	mutex_init(&adev->vce.idle_mutex);
>   
>   	return 0;
> @@ -241,7 +242,8 @@ int amdgpu_vce_suspend(struct amdgpu_device *adev)
>   	if (i == AMDGPU_MAX_VCE_HANDLES)
>   		return 0;
>   
> -	cancel_delayed_work_sync(&adev->vce.idle_work);
> +	if (!amdgpu_sriov_vf(adev))
> +		cancel_delayed_work_sync(&adev->vce.idle_work);
>   	/* TODO: suspending running encoding sessions isn't supported */
>   	return -EINVAL;
>   }
> @@ -301,7 +303,7 @@ static void amdgpu_vce_idle_work_handler(struct work_struct *work)
>   	unsigned i, count = 0;
>   
>   	if (amdgpu_sriov_vf(adev))
> -		return;
> +		BUG();
>   
>   	for (i = 0; i < adev->vce.num_rings; i++)
>   		count += amdgpu_fence_count_emitted(&adev->vce.ring[i]);
> @@ -362,7 +364,8 @@ void amdgpu_vce_ring_begin_use(struct amdgpu_ring *ring)
>    */
>   void amdgpu_vce_ring_end_use(struct amdgpu_ring *ring)
>   {
> -	schedule_delayed_work(&ring->adev->vce.idle_work, VCE_IDLE_TIMEOUT);
> +	if (!amdgpu_sriov_vf(ring->adev))
> +		schedule_delayed_work(&ring->adev->vce.idle_work, VCE_IDLE_TIMEOUT);

This is the only change needed, everything else is superfluous and can 
be removed from the patch.

Regards,
Christian.

>   }
>   
>   /**



More information about the amd-gfx mailing list