[PATCH] drm/amdgpu: Count disabled CRTCs in commit tail earlier

Fri Jun 22 18:56:23 UTC 2018

On Fri, 2018-06-22 at 13:34 -0400, Andrey Grodzovsky wrote:
> 
> On 06/21/2018 04:48 PM, Lyude Paul wrote:
> > This fixes a regression I accidentally reduced that was picked up by
> > kasan, where we were checking the CRTC atomic states after DRM's helpers
> > had already freed them. Example:
> > 
> > ==================================================================
> > BUG: KASAN: use-after-free in
> > amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
> > Read of size 1 at addr ffff8803a697b071 by task kworker/u16:0/7
> > 
> > CPU: 7 PID: 7 Comm: kworker/u16:0 Tainted: G           O      4.18.0-
> > rc1Lyude-Upstream+ #1
> > Hardware name: HP HP ZBook 15 G4/8275, BIOS P70 Ver. 01.21 05/02/2018
> > Workqueue: events_unbound commit_work [drm_kms_helper]
> > Call Trace:
> >   dump_stack+0xc1/0x169
> >   ? dump_stack_print_info.cold.1+0x42/0x42
> >   ? kmsg_dump_rewind_nolock+0xd9/0xd9
> >   ? printk+0x9f/0xc5
> >   ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
> >   print_address_description+0x6c/0x23c
> >   ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
> >   kasan_report.cold.6+0x241/0x2fd
> >   amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
> >   ? commit_planes_to_stream.constprop.45+0x13b0/0x13b0 [amdgpu]
> >   ? cpu_load_update_active+0x290/0x290
> >   ? finish_task_switch+0x2bd/0x840
> >   ? __switch_to_asm+0x34/0x70
> >   ? read_word_at_a_time+0xe/0x20
> >   ? strscpy+0x14b/0x460
> >   ? drm_atomic_helper_wait_for_dependencies+0x47d/0x7e0 [drm_kms_helper]
> >   commit_tail+0x96/0xe0 [drm_kms_helper]
> >   process_one_work+0x88a/0x1360
> >   ? create_worker+0x540/0x540
> >   ? __sched_text_start+0x8/0x8
> >   ? move_queued_task+0x760/0x760
> >   ? call_rcu_sched+0x20/0x20
> >   ? vsnprintf+0xcda/0x1350
> >   ? wait_woken+0x1c0/0x1c0
> >   ? mutex_unlock+0x1d/0x40
> >   ? init_timer_key+0x190/0x230
> >   ? schedule+0xea/0x390
> >   ? __schedule+0x1ea0/0x1ea0
> >   ? need_to_create_worker+0xe4/0x210
> >   ? init_worker_pool+0x700/0x700
> >   ? try_to_del_timer_sync+0xbf/0x110
> >   ? del_timer+0x120/0x120
> >   ? __mutex_lock_slowpath+0x10/0x10
> >   worker_thread+0x196/0x11f0
> >   ? flush_rcu_work+0x50/0x50
> >   ? __switch_to_asm+0x34/0x70
> >   ? __switch_to_asm+0x34/0x70
> >   ? __switch_to_asm+0x40/0x70
> >   ? __switch_to_asm+0x34/0x70
> >   ? __switch_to_asm+0x40/0x70
> >   ? __switch_to_asm+0x34/0x70
> >   ? __switch_to_asm+0x40/0x70
> >   ? __schedule+0x7d6/0x1ea0
> >   ? migrate_swap_stop+0x850/0x880
> >   ? __sched_text_start+0x8/0x8
> >   ? save_stack+0x8c/0xb0
> >   ? kasan_kmalloc+0xbf/0xe0
> >   ? kmem_cache_alloc_trace+0xe4/0x190
> >   ? kthread+0x98/0x390
> >   ? ret_from_fork+0x35/0x40
> >   ? ret_from_fork+0x35/0x40
> >   ? deactivate_slab.isra.67+0x3c4/0x5c0
> >   ? kthread+0x98/0x390
> >   ? kthread+0x98/0x390
> >   ? set_track+0x76/0x120
> >   ? schedule+0xea/0x390
> >   ? __schedule+0x1ea0/0x1ea0
> >   ? wait_woken+0x1c0/0x1c0
> >   ? kasan_unpoison_shadow+0x30/0x40
> >   ? parse_args.cold.15+0x17a/0x17a
> >   ? flush_rcu_work+0x50/0x50
> >   kthread+0x2d4/0x390
> >   ? kthread_create_worker_on_cpu+0xc0/0xc0
> >   ret_from_fork+0x35/0x40
> > 
> > Allocated by task 1124:
> >   kasan_kmalloc+0xbf/0xe0
> >   kmem_cache_alloc_trace+0xe4/0x190
> >   dm_crtc_duplicate_state+0x78/0x130 [amdgpu]
> >   drm_atomic_get_crtc_state+0x147/0x410 [drm]
> >   page_flip_common+0x57/0x230 [drm_kms_helper]
> >   drm_atomic_helper_page_flip+0xa6/0x110 [drm_kms_helper]
> >   drm_mode_page_flip_ioctl+0xc4b/0x10a0 [drm]
> >   drm_ioctl_kernel+0x1d4/0x260 [drm]
> >   drm_ioctl+0x433/0x920 [drm]
> >   amdgpu_drm_ioctl+0x11d/0x290 [amdgpu]
> >   do_vfs_ioctl+0x1a1/0x13d0
> >   ksys_ioctl+0x60/0x90
> >   __x64_sys_ioctl+0x6f/0xb0
> >   do_syscall_64+0x147/0x440
> >   entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > 
> > Freed by task 1124:
> >   __kasan_slab_free+0x12e/0x180
> >   kfree+0x92/0x1a0
> >   drm_atomic_state_default_clear+0x315/0xc40 [drm]
> >   __drm_atomic_state_free+0x35/0xd0 [drm]
> >   drm_atomic_helper_update_plane+0xac/0x350 [drm_kms_helper]
> >   __setplane_internal+0x2d6/0x840 [drm]
> >   drm_mode_cursor_universal+0x41e/0xbe0 [drm]
> >   drm_mode_cursor_common+0x49f/0x880 [drm]
> >   drm_mode_cursor_ioctl+0xd8/0x130 [drm]
> >   drm_ioctl_kernel+0x1d4/0x260 [drm]
> >   drm_ioctl+0x433/0x920 [drm]
> >   amdgpu_drm_ioctl+0x11d/0x290 [amdgpu]
> >   do_vfs_ioctl+0x1a1/0x13d0
> >   ksys_ioctl+0x60/0x90
> >   __x64_sys_ioctl+0x6f/0xb0
> >   do_syscall_64+0x147/0x440
> >   entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > 
> > The buggy address belongs to the object at ffff8803a697b068
> >   which belongs to the cache kmalloc-1024 of size 1024
> > The buggy address is located 9 bytes inside of
> >   1024-byte region [ffff8803a697b068, ffff8803a697b468)
> > The buggy address belongs to the page:
> > page:ffffea000e9a5e00 count:1 mapcount:0 mapping:ffff88041e00efc0 index:0x0
> > compound_mapcount: 0
> > flags: 0x8000000000008100(slab|head)
> > raw: 8000000000008100 ffffea000ecbc208 ffff88041e000c70 ffff88041e00efc0
> > raw: 0000000000000000 0000000000170017 00000001ffffffff 0000000000000000
> > page dumped because: kasan: bad access detected
> > 
> > Memory state around the buggy address:
> >   ffff8803a697af00: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> >   ffff8803a697af80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
> > > ffff8803a697b000: fc fc fc fc fc fc fc fc fc fc fc fc fc fb fb fb
> > 
> >                                                               ^
> >   ffff8803a697b080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> >   ffff8803a697b100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> > ==================================================================
> > 
> > So, we fix this by counting the number of CRTCs this atomic commit disabled
> > early on in the function before their atomic states have been freed, then
> > use
> > that count later to do the appropriate number of RPM puts at the end of the
> > function.
> 
> I am a bit not clear, are you saying that the problem was the 'in the 
> middle' commit (cursor ioctl) doing
> 
> drm_atomic_state_default_clear->dm_crtc_destroy_state->kfree(state)
> 
> where the state is the one you access from from the non blocking part of 
> page flip though old_crtc_state->active?
The problem is that (see the comment in drivers/gpu/drm/drm_atomic_helper.c:2065
) it's unsafe to touch any of the old_crtc_state structures after
drm_atomic_helper_commit_hw_done() is called, as it's likely that they've been
freed already.
> 
> Andrey
> > 
> > Fixes: 97028037a38ae ("drm/amdgpu: Grab/put runtime PM references in
> > atomic_commit_tail()")
> > Signed-off-by: Lyude Paul <lyude at redhat.com>
> > Cc: Michel Dänzer <michel at daenzer.net>
> > Reported-by: Michel Dänzer <michel at daenzer.net>
> > ---
> >   drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++----
> >   1 file changed, 6 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > index f9add85157e7..689dbdf44bbf 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > @@ -4206,6 +4206,7 @@ static void amdgpu_dm_atomic_commit_tail(struct
> > drm_atomic_state *state)
> >   	struct drm_connector *connector;
> >   	struct drm_connector_state *old_con_state, *new_con_state;
> >   	struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
> > +	int crtc_disable_count = 0;
> >   
> >   	drm_atomic_helper_update_legacy_modeset_state(dev, state);
> >   
> > @@ -4410,6 +4411,9 @@ static void amdgpu_dm_atomic_commit_tail(struct
> > drm_atomic_state *state)
> >   		struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc);
> >   		bool modeset_needed;
> >   
> > +		if (old_crtc_state->active && !new_crtc_state->active)
> > +			crtc_disable_count++;
> > +
> >   		dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
> >   		dm_old_crtc_state = to_dm_crtc_state(old_crtc_state);
> >   		modeset_needed = modeset_required(
> > @@ -4463,11 +4467,9 @@ static void amdgpu_dm_atomic_commit_tail(struct
> > drm_atomic_state *state)
> >   	 * so we can put the GPU into runtime suspend if we're not driving
> > any
> >   	 * displays anymore
> >   	 */
> > +	for (i = 0; i < crtc_disable_count; i++)
> > +		pm_runtime_put_autosuspend(dev->dev);
> >   	pm_runtime_mark_last_busy(dev->dev);
> > -	for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state,
> > new_crtc_state, i) {
> > -		if (old_crtc_state->active && !new_crtc_state->active)
> > -			pm_runtime_put_autosuspend(dev->dev);
> > -	}
> >   }
> >   
> >   
> 
>