[PATCH] drm/amdgpu: Count disabled CRTCs in commit tail earlier
Andrey Grodzovsky
Andrey.Grodzovsky at amd.com
Sat Jun 23 02:42:12 UTC 2018
On 06/22/2018 09:03 PM, Andrey Grodzovsky wrote:
>
>
> On 06/22/2018 02:56 PM, Lyude Paul wrote:
>> On Fri, 2018-06-22 at 13:34 -0400, Andrey Grodzovsky wrote:
>>> On 06/21/2018 04:48 PM, Lyude Paul wrote:
>>>> This fixes a regression I accidentally reduced that was picked up by
>>>> kasan, where we were checking the CRTC atomic states after DRM's
>>>> helpers
>>>> had already freed them. Example:
>>>>
>>>> ==================================================================
>>>> BUG: KASAN: use-after-free in
>>>> amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>> Read of size 1 at addr ffff8803a697b071 by task kworker/u16:0/7
>>>>
>>>> CPU: 7 PID: 7 Comm: kworker/u16:0 Tainted: G O 4.18.0-
>>>> rc1Lyude-Upstream+ #1
>>>> Hardware name: HP HP ZBook 15 G4/8275, BIOS P70 Ver. 01.21 05/02/2018
>>>> Workqueue: events_unbound commit_work [drm_kms_helper]
>>>> Call Trace:
>>>> dump_stack+0xc1/0x169
>>>> ? dump_stack_print_info.cold.1+0x42/0x42
>>>> ? kmsg_dump_rewind_nolock+0xd9/0xd9
>>>> ? printk+0x9f/0xc5
>>>> ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>> print_address_description+0x6c/0x23c
>>>> ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>> kasan_report.cold.6+0x241/0x2fd
>>>> amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>> ? commit_planes_to_stream.constprop.45+0x13b0/0x13b0 [amdgpu]
>>>> ? cpu_load_update_active+0x290/0x290
>>>> ? finish_task_switch+0x2bd/0x840
>>>> ? __switch_to_asm+0x34/0x70
>>>> ? read_word_at_a_time+0xe/0x20
>>>> ? strscpy+0x14b/0x460
>>>> ? drm_atomic_helper_wait_for_dependencies+0x47d/0x7e0
>>>> [drm_kms_helper]
>>>> commit_tail+0x96/0xe0 [drm_kms_helper]
>>>> process_one_work+0x88a/0x1360
>>>> ? create_worker+0x540/0x540
>>>> ? __sched_text_start+0x8/0x8
>>>> ? move_queued_task+0x760/0x760
>>>> ? call_rcu_sched+0x20/0x20
>>>> ? vsnprintf+0xcda/0x1350
>>>> ? wait_woken+0x1c0/0x1c0
>>>> ? mutex_unlock+0x1d/0x40
>>>> ? init_timer_key+0x190/0x230
>>>> ? schedule+0xea/0x390
>>>> ? __schedule+0x1ea0/0x1ea0
>>>> ? need_to_create_worker+0xe4/0x210
>>>> ? init_worker_pool+0x700/0x700
>>>> ? try_to_del_timer_sync+0xbf/0x110
>>>> ? del_timer+0x120/0x120
>>>> ? __mutex_lock_slowpath+0x10/0x10
>>>> worker_thread+0x196/0x11f0
>>>> ? flush_rcu_work+0x50/0x50
>>>> ? __switch_to_asm+0x34/0x70
>>>> ? __switch_to_asm+0x34/0x70
>>>> ? __switch_to_asm+0x40/0x70
>>>> ? __switch_to_asm+0x34/0x70
>>>> ? __switch_to_asm+0x40/0x70
>>>> ? __switch_to_asm+0x34/0x70
>>>> ? __switch_to_asm+0x40/0x70
>>>> ? __schedule+0x7d6/0x1ea0
>>>> ? migrate_swap_stop+0x850/0x880
>>>> ? __sched_text_start+0x8/0x8
>>>> ? save_stack+0x8c/0xb0
>>>> ? kasan_kmalloc+0xbf/0xe0
>>>> ? kmem_cache_alloc_trace+0xe4/0x190
>>>> ? kthread+0x98/0x390
>>>> ? ret_from_fork+0x35/0x40
>>>> ? ret_from_fork+0x35/0x40
>>>> ? deactivate_slab.isra.67+0x3c4/0x5c0
>>>> ? kthread+0x98/0x390
>>>> ? kthread+0x98/0x390
>>>> ? set_track+0x76/0x120
>>>> ? schedule+0xea/0x390
>>>> ? __schedule+0x1ea0/0x1ea0
>>>> ? wait_woken+0x1c0/0x1c0
>>>> ? kasan_unpoison_shadow+0x30/0x40
>>>> ? parse_args.cold.15+0x17a/0x17a
>>>> ? flush_rcu_work+0x50/0x50
>>>> kthread+0x2d4/0x390
>>>> ? kthread_create_worker_on_cpu+0xc0/0xc0
>>>> ret_from_fork+0x35/0x40
>>>>
>>>> Allocated by task 1124:
>>>> kasan_kmalloc+0xbf/0xe0
>>>> kmem_cache_alloc_trace+0xe4/0x190
>>>> dm_crtc_duplicate_state+0x78/0x130 [amdgpu]
>>>> drm_atomic_get_crtc_state+0x147/0x410 [drm]
>>>> page_flip_common+0x57/0x230 [drm_kms_helper]
>>>> drm_atomic_helper_page_flip+0xa6/0x110 [drm_kms_helper]
>>>> drm_mode_page_flip_ioctl+0xc4b/0x10a0 [drm]
>>>> drm_ioctl_kernel+0x1d4/0x260 [drm]
>>>> drm_ioctl+0x433/0x920 [drm]
>>>> amdgpu_drm_ioctl+0x11d/0x290 [amdgpu]
>>>> do_vfs_ioctl+0x1a1/0x13d0
>>>> ksys_ioctl+0x60/0x90
>>>> __x64_sys_ioctl+0x6f/0xb0
>>>> do_syscall_64+0x147/0x440
>>>> entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>>
>>>> Freed by task 1124:
>>>> __kasan_slab_free+0x12e/0x180
>>>> kfree+0x92/0x1a0
>>>> drm_atomic_state_default_clear+0x315/0xc40 [drm]
>>>> __drm_atomic_state_free+0x35/0xd0 [drm]
>>>> drm_atomic_helper_update_plane+0xac/0x350 [drm_kms_helper]
>>>> __setplane_internal+0x2d6/0x840 [drm]
>>>> drm_mode_cursor_universal+0x41e/0xbe0 [drm]
>>>> drm_mode_cursor_common+0x49f/0x880 [drm]
>>>> drm_mode_cursor_ioctl+0xd8/0x130 [drm]
>>>> drm_ioctl_kernel+0x1d4/0x260 [drm]
>>>> drm_ioctl+0x433/0x920 [drm]
>>>> amdgpu_drm_ioctl+0x11d/0x290 [amdgpu]
>>>> do_vfs_ioctl+0x1a1/0x13d0
>>>> ksys_ioctl+0x60/0x90
>>>> __x64_sys_ioctl+0x6f/0xb0
>>>> do_syscall_64+0x147/0x440
>>>> entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>>
>>>> The buggy address belongs to the object at ffff8803a697b068
>>>> which belongs to the cache kmalloc-1024 of size 1024
>>>> The buggy address is located 9 bytes inside of
>>>> 1024-byte region [ffff8803a697b068, ffff8803a697b468)
>>>> The buggy address belongs to the page:
>>>> page:ffffea000e9a5e00 count:1 mapcount:0 mapping:ffff88041e00efc0
>>>> index:0x0
>>>> compound_mapcount: 0
>>>> flags: 0x8000000000008100(slab|head)
>>>> raw: 8000000000008100 ffffea000ecbc208 ffff88041e000c70
>>>> ffff88041e00efc0
>>>> raw: 0000000000000000 0000000000170017 00000001ffffffff
>>>> 0000000000000000
>>>> page dumped because: kasan: bad access detected
>>>>
>>>> Memory state around the buggy address:
>>>> ffff8803a697af00: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>>>> ffff8803a697af80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>>>>> ffff8803a697b000: fc fc fc fc fc fc fc fc fc fc fc fc fc fb fb fb
>>>> ^
>>>> ffff8803a697b080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>>> ffff8803a697b100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>>> ==================================================================
>>>>
>>>> So, we fix this by counting the number of CRTCs this atomic commit
>>>> disabled
>>>> early on in the function before their atomic states have been
>>>> freed, then
>>>> use
>>>> that count later to do the appropriate number of RPM puts at the
>>>> end of the
>>>> function.
>>> I am a bit not clear, are you saying that the problem was the 'in the
>>> middle' commit (cursor ioctl) doing
>>>
>>> drm_atomic_state_default_clear->dm_crtc_destroy_state->kfree(state)
>>>
>>> where the state is the one you access from from the non blocking
>>> part of
>>> page flip though old_crtc_state->active?
>> The problem is that (see the comment in
>> drivers/gpu/drm/drm_atomic_helper.c:2065
>> ) it's unsafe to touch any of the old_crtc_state structures after
>> drm_atomic_helper_commit_hw_done() is called, as it's likely that
>> they've been
>> freed already.
>
> I am not sure about that, the comment in
> drm_atomic_helper_commit_hw_done says that
> "the driver is not allowed to read or change any permanent software
> or hardware modeset state" I interpret it as not the old_crtc_state
> but as the new_crtc_state or crtc->state after
> drm_atomic_helper_swap_state completed. It means that if you touch
> crtc->state after drm_atomic_helper_commit_hw_done
> you actually could already be accessing a state which belong to the
> next atomic commit after you.
> It really looks like cursor's atomic commit sneaks in in a middle of
> page flip between the page flip IOCTL
> and it's commit_tail part and swaps away crct->state to his own new
> state and release the 'old' state which is not really
> old yet and needs to be used by the tail part of page flip. This makes
> sense since do_aquire_global_lock we use in amdgpu_dm_atomic_check
> to serialize against concurrent atomic_commits is not called for case
> of cursor plane and so it may race against any commit_tail in flight...
> Not sure why we haven't seen this problem before.
> Obviously your fix makes the problem go away since you stopped
> accessing the new_crtc_state and not the old_crtc_state but the root
> problem
> seems to me still there.
>
> Andrey
I took another look and actually no problem with the CURSOR IOCTL as it
will wait in drm_atomic_helper_swap_state for hw_done event, so
I agree with the fix but just disagree with the explanation, it should
be said that it's unsafe to touch the new_crtc_state (same as
crtc->state) after
call to drm_atomic_helper_commit_hw_done. So I would make the
explanation a bit more detailed on this point.
Anyway, the fix is Reviewed-by: Andrey Grodzovsky
<andrey.grodzovsky at amd.com>
Andrey
>
>>> Andrey
>>>> Fixes: 97028037a38ae ("drm/amdgpu: Grab/put runtime PM references in
>>>> atomic_commit_tail()")
>>>> Signed-off-by: Lyude Paul <lyude at redhat.com>
>>>> Cc: Michel Dänzer <michel at daenzer.net>
>>>> Reported-by: Michel Dänzer <michel at daenzer.net>
>>>> ---
>>>> drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++----
>>>> 1 file changed, 6 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> index f9add85157e7..689dbdf44bbf 100644
>>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> @@ -4206,6 +4206,7 @@ static void amdgpu_dm_atomic_commit_tail(struct
>>>> drm_atomic_state *state)
>>>> struct drm_connector *connector;
>>>> struct drm_connector_state *old_con_state, *new_con_state;
>>>> struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
>>>> + int crtc_disable_count = 0;
>>>> drm_atomic_helper_update_legacy_modeset_state(dev, state);
>>>> @@ -4410,6 +4411,9 @@ static void
>>>> amdgpu_dm_atomic_commit_tail(struct
>>>> drm_atomic_state *state)
>>>> struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc);
>>>> bool modeset_needed;
>>>> + if (old_crtc_state->active && !new_crtc_state->active)
>>>> + crtc_disable_count++;
>>>> +
>>>> dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
>>>> dm_old_crtc_state = to_dm_crtc_state(old_crtc_state);
>>>> modeset_needed = modeset_required(
>>>> @@ -4463,11 +4467,9 @@ static void amdgpu_dm_atomic_commit_tail(struct
>>>> drm_atomic_state *state)
>>>> * so we can put the GPU into runtime suspend if we're not
>>>> driving
>>>> any
>>>> * displays anymore
>>>> */
>>>> + for (i = 0; i < crtc_disable_count; i++)
>>>> + pm_runtime_put_autosuspend(dev->dev);
>>>> pm_runtime_mark_last_busy(dev->dev);
>>>> - for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state,
>>>> new_crtc_state, i) {
>>>> - if (old_crtc_state->active && !new_crtc_state->active)
>>>> - pm_runtime_put_autosuspend(dev->dev);
>>>> - }
>>>> }
>>>
>
More information about the amd-gfx
mailing list