[PATCH] drm/amdgpu: Count disabled CRTCs in commit tail earlier

Sat Jun 23 02:42:12 UTC 2018

On 06/22/2018 09:03 PM, Andrey Grodzovsky wrote:
>
>
> On 06/22/2018 02:56 PM, Lyude Paul wrote:
>> On Fri, 2018-06-22 at 13:34 -0400, Andrey Grodzovsky wrote:
>>> On 06/21/2018 04:48 PM, Lyude Paul wrote:
>>>> This fixes a regression I accidentally introduced that was picked up by
>>>> kasan, where we were checking the CRTC atomic states after DRM's
>>>> helpers had already freed them. Example:
>>>>
>>>> ==================================================================
>>>> BUG: KASAN: use-after-free in
>>>> amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>> Read of size 1 at addr ffff8803a697b071 by task kworker/u16:0/7
>>>>
>>>> CPU: 7 PID: 7 Comm: kworker/u16:0 Tainted: G O      4.18.0-
>>>> rc1Lyude-Upstream+ #1
>>>> Hardware name: HP HP ZBook 15 G4/8275, BIOS P70 Ver. 01.21 05/02/2018
>>>> Workqueue: events_unbound commit_work [drm_kms_helper]
>>>> Call Trace:
>>>>    dump_stack+0xc1/0x169
>>>>    ? dump_stack_print_info.cold.1+0x42/0x42
>>>>    ? kmsg_dump_rewind_nolock+0xd9/0xd9
>>>>    ? printk+0x9f/0xc5
>>>>    ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>>    print_address_description+0x6c/0x23c
>>>>    ? amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>>    kasan_report.cold.6+0x241/0x2fd
>>>>    amdgpu_dm_atomic_commit_tail.cold.50+0x13d/0x15a [amdgpu]
>>>>    ? commit_planes_to_stream.constprop.45+0x13b0/0x13b0 [amdgpu]
>>>>    ? cpu_load_update_active+0x290/0x290
>>>>    ? finish_task_switch+0x2bd/0x840
>>>>    ? __switch_to_asm+0x34/0x70
>>>>    ? read_word_at_a_time+0xe/0x20
>>>>    ? strscpy+0x14b/0x460
>>>>    ? drm_atomic_helper_wait_for_dependencies+0x47d/0x7e0 
>>>> [drm_kms_helper]
>>>>    commit_tail+0x96/0xe0 [drm_kms_helper]
>>>>    process_one_work+0x88a/0x1360
>>>>    ? create_worker+0x540/0x540
>>>>    ? __sched_text_start+0x8/0x8
>>>>    ? move_queued_task+0x760/0x760
>>>>    ? call_rcu_sched+0x20/0x20
>>>>    ? vsnprintf+0xcda/0x1350
>>>>    ? wait_woken+0x1c0/0x1c0
>>>>    ? mutex_unlock+0x1d/0x40
>>>>    ? init_timer_key+0x190/0x230
>>>>    ? schedule+0xea/0x390
>>>>    ? __schedule+0x1ea0/0x1ea0
>>>>    ? need_to_create_worker+0xe4/0x210
>>>>    ? init_worker_pool+0x700/0x700
>>>>    ? try_to_del_timer_sync+0xbf/0x110
>>>>    ? del_timer+0x120/0x120
>>>>    ? __mutex_lock_slowpath+0x10/0x10
>>>>    worker_thread+0x196/0x11f0
>>>>    ? flush_rcu_work+0x50/0x50
>>>>    ? __switch_to_asm+0x34/0x70
>>>>    ? __switch_to_asm+0x34/0x70
>>>>    ? __switch_to_asm+0x40/0x70
>>>>    ? __switch_to_asm+0x34/0x70
>>>>    ? __switch_to_asm+0x40/0x70
>>>>    ? __switch_to_asm+0x34/0x70
>>>>    ? __switch_to_asm+0x40/0x70
>>>>    ? __schedule+0x7d6/0x1ea0
>>>>    ? migrate_swap_stop+0x850/0x880
>>>>    ? __sched_text_start+0x8/0x8
>>>>    ? save_stack+0x8c/0xb0
>>>>    ? kasan_kmalloc+0xbf/0xe0
>>>>    ? kmem_cache_alloc_trace+0xe4/0x190
>>>>    ? kthread+0x98/0x390
>>>>    ? ret_from_fork+0x35/0x40
>>>>    ? ret_from_fork+0x35/0x40
>>>>    ? deactivate_slab.isra.67+0x3c4/0x5c0
>>>>    ? kthread+0x98/0x390
>>>>    ? kthread+0x98/0x390
>>>>    ? set_track+0x76/0x120
>>>>    ? schedule+0xea/0x390
>>>>    ? __schedule+0x1ea0/0x1ea0
>>>>    ? wait_woken+0x1c0/0x1c0
>>>>    ? kasan_unpoison_shadow+0x30/0x40
>>>>    ? parse_args.cold.15+0x17a/0x17a
>>>>    ? flush_rcu_work+0x50/0x50
>>>>    kthread+0x2d4/0x390
>>>>    ? kthread_create_worker_on_cpu+0xc0/0xc0
>>>>    ret_from_fork+0x35/0x40
>>>>
>>>> Allocated by task 1124:
>>>>    kasan_kmalloc+0xbf/0xe0
>>>>    kmem_cache_alloc_trace+0xe4/0x190
>>>>    dm_crtc_duplicate_state+0x78/0x130 [amdgpu]
>>>>    drm_atomic_get_crtc_state+0x147/0x410 [drm]
>>>>    page_flip_common+0x57/0x230 [drm_kms_helper]
>>>>    drm_atomic_helper_page_flip+0xa6/0x110 [drm_kms_helper]
>>>>    drm_mode_page_flip_ioctl+0xc4b/0x10a0 [drm]
>>>>    drm_ioctl_kernel+0x1d4/0x260 [drm]
>>>>    drm_ioctl+0x433/0x920 [drm]
>>>>    amdgpu_drm_ioctl+0x11d/0x290 [amdgpu]
>>>>    do_vfs_ioctl+0x1a1/0x13d0
>>>>    ksys_ioctl+0x60/0x90
>>>>    __x64_sys_ioctl+0x6f/0xb0
>>>>    do_syscall_64+0x147/0x440
>>>>    entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>>
>>>> Freed by task 1124:
>>>>    __kasan_slab_free+0x12e/0x180
>>>>    kfree+0x92/0x1a0
>>>>    drm_atomic_state_default_clear+0x315/0xc40 [drm]
>>>>    __drm_atomic_state_free+0x35/0xd0 [drm]
>>>>    drm_atomic_helper_update_plane+0xac/0x350 [drm_kms_helper]
>>>>    __setplane_internal+0x2d6/0x840 [drm]
>>>>    drm_mode_cursor_universal+0x41e/0xbe0 [drm]
>>>>    drm_mode_cursor_common+0x49f/0x880 [drm]
>>>>    drm_mode_cursor_ioctl+0xd8/0x130 [drm]
>>>>    drm_ioctl_kernel+0x1d4/0x260 [drm]
>>>>    drm_ioctl+0x433/0x920 [drm]
>>>>    amdgpu_drm_ioctl+0x11d/0x290 [amdgpu]
>>>>    do_vfs_ioctl+0x1a1/0x13d0
>>>>    ksys_ioctl+0x60/0x90
>>>>    __x64_sys_ioctl+0x6f/0xb0
>>>>    do_syscall_64+0x147/0x440
>>>>    entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>>
>>>> The buggy address belongs to the object at ffff8803a697b068
>>>>    which belongs to the cache kmalloc-1024 of size 1024
>>>> The buggy address is located 9 bytes inside of
>>>>    1024-byte region [ffff8803a697b068, ffff8803a697b468)
>>>> The buggy address belongs to the page:
>>>> page:ffffea000e9a5e00 count:1 mapcount:0 mapping:ffff88041e00efc0 
>>>> index:0x0
>>>> compound_mapcount: 0
>>>> flags: 0x8000000000008100(slab|head)
>>>> raw: 8000000000008100 ffffea000ecbc208 ffff88041e000c70 
>>>> ffff88041e00efc0
>>>> raw: 0000000000000000 0000000000170017 00000001ffffffff 
>>>> 0000000000000000
>>>> page dumped because: kasan: bad access detected
>>>>
>>>> Memory state around the buggy address:
>>>>    ffff8803a697af00: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>>>>    ffff8803a697af80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>>>>> ffff8803a697b000: fc fc fc fc fc fc fc fc fc fc fc fc fc fb fb fb
>>>>                                                                ^
>>>>    ffff8803a697b080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>>>    ffff8803a697b100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>>>> ==================================================================
>>>>
>>>> So, we fix this by counting the number of CRTCs this atomic commit
>>>> disabled early on in the function, before their atomic states have
>>>> been freed, then use that count later to do the appropriate number of
>>>> RPM puts at the end of the function.
>>> I am not quite clear: are you saying that the problem was the 'in the
>>> middle' commit (cursor ioctl) doing
>>>
>>> drm_atomic_state_default_clear->dm_crtc_destroy_state->kfree(state)
>>>
>>> where the state is the one you access from the non-blocking part of
>>> page flip through old_crtc_state->active?
>> The problem is that (see the comment in
>> drivers/gpu/drm/drm_atomic_helper.c:2065) it's unsafe to touch any of the
>> old_crtc_state structures after drm_atomic_helper_commit_hw_done() is
>> called, as it's likely that they've been freed already.
>
> I am not sure about that. The comment in
> drm_atomic_helper_commit_hw_done says that
> "the driver is not allowed to read or change any permanent software
> or hardware modeset state". I interpret that as referring not to the
> old_crtc_state but to the new_crtc_state or crtc->state after
> drm_atomic_helper_swap_state has completed. It means that if you touch
> crtc->state after drm_atomic_helper_commit_hw_done
> you could actually already be accessing a state which belongs to the
> next atomic commit after you.
> It really looks like the cursor's atomic commit sneaks in in the middle of
> the page flip, between the page flip IOCTL
> and its commit_tail part, swaps away crtc->state for its own new
> state and releases the 'old' state, which is not really
> old yet and still needs to be used by the tail part of the page flip. This
> makes sense, since do_aquire_global_lock, which we use in
> amdgpu_dm_atomic_check to serialize against concurrent atomic_commits, is
> not called in the case of the cursor plane, and so it may race against any
> commit_tail in flight...
> Not sure why we haven't seen this problem before.
> Obviously your fix makes the problem go away since you stopped
> accessing the new_crtc_state and not the old_crtc_state, but the root
> problem still seems to me to be there.
>
> Andrey

I took another look, and there is actually no problem with the CURSOR IOCTL,
as it will wait in drm_atomic_helper_swap_state for the hw_done event. So
I agree with the fix but disagree with the explanation: it should
say that it's unsafe to touch the new_crtc_state (same as
crtc->state) after the
call to drm_atomic_helper_commit_hw_done. I would make the
explanation a bit more detailed on this point.

Anyway, the fix is Reviewed-by: Andrey Grodzovsky 
<andrey.grodzovsky at amd.com>

Andrey

>
>>> Andrey
>>>> Fixes: 97028037a38ae ("drm/amdgpu: Grab/put runtime PM references in
>>>> atomic_commit_tail()")
>>>> Signed-off-by: Lyude Paul <lyude at redhat.com>
>>>> Cc: Michel Dänzer <michel at daenzer.net>
>>>> Reported-by: Michel Dänzer <michel at daenzer.net>
>>>> ---
>>>>    drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++----
>>>>    1 file changed, 6 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> index f9add85157e7..689dbdf44bbf 100644
>>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>>>> @@ -4206,6 +4206,7 @@ static void amdgpu_dm_atomic_commit_tail(struct
>>>> drm_atomic_state *state)
>>>>        struct drm_connector *connector;
>>>>        struct drm_connector_state *old_con_state, *new_con_state;
>>>>        struct dm_crtc_state *dm_old_crtc_state, *dm_new_crtc_state;
>>>> +    int crtc_disable_count = 0;
>>>>           drm_atomic_helper_update_legacy_modeset_state(dev, state);
>>>>    @@ -4410,6 +4411,9 @@ static void 
>>>> amdgpu_dm_atomic_commit_tail(struct
>>>> drm_atomic_state *state)
>>>>            struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc);
>>>>            bool modeset_needed;
>>>>    +        if (old_crtc_state->active && !new_crtc_state->active)
>>>> +            crtc_disable_count++;
>>>> +
>>>>            dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
>>>>            dm_old_crtc_state = to_dm_crtc_state(old_crtc_state);
>>>>            modeset_needed = modeset_required(
>>>> @@ -4463,11 +4467,9 @@ static void amdgpu_dm_atomic_commit_tail(struct
>>>> drm_atomic_state *state)
>>>>         * so we can put the GPU into runtime suspend if we're not 
>>>> driving
>>>> any
>>>>         * displays anymore
>>>>         */
>>>> +    for (i = 0; i < crtc_disable_count; i++)
>>>> +        pm_runtime_put_autosuspend(dev->dev);
>>>>        pm_runtime_mark_last_busy(dev->dev);
>>>> -    for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state,
>>>> new_crtc_state, i) {
>>>> -        if (old_crtc_state->active && !new_crtc_state->active)
>>>> -            pm_runtime_put_autosuspend(dev->dev);
>>>> -    }
>>>>    }
>>>
>