[Mesa-stable] [Mesa-dev] [PATCH] radv/winsys: allow to submit up to 4 IBs for chips without chaining
Samuel Pitoiset
samuel.pitoiset at gmail.com
Tue Apr 24 08:35:14 UTC 2018
Done! :)
On 04/24/2018 09:53 AM, Juan A. Suarez Romero wrote:
> On Sat, 2018-04-21 at 14:38 +0200, Samuel Pitoiset wrote:
>> Hi Mark,
>>
>> Sure, I will do next week.
>>
>
> Hello, Samuel.
>
> Would it be possible to get this patch between today and tomorrow? I'd like to
> create the candidate next 18.0 release tomorrow, and the current patch doesn't
> apply in the branch, as pointed out by Mark.
>
> If you can't provide it, we can skip this patch for this release and add to the
> next one.
>
>
> Thanks.
>
> J.A.
>
>> Thanks.
>>
>> On 04/21/2018 02:22 AM, Mark Janes wrote:
>>> Hi Samuel,
>>>
>>> This patch doesn't apply cleanly to the stable branches. You may want
>>> to send a backport to mesa-stable.
>>>
>>> -Mark
>>>
>>> Samuel Pitoiset <samuel.pitoiset at gmail.com> writes:
>>>
>>>> The SI family doesn't support chaining which means the maximum
>>>> size in dwords per CS is limited. When that limit was reached
>>>> we failed to submit the CS and the application crashed.
>>>>
>>>> This patch allows to submit up to 4 IBs which is currently the
>>>> limit, but recent amdgpu supports more than that.
>>>>
>>>> Please note that we can reach the limit of 4 IBs per submit
>>>> but currently we can't improve that. The only solution is to
>>>> upgrade libdrm. That will be improved later but for now this
>>>> should fix crashes on SI or when using RADV_DEBUG=noibs.
>>>>
>>>> Fixes: 36cb5508e89 ("radv/winsys: Fail early on overgrown cs.")
>>>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105775
>>>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
>>>> ---
>>>> src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 218 ++++++++++++++----
>>>> 1 file changed, 168 insertions(+), 50 deletions(-)
>>>>
>>>> diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
>>>> index c4b2232ce9e..17e6d8ba2b6 100644
>>>> --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
>>>> +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
>>>> @@ -68,6 +68,10 @@ struct radv_amdgpu_cs {
>>>> struct radeon_winsys_bo **virtual_buffers;
>>>> uint8_t *virtual_buffer_priorities;
>>>> int *virtual_buffer_hash_table;
>>>> +
>>>> + /* For chips that don't support chaining. */
>>>> + struct radeon_winsys_cs *old_cs_buffers;
>>>> + unsigned num_old_cs_buffers;
>>>> };
>>>>
>>>> static inline struct radv_amdgpu_cs *
>>>> @@ -201,6 +205,12 @@ static void radv_amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
>>>> for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
>>>> cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);
>>>>
>>>> + for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
>>>> + struct radeon_winsys_cs *rcs = &cs->old_cs_buffers[i];
>>>> + free(rcs->buf);
>>>> + }
>>>> +
>>>> + free(cs->old_cs_buffers);
>>>> free(cs->old_ib_buffers);
>>>> free(cs->virtual_buffers);
>>>> free(cs->virtual_buffer_priorities);
>>>> @@ -286,9 +296,46 @@ static void radv_amdgpu_cs_grow(struct radeon_winsys_cs *_cs, size_t min_size)
>>>> /* The total ib size cannot exceed limit_dws dwords. */
>>>> if (ib_dws > limit_dws)
>>>> {
>>>> - cs->failed = true;
>>>> + /* The maximum size in dwords has been reached,
>>>> + * try to allocate a new one.
>>>> + */
>>>> + if (cs->num_old_cs_buffers + 1 >= AMDGPU_CS_MAX_IBS_PER_SUBMIT) {
>>>> + /* TODO: Allow to submit more than 4 IBs. */
>>>> + fprintf(stderr, "amdgpu: Maximum number of IBs "
>>>> + "per submit reached.\n");
>>>> + cs->failed = true;
>>>> + cs->base.cdw = 0;
>>>> + return;
>>>> + }
>>>> +
>>>> + cs->old_cs_buffers =
>>>> + realloc(cs->old_cs_buffers,
>>>> + (cs->num_old_cs_buffers + 1) * sizeof(*cs->old_cs_buffers));
>>>> + if (!cs->old_cs_buffers) {
>>>> + cs->failed = true;
>>>> + cs->base.cdw = 0;
>>>> + return;
>>>> + }
>>>> +
>>>> + /* Store the current one for submitting it later. */
>>>> + cs->old_cs_buffers[cs->num_old_cs_buffers].cdw = cs->base.cdw;
>>>> + cs->old_cs_buffers[cs->num_old_cs_buffers].max_dw = cs->base.max_dw;
>>>> + cs->old_cs_buffers[cs->num_old_cs_buffers].buf = cs->base.buf;
>>>> + cs->num_old_cs_buffers++;
>>>> +
>>>> + /* Reset the cs, it will be re-allocated below. */
>>>> cs->base.cdw = 0;
>>>> - return;
>>>> + cs->base.buf = NULL;
>>>> +
>>>> + /* Re-compute the number of dwords to allocate. */
>>>> + ib_dws = MAX2(cs->base.cdw + min_size,
>>>> + MIN2(cs->base.max_dw * 2, limit_dws));
>>>> + if (ib_dws > limit_dws) {
>>>> + fprintf(stderr, "amdgpu: Too high number of "
>>>> + "dwords to allocate\n");
>>>> + cs->failed = true;
>>>> + return;
>>>> + }
>>>> }
>>>>
>>>> uint32_t *new_buf = realloc(cs->base.buf, ib_dws * 4);
>>>> @@ -400,6 +447,15 @@ static void radv_amdgpu_cs_reset(struct radeon_winsys_cs *_cs)
>>>> cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
>>>> cs->ib_size_ptr = &cs->ib.size;
>>>> cs->ib.size = 0;
>>>> + } else {
>>>> + for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
>>>> + struct radeon_winsys_cs *rcs = &cs->old_cs_buffers[i];
>>>> + free(rcs->buf);
>>>> + }
>>>> +
>>>> + free(cs->old_cs_buffers);
>>>> + cs->old_cs_buffers = NULL;
>>>> + cs->num_old_cs_buffers = 0;
>>>> }
>>>> }
>>>>
>>>> @@ -550,7 +606,8 @@ static void radv_amdgpu_cs_execute_secondary(struct radeon_winsys_cs *_parent,
>>>> static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
>>>> struct radeon_winsys_cs **cs_array,
>>>> unsigned count,
>>>> - struct radv_amdgpu_winsys_bo *extra_bo,
>>>> + struct radv_amdgpu_winsys_bo **extra_bo_array,
>>>> + unsigned num_extra_bo,
>>>> struct radeon_winsys_cs *extra_cs,
>>>> const struct radv_winsys_bo_list *radv_bo_list,
>>>> amdgpu_bo_list_handle *bo_list)
>>>> @@ -580,7 +637,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
>>>> bo_list);
>>>> free(handles);
>>>> pthread_mutex_unlock(&ws->global_bo_list_lock);
>>>> - } else if (count == 1 && !extra_bo && !extra_cs && !radv_bo_list &&
>>>> + } else if (count == 1 && !num_extra_bo && !extra_cs && !radv_bo_list &&
>>>> !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) {
>>>> struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
>>>> if (cs->num_buffers == 0) {
>>>> @@ -590,8 +647,8 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
>>>> r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles,
>>>> cs->priorities, bo_list);
>>>> } else {
>>>> - unsigned total_buffer_count = !!extra_bo;
>>>> - unsigned unique_bo_count = !!extra_bo;
>>>> + unsigned total_buffer_count = num_extra_bo;
>>>> + unsigned unique_bo_count = num_extra_bo;
>>>> for (unsigned i = 0; i < count; ++i) {
>>>> struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[i];
>>>> total_buffer_count += cs->num_buffers;
>>>> @@ -619,9 +676,9 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
>>>> return -ENOMEM;
>>>> }
>>>>
>>>> - if (extra_bo) {
>>>> - handles[0] = extra_bo->bo;
>>>> - priorities[0] = 8;
>>>> + for (unsigned i = 0; i < num_extra_bo; i++) {
>>>> + handles[i] = extra_bo_array[i]->bo;
>>>> + priorities[i] = 8;
>>>> }
>>>>
>>>> for (unsigned i = 0; i < count + !!extra_cs; ++i) {
>>>> @@ -773,7 +830,7 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
>>>> }
>>>> }
>>>>
>>>> - r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs,
>>>> + r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, 0, initial_preamble_cs,
>>>> radv_bo_list, &bo_list);
>>>> if (r) {
>>>> fprintf(stderr, "amdgpu: buffer list creation failed for the "
>>>> @@ -842,7 +899,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
>>>>
>>>> memset(&request, 0, sizeof(request));
>>>>
>>>> - r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL,
>>>> + r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL, 0,
>>>> preamble_cs, radv_bo_list, &bo_list);
>>>> if (r) {
>>>> fprintf(stderr, "amdgpu: buffer list creation failed "
>>>> @@ -923,68 +980,127 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
>>>> assert(cs_count);
>>>>
>>>> for (unsigned i = 0; i < cs_count;) {
>>>> - struct amdgpu_cs_ib_info ib = {0};
>>>> - struct radeon_winsys_bo *bo = NULL;
>>>> + struct amdgpu_cs_ib_info ibs[AMDGPU_CS_MAX_IBS_PER_SUBMIT] = {0};
>>>> + unsigned number_of_ibs = 1;
>>>> + struct radeon_winsys_bo *bos[AMDGPU_CS_MAX_IBS_PER_SUBMIT] = {0};
>>>> struct radeon_winsys_cs *preamble_cs = i ? continue_preamble_cs : initial_preamble_cs;
>>>> + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
>>>> uint32_t *ptr;
>>>> unsigned cnt = 0;
>>>> unsigned size = 0;
>>>> unsigned pad_words = 0;
>>>> - if (preamble_cs)
>>>> - size += preamble_cs->cdw;
>>>>
>>>> - while (i + cnt < cs_count && 0xffff8 - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) {
>>>> - size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw;
>>>> - ++cnt;
>>>> - }
>>>> + if (cs->num_old_cs_buffers > 0) {
>>>> + /* Special path when the maximum size in dwords has
>>>> + * been reached because we need to handle more than one
>>>> + * IB per submit.
>>>> + */
>>>> + unsigned new_cs_count = cs->num_old_cs_buffers + 1;
>>>> + struct radeon_winsys_cs *new_cs_array[4];
>>>> + unsigned idx = 0;
>>>> +
>>>> + for (unsigned j = 0; j < cs->num_old_cs_buffers; j++)
>>>> + new_cs_array[idx++] = &cs->old_cs_buffers[j];
>>>> + new_cs_array[idx++] = cs_array[i];
>>>> +
>>>> + for (unsigned j = 0; j < new_cs_count; j++) {
>>>> + struct radeon_winsys_cs *rcs = new_cs_array[j];
>>>> + bool needs_preamble = preamble_cs && j == 0;
>>>> + unsigned size = 0;
>>>> +
>>>> + if (needs_preamble)
>>>> + size += preamble_cs->cdw;
>>>> + size += rcs->cdw;
>>>> +
>>>> + assert(size < 0xffff8);
>>>> +
>>>> + while(!size || (size & 7)) {
>>>> + size++;
>>>> + pad_words++;
>>>> + }
>>>>
>>>> - while(!size || (size & 7)) {
>>>> - size++;
>>>> - pad_words++;
>>>> - }
>>>> - assert(cnt);
>>>> + bos[j] = ws->buffer_create(ws, 4 * size, 4096,
>>>> + RADEON_DOMAIN_GTT,
>>>> + RADEON_FLAG_CPU_ACCESS |
>>>> + RADEON_FLAG_NO_INTERPROCESS_SHARING |
>>>> + RADEON_FLAG_READ_ONLY);
>>>> + ptr = ws->buffer_map(bos[j]);
>>>>
>>>> - bo = ws->buffer_create(ws, 4 * size, 4096, RADEON_DOMAIN_GTT,
>>>> - RADEON_FLAG_CPU_ACCESS |
>>>> - RADEON_FLAG_NO_INTERPROCESS_SHARING |
>>>> - RADEON_FLAG_READ_ONLY);
>>>> - ptr = ws->buffer_map(bo);
>>>> + if (needs_preamble) {
>>>> + memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
>>>> + ptr += preamble_cs->cdw;
>>>> + }
>>>>
>>>> - if (preamble_cs) {
>>>> - memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
>>>> - ptr += preamble_cs->cdw;
>>>> - }
>>>> + memcpy(ptr, rcs->buf, 4 * rcs->cdw);
>>>> + ptr += rcs->cdw;
>>>>
>>>> - for (unsigned j = 0; j < cnt; ++j) {
>>>> - struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]);
>>>> - memcpy(ptr, cs->base.buf, 4 * cs->base.cdw);
>>>> - ptr += cs->base.cdw;
>>>> + for (unsigned k = 0; k < pad_words; ++k)
>>>> + *ptr++ = pad_word;
>>>>
>>>> - }
>>>> + ibs[j].size = size;
>>>> + ibs[j].ib_mc_address = radv_buffer_get_va(bos[j]);
>>>> + }
>>>>
>>>> - for (unsigned j = 0; j < pad_words; ++j)
>>>> - *ptr++ = pad_word;
>>>> + number_of_ibs = new_cs_count;
>>>> + cnt++;
>>>> + } else {
>>>> + if (preamble_cs)
>>>> + size += preamble_cs->cdw;
>>>>
>>>> - memset(&request, 0, sizeof(request));
>>>> + while (i + cnt < cs_count && 0xffff8 - size >= radv_amdgpu_cs(cs_array[i + cnt])->base.cdw) {
>>>> + size += radv_amdgpu_cs(cs_array[i + cnt])->base.cdw;
>>>> + ++cnt;
>>>> + }
>>>> +
>>>> + while (!size || (size & 7)) {
>>>> + size++;
>>>> + pad_words++;
>>>> + }
>>>> + assert(cnt);
>>>>
>>>> + bos[0] = ws->buffer_create(ws, 4 * size, 4096,
>>>> + RADEON_DOMAIN_GTT,
>>>> + RADEON_FLAG_CPU_ACCESS |
>>>> + RADEON_FLAG_NO_INTERPROCESS_SHARING |
>>>> + RADEON_FLAG_READ_ONLY);
>>>> + ptr = ws->buffer_map(bos[0]);
>>>> +
>>>> + if (preamble_cs) {
>>>> + memcpy(ptr, preamble_cs->buf, preamble_cs->cdw * 4);
>>>> + ptr += preamble_cs->cdw;
>>>> + }
>>>> +
>>>> + for (unsigned j = 0; j < cnt; ++j) {
>>>> + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]);
>>>> + memcpy(ptr, cs->base.buf, 4 * cs->base.cdw);
>>>> + ptr += cs->base.cdw;
>>>> +
>>>> + }
>>>> +
>>>> + for (unsigned j = 0; j < pad_words; ++j)
>>>> + *ptr++ = pad_word;
>>>> +
>>>> + ibs[0].size = size;
>>>> + ibs[0].ib_mc_address = radv_buffer_get_va(bos[0]);
>>>> + }
>>>>
>>>> r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt,
>>>> - (struct radv_amdgpu_winsys_bo*)bo,
>>>> - preamble_cs, radv_bo_list, &bo_list);
>>>> + (struct radv_amdgpu_winsys_bo **)bos,
>>>> + number_of_ibs, preamble_cs,
>>>> + radv_bo_list, &bo_list);
>>>> if (r) {
>>>> fprintf(stderr, "amdgpu: buffer list creation failed "
>>>> "for the sysmem submission (%d)\n", r);
>>>> return r;
>>>> }
>>>>
>>>> - ib.size = size;
>>>> - ib.ib_mc_address = radv_buffer_get_va(bo);
>>>> + memset(&request, 0, sizeof(request));
>>>>
>>>> request.ip_type = cs0->hw_ip;
>>>> request.ring = queue_idx;
>>>> request.resources = bo_list;
>>>> - request.number_of_ibs = 1;
>>>> - request.ibs = &ib;
>>>> + request.number_of_ibs = number_of_ibs;
>>>> + request.ibs = ibs;
>>>> request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx);
>>>>
>>>> sem_info->cs_emit_signal = (i == cs_count - cnt) ? emit_signal_sem : false;
>>>> @@ -1000,9 +1116,11 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
>>>> if (bo_list)
>>>> amdgpu_bo_list_destroy(bo_list);
>>>>
>>>> - ws->buffer_destroy(bo);
>>>> - if (r)
>>>> - return r;
>>>> + for (unsigned j = 0; j < number_of_ibs; j++) {
>>>> + ws->buffer_destroy(bos[j]);
>>>> + if (r)
>>>> + return r;
>>>> + }
>>>>
>>>> i += cnt;
>>>> }
>>>> --
>>>> 2.17.0
>>>>
>>>> _______________________________________________
>>>> mesa-dev mailing list
>>>> mesa-dev at lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>
>> _______________________________________________
>> mesa-stable mailing list
>> mesa-stable at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-stable
More information about the mesa-stable
mailing list