[Freedreno] [PATCH] drm/msm: Optimize GPU crashstate capture read path
Sharat Masetty
smasetty at codeaurora.org
Fri Nov 2 09:42:51 UTC 2018
Thanks for the comments Jordan -
On 11/1/2018 8:34 PM, Jordan Crouse wrote:
> On Thu, Nov 01, 2018 at 02:05:41PM +0530, Sharat Masetty wrote:
>> When the userspace tries to read the crashstate dump, the read side
>> implementation in the driver currently ascii85 encodes all the binary
>> buffers and it does this each time the read system call is called.
>> A userspace tool like cat typically does a page by page read and the
>> number of read calls depends on the size of the data captured by the
>> driver. This is certainly not desirable and does not scale well with
>> large captures.
>>
>> This patch starts off with moving the encoding part to the capture time,
>> that is when the GPU tries to recover after a hang. Now we only encode
>> once per buffer object and not per page read. With this there is an
>> immediate >10X speed improvement in crashstate save time.
>>
>> The flip side however is that the GPU recovery time increases and this
>> negatively impacts the user experience, so this is handled by moving the
>> encoding part to a worker thread and only register with the dev_coredump
>> once the encoding of the buffers is complete.
>
> Does your tree have https://patchwork.freedesktop.org/patch/257181/ ? That will
> help with a lot of the problems you have described.
The issue is that even with small sized files like ~5MB or so, the
capture time is huge and sometime does not survive the dev_coredump
timeout of 5 minutes. With this change however I am able to dump large
file sizes like 60MB or so in reasonable amount of time (~50 seconds).
As for the patch that you shared, I am thinking that we should drop it
and instead go with this or whatever we agree on. The patch loses stuff
like IB2's which might be important while debugging hang issues.
>
>> Signed-off-by: Sharat Masetty <smasetty at codeaurora.org>
>> ---
>> drivers/gpu/drm/msm/adreno/adreno_gpu.c | 35 +++----------
>> drivers/gpu/drm/msm/msm_gpu.c | 93 +++++++++++++++++++++++++++++++--
>> drivers/gpu/drm/msm/msm_gpu.h | 1 +
>> 3 files changed, 98 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> index 141062f..7441571 100644
>> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> @@ -406,7 +406,7 @@ int adreno_gpu_state_get(struct msm_gpu *gpu, struct msm_gpu_state *state)
>> size = j + 1;
>>
>> if (size) {
>> - state->ring[i].data = kmalloc(size << 2, GFP_KERNEL);
>> + state->ring[i].data = kvmalloc(size << 2, GFP_KERNEL);
>> if (state->ring[i].data) {
>> memcpy(state->ring[i].data, gpu->rb[i]->start, size << 2);
>> state->ring[i].data_size = size << 2;
>
> This is a good fix, but unrelated to this change as detailed above. Should be
> separate.
>
>> @@ -445,7 +445,7 @@ void adreno_gpu_state_destroy(struct msm_gpu_state *state)
>> int i;
>>
>> for (i = 0; i < ARRAY_SIZE(state->ring); i++)
>> - kfree(state->ring[i].data);
>> + kvfree(state->ring[i].data);
>
> As above.
>
>>
>> for (i = 0; state->bos && i < state->nr_bos; i++)
>> kvfree(state->bos[i].data);
>> @@ -475,34 +475,15 @@ int adreno_gpu_state_put(struct msm_gpu_state *state)
>>
>> #if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
>>
>> -static void adreno_show_object(struct drm_printer *p, u32 *ptr, int len)
>> +static void adreno_show_object(struct drm_printer *p, void *ptr)
>> {
>> - char out[ASCII85_BUFSZ];
>> - long l, datalen, i;
>> -
>> - if (!ptr || !len)
>> - return;
>> -
>> - /*
>> - * Only dump the non-zero part of the buffer - rarely will any data
>> - * completely fill the entire allocated size of the buffer
>> - */
>> - for (datalen = 0, i = 0; i < len >> 2; i++) {
>> - if (ptr[i])
>> - datalen = (i << 2) + 1;
>> - }
>> -
>> - /* Skip printing the object if it is empty */
>> - if (datalen == 0)
>> + if (!ptr)
>> return;
>>
>> - l = ascii85_encode_len(datalen);
>> -
>> drm_puts(p, " data: !!ascii85 |\n");
>> drm_puts(p, " ");
>>
>> - for (i = 0; i < l; i++)
>> - drm_puts(p, ascii85_encode(ptr[i], out));
>> + drm_puts(p, ptr);
>>
>> drm_puts(p, "\n");
>> }
>> @@ -534,8 +515,7 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>> drm_printf(p, " wptr: %d\n", state->ring[i].wptr);
>> drm_printf(p, " size: %d\n", MSM_GPU_RINGBUFFER_SZ);
>>
>> - adreno_show_object(p, state->ring[i].data,
>> - state->ring[i].data_size);
>> + adreno_show_object(p, state->ring[i].data);
>> }
>>
>> if (state->bos) {
>> @@ -546,8 +526,7 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>> state->bos[i].iova);
>> drm_printf(p, " size: %zd\n", state->bos[i].size);
>>
>> - adreno_show_object(p, state->bos[i].data,
>> - state->bos[i].size);
>> + adreno_show_object(p, state->bos[i].data);
>> }
>> }
>>
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>> index ff95848..d5533f1 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>> @@ -21,6 +21,7 @@
>> #include "msm_fence.h"
>>
>> #include <generated/utsrelease.h>
>> +#include <linux/ascii85.h>
>> #include <linux/string_helpers.h>
>> #include <linux/pm_opp.h>
>> #include <linux/devfreq.h>
>> @@ -375,9 +376,7 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
>> /* Set the active crash state to be dumped on failure */
>> gpu->crashstate = state;
>>
>> - /* FIXME: Release the crashstate if this errors out? */
>> - dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
>> - msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
>> + schedule_work(&gpu->crashstate_work);
>
> I'm super hesitant to ever add anything having to do with scheduling because we
> already abuse it so badly but this seems like it might be okay since it isn't
> high priority and not related to timing. I would be curious what others think.
>
>> }
>> #else
>> static void msm_gpu_crashstate_capture(struct msm_gpu *gpu,
>> @@ -420,6 +419,93 @@ static void update_fences(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
>>
>> static void retire_submits(struct msm_gpu *gpu);
>>
>> +static char *msm_gpu_ascii85_encode_buf(u32 *src, size_t len)
>> +{
>> + void *buf;
>> + size_t buf_itr = 0, buffer_size;
>> + char out[ASCII85_BUFSZ];
>> + long l;
>> + int i;
>> +
>> + if (!src || !len)
>> + return NULL;
>
> I don't think src() will ever legitimately be NULL so you can remove the check.
>
>> + l = ascii85_encode_len(len);
>> +
>> + /*
>> + * Ascii85 outputs either a 5 byte string or a 1 byte string. So we
>> + * account for the worst case of 5 bytes per dword plus the 1 for '\0'
>> + */
>> + buffer_size = (l * 5) + 1;
>
> buffer_size is't needed - you can do the math inline in the kvmalloc() function.
>
>> +
>> + buf = kvmalloc(buffer_size, GFP_KERNEL);
>> + if (!buf)
>> + return NULL;
>> +
>> + for (i = 0; i < l; i++)
>> + buf_itr += snprintf(buf + buf_itr, buffer_size - buf_itr, "%s",
>
> This specific algorithm should use scnprintf() here but on a broader note
> sprintf() and friends is a bad choice if performance is important because it
> jumps through more hoops to eventually do what amounts to a mempcy (this is the
> reason why seq_printf() and seq_puts() are different things and checkpatch
> insists that you use seq_puts() for constant strings).
>
> Inf you are planning on outputting into a buffer you should just make a custom
> ascii85 function to output directly and avoid copying in the first place.
>
> If you wanted to get fancy you could add it to include/linux/ascii85.h -
> ascii85_encode_to_buf() or some such.
>
Ah! This is a good point - I will try out something on these lines.
>> + ascii85_encode(src[i], out));
>> +
>> + return buf;
>> +}
>> +
>> +/*
>> + * Convert the binary data in the gpu crash state capture(like bos and
>> + * ringbuffer data) to ascii85 encoded strings. Note that the encoded
>> + * string is NULL terminated.
>
> This is technically not true because you are specifically accounting for a NULL
> terminator in the function above.
>
>> + */
>> +static void crashstate_worker(struct work_struct *work)
>> +{
>> + struct msm_gpu_state *state;
>> + struct msm_gpu *gpu = container_of(work, struct msm_gpu,
>> + crashstate_work);
>> + int i;
>> +
>> + state = msm_gpu_crashstate_get(gpu);
>> + if (!state)
>> + return;
>> +
>> + for (i = 0; i < MSM_GPU_MAX_RINGS; i++) {
>> + void *buf = NULL;
>
> You don't needed to initialize buf.
>
>> +
>> + if (!state->ring[i].data)
>> + continue;
>> +
>
> A comment here describing the variable switch-a-roo you are doing would be
> helpful.
>
>> + buf = state->ring[i].data;
>> +
>> + state->ring[i].data = msm_gpu_ascii85_encode_buf(buf,
>> + state->ring[i].data_size);
>> + kvfree(buf);
>> + }
>> +
>> + for (i = 0; i < state->nr_bos; i++) {
>> + u32 *buf = NULL;
>
> buf does not need to be initialized.
>
>> + long datalen, j;
>> +
>> + if (!state->bos[i].data)
>> + continue;
>> +
>> + buf = state->bos[i].data;
>> +
>> + /*
>> + * Only dump the non-zero part of the buffer - rarely will
>> + * any data completely fill the entire allocated size of the
>> + * buffer
>> + */
>> + for (datalen = 0, j = 0; j < state->bos[i].size >> 2; j++)
>> + if (buf[j])
>> + datalen = ((j + 1) << 2);
>> +
>> + state->bos[i].data = msm_gpu_ascii85_encode_buf(buf, datalen);
>> + kvfree(buf);
>> + }
>> +
>> + msm_gpu_crashstate_put(gpu);
>> +
>> + dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
>> + msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
>> +}
>> +
>> static void recover_worker(struct work_struct *work)
>> {
>> struct msm_gpu *gpu = container_of(work, struct msm_gpu, recover_work);
>> @@ -856,6 +942,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>> INIT_LIST_HEAD(&gpu->active_list);
>> INIT_WORK(&gpu->retire_work, retire_worker);
>> INIT_WORK(&gpu->recover_work, recover_worker);
>> + INIT_WORK(&gpu->crashstate_work, crashstate_worker);
>>
>>
>> timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
>> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
>> index f82bac0..8825069 100644
>> --- a/drivers/gpu/drm/msm/msm_gpu.h
>> +++ b/drivers/gpu/drm/msm/msm_gpu.h
>> @@ -137,6 +137,7 @@ struct msm_gpu {
>> } devfreq;
>>
>> struct msm_gpu_state *crashstate;
>> + struct work_struct crashstate_work;
>> };
>>
>> /* It turns out that all targets use the same ringbuffer size */
>> --
>> 1.9.1
>>
>
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project
More information about the dri-devel
mailing list