[Freedreno] [PATCH 06/10] drm/msm/gpu: Capture the GPU state on a GPU hang

Wed May 23 11:20:15 UTC 2018


On 4/18/2018 4:14 AM, Jordan Crouse wrote:
> Capture the GPU state on a GPU hang and store it for later playback
> via the devcoredump facility. Only one crash state is stored at a
> time on the assumption that the first hang is usually the most
> interesting. The existing crash state can be cleared after capturing
> it and then a new one will be captured on the next hang.
> 
> Signed-off-by: Jordan Crouse <jcrouse at codeaurora.org>
> ---
>   drivers/gpu/drm/msm/Kconfig             |  1 +
>   drivers/gpu/drm/msm/adreno/a3xx_gpu.c   |  2 +-
>   drivers/gpu/drm/msm/adreno/a4xx_gpu.c   |  2 +-
>   drivers/gpu/drm/msm/adreno/a5xx_gpu.c   |  4 +-
>   drivers/gpu/drm/msm/adreno/adreno_gpu.c | 36 +++++++----
>   drivers/gpu/drm/msm/adreno/adreno_gpu.h |  4 +-
>   drivers/gpu/drm/msm/msm_debugfs.c       |  5 +-
>   drivers/gpu/drm/msm/msm_gpu.c           | 83 ++++++++++++++++++++++++-
>   drivers/gpu/drm/msm/msm_gpu.h           | 38 ++++++++++-
>   9 files changed, 153 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/Kconfig b/drivers/gpu/drm/msm/Kconfig
> index 38cbde971b48..843a9d40c05e 100644
> --- a/drivers/gpu/drm/msm/Kconfig
> +++ b/drivers/gpu/drm/msm/Kconfig
> @@ -12,6 +12,7 @@ config DRM_MSM
>   	select SHMEM
>   	select TMPFS
>   	select QCOM_SCM
> +	select WANT_DEV_COREDUMP
>   	select SND_SOC_HDMI_CODEC if SND_SOC
>   	select SYNC_FILE
>   	select PM_OPP
> diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
> index 4cffec2b6adc..fc502e412132 100644
> --- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
> @@ -454,7 +454,7 @@ static const struct adreno_gpu_funcs funcs = {
>   		.active_ring = adreno_active_ring,
>   		.irq = a3xx_irq,
>   		.destroy = a3xx_destroy,
> -#ifdef CONFIG_DEBUG_FS
> +#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
>   		.show = adreno_show,
>   #endif
>   		.gpu_state_get = a3xx_gpu_state_get,
> diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
> index 95f08c22e8d7..8129cf037db1 100644
> --- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
> @@ -540,7 +540,7 @@ static const struct adreno_gpu_funcs funcs = {
>   		.active_ring = adreno_active_ring,
>   		.irq = a4xx_irq,
>   		.destroy = a4xx_destroy,
> -#ifdef CONFIG_DEBUG_FS
> +#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
>   		.show = adreno_show,
>   #endif
>   		.gpu_state_get = a4xx_gpu_state_get,
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> index b0910bbe5190..836a1df1f257 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> @@ -1243,8 +1243,10 @@ static const struct adreno_gpu_funcs funcs = {
>   		.active_ring = a5xx_active_ring,
>   		.irq = a5xx_irq,
>   		.destroy = a5xx_destroy,
> -#ifdef CONFIG_DEBUG_FS
> +#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
>   		.show = adreno_show,
> +#endif
> +#if defined(CONFIG_DEBUG_FS)
>   		.debugfs_init = a5xx_debugfs_init,
>   #endif
>   		.gpu_busy = a5xx_gpu_busy,
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> index 522d47ac36e1..d46ae2ede616 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> @@ -378,6 +378,8 @@ struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu)
>   	if (!state)
>   		return ERR_PTR(-ENOMEM);
>   
> +	kref_init(&state->ref);
> +
>   	do_gettimeofday(&state->time);
>   
>   	for (i = 0; i < gpu->nr_rings; i++) {
> @@ -413,18 +415,28 @@ struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu)
>   	return state;
>   }
>   
> -void adreno_gpu_state_put(struct msm_gpu_state *state)
> +static void adreno_gpu_state_destroy(struct kref *kref)
>   {
> -	if (IS_ERR_OR_NULL(state))
> -		return;
> +	struct msm_gpu_state *state = container_of(kref,
> +		struct msm_gpu_state, ref);
>   
> +	kfree(state->comm);
> +	kfree(state->cmd);
>   	kfree(state->registers);
>   	kfree(state);
>   }
>   
> -#ifdef CONFIG_DEBUG_FS
> +int adreno_gpu_state_put(struct msm_gpu_state *state)
> +{
> +	if (IS_ERR_OR_NULL(state))
> +		return 1;
> +
> +	return kref_put(&state->ref, adreno_gpu_state_destroy);
> +}
> +
> +#if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
>   void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
> -		struct seq_file *m)
> +		struct drm_printer *p)
>   {
>   	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>   	int i;
> @@ -432,23 +444,23 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>   	if (IS_ERR_OR_NULL(state))
>   		return;
>   
> -	seq_printf(m, "status:   %08x\n", state->rbbm_status);
> -	seq_printf(m, "revision: %d (%d.%d.%d.%d)\n",
> +	drm_printf(p, "status:   %08x\n", state->rbbm_status);
> +	drm_printf(p, "revision: %d (%d.%d.%d.%d)\n",
>   			adreno_gpu->info->revn, adreno_gpu->rev.core,
>   			adreno_gpu->rev.major, adreno_gpu->rev.minor,
>   			adreno_gpu->rev.patchid);
>   
>   	for (i = 0; i < gpu->nr_rings; i++) {
> -		seq_printf(m, "rb %d: fence:    %d/%d\n", i,
> +		drm_printf(p, "rb %d: fence:    %d/%d\n", i,
>   			state->ring[i].fence, state->ring[i].seqno);
>   
> -		seq_printf(m, "      rptr:     %d\n", state->ring[i].rptr);
> -		seq_printf(m, "rb wptr:  %d\n", state->ring[i].wptr);
> +		drm_printf(p, "      rptr:     %d\n", state->ring[i].rptr);
> +		drm_printf(p, "rb wptr:  %d\n", state->ring[i].wptr);
>   	}
>   
> -	seq_printf(m, "IO:region %s 00000000 00020000\n", gpu->name);
> +	drm_printf(p, "IO:region %s 00000000 00020000\n", gpu->name);
>   	for (i = 0; i < state->nr_registers; i++) {
> -		seq_printf(m, "IO:R %08x %08x\n",
> +		drm_printf(p, "IO:R %08x %08x\n",
>   			state->registers[i * 2] << 2,
>   			state->registers[(i * 2) + 1]);
>   	}
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> index 815ae98c7fd1..077bf1149c0b 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
> @@ -216,7 +216,7 @@ void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
>   bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
>   #ifdef CONFIG_DEBUG_FS
>   void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
> -		struct seq_file *m);
> +		struct drm_printer *p);
>   #endif
>   void adreno_dump_info(struct msm_gpu *gpu);
>   void adreno_dump(struct msm_gpu *gpu);
> @@ -229,7 +229,7 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>   void adreno_gpu_cleanup(struct adreno_gpu *gpu);
>   
>   struct msm_gpu_state *adreno_gpu_state_get(struct msm_gpu *gpu);
> -void adreno_gpu_state_put(struct msm_gpu_state *state);
> +int adreno_gpu_state_put(struct msm_gpu_state *state);
>   
>   /* ringbuffer helpers (the parts that are adreno specific) */
>   
> diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c
> index ea20eb0ad747..9ae9e0a12b3a 100644
> --- a/drivers/gpu/drm/msm/msm_debugfs.c
> +++ b/drivers/gpu/drm/msm/msm_debugfs.c
> @@ -29,6 +29,7 @@ struct msm_gpu_show_priv {
>   
>   static int msm_gpu_show(struct seq_file *m, void *arg)
>   {
> +	struct drm_printer p = drm_seq_file_printer(m);
>   	struct msm_gpu_show_priv *show_priv = m->private;
>   	struct msm_drm_private *priv = show_priv->dev->dev_private;
>   	struct msm_gpu *gpu = priv->gpu;
> @@ -38,8 +39,8 @@ static int msm_gpu_show(struct seq_file *m, void *arg)
>   	if (ret)
>   		return ret;
>   
> -	seq_printf(m, "%s Status:\n", gpu->name);
> -	gpu->funcs->show(gpu, show_priv->state, m);
> +	drm_printf(&p, "%s Status:\n", gpu->name);
> +	gpu->funcs->show(gpu, show_priv->state, &p);
>   
>   	mutex_unlock(&show_priv->dev->struct_mutex);
>   
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index 2ca354047250..f36b415e123b 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -20,10 +20,11 @@
>   #include "msm_mmu.h"
>   #include "msm_fence.h"
>   
> +#include <generated/utsrelease.h>
>   #include <linux/string_helpers.h>
>   #include <linux/pm_opp.h>
>   #include <linux/devfreq.h>
> -
> +#include <linux/devcoredump.h>
>   
>   /*
>    * Power Management:
> @@ -273,6 +274,83 @@ int msm_gpu_hw_init(struct msm_gpu *gpu)
>   	return ret;
>   }
>   
> +#ifdef CONFIG_DEV_COREDUMP
> +static ssize_t msm_gpu_devcoredump_read(char *buffer, loff_t offset,
> +		size_t count, void *data, size_t datalen)
> +{
> +	struct msm_gpu *gpu = data;
> +	struct drm_print_iterator iter;
> +	struct drm_printer p;
> +	struct msm_gpu_state *state;
> +
> +	state = msm_gpu_crashstate_get(gpu);
> +	if (!state)
> +		return 0;
> +
> +	iter.data = buffer;
> +	iter.offset = 0;
> +	iter.start = offset;
> +	iter.remain = count;

I think I understood how this works. So this read could get called many 
times before we are done reading and every time we start from scratch.
Why don't we copy everything to a buffer before hand(right when we 
capture the crash state) and then use the buffer here for copying to the 
user? We do get the offset to the buffer as input. This will probably 
save us from starting from the first print every time?
> +
> +	p = drm_coredump_printer(&iter);
> +
> +	drm_printf(&p, "---\n");
> +	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
> +	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
> +	drm_printf(&p, "time: %ld.%ld\n",
> +		state->time.tv_sec, state->time.tv_usec);
> +	if (state->comm)
> +		drm_printf(&p, "comm: %s\n", state->comm);
> +	if (state->cmd)
> +		drm_printf(&p, "cmdline: %s\n", state->cmd);
> +
> +	gpu->funcs->show(gpu, state, &p);
> +
> +	msm_gpu_crashstate_put(gpu);
> +
> +	return count - iter.remain;
> +}
> +
> +static void msm_gpu_devcoredump_free(void *data)
> +{
> +	struct msm_gpu *gpu = data;
> +
> +	msm_gpu_crashstate_put(gpu);
> +}
> +
> +static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, char *comm,
> +		char *cmd)
> +{
> +	struct msm_gpu_state *state;
> +
> +	/* Only save one crash state at a a time */
remove extra 'a'
> +	if (gpu->crashstate)
> +		return;
> +
> +	state = gpu->funcs->gpu_state_get(gpu);
GPU might enter slumber by this time, so we may have to get/put sync run 
time pm ops here?
> +	if (IS_ERR_OR_NULL(state))
> +		return;
> +
> +	/* Fill in the additional crash state information */
> +	state->comm = kstrdup(comm, GFP_KERNEL);
> +	state->cmd = kstrdup(cmd, GFP_KERNEL);
> +
> +	kref_init(&state->ref);
This is not needed, adreno_gpu_state_get() takes care of this
> +
> +	/* Set the active crash state to be dumped on failure */
> +	gpu->crashstate = state;
> +
> +	/* FIXME: Release the crashstate if this errors out? */
> +	dev_coredumpm(gpu->dev->dev, THIS_MODULE, gpu, 0, GFP_KERNEL,
> +		msm_gpu_devcoredump_read, msm_gpu_devcoredump_free);
> +}
> +#else
> +static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, char *comm,
> +		char *cmd)
> +{
> +}
> +#endif
> +
>   /*
>    * Hangcheck detection for locked gpu:
>    */
> @@ -356,6 +434,9 @@ static void recover_worker(struct work_struct *work)
>   			msm_rd_dump_submit(priv->hangrd, submit, NULL);
>   	}
>   
> +	/* Record the crash state */
> +	msm_gpu_crashstate_capture(gpu, comm, cmd);
> +
>   	kfree(cmd);
>   	kfree(comm);
>   
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index 470f3bb5f834..e65f507954c0 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -66,13 +66,13 @@ struct msm_gpu_funcs {
>   #ifdef CONFIG_DEBUG_FS
>   	/* show GPU status in debugfs: */
>   	void (*show)(struct msm_gpu *gpu, struct msm_gpu_state *state,
> -			struct seq_file *m);
> +			struct drm_printer *p);
>   	/* for generation specific debugfs: */
>   	int (*debugfs_init)(struct msm_gpu *gpu, struct drm_minor *minor);
>   #endif
>   	int (*gpu_busy)(struct msm_gpu *gpu, uint64_t *value);
>   	struct msm_gpu_state *(*gpu_state_get)(struct msm_gpu *gpu);
> -	void (*gpu_state_put)(struct msm_gpu_state *state);
> +	int (*gpu_state_put)(struct msm_gpu_state *state);
>   };
>   
>   struct msm_gpu {
> @@ -133,6 +133,8 @@ struct msm_gpu {
>   		u64 busy_cycles;
>   		ktime_t time;
>   	} devfreq;
> +
> +	struct msm_gpu_state *crashstate;
>   };
>   
>   /* It turns out that all targets use the same ringbuffer size */
> @@ -180,6 +182,7 @@ struct msm_gpu_submitqueue {
>   };
>   
>   struct msm_gpu_state {
> +	struct kref ref;
>   	struct timeval time;
>   
>   	struct {
> @@ -193,6 +196,9 @@ struct msm_gpu_state {
>   	u32 *registers;
>   
>   	u32 rbbm_status;
> +
> +	char *comm;
> +	char *cmd;
>   };
>   
>   static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
> @@ -274,4 +280,32 @@ static inline void msm_submitqueue_put(struct msm_gpu_submitqueue *queue)
>   		kref_put(&queue->ref, msm_submitqueue_destroy);
>   }
>   
> +static inline struct msm_gpu_state *msm_gpu_crashstate_get(struct msm_gpu *gpu)
> +{
> +	struct msm_gpu_state *state = NULL;
> +
> +	mutex_lock(&gpu->dev->struct_mutex);
> +
> +	if (gpu->crashstate) {
> +		kref_get(&gpu->crashstate->ref);
> +		state = gpu->crashstate;
> +	}
> +
> +	mutex_unlock(&gpu->dev->struct_mutex);
> +
> +	return state;
> +}
> +
> +static inline void msm_gpu_crashstate_put(struct msm_gpu *gpu)
> +{
> +	mutex_lock(&gpu->dev->struct_mutex);
> +
> +	if (gpu->crashstate) {
> +		if (gpu->funcs->gpu_state_put(gpu->crashstate))
> +			gpu->crashstate = NULL;
> +	}
> +
> +	mutex_unlock(&gpu->dev->struct_mutex);
> +}
> +
>   #endif /* __MSM_GPU_H__ */
> 

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project