[PATCH 1/3] accel/qaic: Add bootlog debugfs

Jeffrey Hugo quic_jhugo at quicinc.com
Fri Mar 15 15:39:47 UTC 2024


On 3/14/2024 5:41 AM, Jacek Lawrynowicz wrote:
> Hi,
> 
> On 11.03.2024 17:58, Jeffrey Hugo wrote:
>> During the boot process of AIC100, the bootloaders (PBL and SBL) log
>> messages to device RAM. During SBL, if the host opens the QAIC_LOGGING
>> channel, SBL will offload the contents of the log buffer to the host,
>> and stream any new messages that SBL logs.
>>
>> This log of the boot process can be very useful for an initial triage of
>> any boot related issues. For example, if SBL rejects one of the runtime
>> firmware images for a validation failure, SBL will log a reason why.
>>
>> Add the ability of the driver to open the logging channel, receive the
>> messages, and store them. Also define a debugfs entry called "bootlog"
>> by hooking into the DRM debugfs framework. When the bootlog debugfs
>> entry is read, the current contents of the log that the host is caching
>> is displayed to the user. The driver will retain the cache until it
>> detects that the device has rebooted.  At that point, the cache will be
>> freed, and the driver will wait for a new log. With this scheme, the
>> driver will only have a cache of the log from the current device boot.
>> Note that if the driver initializes a device and it is already in the
>> runtime state (QSM), no bootlog will be available through this mechanism
>> because the driver and SBL have not communicated.
>>
>> Signed-off-by: Jeffrey Hugo <quic_jhugo at quicinc.com>
>> Reviewed-by: Carl Vanderlip <quic_carlv at quicinc.com>
>> Reviewed-by: Pranjal Ramajor Asha Kanojiya <quic_pkanojiy at quicinc.com>
>> ---
>>   drivers/accel/qaic/Makefile       |   2 +
>>   drivers/accel/qaic/qaic.h         |   8 +
>>   drivers/accel/qaic/qaic_debugfs.c | 271 ++++++++++++++++++++++++++++++
>>   drivers/accel/qaic/qaic_debugfs.h |  20 +++
>>   drivers/accel/qaic/qaic_drv.c     |  16 +-
>>   5 files changed, 316 insertions(+), 1 deletion(-)
>>   create mode 100644 drivers/accel/qaic/qaic_debugfs.c
>>   create mode 100644 drivers/accel/qaic/qaic_debugfs.h
>>
>> diff --git a/drivers/accel/qaic/Makefile b/drivers/accel/qaic/Makefile
>> index 3f7f6dfde7f2..2cadcc1baa0e 100644
>> --- a/drivers/accel/qaic/Makefile
>> +++ b/drivers/accel/qaic/Makefile
>> @@ -11,3 +11,5 @@ qaic-y := \
>>   	qaic_data.o \
>>   	qaic_drv.o \
>>   	qaic_timesync.o
>> +
>> +qaic-$(CONFIG_DEBUG_FS) += qaic_debugfs.o
>> diff --git a/drivers/accel/qaic/qaic.h b/drivers/accel/qaic/qaic.h
>> index 9256653b3036..03d9c9fbffb3 100644
>> --- a/drivers/accel/qaic/qaic.h
>> +++ b/drivers/accel/qaic/qaic.h
>> @@ -153,6 +153,14 @@ struct qaic_device {
>>   	struct mhi_device	*qts_ch;
>>   	/* Work queue for tasks related to MHI "QAIC_TIMESYNC" channel */
>>   	struct workqueue_struct	*qts_wq;
>> +	/* Head of list of page allocated by MHI bootlog device */
>> +	struct list_head        bootlog;
>> +	/* MHI bootlog channel device */
>> +	struct mhi_device       *bootlog_ch;
>> +	/* Work queue for tasks related to MHI bootlog device */
>> +	struct workqueue_struct *bootlog_wq;
>> +	/* Synchronizes access of pages in MHI bootlog device */
>> +	struct mutex            bootlog_mutex;
>>   };
>>   
>>   struct qaic_drm_device {
>> diff --git a/drivers/accel/qaic/qaic_debugfs.c b/drivers/accel/qaic/qaic_debugfs.c
>> new file mode 100644
>> index 000000000000..4f87fe29be1a
>> --- /dev/null
>> +++ b/drivers/accel/qaic/qaic_debugfs.c
>> @@ -0,0 +1,271 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +
>> +/* Copyright (c) 2020, The Linux Foundation. All rights reserved. */
>> +/* Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved. */
>> +
>> +#include <linux/debugfs.h>
>> +#include <linux/device.h>
>> +#include <linux/fs.h>
>> +#include <linux/list.h>
>> +#include <linux/mhi.h>
>> +#include <linux/mutex.h>
>> +#include <linux/pci.h>
>> +#include <linux/seq_file.h>
>> +#include <linux/string.h>
>> +#include <linux/types.h>
>> +#include <linux/workqueue.h>
>> +
>> +#include "qaic.h"
>> +#include "qaic_debugfs.h"
>> +
>> +#define BOOTLOG_POOL_SIZE		16
>> +#define BOOTLOG_MSG_SIZE		512
>> +
>> +struct bootlog_msg {
>> +	/* Buffer for bootlog messages */
>> +	char str[BOOTLOG_MSG_SIZE];
>> +	/* Root struct of device, used to access device resources */
>> +	struct qaic_device *qdev;
>> +	/* Work struct to schedule work coming on QAIC_LOGGING channel */
>> +	struct work_struct work;
>> +};
>> +
>> +struct bootlog_page {
>> +	/* Node in list of bootlog pages maintained by root device struct */
>> +	struct list_head node;
>> +	/* Total size of the buffer that holds the bootlogs. It is PAGE_SIZE */
>> +	unsigned int size;
>> +	/* Offset for the next bootlog */
>> +	unsigned int offset;
>> +};
>> +
>> +static int bootlog_show(struct seq_file *s, void *unused)
>> +{
>> +	struct bootlog_page *page;
>> +	struct qaic_device *qdev;
>> +	void *page_end;
>> +	void *log;
>> +
>> +	qdev = s->private;
>> +	mutex_lock(&qdev->bootlog_mutex);
>> +	list_for_each_entry(page, &qdev->bootlog, node) {
>> +		log = page + 1;
>> +		page_end = (void *)page + page->offset;
>> +		while (log < page_end) {
>> +			seq_printf(s, "%s", (char *)log);
>> +			log += strlen(log) + 1;
>> +		}
>> +	}
>> +	mutex_unlock(&qdev->bootlog_mutex);
>> +
>> +	return 0;
>> +}
>> +
>> +static int bootlog_fops_open(struct inode *inode, struct file *file)
>> +{
>> +	return single_open(file, bootlog_show, inode->i_private);
>> +}
>> +
>> +static const struct file_operations bootlog_fops = {
>> +	.owner = THIS_MODULE,
>> +	.open = bootlog_fops_open,
>> +	.read = seq_read,
>> +	.llseek = seq_lseek,
>> +	.release = single_release,
>> +};
>> +
>> +void qaic_debugfs_init(struct qaic_drm_device *qddev)
>> +{
>> +	struct qaic_device *qdev = qddev->qdev;
>> +	struct dentry *debugfs_root;
>> +
>> +	debugfs_root = to_drm(qddev)->debugfs_root;
>> +
>> +	debugfs_create_file("bootlog", 0400, debugfs_root, qdev, &bootlog_fops);
>> +}
>> +
>> +static struct bootlog_page *alloc_bootlog_page(struct qaic_device *qdev)
>> +{
>> +	struct bootlog_page *page;
>> +
>> +	page = (struct bootlog_page *)devm_get_free_pages(&qdev->pdev->dev, GFP_KERNEL, 0);
>> +	if (!page)
>> +		return page;
>> +
>> +	page->size = PAGE_SIZE;
>> +	page->offset = sizeof(*page);
>> +	list_add_tail(&page->node, &qdev->bootlog);
>> +
>> +	return page;
>> +}
>> +
>> +static int reset_bootlog(struct qaic_device *qdev)
>> +{
>> +	struct bootlog_page *page;
>> +	struct bootlog_page *i;
>> +
>> +	list_for_each_entry_safe(page, i, &qdev->bootlog, node) {
>> +		list_del(&page->node);
>> +		devm_free_pages(&qdev->pdev->dev, (unsigned long)page);
>> +	}
> This is currently dead code. reset is only used to init the bootlog. You may consider making this init_bootlog() if you are not planning to actually reset the bootlog.

No, its not dead code.

We boot the device the first time.  qaic_bootlog_mhi_probe() is called, 
which calls reset_bootlog().  This code does not execute as the list is 
empty.  For this instance, reset_bootlog() is "init_bootlog".  We 
collect a bootlog and store it in the list.  The device finishes 
booting, and qaic_bootlog_mhi_remove() is called.  We do not clear the 
list at that time.  This allows the log to be dumped at a later time.

Now, lets assume the device crashes.  The device will reboot and go 
through the boot flow again.  In this example, this will be boot 
instance 2, but this also applies to boot instance N+1.

qaic_bootlog_mhi_probe() is called again.  Reset_bootlog() will be 
called, and now this code will execute because the list is non-empty and 
contains data from the previous boot.  As we are only storing the 
current bootlog, this loop clears the list and frees the resources. 
Then we collect the new log for the current boot, and 
qaic_bootlog_mhi_remove() is triggered again.

>> +
>> +	page = alloc_bootlog_page(qdev);
>> +	if (!page)
>> +		return -ENOMEM;
>> +
>> +	return 0;
>> +}
>> +
>> +static void *bootlog_get_space(struct qaic_device *qdev, unsigned int size)
>> +{
>> +	struct bootlog_page *page;
>> +
>> +	page = list_last_entry(&qdev->bootlog, struct bootlog_page, node);
>> +
>> +	if (size > page->size - sizeof(*page))
> Not critical but would be safer to use this condition: "sizeof(*page) + size > page->size"

I disagree.  Your suggestion would appear to have the potential to 
overflow because it is doing a calculation based on an untrusted value 
(the size parameter).  The current code restructures the check to avoid 
this.

What would be safer is to utilize size_add(), which I think is better 
than either the current code, or your suggestion, and is what I will 
implement.

> 
>> +		return NULL;
>> +
>> +	if (page->offset + size > page->size) {
>> +		page = alloc_bootlog_page(qdev);
>> +		if (!page)
>> +			return NULL;
>> +	}
>> +
>> +	return (void *)page + page->offset;
>> +}
>> +
>> +static void bootlog_commit(struct qaic_device *qdev, unsigned int size)
>> +{
>> +	struct bootlog_page *page;
>> +
>> +	page = list_last_entry(&qdev->bootlog, struct bootlog_page, node);
>> +
>> +	page->offset += size;
>> +}
>> +
>> +static void bootlog_log(struct work_struct *work)
>> +{
>> +	struct bootlog_msg *msg = container_of(work, struct bootlog_msg, work);
>> +	unsigned int len = strlen(msg->str) + 1;
>> +	struct qaic_device *qdev = msg->qdev;
>> +	void *log;
>> +
>> +	mutex_lock(&qdev->bootlog_mutex);
>> +	log = bootlog_get_space(qdev, len);
>> +	if (log) {
>> +		memcpy(log, msg, len);
>> +		bootlog_commit(qdev, len);
>> +	}
>> +	mutex_unlock(&qdev->bootlog_mutex);
>> +
>> +	if (mhi_queue_buf(qdev->bootlog_ch, DMA_FROM_DEVICE, msg, BOOTLOG_MSG_SIZE, MHI_EOT))
>> +		devm_kfree(&qdev->pdev->dev, msg);
> You are freeing `struct work` while still in work callback. This is unsafe.
> See https://elixir.bootlin.com/linux/v6.8/source/kernel/workqueue.c#L2564.
> Work ptr is kept in busy_hash after the callback has finished and may be still be accessed.

Documentation says that is permitted - 
https://elixir.bootlin.com/linux/v6.8/source/kernel/workqueue.c#L2548

Also, the framework code documents that the struct work cannot be 
accessed after the callback is invoked - 
https://elixir.bootlin.com/linux/v6.8/source/kernel/workqueue.c#L2635

> 
>> +}
>> +
>> +static int qaic_bootlog_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id)
>> +{
>> +	struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev));
>> +	struct bootlog_msg *msg;
>> +	int i, ret;
>> +
>> +	qdev->bootlog_wq = alloc_ordered_workqueue("qaic_bootlog", 0);
>> +	if (!qdev->bootlog_wq) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	mutex_lock(&qdev->bootlog_mutex);
> Looks like locking should be inside reset_bootlog(), like in other places.

Will do.

> 
>> +	ret = reset_bootlog(qdev);
>> +	mutex_unlock(&qdev->bootlog_mutex);
>> +	if (ret)
>> +		goto destroy_workqueue;
>> +
>> +	ret = mhi_prepare_for_transfer(mhi_dev);
>> +	if (ret)
>> +		goto destroy_workqueue;
>> +
>> +	for (i = 0; i < BOOTLOG_POOL_SIZE; i++) {
>> +		msg = devm_kzalloc(&qdev->pdev->dev, sizeof(*msg), GFP_KERNEL);
>> +		if (!msg) {
>> +			ret = -ENOMEM;
>> +			goto mhi_unprepare;
>> +		}
>> +
>> +		msg->qdev = qdev;
>> +		INIT_WORK(&msg->work, bootlog_log);
>> +
>> +		ret = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, msg, BOOTLOG_MSG_SIZE, MHI_EOT);
>> +		if (ret)
>> +			goto mhi_unprepare;
>> +	}
>> +
>> +	dev_set_drvdata(&mhi_dev->dev, qdev);
>> +	qdev->bootlog_ch = mhi_dev;
>> +	return 0;
>> +
>> +mhi_unprepare:
>> +	mhi_unprepare_from_transfer(mhi_dev);
>> +destroy_workqueue:
>> +	flush_workqueue(qdev->bootlog_wq);
>> +	destroy_workqueue(qdev->bootlog_wq);
>> +out:
>> +	return ret;
>> +}
>> +
>> +static void qaic_bootlog_mhi_remove(struct mhi_device *mhi_dev)
>> +{
>> +	struct qaic_device *qdev;
>> +
>> +	qdev = dev_get_drvdata(&mhi_dev->dev);
>> +
>> +	mhi_unprepare_from_transfer(qdev->bootlog_ch);
>> +	flush_workqueue(qdev->bootlog_wq);
>> +	destroy_workqueue(qdev->bootlog_wq);
>> +	qdev->bootlog_ch = NULL;
>> +}
>> +
>> +static void qaic_bootlog_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result)
>> +{
>> +}
>> +
>> +static void qaic_bootlog_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result)
>> +{
>> +	struct qaic_device *qdev = dev_get_drvdata(&mhi_dev->dev);
>> +	struct bootlog_msg *msg = mhi_result->buf_addr;
>> +
>> +	if (mhi_result->transaction_status) {
>> +		devm_kfree(&qdev->pdev->dev, msg);
>> +		return;
>> +	}
>> +
>> +	/* Force a null at the end of the transferred string */
>> +	msg->str[mhi_result->bytes_xferd - 1] = 0;
> Is it guaranteed that bytes_xferd will always be within valid range here?

Yes.  We provide the buffer size when we queue it to MHI.  When the 
buffer comes back, before this callback, MHI will clamp the transfered 
size to the buffer size.

> 
>> +
>> +	queue_work(qdev->bootlog_wq, &msg->work);
>> +}
>> +
>> +static const struct mhi_device_id qaic_bootlog_mhi_match_table[] = {
>> +	{ .chan = "QAIC_LOGGING", },
>> +	{},
>> +};
>> +
>> +static struct mhi_driver qaic_bootlog_mhi_driver = {
>> +	.id_table = qaic_bootlog_mhi_match_table,
>> +	.remove = qaic_bootlog_mhi_remove,
>> +	.probe = qaic_bootlog_mhi_probe,
>> +	.ul_xfer_cb = qaic_bootlog_mhi_ul_xfer_cb,
>> +	.dl_xfer_cb = qaic_bootlog_mhi_dl_xfer_cb,
>> +	.driver = {
>> +		.name = "qaic_bootlog",
>> +	},
>> +};
>> +
>> +int qaic_bootlog_register(void)
>> +{
>> +	return mhi_driver_register(&qaic_bootlog_mhi_driver);
>> +}
>> +
>> +void qaic_bootlog_unregister(void)
>> +{
>> +	mhi_driver_unregister(&qaic_bootlog_mhi_driver);
>> +}
>> diff --git a/drivers/accel/qaic/qaic_debugfs.h b/drivers/accel/qaic/qaic_debugfs.h
>> new file mode 100644
>> index 000000000000..ea3fd1a88405
>> --- /dev/null
>> +++ b/drivers/accel/qaic/qaic_debugfs.h
>> @@ -0,0 +1,20 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +
>> +/* Copyright (c) 2020, The Linux Foundation. All rights reserved. */
>> +/* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. */
>> +
>> +#ifndef __QAIC_DEBUGFS_H__
>> +#define __QAIC_DEBUGFS_H__
>> +
>> +#include <drm/drm_file.h>
>> +
>> +#ifdef CONFIG_DEBUG_FS
>> +int qaic_bootlog_register(void);
>> +void qaic_bootlog_unregister(void);
>> +void qaic_debugfs_init(struct qaic_drm_device *qddev);
>> +#else
>> +int qaic_bootlog_register(void) { return 0; }
>> +void qaic_bootlog_unregister(void) {}
>> +void qaic_debugfs_init(struct qaic_drm_device *qddev) {}
>> +#endif /* CONFIG_DEBUG_FS */
>> +#endif /* __QAIC_DEBUGFS_H__ */
>> diff --git a/drivers/accel/qaic/qaic_drv.c b/drivers/accel/qaic/qaic_drv.c
>> index d1a632dbaec6..f072edb74f22 100644
>> --- a/drivers/accel/qaic/qaic_drv.c
>> +++ b/drivers/accel/qaic/qaic_drv.c
>> @@ -28,6 +28,7 @@
>>   
>>   #include "mhi_controller.h"
>>   #include "qaic.h"
>> +#include "qaic_debugfs.h"
>>   #include "qaic_timesync.h"
>>   
>>   MODULE_IMPORT_NS(DMA_BUF);
>> @@ -229,8 +230,12 @@ static int qaic_create_drm_device(struct qaic_device *qdev, s32 partition_id)
>>   	qddev->partition_id = partition_id;
>>   
>>   	ret = drm_dev_register(drm, 0);
>> -	if (ret)
>> +	if (ret) {
>>   		pci_dbg(qdev->pdev, "drm_dev_register failed %d\n", ret);
>> +		return ret;
>> +	}
>> +
>> +	qaic_debugfs_init(qddev);
>>   
>>   	return ret;
>>   }
>> @@ -380,6 +385,9 @@ static struct qaic_device *create_qdev(struct pci_dev *pdev, const struct pci_de
>>   	if (ret)
>>   		return NULL;
>>   	ret = drmm_mutex_init(drm, &qdev->cntl_mutex);
>> +	if (ret)
>> +		return NULL;
>> +	ret = drmm_mutex_init(drm, &qdev->bootlog_mutex);
>>   	if (ret)
>>   		return NULL;
>>   
>> @@ -399,6 +407,7 @@ static struct qaic_device *create_qdev(struct pci_dev *pdev, const struct pci_de
>>   	qddev->qdev = qdev;
>>   
>>   	INIT_LIST_HEAD(&qdev->cntl_xfer_list);
>> +	INIT_LIST_HEAD(&qdev->bootlog);
>>   	INIT_LIST_HEAD(&qddev->users);
>>   
>>   	for (i = 0; i < qdev->num_dbc; ++i) {
>> @@ -639,6 +648,10 @@ static int __init qaic_init(void)
>>   	if (ret)
>>   		pr_debug("qaic: qaic_timesync_init failed %d\n", ret);
>>   
>> +	ret = qaic_bootlog_register();
>> +	if (ret)
>> +		pr_debug("qaic: qaic_bootlog_register failed %d\n", ret);
>> +
>>   	return 0;
>>   
>>   free_pci:
>> @@ -664,6 +677,7 @@ static void __exit qaic_exit(void)
>>   	 * reinitializing the link_up state after the cleanup is done.
>>   	 */
>>   	link_up = true;
>> +	qaic_bootlog_unregister();
>>   	qaic_timesync_deinit();
>>   	mhi_driver_unregister(&qaic_mhi_driver);
>>   	pci_unregister_driver(&qaic_pci_driver);



More information about the dri-devel mailing list