[PATCH v3 08/14] drm/panthor: Add the FW logical block

Chris Diamand chris.diamand at foss.arm.com
Mon Dec 18 21:25:16 UTC 2023


On 04/12/2023 17:33, Boris Brezillon wrote:
> Contains everything that's FW related, that includes the code dealing
> with the microcontroller unit (MCU) that's running the FW, and anything
> related to allocating memory shared between the FW and the CPU.
> 
> A few global FW events are processed in the IRQ handler, the rest is
> forwarded to the scheduler, since scheduling is the primary reason for
> the FW existence, and also the main source of FW <-> kernel
> interactions.
> 
> v3:
> - Make the FW path more future-proof (Liviu)
> - Use one waitqueue for all FW events
> - Simplify propagation of FW events to the scheduler logic
> - Drop the panthor_fw_mem abstraction and use panthor_kernel_bo instead
> - Account for the panthor_vm changes
> - Replace magic number with 0x7fffffff with ~0 to better signify that
>   it's the maximum permitted value.
> - More accurate rounding when computing the firmware timeout.
> - Add a 'sub iterator' helper function. This also adds a check that a
>   firmware entry doesn't overflow the firmware image.
> - Drop __packed from FW structures, natural alignment is good enough.
> - Other minor code improvements.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon at collabora.com>
> Signed-off-by: Steven Price <steven.price at arm.com>
> ---
>  drivers/gpu/drm/panthor/panthor_fw.c | 1332 ++++++++++++++++++++++++++
>  drivers/gpu/drm/panthor/panthor_fw.h |  504 ++++++++++
>  2 files changed, 1836 insertions(+)
>  create mode 100644 drivers/gpu/drm/panthor/panthor_fw.c
>  create mode 100644 drivers/gpu/drm/panthor/panthor_fw.h
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c
> new file mode 100644
> index 000000000000..85afe769f567
> --- /dev/null
> +++ b/drivers/gpu/drm/panthor/panthor_fw.c

...
> +static int panthor_fw_load_section_entry(struct panthor_device *ptdev,
> +					 const struct firmware *fw,
> +					 struct panthor_fw_binary_iter *iter,
> +					 u32 ehdr)
> +{
> +	struct panthor_fw_binary_section_entry_hdr hdr;
> +	struct panthor_fw_section *section;
> +	u32 section_size;
> +	u32 name_len;
> +	int ret;
> +
> +	ret = panthor_fw_binary_iter_read(ptdev, iter, &hdr, sizeof(hdr));
> +	if (ret)
> +		return ret;
> +
> +	if (hdr.data.end < hdr.data.start) {
> +		drm_err(&ptdev->base, "Firmware corrupted, data.end < data.start (0x%x < 0x%x)\n",
> +			hdr.data.end, hdr.data.start);
> +		return -EINVAL;
> +	}
> +
> +	if (hdr.va.end < hdr.va.start) {
> +		drm_err(&ptdev->base, "Firmware corrupted, hdr.va.end < hdr.va.start (0x%x < 0x%x)\n",
> +			hdr.va.end, hdr.va.start);
> +		return -EINVAL;
> +	}
> +
> +	if (hdr.data.end > fw->size) {
> +		drm_err(&ptdev->base, "Firmware corrupted, file truncated? data_end=0x%x > fw size=0x%zx\n",
> +			hdr.data.end, fw->size);
> +		return -EINVAL;
> +	}
> +
> +	if ((hdr.va.start & ~PAGE_MASK) != 0 ||
> +	    (hdr.va.end & ~PAGE_MASK) != 0) {
> +		drm_err(&ptdev->base, "Firmware corrupted, virtual addresses not page aligned: 0x%x-0x%x\n",
> +			hdr.va.start, hdr.va.end);
> +		return -EINVAL;
> +	}
> +
> +	if (hdr.flags & ~CSF_FW_BINARY_IFACE_ENTRY_RD_SUPPORTED_FLAGS) {
> +		drm_err(&ptdev->base, "Firmware contains interface with unsupported flags (0x%x)\n",
> +			hdr.flags);
> +		return -EINVAL;
> +	}
> +
> +	if (hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_PROT) {
> +		drm_warn(&ptdev->base,
> +			 "Firmware protected mode entry not be supported, ignoring");
> +		return 0;
> +	}
> +
> +	if (hdr.va.start == CSF_MCU_SHARED_REGION_START &&
> +	    !(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED)) {
> +		drm_err(&ptdev->base,
> +			"Interface at 0x%llx must be shared", CSF_MCU_SHARED_REGION_START);
> +		return -EINVAL;
> +	}
> +
> +	name_len = iter->size - iter->offset;
> +
> +	section = drmm_kzalloc(&ptdev->base, sizeof(*section), GFP_KERNEL);
> +	if (!section)
> +		return -ENOMEM;
> +
> +	list_add_tail(&section->node, &ptdev->fw->sections);
> +	section->flags = hdr.flags;
> +	section->data.size = hdr.data.end - hdr.data.start;
> +
> +	if (section->data.size > 0) {
> +		void *data = drmm_kmalloc(&ptdev->base, section->data.size, GFP_KERNEL);
> +
> +		if (!data)
> +			return -ENOMEM;
> +
> +		memcpy(data, fw->data + hdr.data.start, section->data.size);
> +		section->data.buf = data;
> +	}
> +
> +	if (name_len > 0) {
> +		char *name = drmm_kmalloc(&ptdev->base, name_len + 1, GFP_KERNEL);
> +
> +		if (!name)
> +			return -ENOMEM;
> +
> +		memcpy(name, iter->data + iter->offset, name_len);
> +		name[name_len] = '\0';
> +		section->name = name;
> +	}
> +
> +	section_size = hdr.va.end - hdr.va.start;
> +	if (section_size) {
> +		u32 cache_mode = hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_MASK;
> +		struct panthor_gem_object *bo;
> +		u32 vm_map_flags = 0;
> +		struct sg_table *sgt;
> +		u64 va = hdr.va.start;
> +
> +		if (!(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_WR))
> +			vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_READONLY;
> +
> +		if (!(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_EX))
> +			vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC;
> +
> +		/* TODO: CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_*_COHERENT are mapped to
> +		 * non-cacheable for now. We might want to introduce a new
> +		 * IOMMU_xxx flag (or abuse IOMMU_MMIO, which maps to device
> +		 * memory and is currently not used by our driver) for
> +		 * AS_MEMATTR_AARCH64_SHARED memory, so we can take benefit
> +		 * of IO-coherent systems.
> +		 */
> +		if (cache_mode != CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_CACHED)
> +			vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED;
> +
> +		section->mem = panthor_kernel_bo_create(ptdev, panthor_fw_vm(ptdev),
> +							section_size,
> +							DRM_PANTHOR_BO_NO_MMAP,
> +							vm_map_flags, va);
> +		if (IS_ERR(section->mem))
> +			return PTR_ERR(section->mem);

There's a small bug here - if panthor_kernel_bo_create() fails, section->mem will be
left containing an error value, not NULL or a valid pointer.

This means that on rmmod, panthor_fw_unplug() will try and free the BO even though it
was never allocated, causing a kernel panic.

The solution is obviously to only assign section->mem if the allocation actually succeeded.

However I also wonder if the list_add_tail() call should be moved to later in the function,
after we know everything else has succeeded, or if this function needs some more error
handling in general.

> +
> +		if (drm_WARN_ON(&ptdev->base, section->mem->va_node.start != hdr.va.start))
> +			return -EINVAL;
> +
> +		if (section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED) {
> +			ret = panthor_kernel_bo_vmap(section->mem);
> +			if (ret)
> +				return ret;
> +		}
> +
> +		panthor_fw_init_section_mem(ptdev, section);
> +
> +		bo = to_panthor_bo(section->mem->obj);
> +		sgt = drm_gem_shmem_get_pages_sgt(&bo->base);
> +		if (IS_ERR(sgt))
> +			return PTR_ERR(section->mem);
> +
> +		dma_sync_sgtable_for_device(ptdev->base.dev, sgt, DMA_TO_DEVICE);
> +	}
> +
> +	if (hdr.va.start == CSF_MCU_SHARED_REGION_START)
> +		ptdev->fw->shared_section = section;
> +
> +	return 0;
> +}
> +

...

> +/**
> + * panthor_fw_unplug() - Called when the device is unplugged.
> + * @ptdev: Device.
> + *
> + * This function must make sure all pending operations are flushed before
> + * will release device resources, thus preventing any interaction with
> + * the HW.
> + *
> + * If there is still FW-related work running after this function returns,
> + * they must use drm_dev_{enter,exit}() and skip any HW access when
> + * drm_dev_enter() returns false.
> + */
> +void panthor_fw_unplug(struct panthor_device *ptdev)
> +{
> +	struct panthor_fw_section *section;
> +
> +	cancel_delayed_work_sync(&ptdev->fw->watchdog.ping_work);
> +
> +	/* Make sure the IRQ handler can be called after that point. */
> +	if (ptdev->fw->irq.irq)
> +		panthor_job_irq_suspend(&ptdev->fw->irq);
> +
> +	panthor_fw_stop(ptdev);
> +
> +	if (ptdev->fw->vm)
> +		panthor_vm_idle(ptdev->fw->vm);
> +
> +	list_for_each_entry(section, &ptdev->fw->sections, node)
> +		panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), section->mem);

Here's where we destroy the potentially invalid section->mem.

Note that the issues are twofold:
Firstly, section->mem could be an error pointer as mentioned earlier.
Secondly, panthor_kernel_bo_destroy() doesn't actually check for error values or even NULL.

It would probably be worth adding NULL checks to panthor_kernel_bo_destroy() and panthor_kernel_bo_vunmap() to protect against this. 

> +
> +	panthor_vm_put(ptdev->fw->vm);
> +
> +	panthor_gpu_power_off(ptdev, L2, ptdev->gpu_info.l2_present, 20000);
> +}
> +

Cheers,
Chris


More information about the dri-devel mailing list