[PATCH v2 09/15] drm/panthor: Add the FW logical block
Steven Price
steven.price@arm.com
Wed Aug 30 15:20:38 UTC 2023
On 29/08/2023 17:15, Boris Brezillon wrote:
> On Wed, 16 Aug 2023 17:01:56 +0100
> Steven Price <steven.price@arm.com> wrote:
>
>> On 09/08/2023 17:53, Boris Brezillon wrote:
[...]
>>> +/**
>>> + * panthor_fw_mem_alloc() - Allocate a FW memory object and map it to the MCU VM.
>>> + * @ptdev: Device.
>>> + * @size: Size of the memory block.
>>> + * @bo_flags: BO flags.
>>> + * @vm_map_flags: VM_MAP flags.
>>> + * @va: Virtual address of the MCU mapping.
>>> + * Set to PANTHOR_GEM_ALLOC_VA for automatic VA-assignment. In that case, the
>>> + * VA will be allocated in the shared VA space.
>>> + *
>>> + * Return: A valid pointer in case of success, an ERR_PTR() otherwise.
>>> + */
>>> +static struct panthor_fw_mem *
>>> +panthor_fw_mem_alloc(struct panthor_device *ptdev, size_t size,
>>> + u32 bo_flags, u32 vm_map_flags, u64 va)
>>> +{
>>> + struct panthor_fw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
>>> + int ret;
>>> +
>>> + if (!mem)
>>> + return ERR_PTR(-ENOMEM);
>>> +
>>> + mem->bo = panthor_gem_create_and_map(ptdev, ptdev->fw->vm,
>>> + size, bo_flags, vm_map_flags,
>>> + &va, NULL);
>>> + if (IS_ERR(mem->bo)) {
>>> + ret = PTR_ERR(mem->bo);
>>> + mem->bo = NULL;
>>> + goto err_free_mem;
>>> + }
>>> +
>>> + mem->va = va;
>>> + return mem;
>>> +
>>> +err_free_mem:
>>> + panthor_fw_mem_free(ptdev, mem);
>>> + return ERR_PTR(ret);
>>
>> The error handling seems more complex than needed, how about:
>>
>> struct panthor_fw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
>> struct panthor_gem_object *bo;
>> int ret;
>>
>> if (!mem)
>> return ERR_PTR(-ENOMEM);
>>
>> bo = panthor_gem_create_and_map(ptdev, ptdev->fw->vm,
>> size, bo_flags, vm_map_flags,
>> &va, NULL);
>>
>> if (IS_ERR(bo)) {
>> kfree(mem);
>> return ERR_CAST(bo);
>> }
>>
>> mem->bo = bo;
>> mem->va = va;
>> return mem;
>>
>> Which I think also means we don't need the "if (mem->bo)" case in
>> panthor_fw_mem_free().
>
> Not so sure about that one. I've been adding code to existing functions
> and having a structured error path, with free functions that can deal
> with partially initialized object makes code addition less error-prone.
> I agree on the local bo variable to avoid mem->bo re-initialization
> though.
Yeah, the "free accepting NULL" style is generally a good one, so leaving
the NULL check in panthor_fw_mem_free() is fine. It was just that in this
case having to explicitly assign NULL before the call to
panthor_fw_mem_free() looked ugly.
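For the record, the NULL-tolerant free I'd expect is something like the
below (a sketch only - panthor_fw_mem_free() isn't quoted above, and the
unmap/put helper name is a stand-in for whatever the driver actually uses):

static void panthor_fw_mem_free(struct panthor_device *ptdev,
				struct panthor_fw_mem *mem)
{
	if (!mem)
		return;

	/* Tolerate partially-initialized objects: mem->bo may still be
	 * NULL if panthor_gem_create_and_map() failed.
	 */
	if (mem->bo)
		panthor_gem_unmap_and_put(ptdev->fw->vm, mem->bo); /* stand-in helper */

	kfree(mem);
}

With that, callers never need to NULL mem->bo before calling it.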
>>
>>> +}
>>> +
>
> [...]
>
>>> +/**
>>> + * panthor_fw_alloc_suspend_buf_mem() - Allocate a suspend buffer for a command stream group.
>>> + * @ptdev: Device.
>>> + * @size: Size of the suspend buffer.
>>> + *
>>> + * Return: A valid pointer in case of success, an ERR_PTR() otherwise.
>>> + */
>>> +struct panthor_fw_mem *
>>> +panthor_fw_alloc_suspend_buf_mem(struct panthor_device *ptdev, size_t size)
>>> +{
>>> + return panthor_fw_mem_alloc(ptdev, size,
>>> + DRM_PANTHOR_BO_NO_MMAP,
>>> + DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC,
>>> + PANTHOR_GEM_ALLOC_VA);
>>> +}
>>> +
>>> +static int panthor_fw_load_section_entry(struct panthor_device *ptdev,
>>> + const struct firmware *fw,
>>> + struct panthor_fw_binary_iter *iter,
>>> + u32 ehdr)
>>> +{
>>> + struct panthor_fw_binary_section_entry_hdr hdr;
>>> + struct panthor_fw_section *section;
>>> + u32 section_size;
>>> + u32 name_len;
>>> + int ret;
>>> +
>>> + ret = panthor_fw_binary_iter_read(ptdev, iter, &hdr, sizeof(hdr));
>>> + if (ret)
>>> + return ret;
>>> +
>>> + if (hdr.data.end < hdr.data.start) {
>>> + drm_err(&ptdev->base, "Firmware corrupted, data.end < data.start (0x%x < 0x%x)\n",
>>> + hdr.data.end, hdr.data.start);
>>> + return -EINVAL;
>>> + }
>>> +
>>> + if (hdr.va.end < hdr.va.start) {
>>> + drm_err(&ptdev->base, "Firmware corrupted, hdr.va.end < hdr.va.start (0x%x < 0x%x)\n",
>>> + hdr.va.end, hdr.va.start);
>>> + return -EINVAL;
>>> + }
>>> +
>>> + if (hdr.data.end > fw->size) {
>>> + drm_err(&ptdev->base, "Firmware corrupted, file truncated? data_end=0x%x > fw size=0x%zx\n",
>>> + hdr.data.end, fw->size);
>>> + return -EINVAL;
>>> + }
>>> +
>>> + if ((hdr.va.start & ~PAGE_MASK) != 0 ||
>>> + (hdr.va.end & ~PAGE_MASK) != 0) {
>>> + drm_err(&ptdev->base, "Firmware corrupted, virtual addresses not page aligned: 0x%x-0x%x\n",
>>> + hdr.va.start, hdr.va.end);
>>> + return -EINVAL;
>>> + }
>>> +
>>> + if (hdr.flags & ~CSF_FW_BINARY_IFACE_ENTRY_RD_SUPPORTED_FLAGS) {
>>> + drm_err(&ptdev->base, "Firmware contains interface with unsupported flags (0x%x)\n",
>>> + hdr.flags);
>>> + return -EINVAL;
>>> + }
>>> +
>>> + if (hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_PROT) {
>>> + drm_warn(&ptdev->base,
>>> + "Firmware protected mode entry not be supported, ignoring");
>>> + return 0;
>>> + }
>>> +
>>> + if (hdr.va.start == CSF_MCU_SHARED_REGION_START &&
>>> + !(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED)) {
>>> + drm_err(&ptdev->base,
>>> + "Interface at 0x%llx must be shared", CSF_MCU_SHARED_REGION_START);
>>> + return -EINVAL;
>>> + }
>>> +
>>> + name_len = iter->size - iter->offset;
>>> +
>>> + section = drmm_kzalloc(&ptdev->base, sizeof(*section), GFP_KERNEL);
>>> + if (!section)
>>> + return -ENOMEM;
>>> +
>>> + list_add_tail(&section->node, &ptdev->fw->sections);
>>> + section->flags = hdr.flags;
>>> + section->data.size = hdr.data.end - hdr.data.start;
>>> +
>>> + if (section->data.size > 0) {
>>> + void *data = drmm_kmalloc(&ptdev->base, section->data.size, GFP_KERNEL);
>>> +
>>> + if (!data)
>>> + return -ENOMEM;
>>> +
>>> + memcpy(data, fw->data + hdr.data.start, section->data.size);
>>> + section->data.buf = data;
>>> + }
>>> +
>>> + if (name_len > 0) {
>>> + char *name = drmm_kmalloc(&ptdev->base, name_len + 1, GFP_KERNEL);
>>> +
>>> + if (!name)
>>> + return -ENOMEM;
>>> +
>>> + memcpy(name, iter->data + iter->offset, name_len);
>>> + name[name_len] = '\0';
>>> + section->name = name;
>>> + }
>>> +
>>> + section_size = hdr.va.end - hdr.va.start;
>>> + if (section_size) {
>>> + u32 cache_mode = hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_MASK;
>>> + u32 vm_map_flags = 0;
>>> + struct sg_table *sgt;
>>> + u64 va = hdr.va.start;
>>> +
>>> + if (!(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_WR))
>>> + vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_READONLY;
>>> +
>>> + if (!(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_EX))
>>> + vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC;
>>> +
>>> + /* TODO: CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_*_COHERENT are mapped to
>>> + * non-cacheable for now. We might want to introduce a new
>>> + * IOMMU_xxx flag (or abuse IOMMU_MMIO, which maps to device
>>> + * memory and is currently not used by our driver) for
>>> + * AS_MEMATTR_AARCH64_SHARED memory, so we can take benefit
>>> + * of IO-coherent systems.
>>> + */
>>> + if (cache_mode != CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_CACHED)
>>> + vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED;
>>> +
>>> + /* Shared section is in the auto-VA range. We need to
>>> + * reserve the VA range so it's not allocated to someone else.
>>> + */
>>> + if (va >= CSF_MCU_SHARED_REGION_START &&
>>> + va < CSF_MCU_SHARED_REGION_START + CSF_MCU_SHARED_REGION_SIZE)
>>> + va = PANTHOR_GEM_ALLOC_VA;
>>> +
>>> + section->mem = panthor_fw_mem_alloc(ptdev, section_size,
>>> + DRM_PANTHOR_BO_NO_MMAP,
>>> + vm_map_flags, va);
>>> + if (IS_ERR(section->mem))
>>> + return PTR_ERR(section->mem);
>>> +
>>> + if (drm_WARN_ON(&ptdev->base, section->mem->va != hdr.va.start))
>>> + return -EINVAL;
>>> +
>>> + panthor_fw_init_section_mem(ptdev, section);
>>> +
>>> + sgt = drm_gem_shmem_get_pages_sgt(&section->mem->bo->base);
>>> + if (IS_ERR(sgt))
>>> + return PTR_ERR(sgt);
>>> +
>>> + dma_sync_sgtable_for_device(ptdev->base.dev, sgt, DMA_TO_DEVICE);
>>> +
>>> + if (section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED) {
>>> + if (!panthor_fw_mem_vmap(section->mem))
>>
>> Moving this before panthor_fw_init_section_mem() would avoid an
>> unnecessary unmap/remap - although this isn't exactly a performance path...
>
> Sure, I can do that.
>
>>
>>> + return -ENOMEM;
>>> + }
>>> + }
>>> +
>>> + if (hdr.va.start == CSF_MCU_SHARED_REGION_START)
>>> + ptdev->fw->shared_section = section;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static void
>>> +panthor_reload_fw_sections(struct panthor_device *ptdev, bool full_reload)
>>> +{
>>> + struct panthor_fw_section *section;
>>> +
>>> + list_for_each_entry(section, &ptdev->fw->sections, node) {
>>> + struct sg_table *sgt;
>>> +
>>> + if (!full_reload && !(section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_WR))
>>> + continue;
>>> +
>>> + panthor_fw_init_section_mem(ptdev, section);
>>> + sgt = drm_gem_shmem_get_pages_sgt(&section->mem->bo->base);
>>> + if (!drm_WARN_ON(&ptdev->base, IS_ERR_OR_NULL(sgt)))
>>> + dma_sync_sgtable_for_device(ptdev->base.dev, sgt, DMA_TO_DEVICE);
>>> + }
>>> +}
>>> +
>>> +static int panthor_fw_load_entry(struct panthor_device *ptdev,
>>> + const struct firmware *fw,
>>> + struct panthor_fw_binary_iter *iter)
>>> +{
>>> + struct panthor_fw_binary_iter eiter;
>>> + u32 ehdr;
>>> + int ret;
>>> +
>>> + ret = panthor_fw_binary_iter_read(ptdev, iter, &ehdr, sizeof(ehdr));
>>> + if (ret)
>>> + return ret;
>>> +
>>> + if ((iter->offset % sizeof(u32)) ||
>>> + (CSF_FW_BINARY_ENTRY_SIZE(ehdr) % sizeof(u32))) {
>>> + drm_err(&ptdev->base, "Firmware entry isn't 32 bit aligned, offset=0x%x size=0x%x\n",
>>> + (u32)(iter->offset - sizeof(u32)), CSF_FW_BINARY_ENTRY_SIZE(ehdr));
>>> + return -EINVAL;
>>> + }
>>> +
>>> + eiter.offset = 0;
>>> + eiter.data = iter->data + iter->offset;
>>> + eiter.size = CSF_FW_BINARY_ENTRY_SIZE(ehdr) - sizeof(ehdr);
>>> + iter->offset += eiter.size;
>>
>> There should really be a check like:
>>
>> if (iter->offset < eiter.size)
>> return -EINVAL;
>
> Uh, I thought I had added size checks everywhere, but I apparently
> missed some places.
>
>>
>> otherwise I think it's possible for a corrupt firmware to cause us to
>> run off the end of the buffer. Ideally the check would look something
>> more like the one in panthor_fw_binary_iter_read() (dealing with
>> potential overflow). I'm wondering if it makes sense to allow
>> panthor_fw_binary_iter_read() with a NULL 'out' and check the return
>> value. That way we can replace "iter->offset += eiter.size" with:
>>
>> ret = panthor_fw_binary_iter_read(ptdev, iter, NULL,
>> eiter.size);
>> if (ret)
>> return ret;
>>
>> (or have a new _skip() function)
>
> Might make sense to add a panthor_fw_binary_sub_iter_init() helper that
> would take care of doing the size check on the main iter, unless you
> see other places requiring a size check that are not expressed as
> sub-iterators.
It was only in the sub-iterators that I spotted the missing size check. A
helper for the sub-iterators is probably clearer than my 'skip' function.
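Something like this would do it (an untested sketch, reusing the existing
iterator fields and the error style from this patch):

static int
panthor_fw_binary_sub_iter_init(struct panthor_device *ptdev,
				struct panthor_fw_binary_iter *iter,
				struct panthor_fw_binary_iter *sub_iter,
				size_t size)
{
	size_t new_offset = iter->offset + size;

	/* Reject overflow and sub-sections running past the FW image. */
	if (new_offset < iter->offset || new_offset > iter->size) {
		drm_err(&ptdev->base, "Firmware corrupted, entry crosses image boundary\n");
		return -EINVAL;
	}

	sub_iter->offset = 0;
	sub_iter->data = iter->data + iter->offset;
	sub_iter->size = size;
	iter->offset = new_offset;
	return 0;
}

panthor_fw_load_entry() would then replace the manual eiter setup and
"iter->offset += eiter.size" with:

	ret = panthor_fw_binary_sub_iter_init(ptdev, iter, &eiter,
					      CSF_FW_BINARY_ENTRY_SIZE(ehdr) -
					      sizeof(ehdr));
	if (ret)
		return ret;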
[...]
>>> +struct panthor_fw_ringbuf_input_iface {
>>> + u64 insert;
>>> + u64 extract;
>>> +} __packed;
>>> +
>>> +struct panthor_fw_ringbuf_output_iface {
>>> + u64 extract;
>>> + u32 active;
>>> +} __packed;
>>
>> Is there a good reason for these to be marked '__packed'? They are
>> naturally aligned so there's no padding, and we guarantee they are page
>> aligned. The compiler might have more freedom if they are not marked
>> __packed.
>
> Nope, no good reason.
>
>>
>>> +
>>> +struct panthor_fw_cs_control_iface {
>>> +#define CS_FEATURES_WORK_REGS(x) (((x) & GENMASK(7, 0)) + 1)
>>> +#define CS_FEATURES_SCOREBOARDS(x) (((x) & GENMASK(15, 8)) >> 8)
>>> +#define CS_FEATURES_COMPUTE BIT(16)
>>> +#define CS_FEATURES_FRAGMENT BIT(17)
>>> +#define CS_FEATURES_TILER BIT(18)
>>> + u32 features;
>>> + u32 input_va;
>>> + u32 output_va;
>>> +} __packed;
>>
>> Here I have to admit I can't find a statement in the spec saying that
>> the stride must be a multiple of 4 bytes... but kbase makes that assumption.
>
> The stride of?
The stride of this structure (panthor_fw_cs_control_iface or
STREAM_CONTROL_BLOCK in the spec). The stride is defined by
GROUP_CONTROL_BLOCK::GROUP_STREAM_STRIDE
(panthor_fw_csg_control_iface->stream_stride here), but the spec doesn't
specify that the FW must obey any restrictions on the stride. For that
reason the use of __packed here is technically correct (the FW could
choose a stride which causes this structure to be mis-aligned).
In reality the firmware always aligns to 4 bytes and kbase depends on
this. And I've raised this internally, so hopefully a future spec will
include the 4 byte alignment requirement.
TL;DR: the __packed specifiers shouldn't be needed on any of these
structures.
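Since dropping __packed only works if natural alignment reproduces the FW
layout exactly, it might be worth pinning that down at compile time, e.g.
(a sketch, with the offsets computed by hand from the field lists quoted
in this patch):

/* Without __packed, natural alignment must still give the layout the
 * FW expects - catch any implicit padding at build time.
 */
static_assert(offsetof(struct panthor_fw_cs_input_iface, ringbuf_base) == 16);
static_assert(offsetof(struct panthor_fw_cs_input_iface, instrbuf_offset_ptr) == 80);
static_assert(offsetof(struct panthor_fw_cs_output_iface, status_cmd_ptr) == 64);
static_assert(offsetof(struct panthor_fw_cs_output_iface, heap_address) == 208);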
>>
>>> +
>>> +struct panthor_fw_cs_input_iface {
>>> +#define CS_STATE_MASK GENMASK(2, 0)
>>> +#define CS_STATE_STOP 0
>>> +#define CS_STATE_START 1
>>> +#define CS_EXTRACT_EVENT BIT(4)
>>> +#define CS_IDLE_SYNC_WAIT BIT(8)
>>> +#define CS_IDLE_PROTM_PENDING BIT(9)
>>> +#define CS_IDLE_EMPTY BIT(10)
>>> +#define CS_IDLE_RESOURCE_REQ BIT(11)
>>> +#define CS_TILER_OOM BIT(26)
>>> +#define CS_PROTM_PENDING BIT(27)
>>> +#define CS_FATAL BIT(30)
>>> +#define CS_FAULT BIT(31)
>>> +#define CS_REQ_MASK (CS_STATE_MASK | \
>>> + CS_EXTRACT_EVENT | \
>>> + CS_IDLE_SYNC_WAIT | \
>>> + CS_IDLE_PROTM_PENDING | \
>>> + CS_IDLE_EMPTY | \
>>> + CS_IDLE_RESOURCE_REQ)
>>> +#define CS_EVT_MASK (CS_TILER_OOM | \
>>> + CS_PROTM_PENDING | \
>>> + CS_FATAL | \
>>> + CS_FAULT)
>>> + u32 req;
>>> +
>>> +#define CS_CONFIG_PRIORITY(x) ((x) & GENMASK(3, 0))
>>> +#define CS_CONFIG_DOORBELL(x) (((x) << 8) & GENMASK(15, 8))
>>> + u32 config;
>>> + u32 reserved1;
>>> + u32 ack_irq_mask;
>>> + u64 ringbuf_base;
>>> + u32 ringbuf_size;
>>> + u32 reserved2;
>>> + u64 heap_start;
>>> + u64 heap_end;
>>> + u64 ringbuf_input;
>>> + u64 ringbuf_output;
>>> + u32 instr_config;
>>> + u32 instrbuf_size;
>>> + u64 instrbuf_base;
>>> + u64 instrbuf_offset_ptr;
>>> +} __packed;
>>
>> The spec says this has a minimal alignment of 64 bytes. Although I guess
>> the code should check this if we remove __packed and rely on it.
>
> The allocation granularity is 4k, and we're not even in control of the
> offset inside the FW interface section. So yes, we can check it when
> parsing the FW sections, but there's no point adding __aligned() here.
Sorry, no I wasn't intending that we'd add __aligned() - I was just
trying to justify (to myself) that the __packed wasn't necessary.
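The parse-time check could be as simple as the below (a sketch, with
cs_ctrl being a hypothetical pointer to the parsed
panthor_fw_cs_control_iface for one stream):

	/* The input/output pages must keep the interface structs
	 * naturally aligned (64 bytes minimum per the spec) once
	 * __packed is dropped.
	 */
	if (!IS_ALIGNED(cs_ctrl->input_va, 64) ||
	    !IS_ALIGNED(cs_ctrl->output_va, 64)) {
		drm_err(&ptdev->base, "Misaligned CS interface: 0x%x/0x%x\n",
			cs_ctrl->input_va, cs_ctrl->output_va);
		return -EINVAL;
	}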
>>
>>> +
>>> +struct panthor_fw_cs_output_iface {
>>> + u32 ack;
>>> + u32 reserved1[15];
>>> + u64 status_cmd_ptr;
>>> +
>>> +#define CS_STATUS_WAIT_SB_MASK GENMASK(15, 0)
>>> +#define CS_STATUS_WAIT_SB_SRC_MASK GENMASK(19, 16)
>>> +#define CS_STATUS_WAIT_SB_SRC_NONE (0 << 16)
>>> +#define CS_STATUS_WAIT_SB_SRC_WAIT (8 << 16)
>>> +#define CS_STATUS_WAIT_SYNC_COND_LE (0 << 24)
>>> +#define CS_STATUS_WAIT_SYNC_COND_GT (1 << 24)
>>> +#define CS_STATUS_WAIT_SYNC_COND_MASK GENMASK(27, 24)
>>> +#define CS_STATUS_WAIT_PROGRESS BIT(28)
>>> +#define CS_STATUS_WAIT_PROTM BIT(29)
>>> +#define CS_STATUS_WAIT_SYNC_64B BIT(30)
>>> +#define CS_STATUS_WAIT_SYNC BIT(31)
>>> + u32 status_wait;
>>> + u32 status_req_resource;
>>> + u64 status_wait_sync_ptr;
>>> + u32 status_wait_sync_value;
>>> + u32 status_scoreboards;
>>> +
>>> +#define CS_STATUS_BLOCKED_REASON_UNBLOCKED 0
>>> +#define CS_STATUS_BLOCKED_REASON_SB_WAIT 1
>>> +#define CS_STATUS_BLOCKED_REASON_PROGRESS_WAIT 2
>>> +#define CS_STATUS_BLOCKED_REASON_SYNC_WAIT 3
>>> +#define CS_STATUS_BLOCKED_REASON_DEFERRED 5
>>> +#define CS_STATUS_BLOCKED_REASON_RES 6
>>> +#define CS_STATUS_BLOCKED_REASON_FLUSH 7
>>> +#define CS_STATUS_BLOCKED_REASON_MASK GENMASK(3, 0)
>>> + u32 status_blocked_reason;
>>> + u32 status_wait_sync_value_hi;
>>> + u32 reserved2[6];
>>> +
>>> +#define CS_EXCEPTION_TYPE(x) ((x) & GENMASK(7, 0))
>>> +#define CS_EXCEPTION_DATA(x) (((x) >> 8) & GENMASK(23, 0))
>>> + u32 fault;
>>> + u32 fatal;
>>> + u64 fault_info;
>>> + u64 fatal_info;
>>> + u32 reserved3[10];
>>> + u32 heap_vt_start;
>>> + u32 heap_vt_end;
>>> + u32 reserved4;
>>> + u32 heap_frag_end;
>>> + u64 heap_address;
>>> +} __packed;
>>
>> output is the same as input.
>
> You mean in term of alignment?
Yep. (Sorry I did a terrible job of explaining myself here - I got
rather distracted trying to work out what alignment was guaranteed by
the spec for all these different structures).
>>
>>> +
>>> +struct panthor_fw_csg_control_iface {
>>> + u32 features;
>>> + u32 input_va;
>>> + u32 output_va;
>>> + u32 suspend_size;
>>> + u32 protm_suspend_size;
>>> + u32 stream_num;
>>> + u32 stream_stride;
>>> +} __packed;
>>
>> The spec is ambiguous here. In one place it states the stride is 256
>> bytes, but in another that you need to look at the GLB_GROUP_STRIDE
>> value. In practice we can rely on 4 byte alignment.
>>
>> I'm beginning to wonder if it's worth worrying about; I think I'll stop
>> here ;)
>
> Hehe. I'll add checks where I can in the parsing logic. I guess having
> things naturally aligned and making sure there's no overlap with other
> interfaces is a minimum.
Yes that would be good, and like I said there should be a clarification
in later specs that everything is (at least) 4 byte aligned.
Apparently the 256 byte stride mentioned in one place was due to the way
the structure was expressed in the XML and the XML->HTML tool
calculating it. Or in one word: 'wrong'! ;)
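For the stride itself, a minimal version of those checks might look like
this (a sketch, with csg_ctrl being a hypothetical pointer to the parsed
panthor_fw_csg_control_iface):

	/* Per-stream blocks must not overlap and must keep the control
	 * interface naturally aligned.
	 */
	if (!IS_ALIGNED(csg_ctrl->stream_stride, sizeof(u32)) ||
	    csg_ctrl->stream_stride < sizeof(struct panthor_fw_cs_control_iface)) {
		drm_err(&ptdev->base, "Invalid CS stride: 0x%x\n",
			csg_ctrl->stream_stride);
		return -EINVAL;
	}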
Steve