[PATCH i-g-t 1/3] lib/intel_blt: Handle multiple ctrl-surf-copy blits on big surface
Juha-Pekka Heikkilä
juhapekka.heikkila at gmail.com
Fri Nov 15 13:10:35 UTC 2024
All looks ok to me, the entire set is
Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
On Fri, Nov 8, 2024 at 8:47 AM Zbigniew Kempczyński
<zbigniew.kempczynski at intel.com> wrote:
>
> Copying CCS data for bigger surfaces requires a couple of blits. This
> is caused by the 10-bit size limitation (for Xe and Xe2+) of the
> ctrl-surf-copy operation. To handle surfaces bigger than 64MiB on Xe
> and 4MiB on Xe2+ we need to split the copy into blits which fit in
> the mentioned 10-bit size.
>
> The change introduces a loop over the big surface to produce a batch
> with a couple of ctrl-surf-copy blits.
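
Side note for readers, not part of the patch: with a 10-bit size field one
blit covers at most 1024 * 64K = 64MiB of surface on Xe and 1024 * 4K = 4MiB
on Xe2+, which is where the limits above come from. A minimal standalone
sketch of that arithmetic (hypothetical surface size, nothing taken from
lib/intel_blt.c):

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          const uint32_t max_blocks = 1u << 10;   /* 10-bit size field -> 1024 increments */
          const uint64_t xe_block  = 64 * 1024;   /* Xe: 64K of surface per increment */
          const uint64_t xe2_block = 4 * 1024;    /* Xe2+: 4K of surface per increment */
          const uint64_t surface = 256ull << 20;  /* hypothetical 256MiB surface */

          /* blits needed = ceil(blocks / max_blocks) */
          uint64_t xe_blits  = (surface / xe_block  + max_blocks - 1) / max_blocks;
          uint64_t xe2_blits = (surface / xe2_block + max_blocks - 1) / max_blocks;

          printf("Xe: %" PRIu64 " blits, Xe2+: %" PRIu64 " blits\n",
                 xe_blits, xe2_blits);
          return 0;
  }
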
>
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
> ---
> lib/intel_blt.c | 121 ++++++++++++++++++++++++++++++++++--------------
> 1 file changed, 86 insertions(+), 35 deletions(-)
>
> diff --git a/lib/intel_blt.c b/lib/intel_blt.c
> index 1b0f279177..b2fb3151e0 100644
> --- a/lib/intel_blt.c
> +++ b/lib/intel_blt.c
> @@ -1115,7 +1115,13 @@ int blt_block_copy(int fd,
> return ret;
> }
>
> -static uint16_t __ccs_size(int fd, const struct blt_ctrl_surf_copy_data *surf)
> +/*
> + * Calculate the number of CCS bytes used for the surface. The size
> + * field in the ctrl-surf-copy command struct is 10 bits long on Xe and
> + * Xe2, so the caller has to split the CCS copy of bigger surfaces into
> + * a couple of separate commands. Returns the total CCS size for the surface.
> + */
> +static uint32_t __ccs_size(int fd, const struct blt_ctrl_surf_copy_data *surf)
> {
> uint32_t src_size, dst_size;
> uint16_t ccsratio = CCS_RATIO(fd);
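
As I read it, the helper boils down to scaling the surface size by
CCS_RATIO(fd) (256:1 on Xe, 512:1 on Xe2+ per the comments further down in
the patch). A minimal sketch of that relation, purely illustrative and not
the actual lib/intel_blt.c code:

  #include <assert.h>
  #include <stdint.h>

  /* CCS_RATIO bytes of main surface are described by 1 byte of CCS data */
  static uint32_t ccs_bytes(uint64_t surf_size, uint32_t ccs_ratio)
  {
          return surf_size / ccs_ratio;
  }

  int main(void)
  {
          /* e.g. a 64MiB surface with a 256:1 ratio needs 256KiB of CCS data */
          assert(ccs_bytes(64ull << 20, 256) == 256 * 1024);
          return 0;
  }
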
> @@ -1286,8 +1292,8 @@ uint64_t emit_blt_ctrl_surf_copy(int fd,
> uint64_t dst_offset, src_offset, bb_offset, alignment;
> uint32_t bbe = MI_BATCH_BUFFER_END;
> uint32_t *bb;
> - uint16_t num_ccs_blocks = (ip_ver >= IP_VER(20, 0)) ?
> - (xe_get_default_alignment(fd) / CCS_RATIO(fd)) : CCS_RATIO(fd);
> + uint32_t ccs_per_page, max_blocks, src_step, dst_step;
> + int32_t left_blocks;
>
> igt_assert_f(ahnd, "ctrl-surf-copy supports softpin only\n");
> igt_assert_f(surf, "ctrl-surf-copy requires data to do ctrl-surf-copy blit\n");
> @@ -1299,51 +1305,108 @@ uint64_t emit_blt_ctrl_surf_copy(int fd,
> alignment, surf->dst.pat_index);
> bb_offset = get_offset(ahnd, surf->bb.handle, surf->bb.size, alignment);
>
> + bb = bo_map(fd, surf->bb.handle, surf->bb.size, surf->driver);
> +
> + /*
> + * Copying CCS data in/out is limited by the size of the size_of_ctrl_copy
> + * bitfield, which means an operation on a bigger surface needs to be
> + * split into a couple of ctrl-surf-copy commands.
> + *
> + * Instead of hardcoding the maximum number of blocks copied in a single
> + * command we can calculate it from the bitfield size (Xe and Xe2 differ
> + * in bitfield location [and future platforms potentially in size]).
> + */
> if (ip_ver >= IP_VER(20, 0)) {
> + ccs_per_page = SZ_4K / CCS_RATIO(fd);
> +
> data.xe2.dw00.client = 0x2;
> data.xe2.dw00.opcode = 0x48;
> data.xe2.dw00.src_access_type = surf->src.access_type;
> data.xe2.dw00.dst_access_type = surf->dst.access_type;
> -
> - /* Ensure dst has size capable to keep src ccs aux */
> - data.xe2.dw00.size_of_ctrl_copy = __ccs_size(fd, surf) / num_ccs_blocks - 1;
> data.xe2.dw00.length = 0x3;
>
> - data.xe2.dw01.src_address_lo = src_offset;
> - data.xe2.dw02.src_address_hi = src_offset >> 32;
> + data.xe2.dw00.size_of_ctrl_copy = -1;
> + max_blocks = data.xe2.dw00.size_of_ctrl_copy + 1;
> +
> + /*
> + * For Xe2+ each size_of_ctrl_copy increment covers a 4K page of the
> + * surface, which in turn is covered by 8B of CCS data.
> + */
> + src_step = surf->src.access_type == DIRECT_ACCESS ? ccs_per_page : SZ_4K;
> + src_step *= max_blocks;
> + dst_step = surf->dst.access_type == DIRECT_ACCESS ? ccs_per_page : SZ_4K;
> + dst_step *= max_blocks;
> +
> data.xe2.dw02.src_mocs_index = surf->src.mocs_index;
> -
> - data.xe2.dw03.dst_address_lo = dst_offset;
> - data.xe2.dw04.dst_address_hi = dst_offset >> 32;
> data.xe2.dw04.dst_mocs_index = surf->dst.mocs_index;
>
> data_sz = sizeof(data.xe2);
> } else {
> + ccs_per_page = SZ_64K / CCS_RATIO(fd);
> +
> data.gen12.dw00.client = 0x2;
> data.gen12.dw00.opcode = 0x48;
> data.gen12.dw00.src_access_type = surf->src.access_type;
> data.gen12.dw00.dst_access_type = surf->dst.access_type;
> -
> - /* Ensure dst has size capable to keep src ccs aux */
> - data.gen12.dw00.size_of_ctrl_copy = __ccs_size(fd, surf) / num_ccs_blocks - 1;
> data.gen12.dw00.length = 0x3;
>
> - data.gen12.dw01.src_address_lo = src_offset;
> - data.gen12.dw02.src_address_hi = src_offset >> 32;
> + data.gen12.dw00.size_of_ctrl_copy = -1;
> + max_blocks = data.gen12.dw00.size_of_ctrl_copy + 1;
> +
> + /*
> + * For Xe each size_of_ctrl_copy increment covers a 64K page of the
> + * surface, which in turn is covered by 256B of CCS data.
> + */
> + src_step = surf->src.access_type == DIRECT_ACCESS ? ccs_per_page : SZ_64K;
> + src_step *= max_blocks;
> + dst_step = surf->dst.access_type == DIRECT_ACCESS ? ccs_per_page : SZ_64K;
> + dst_step *= max_blocks;
> +
> data.gen12.dw02.src_mocs_index = surf->src.mocs_index;
> -
> - data.gen12.dw03.dst_address_lo = dst_offset;
> - data.gen12.dw04.dst_address_hi = dst_offset >> 32;
> data.gen12.dw04.dst_mocs_index = surf->dst.mocs_index;
>
> data_sz = sizeof(data.gen12);
> }
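
The size_of_ctrl_copy = -1 trick above is a neat way to derive max_blocks
from the bitfield width instead of hardcoding 1024. For anyone puzzled by
it, a standalone illustration of the idiom with a hypothetical 10-bit field
(not the real command layout):

  #include <stdint.h>
  #include <stdio.h>

  struct cmd_dw00 {
          uint32_t size_of_ctrl_copy : 10;        /* encodes "blocks - 1" */
          uint32_t other             : 22;
  };

  int main(void)
  {
          struct cmd_dw00 dw = {};

          /* -1 truncated to the field width gives its maximum value, 1023 */
          dw.size_of_ctrl_copy = -1;

          /* the field encodes blocks - 1, so the maximum block count is 1024 */
          uint32_t max_blocks = dw.size_of_ctrl_copy + 1;

          printf("max_blocks = %u\n", max_blocks);
          return 0;
  }
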
>
> - bb = bo_map(fd, surf->bb.handle, surf->bb.size, surf->driver);
> + left_blocks = __ccs_size(fd, surf) / ccs_per_page;
>
> - igt_assert(bb_pos + data_sz < surf->bb.size);
> - memcpy(bb + bb_pos, &data, data_sz);
> - bb_pos += data_sz;
> + while (left_blocks > 0) {
> + int32_t nblocks = min_t(int32_t, left_blocks, max_blocks);
> +
> + if (ip_ver >= IP_VER(20, 0)) {
> + data.xe2.dw00.size_of_ctrl_copy = nblocks - 1;
> + data.xe2.dw01.src_address_lo = src_offset;
> + data.xe2.dw02.src_address_hi = src_offset >> 32;
> + data.xe2.dw03.dst_address_lo = dst_offset;
> + data.xe2.dw04.dst_address_hi = dst_offset >> 32;
> + } else {
> + data.gen12.dw00.size_of_ctrl_copy = nblocks - 1;
> + data.gen12.dw01.src_address_lo = src_offset;
> + data.gen12.dw02.src_address_hi = src_offset >> 32;
> + data.gen12.dw03.dst_address_lo = dst_offset;
> + data.gen12.dw04.dst_address_hi = dst_offset >> 32;
> + }
> +
> + left_blocks -= max_blocks;
> + dst_offset += dst_step;
> + src_offset += src_step;
> +
> + igt_assert(bb_pos + data_sz < surf->bb.size);
> + memcpy(bb + bb_pos, &data, data_sz);
> + bb_pos += data_sz;
> +
> + if (surf->print_bb) {
> + igt_info("[CTRL SURF]:\n");
> + igt_info("src offset: 0x%" PRIx64 ", dst offset: 0x%" PRIx64
> + ", bb offset: 0x%" PRIx64 ", copy nblocks: 0x%x\n",
> + src_offset, dst_offset, bb_offset, nblocks);
> +
> + if (ip_ver >= IP_VER(20, 0))
> + xe2_dump_bb_surf_ctrl_cmd(&data.xe2);
> + else
> + dump_bb_surf_ctrl_cmd(&data.gen12);
> + }
> + }
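
A quick worked example of the step math, as I read the step selection
above: the DIRECT_ACCESS address advances through CCS data (ccs_per_page
bytes per block) while the other address advances through the surface
itself (a full page per block). Illustrative numbers for Xe (64K per
increment, 256B of CCS per 64K page, both taken from the comments above):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          const uint32_t max_blocks = 1024;             /* from the 10-bit field */
          const uint32_t page = 64 * 1024;              /* Xe: 64K per size increment */
          const uint32_t ccs_per_page = page / 256;     /* 256B of CCS per 64K page */

          /* per-blit advance of each address in the loop above */
          uint32_t direct_step   = ccs_per_page * max_blocks;  /* 256KiB of CCS data */
          uint32_t indirect_step = page * max_blocks;          /* 64MiB of surface */

          printf("direct step: %u, indirect step: %u\n",
                 direct_step, indirect_step);
          return 0;
  }
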
>
> if (emit_bbe) {
> igt_assert(bb_pos + sizeof(uint32_t) < surf->bb.size);
> @@ -1351,18 +1414,6 @@ uint64_t emit_blt_ctrl_surf_copy(int fd,
> bb_pos += sizeof(uint32_t);
> }
>
> - if (surf->print_bb) {
> - igt_info("[CTRL SURF]:\n");
> - igt_info("src offset: %" PRIx64 ", dst offset: %" PRIx64
> - ", bb offset: %" PRIx64 "\n",
> - src_offset, dst_offset, bb_offset);
> -
> - if (ip_ver >= IP_VER(20, 0))
> - xe2_dump_bb_surf_ctrl_cmd(&data.xe2);
> - else
> - dump_bb_surf_ctrl_cmd(&data.gen12);
> - }
> -
> munmap(bb, surf->bb.size);
>
> return bb_pos;
> --
> 2.34.1
>