[PATCH i-g-t 1/3] lib/intel_blt: Handle multiple ctrl-surf-copy blits on big surface

Juha-Pekka Heikkilä juhapekka.heikkila at gmail.com
Fri Nov 15 13:10:35 UTC 2024


All looks OK to me; the entire set is

Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>

On Fri, Nov 8, 2024 at 8:47 AM Zbigniew Kempczyński
<zbigniew.kempczynski at intel.com> wrote:
>
> Copying CCS data for bigger surfaces requires several blits. This is
> caused by the 10-bit size field (on Xe and Xe2+) of the ctrl-surf-copy
> command. To handle surfaces bigger than 64MiB on Xe and 4MiB on Xe2+
> we need to split the copy into blits that each fit in that 10-bit
> size.
>
> The change introduces a loop over the big surface that produces a batch
> with several ctrl-surf-copy blits.
>
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
> ---
>  lib/intel_blt.c | 121 ++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 86 insertions(+), 35 deletions(-)
>
> diff --git a/lib/intel_blt.c b/lib/intel_blt.c
> index 1b0f279177..b2fb3151e0 100644
> --- a/lib/intel_blt.c
> +++ b/lib/intel_blt.c
> @@ -1115,7 +1115,13 @@ int blt_block_copy(int fd,
>         return ret;
>  }
>
> -static uint16_t __ccs_size(int fd, const struct blt_ctrl_surf_copy_data *surf)
> +/*
> + * Calculates the number of CCS bytes used for the surface. The size field
> + * in the ctrl-surf-copy command is 10 bits long on Xe and Xe2, so the
> + * caller has to split the CCS copy of bigger surfaces into several commands.
> + * This function returns the total size of CCS data used for the surface.
> + */
> +static uint32_t __ccs_size(int fd, const struct blt_ctrl_surf_copy_data *surf)
>  {
>         uint32_t src_size, dst_size;
>         uint16_t ccsratio = CCS_RATIO(fd);
> @@ -1286,8 +1292,8 @@ uint64_t emit_blt_ctrl_surf_copy(int fd,
>         uint64_t dst_offset, src_offset, bb_offset, alignment;
>         uint32_t bbe = MI_BATCH_BUFFER_END;
>         uint32_t *bb;
> -       uint16_t num_ccs_blocks = (ip_ver >= IP_VER(20, 0)) ?
> -                               (xe_get_default_alignment(fd) / CCS_RATIO(fd)) : CCS_RATIO(fd);
> +       uint32_t ccs_per_page, max_blocks, src_step, dst_step;
> +       int32_t left_blocks;
>
>         igt_assert_f(ahnd, "ctrl-surf-copy supports softpin only\n");
>         igt_assert_f(surf, "ctrl-surf-copy requires data to do ctrl-surf-copy blit\n");
> @@ -1299,51 +1305,108 @@ uint64_t emit_blt_ctrl_surf_copy(int fd,
>                                           alignment, surf->dst.pat_index);
>         bb_offset = get_offset(ahnd, surf->bb.handle, surf->bb.size, alignment);
>
> +       bb = bo_map(fd, surf->bb.handle, surf->bb.size, surf->driver);
> +
> +       /*
> +        * Copying in/out CCS data is limited by the size of the
> +        * size_of_ctrl_copy bitfield, which means an operation on a bigger
> +        * surface needs to be split into several ctrl-surf-copy commands.
> +        *
> +        * Instead of hardcoding the maximum number of blocks copied in a
> +        * single command we may calculate it from the bitfield size (Xe and Xe2
> +        * differ in bitfield location [and future platforms potentially in size]).
> +        */
>         if (ip_ver >= IP_VER(20, 0)) {
> +               ccs_per_page = SZ_4K / CCS_RATIO(fd);
> +
>                 data.xe2.dw00.client = 0x2;
>                 data.xe2.dw00.opcode = 0x48;
>                 data.xe2.dw00.src_access_type = surf->src.access_type;
>                 data.xe2.dw00.dst_access_type = surf->dst.access_type;
> -
> -               /* Ensure dst has size capable to keep src ccs aux */
> -               data.xe2.dw00.size_of_ctrl_copy = __ccs_size(fd, surf) / num_ccs_blocks - 1;
>                 data.xe2.dw00.length = 0x3;
>
> -               data.xe2.dw01.src_address_lo = src_offset;
> -               data.xe2.dw02.src_address_hi = src_offset >> 32;
> +               data.xe2.dw00.size_of_ctrl_copy = -1;
> +               max_blocks = data.xe2.dw00.size_of_ctrl_copy + 1;
> +
> +               /*
> +                * For Xe2+ each size_of_ctrl_copy increment covers 4K page size
> +                * of surface, which in turn is covered by 8B of CCS data.
> +                */
> +               src_step = surf->src.access_type == DIRECT_ACCESS ? ccs_per_page : SZ_4K;
> +               src_step *= max_blocks;
> +               dst_step = surf->dst.access_type == DIRECT_ACCESS ? ccs_per_page : SZ_4K;
> +               dst_step *= max_blocks;
> +
>                 data.xe2.dw02.src_mocs_index = surf->src.mocs_index;
> -
> -               data.xe2.dw03.dst_address_lo = dst_offset;
> -               data.xe2.dw04.dst_address_hi = dst_offset >> 32;
>                 data.xe2.dw04.dst_mocs_index = surf->dst.mocs_index;
>
>                 data_sz = sizeof(data.xe2);
>         } else {
> +               ccs_per_page = SZ_64K / CCS_RATIO(fd);
> +
>                 data.gen12.dw00.client = 0x2;
>                 data.gen12.dw00.opcode = 0x48;
>                 data.gen12.dw00.src_access_type = surf->src.access_type;
>                 data.gen12.dw00.dst_access_type = surf->dst.access_type;
> -
> -               /* Ensure dst has size capable to keep src ccs aux */
> -               data.gen12.dw00.size_of_ctrl_copy = __ccs_size(fd, surf) / num_ccs_blocks - 1;
>                 data.gen12.dw00.length = 0x3;
>
> -               data.gen12.dw01.src_address_lo = src_offset;
> -               data.gen12.dw02.src_address_hi = src_offset >> 32;
> +               data.gen12.dw00.size_of_ctrl_copy = -1;
> +               max_blocks = data.gen12.dw00.size_of_ctrl_copy + 1;
> +
> +               /*
> +                * For Xe each size_of_ctrl_copy increment covers 64K page size
> +                * of surface, which in turn is covered by 256B of CCS data.
> +                */
> +               src_step = surf->src.access_type == DIRECT_ACCESS ? ccs_per_page : SZ_64K;
> +               src_step *= max_blocks;
> +               dst_step = surf->dst.access_type == DIRECT_ACCESS ? ccs_per_page : SZ_64K;
> +               dst_step *= max_blocks;
> +
>                 data.gen12.dw02.src_mocs_index = surf->src.mocs_index;
> -
> -               data.gen12.dw03.dst_address_lo = dst_offset;
> -               data.gen12.dw04.dst_address_hi = dst_offset >> 32;
>                 data.gen12.dw04.dst_mocs_index = surf->dst.mocs_index;
>
>                 data_sz = sizeof(data.gen12);
>         }
>
> -       bb = bo_map(fd, surf->bb.handle, surf->bb.size, surf->driver);
> +       left_blocks = __ccs_size(fd, surf) / ccs_per_page;
>
> -       igt_assert(bb_pos + data_sz < surf->bb.size);
> -       memcpy(bb + bb_pos, &data, data_sz);
> -       bb_pos += data_sz;
> +       while (left_blocks > 0) {
> +               int32_t nblocks = min_t(int32_t, left_blocks, max_blocks);
> +
> +               if (ip_ver >= IP_VER(20, 0)) {
> +                       data.xe2.dw00.size_of_ctrl_copy = nblocks - 1;
> +                       data.xe2.dw01.src_address_lo = src_offset;
> +                       data.xe2.dw02.src_address_hi = src_offset >> 32;
> +                       data.xe2.dw03.dst_address_lo = dst_offset;
> +                       data.xe2.dw04.dst_address_hi = dst_offset >> 32;
> +               } else {
> +                       data.gen12.dw00.size_of_ctrl_copy = nblocks - 1;
> +                       data.gen12.dw01.src_address_lo = src_offset;
> +                       data.gen12.dw02.src_address_hi = src_offset >> 32;
> +                       data.gen12.dw03.dst_address_lo = dst_offset;
> +                       data.gen12.dw04.dst_address_hi = dst_offset >> 32;
> +               }
> +
> +               left_blocks -= max_blocks;
> +               dst_offset += dst_step;
> +               src_offset += src_step;
> +
> +               igt_assert(bb_pos + data_sz < surf->bb.size);
> +               memcpy(bb + bb_pos, &data, data_sz);
> +               bb_pos += data_sz;
> +
> +               if (surf->print_bb) {
> +                       igt_info("[CTRL SURF]:\n");
> +                       igt_info("src offset: 0x%" PRIx64 ", dst offset: 0x%" PRIx64
> +                                ", bb offset: 0x%" PRIx64 ", copy nblocks: 0x%x\n",
> +                                src_offset, dst_offset, bb_offset, nblocks);
> +
> +                       if (ip_ver >= IP_VER(20, 0))
> +                               xe2_dump_bb_surf_ctrl_cmd(&data.xe2);
> +                       else
> +                               dump_bb_surf_ctrl_cmd(&data.gen12);
> +               }
> +       }
>
>         if (emit_bbe) {
>                 igt_assert(bb_pos + sizeof(uint32_t) < surf->bb.size);
> @@ -1351,18 +1414,6 @@ uint64_t emit_blt_ctrl_surf_copy(int fd,
>                 bb_pos += sizeof(uint32_t);
>         }
>
> -       if (surf->print_bb) {
> -               igt_info("[CTRL SURF]:\n");
> -               igt_info("src offset: %" PRIx64 ", dst offset: %" PRIx64
> -                        ", bb offset: %" PRIx64 "\n",
> -                        src_offset, dst_offset, bb_offset);
> -
> -               if (ip_ver >= IP_VER(20, 0))
> -                       xe2_dump_bb_surf_ctrl_cmd(&data.xe2);
> -               else
> -                       dump_bb_surf_ctrl_cmd(&data.gen12);
> -       }
> -
>         munmap(bb, surf->bb.size);
>
>         return bb_pos;
> --
> 2.34.1
>


More information about the igt-dev mailing list