[Intel-gfx] [PATCH v3] drm/i915: Eliminate vmap overhead for cmd parser
Ville Syrjälä
ville.syrjala at linux.intel.com
Wed Nov 25 11:51:08 PST 2015
On Fri, Nov 20, 2015 at 03:31:23PM +0000, Chris Wilson wrote:
> With a little complexity to handle cmds straddling page boundaries, we
> can completely avoiding having to vmap the batch and the shadow batch
> objects whilst running the command parser.
>
> On ivb i7-3720MQ:
>
> x11perf -dot before 54.3M, after 53.2M (max 203M)
> glxgears before 7110 fps, after 7300 fps (max 7860 fps)
>
> Before:
> Time to blt 16384 bytes x 1: 12.400µs, 1.2GiB/s
> Time to blt 16384 bytes x 4096: 3.055µs, 5.0GiB/s
>
> After:
> Time to blt 16384 bytes x 1: 8.600µs, 1.8GiB/s
> Time to blt 16384 bytes x 4096: 2.456µs, 6.2GiB/s
>
> Removing the vmap is mostly a win, except we lose in a few cases where
> the batch size is greater than a page due to the extra complexity (loss
> of a simple cache efficient large copy, and boundary handling).
>
> v2: Reorder so that we do check oacontrol remaining set at end-of-batch
> v3: Stage the copy through a temporary page so that we only copy into
> dst commands that have been verified.
> v4: Move the batch offset/length validation to execbuf.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Ville Syrjälä <ville.syrjala at linux.intel.com>
> ---
> drivers/gpu/drm/i915/i915_cmd_parser.c | 258 ++++++++++++++++-----------------
> 1 file changed, 127 insertions(+), 131 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index 814d894ed925..86910e17505a 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -860,100 +860,6 @@ find_reg(const struct drm_i915_reg_descriptor *table,
> return NULL;
> }
>
> -static u32 *vmap_batch(struct drm_i915_gem_object *obj,
> - unsigned start, unsigned len)
> -{
> - int i;
> - void *addr = NULL;
> - struct sg_page_iter sg_iter;
> - int first_page = start >> PAGE_SHIFT;
> - int last_page = (len + start + 4095) >> PAGE_SHIFT;
> - int npages = last_page - first_page;
> - struct page **pages;
> -
> - pages = drm_malloc_ab(npages, sizeof(*pages));
> - if (pages == NULL) {
> - DRM_DEBUG_DRIVER("Failed to get space for pages\n");
> - goto finish;
> - }
> -
> - i = 0;
> - for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
> - pages[i++] = sg_page_iter_page(&sg_iter);
> - if (i == npages)
> - break;
> - }
> -
> - addr = vmap(pages, i, 0, PAGE_KERNEL);
> - if (addr == NULL) {
> - DRM_DEBUG_DRIVER("Failed to vmap pages\n");
> - goto finish;
> - }
> -
> -finish:
> - if (pages)
> - drm_free_large(pages);
> - return (u32*)addr;
> -}
> -
> -/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
> -static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
> - struct drm_i915_gem_object *src_obj,
> - u32 batch_start_offset,
> - u32 batch_len)
> -{
> - int needs_clflush = 0;
> - void *src_base, *src;
> - void *dst = NULL;
> - int ret;
> -
> - if (batch_len > dest_obj->base.size ||
> - batch_len + batch_start_offset > src_obj->base.size)
> - return ERR_PTR(-E2BIG);
> -
> - if (WARN_ON(dest_obj->pages_pin_count == 0))
> - return ERR_PTR(-ENODEV);
> -
> - ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
> - if (ret) {
> - DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> - return ERR_PTR(ret);
> - }
> -
> - src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
> - if (!src_base) {
> - DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
> - ret = -ENOMEM;
> - goto unpin_src;
> - }
> -
> - ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
> - if (ret) {
> - DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> - goto unmap_src;
> - }
> -
> - dst = vmap_batch(dest_obj, 0, batch_len);
> - if (!dst) {
> - DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
> - ret = -ENOMEM;
> - goto unmap_src;
> - }
> -
> - src = src_base + offset_in_page(batch_start_offset);
> - if (needs_clflush)
> - drm_clflush_virt_range(src, batch_len);
> -
> - memcpy(dst, src, batch_len);
> -
> -unmap_src:
> - vunmap(src_base);
> -unpin_src:
> - i915_gem_object_unpin_pages(src_obj);
> -
> - return ret ? ERR_PTR(ret) : dst;
> -}
> -
> /**
> * i915_needs_cmd_parser() - should a given ring use software command parsing?
> * @ring: the ring in question
> @@ -1097,6 +1003,7 @@ static bool check_cmd(const struct intel_engine_cs *ring,
> }
>
> #define LENGTH_BIAS 2
> +#define MAX_PARTIAL 256
There seems to be some confusion about whether this is bytes or dwords.
Also, I guess we already end up allocating two pages anyway, so
maybe MAX_PARTIAL should just be one page? It's still not big
enough to cover the max legal cmd length AFAICS, so I think
the WARN in the check needs to be removed.
>
> /**
> * i915_parse_cmds() - parse a submitted batch buffer for privilege violations
> @@ -1120,39 +1027,91 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
> u32 batch_len,
> bool is_master)
> {
> - u32 *cmd, *batch_base, *batch_end;
> + const struct drm_i915_cmd_descriptor *desc;
> + unsigned dst_iter, src_iter;
> + int needs_clflush = 0;
> + struct get_page rewind;
> + void *src, *dst, *tmp;
> + u32 partial, length;
> + unsigned in, out;
> struct drm_i915_cmd_descriptor default_desc = { 0 };
> bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
> - int ret = 0;
> + int ret;
>
> - batch_base = copy_batch(shadow_batch_obj, batch_obj,
> - batch_start_offset, batch_len);
> - if (IS_ERR(batch_base)) {
> - DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
> - return PTR_ERR(batch_base);
> + if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
> + return -ENODEV;
> +
> + ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
> + if (ret) {
> + DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
> + return ret;
> }
>
> - /*
> - * We use the batch length as size because the shadow object is as
> - * large or larger and copy_batch() will write MI_NOPs to the extra
> - * space. Parsing should be faster in some cases this way.
> + ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
> + if (ret) {
> + DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
> + goto unpin;
> + }
> +
> + tmp = kmalloc(PAGE_SIZE + MAX_PARTIAL, GFP_TEMPORARY);
> + if (tmp == NULL)
> + return -ENOMEM;
> +
> + /* Keep the original cached iterator around as that is likely
> + * to be more useful in future ioctls.
> */
> - batch_end = batch_base + (batch_len / sizeof(*batch_end));
> + rewind = batch_obj->get_page;
>
> - cmd = batch_base;
> - while (cmd < batch_end) {
> - const struct drm_i915_cmd_descriptor *desc;
> - u32 length;
> + ret = -EINVAL;
> +
> + dst_iter = 0;
> + dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, dst_iter));
> + out = 0;
> +
> + in = offset_in_page(batch_start_offset);
> + partial = 0;
> + for (src_iter = batch_start_offset >> PAGE_SHIFT;
> + src_iter < batch_obj->base.size >> PAGE_SHIFT;
> + src_iter++) {
So we're iterating over all the pages. It should be enough to iterate
until batch_start_offset+batch_len I suppose, but as long as we bail
out when we run out of batch it should be fine.
I see there's a batch_len check at the end, but I don't see us handling
the case where the user already gives us something with batch_len==0.
Maybe that should be rejected somewhere higher up?
Also what happens if we don't find MI_BATCH_BUFFER_END before running
out of batch? Oh, I see, we set ret=-EINVAL, and clear it to 0 when we
find MI_BATCH_BUFFER_END. So that part seems to be fine.
> + u32 *cmd, *page_end, *batch_end;
> + u32 this;
> +
> + this = batch_len;
I was a bit concerned about batch_len & 3, but we already check for
batch_len&7==0 in i915_gem_check_execbuffer(), so it should be good here.
> + if (this > PAGE_SIZE - in)
> + this = PAGE_SIZE - in;
>
> - if (*cmd == MI_BATCH_BUFFER_END)
> + src = kmap_atomic(i915_gem_object_get_page(batch_obj, src_iter));
> + if (needs_clflush)
> + drm_clflush_virt_range(src + in, this);
> +
> + if (this == PAGE_SIZE && partial == 0)
> + copy_page(tmp, src);
> + else
> + memcpy(tmp+partial, src+in, this);
> +
> + cmd = tmp;
> + page_end = tmp + this + partial;
> + batch_end = tmp + batch_len + partial;
> + partial = 0;
> +
> + do {
> + if (*cmd == MI_BATCH_BUFFER_END) {
> + if (oacontrol_set) {
> + DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> + goto unmap;
> + }
> +
> + cmd++; /* copy this cmd to dst */
> + batch_len = this; /* no more src */
> + ret = 0;
> break;
> + }
>
> desc = find_cmd(ring, *cmd, &default_desc);
> if (!desc) {
> DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
> *cmd);
> - ret = -EINVAL;
> - break;
> + goto unmap;
> }
>
> /*
> @@ -1162,7 +1121,7 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
> */
> if (desc->cmd.value == MI_BATCH_BUFFER_START) {
> ret = -EACCES;
Bleh. I wonder why we even bother with this thing on VLV/IVB when
it's that easy to bypass...
> - break;
> + goto unmap;
> }
>
> if (desc->flags & CMD_DESC_FIXED)
> @@ -1170,36 +1129,73 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
> else
> length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
>
> - if ((batch_end - cmd) < length) {
> + if (unlikely(cmd + length > page_end)) {
> + if (unlikely(cmd + length > batch_end)) {
> DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
> - *cmd,
> - length,
> - batch_end - cmd);
> - ret = -EINVAL;
> - break;
> + *cmd, length, batch_end - cmd);
> + goto unmap;
> }
>
> - if (!check_cmd(ring, desc, cmd, length, is_master,
> - &oacontrol_set)) {
> - ret = -EINVAL;
> + if (WARN_ON(length > MAX_PARTIAL)) {
> + ret = -ENODEV;
> + goto unmap;
> + }
> +
> + partial = 4*(page_end - cmd);
> break;
> }
>
> + if (!check_cmd(ring, desc, cmd, length, is_master,
> + &oacontrol_set))
> + goto unmap;
> +
> cmd += length;
> - }
> + } while (cmd < page_end);
>
> - if (oacontrol_set) {
> - DRM_DEBUG_DRIVER("CMD: batch set OACONTROL but did not clear it\n");
> - ret = -EINVAL;
> + /* Copy the validated page to the GPU batch */
> + {
> + /* exclude partial cmd, we copy it next time */
> + unsigned dst_length = (void *)cmd - tmp;
> + in = 0;
> + do {
> + int len;
> +
> + if (out == PAGE_SIZE) {
> + kunmap_atomic(dst);
> + dst = kmap_atomic(i915_gem_object_get_page(shadow_batch_obj, ++dst_iter));
> + out = 0;
> }
>
> - if (cmd >= batch_end) {
> - DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
> - ret = -EINVAL;
> + len = dst_length;
> + if (len + out > PAGE_SIZE)
> + len = PAGE_SIZE - out;
> + if (len == PAGE_SIZE)
> + copy_page(dst, tmp + in);
> + else
> + memcpy(dst + out, tmp + in, len);
> + in += len;
> + out += len;
> + dst_length -= len;
> + } while (dst_length);
> }
>
> - vunmap(batch_base);
> + batch_len -= this;
> + if (batch_len == 0)
> + break;
>
> + if (partial)
> + memmove(tmp, cmd, partial);
> +
> + kunmap_atomic(src);
> + in = 0;
> + }
> +unmap:
> + kunmap_atomic(src);
> + kunmap_atomic(dst);
> + batch_obj->get_page = rewind;
> + kfree(tmp);
> +unpin:
> + i915_gem_object_unpin_pages(batch_obj);
> return ret;
> }
>
> --
> 2.6.2
--
Ville Syrjälä
Intel OTC
More information about the Intel-gfx
mailing list