[Intel-gfx] [PATCH 48/49] drm/i915: Eliminate vmap overhead for cmd parser
Chris Wilson
chris at chris-wilson.co.uk
Fri Mar 27 04:02:20 PDT 2015
With a little complexity to handle cmds straddling page boundaries, we
can completely avoiding having to vmap the batch and the shadow batch
objects whilst running the command parser.
On ivb i7-3720MQ:
x11perf -dot before 54.3M, after 53.2M (max 203M)
glxgears before 7110 fps, after 7300 fps (max 7860 fps)
Before:
Time to blt 16384 bytes x 1: 12.400µs, 1.2GiB/s
Time to blt 16384 bytes x 4096: 3.055µs, 5.0GiB/s
After:
Time to blt 16384 bytes x 1: 8.600µs, 1.8GiB/s
Time to blt 16384 bytes x 4096: 2.456µs, 6.2GiB/s
Removing the vmap is mostly a win, except we lose in a few cases where
the batch size is greater than a page due to the extra complexity (loss
of a simple cache efficient large copy, and boundary handling).
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_cmd_parser.c | 297 +++++++++++++++++----------------
1 file changed, 150 insertions(+), 147 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 9605ff8f2fcd..60b30b4165d4 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -818,100 +818,6 @@ static bool valid_reg(const u32 *table, int count, u32 addr)
return false;
}
-static u32 *vmap_batch(struct drm_i915_gem_object *obj,
- unsigned start, unsigned len)
-{
- int i;
- void *addr = NULL;
- struct sg_page_iter sg_iter;
- int first_page = start >> PAGE_SHIFT;
- int last_page = (len + start + 4095) >> PAGE_SHIFT;
- int npages = last_page - first_page;
- struct page **pages;
-
- pages = drm_malloc_ab(npages, sizeof(*pages));
- if (pages == NULL) {
- DRM_DEBUG_DRIVER("Failed to get space for pages\n");
- goto finish;
- }
-
- i = 0;
- for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, first_page) {
- pages[i++] = sg_page_iter_page(&sg_iter);
- if (i == npages)
- break;
- }
-
- addr = vmap(pages, i, 0, PAGE_KERNEL);
- if (addr == NULL) {
- DRM_DEBUG_DRIVER("Failed to vmap pages\n");
- goto finish;
- }
-
-finish:
- if (pages)
- drm_free_large(pages);
- return (u32*)addr;
-}
-
-/* Returns a vmap'd pointer to dest_obj, which the caller must unmap */
-static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
- struct drm_i915_gem_object *src_obj,
- u32 batch_start_offset,
- u32 batch_len)
-{
- int needs_clflush = 0;
- void *src_base, *src;
- void *dst = NULL;
- int ret;
-
- if (batch_len > dest_obj->base.size ||
- batch_len + batch_start_offset > src_obj->base.size)
- return ERR_PTR(-E2BIG);
-
- if (WARN_ON(dest_obj->pages_pin_count == 0))
- return ERR_PTR(-ENODEV);
-
- ret = i915_gem_obj_prepare_shmem_read(src_obj, &needs_clflush);
- if (ret) {
- DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
- return ERR_PTR(ret);
- }
-
- src_base = vmap_batch(src_obj, batch_start_offset, batch_len);
- if (!src_base) {
- DRM_DEBUG_DRIVER("CMD: Failed to vmap batch\n");
- ret = -ENOMEM;
- goto unpin_src;
- }
-
- ret = i915_gem_object_set_to_cpu_domain(dest_obj, true);
- if (ret) {
- DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
- goto unmap_src;
- }
-
- dst = vmap_batch(dest_obj, 0, batch_len);
- if (!dst) {
- DRM_DEBUG_DRIVER("CMD: Failed to vmap shadow batch\n");
- ret = -ENOMEM;
- goto unmap_src;
- }
-
- src = src_base + offset_in_page(batch_start_offset);
- if (needs_clflush)
- drm_clflush_virt_range(src, batch_len);
-
- memcpy(dst, src, batch_len);
-
-unmap_src:
- vunmap(src_base);
-unpin_src:
- i915_gem_object_unpin_pages(src_obj);
-
- return ret ? ERR_PTR(ret) : dst;
-}
-
/**
* i915_needs_cmd_parser() - should a given ring use software command parsing?
* @ring: the ring in question
@@ -1046,16 +952,34 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
u32 batch_len,
bool is_master)
{
- u32 *cmd, *batch_base, *batch_end;
+ u32 tmp[128];
+ struct sg_page_iter src_iter, dst_iter;
+ const struct drm_i915_cmd_descriptor *desc;
+ int needs_clflush = 0;
+ void *src, *dst;
+ unsigned in, out;
+ u32 *buf, partial = 0, length;
struct drm_i915_cmd_descriptor default_desc = { 0 };
bool oacontrol_set = false; /* OACONTROL tracking. See check_cmd() */
int ret = 0;
- batch_base = copy_batch(shadow_batch_obj, batch_obj,
- batch_start_offset, batch_len);
- if (IS_ERR(batch_base)) {
- DRM_DEBUG_DRIVER("CMD: Failed to copy batch\n");
- return PTR_ERR(batch_base);
+ if (batch_len > shadow_batch_obj->base.size ||
+ batch_len + batch_start_offset > batch_obj->base.size)
+ return -E2BIG;
+
+ if (WARN_ON(shadow_batch_obj->pages_pin_count == 0))
+ return -ENODEV;
+
+ ret = i915_gem_obj_prepare_shmem_read(batch_obj, &needs_clflush);
+ if (ret) {
+ DRM_DEBUG_DRIVER("CMD: failed to prepare shadow batch\n");
+ return ret;
+ }
+
+ ret = i915_gem_object_set_to_cpu_domain(shadow_batch_obj, true);
+ if (ret) {
+ DRM_DEBUG_DRIVER("CMD: Failed to set shadow batch to CPU\n");
+ goto unpin;
}
/*
@@ -1063,54 +987,136 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
* large or larger and copy_batch() will write MI_NOPs to the extra
* space. Parsing should be faster in some cases this way.
*/
- batch_end = batch_base + (batch_len / sizeof(*batch_end));
+ ret = -EINVAL;
+
+ __sg_page_iter_start(&dst_iter,
+ shadow_batch_obj->pages->sgl,
+ shadow_batch_obj->pages->nents,
+ 0);
+ __sg_page_iter_next(&dst_iter);
+ dst = kmap_atomic(sg_page_iter_page(&dst_iter));
+
+ in = offset_in_page(batch_start_offset);
+ out = 0;
+ for_each_sg_page(batch_obj->pages->sgl,
+ &src_iter,
+ batch_obj->pages->nents,
+ batch_start_offset >> PAGE_SHIFT) {
+ u32 this, i, j, k;
+ u32 *cmd, *page_end, *batch_end;
+
+ this = batch_len;
+ if (this > PAGE_SIZE - in)
+ this = PAGE_SIZE - in;
+
+ src = kmap_atomic(sg_page_iter_page(&src_iter));
+ if (needs_clflush)
+ drm_clflush_virt_range(src + in, this);
+
+ i = this;
+ j = in;
+ do {
+ /* We keep dst around so that we do not blow
+ * the CPU caches immediately after the copy (due
+ * to the kunmap_atomic(dst) doing a TLB flush.
+ */
+ if (out == PAGE_SIZE) {
+ __sg_page_iter_next(&dst_iter);
+ kunmap_atomic(dst);
+ dst = kmap_atomic(sg_page_iter_page(&dst_iter));
+ out = 0;
+ }
- cmd = batch_base;
- while (cmd < batch_end) {
- const struct drm_i915_cmd_descriptor *desc;
- u32 length;
+ k = i;
+ if (k > PAGE_SIZE - out)
+ k = PAGE_SIZE - out;
+ if (k == PAGE_SIZE)
+ copy_page(dst, src);
+ else
+ memcpy(dst + out, src + j, k);
+
+ out += k;
+ j += k;
+ i -= k;
+ } while (i);
+
+ cmd = src + in;
+ page_end = (void *)cmd + this;
+ batch_end = (void *)cmd + batch_len;
+
+ if (partial) {
+ memcpy(tmp + partial, cmd, (length - partial)*4);
+ cmd -= partial;
+ partial = 0;
+ buf = tmp;
+ goto check;
+ }
- if (*cmd == MI_BATCH_BUFFER_END)
- break;
+ do {
+ if (*cmd == MI_BATCH_BUFFER_END) {
+ ret = 0;
+ goto unmap_src;
+ }
- desc = find_cmd(ring, *cmd, &default_desc);
- if (!desc) {
- DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
- *cmd);
- ret = -EINVAL;
- break;
- }
+ desc = find_cmd(ring, *cmd, &default_desc);
+ if (!desc) {
+ DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
+ *cmd);
+ goto unmap_src;
+ }
- /*
- * If the batch buffer contains a chained batch, return an
- * error that tells the caller to abort and dispatch the
- * workload as a non-secure batch.
- */
- if (desc->cmd.value == MI_BATCH_BUFFER_START) {
- ret = -EACCES;
- break;
- }
+ /*
+ * If the batch buffer contains a chained batch, return an
+ * error that tells the caller to abort and dispatch the
+ * workload as a non-secure batch.
+ */
+ if (desc->cmd.value == MI_BATCH_BUFFER_START) {
+ ret = -EACCES;
+ goto unmap_src;
+ }
- if (desc->flags & CMD_DESC_FIXED)
- length = desc->length.fixed;
- else
- length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
-
- if ((batch_end - cmd) < length) {
- DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
- *cmd,
- length,
- batch_end - cmd);
- ret = -EINVAL;
- break;
- }
+ if (desc->flags & CMD_DESC_FIXED)
+ length = desc->length.fixed;
+ else
+ length = ((*cmd & desc->length.mask) + LENGTH_BIAS);
+
+ if (cmd + length > page_end) {
+ if (length + cmd > batch_end) {
+ DRM_DEBUG_DRIVER("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n",
+ *cmd, length, batch_end - cmd);
+ goto unmap_src;
+ }
+
+ if (WARN_ON(length > sizeof(tmp)/4)) {
+ ret = -ENODEV;
+ goto unmap_src;
+ }
+
+ partial = page_end - cmd;
+ memcpy(tmp, cmd, partial*4);
+ break;
+ }
+
+ buf = cmd;
+check:
+ if (!check_cmd(ring, desc, buf, is_master, &oacontrol_set))
+ goto unmap_src;
- if (!check_cmd(ring, desc, cmd, is_master, &oacontrol_set)) {
- ret = -EINVAL;
+ cmd += length;
+ } while (cmd < page_end);
+
+ kunmap_atomic(src);
+
+ batch_len -= this;
+ if (batch_len == 0)
break;
- }
- cmd += length;
+ in = 0;
+ continue;
+
+unmap_src:
+ kunmap_atomic(src);
+ goto unmap_dst;
}
if (oacontrol_set) {
@@ -1118,13 +1124,10 @@ int i915_parse_cmds(struct intel_engine_cs *ring,
ret = -EINVAL;
}
- if (cmd >= batch_end) {
- DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
- ret = -EINVAL;
- }
-
- vunmap(batch_base);
-
+unmap_dst:
+ kunmap_atomic(dst);
+unpin:
+ i915_gem_object_unpin_pages(batch_obj);
return ret;
}
--
2.1.4
More information about the Intel-gfx
mailing list