[Mesa-dev] [PATCH 46/51] i965: Introduce a context-local batch manager

Chris Wilson chris at chris-wilson.co.uk
Tue Jan 10 21:24:09 UTC 2017


When submitting commands to the GPU every cycle of latency counts;
mutexes, spinlocks, even atomics quickly add up to substantial overhead.

This "batch manager" acts as a thread-local shim over the buffer manager
(drm_intel_bufmgr_gem). As it is only ever used from within a single
context, we can rely on the upper layers to provide thread safety.
This allows us to import buffers from the shared screen (sharing buffers
between multiple contexts, threads and users) and wrap that handle in
our own. Similarly, we want to share the buffer cache between all
users on the file and so allocate from the global threadsafe buffer
manager, with a very small and transient local cache of active buffers.

The batch manager provides cheap busyness tracking and very
efficient batch construction and kernel submission.

The restrictions over and above the generic submission engine in
intel_bufmgr_gem are:
     - not thread-safe
     - flat relocations, only the batch buffer itself carries
       relocations. Relocations relative to auxiliary buffers
       must be performed via STATE_BASE
     - direct mapping of the batch for writes, expect reads
       from the batch to be slow
     - the batch is a fixed 64k in size
     - access to the batch must be wrapped by brw_batch_begin/_end
     - all relocations must be immediately written into the batch

The importance of the flat relocation tree with local offset handling is
that it allows us to use the "relocation-less" execbuffer interfaces,
dramatically reducing the overhead of batch submission. However, that
can be relaxed to allow buffers other than the batch itself to carry
relocations, if need be.
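
For illustration only (the public begin/end entry points live in
brw_batch.h and are not spelled out in this diff; "target_bo" here stands
for any brw_bo created or imported into this batch), writing a pointer
into the batch together with its relocation looks roughly like:

   /* inside a brw_batch_begin()/brw_batch_end() section */
   uint32_t *out = batch->tail;
   uint64_t address =
      __brw_batch_reloc(batch,
                        (char *)out - (char *)batch->map,
                        target_bo, 0,
                        I915_GEM_DOMAIN_RENDER,
                        I915_GEM_DOMAIN_RENDER);
   *out++ = address; /* pre-gen8: lower 32 bits only */
   batch->tail = out;

The relocation entry and the execobject slot for target_bo are recorded
at the same moment the value is written into the batch, which is what
allows the kernel's NO_RELOC fast path to be used on submission.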

ivb/bdw OglBatch7 improves by ~20% above and beyond my kernel relocation
speedups.

ISSUES:
* shared mipmap trees
  - we instantiate a context-local copy on use, but what are the semantics for
    serializing reads/writes between them - do we need automagic flushing of
    execution on other contexts and common busyness tracking?
  - we retain references to the bo past the lifetime of its parent
    batchmgr as the mipmap_tree is retained past the lifetime of its
    original context, see glx_arb_create_context/default_major_version
  - intercontext locking?

* fences
  intel_syncobject's locking is nonsense. What and how to fix?

* OglMultithread is nevertheless unhappy; but that looks like undefined
  behaviour - i.e. a buggy client concurrently executing the same GL
  context in multiple threads - and the unpatched driver is equally buggy.

* Add full-ppgtt softpinning support (no more relocations, at least for
  the first 256TiB); at the moment there is only a limited
  proof-of-principle demonstration

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
Cc: Kristian Høgsberg <krh at bitplanet.net>
Cc: Kenneth Graunke <kenneth at whitecape.org>
Cc: Jesse Barnes <jbarnes at virtuousgeek.org>
Cc: Ian Romanick <ian.d.romanick at intel.com>
Cc: Abdiel Janulgue <abdiel.janulgue at linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen at intel.com>
Cc: Martin Peres <martin.peres at linux.intel.com>
---
 src/mesa/drivers/dri/i965/Makefile.sources       |    3 +-
 src/mesa/drivers/dri/i965/brw_batch.c            | 2358 ++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_batch.h            |  518 +++--
 src/mesa/drivers/dri/i965/brw_context.c          |   47 +-
 src/mesa/drivers/dri/i965/brw_context.h          |   20 +-
 src/mesa/drivers/dri/i965/brw_pipe_control.c     |   27 +-
 src/mesa/drivers/dri/i965/brw_program.c          |    4 +-
 src/mesa/drivers/dri/i965/brw_program_cache.c    |   58 +-
 src/mesa/drivers/dri/i965/brw_queryobj.c         |   21 +-
 src/mesa/drivers/dri/i965/brw_reset.c            |    6 +-
 src/mesa/drivers/dri/i965/brw_state.h            |    2 +-
 src/mesa/drivers/dri/i965/brw_state_batch.c      |   26 +-
 src/mesa/drivers/dri/i965/brw_state_dump.c       |   16 +-
 src/mesa/drivers/dri/i965/brw_sync.c             |    6 +-
 src/mesa/drivers/dri/i965/brw_urb.c              |    8 +-
 src/mesa/drivers/dri/i965/gen6_queryobj.c        |   31 +-
 src/mesa/drivers/dri/i965/gen7_sol_state.c       |   13 +-
 src/mesa/drivers/dri/i965/genX_blorp_exec.c      |    7 +-
 src/mesa/drivers/dri/i965/intel_batchbuffer.c    |  480 -----
 src/mesa/drivers/dri/i965/intel_batchbuffer.h    |  148 --
 src/mesa/drivers/dri/i965/intel_blit.c           |    2 +-
 src/mesa/drivers/dri/i965/intel_buffer_objects.c |  161 +-
 src/mesa/drivers/dri/i965/intel_buffer_objects.h |    2 -
 src/mesa/drivers/dri/i965/intel_fbo.c            |    5 +-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c    |   76 +-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.h    |    1 -
 src/mesa/drivers/dri/i965/intel_pixel_copy.c     |    2 -
 src/mesa/drivers/dri/i965/intel_pixel_read.c     |   31 +-
 src/mesa/drivers/dri/i965/intel_screen.c         |   10 +-
 src/mesa/drivers/dri/i965/intel_screen.h         |    9 +-
 src/mesa/drivers/dri/i965/intel_tex_image.c      |   42 +-
 src/mesa/drivers/dri/i965/intel_tex_subimage.c   |   35 +-
 src/mesa/drivers/dri/i965/intel_tiled_memcpy.c   |   14 +-
 src/mesa/drivers/dri/i965/intel_tiled_memcpy.h   |    4 +-
 src/mesa/drivers/dri/i965/intel_upload.c         |   12 +-
 35 files changed, 2958 insertions(+), 1247 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/brw_batch.c
 delete mode 100644 src/mesa/drivers/dri/i965/intel_batchbuffer.c
 delete mode 100644 src/mesa/drivers/dri/i965/intel_batchbuffer.h

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 15c30bfedb..bf2b07e236 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -93,6 +93,7 @@ i965_compiler_GENERATED_FILES = \
 	brw_nir_trig_workarounds.c
 
 i965_FILES = \
+	brw_batch.c \
 	brw_batch.h \
 	brw_binding_tables.c \
 	brw_blorp.c \
@@ -216,8 +217,6 @@ i965_FILES = \
 	gen8_wm_depth_stencil.c \
 	hsw_queryobj.c \
 	hsw_sol.c \
-	intel_batchbuffer.c \
-	intel_batchbuffer.h \
 	intel_blit.c \
 	intel_blit.h \
 	intel_buffer_objects.c \
diff --git a/src/mesa/drivers/dri/i965/brw_batch.c b/src/mesa/drivers/dri/i965/brw_batch.c
new file mode 100644
index 0000000000..defa329e53
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_batch.c
@@ -0,0 +1,2358 @@
+/*
+ * Copyright (c) 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Chris Wilson <chris at chris-wilson.co.uk>
+ *
+ */
+#include "brw_batch.h"
+#include "brw_context.h" /* XXX brw_batch_start_hook() et al*/
+#include "brw_defines.h" /* XXX PIPECONTROL */
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <setjmp.h>
+
+#include <intel_bufmgr.h>
+#include <i915_drm.h>
+#include <xf86drm.h>
+#include <errno.h>
+
+#include "intel_screen.h"
+
+/*
+ * When submitting commands to the GPU every cycle of latency counts;
+ * mutexes, spinlocks, even atomics quickly add up to substantial overhead.
+ *
+ * This "batch manager" acts as a thread-local shim over the buffer manager
+ * (drm_intel_bufmgr_gem). As it is only ever used from within a single
+ * context, we can rely on the upper layers to provide thread safety. This
+ * allows us to import buffers from the shared screen (sharing buffers
+ * between multiple contexts, threads and users) and wrap that handle in
+ * our own. Similarly, we want to share the buffer cache between all users
+ * on the file and so allocate from the global threadsafe buffer manager,
+ * with a very small and transient local cache of active buffers.
+ *
+ * The batch manager provides cheap busyness tracking and very
+ * efficient batch construction and kernel submission.
+ *
+ * The restrictions over and above the generic submission engine in
+ * intel_bufmgr_gem are:
+ *    - not thread-safe
+ *    - flat relocations, only the batch buffer itself carries
+ *      relocations. Relocations relative to auxiliary buffers
+ *      must be performed via STATE_BASE
+ *    - direct mapping of the batch for writes, expect reads
+ *      from the batch to be slow
+ *    - the batch is a fixed 64k in size
+ *    - access to the batch must be wrapped by brw_batch_begin|_end
+ *    - all relocations must be immediately written into the batch
+ */
+
+/**
+ * Number of bytes to reserve for commands necessary to complete a batch.
+ *
+ * This includes:
+ * - MI_BATCHBUFFER_END (4 bytes)
+ * - Optional MI_NOOP for ensuring the batch length is qword aligned (4 bytes)
+ * - Any state emitted by vtbl->finish_batch():
+ *   - Gen4-5 record ending occlusion query values (4 * 4 = 16 bytes)
+ *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
+ *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
+ *     - Two sets of PIPE_CONTROLs, which become 4 PIPE_CONTROLs each on SNB,
+ *       which are 5 DWords each ==> 2 * 4 * 5 * 4 = 160 bytes
+ *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
+ *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
+ *       Sandybridge PIPE_CONTROL madness.
+ *   - CC_STATE workaround on HSW (17 * 4 = 68 bytes)
+ *     - 10 dwords for initial mi_flush
+ *     - 2 dwords for CC state setup
+ *     - 5 dwords for the required pipe control at the end
+ *   - Restoring L3 configuration: (24 dwords = 96 bytes)
+ *     - 2*6 dwords for two PIPE_CONTROL flushes.
+ *     - 7 dwords for L3 configuration set-up.
+ *     - 5 dwords for L3 atomic set-up (on HSW).
+ */
+#define BATCH_RESERVED 308
+
+/* Surface offsets are limited to a maximum of 64k from the surface base */
+#define BATCH_SIZE (64 << 10)
+#define CACHELINE_BYTES 64
+#define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(uint32_t))
+
+/* XXX Temporary home until kernel patches land */
+#define I915_PARAM_HAS_EXEC_BATCH_FIRST 45
+#define I915_EXEC_BATCH_FIRST (1<<18)
+
+#if 0
+#include <sys/syscall.h>
+#define __DBG(x) printf x
+#define gettid()  (int)syscall(__NR_gettid)
+#else
+#define __DBG(x)
+#endif
+
+#define DBG_NO_FAST_RELOC 0
+#define DBG_NO_HANDLE_LUT 0
+#define DBG_NO_BATCH_FIRST 1
+#define DBG_NO_SOFTPIN 0
+#define DBG_NO_MMAP_WC 0
+
+#define DBG_PERF_IDLE 0 /* ring mask */
+
+#define READ_SIGNAL 0
+#define WRITE_SIGNAL 1
+#define NO_SIGNAL 2
+
+#define EXEC_FLAGS (~(0xf << 28))
+
+static void list_move(struct list_head *list, struct list_head *head)
+{
+   list_del(list);
+   list_add(list, head);
+}
+
+static void list_movetail(struct list_head *list, struct list_head *head)
+{
+   list_del(list);
+   list_addtail(list, head);
+}
+
+static const unsigned hw_ring[] = {
+   [RENDER_RING] = I915_EXEC_RENDER,
+   [BLT_RING] = I915_EXEC_BLT,
+};
+
+/*
+ * The struct brw_request is central to efficiently tracking GPU activity,
+ * and the busyness of all buffers and fences. It serves as both a read
+ * and a write fence on the buffers (and as the external GL fence). This is
+ * done by associating each relocation (i.e. every use of a buffer by a GPU
+ * within a batch) with the request as a read fence (for a read-only
+ * relocation) or as both the read/write fences (for a writeable relocation).
+ * Then at the end of each batch, and for every fence sequence point, we
+ * insert a command to update a sequence counter in a private mmapped page.
+ * To see if a fence is ready, we can do a quick read of that sequence counter,
+ * bypassing all kernel overhead for querying readiness.
+ *
+ * Then if we ever need to query whether a particular buffer is active,
+ * we can look at the appropriate fence and see whether it has expired.
+ * If the request is still undergoing construction and not been submitted,
+ * we have that information immediately available and can report busyness
+ * without having to search, or submit the batch for the buffer if so desired.
+ *
+ * Periodically (after every new request) we poll for request completion.
+ * This allows us to then maintain the busyness state of all buffers
+ * without having to query every buffer every time, and allows us to keep
+ * the working set small by recycling inactive buffers.
+ *
+ * After certain events (such as mapping or waiting on a buffer), we know that
+ * the buffer is idle and so is the associated fence and all older fences.
+ *
+ * A nice side-effect of tracking requests and buffer busyness is that we
+ * can also track a reasonable measure of how much of the aperture is filled
+ * by active buffers (a resident set size). This is useful for predicting
+ * when the kernel will start evicting our buffers, for example.
+ */
+struct brw_request {
+   struct brw_bo *bo; /* batch for this request */
+   struct brw_request *next;
+   struct list_head fences; /* all buffers/fences registered in this request */
+   uint32_t seqno; /* sequence number of the batch */
+};
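+/*
+ * Once a request has been submitted, the ring it executed on is stashed in
+ * the low bits of rq->bo: RQ_BO() strips the tag and RQ_RING() recovers it.
+ */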
+#define RQ_MARK_RING(rq, ring) ((struct brw_bo *)((uintptr_t)((rq)->bo) | (ring)))
+#define RQ_BO(rq) ((struct brw_bo *)((uintptr_t)(rq)->bo & ~3))
+#define RQ_RING(rq) (((unsigned)(uintptr_t)(rq)->bo & 3))
+
+#define FENCE_MARK_SIGNAL(rq, s) (struct brw_request *)((uintptr_t)(rq) | (s))
+
+/*
+ * Retire this and all older requests.
+ */
+static void __brw_request_retire(struct brw_request * const rq)
+{
+   const int ring = RQ_RING(rq);
+   struct brw_batch * const batch = RQ_BO(rq)->batch;
+   struct brw_request * const tail = rq->next;
+   struct brw_request *tmp;
+
+   if (DBG_PERF_IDLE & (1 << RQ_RING(rq)) && rq->next == NULL)
+      batch->idle_time[RQ_RING(rq)] = -get_time();
+
+   tmp = batch->requests[ring].lru;
+   do {
+      assert(RQ_BO(tmp)->exec == NULL);
+      assert(RQ_RING(tmp) == ring);
+
+      list_for_each_entry_safe(struct __brw_fence, fence, &tmp->fences, link) {
+         int signal = brw_fence_get_signal(fence);
+
+         assert(brw_fence_get_request(fence) == tmp);
+         list_inithead(&fence->link);
+         fence->rq = NULL;
+
+         if (signal == READ_SIGNAL) {
+            struct brw_bo *bo = NULL;
+
+            bo = container_of(fence, bo, read[ring]);
+
+            assert(bo->active & 1 << ring);
+            bo->active &= ~(1 << ring);
+            if (!bo->active) {
+               assert(bo->exec == NULL);
+               assert(brw_fence_get_request(&bo->write) == NULL);
+               assert(batch->rss >= bo->size);
+               batch->rss -= bo->size;
+
+               bo->pinned = false;
+               if (likely(bo->reusable))
+                  list_move(&bo->link, &batch->inactive);
+
+               if (unlikely(!bo->refcnt))
+                  __brw_bo_free(bo);
+            }
+         }
+      }
+      assert(brw_fence_get_request(&RQ_BO(tmp)->write) == NULL);
+      assert(brw_fence_get_request(&RQ_BO(tmp)->read[ring]) == NULL);
+      assert(!RQ_BO(tmp)->active);
+
+      if (tmp == batch->throttle[0])
+         batch->throttle[0] = NULL;
+      else if (tmp == batch->throttle[1])
+         batch->throttle[1] = NULL;
+
+      tmp->bo = RQ_BO(tmp); /* strip off the ring id */
+      tmp = tmp->next;
+   } while (tmp != tail);
+
+   rq->next = batch->freed_rq;
+   batch->freed_rq = batch->requests[ring].lru;
+
+   batch->requests[ring].lru = tmp;
+   if (tmp == NULL)
+      batch->requests[ring].mru = NULL;
+}
+
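+/*
+ * Seqno comparisons must be safe against wraparound of the 32-bit counter:
+ * a request is busy while the breadcrumb (b) read from the seqno page has
+ * not yet caught up with the request's own seqno (a), which the signed
+ * difference below expresses correctly across the wrap.
+ */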
+static inline bool seqno_busy(uint32_t a, uint32_t b)
+{
+   return (int32_t)(b - a) < 0;
+}
+
+static inline bool request_busy(const struct brw_request *rq, uint32_t seq)
+{
+   return seqno_busy(rq->seqno, seq);
+}
+
+static inline uint32_t hws_seqno(const struct brw_batch *batch,
+                                 unsigned int ring)
+{
+   return batch->seqno_map[CACHELINE_DWORDS*ring];
+}
+
+/*
+ * Is the request busy? First we can see whether this request has already
+ * been retired (idle), or whether it is still under construction (busy,
+ * though we may first flush it to the kernel if asked). Failing that, we
+ * compare the request's sequence number against the last breadcrumb the GPU
+ * wrote to the seqno page: once the breadcrumb has passed it, this request
+ * and all older requests are idle.
+ */
+bool __brw_request_busy(struct brw_request *rq,
+                        unsigned flags,
+                        struct perf_debug *perf)
+{
+   struct brw_batch *batch;
+
+   if (rq == NULL)
+      return false;
+
+   batch = RQ_BO(rq)->batch;
+
+   if (rq->seqno == 0) {
+      if (flags & BUSY_FLUSH && brw_batch_flush(batch, perf))
+         return false;
+      return true;
+   }
+
+   return request_busy(rq, hws_seqno(batch, RQ_RING(rq)));
+}
+
+/*
+ * Update the cache domain tracked by the kernel. This can have a number
+ * of side-effects but is essential in order to maintain coherency and
+ * serialisation between the GPU and CPU. If there is conflicting GPU access
+ * then set-domain will wait until the GPU has finished accessing the buffer
+ * before proceeding to change the domain. If the buffer is not cache coherent
+ * and we request CPU access, the kernel will clflush that buffer to make it
+ * coherent with the CPU access. Both of these imply delays and overhead, so
+ * we do our best to avoid moving buffers to the GTT/CPU domains. However,
+ * if we do, we know the buffer and its request are idle so we can update
+ * our request tracking after a blocking call.
+ */
+static void __brw_bo_set_domain(struct brw_bo *bo, unsigned domain, bool write)
+{
+   struct drm_i915_gem_set_domain set;
+
+   __DBG(("%d: %s: handle=%d, domain %d -> %d, write? %d\n", gettid(), __func__,
+          bo->handle, bo->domain, domain, write));
+
+   if (bo->domain == domain)
+      return;
+
+   if (bo->exec || !bo->batch)
+      return;
+
+   memset(&set, 0, sizeof(set));
+   set.handle = bo->handle;
+   set.read_domains =
+      domain == DOMAIN_CPU ? I915_GEM_DOMAIN_CPU : I915_GEM_DOMAIN_GTT;
+   if (write)
+      set.write_domain = set.read_domains;
+
+   if (unlikely(drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set)))
+      return;
+
+   bo->domain = write ? domain : DOMAIN_NONE;
+   assert(bo->refcnt);
+}
+
+/*
+ * Wait for the request to become completely idle, i.e. not being accessed by
+ * the GPU at all (neither for outstanding reads or writes).
+ * This is equivalent to setting the buffer write domain to GTT, but the
+ * wait ioctl avoids the set-domain side-effects (e.g. clflushing in
+ * some circumstances).
+ */
+static int __brw_request_wait(struct brw_request *rq,
+                              int64_t timeout,
+                              struct perf_debug *perf)
+{
+   struct drm_i915_gem_wait wait;
+   struct brw_bo *bo;
+   int err;
+
+   if (!__brw_request_busy(rq, BUSY_FLUSH, perf))
+      return 0;
+
+   bo = RQ_BO(rq);
+   assert(bo->refcnt);
+   assert(bo->batch);
+   assert(bo->exec == NULL);
+   assert(brw_fence_get_request(&bo->read[RQ_RING(rq)]) == rq);
+
+   memset(&wait, 0, sizeof(wait));
+   wait.bo_handle = bo->handle;
+   wait.timeout_ns = timeout;
+
+   if (unlikely(perf))
+      perf->elapsed = -get_time();
+
+   err = 0;
+   if (unlikely(drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_WAIT, &wait))) {
+      err = -errno;
+      if (timeout < 0) {
+         __brw_bo_set_domain(bo, DOMAIN_GTT, true);
+         err = 0;
+      }
+   }
+
+   /* And sanity check the kernel and our breadcrumbs */
+   assert(err || !__brw_request_busy(rq, 0, NULL));
+
+   if (unlikely(perf)) {
+      perf->elapsed += get_time();
+      if (perf->elapsed > 1e-5) /* 0.01ms */
+         brw_batch_report_stall_hook(bo->batch, perf);
+   }
+
+   return err;
+}
+
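+/* Cheap multiplicative hash used to index the borrowed-handle buckets. */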
+static inline uint32_t hash_32(uint32_t hash, unsigned bits)
+{
+   return (hash * 0x9e370001) >> (32 - bits);
+}
+
+static inline struct list_head *borrowed(struct brw_batch *batch, uint32_t handle)
+{
+   return &batch->borrowed[hash_32(handle, BORROWED_BITS)];
+}
+
+/*
+ * We have context-local bo, but these may be shared between contexts via
+ * shared mipmaps and other buffers. If we find we are dealing with a bo
+ * belonging to another batch, we need to translate that into a local bo
+ * for associating with our fences.
+ */
+static struct brw_bo *__brw_batch_lookup_handle(struct brw_batch *batch,
+                                                uint32_t handle)
+{
+   /* XXX may need a resizable ht? */
+   struct list_head *hlist = borrowed(batch, handle);
+
+   list_for_each_entry(struct brw_bo, bo, hlist, link)
+      if (bo->handle == handle)
+         return bo;
+
+   return NULL;
+}
+
+static inline bool has_lut(struct brw_batch *batch)
+{
+   return batch->base_flags & I915_EXEC_HANDLE_LUT;
+}
+
+/*
+ * Reset the batch manager for the start of a new batch.
+ */
+static void __brw_batch_reset(struct brw_batch *batch)
+{
+   struct brw_request *rq;
+
+   memset(&batch->emit, 0, sizeof(batch->emit));
+
+   rq = batch->next_request;
+   assert(rq->bo->target_handle == -1);
+   if (batch->base_flags & I915_EXEC_BATCH_FIRST) {
+      rq->bo->target_handle = has_lut(batch) ? 0 : rq->bo->handle;
+      rq->bo->exec =
+         memset(&batch->exec[batch->emit.nexec++], 0, sizeof(*rq->bo->exec));
+   } else
+      rq->bo->exec = (void *)1;
+   rq->seqno = 0;
+
+   batch->tail = batch->map;
+   batch->reserved = BATCH_RESERVED / 4 + 8;
+   batch->state = BATCH_SIZE / 4;
+   batch->aperture = 0;
+   batch->flags = batch->base_flags;
+
+   assert(!rq->bo->active);
+   list_inithead(&rq->fences);
+}
+
+/*
+ * Prepare the batch manager for constructing a new batch/request.
+ *
+ * Reset all the accounting we do per-batch, and allocate ourselves a new
+ * batch bo.
+ */
+static int __brw_batch_next(struct brw_batch *batch)
+{
+   struct brw_request *rq;
+
+retry:
+   rq = batch->freed_rq;
+   if (unlikely(rq == NULL)) {
+      rq = malloc(sizeof(*rq));
+      if (unlikely(rq == NULL))
+         goto oom;
+
+      rq->bo = brw_bo_create(batch, "batch", BATCH_SIZE, 0, 0);
+      if (unlikely(rq->bo == NULL)) {
+         free(rq);
+         goto oom;
+      }
+      rq->bo->target_handle = -1;
+
+      /* We are inheriting a foreign buffer, so call set-domain */
+      brw_bo_map(rq->bo, MAP_WRITE, NULL);
+   } else
+      batch->freed_rq = rq->next;
+   rq->next = NULL;
+
+   __DBG(("%d:%s: new batch handle=%d\n", gettid(), __func__, rq->bo->handle));
+   assert(RQ_BO(rq) == rq->bo);
+   batch->map = brw_bo_map(rq->bo, MAP_WRITE | MAP_ASYNC, NULL);
+   if (unlikely(batch->map == NULL)) {
+      brw_bo_put(rq->bo);
+      free(rq);
+
+oom:
+      /* force the synchronization to recover some memory */
+      rq = batch->requests[batch->ring].mru;
+      if (rq == NULL) {
+         batch->next_request = NULL;
+         return -ENOMEM;
+      }
+
+      if (__brw_request_wait(rq, -1, NULL) == 0)
+         __brw_request_retire(rq);
+      goto retry;
+   }
+
+   assert(rq->bo->target_handle == -1);
+   batch->next_request = rq;
+   batch->bo = rq->bo;
+
+   __brw_batch_reset(batch);
+   return 0;
+}
+
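+/* Read an integer driver parameter via DRM_IOCTL_I915_GETPARAM; returns -1
+ * on error (e.g. if the parameter is unknown to this kernel).
+ */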
+static int gem_param(int fd, int name)
+{
+   drm_i915_getparam_t gp;
+   int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
+
+   memset(&gp, 0, sizeof(gp));
+   gp.param = name;
+   gp.value = &v;
+   if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+      return -1;
+
+   return v;
+}
+
+static bool test_has_fast_reloc(int fd)
+{
+   if (DBG_NO_FAST_RELOC)
+      return DBG_NO_FAST_RELOC < 0;
+
+   return gem_param(fd, I915_PARAM_HAS_EXEC_NO_RELOC) > 0;
+}
+
+static bool test_has_handle_lut(int fd)
+{
+   if (DBG_NO_HANDLE_LUT)
+      return DBG_NO_HANDLE_LUT < 0;
+
+   return gem_param(fd, I915_PARAM_HAS_EXEC_HANDLE_LUT) > 0;
+}
+
+static bool test_has_batch_first(int fd)
+{
+   if (DBG_NO_BATCH_FIRST)
+      return DBG_NO_BATCH_FIRST < 0;
+
+   return gem_param(fd, I915_PARAM_HAS_EXEC_BATCH_FIRST) > 0;
+}
+
+static bool test_has_mmap_wc(int fd)
+{
+   if (DBG_NO_MMAP_WC)
+      return DBG_NO_MMAP_WC < 0;
+
+   return gem_param(fd, I915_PARAM_MMAP_VERSION) > 0;
+}
+
+static bool test_has_softpin(int fd)
+{
+   struct drm_i915_gem_execbuffer2 execbuf;
+   struct drm_i915_gem_exec_object2 obj;
+
+   if (DBG_NO_SOFTPIN)
+      return DBG_NO_SOFTPIN < 0;
+
+   if (gem_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN) < 1)
+      return false;
+
+   /* We construct a bad execbuf that we expect to fail reporting that
+    * it cannot lookup the context id. (We run before any user contexts
+    * are created, but we ask for an impossible context id, just in case.)
+    * On old kernels, this will fail early on in validating the ring for
+    * use with a logical context. More recent kernels have lifted that
+    * restriction and so we will proceed to failing to validate the context id
+    * itself.
+    *
+    * An invalid ring selection would generate EINVAL; an invalid context
+    * generates ENOENT. As this execbuf is guaranteed to generate an error
+    * it is reasonably quick.
+    */
+   memset(&obj, 0, sizeof(obj));
+   memset(&execbuf, 0, sizeof(execbuf));
+   execbuf.buffers_ptr = (uintptr_t)&obj;
+   execbuf.buffer_count = 1;
+   execbuf.flags = I915_EXEC_BLT;
+   execbuf.rsvd1 = 0xffffffu;
+   drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
+   if (errno != ENOENT)
+      return false;
+
+   return true;
+}
+
+static uint64_t __get_max_aperture(int fd, int gen)
+{
+   struct drm_i915_gem_get_aperture aperture;
+   struct drm_i915_gem_context_param p;
+   int has_aliasing_ppgtt;
+
+   memset(&p, 0, sizeof(p));
+   /* Assumes all ppgtt are equivalent */
+   p.param = I915_CONTEXT_PARAM_GTT_SIZE;
+   if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p) == 0)
+      return p.value;
+
+   has_aliasing_ppgtt = gem_param(fd, I915_PARAM_HAS_ALIASING_PPGTT);
+
+   if (has_aliasing_ppgtt > 2)
+      return (uint64_t)1 << 48;
+
+   if (has_aliasing_ppgtt > 1)
+      return (uint64_t)1 << (gen > 7 ? 32 : 31);
+
+   memset(&aperture, 0, sizeof(aperture));
+   if (unlikely(drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture)))
+      return 512 << 20; /* Minimum found on gen4+ */
+
+   return aperture.aper_size;
+}
+
+static uint64_t get_max_aperture(int fd, int gen)
+{
+   static uint64_t max_aperture;
+
+   if (max_aperture == 0)
+      max_aperture = __get_max_aperture(fd, gen);
+
+   return max_aperture;
+}
+
+/*
+ * Initialise the batch-manager for the GL context.
+ *
+ * We use the devinfo and settings found in intel_screen to set ourselves up
+ * for the hardware environment, and supplement that with our own feature
+ * tests. (These too should probably move to intel_screen and be shared between
+ * all contexts.)
+ */
+int brw_batch_init(struct brw_batch *batch,
+                   struct intel_screen *screen)
+{
+   const struct gen_device_info *devinfo = &screen->devinfo;
+   struct drm_i915_gem_context_create create;
+   int err;
+
+   batch->fd = intel_screen_to_fd(screen);
+   batch->bufmgr = screen->bufmgr;
+   batch->screen = screen;
+
+   batch->no_hw = screen->no_hw;
+
+   batch->gen = devinfo->gen;
+   batch->needs_pipecontrol_ggtt_wa = devinfo->gen == 6;
+   batch->reloc_size = 512;
+   batch->exec_size = 256;
+   batch->reloc = malloc(sizeof(batch->reloc[0])*batch->reloc_size);
+   batch->exec = malloc(sizeof(batch->exec[0])*batch->exec_size);
+   if (unlikely(batch->reloc == NULL || batch->exec == NULL)) {
+      err = -ENOMEM;
+      goto err;
+   }
+
+   for (unsigned n = 0; n < 1 << BORROWED_BITS; n++)
+      list_inithead(&batch->borrowed[n]);
+   list_inithead(&batch->active);
+   list_inithead(&batch->inactive);
+
+   batch->actual_ring[RENDER_RING] = RENDER_RING;
+   batch->actual_ring[BLT_RING] = BLT_RING;
+   if (devinfo->gen < 6)
+      batch->actual_ring[BLT_RING] = RENDER_RING;
+
+   batch->has_llc = devinfo->has_llc;
+   batch->has_mmap_wc = test_has_mmap_wc(batch->fd);
+   batch->has_softpin = test_has_softpin(batch->fd);
+   batch->max_aperture = 3*get_max_aperture(batch->fd, batch->gen)/4;
+
+   /* The requirement for using I915_EXEC_NO_RELOC is:
+    *
+    *      The addresses written in the objects must match the corresponding
+    *      reloc.presumed_offset which in turn must match the corresponding
+    *      execobject.offset.
+    *
+    *      Any render targets written to in the batch must be flagged with
+    *      EXEC_OBJECT_WRITE.
+    *
+    *      To avoid stalling, execobject.offset should match the current
+    *      address of that object within the active context.
+    *
+    * So if we successfully maintain the relocation values at the point of
+    * construction, we can tell the kernel that it can skip performing
+    * relocations.
+    */
+   if (test_has_fast_reloc(batch->fd))
+      batch->base_flags |= I915_EXEC_NO_RELOC;
+   if (test_has_handle_lut(batch->fd))
+      batch->base_flags |= I915_EXEC_HANDLE_LUT;
+   if (test_has_batch_first(batch->fd))
+      batch->base_flags |= I915_EXEC_BATCH_FIRST;
+
+   batch->seqno_bo = brw_bo_create(batch, "seqno", 4096, 0, 0);
+   if (batch->seqno_bo == NULL) {
+      err = -ENOMEM;
+      goto err;
+   }
+   brw_bo_enable_snoop(batch->seqno_bo);
+   batch->seqno_map =
+      memset(brw_bo_map(batch->seqno_bo, MAP_WRITE | MAP_PERSISTENT, NULL),
+             0, CACHELINE_BYTES*__BRW_NUM_RINGS);
+
+   /* Create a new hardware context.  Using a hardware context means that
+    * our GPU state will be saved/restored on context switch, allowing us
+    * to assume that the GPU is in the same state we left it in.
+    *
+    * This is required for transform feedback buffer offsets, query objects,
+    * and also allows us to reduce how much state we have to emit.
+    */
+   memset(&create, 0, sizeof(create));
+   drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
+   batch->hw_ctx = create.ctx_id;
+   if (!batch->hw_ctx) {
+      if (devinfo->gen >= 6) {
+         err = -errno;
+         fprintf(stderr, "Gen6+ requires Kernel 3.6 or later.\n");
+         goto err;
+      }
+   }
+
+   err = __brw_batch_next(batch);
+   if (err)
+      goto err;
+
+   return 0;
+
+err:
+   brw_bo_put(batch->seqno_bo);
+   free(batch->reloc);
+   free(batch->exec);
+   return err;
+}
+
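+/*
+ * Double the execobject array. Buffers already added to the current request
+ * keep pointers into this array (bo->exec), so if realloc moves it we must
+ * walk the request's fences and repoint each bo at its new slot.
+ */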
+static void __brw_batch_grow_exec(struct brw_batch *batch)
+{
+   struct drm_i915_gem_exec_object2 *new_exec;
+   uint16_t new_size;
+
+   new_size = batch->exec_size * 2;
+   new_exec = NULL;
+   if (likely(new_size > batch->exec_size))
+      new_exec = realloc(batch->exec, new_size*sizeof(new_exec[0]));
+   if (unlikely(new_exec == NULL))
+      longjmp(batch->jmpbuf, -ENOMEM);
+
+   if (new_exec != batch->exec) {
+      struct list_head * const list = &batch->next_request->fences;
+
+      list_for_each_entry_rev(struct __brw_fence, fence, list, link) {
+         int signal = brw_fence_get_signal(fence);
+         struct brw_bo *bo = NULL;
+
+         if (unlikely(signal >= WRITE_SIGNAL)) {
+            if (signal == WRITE_SIGNAL)
+               break;
+            else
+               continue;
+         }
+
+         bo = container_of(fence, bo, read[batch->ring]);
+         bo->exec = new_exec + (bo->exec - batch->exec);
+      }
+
+      batch->exec = new_exec;
+   }
+
+   batch->exec_size = new_size;
+}
+
+static void __brw_batch_grow_reloc(struct brw_batch *batch)
+{
+   struct drm_i915_gem_relocation_entry *new_reloc;
+   uint16_t new_size;
+
+   new_size = batch->reloc_size * 2;
+   new_reloc = NULL;
+   if (likely(new_size > batch->reloc_size))
+      new_reloc = realloc(batch->reloc, new_size*sizeof(new_reloc[0]));
+   if (unlikely(new_reloc == NULL))
+      longjmp(batch->jmpbuf, -ENOMEM);
+
+   batch->reloc = new_reloc;
+   batch->reloc_size = new_size;
+}
+
+/*
+ * Add a relocation entry for the target buffer into the current batch.
+ *
+ * This is the heart of performing fast relocations, both here and in
+ * the corresponding kernel relocation routines.
+ *
+ * - Instead of passing in handles that the kernel must convert back into
+ *   buffers for every relocation, we tell the kernel which execobject
+ *   slot corresponds to each relocation. The kernel can then use a simple
+ *   LUT, built as it first looks up each buffer for the batch, rather than
+ *   searching a small, overfull hashtable. As both the number of
+ *   relocations and buffers in a batch grow, the simple LUT is much more
+ *   efficient (though the LUT itself is less cache friendly).
+ *   However, as the batch buffer is by definition the last object in
+ *   the execbuffer array we have to perform a pass to relabel the
+ *   target of all relocations pointing to the batch. (Except when
+ *   the kernel supports batch-first, in which case we can do the relocation
+ *   target processing for the batch inline.)
+ *
+ * - If the kernel has not moved the buffer, it will still be in the same
+ *   location as last time we used it. If we tell the kernel that all the
+ *   relocation entries are the same as the offset for the buffer, then
+ *   the kernel need only check that all the buffers are still in the same
+ *   location and then skip performing relocations entirely. A huge win.
+ *
+ * - As a consequence of telling the kernel to skip processing the relocations,
+ *   we need to tell the kernel about the read/write domains and special needs
+ *   of the buffers.
+ *
+ * - Alternatively, we can request the kernel place the buffer exactly
+ *   where we want it and forgo all relocations to that buffer entirely.
+ *   The buffer is effectively pinned for its lifetime (if the kernel
+ *   does have to move it, for example to swap it out to recover memory,
+ *   the kernel will return it back to our requested location at the start
+ *   of the next batch.) This of course imposes a lot of constraints on where
+ *   we can say the buffers are: they must meet all the alignment constraints
+ *   and not overlap.
+ *
+ * - Essential to all these techniques is that we always use the same
+ *   presumed_offset for the relocations as for submitting the execobject.
+ *   That value must be written into the batch and it must match the value
+ *   we tell the kernel. (This breaks down when using relocation trees shared
+ *   between multiple contexts, hence the need for context-local batch
+ *   management.)
+ *
+ * In contrast to libdrm, we can build the execbuffer array along with
+ * the batch by forgoing the ability to handle general relocation trees.
+ * This avoids having multiple passes to build the execbuffer parameter,
+ * and also gives us a means to cheaply track when a buffer has been
+ * referenced by the batch.
+ */
+uint64_t __brw_batch_reloc(struct brw_batch *batch,
+                           uint32_t batch_offset,
+                           struct brw_bo *target_bo,
+                           uint64_t target_offset,
+                           unsigned read_domains,
+                           unsigned write_domain)
+{
+   assert(batch->inside_begin_count);
+   assert(target_bo->refcnt);
+
+   /* Relocation addresses for a buffer are not shared between contexts (a
+    * buffer may be in different locations in different contexts), so we
+    * have to remember the relocation / offset of this buffer in this context,
+    * and not accidentally share it.
+    */
+   if (unlikely(target_bo->batch != batch)) {
+      /* XXX legal sharing between contexts/threads? */
+      target_bo = brw_bo_import(batch, target_bo->base, true);
+      if (unlikely(target_bo == NULL))
+         longjmp(batch->jmpbuf, -ENOMEM);
+      target_bo->refcnt--; /* kept alive by the implicit active reference */
+   }
+   assert(target_bo->batch == batch);
+
+   /* The first time a buffer is used within this batch, we add it both
+    * to the execobject array and start tracking it as active within
+    * this batch. It will then remain active until this batch completes,
+    * or until a later batch if reused afterwards.
+    */
+   if (target_bo->exec == NULL) {
+      int n;
+
+      __DBG(("%d:%s: new exec handle=%d\n", gettid(), __func__,
+             target_bo->handle));
+
+      /* If we do not have full-ppgtt, we can not trust that the kernel
+       * does not reallocate our address and pin it to hardware (i.e.
+       * scanout) - trying to use that address for ourselves results in failure
+       * (ENOSPC). Even with full-ppgtt, the kernel may reuse an address
+       * if the buffer has been evicted to swap - and so we may have two
+       * overlapping objects in the same execbuf. As such we can only trust
+       * that we keep our address whilst the object is busy.
+       */
+      if (target_bo->pinned) {
+         bool busy = false;
+         assert(target_bo->active);
+         for (n = 0; n < __BRW_NUM_RINGS; n++) {
+            struct brw_request *rq = brw_fence_get_request(&target_bo->read[n]);
+            if (rq && request_busy(rq, hws_seqno(batch, n))) {
+               busy = true;
+               break;
+            }
+         }
+         if (!busy)
+            target_bo->pinned = false;
+      }
+
+      /* reserve one exec entry for the batch */
+      if (unlikely(batch->emit.nexec + 1 == batch->exec_size))
+         __brw_batch_grow_exec(batch);
+
+      n = batch->emit.nexec++;
+      target_bo->target_handle = has_lut(batch) ? n : target_bo->handle;
+      target_bo->exec = memset(batch->exec + n, 0, sizeof(*target_bo->exec));
+      target_bo->exec->handle = target_bo->handle;
+      target_bo->exec->alignment = target_bo->alignment;
+      target_bo->exec->offset = target_bo->offset;
+      if (batch->has_softpin && 0)
+         target_bo->exec->flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+      if (target_bo->pinned)
+         target_bo->exec->flags |= EXEC_OBJECT_PINNED;
+
+      /* Track the total amount of memory in use by all active requests */
+      if (!target_bo->active) {
+         batch->rss += target_bo->size;
+         if (batch->rss > batch->peak_rss)
+            batch->peak_rss = batch->rss;
+      }
+      target_bo->active |= 1 << batch->ring;
+      target_bo->read[batch->ring].rq =
+         FENCE_MARK_SIGNAL(batch->next_request, READ_SIGNAL);
+      list_movetail(&target_bo->read[batch->ring].link,
+                    &batch->next_request->fences);
+
+      batch->aperture += target_bo->size;
+   }
+
+   /* If we are treating the buffer as pinned, we will tell the kernel it must
+    * put the buffer at our specified address on execution. (The kernel will
+    * stall to make it so if it has to move other buffers out of the way, so
+    * we should be careful not to let that happen. To avoid stalling, we only
+    * consider a buffer that is still active as a valid opportunity for pinning.
+    * The kernel cannot move an active buffer, so we know that whilst it is
+    * active, its address has not been reused.) If the buffer is pinned, we
+    * can forgo tracking the associated relocation entries saving both time for
+    * us, and for the kernel as it will not have to do as much relocation
+    * processing.
+    */
+   if (!target_bo->pinned) {
+      int n;
+
+      if (unlikely(batch->emit.nreloc == batch->reloc_size))
+         __brw_batch_grow_reloc(batch);
+
+      n = batch->emit.nreloc++;
+      batch->reloc[n].offset = batch_offset;
+      batch->reloc[n].delta = target_offset;
+      batch->reloc[n].target_handle = target_bo->target_handle;
+      batch->reloc[n].presumed_offset = target_bo->offset;
+      batch->reloc[n].read_domains = read_domains;
+      batch->reloc[n].write_domain = write_domain;
+
+      /* If we haven't added the batch to the execobject array yet, we
+       * will have to process all the relocations pointing to the
+       * batch when finalizing the request for submission.
+       */
+      if (target_bo->target_handle == -1) {
+         int m = batch->emit.nself++;
+         if (m < 256)
+            batch->self_reloc[m] = n;
+      }
+   }
+
+   /* Maintain the dirty status - for both ourselves (checking for required
+    * flushes within the batch) and for informing the kernel about the required
+    * write hazards. We also track the write hazard for ourselves so that we
+    * can optimise reading the buffer on the CPU whilst it is only being
+    * read by the GPU.
+    */
+   if (write_domain && !target_bo->dirty) {
+      __DBG(("%d:%s: dirty handle=%d\n", gettid(), __func__,
+             target_bo->handle));
+      assert(target_bo != batch->bo);
+      assert(brw_fence_get_request(&target_bo->read[batch->ring]) == batch->next_request);
+
+      target_bo->write.rq =
+         FENCE_MARK_SIGNAL(batch->next_request, WRITE_SIGNAL);
+      list_move(&target_bo->write.link, &batch->next_request->fences);
+
+      target_bo->dirty = true;
+      if (has_lut(batch)) {
+         target_bo->exec->flags |= EXEC_OBJECT_WRITE;
+         if (write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
+             batch->needs_pipecontrol_ggtt_wa)
+            target_bo->exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+      }
+   }
+
+   return __brw_reloc_address(target_bo, target_offset);
+}
+
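+/*
+ * Emit the command to write the next sequence number for this ring into its
+ * own cacheline of the seqno page (a PIPE_CONTROL post-sync write on the
+ * render ring, an MI_FLUSH_DW immediate write on the blitter). Polling that
+ * page later tells us when this request, and everything before it, has
+ * completed without having to call into the kernel.
+ */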
+static uint32_t __brw_batch_emit_seqno(struct brw_batch *batch,
+                                       unsigned flags)
+{
+   uint32_t seqno = ++batch->seqno;
+   uint32_t *out = batch->tail;
+   const int gen = batch->gen;
+   unsigned offset = CACHELINE_BYTES*batch->ring;
+   uint64_t address;
+   bool pad = false;
+
+   if (batch->ring == RENDER_RING) {
+      if (gen >= 8) {
+         *out++ = _3DSTATE_PIPE_CONTROL | (6 - 2);
+         *out++ = PIPE_CONTROL_WRITE_IMMEDIATE | flags;
+         pad = true;
+      } else if (gen >= 6) {
+         /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
+          * on later platforms.  We always use PPGTT on Gen7+.
+          */
+         *out++ = _3DSTATE_PIPE_CONTROL | (4 - 2);
+         *out++ = PIPE_CONTROL_WRITE_IMMEDIATE | flags;
+      } else {
+         *out++ = (_3DSTATE_PIPE_CONTROL |
+                   PIPE_CONTROL_WRITE_IMMEDIATE |
+                   (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) |
+                   (4 - 2));
+         offset |= PIPE_CONTROL_GLOBAL_GTT_WRITE;
+         pad = true;
+      }
+   } else {
+      if (gen >= 8)
+         *out++ = 0x26<<23 | 1<<14 | (4 - 2);
+      else
+         *out++ = 0x26<<23 | 1<<14 | (3 - 2);
+   }
+   /* Note we lie to the kernel about the write flag for the seqno bo: we
+    * want concurrent writes from both engines, which land on separate
+    * cachelines. Otherwise the kernel would insert semaphores between every
+    * pair of batches and serialise their execution.
+    */
+   address =
+      __brw_batch_reloc(batch,
+                        (char *)out - (char *)batch->map,
+                        batch->seqno_bo, offset,
+                        I915_GEM_DOMAIN_INSTRUCTION, 0);
+   if (batch->needs_pipecontrol_ggtt_wa)
+      batch->seqno_bo->exec->flags |= EXEC_OBJECT_NEEDS_GTT;
+   if (gen >= 8) {
+      *(uint64_t *)out = address;
+      out += 2;
+   } else
+      *out++ = address;
+   *out++ = seqno;
+   if (pad)
+      *out++ = 0;
+   batch->tail = out;
+
+   return seqno;
+}
+
+/*
+ * Close the batch by writing all the tail commands (to store register
+ * values between batches, disable profiling, etc). And then to end it all
+ * we set MI_BATCH_BUFFER_END.
+ */
+static uint32_t __brw_batch_finish(struct brw_batch *batch,
+                                   struct perf_debug *info)
+{
+   assert(!batch->next_request->seqno);
+   batch->reserved = 0;
+
+   batch->inside_begin_count++;
+   brw_batch_finish_hook(batch);
+   batch->next_request->seqno =
+      __brw_batch_emit_seqno(batch,
+                             PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                             PIPE_CONTROL_DEPTH_CACHE_FLUSH);
+   batch->emit.nbatch = batch->tail - batch->map;
+   batch->inside_begin_count--;
+   assert(!batch->inside_begin_count);
+   assert(batch->emit.nbatch <= batch->state);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
+      int bytes_for_commands = 4*batch->emit.nbatch;
+      int bytes_for_state = batch->bo->size - 4*batch->state;
+      int total_bytes = bytes_for_commands + bytes_for_state;
+      fprintf(stderr, "%d: Batchbuffer flush at %s:%d (%s) on ring %d with %4db (pkt) + "
+              "%4db (state) = %4db (%0.1f%%), with %d buffers and %d relocations [%d self], RSS %d KiB (cap %dKiB)\n",
+              batch->hw_ctx,
+              info ? info->file : "???",
+              info ? info->line : -1,
+              info ? info->string : "???",
+              batch->ring, bytes_for_commands, bytes_for_state,
+              total_bytes, 100.0f * total_bytes / BATCH_SIZE,
+              batch->emit.nexec, batch->emit.nreloc, batch->emit.nself,
+              (int)(batch->aperture>>10), (int)(batch->max_aperture>>10));
+   }
+
+   batch->map[batch->emit.nbatch] = 0xa << 23;
+   return 4*((batch->emit.nbatch + 2) & ~1);
+}
+
+static void
+__brw_batch_dump(struct brw_batch *batch)
+{
+   struct drm_intel_decode *decode;
+
+   decode = drm_intel_decode_context_alloc(batch->screen->deviceID);
+   if (unlikely(decode == NULL))
+      return;
+
+   drm_intel_decode_set_batch_pointer(decode,
+                                      batch->map, batch->bo->offset,
+                                      batch->emit.nbatch + 1);
+
+   drm_intel_decode_set_output_file(decode, stderr);
+   drm_intel_decode(decode);
+
+   drm_intel_decode_context_free(decode);
+
+   brw_debug_batch(batch);
+}
+
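+/*
+ * Apply frame throttling: limit how far ahead of the GPU we queue frames for
+ * swapbuffers, honour the flush throttle, and under DEBUG_SYNC wait for the
+ * batch we just submitted.
+ */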
+static void
+__brw_batch_throttle(struct brw_batch *batch, struct brw_request *rq)
+{
+   if (unlikely(batch->disable_throttling))
+      return;
+
+   /* Wait for the swapbuffers before the one we just emitted, so we
+    * don't get too many swaps outstanding for apps that are GPU-heavy
+    * but not CPU-heavy.
+    *
+    * We're using intelDRI2Flush (called from the loader before
+    * swapbuffer) and glFlush (for front buffer rendering) as the
+    * indicator that a frame is done and then throttle when we get
+    * here as we prepare to render the next frame.  At this point the
+    * round trips for swap/copy and getting new buffers are done and
+    * we'll spend less time waiting on the GPU.
+    *
+    * Unfortunately, we don't have a handle to the batch containing
+    * the swap, and getting our hands on that doesn't seem worth it,
+    * so we just use the first batch we emitted after the last swap.
+    */
+   if (batch->need_swap_throttle) {
+      if (batch->throttle[0])
+         __brw_request_wait(batch->throttle[0], -1, NULL);
+      batch->throttle[0] = batch->throttle[1];
+      batch->throttle[1] = rq;
+      batch->need_flush_throttle = false;
+      batch->need_swap_throttle = false;
+   }
+
+   if (batch->need_flush_throttle) {
+      drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_THROTTLE, NULL);
+      batch->need_flush_throttle = false;
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
+      struct drm_i915_gem_wait wait;
+
+      memset(&wait, 0, sizeof(wait));
+      wait.bo_handle = RQ_BO(rq)->handle;
+      wait.timeout_ns = -1;
+      drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
+   }
+}
+
+/*
+ * If we added relocations pointing to the batch before we knew
+ * its final index (the kernel assumes that the batch is last unless
+ * told otherwise), then we have to go through all the relocations
+ * and point them back to the batch.
+ */
+static void __brw_batch_fixup_self_relocations(struct brw_batch *batch)
+{
+   uint32_t target = batch->bo->target_handle;
+   int n, count;
+
+   count = MIN2(batch->emit.nself, 256);
+   for (n = 0; n < count; n++)
+      batch->reloc[batch->self_reloc[n]].target_handle = target;
+   if (n == 256) {
+      for (n = batch->self_reloc[255] + 1; n < batch->emit.nreloc; n++) {
+         if (batch->reloc[n].target_handle == -1)
+            batch->reloc[n].target_handle = target;
+      }
+   }
+}
+
+static void __brw_batch_undo_self_relocations(struct brw_batch *batch)
+{
+   if (batch->emit.nself >= 256) {
+      uint32_t target = batch->bo->target_handle;
+
+      for (int n = batch->self_reloc[255] + 1; n < batch->emit.nreloc; n++)
+         if (batch->reloc[n].target_handle == target)
+            batch->reloc[n].target_handle = -1;
+   }
+
+   batch->bo->target_handle = -1;
+}
+
+/*
+ * Check to see if the oldest requests have completed and retire them.
+ */
+static void __brw_batch_retire(struct brw_batch *batch)
+{
+   uint32_t seqno = hws_seqno(batch, batch->ring);
+
+   do {
+      struct brw_request *rq;
+
+      rq = batch->requests[batch->ring].lru;
+      if (rq->next == NULL)
+         break;
+
+      if (request_busy(rq, seqno))
+         break;
+
+      __brw_request_retire(rq);
+   } while (1);
+}
+
+/*
+ * Finalize the batch, submit it to hardware, and start a new batch/request.
+ */
+int brw_batch_flush(struct brw_batch *batch, struct perf_debug *perf)
+{
+   struct drm_i915_gem_execbuffer2 execbuf;
+   struct drm_i915_gem_exec_object2 *exec;
+   struct brw_request *rq = batch->next_request;
+
+   __DBG(("%d: %s: emit=%d, nreloc=%d, nexec=%d\n", gettid(), __func__,
+          batch->emit.nbatch, batch->emit.nreloc, batch->emit.nexec));
+
+   assert(!batch->inside_begin_count);
+   assert(batch->tail == batch->map + batch->emit.nbatch);
+   if (unlikely(batch->emit.nbatch == 0))
+      return 0;
+
+   if (unlikely(rq == NULL))
+      return -ENOMEM;
+
+   if (unlikely(perf))
+      brw_batch_report_flush_hook(batch, perf);
+
+   memset(&execbuf, 0, sizeof(execbuf));
+   execbuf.batch_len = __brw_batch_finish(batch, perf);
+   assert(execbuf.batch_len % 8 == 0);
+
+   assert(RQ_BO(rq) == rq->bo);
+   assert(rq->bo == batch->bo);
+   assert(brw_fence_get_request(&rq->bo->write) == NULL);
+   assert(brw_fence_get_request(&rq->bo->read[batch->ring]) == rq);
+   assert(rq->bo->exec != NULL);
+   assert(rq->bo->dirty);
+   assert(rq->seqno);
+
+   /* Done after __brw_batch_finish() as its callbacks may add relocs! */
+   if (rq->bo->target_handle == -1) {
+      rq->bo->target_handle =
+         has_lut(batch) ? batch->emit.nexec : rq->bo->handle;
+      rq->bo->exec =
+         memset(&batch->exec[batch->emit.nexec++], 0, sizeof(*exec));
+
+      __brw_batch_fixup_self_relocations(batch);
+   }
+
+   assert(rq->bo->exec != (void *)1);
+   exec = rq->bo->exec;
+   exec->handle = rq->bo->handle;
+   exec->offset = rq->bo->offset;
+   exec->alignment = rq->bo->alignment;
+   exec->relocation_count = batch->emit.nreloc;
+   exec->relocs_ptr = (uintptr_t)batch->reloc;
+   if (batch->has_softpin)
+      exec->flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+   if (rq->bo->pinned)
+      exec->flags |= EXEC_OBJECT_PINNED;
+   assert((exec->flags & EXEC_OBJECT_WRITE) == 0);
+
+   execbuf.buffers_ptr = (uintptr_t)batch->exec;
+   execbuf.buffer_count = batch->emit.nexec;
+   if (batch->ring == RENDER_RING || batch->has_softpin)
+      execbuf.rsvd1 = batch->hw_ctx;
+   execbuf.flags = hw_ring[batch->ring] | (batch->flags & EXEC_FLAGS);
+
+   if (unlikely(batch->no_hw))
+      goto skip;
+
+   if (unlikely(drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))){
+      static int once;
+
+      if (unlikely(errno == ENOSPC)) {
+         if (batch->base_flags & I915_EXEC_BATCH_FIRST)
+            __brw_batch_undo_self_relocations(batch);
+         rq->seqno = 0;
+         batch->has_softpin = false;
+         batch->reserved = BATCH_RESERVED / 4 + 8;
+         return -ENOSPC;
+      }
+
+      if (!once) {
+         fprintf(stderr,
+                 "Failed to submit batch buffer, rendering will be incorrect: %s [%d]\n",
+                 strerror(errno), errno);
+         once = 1;
+      }
+
+      /* submit a dummy execbuf to keep the fences accurate */
+      batch->map[0] = 0xa << 23;
+      execbuf.batch_len = 0;
+
+      if (drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf)) {
+         assert(errno != ENOSPC);
+         /* Abandon hope all ye who enter here!
+          *
+          * The kernel is refusing to execute our batch, fence tracking is
+          * lost and we can only hope that not too much corruption ensues.
+          * Most likely the GPU is wedged and so no further rendering may
+          * occur, but it may just be a transient failure!
+          *
+          * At any rate, we have to decouple our fences so that we don't die
+          * later on when trying to use them.
+          */
+         list_for_each_entry_safe(struct __brw_fence, fence, &rq->fences, link) {
+            int signal = brw_fence_get_signal(fence);
+            list_inithead(&fence->link);
+            fence->rq = NULL;
+
+            if (signal == READ_SIGNAL) {
+               struct brw_bo *bo = NULL;
+
+               bo = container_of(fence, bo, read[batch->ring]);
+               bo->exec = NULL;
+               bo->dirty = false;
+               bo->target_handle = -1;
+
+               bo->active &= ~(1 << batch->ring);
+               if (!bo->active) {
+                  assert(batch->rss >= bo->size);
+                  batch->rss -= bo->size;
+
+                  if (likely(bo->reusable))
+                     list_move(&bo->link, &batch->inactive);
+
+                  if (unlikely(!bo->refcnt))
+                     __brw_bo_free(bo);
+               }
+            }
+         }
+
+         __brw_batch_reset(batch);
+         return -errno;
+      }
+   }
+
+   if (DBG_PERF_IDLE && batch->idle_time[batch->ring] < 0) {
+      batch->idle_time[batch->ring] += get_time();
+      fprintf(stderr, "GPU command queue %d idle for %.3fms\n",
+              batch->ring, batch->idle_time[batch->ring] * 1000);
+   }
+
+skip:
+   list_for_each_entry_rev(struct __brw_fence, fence, &rq->fences, link) {
+      int signal = brw_fence_get_signal(fence);
+      struct brw_bo *bo = NULL;
+
+      if (unlikely(signal >= WRITE_SIGNAL)) {
+         if (signal == WRITE_SIGNAL)
+            break;
+         else
+            continue;
+      }
+
+      bo = container_of(fence, bo, read[batch->ring]);
+      assert(bo->exec);
+      assert(brw_fence_get_request(&bo->read[batch->ring]) == rq);
+
+      bo->offset = bo->exec->offset;
+      bo->pinned = batch->has_softpin;
+      bo->exec = NULL;
+      bo->dirty = false;
+      bo->target_handle = -1;
+      bo->domain = DOMAIN_NONE;
+   }
+   assert(!rq->bo->dirty);
+   assert(rq->bo->exec == NULL);
+   rq->bo = RQ_MARK_RING(rq, batch->ring);
+   if (batch->requests[batch->ring].mru)
+      batch->requests[batch->ring].mru->next = rq;
+   else
+      batch->requests[batch->ring].lru = rq;
+   batch->requests[batch->ring].mru = rq;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
+      __brw_batch_dump(batch);
+
+   __brw_batch_throttle(batch, rq);
+   __brw_batch_retire(batch);
+
+   brw_batch_clear_dirty(batch);
+
+   return __brw_batch_next(batch);
+}
+
+/*
+ * Is the GPU still processing the most recent batch submitted?
+ * (Note this does not include the batch currently being constructed.)
+ */
+bool brw_batch_busy(struct brw_batch *batch)
+{
+   return __brw_request_busy(batch->requests[batch->ring].mru, 0, NULL);
+}
+
+/*
+ * Wait for all GPU processing to complete.
+ */
+void brw_batch_wait(struct brw_batch *batch, struct perf_debug *perf)
+{
+   brw_batch_flush(batch, perf);
+   for (unsigned n = 0; n < __BRW_NUM_RINGS; n++)
+      __brw_request_wait(batch->requests[n].mru, -1, perf);
+}
+
+static bool __is_uncached(int fd, uint32_t handle)
+{
+   struct drm_i915_gem_caching arg;
+
+   memset(&arg, 0, sizeof(arg));
+   arg.handle = handle;
+   drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_CACHING, &arg);
+   /* There is no right answer if an error occurs here. Fortunately, the
+    * only error is ENOENT and that's impossible!
+    */
+   return arg.caching != I915_CACHING_CACHED;
+}
+
+/*
+ * Wrap a drm_intel_bo reference in a struct brw_bo. Ownership of that
+ * reference is transferred to the struct brw_bo.
+ */
+struct brw_bo *brw_bo_import(struct brw_batch *batch,
+                             drm_intel_bo *base,
+                             bool borrow)
+{
+   struct brw_bo *bo;
+   uint32_t tiling, swizzling;
+
+   if (unlikely(base == NULL))
+      return NULL;
+
+   assert(base->handle);
+   assert(base->size);
+
+   if (borrow) {
+      bo = __brw_batch_lookup_handle(batch, base->handle);
+      if (bo) {
+         bo->refcnt++;
+         return bo;
+      }
+   }
+
+   if (batch->freed_bo) {
+      bo = batch->freed_bo;
+      batch->freed_bo = (struct brw_bo *)bo->base;
+   } else {
+      bo = malloc(sizeof(*bo));
+      if (unlikely(bo == NULL))
+         return NULL;
+   }
+
+   memset(bo, 0, sizeof(*bo));
+
+   bo->handle = base->handle;
+   bo->batch = batch;
+   bo->refcnt = 1;
+   bo->offset = base->offset64;
+   bo->alignment = base->align;
+   bo->size = base->size;
+
+   drm_intel_bo_get_tiling(base, &tiling, &swizzling);
+   bo->tiling = tiling;
+   bo->reusable = !borrow;
+   bo->cache_coherent = batch->has_llc; /* XXX libdrm bookkeeping */
+
+   batch->vmsize += bo->size;
+
+   for (unsigned n = 0; n < __BRW_NUM_RINGS; n++)
+      list_inithead(&bo->read[n].link);
+   list_inithead(&bo->write.link);
+
+   bo->base = base;
+   if (borrow) {
+      list_add(&bo->link, borrowed(batch, bo->handle));
+      drm_intel_bo_reference(base);
+      if (bo->cache_coherent)
+         bo->cache_coherent = !__is_uncached(batch->fd, bo->handle);
+   } else {
+      list_add(&bo->link, &batch->inactive);
+      /* If the buffer hasn't been used before on the GPU, presume it is a
+       * new buffer in the CPU write domain. However, a buffer may have been
+       * mapped without ever being used on the GPU - but that should be
+       * relatively rare compared to the optimisation opportunity of first
+       * writing through the CPU.
+       */
+      if (bo->offset == 0)
+         __brw_bo_set_domain(bo, DOMAIN_CPU, true);
+   }
+
+   return bo;
+}
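+
+/* Illustrative usage (a sketch): wrapping a libdrm bo shared via the screen,
+ * as done for the workaround bo later in this patch:
+ *
+ *    brw_bo *bo = brw_bo_import(&brw->batch, brw->screen->workaround_bo, true);
+ *    if (bo == NULL)
+ *       return -ENOMEM;
+ */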
+
+/*
+ * Search the list of active buffers (a local short lived cache) for
+ * something of the right size to reuse for the allocation request.
+ */
+static struct brw_bo *__brw_bo_create__cached(struct brw_batch *batch,
+                                              uint64_t size)
+{
+   list_for_each_entry(struct brw_bo, bo, &batch->active, link) {
+      assert(bo->batch == batch);
+      assert(bo->active);
+
+      /* Skip buffers more than 4/3 of the requested size to limit waste */
+      if (bo->size < size || 3*bo->size > 4*size)
+         continue;
+
+      list_move(&bo->link, &batch->inactive);
+      bo->refcnt++;
+      return bo;
+   }
+
+   return NULL;
+}
+
+struct brw_bo *brw_bo_create(struct brw_batch *batch,
+                             const char *name,
+                             uint64_t size,
+                             uint64_t alignment,
+                             unsigned flags)
+{
+   drm_intel_bo *base;
+   struct brw_bo *bo;
+
+   if (flags & BO_ALLOC_FOR_RENDER) {
+      bo = __brw_bo_create__cached(batch, size);
+      if (bo) {
+         /* XXX rename */
+         bo->alignment = alignment;
+         if (bo->tiling != I915_TILING_NONE) {
+            uint32_t tiling = I915_TILING_NONE;
+            drm_intel_bo_set_tiling(bo->base, &tiling, 0);
+            bo->tiling = tiling;
+         }
+         if (bo->tiling != I915_TILING_NONE) {
+            list_move(&bo->link, &batch->active);
+            bo->refcnt--;
+         } else
+            return bo;
+      }
+   }
+
+   base = drm_intel_bo_alloc(batch->bufmgr, name, size, alignment);
+   if (unlikely(base == NULL))
+      return NULL;
+
+   bo = brw_bo_import(batch, base, false);
+   if (unlikely(bo == NULL)) {
+      drm_intel_bo_unreference(base);
+      return NULL;
+   }
+
+   return bo;
+}
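+
+/* Illustrative usage (a sketch, mirroring the program cache later in this
+ * patch):
+ *
+ *    brw_bo *bo = brw_bo_create(&brw->batch, "program cache", new_size, 64, 0);
+ *    ...
+ *    brw_bo_put(bo);
+ */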
+
+static bool __brw_bo_set_caching(struct brw_bo *bo, int caching)
+{
+   struct drm_i915_gem_caching arg;
+
+   memset(&arg, 0, sizeof(arg));
+   arg.handle = bo->handle;
+   arg.caching = caching;
+   return drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) == 0;
+}
+
+void brw_bo_enable_snoop(struct brw_bo *bo)
+{
+   assert(bo->refcnt);
+   assert(bo->batch);
+   assert(bo->reusable);
+
+   if (bo->cache_coherent)
+      return;
+
+   if (bo->tiling)
+      return; /* XXX abort? */
+
+   if (!__brw_bo_set_caching(bo, I915_CACHING_CACHED))
+      return;
+
+   drm_intel_bo_disable_reuse(bo->base);
+   if (bo->reusable)
+      list_move(&bo->link, &bo->batch->inactive);
+
+   bo->reusable = false;
+   bo->cache_coherent = true;
+}
+
+static uint64_t brw_surface_size(int cpp,
+                                 uint32_t width,
+                                 uint32_t height,
+                                 uint32_t tiling,
+                                 uint32_t *pitch)
+{
+   uint32_t tile_width, tile_height;
+
+   switch (tiling) {
+   default:
+   case I915_TILING_NONE:
+      tile_width = 64;
+      tile_height = 2;
+      break;
+   case I915_TILING_X:
+      tile_width = 512;
+      tile_height = 8;
+      break;
+   case I915_TILING_Y:
+      tile_width = 128;
+      tile_height = 32;
+      break;
+   }
+
+   *pitch = ALIGN(width * cpp, tile_width);
+   height = ALIGN(height, tile_height);
+   height *= *pitch;
+   return ALIGN(height, 4096);
+}
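+
+/* Worked example (illustrative): a 256x64, 4-cpp, X-tiled surface gives
+ *    pitch  = ALIGN(256 * 4, 512)    = 1024
+ *    height = ALIGN(64, 8)           = 64
+ *    size   = ALIGN(64 * 1024, 4096) = 65536 bytes (16 pages)
+ */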
+
+struct brw_bo *
+brw_bo_create_tiled(struct brw_batch *batch,
+                    const char *name,
+                    uint32_t width,
+                    uint32_t height,
+                    int cpp,
+                    uint32_t *tiling,
+                    uint32_t *pitch,
+                    unsigned flags)
+{
+   unsigned long __pitch;
+   drm_intel_bo *base;
+   struct brw_bo *bo;
+
+   if (flags & BO_ALLOC_FOR_RENDER) {
+      uint64_t size = brw_surface_size(cpp, width, height, *tiling, pitch);
+
+      bo = __brw_bo_create__cached(batch, size);
+      if (bo) {
+         /* XXX rename */
+         bo->alignment = 0;
+         drm_intel_bo_set_tiling(bo->base, tiling, *pitch);
+         bo->tiling = *tiling;
+         return bo;
+      }
+   }
+
+   base = drm_intel_bo_alloc_tiled(batch->bufmgr, name,
+                                   width, height, cpp,
+                                   tiling, &__pitch, flags);
+   if (unlikely(base == NULL))
+      return NULL;
+
+   *pitch = __pitch;
+   bo = brw_bo_import(batch, base, false);
+   if (unlikely(bo == NULL)) {
+      drm_intel_bo_unreference(base);
+      return NULL;
+   }
+
+   return bo;
+}
+
+/*
+ * Import a foreign buffer from another process using the global
+ * (flinked) name.
+ */
+struct brw_bo *brw_bo_create_from_name(struct brw_batch *batch,
+                                       const char *name,
+                                       uint32_t global_name)
+{
+   drm_intel_bo *base;
+   struct brw_bo *bo;
+
+   base = drm_intel_bo_gem_create_from_name(batch->bufmgr, name, global_name);
+   if (unlikely(base == NULL))
+      return NULL;
+
+   bo = brw_bo_import(batch, base, true);
+   drm_intel_bo_unreference(base);
+
+   return bo;
+}
+
+/*
+ * Provide a WC mmapping of the buffer through the GTT aperture. Coherent
+ * everywhere, but reads are very slow (as they are uncached) unless streamed
+ * using movntdqa.
+ * Fenced, so automatically detiled by hardware and constrained to fit in the
+ * aperture (if larger than the aperture the 4.9+ kernel will split the mapping
+ * automatically).
+ */
+static void *brw_bo_map__gtt(struct brw_bo *bo, unsigned flags)
+{
+   __DBG(("%d:%s: handle=%d, flags=%x\n", gettid(), __func__,
+          bo->handle, flags));
+
+   if (flags & MAP_DETILED && bo->tiling)
+      return NULL;
+
+   if (bo->map__gtt == NULL)
+      bo->map__gtt = drm_intel_gem_bo_map__gtt(bo->base);
+
+   if ((flags & MAP_ASYNC) == 0)
+      __brw_bo_set_domain(bo, DOMAIN_GTT, flags & MAP_WRITE);
+
+   return bo->map__gtt;
+}
+
+/*
+ * Provide a WC mmapping of the buffer. Coherent everywhere, but
+ * reads are very slow (as they are uncached) unless streamed using movntdqa.
+ * Unfenced, not constrained by the mappable aperture (no waiting for
+ * evictions).
+ */
+static void *brw_bo_map__wc(struct brw_bo *bo, unsigned flags)
+{
+   __DBG(("%d:%s: handle=%d, flags=%x\n", gettid(), __func__,
+          bo->handle, flags));
+
+   if (bo->map__wc == NULL)
+      bo->map__wc = drm_intel_gem_bo_map__wc(bo->base);
+
+   if ((flags & MAP_ASYNC) == 0)
+      __brw_bo_set_domain(bo, DOMAIN_GTT, flags & MAP_WRITE);
+
+   return bo->map__wc;
+}
+
+/*
+ * Provide a WB mmapping of the buffer. Incoherent on non-LLC platforms
+ * and will trigger clflushes of the entire buffer. Unfenced, not
+ * constrained by the mappable aperture.
+ */
+static void *brw_bo_map__cpu(struct brw_bo *bo, unsigned flags)
+{
+   __DBG(("%d:%s: handle=%d, flags=%x\n", gettid(), __func__,
+          bo->handle, flags));
+
+   if (bo->map__cpu == NULL)
+      bo->map__cpu = drm_intel_gem_bo_map__cpu(bo->base);
+   assert(bo->map__cpu);
+
+   if ((flags & MAP_ASYNC) == 0)
+      __brw_bo_set_domain(bo, DOMAIN_CPU, flags & MAP_WRITE);
+
+   return bo->map__cpu;
+}
+
+static bool can_map__cpu(struct brw_bo *bo, unsigned flags)
+{
+   if (bo->cache_coherent)
+      return true;
+
+   if (flags & MAP_PERSISTENT)
+      return false;
+
+   /* For an active non-atomic mapping, it is possible for a subsequent
+    * map on a different (or even the same) bo to cause the batch to be
+    * flushed and so invalidate the incoherent CPU mmap.
+    */
+   if (bo->exec && (flags & MAP_DETILED) == 0)
+      return false;
+
+   if (bo->domain == DOMAIN_CPU)
+      return true;
+
+   if (flags & MAP_COHERENT)
+      return false;
+
+   return (flags & MAP_WRITE) == 0;
+}
+
+/*
+ * Map the buffer for access by the CPU, either for writing or reading,
+ * and return a pointer for that access.
+ *
+ * If the async flag is not set, any previous writing by the GPU is
+ * waited upon, and if write access is required all GPU reads as well.
+ *
+ * If the async flag is set, the kernel is not informed of the access
+ * and the access may be concurrent with GPU access. Also importantly,
+ * cache domain tracking for the buffer is *not* maintained and so access
+ * modes are limited to coherent modes (taking into account the current
+ * cache domain).
+ *
+ * If the detiled flag is set, the caller will perform manual detiling
+ * through the mapping, and so we do not allocate a fence for the operation.
+ * This can return NULL on failure, for example if the kernel doesn't support
+ * such an operation.
+ *
+ * The method for mapping the buffer is chosen based on the hardware
+ * architecture (LLC has fast coherent reads and writes, non-LLC has fast
+ * coherent writes, slow coherent reads but faster incoherent reads)
+ * and mode of operation. In theory, for every desired access mode,
+ * the pointer is the fastest direct CPU access to the immediate buffer.
+ * However, direct CPU access to this buffer may not always be the fastest
+ * method of accessing the data within that buffer by the CPU!
+ *
+ * Returns NULL on error.
+ */
+void *brw_bo_map(struct brw_bo *bo, unsigned flags, struct perf_debug *perf)
+{
+   assert(bo->refcnt);
+   assert(bo->batch);
+
+   __DBG(("%d:%s: handle=%d, flags=%x\n", gettid(), __func__,
+          bo->handle, flags));
+
+   if ((flags & MAP_ASYNC) == 0) {
+      struct __brw_fence *fences;
+      int nfence;
+
+      if (flags & MAP_WRITE) {
+         fences = bo->read;
+         nfence = __BRW_NUM_RINGS;
+      } else {
+         fences = &bo->write;
+         nfence = 1;
+      }
+      while (nfence--) {
+         struct brw_request *rq = brw_fence_get_request(fences++);
+         if (!rq)
+            continue;
+
+         if (rq->seqno == 0)
+            brw_batch_flush(bo->batch, perf);
+
+         if (unlikely(perf))
+            __brw_request_wait(rq, -1, perf);
+      }
+   }
+
+   if (bo->tiling && (flags & MAP_DETILED) == 0)
+      return brw_bo_map__gtt(bo, flags);
+   else if (can_map__cpu(bo, flags))
+      return brw_bo_map__cpu(bo, flags);
+   else if (bo->batch->has_mmap_wc)
+      return brw_bo_map__wc(bo, flags);
+   else
+      return brw_bo_map__gtt(bo, flags);
+}
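+
+/* Illustrative usage (a sketch, as in the shader-time collection later in
+ * this patch): a synchronous, linear mapping for writing:
+ *
+ *    void *map = brw_bo_map(bo, MAP_WRITE, NULL);
+ *    if (map)
+ *       memset(map, 0, bo->size);
+ */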
+
+/*
+ * Write a portion of the *linear* buffer using the pointer provided.
+ *
+ * This is conceptually equivalent to calling
+ *   memcpy(brw_bo_map(MAP_WRITE | MAP_DETILED | flags) + offset, data, length)
+ * but can be much more efficient as it will try to avoid cache domain
+ * side-effects (if any).
+ */
+void brw_bo_write(struct brw_bo *bo,
+                  uint64_t offset,
+                  const void *data,
+                  uint64_t length,
+                  unsigned flags,
+                  struct perf_debug *perf)
+{
+   struct drm_i915_gem_pwrite pwrite;
+   void *map;
+
+   __DBG(("%d:%s: handle=%d, offset=%llu, length=%lld, flags=%x\n",
+          gettid(), __func__,
+          bo->handle, (long long)offset, (long long)length, flags));
+
+   assert(bo->refcnt);
+   assert(bo->batch);
+   assert(offset < bo->size);
+   assert(length <= bo->size - offset);
+
+   map = brw_bo_map(bo, MAP_WRITE | MAP_DETILED | flags, perf);
+   if (map) {
+      memcpy(map + offset, data, length);
+      return;
+   }
+
+   memset(&pwrite, 0, sizeof(pwrite));
+   pwrite.handle = bo->handle;
+   pwrite.offset = offset;
+   pwrite.size = length;
+   pwrite.data_ptr = (uintptr_t)data;
+   if (unlikely(drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite)))
+      return;
+
+   assert(bo->refcnt);
+   bo->domain = DOMAIN_GTT;
+}
+
+/*
+ * Read a portion of the *linear* buffer into the pointer provided.
+ *
+ * This is conceptually equivalent to calling
+ *   memcpy(data, brw_bo_map(MAP_READ | MAP_DETILED | flags) + offset, length)
+ * but can be much more efficient as it will try to avoid cache domain
+ * side-effects (if any).
+ */
+void brw_bo_read(struct brw_bo *bo,
+                 uint64_t offset,
+                 void *data,
+                 uint64_t length,
+                 unsigned flags,
+                 struct perf_debug *perf)
+{
+   struct drm_i915_gem_pread pread;
+   void *map;
+
+   __DBG(("%d:%s: handle=%d, offset=%llu, length=%llu, flags=%x\n",
+          gettid(), __func__,
+          bo->handle, (long long)offset, (long long)length, flags));
+
+   assert(bo->refcnt);
+   assert(bo->batch);
+   assert(offset < bo->size);
+   assert(length <= bo->size - offset);
+
+   if (bo->cache_coherent || bo->domain == DOMAIN_CPU) {
+      map = brw_bo_map(bo, MAP_READ | MAP_DETILED | flags, perf);
+      if (map) {
+         memcpy(data, map + offset, length);
+         return;
+      }
+   }
+
+   if ((flags & MAP_ASYNC) == 0) {
+      struct brw_request *rq = brw_fence_get_request(&bo->write);
+      if (rq && rq->seqno == 0)
+         brw_batch_flush(bo->batch, perf);
+   }
+
+   memset(&pread, 0, sizeof(pread));
+   pread.handle = bo->handle;
+   pread.offset = offset;
+   pread.size = length;
+   pread.data_ptr = (uintptr_t)data;
+
+   if (unlikely(perf))
+      __brw_request_wait(brw_fence_get_request(&bo->write), -1, perf);
+
+   if (unlikely(drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_PREAD, &pread)))
+      return;
+
+   assert(bo->refcnt);
+   if (bo->domain != DOMAIN_CPU)
+      bo->domain = DOMAIN_NONE;
+}
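+
+/* Illustrative usage (a sketch, mirroring the program cache copy later in
+ * this patch): stream the old cache contents into a freshly mapped bo:
+ *
+ *    brw_bo_read(cache->bo, 0,
+ *                brw_bo_map(new_bo, MAP_WRITE, NULL), cache->next_offset,
+ *                MAP_ASYNC, NULL);
+ */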
+
+/*
+ * After the final reference to a bo is released, free the buffer.
+ *
+ * If the buffer is still active, and it is reusable, the buffer is
+ * transferred to the local active cache and may be reallocated on the
+ * next call to brw_bo_create() or brw_bo_create_tiled(). Otherwise the
+ * buffer is returned back to the shared screen bufmgr pool.
+ */
+void  __brw_bo_free(struct brw_bo *bo)
+{
+   struct brw_batch *batch;
+
+   __DBG(("%d:%s: handle=%d\n", gettid(), __func__, bo->handle));
+
+   assert(bo->refcnt == 0);
+
+   if (bo->active) {
+      assert(bo->batch);
+      if (bo->reusable)
+         list_move(&bo->link, &bo->batch->active);
+      return;
+   }
+
+   assert(!bo->write.rq);
+   list_del(&bo->link);
+
+   if (bo->offset)
+      bo->base->offset64 = bo->offset;
+   drm_intel_bo_unreference(bo->base);
+
+   batch = bo->batch;
+   if (batch == NULL) {
+      free(bo);
+      return;
+   }
+
+   batch->vmsize -= bo->size;
+   if (batch->vmsize < batch->peak_rss)
+      batch->peak_rss = batch->vmsize;
+
+   bo->base = (drm_intel_bo *)batch->freed_bo;
+   batch->freed_bo = bo;
+}
+
+/* Prepare the fresh batch and its request for the first command packet */
+static void __brw_batch_start(struct brw_batch *batch)
+{
+   struct brw_request *rq = batch->next_request;
+
+   assert(!batch->bo->dirty);
+   assert(list_empty(&rq->fences));
+   assert(brw_fence_get_request(&rq->bo->read[batch->ring]) == NULL);
+   assert(!rq->bo->active);
+
+   list_add(&rq->bo->read[batch->ring].link, &rq->fences);
+   rq->bo->read[batch->ring].rq = FENCE_MARK_SIGNAL(rq, READ_SIGNAL);
+   rq->bo->active = 1 << batch->ring;
+   batch->rss += BATCH_SIZE;
+
+   /* An early allocation error should be impossible */
+   brw_batch_start_hook(batch);
+   batch->emit.nbatch = batch->tail - batch->map;
+
+   batch->bo->dirty = true;
+}
+
+/* Record the position (and reloc entries) before the start of a command packet,
+ * so we can unroll this command packet upon overflow.
+ */
+static void __brw_batch_save(struct brw_batch *batch)
+{
+   assert(batch->next_request->seqno == 0);
+   assert(batch->emit.nbatch < batch->state - batch->reserved);
+
+   batch->saved = batch->emit;
+}
+
+/*
+ * Mark the beginning of a batch construction critical section, during which
+ * the batch is not allowed to be flushed. Access to the batch prior to this
+ * call is invalid. Access after this call but with instructions for another
+ * ring is also invalid. All command emission (BEGIN_BATCH()/OUT_BATCH())
+ * must be inside a brw_batch_begin()/brw_batch_end() pairing - the only
+ * exceptions are the brw_batch_start_hook() and brw_batch_finish_hook()
+ * callbacks.
+ *
+ * Control returns to the caller of brw_batch_begin() if an error is
+ * encountered whilst inside the critical section. If the return code
+ * is negative, a fatal error occurred. If the return code is positive,
+ * the batch had to be flushed and the critical section needs to be restarted.
+ *
+ * On success 0 is returned.
+ *
+ * Must be paired with brw_batch_end().
+ */
+int __brw_batch_begin(struct brw_batch *batch,
+                      uint32_t bytes,
+                      enum brw_gpu_ring ring)
+{
+   uint16_t space;
+
+   __DBG(("%d:%s: bytes=%d, ring=%d, depth=%d, head=%d\n", gettid(), __func__,
+          bytes, ring, batch->inside_begin_count - 1, batch->emit.nbatch));
+
+   assert(batch->inside_begin_count == 1);
+   if (unlikely(batch->next_request == NULL)) {
+      batch->inside_begin_count = 0;
+      return -ENOMEM;
+   }
+
+   assert(batch->next_request->seqno == 0);
+   assert(batch->tail == batch->map + batch->emit.nbatch);
+   ring = batch->actual_ring[ring];
+   if (ring != batch->ring)
+      space = 0;
+   else
+      space = batch->state - batch->reserved - batch->emit.nbatch;
+   if (unlikely(bytes/4 > space)) {
+      int err;
+
+      batch->inside_begin_count = 0;
+
+      err = brw_batch_flush(batch, NULL);
+      if (unlikely(err))
+         return err;
+
+      assert(batch->next_request);
+      assert(!batch->bo->dirty);
+
+      assert(batch->inside_begin_count == 0);
+      batch->inside_begin_count = 1;
+
+      assert(batch->emit.nbatch == 0);
+   }
+
+   batch->ring = ring;
+
+   if (!batch->bo->dirty)
+      __brw_batch_start(batch);
+
+   __brw_batch_save(batch);
+   return 0;
+}
+
+/*
+ * Mark the end of a batch construction critical section. After this call
+ * the batch is inaccessible until the next brw_batch_begin().
+ *
+ * We may flush the batch to hardware if it exceeds the aperture
+ * high water mark. If the batch submission fails, we rollback to the
+ * end of the previous critical section and try flushing again. If that
+ * should fail, we report the error back to the caller. If the rollback
+ * succeeds, we jump back to the brw_batch_begin() with a fresh request
+ * and run through the critical section again.
+ *
+ * Any error is reported back through the return value of the paired
+ * brw_batch_begin().
+ *
+ * Must be paired with brw_batch_begin().
+ */
+void brw_batch_end(struct brw_batch *batch)
+{
+   int err;
+
+   __DBG(("%d: %s: depth=%d, tail=%d\n", gettid(), __func__,
+          batch->inside_begin_count - 1, (int)(batch->tail - batch->map)));
+
+   assert(batch->inside_begin_count);
+   if (--batch->inside_begin_count)
+      return;
+
+   batch->emit.nbatch = batch->tail - batch->map;
+   assert(batch->emit.nbatch < batch->state - batch->reserved);
+
+   /* Check to see if this packet will cause us to overflow our aperture
+    * space available for the batch. If it is too large the kernel will
+    * not be able to fit everything into memory for execution and will
+    * reject the execbuf with -ENOSPC.
+    */
+   err = 0;
+   if (batch->aperture > batch->max_aperture)
+      err = brw_batch_flush(batch, NULL);
+   if (likely(err != -ENOSPC))
+      return;
+
+   /* We ran out of space, roll back to before this last packet, submit
+    * (it should fit!) and then restart the command emission afresh. We
+    * will have to emit different state for the new batch, so we can not
+    * just copy across the residual relocations.
+    */
+   batch->emit = batch->saved;
+   batch->tail = batch->map + batch->emit.nbatch;
+
+   if (unlikely(brw_batch_flush(batch, NULL) == -ENOSPC)) {
+      static GLuint msg_id;
+      if (!msg_id) {
+         struct brw_context *brw = container_of(batch, brw, batch);
+         fprintf(stderr, "WARNING: Aperture space exceeded!\n");
+         _mesa_gl_debug(&brw->ctx, &msg_id,
+                        MESA_DEBUG_SOURCE_API,
+                        MESA_DEBUG_TYPE_OTHER,
+                        MESA_DEBUG_SEVERITY_HIGH,
+                        "Aperture space exceeded!\n");
+      }
+      longjmp(batch->jmpbuf, -ENOSPC);
+   }
+
+   batch->inside_begin_count++;
+   __brw_batch_start(batch);
+
+   __brw_batch_save(batch);
+   longjmp(batch->jmpbuf, 1);
+}
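+
+/* Illustrative usage (a sketch, as in the pipe-control emitters later in
+ * this patch): every packet emission is wrapped in a begin/end critical
+ * section, restarting automatically if the batch had to be flushed:
+ *
+ *    if (brw_batch_begin(&brw->batch, 60, RENDER_RING) >= 0) {
+ *       brw_emit_pipe_control_flush(brw, flags);
+ *       brw_batch_end(&brw->batch);
+ *    }
+ */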
+
+/*
+ * How much of the batch is used, by both the commands emitted at the start
+ * of the batch and the state data packed at its end?
+ */
+static inline int __brw_batch_size(struct brw_batch *batch)
+{
+   return batch->emit.nbatch + BATCH_SIZE/4 - batch->state;
+}
+
+/*
+ * After a high-level draw command, check to see if we want to flush
+ * the batch to the hardware for either debug reasons or for sanity.
+ */
+int brw_batch_maybe_flush(struct brw_batch *batch)
+{
+   if (unlikely(batch->always_flush))
+      goto flush;
+
+   /* If the working set exceeds the GTT's limits, we will need to evict
+    * textures in order to execute batches. As we have no method for predicting
+    * when we need to evict, we need to frequently flush the batch so that any
+    * stalls are minimised.
+    */
+   if (batch->peak_rss > batch->max_aperture && __brw_batch_size(batch) > 2048)
+      goto flush;
+
+   return 0;
+
+flush:
+   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
+      fprintf(stderr, "Forcing batchbuffer flush after %d: debug.always_flush?=%d, rss=%d [cap %d], vmasize=%d\n",
+              batch->emit.nbatch,
+              batch->always_flush,
+              (int)(batch->peak_rss >> 20), (int)(batch->max_aperture >> 20),
+              (int)(batch->vmsize >> 20));
+   }
+   return brw_batch_flush(batch, NULL);
+}
+
+/*
+ * Query the kernel for the number of times our hardware context has
+ * been implicated in a reset event - either guilty or just a victim,
+ * and the number of resets that have occurred overall.
+ */
+int brw_batch_get_reset_stats(struct brw_batch *batch,
+                              uint32_t *reset_count,
+                              uint32_t *active,
+                              uint32_t *pending)
+{
+   struct drm_i915_reset_stats stats;
+
+   memset(&stats, 0, sizeof(stats));
+   stats.ctx_id = batch->hw_ctx;
+   if (unlikely(drmIoctl(batch->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats)))
+      return -errno;
+
+   *reset_count = stats.reset_count;
+   *active = stats.batch_active;
+   *pending = stats.batch_pending;
+   return 0;
+}
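+
+/* Illustrative usage (a sketch; the consumer shown here is an assumption,
+ * not part of this patch):
+ *
+ *    uint32_t reset_count, active, pending;
+ *    if (brw_batch_get_reset_stats(&brw->batch,
+ *                                  &reset_count, &active, &pending) == 0)
+ *       ... report robustness status to the application ...
+ */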
+
+/*
+ * Mark the buffers as being invalid to prevent stale dereferences when
+ * tearing down shared resources.
+ */
+static void __brw_bo_list_fini(struct list_head *list)
+{
+   list_for_each_entry_safe(struct brw_bo, bo, list, link) {
+      assert(bo->batch);
+      assert(!bo->active);
+
+      bo->batch = NULL;
+      list_delinit(&bo->link);
+   }
+}
+
+/* Normally we never free a request as they are recycled between batches -
+ * except when we tear down the batch manager and free everything.
+ */
+static void __brw_request_free(struct brw_request *rq)
+{
+   /* Opencode the free(bo) here to handle batch->next_request */
+   assert(RQ_BO(rq) == rq->bo);
+   list_delinit(&rq->bo->link);
+   free(rq->bo);
+   free(rq);
+}
+
+static void
+__brw_batch_fini__requests(struct brw_batch *batch)
+{
+   struct brw_request *rq = batch->next_request;
+
+   /* Clean batch, just put request back onto the freelist */
+   if (!rq->bo->dirty) {
+      assert(list_empty(&rq->fences));
+      rq->next = batch->freed_rq;
+      batch->freed_rq = rq;
+      goto retire_requests;
+   }
+
+   /* Incomplete batch, decouple buffers from the request */
+   list_for_each_entry_rev(struct __brw_fence, fence, &rq->fences, link) {
+      int signal = brw_fence_get_signal(fence);
+      struct brw_bo *bo = NULL;
+
+      if (unlikely(signal >= WRITE_SIGNAL)) {
+         if (signal == WRITE_SIGNAL)
+            break;
+         else
+            continue;
+      }
+
+      bo = container_of(fence, bo, read[batch->ring]);
+      assert(bo->exec);
+      assert(brw_fence_get_request(&bo->read[batch->ring]) == rq);
+
+      bo->exec = NULL;
+      bo->dirty = false;
+      bo->target_handle = -1;
+   }
+   assert(!rq->bo->dirty);
+   rq->bo = RQ_MARK_RING(rq, batch->ring);
+   if (batch->requests[batch->ring].mru)
+      batch->requests[batch->ring].mru->next = rq;
+   else
+      batch->requests[batch->ring].lru = rq;
+   batch->requests[batch->ring].mru = rq;
+
+retire_requests:
+   for (unsigned n = 0; n < __BRW_NUM_RINGS; n++) {
+      rq = batch->requests[n].mru;
+      if (rq == NULL)
+         continue;
+
+      /* Note that the request and buffers are not truly idle here. It is
+       * safe as the kernel will keep a reference whilst the buffers are
+       * active (so we can shut down ahead of time), but we need to disable
+       * our runtime assertions that the request is idle at the time of
+       * retiring.
+       */
+      __brw_request_retire(rq);
+
+      assert(batch->requests[n].lru == NULL);
+      assert(batch->requests[n].mru == NULL);
+   }
+
+   while (batch->freed_rq) {
+      rq = batch->freed_rq;
+      batch->freed_rq = rq->next;
+      __brw_request_free(rq);
+   }
+}
+
+/*
+ * Teardown the batch manager and free all associated memory and resources.
+ */
+void brw_batch_fini(struct brw_batch *batch)
+{
+   /* All bo should have been released before the destructor is called */
+   __brw_batch_fini__requests(batch);
+
+   brw_bo_put(batch->seqno_bo);
+   assert(list_empty(&batch->active));
+   for (unsigned n = 0; n < 1 << BORROWED_BITS; n++)
+      __brw_bo_list_fini(&batch->borrowed[n]);
+   __brw_bo_list_fini(&batch->inactive);
+
+   while (batch->freed_bo) {
+      struct brw_bo *bo = batch->freed_bo;
+      batch->freed_bo = (struct brw_bo *)bo->base;
+      free(bo);
+   }
+
+   free(batch->exec);
+   free(batch->reloc);
+
+   if (batch->hw_ctx) {
+      struct drm_i915_gem_context_destroy destroy;
+
+      memset(&destroy, 0, sizeof(destroy));
+      destroy.ctx_id = batch->hw_ctx;
+      drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy);
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_batch.h b/src/mesa/drivers/dri/i965/brw_batch.h
index e67e874a8e..e6e2f801ad 100644
--- a/src/mesa/drivers/dri/i965/brw_batch.h
+++ b/src/mesa/drivers/dri/i965/brw_batch.h
@@ -31,57 +31,115 @@
 extern "C" {
 #endif
 
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
 #include <setjmp.h>
+#include <assert.h>
+
+#include <intel_aub.h>
 
 #include <intel_bufmgr.h>
 
 #include "util/list.h"
+#include "util/macros.h"
+
+struct _drm_intel_bufmgr;
+struct _drm_intel_bo;
 
-typedef drm_intel_bo brw_bo;
+struct intel_screen;
+struct perf_debug;
 
 enum brw_gpu_ring {
-   UNKNOWN_RING,
-   RENDER_RING,
+   RENDER_RING = 0,
    BLT_RING,
+   __BRW_NUM_RINGS,
+};
+
+struct brw_batch;
+struct brw_request;
+
+enum brw_bo_domain { DOMAIN_NONE, DOMAIN_CPU, DOMAIN_GTT };
+
+/* A fence is created at the current point on the ordered batch timeline. When
+ * the GPU passes that point, the fence is signalled; alternatively you can
+ * wait for the fence to complete.
+ */
+struct __brw_fence {
+   struct brw_request *rq;
+   struct list_head link;
+   uint32_t seqno;
 };
 
+typedef struct brw_bo {
+   struct brw_batch *batch;
+   struct drm_i915_gem_exec_object2 *exec;
+   struct __brw_fence read[__BRW_NUM_RINGS], write;
+
+   unsigned active : __BRW_NUM_RINGS;
+   unsigned dirty : 1;
+   unsigned domain : 2;
+   unsigned tiling : 4;
+   unsigned pinned : 1;
+   unsigned cache_coherent : 1;
+   unsigned reusable : 1;
+
+   unsigned refcnt;
+   uint32_t handle;
+   uint32_t target_handle;
+   uint64_t size;
+   uint64_t alignment;
+   uint64_t offset;
+
+   struct _drm_intel_bo *base;
+   struct list_head link;
+
+   void *map__cpu;
+   void *map__gtt;
+   void *map__wc;
+} brw_bo;
+
 typedef struct brw_batch {
-   /** Current batchbuffer being queued up. */
-   brw_bo *bo;
-   /** Last BO submitted to the hardware.  Used for glFinish(). */
-   brw_bo *last_bo;
+   int fd;
 
-#ifdef DEBUG
-   uint16_t emit, total;
-#endif
-   uint16_t reserved_space;
-   uint32_t *map_next;
+   struct brw_bo *bo;
    uint32_t *map;
-   uint32_t *cpu_map;
-#define BATCH_SZ (8192*sizeof(uint32_t))
+   uint32_t *tail;
+
+   uint32_t flags;
+#define BATCH_HAS_STATE_BASE    (1 << 31)
+   uint32_t base_flags;
 
-   uint32_t state_batch_offset;
    enum brw_gpu_ring ring;
-   unsigned flags;
-#define BATCH_HAS_SOL           (1 << 0)
-#define BATCH_HAS_STATE_BASE    (1 << 1)
-   int gen;
+   uint32_t hw_ctx;
 
-   jmp_buf jmpbuf;
-   bool repeat;
-   unsigned begin_count;
-   bool no_batch_wrap;
+   uint16_t reserved;
+   uint16_t state;
 
-   struct {
-      uint32_t *map_next;
-      int reloc_count;
-   } saved;
+   struct brw_batch_state {
+      uint16_t nbatch;
+      uint16_t nexec;
+      uint16_t nreloc;
+      uint16_t nself;
+   } emit, saved;
 
-   dri_bufmgr *bufmgr;
+   uint32_t seqno;
+   const uint32_t *seqno_map;
+   struct brw_bo *seqno_bo;
 
-   /** Framerate throttling: @{ */
-   brw_bo *throttle_batch[2];
+   uint64_t aperture;
+   uint64_t max_aperture;
+   uint64_t rss, peak_rss, vmsize;
+
+   int gen;
+   bool has_softpin;
+   bool has_llc;
+   bool has_mmap_wc;
+   bool needs_pipecontrol_ggtt_wa;
+
+   bool always_flush;
 
+   /** Framerate throttling: @{ */
    /* Limit the number of outstanding SwapBuffers by waiting for an earlier
     * frame of rendering to complete. This gives a very precise cap to the
     * latency between input and output such that rendering never gets more
@@ -94,12 +152,39 @@ typedef struct brw_batch {
 
    /** General throttling, not caught by throttling between SwapBuffers */
    bool need_flush_throttle;
+   bool disable_throttling;
    /** @} */
 
-   bool always_flush;
-   bool disable_throttling;
+   bool no_hw;
+
+   unsigned inside_begin_count;
+   jmp_buf jmpbuf;
+
+   uint16_t exec_size;
+   uint16_t reloc_size;
+
+   struct drm_i915_gem_exec_object2 *exec;
+   struct drm_i915_gem_relocation_entry *reloc;
+   uint16_t self_reloc[256];
+
+   int actual_ring[__BRW_NUM_RINGS];
+   struct brw_request *next_request;
+   struct {
+      struct brw_request *lru, *mru;
+   } requests[__BRW_NUM_RINGS];
+   struct brw_request *throttle[2];
+   struct brw_request *freed_rq;
+
+   double idle_time[__BRW_NUM_RINGS];
 
-   drm_intel_context *hw_ctx;
+   struct intel_screen *screen;
+   struct _drm_intel_bufmgr *bufmgr;
+   struct list_head active, inactive;
+
+#define BORROWED_BITS 3
+   struct list_head borrowed[1<<BORROWED_BITS];
+
+   struct brw_bo *freed_bo;
 
    /**
     * Set of brw_bo* that have been rendered to within this batchbuffer
@@ -109,190 +194,291 @@ typedef struct brw_batch {
    struct set *render_cache;
 } brw_batch;
 
-/**
- * Number of bytes to reserve for commands necessary to complete a batch.
- *
- * This includes:
- * - MI_BATCHBUFFER_END (4 bytes)
- * - Optional MI_NOOP for ensuring the batch length is qword aligned (4 bytes)
- * - Any state emitted by vtbl->finish_batch():
- *   - Gen4-5 record ending occlusion query values (4 * 4 = 16 bytes)
- *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
- *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
- *     - Two sets of PIPE_CONTROLs, which become 4 PIPE_CONTROLs each on SNB,
- *       which are 5 DWords each ==> 2 * 4 * 5 * 4 = 160 bytes
- *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
- *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
- *       Sandybridge PIPE_CONTROL madness.
- *   - CC_STATE workaround on HSW (17 * 4 = 68 bytes)
- *     - 10 dwords for initial mi_flush
- *     - 2 dwords for CC state setup
- *     - 5 dwords for the required pipe control at the end
- *   - Restoring L3 configuration: (24 dwords = 96 bytes)
- *     - 2*6 dwords for two PIPE_CONTROL flushes.
- *     - 7 dwords for L3 configuration set-up.
- *     - 5 dwords for L3 atomic set-up (on HSW).
- */
-#define BATCH_RESERVED 308
+int brw_batch_init(struct brw_batch *batch,
+                   struct intel_screen *screen);
 
-inline static brw_bo *brw_bo_create(brw_batch *batch,
-                                    const char *name,
-                                    uint64_t size,
-                                    uint64_t alignment,
-                                    unsigned flags)
+#define GEN8_HIGH_ADDRESS_BIT 47
+static uint64_t gen8_canonical_address(uint64_t address)
 {
-   return drm_intel_bo_alloc(batch->bufmgr, name, size, alignment);
+	const uint8_t shift = 63 - GEN8_HIGH_ADDRESS_BIT;
+	return (int64_t)(address << shift) >> shift;
 }
 
-inline static brw_bo *brw_bo_create_tiled(brw_batch *batch,
-                                          const char *name,
-                                          uint32_t width,
-                                          uint32_t height,
-                                          uint32_t cpp,
-                                          uint32_t *tiling,
-                                          uint32_t *pitch,
-                                          unsigned flags)
+inline static uint64_t
+__brw_reloc_address(brw_bo *bo, uint64_t offset)
 {
-   unsigned long __pitch;
-   brw_bo *bo = drm_intel_bo_alloc_tiled(batch->bufmgr, name,
-                                         width, height, cpp,
-                                         tiling, &__pitch,
-                                         flags);
-   *pitch = __pitch;
-   return bo;
+   return gen8_canonical_address(bo->offset + offset);
 }
 
-inline static brw_bo *brw_bo_create_from_name(brw_batch *batch,
-                                              const char *name,
-                                              uint32_t global_name)
+inline static uint64_t
+brw_reloc_address(brw_bo *bo, uint64_t offset)
 {
-   return drm_intel_bo_gem_create_from_name(batch->bufmgr, name, global_name);
+   return bo ? __brw_reloc_address(bo, offset) : 0;
 }
 
-inline static brw_bo *brw_bo_get(brw_bo *bo)
+/** Add a relocation entry to the current batch
+ * XXX worth specialising 32bit variant?
+ */
+uint64_t __brw_batch_reloc(struct brw_batch *batch,
+                           uint32_t batch_offset,
+                           struct brw_bo *target_bo,
+                           uint64_t target_offset,
+                           unsigned read_domains,
+                           unsigned write_domain);
+static inline uint64_t brw_batch_reloc(struct brw_batch *batch,
+                                       uint32_t batch_offset,
+                                       struct brw_bo *target_bo,
+                                       uint64_t target_offset,
+                                       unsigned read_domains,
+                                       unsigned write_domain)
 {
-   drm_intel_bo_reference(bo);
-   return bo;
-}
+   if (target_bo == NULL)
+      return target_offset;
 
-inline static void brw_bo_put(brw_bo *bo)
-{
-   if (bo)
-      drm_intel_bo_unreference(bo);
+   return __brw_batch_reloc(batch, batch_offset,
+                            target_bo, target_offset,
+                            read_domains, write_domain);
 }
 
-inline static int brw_bo_madvise(brw_bo *bo, int state)
+int brw_batch_get_reset_stats(struct brw_batch *batch,
+                              uint32_t *reset_count,
+                              uint32_t *active,
+                              uint32_t *pending);
+
+bool brw_batch_busy(struct brw_batch *batch);
+/** Flush and wait for all submitted rendering to complete */
+void brw_batch_wait(struct brw_batch *batch,
+                    struct perf_debug *stall);
+
+void brw_batch_fini(struct brw_batch *batch);
+
+/* Wrap a drm_intel_bo within a local struct brw_bo */
+struct brw_bo *
+brw_bo_import(struct brw_batch *batch,
+              struct _drm_intel_bo *base,
+              bool borrow);
+
+/* Create a local brw_bo for a linear/unfenced buffer and allocate the buffer */
+struct brw_bo *
+brw_bo_create(struct brw_batch *batch,
+              const char *name,
+              uint64_t size,
+              uint64_t alignment,
+              unsigned flags);
+
+/* Create a local brw_bo for a tiled buffer and allocate the buffer */
+struct brw_bo *
+brw_bo_create_tiled(struct brw_batch *batch,
+                    const char *name,
+                    uint32_t width,
+                    uint32_t height,
+                    int cpp,
+                    uint32_t *tiling,
+                    uint32_t *pitch,
+                    unsigned flags);
+
+/* Create a local brw_bo for a foreign buffer using its global flinked name */
+struct brw_bo *brw_bo_create_from_name(struct brw_batch *batch,
+                                       const char *name,
+                                       uint32_t global_name);
+
+/* Enable CPU cache coherency for the buffer. On LLC, normally all buffers
+ * are cache coherent, but on non-LLC architectures we can tell the GPU
+ * to snoop from and to flush into the CPU cache. Performing the snoop
+ * is slower for the GPU, but eliminates the uncached penalty for the CPU,
+ * so it is only useful for streaming data (read once) to the GPU or when
+ * we need to read anything back from the GPU.
+ */
+void brw_bo_enable_snoop(struct brw_bo *bo);
+
+void brw_bo_mark_dirty(struct brw_batch *batch, brw_bo *bo);
+void brw_batch_clear_dirty(struct brw_batch *batch);
+
+inline static int brw_bo_madvise(struct brw_bo *bo, int state)
 {
-   return drm_intel_bo_madvise(bo, state);
+   return drm_intel_bo_madvise(bo->base, state);
 }
 
-inline static uint32_t brw_bo_flink(brw_bo *bo)
+inline static uint32_t brw_bo_flink(struct brw_bo *bo)
 {
    uint32_t name = 0;
-   drm_intel_bo_flink(bo, &name);
+   drm_intel_bo_flink(bo->base, &name);
    return name;
 }
 
-void brw_batch_clear_dirty(brw_batch *batch);
-void brw_bo_mark_dirty(brw_batch *batch, brw_bo *bo);
+int brw_bo_wait(struct brw_bo *bo, int64_t timeout);
+
+void brw_bo_write(struct brw_bo *bo, uint64_t offset,
+                  const void *data, uint64_t length,
+                  unsigned flags,
+                  struct perf_debug *perf);
+void brw_bo_read(struct brw_bo *bo, uint64_t offset,
+                 void *data, uint64_t length,
+                 unsigned flags,
+                 struct perf_debug *perf);
 
-inline static bool brw_batch_busy(brw_batch *batch)
+static inline struct brw_request *brw_fence_get_request(struct __brw_fence *f)
 {
-   return batch->last_bo && drm_intel_bo_busy(batch->last_bo);
+	return (struct brw_request *)((uintptr_t)f->rq & ~3);
 }
 
-#define GEN8_HIGH_ADDRESS_BIT 47
-static uint64_t gen8_canonical_address(uint64_t address)
+static inline int brw_fence_get_signal(struct __brw_fence *f)
 {
-	const uint8_t shift = 63 - GEN8_HIGH_ADDRESS_BIT;
-	return (int64_t)(address << shift) >> shift;
+	return (uintptr_t)f->rq & 3;
 }
 
-inline static uint64_t
-__brw_reloc_address(brw_bo *bo, uint64_t offset)
+bool __brw_request_busy(struct brw_request *rq,
+                        unsigned flags,
+                        struct perf_debug *perf);
+
+#define BUSY_READ 0
+#define BUSY_WRITE 1
+#define BUSY_FLUSH 2
+static inline bool brw_bo_busy(struct brw_bo *bo,
+                               unsigned flags,
+                               struct perf_debug *perf)
 {
-   return gen8_canonical_address(bo->offset64 + offset);
+   struct __brw_fence *fences;
+   int nfence;
+
+   if (!bo)
+      return false;
+
+   assert(bo->refcnt);
+   if (flags & BUSY_WRITE) {
+      fences = bo->read;
+      nfence = __BRW_NUM_RINGS;
+   } else {
+      fences = &bo->write;
+      nfence = 1;
+   }
+   while (nfence--) {
+      struct brw_request *rq = brw_fence_get_request(fences++);
+      if (rq && __brw_request_busy(rq, flags, perf))
+         return true;
+   }
+
+   return false;
 }
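+
+/* Illustrative usage (a sketch): ask whether writing to the bo now would
+ * stall on outstanding GPU work, e.g. to pick a staging-copy path instead:
+ *
+ *    if (brw_bo_busy(bo, BUSY_WRITE, NULL))
+ *       ... take a non-blocking upload path ...
+ */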
 
-inline static uint64_t
-brw_reloc_address(brw_bo *bo, uint64_t offset)
+void *brw_bo_map(struct brw_bo *bo, unsigned flags, struct perf_debug *perf);
+/* Must match MapBufferRange interface (for convenience) */
+#define MAP_READ        GL_MAP_READ_BIT
+#define MAP_WRITE       GL_MAP_WRITE_BIT
+#define MAP_ASYNC       GL_MAP_UNSYNCHRONIZED_BIT
+#define MAP_PERSISTENT  GL_MAP_PERSISTENT_BIT
+#define MAP_COHERENT    GL_MAP_COHERENT_BIT
+/* internal */
+#define MAP_INTERNAL_MASK       (0xff << 24)
+#define MAP_DETILED             (0x01 << 24)
+
+/* Take a new reference to the brw_bo */
+static inline struct brw_bo *brw_bo_get(struct brw_bo *bo)
 {
-   return bo ? __brw_reloc_address(bo, offset) : 0;
+   assert(bo != NULL && bo->refcnt > 0);
+   bo->refcnt++;
+   return bo;
 }
 
-inline static uint64_t
-__brw_batch_reloc(brw_batch *batch,
-		  uint32_t batch_offset,
-		  brw_bo *target_bo,
-		  uint64_t target_offset,
-		  unsigned read_domains,
-		  unsigned write_domain)
+/* Release a reference to the brw_bo */
+void  __brw_bo_free(struct brw_bo *bo);
+static inline void brw_bo_put(struct brw_bo *bo)
 {
-   int ret;
+   assert(bo == NULL || bo->refcnt > 0);
+   if (bo && --bo->refcnt == 0)
+      __brw_bo_free(bo);
+}
 
-   ret = drm_intel_bo_emit_reloc(batch->bo, batch_offset,
-                                 target_bo, target_offset,
-                                 read_domains, write_domain);
-   assert(ret == 0);
-   (void)ret;
+/* Control batch command insertion and submission to hw */
+MUST_CHECK int __brw_batch_begin(struct brw_batch *batch,
+                                 uint32_t estimated_bytes,
+                                 enum brw_gpu_ring ring);
+#define brw_batch_begin(batch, sz, ring) ({                             \
+   int __ret = 0;                                                       \
+   if ((batch)->inside_begin_count++ == 0) {                            \
+      __ret = __brw_batch_begin((batch), (sz), (ring));                 \
+      if (likely(__ret == 0))                                           \
+         __ret = setjmp((batch)->jmpbuf);                               \
+   }                                                                    \
+   __ret; })
+void brw_batch_end(struct brw_batch *batch);
+int brw_batch_flush(struct brw_batch *batch, struct perf_debug *perf);
+int brw_batch_maybe_flush(struct brw_batch *batch);
 
-   return __brw_reloc_address(target_bo, target_offset);
-}
+void brw_batch_start_hook(brw_batch *batch);
+void brw_batch_finish_hook(brw_batch *batch);
 
-inline static uint64_t
-brw_batch_reloc(brw_batch *batch,
-                uint32_t batch_offset,
-                brw_bo *target_bo,
-                uint64_t target_offset,
-                unsigned read_domains,
-                unsigned write_domain)
+/* Interfaces for writing commands into the batch */
+static inline int brw_batch_count(struct brw_batch *batch)
 {
-   if (target_bo == NULL)
-      return 0;
-
-   return __brw_batch_reloc(batch, batch_offset,
-			    target_bo, target_offset,
-			    read_domains, write_domain);
+   return batch->tail - batch->map;
 }
 
-struct perf_debug;
-int brw_batch_flush(struct brw_batch *batch, struct perf_debug *info);
-
-inline static void brw_batch_maybe_flush(struct brw_batch *batch)
+inline static void brw_batch_cacheline_evade(struct brw_batch *batch,
+                                             unsigned sz)
 {
-   if (unlikely(batch->always_flush))
-      brw_batch_flush(batch, NULL);
+#define CACHELINE 64
+   if (((uintptr_t)batch->tail & (CACHELINE - 1)) > (CACHELINE - sz)) {
+      int pad = CACHELINE - ((uintptr_t)batch->tail & (CACHELINE - 1));
+      memset(batch->tail, 0, pad);
+      batch->tail += pad / sizeof(*batch->tail);
+   }
+#undef CACHELINE
 }
 
-void intel_batchbuffer_save_state(struct brw_batch *batch);
-void intel_batchbuffer_reset_to_saved(struct brw_batch *batch);
-void intel_batchbuffer_require_space(struct brw_batch *batch, GLuint sz,
-                                     enum brw_gpu_ring ring);
+static inline uint32_t * __brw_batch_check(struct brw_batch *batch,
+                                           int count,
+                                           enum brw_gpu_ring ring)
+{
+   uint32_t *ptr;
 
-void brw_batch_start_hook(struct brw_batch *batch);
-void brw_batch_finish_hook(struct brw_batch *batch);
+   assert(batch->inside_begin_count);
+   assert(brw_batch_count(batch) + count < batch->state - batch->reserved);
+   assert(batch->ring == batch->actual_ring[ring]);
 
-#define USED_BATCH(batch) ((uintptr_t)((batch)->map_next - (batch)->map))
+   ptr = batch->tail;
+   batch->tail += count;
+   return ptr;
+}
 
-static inline unsigned
-intel_batchbuffer_space(struct brw_batch *batch)
+static inline void brw_batch_data(struct brw_batch *batch,
+                                  const void *data,
+                                  int bytes)
 {
-   return (batch->state_batch_offset - batch->reserved_space)
-      - USED_BATCH(batch)*4;
+   assert(batch->inside_begin_count);
+   assert(brw_batch_count(batch) + bytes/4 < batch->state - batch->reserved);
+   assert((bytes & 3) == 0);
+
+   memcpy(batch->tail, data, bytes);
+   batch->tail += bytes / sizeof(*batch->tail);
 }
 
-int __brw_batch_begin(struct brw_batch *batch,
-                      const int sz_bytes,
-                      enum brw_gpu_ring ring);
-#define brw_batch_begin(batch, sz, ring) ({                             \
-   int __ret = 0;                                                       \
-   if ((batch)->begin_count++ == 0) {                                   \
-      __ret = __brw_batch_begin((batch), (sz), (ring));                 \
-      if (__ret == 0)                                                   \
-         __ret = setjmp((batch)->jmpbuf);                               \
-   }                                                                    \
-   __ret; })
-void brw_batch_end(struct brw_batch *batch);
+static inline uint32_t float_as_int(float f)
+{
+   union {
+      float f;
+      uint32_t dw;
+   } fi;
+
+   fi.f = f;
+   return fi.dw;
+}
+
+#define BEGIN_BATCH(n) do { \
+   uint32_t *__map = __brw_batch_check(&brw->batch, n, RENDER_RING)
+#define BEGIN_BATCH_BLT(n) do { \
+   uint32_t *__map = __brw_batch_check(&brw->batch, n, BLT_RING)
+#define OUT_BATCH(dw) *__map++ = (dw)
+#define OUT_BATCH_F(f) *__map++ = float_as_int(f)
+#define OUT_RELOC(bo, read, write, delta) \
+   *__map = brw_batch_reloc(&brw->batch, \
+                            4*(__map - brw->batch.map), \
+                            bo, delta, read, write), __map++
+#define OUT_RELOC64(bo, read, write, delta) \
+   *(uint64_t *)__map = brw_batch_reloc(&brw->batch, \
+                                        4*(__map - brw->batch.map), \
+                                        bo, delta, read, write), __map += 2
+#define ADVANCE_BATCH() assert(__map == brw->batch.tail); } while(0)
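+
+/* Illustrative usage (a sketch, as in brw_emit_mi_flush() for the blitter):
+ *
+ *    BEGIN_BATCH_BLT(4);
+ *    OUT_BATCH(MI_FLUSH_DW);
+ *    OUT_BATCH(0);
+ *    OUT_BATCH(0);
+ *    OUT_BATCH(0);
+ *    ADVANCE_BATCH();
+ */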
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 7500c4872a..541c2885fe 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -419,9 +419,7 @@ intel_finish(struct gl_context * ctx)
    struct brw_context *brw = brw_context(ctx);
 
    intel_flush_front(ctx, PERF_DEBUG(brw, "Finish"));
-
-   if (brw->batch.last_bo)
-      drm_intel_bo_wait_rendering(brw->batch.last_bo);
+   brw_batch_wait(&brw->batch, PERF_DEBUG(brw, "Finish"));
 }
 
 static void
@@ -967,6 +965,18 @@ brwCreateContext(gl_api api,
    brw->driContext = driContextPriv;
    brw->screen = screen;
 
+   if (brw_batch_init(&brw->batch, screen)) {
+      fprintf(stderr, "%s: failed to alloc batch\n", __func__);
+      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
+      return false;
+   }
+
+   if (brw_init_pipe_control(brw, devinfo)) {
+      fprintf(stderr, "%s: failed to alloc workarounds\n", __func__);
+      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
+      return false;
+   }
+
    brw->gen = devinfo->gen;
    brw->gt = devinfo->gt;
    brw->is_g4x = devinfo->is_g4x;
@@ -1063,18 +1073,6 @@ brwCreateContext(gl_api api,
 
    intel_fbo_init(brw);
 
-   if (!intel_batchbuffer_init(&brw->batch,
-			       screen->bufmgr, brw->gen, brw->has_llc)) {
-      intelDestroyContext(driContextPriv);
-      return false;
-   }
-
-   if (brw_init_pipe_control(brw, devinfo)) {
-      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
-      intelDestroyContext(driContextPriv);
-      return false;
-   }
-
    brw_init_state(brw);
 
    intelInitExtensions(ctx);
@@ -1174,14 +1172,14 @@ intelDestroyContext(__DRIcontext * driContextPriv)
    if (ctx->swrast_context)
       _swrast_DestroyContext(&brw->ctx);
 
+   /* free the Mesa context */
+   _mesa_free_context_data(&brw->ctx);
+
    brw_fini_pipe_control(brw);
-   intel_batchbuffer_free(&brw->batch);
+   brw_batch_fini(&brw->batch);
 
    driDestroyOptionCache(&brw->optionCache);
 
-   /* free the Mesa context */
-   _mesa_free_context_data(&brw->ctx);
-
    ralloc_free(brw);
    driContextPriv->driverPrivate = NULL;
 }
@@ -1669,12 +1667,14 @@ intel_update_image_buffer(struct brw_context *intel,
    else
       last_mt = rb->singlesample_mt;
 
-   if (last_mt && last_mt->bo == buffer->bo)
+   if (last_mt && last_mt->bo->handle == buffer->bo->handle)
       return;
 
-   intel_update_winsys_renderbuffer_miptree(intel, rb, buffer->bo,
+   brw_bo *bo = brw_bo_import(&intel->batch, buffer->bo, true);
+   intel_update_winsys_renderbuffer_miptree(intel, rb, bo,
                                             buffer->width, buffer->height,
                                             buffer->pitch);
+   brw_bo_put(bo);
 
    if (_mesa_is_front_buffer_drawing(fb) &&
        buffer_type == __DRI_IMAGE_BUFFER_FRONT &&
@@ -1769,6 +1769,9 @@ void brw_batch_start_hook(brw_batch *batch)
     */
    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
       brw_collect_and_report_shader_time(brw);
+
+   if (brw->use_resource_streamer)
+      brw->batch.flags |= I915_EXEC_RESOURCE_STREAMER;
 }
 
 /**
@@ -1820,8 +1823,6 @@ void brw_batch_finish_hook(brw_batch *batch)
     */
    brw_emit_query_end(brw);
 
-   brw->cache.bo_used_by_gpu = true;
-
    brw->state_batch_count = 0;
 
    brw->ib.type = -1;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index b2000791eb..0152f0a482 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -50,7 +50,8 @@ extern "C" {
         #define virtual virt
 #endif
 
-#include <intel_bufmgr.h>
+#include <drm.h>
+#include <i915_drm.h>
 #ifdef __cplusplus
 	#undef virtual
 }
@@ -495,7 +496,6 @@ struct brw_cache {
    GLuint size, n_items;
 
    uint32_t next_offset;
-   bool bo_used_by_gpu;
 };
 
 
@@ -547,9 +547,6 @@ struct brw_query_object {
 
    /** Last index in bo with query data for this object. */
    int last_index;
-
-   /** True if we know the batch has been flushed since we ended the query. */
-   bool flushed;
 };
 
 #define MAX_GS_INPUT_VERTICES 6
@@ -1281,7 +1278,7 @@ bool brw_check_conditional_render(struct brw_context *brw);
 /*======================================================================
  * brw_state_dump.c
  */
-void brw_debug_batch(struct brw_context *brw);
+void brw_debug_batch(brw_batch *batch);
 
 /*======================================================================
  * intel_tex_validate.c
@@ -1409,12 +1406,6 @@ bool brw_render_target_supported(struct brw_context *brw,
                                  struct gl_renderbuffer *rb);
 uint32_t brw_depth_format(struct brw_context *brw, mesa_format format);
 
-/* intel_buffer_objects.c */
-int brw_bo_map(struct brw_context *brw, brw_bo *bo, int write_enable,
-               const char *bo_name);
-int brw_bo_map_gtt(struct brw_context *brw, brw_bo *bo,
-                   const char *bo_name);
-
 /* intel_extensions.c */
 extern void intelInitExtensions(struct gl_context *ctx);
 
@@ -1692,9 +1683,4 @@ void brw_query_internal_format(struct gl_context *ctx, GLenum target,
 }
 #endif
 
-/* Temporary include to hide some mechanical changes for brw-batch */
-#define INCLUDE_INTEL_BATCHBUFFER_H
-#include "intel_batchbuffer.h"
-#undef INCLUDE_INTEL_BATCHBUFFER_H
-
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index a8a8f9ef36..aab3bf141a 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -169,6 +169,14 @@ brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
 void
 brw_pipe_control_flush(struct brw_context *brw, unsigned flags)
 {
+   /* Nothing inside batch, rely on kernel flush before batch */
+   if (brw_batch_count(&brw->batch) == 0)
+      return;
+
+   /* Need to switch rings, again we can rely on the kernel flush in between */
+   if (brw->batch.ring != RENDER_RING)
+      return;
+
    if (brw_batch_begin(&brw->batch, 60, RENDER_RING) >= 0) {
       brw_emit_pipe_control_flush(brw, flags);
       brw_batch_end(&brw->batch);
@@ -355,7 +363,10 @@ brw_emit_post_sync_nonzero_flush(struct brw_context *brw)
 void
 brw_emit_mi_flush(struct brw_context *brw)
 {
-   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
+   if (brw_batch_count(&brw->batch) == 0)
+      return;
+
+   if (brw->batch.ring == BLT_RING) {
       BEGIN_BATCH_BLT(4);
       OUT_BATCH(MI_FLUSH_DW);
       OUT_BATCH(0);
@@ -385,6 +396,14 @@ brw_emit_mi_flush(struct brw_context *brw)
 void
 brw_mi_flush(struct brw_context *brw, enum brw_gpu_ring ring)
 {
+   /* Nothing inside batch, rely on kernel flush before batch */
+   if (brw_batch_count(&brw->batch) == 0)
+      return;
+
+   /* Need to switch rings, again we can rely on the kernel flush in between */
+   if (brw->batch.actual_ring[ring] != brw->batch.ring)
+      return;
+
    if (brw_batch_begin(&brw->batch, 60, ring) >= 0) {
       brw_emit_mi_flush(brw);
       brw_batch_end(&brw->batch);
@@ -406,7 +425,10 @@ brw_init_pipe_control(struct brw_context *brw,
     * it, we do not need to add our own reference count and can simply
     * rely on the bo always existing for the duration of the context.
     */
-   brw->workaround_bo = brw->screen->workaround_bo;
+   brw->workaround_bo =
+      brw_bo_import(&brw->batch, brw->screen->workaround_bo, true);
+   if (brw->workaround_bo == NULL)
+      return -ENOMEM;
 
    brw->pipe_controls_since_last_cs_stall = 0;
    return 0;
@@ -415,4 +437,5 @@ brw_init_pipe_control(struct brw_context *brw,
 void
 brw_fini_pipe_control(struct brw_context *brw)
 {
+   brw_bo_put(brw->workaround_bo);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 5941861fdf..ad6095165a 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -566,8 +566,7 @@ brw_collect_shader_time(struct brw_context *brw)
     * delaying reading the reports, but it doesn't look like it's a big
     * overhead compared to the cost of tracking the time in the first place.
     */
-   drm_intel_bo_map(brw->shader_time.bo, true);
-   void *bo_map = brw->shader_time.bo->virtual;
+   void *bo_map = brw_bo_map(brw->shader_time.bo, MAP_WRITE, NULL);
 
    for (int i = 0; i < brw->shader_time.num_entries; i++) {
       uint32_t *times = bo_map + i * 3 * SHADER_TIME_STRIDE;
@@ -580,7 +579,6 @@ brw_collect_shader_time(struct brw_context *brw)
    /* Zero the BO out to clear it out for our next collection.
     */
    memset(bo_map, 0, brw->shader_time.bo->size);
-   drm_intel_bo_unmap(brw->shader_time.bo);
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_program_cache.c b/src/mesa/drivers/dri/i965/brw_program_cache.c
index db372a2c65..e5b1d9719d 100644
--- a/src/mesa/drivers/dri/i965/brw_program_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_program_cache.c
@@ -173,26 +173,16 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
    brw_bo *new_bo;
 
    new_bo = brw_bo_create(&brw->batch, "program cache", new_size, 64, 0);
-   if (brw->has_llc)
-      drm_intel_gem_bo_map_unsynchronized(new_bo);
 
    /* Copy any existing data that needs to be saved. */
    if (cache->next_offset != 0) {
-      if (brw->has_llc) {
-         memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
-      } else {
-         drm_intel_bo_map(cache->bo, false);
-         drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
-                              cache->bo->virtual);
-         drm_intel_bo_unmap(cache->bo);
-      }
+      brw_bo_read(cache->bo, 0,
+                  brw_bo_map(new_bo, MAP_WRITE, NULL), cache->next_offset,
+                  MAP_ASYNC, NULL);
    }
 
-   if (brw->has_llc)
-      drm_intel_bo_unmap(cache->bo);
    brw_bo_put(cache->bo);
    cache->bo = new_bo;
-   cache->bo_used_by_gpu = false;
 
    /* Since we have a new BO in place, we need to signal the units
     * that depend on it (state base address on gen5+, or unit state before).
@@ -209,24 +199,18 @@ brw_lookup_prog(const struct brw_cache *cache,
                 enum brw_cache_id cache_id,
                 const void *data, unsigned data_size)
 {
-   const struct brw_context *brw = cache->brw;
    unsigned i;
    const struct brw_cache_item *item;
 
    for (i = 0; i < cache->size; i++) {
       for (item = cache->items[i]; item; item = item->next) {
-         int ret;
-
          if (item->cache_id != cache_id || item->size != data_size)
             continue;
 
-         if (!brw->has_llc)
-            drm_intel_bo_map(cache->bo, false);
-         ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
-         if (!brw->has_llc)
-            drm_intel_bo_unmap(cache->bo);
-         if (ret)
-            continue;
+         void *old =
+            brw_bo_map(cache->bo, MAP_READ | MAP_ASYNC, NULL) + item->offset;
+         if (memcmp(old, data, item->size))
+            continue;
 
          return item;
       }
@@ -239,7 +223,6 @@ static uint32_t
 brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
 {
    uint32_t offset;
-   struct brw_context *brw = cache->brw;
 
    /* Allocate space in the cache BO for our new program. */
    if (cache->next_offset + size > cache->bo->size) {
@@ -251,14 +234,6 @@ brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
       brw_cache_new_bo(cache, new_size);
    }
 
-   /* If we would block on writing to an in-use program BO, just
-    * recreate it.
-    */
-   if (!brw->has_llc && cache->bo_used_by_gpu) {
-      perf_debug("Copying busy program cache buffer.\n");
-      brw_cache_new_bo(cache, cache->bo->size);
-   }
-
    offset = cache->next_offset;
 
    /* Programs are always 64-byte aligned, so set up the next one now */
@@ -279,7 +254,6 @@ brw_upload_cache(struct brw_cache *cache,
                  uint32_t *out_offset,
                  void *out_aux)
 {
-   struct brw_context *brw = cache->brw;
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    const struct brw_cache_item *matching_data =
       brw_lookup_prog(cache, cache_id, data, data_size);
@@ -306,11 +280,7 @@ brw_upload_cache(struct brw_cache *cache,
       item->offset = brw_alloc_item_data(cache, data_size);
 
       /* Copy data to the buffer */
-      if (brw->has_llc) {
-         memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
-      } else {
-         drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
-      }
+      brw_bo_write(cache->bo, item->offset, data, data_size, MAP_ASYNC, NULL);
    }
 
    /* Set up the memory containing the key and aux_data */
@@ -347,8 +317,6 @@ brw_init_caches(struct brw_context *brw)
       calloc(cache->size, sizeof(struct brw_cache_item *));
 
    cache->bo = brw_bo_create(&brw->batch, "program cache", 4096, 64, 0);
-   if (brw->has_llc)
-      drm_intel_gem_bo_map_unsynchronized(cache->bo);
 }
 
 static void
@@ -379,6 +347,9 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 
    cache->n_items = 0;
 
+   brw_bo_put(cache->bo);
+   cache->bo = brw_bo_create(&brw->batch, "program cache", 4096, 64, 0);
+
    /* Start putting programs into the start of the BO again, since
     * we'll never find the old results.
     */
@@ -401,8 +372,6 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
    brw->gs.base.prog_data = NULL;
    brw->wm.base.prog_data = NULL;
    brw->cs.base.prog_data = NULL;
-
-   brw_batch_flush(&brw->batch, NULL);
 }
 
 void
@@ -425,11 +394,10 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 
    DBG("%s\n", __func__);
 
-   if (brw->has_llc)
-      drm_intel_bo_unmap(cache->bo);
+   brw_clear_cache(brw, cache);
    brw_bo_put(cache->bo);
    cache->bo = NULL;
-   brw_clear_cache(brw, cache);
+
    free(cache->items);
    cache->items = NULL;
    cache->size = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index 68decf84d7..34f55152ae 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -100,17 +100,7 @@ brw_queryobj_get_results(struct gl_context *ctx,
     * still contributing to it, flush it now so the results will be present
     * when mapped.
     */
-   if (drm_intel_bo_references(brw->batch.bo, query->bo))
-      brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "GetQuery"));
-
-   if (unlikely(brw->perf_debug)) {
-      if (drm_intel_bo_busy(query->bo)) {
-         perf_debug("Stalling on the GPU waiting for a query object.\n");
-      }
-   }
-
-   drm_intel_bo_map(query->bo, false);
-   results = query->bo->virtual;
+   results = brw_bo_map(query->bo, MAP_READ, PERF_DEBUG(brw, "GetQuery"));
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED_EXT:
       /* The query BO contains the starting and ending timestamps.
@@ -156,7 +146,6 @@ brw_queryobj_get_results(struct gl_context *ctx,
    default:
       unreachable("Unrecognized query target in brw_queryobj_get_results()");
    }
-   drm_intel_bo_unmap(query->bo);
 
    /* Now that we've processed the data stored in the query's buffer object,
     * we can release it.
@@ -370,10 +359,8 @@ static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q)
     *      not ready yet on the first time it is queried.  This ensures that
     *      the async query will return true in finite time.
     */
-   if (query->bo && drm_intel_bo_references(brw->batch.bo, query->bo))
-      brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "CheckQuery"));
-
-   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
+   if (!brw_bo_busy(query->bo, BUSY_READ | BUSY_FLUSH,
+                    PERF_DEBUG(brw, "CheckQuery"))) {
       brw_queryobj_get_results(ctx, query);
       query->Base.Ready = true;
    }
@@ -497,8 +484,6 @@ brw_query_counter(struct gl_context *ctx, struct gl_query_object *q)
 
    brw_write_timestamp(brw, query->bo, 0);
    brw_batch_end(&brw->batch);
-
-   query->flushed = false;
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_reset.c b/src/mesa/drivers/dri/i965/brw_reset.c
index 69b21d1cff..00dfa25434 100644
--- a/src/mesa/drivers/dri/i965/brw_reset.c
+++ b/src/mesa/drivers/dri/i965/brw_reset.c
@@ -52,7 +52,7 @@ brw_get_graphics_reset_status(struct gl_context *ctx)
    if (brw->reset_count != 0)
       return GL_NO_ERROR;
 
-   err = drm_intel_get_reset_stats(brw->batch.hw_ctx,
+   err = brw_batch_get_reset_stats(&brw->batch,
                                    &reset_count, &active, &pending);
    if (err)
       return GL_NO_ERROR;
@@ -85,8 +85,8 @@ brw_check_for_reset(struct brw_context *brw)
    uint32_t pending;
    int err;
 
-   err = drm_intel_get_reset_stats(brw->batch.hw_ctx, &reset_count, &active,
-                                   &pending);
+   err = brw_batch_get_reset_stats(&brw->batch,
+                                   &reset_count, &active, &pending);
    if (err)
       return;
 
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 32946ecca7..bd3d9a5f8e 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -244,7 +244,7 @@ void brw_destroy_caches( struct brw_context *brw );
  * brw_state_batch.c
  */
 #define BRW_BATCH_STRUCT(brw, s) \
-   intel_batchbuffer_data(brw, (s), sizeof(*(s)), RENDER_RING)
+   brw_batch_data(&brw->batch, (s), sizeof(*(s)))
 
 void *__brw_state_batch(struct brw_context *brw,
                         enum aub_state_struct_type type,
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index e221eed03b..3444d73748 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -79,27 +79,13 @@ __brw_state_batch(struct brw_context *brw,
                   uint32_t *out_offset)
 
 {
-   brw_batch *batch = &brw->batch;
-   uint32_t offset;
-
-   assert(size < batch->bo->size);
-   offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
-
-   /* If allocating from the top would wrap below the batchbuffer, or
-    * if the batch's used space (plus the reserved pad) collides with our
-    * space, then flush and try again.
-    */
-   if (batch->state_batch_offset < size ||
-       offset < 4 * USED_BATCH(batch) + batch->reserved_space) {
-      brw_batch_flush(batch, NULL);
-      offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
-   }
-
-   batch->state_batch_offset = offset;
+   assert(size < brw->batch.bo->size);
+   brw->batch.state = ROUND_DOWN_TO(4*brw->batch.state - size, alignment)/4;
+   assert(brw->batch.state > brw->batch.tail - brw->batch.map);
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
-      brw_track_state_batch(brw, type, offset, size, index);
+      brw_track_state_batch(brw, type, 4*brw->batch.state, size, index);
 
-   *out_offset = offset;
-   return batch->map + (offset>>2);
+   *out_offset = 4*brw->batch.state;
+   return brw->batch.map + brw->batch.state;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index d8e37f9bff..df0b6cea59 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -70,7 +70,7 @@ static const char *surface_tiling[] = {
 
 static void *batch_in(struct brw_context *brw, unsigned offset)
 {
-   return (void *)brw->batch.bo->virtual + offset;
+   return (void *)brw->batch.map + offset;
 }
 
 static void
@@ -726,8 +726,6 @@ dump_prog_cache(struct brw_context *brw)
    struct brw_cache *cache = &brw->cache;
    unsigned int b;
 
-   drm_intel_bo_map(brw->cache.bo, false);
-
    for (b = 0; b < cache->size; b++) {
       struct brw_cache_item *item;
 
@@ -768,12 +766,11 @@ dump_prog_cache(struct brw_context *brw)
 	 }
 
          fprintf(stderr, "%s:\n", name);
-         brw_disassemble(&brw->screen->devinfo, brw->cache.bo->virtual,
+         brw_disassemble(&brw->screen->devinfo,
+                         brw_bo_map(brw->cache.bo, MAP_READ | MAP_ASYNC, NULL),
                          item->offset, item->size, stderr);
       }
    }
-
-   drm_intel_bo_unmap(brw->cache.bo);
 }
 
 static void
@@ -876,12 +873,11 @@ dump_state_batch(struct brw_context *brw)
  * The buffer offsets printed rely on the buffer containing the last offset
  * it was validated at.
  */
-void brw_debug_batch(struct brw_context *brw)
+void brw_debug_batch(struct brw_batch *batch)
 {
-   drm_intel_bo_map(brw->batch.bo, false);
-   dump_state_batch(brw);
-   drm_intel_bo_unmap(brw->batch.bo);
+   struct brw_context *brw = container_of(batch, brw, batch);
 
+   dump_state_batch(brw);
    if (0)
       dump_prog_cache(brw);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_sync.c b/src/mesa/drivers/dri/i965/brw_sync.c
index 13868b8c69..988b1bc38b 100644
--- a/src/mesa/drivers/dri/i965/brw_sync.c
+++ b/src/mesa/drivers/dri/i965/brw_sync.c
@@ -78,7 +78,7 @@ brw_fence_insert(struct brw_context *brw, struct brw_fence *fence)
    assert(!fence->batch_bo);
    assert(!fence->signalled);
 
-   brw_mi_flush(brw, RENDER_RING);
+   brw_mi_flush(brw, brw->batch.ring);
    fence->batch_bo = brw_bo_get(brw->batch.bo);
    brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "SyncFence"));
 }
@@ -89,7 +89,7 @@ brw_fence_has_completed_locked(struct brw_fence *fence)
    if (fence->signalled)
       return true;
 
-   if (fence->batch_bo && !drm_intel_bo_busy(fence->batch_bo)) {
+   if (fence->batch_bo && !brw_bo_busy(fence->batch_bo, BUSY_WRITE | BUSY_FLUSH, NULL)) {
       brw_bo_put(fence->batch_bo);
       fence->batch_bo = NULL;
       fence->signalled = true;
@@ -128,7 +128,7 @@ brw_fence_client_wait_locked(struct brw_context *brw, struct brw_fence *fence,
    if (timeout > INT64_MAX)
       timeout = INT64_MAX;
 
-   if (drm_intel_gem_bo_wait(fence->batch_bo, timeout) != 0)
+   if (drm_intel_gem_bo_wait(fence->batch_bo->base, timeout) != 0)
       return false;
 
    fence->signalled = true;
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index b381fb1c8f..6d2a39e438 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -249,12 +249,6 @@ void brw_upload_urb_fence(struct brw_context *brw)
    uf.bits1.cs_fence  = brw->urb.size;
 
    /* erratum: URB_FENCE must not cross a 64byte cacheline */
-   if ((USED_BATCH(&brw->batch) & 15) > 12) {
-      int pad = 16 - (USED_BATCH(&brw->batch) & 15);
-      do
-         *brw->batch.map_next++ = MI_NOOP;
-      while (--pad);
-   }
-
+   brw_batch_cacheline_evade(&brw->batch, sizeof(uf));
    BRW_BATCH_STRUCT(brw, &uf);
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index a499c87df6..508398c8a2 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -172,8 +172,8 @@ gen6_queryobj_get_results(struct gl_context *ctx,
    if (query->bo == NULL)
       return;
 
-   brw_bo_map(brw, query->bo, false, "query object");
-   uint64_t *results = query->bo->virtual;
+   uint64_t *results =
+      brw_bo_map(query->bo, MAP_READ, PERF_DEBUG(brw, "GetQuery"));
    switch (query->Base.Target) {
    case GL_TIME_ELAPSED:
       /* The query BO contains the starting and ending timestamps.
@@ -252,7 +252,6 @@ gen6_queryobj_get_results(struct gl_context *ctx,
    default:
       unreachable("Unrecognized query target in brw_queryobj_get_results()");
    }
-   drm_intel_bo_unmap(query->bo);
 
    /* Now that we've processed the data stored in the query's buffer object,
     * we can release it.
@@ -411,7 +410,6 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
    /* The current batch contains the commands to handle EndQuery(),
     * but they won't actually execute until it is flushed.
     */
-   query->flushed = false;
 
    /* For ARB_query_buffer_object: The result is now available */
    set_query_availability(brw, query, true);
@@ -420,22 +418,6 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
 }
 
 /**
- * Flush the batch if it still references the query object BO.
- */
-static void
-flush_batch_if_needed(struct brw_context *brw, struct brw_query_object *query)
-{
-   /* If the batch doesn't reference the BO, it must have been flushed
-    * (for example, due to being full).  Record that it's been flushed.
-    */
-   query->flushed = query->flushed ||
-      !drm_intel_bo_references(brw->batch.bo, query->bo);
-
-   if (!query->flushed)
-      brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "GetQuery"));
-}
-
-/**
  * The WaitQuery() driver hook.
  *
  * Wait for a query result to become available and return it.  This is the
@@ -443,15 +425,12 @@ flush_batch_if_needed(struct brw_context *brw, struct brw_query_object *query)
  */
 static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
 {
-   struct brw_context *brw = brw_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
    /* If the application has requested the query result, but this batch is
     * still contributing to it, flush it now to finish that work so the
     * result will become available (eventually).
     */
-   flush_batch_if_needed(brw, query);
-
    gen6_queryobj_get_results(ctx, query);
 }
 
@@ -463,7 +442,6 @@ static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
  */
 static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
 {
-   struct brw_context *brw = brw_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
 
    /* If query->bo is NULL, we've already gathered the results - this is a
@@ -479,9 +457,8 @@ static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
     *      not ready yet on the first time it is queried.  This ensures that
     *      the async query will return true in finite time.
     */
-   flush_batch_if_needed(brw, query);
-
-   if (!drm_intel_bo_busy(query->bo)) {
+   if (!brw_bo_busy(query->bo, BUSY_READ | BUSY_FLUSH,
+                    PERF_DEBUG(brw_context(ctx), "CheckQuery"))) {
       gen6_queryobj_get_results(ctx, query);
    }
 }
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index 0df5c8005d..7e00d9fd0f 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -361,14 +361,7 @@ gen7_tally_prims_generated(struct brw_context *brw,
    /* If the current batch is still contributing to the number of primitives
     * generated, flush it now so the results will be present when mapped.
     */
-   if (drm_intel_bo_references(brw->batch.bo, obj->prim_count_bo))
-      brw_batch_flush(&brw->batch, perf);
-
-   if (unlikely(brw->perf_debug && drm_intel_bo_busy(obj->prim_count_bo)))
-      perf_debug("Stalling for # of transform feedback primitives written.\n");
-
-   drm_intel_bo_map(obj->prim_count_bo, false);
-   uint64_t *prim_counts = obj->prim_count_bo->virtual;
+   uint64_t *prim_counts = brw_bo_map(obj->prim_count_bo, MAP_READ, perf);
 
    assert(obj->prim_count_buffer_index % (2 * BRW_MAX_XFB_STREAMS) == 0);
    int pairs = obj->prim_count_buffer_index / (2 * BRW_MAX_XFB_STREAMS);
@@ -381,8 +374,6 @@ gen7_tally_prims_generated(struct brw_context *brw,
       prim_counts += 2 * BRW_MAX_XFB_STREAMS; /* move to the next pair */
    }
 
-   drm_intel_bo_unmap(obj->prim_count_bo);
-
    /* We've already gathered up the old data; we can safely overwrite it now. */
    obj->prim_count_buffer_index = 0;
 }
@@ -498,7 +489,7 @@ gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
       brw_obj->zero_offsets = true;
    } else if (!brw->has_pipelined_so) {
       brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "BeginTransformFeedback"));
-      brw->batch.flags |= BATCH_HAS_SOL;
+      brw->batch.flags |= I915_EXEC_GEN7_SOL_RESET;
    }
 
    /* We're about to lose the information needed to compute the number of
diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
index 58a891f383..6f40c44e3d 100644
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -37,12 +37,7 @@ blorp_emit_dwords(struct blorp_batch *batch, unsigned n)
 {
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
-
-   intel_batchbuffer_begin(brw, n, RENDER_RING);
-   uint32_t *map = brw->batch.map_next;
-   brw->batch.map_next += n;
-   intel_batchbuffer_advance(brw);
-   return map;
+   return __brw_batch_check(&brw->batch, n, RENDER_RING);
 }
 
 static uint64_t
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
deleted file mode 100644
index 45d9907e26..0000000000
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ /dev/null
@@ -1,480 +0,0 @@
-/*
- * Copyright 2006 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#define INCLUDE_INTEL_BATCHBUFFER_H
-#include "intel_batchbuffer.h"
-#include "intel_buffer_objects.h"
-#include "intel_bufmgr.h"
-#include "intel_buffers.h"
-#include "intel_fbo.h"
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_state.h"
-
-#include <xf86drm.h>
-#include <i915_drm.h>
-
-static void
-intel_batchbuffer_reset(struct brw_batch *batch, bool has_llc);
-
-int
-intel_batchbuffer_init(struct brw_batch *batch, dri_bufmgr *bufmgr,
-                       int gen, bool has_llc)
-{
-   batch->bufmgr = bufmgr;
-   batch->gen = gen;
-
-   intel_batchbuffer_reset(batch, has_llc);
-
-   if (!has_llc) {
-      batch->cpu_map = malloc(BATCH_SZ);
-      batch->map = batch->cpu_map;
-      batch->map_next = batch->cpu_map;
-   }
-
-   if (gen >= 6) {
-      /* Create a new hardware context.  Using a hardware context means that
-       * our GPU state will be saved/restored on context switch, allowing us
-       * to assume that the GPU is in the same state we left it in.
-       *
-       * This is required for transform feedback buffer offsets, query objects,
-       * and also allows us to reduce how much state we have to emit.
-       */
-      batch->hw_ctx = drm_intel_gem_context_create(bufmgr);
-
-      if (!batch->hw_ctx) {
-         fprintf(stderr, "Gen6+ requires Kernel 3.6 or later.\n");
-         return false;
-      }
-   }
-
-   return true;
-}
-
-static void
-intel_batchbuffer_reset(struct brw_batch *batch, bool has_llc)
-{
-   brw_bo_put(batch->last_bo);
-   batch->last_bo = batch->bo;
-
-   brw_batch_clear_dirty(batch);
-
-   batch->bo = brw_bo_create(batch, "batchbuffer", BATCH_SZ, 4096, 0);
-   if (has_llc) {
-      drm_intel_bo_map(batch->bo, true);
-      batch->map = batch->bo->virtual;
-   }
-   batch->map_next = batch->map;
-
-   batch->reserved_space = BATCH_RESERVED;
-   batch->state_batch_offset = batch->bo->size;
-   batch->flags = 0;
-
-   /* We don't know what ring the new batch will be sent to until we see the
-    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
-    */
-   batch->ring = UNKNOWN_RING;
-}
-
-void
-intel_batchbuffer_save_state(struct brw_batch *batch)
-{
-   batch->saved.map_next = batch->map_next;
-   batch->saved.reloc_count = drm_intel_gem_bo_get_reloc_count(batch->bo);
-}
-
-void
-intel_batchbuffer_reset_to_saved(struct brw_batch *batch)
-{
-   drm_intel_gem_bo_clear_relocs(batch->bo, batch->saved.reloc_count);
-
-   batch->map_next = batch->saved.map_next;
-   if (USED_BATCH(batch) == 0)
-      batch->ring = UNKNOWN_RING;
-}
-
-void
-intel_batchbuffer_free(struct brw_batch *batch)
-{
-   free(batch->cpu_map);
-   brw_bo_put(batch->last_bo);
-   brw_bo_put(batch->bo);
-
-   brw_bo_put(batch->throttle_batch[1]);
-   brw_bo_put(batch->throttle_batch[0]);
-
-   drm_intel_gem_context_destroy(batch->hw_ctx);
-}
-
-void
-intel_batchbuffer_require_space(struct brw_batch *batch, GLuint sz,
-                                enum brw_gpu_ring ring)
-{
-   /* If we're switching rings, implicitly flush the batch. */
-   if (unlikely(ring != batch->ring) && batch->ring != UNKNOWN_RING &&
-       batch->gen >= 6) {
-      brw_batch_flush(batch, NULL);
-   }
-
-#ifdef DEBUG
-   assert(sz < BATCH_SZ - BATCH_RESERVED);
-#endif
-   if (intel_batchbuffer_space(batch) < sz)
-      brw_batch_flush(batch, NULL);
-
-   enum brw_gpu_ring prev_ring = batch->ring;
-   /* The intel_batchbuffer_flush() calls above might have changed
-    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
-    */
-   batch->ring = ring;
-
-   if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
-      brw_batch_start_hook(batch);
-}
-
-static void
-do_batch_dump(struct brw_context *brw)
-{
-   struct drm_intel_decode *decode;
-   brw_batch *batch = &brw->batch;
-   int ret;
-
-   decode = drm_intel_decode_context_alloc(brw->screen->deviceID);
-   if (!decode)
-      return;
-
-   ret = drm_intel_bo_map(batch->bo, false);
-   if (ret == 0) {
-      drm_intel_decode_set_batch_pointer(decode,
-					 batch->bo->virtual,
-					 batch->bo->offset64,
-                                         USED_BATCH(batch));
-   } else {
-      fprintf(stderr,
-	      "WARNING: failed to map batchbuffer (%s), "
-	      "dumping uploaded data instead.\n", strerror(ret));
-
-      drm_intel_decode_set_batch_pointer(decode,
-					 batch->map,
-					 batch->bo->offset64,
-                                         USED_BATCH(batch));
-   }
-
-   drm_intel_decode_set_output_file(decode, stderr);
-   drm_intel_decode(decode);
-
-   drm_intel_decode_context_free(decode);
-
-   if (ret == 0) {
-      drm_intel_bo_unmap(batch->bo);
-
-      brw_debug_batch(brw);
-   }
-}
-
-/**
- * Called when starting a new batch buffer.
- */
-static void
-brw_new_batch(struct brw_context *brw)
-{
-   /* Create a new batchbuffer and reset the associated state: */
-   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
-   intel_batchbuffer_reset(&brw->batch, brw->has_llc);
-}
-
-static void
-throttle(struct brw_context *brw)
-{
-   /* Wait for the swapbuffers before the one we just emitted, so we
-    * don't get too many swaps outstanding for apps that are GPU-heavy
-    * but not CPU-heavy.
-    *
-    * We're using intelDRI2Flush (called from the loader before
-    * swapbuffer) and glFlush (for front buffer rendering) as the
-    * indicator that a frame is done and then throttle when we get
-    * here as we prepare to render the next frame.  At this point for
-    * round trips for swap/copy and getting new buffers are done and
-    * we'll spend less time waiting on the GPU.
-    *
-    * Unfortunately, we don't have a handle to the batch containing
-    * the swap, and getting our hands on that doesn't seem worth it,
-    * so we just use the first batch we emitted after the last swap.
-    */
-   if (brw->batch.need_swap_throttle && brw->batch.throttle_batch[0]) {
-      if (brw->batch.throttle_batch[1]) {
-         if (!brw->batch.disable_throttling)
-            drm_intel_bo_wait_rendering(brw->batch.throttle_batch[1]);
-         brw_bo_put(brw->batch.throttle_batch[1]);
-      }
-      brw->batch.throttle_batch[1] = brw->batch.throttle_batch[0];
-      brw->batch.throttle_batch[0] = NULL;
-      brw->batch.need_swap_throttle = false;
-      /* Throttling here is more precise than the throttle ioctl, so skip it */
-      brw->batch.need_flush_throttle = false;
-   }
-
-   if (brw->batch.need_flush_throttle) {
-      __DRIscreen *dri_screen = brw->screen->driScrnPriv;
-      drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
-      brw->batch.need_flush_throttle = false;
-   }
-}
-
-/* Drop when RS headers get pulled to libdrm */
-#ifndef I915_EXEC_RESOURCE_STREAMER
-#define I915_EXEC_RESOURCE_STREAMER (1<<15)
-#endif
-
-/* TODO: Push this whole function into bufmgr.
- */
-static int
-do_flush_locked(struct brw_context *brw)
-{
-   brw_batch *batch = &brw->batch;
-   int ret = 0;
-
-   if (brw->has_llc) {
-      drm_intel_bo_unmap(batch->bo);
-   } else {
-      ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(batch), batch->map);
-      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
-	 ret = drm_intel_bo_subdata(batch->bo,
-				    batch->state_batch_offset,
-				    batch->bo->size - batch->state_batch_offset,
-				    (char *)batch->map + batch->state_batch_offset);
-      }
-   }
-
-   if (!brw->screen->no_hw) {
-      int flags;
-
-      if (brw->gen >= 6 && batch->ring == BLT_RING) {
-         flags = I915_EXEC_BLT;
-      } else {
-         flags = I915_EXEC_RENDER |
-            (brw->use_resource_streamer ? I915_EXEC_RESOURCE_STREAMER : 0);
-      }
-      if (batch->flags & BATCH_HAS_SOL)
-	 flags |= I915_EXEC_GEN7_SOL_RESET;
-
-      if (ret == 0) {
-         if (batch->hw_ctx == NULL || batch->ring != RENDER_RING) {
-            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(batch),
-                                        NULL, 0, 0, flags);
-         } else {
-            ret = drm_intel_gem_bo_context_exec(batch->bo, batch->hw_ctx,
-                                                4 * USED_BATCH(batch), flags);
-         }
-      }
-
-      throttle(brw);
-   }
-
-   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
-      do_batch_dump(brw);
-
-   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
-      brw_check_for_reset(brw);
-
-   if (ret != 0) {
-      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
-      exit(1);
-   }
-
-   return ret;
-}
-
-int
-brw_batch_flush(struct brw_batch *batch, struct perf_debug *info)
-{
-   struct brw_context *brw = container_of(batch, brw, batch);
-   int ret;
-
-   if (USED_BATCH(batch) == 0)
-      return 0;
-
-   if (brw->batch.throttle_batch[0] == NULL)
-      brw->batch.throttle_batch[0] = brw_bo_get(brw->batch.bo);
-
-   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
-      int bytes_for_commands = 4 * USED_BATCH(batch);
-      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
-      int total_bytes = bytes_for_commands + bytes_for_state;
-      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
-              "%4db (state) = %4db (%0.1f%%)\n",
-              info ? info->file : "???", info ? info->line : -1,
-              bytes_for_commands, bytes_for_state,
-              total_bytes,
-              100.0f * total_bytes / BATCH_SZ);
-   }
-
-   if (unlikely(info))
-      brw_batch_report_flush_hook(batch, info);
-
-   brw->batch.reserved_space = 0;
-
-   brw->batch.begin_count++;
-   brw_batch_finish_hook(&brw->batch);
-   brw->batch.begin_count--;
-
-   /* Mark the end of the buffer. */
-   intel_batchbuffer_emit_dword(&brw->batch, MI_BATCH_BUFFER_END);
-   if (USED_BATCH(&brw->batch) & 1) {
-      /* Round batchbuffer usage to 2 DWORDs. */
-      intel_batchbuffer_emit_dword(&brw->batch, MI_NOOP);
-   }
-
-   intel_upload_finish(brw);
-
-   /* Check that we didn't just wrap our batchbuffer at a bad time. */
-   assert(!brw->batch.no_batch_wrap);
-
-   ret = do_flush_locked(brw);
-
-   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
-      fprintf(stderr, "waiting for idle\n");
-      drm_intel_bo_wait_rendering(brw->batch.bo);
-   }
-
-   /* Start a new batch buffer. */
-   brw_new_batch(brw);
-
-   return ret;
-}
-
-
-/*  This is the only way buffers get added to the validate list.
- */
-uint32_t
-intel_batchbuffer_reloc(struct brw_batch *batch,
-                        brw_bo *buffer, uint32_t offset,
-                        uint32_t read_domains, uint32_t write_domain,
-                        uint32_t delta)
-{
-   int ret;
-
-   ret = drm_intel_bo_emit_reloc(batch->bo, offset,
-				 buffer, delta,
-				 read_domains, write_domain);
-   assert(ret == 0);
-   (void)ret;
-
-   /* Using the old buffer offset, write in what the right data would be, in
-    * case the buffer doesn't move and we can short-circuit the relocation
-    * processing in the kernel
-    */
-   return buffer->offset64 + delta;
-}
-
-uint64_t
-intel_batchbuffer_reloc64(struct brw_batch *batch,
-                          brw_bo *buffer, uint32_t offset,
-                          uint32_t read_domains, uint32_t write_domain,
-                          uint32_t delta)
-{
-   int ret = drm_intel_bo_emit_reloc(batch->bo, offset,
-                                     buffer, delta,
-                                     read_domains, write_domain);
-   assert(ret == 0);
-   (void) ret;
-
-   /* Using the old buffer offset, write in what the right data would be, in
-    * case the buffer doesn't move and we can short-circuit the relocation
-    * processing in the kernel
-    */
-   return buffer->offset64 + delta;
-}
-
-
-void
-intel_batchbuffer_data(struct brw_context *brw,
-                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
-{
-   assert((bytes & 3) == 0);
-   intel_batchbuffer_require_space(&brw->batch, bytes, ring);
-   memcpy(brw->batch.map_next, data, bytes);
-   brw->batch.map_next += bytes >> 2;
-}
-
-int __brw_batch_begin(struct brw_batch *batch,
-                      const int sz_bytes,
-                      enum brw_gpu_ring ring)
-{
-   assert(batch->begin_count == 1);
-   assert(!batch->repeat);
-
-   intel_batchbuffer_require_space(batch, sz_bytes, ring);
-   intel_batchbuffer_save_state(batch);
-
-   assert(!batch->no_batch_wrap);
-   batch->no_batch_wrap = true;
-
-   return 0;
-}
-
-void brw_batch_end(struct brw_batch *batch)
-{
-   assert(batch->begin_count);
-   if (--batch->begin_count)
-      return;
-
-   assert(batch->no_batch_wrap);
-   batch->no_batch_wrap = false;
-
-   if (dri_bufmgr_check_aperture_space(&batch->bo, 1)) {
-      if (!batch->repeat) {
-         enum brw_gpu_ring ring = batch->ring;
-
-         intel_batchbuffer_reset_to_saved(batch);
-         brw_batch_flush(batch, NULL);
-
-         batch->begin_count++;
-         batch->no_batch_wrap = true;
-
-         batch->ring = ring;
-         if (ring == RENDER_RING)
-            brw_batch_start_hook(batch);
-
-         batch->repeat = true;
-         longjmp(batch->jmpbuf, 1);
-      }
-
-      if (unlikely(brw_batch_flush(batch, NULL) == -ENOSPC)) {
-         static GLuint msg_id;
-         if (!msg_id) {
-            struct brw_context *brw = container_of(batch, brw, batch);
-            fprintf(stderr, "WARNING: Aperture space exceeded!\n");
-            _mesa_gl_debug(&brw->ctx, &msg_id,
-                           MESA_DEBUG_SOURCE_API,
-                           MESA_DEBUG_TYPE_OTHER,
-                           MESA_DEBUG_SEVERITY_HIGH,
-                           "Aperture space exceeded!\n");
-         }
-      }
-   }
-
-   batch->repeat = false;
-}
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
deleted file mode 100644
index fb03c474a6..0000000000
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ /dev/null
@@ -1,148 +0,0 @@
-#ifndef INCLUDE_INTEL_BATCHBUFFER_H
-#error "Do not include intel_batchbuffer.h"
-#endif
-
-#ifndef INTEL_BATCHBUFFER_H
-#define INTEL_BATCHBUFFER_H
-
-#include "main/mtypes.h"
-
-#include "brw_context.h"
-#include "intel_bufmgr.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct brw_batch;
-struct brw_context;
-enum brw_gpu_ring;
-
-int intel_batchbuffer_init(struct brw_batch *batch, dri_bufmgr *bufmgr,
-                            int gen, bool has_llc);
-void intel_batchbuffer_free(struct brw_batch *brw);
-
-/* Unlike bmBufferData, this currently requires the buffer be mapped.
- * Consider it a convenience function wrapping multple
- * intel_buffer_dword() calls.
- */
-void intel_batchbuffer_data(struct brw_context *brw,
-                            const void *data, GLuint bytes,
-                            enum brw_gpu_ring ring);
-
-uint32_t intel_batchbuffer_reloc(struct brw_batch *batch,
-                                 brw_bo *buffer,
-                                 uint32_t offset,
-                                 uint32_t read_domains,
-                                 uint32_t write_domain,
-                                 uint32_t delta);
-uint64_t intel_batchbuffer_reloc64(struct brw_batch *batch,
-                                   brw_bo *buffer,
-                                   uint32_t offset,
-                                   uint32_t read_domains,
-                                   uint32_t write_domain,
-                                   uint32_t delta);
-
-static inline uint32_t float_as_int(float f)
-{
-   union {
-      float f;
-      uint32_t d;
-   } fi;
-
-   fi.f = f;
-   return fi.d;
-}
-
-/* Inline functions - might actually be better off with these
- * non-inlined.  Certainly better off switching all command packets to
- * be passed as structs rather than dwords, but that's a little bit of
- * work...
- */
-static inline void
-intel_batchbuffer_emit_dword(struct brw_batch *batch, GLuint dword)
-{
-#ifdef DEBUG
-   assert(intel_batchbuffer_space(batch) >= 4);
-#endif
-   *batch->map_next++ = dword;
-   assert(batch->ring != UNKNOWN_RING);
-}
-
-static inline void
-intel_batchbuffer_emit_float(struct brw_batch *batch, float f)
-{
-   intel_batchbuffer_emit_dword(batch, float_as_int(f));
-}
-
-static inline void
-intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
-{
-   intel_batchbuffer_require_space(&brw->batch, n * 4, ring);
-
-#ifdef DEBUG
-   brw->batch.emit = USED_BATCH(&brw->batch);
-   brw->batch.total = n;
-#endif
-}
-
-static inline void
-intel_batchbuffer_advance(struct brw_context *brw)
-{
-#ifdef DEBUG
-   brw_batch *batch = &brw->batch;
-   unsigned int _n = USED_BATCH(batch) - batch->emit;
-   assert(batch->total != 0);
-   if (_n != batch->total) {
-      fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",
-	      _n, batch->total);
-      abort();
-   }
-   batch->total = 0;
-#else
-   (void) brw;
-#endif
-}
-
-#define BEGIN_BATCH(n) do {                            \
-   intel_batchbuffer_begin(brw, (n), RENDER_RING);     \
-   uint32_t *__map = brw->batch.map_next;              \
-   brw->batch.map_next += (n)
-
-#define BEGIN_BATCH_BLT(n) do {                        \
-   intel_batchbuffer_begin(brw, (n), BLT_RING);        \
-   uint32_t *__map = brw->batch.map_next;              \
-   brw->batch.map_next += (n)
-
-#define OUT_BATCH(d) *__map++ = (d)
-#define OUT_BATCH_F(f) OUT_BATCH(float_as_int((f)))
-
-#define OUT_RELOC(buf, read_domains, write_domain, delta) do {    \
-   uint32_t __offset = (__map - brw->batch.map) * 4;              \
-   OUT_BATCH(intel_batchbuffer_reloc(&brw->batch, (buf), __offset, \
-                                     (read_domains),              \
-                                     (write_domain),              \
-                                     (delta)));                   \
-} while (0)
-
-/* Handle 48-bit address relocations for Gen8+ */
-#define OUT_RELOC64(buf, read_domains, write_domain, delta) do {      \
-   uint32_t __offset = (__map - brw->batch.map) * 4;                  \
-   uint64_t reloc64 = intel_batchbuffer_reloc64(&brw->batch, (buf), __offset, \
-                                                (read_domains),       \
-                                                (write_domain),       \
-                                                (delta));             \
-   OUT_BATCH(reloc64);                                                \
-   OUT_BATCH(reloc64 >> 32);                                          \
-} while (0)
-
-#define ADVANCE_BATCH()                  \
-   assert(__map == brw->batch.map_next); \
-   intel_batchbuffer_advance(brw);       \
-} while (0)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index 1c42321fae..9e961b03a2 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -828,7 +828,7 @@ intelEmitImmediateColorExpandBlit(struct brw_context *brw,
    OUT_BATCH(SET_FIELD(y + h, BLT_Y) | SET_FIELD(x + w, BLT_X));
    ADVANCE_BATCH();
 
-   intel_batchbuffer_data(brw, src_bits, dwords * 4, BLT_RING);
+   brw_batch_data(&brw->batch, src_bits, dwords * 4);
 
    brw_emit_mi_flush(brw);
 
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
index 137297c087..cc98170bb5 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
@@ -38,46 +38,6 @@
 #include "intel_blit.h"
 #include "intel_buffer_objects.h"
 
-/**
- * Map a buffer object; issue performance warnings if mapping causes stalls.
- *
- * This matches the drm_intel_bo_map API, but takes an additional human-readable
- * name for the buffer object to use in the performance debug message.
- */
-int
-brw_bo_map(struct brw_context *brw,
-           brw_bo *bo, int write_enable,
-           const char *bo_name)
-{
-   if (likely(!brw->perf_debug) || !drm_intel_bo_busy(bo))
-      return drm_intel_bo_map(bo, write_enable);
-
-   double start_time = get_time();
-
-   int ret = drm_intel_bo_map(bo, write_enable);
-
-   perf_debug("CPU mapping a busy %s BO stalled and took %.03f ms.\n",
-              bo_name, (get_time() - start_time) * 1000);
-
-   return ret;
-}
-
-int
-brw_bo_map_gtt(struct brw_context *brw, brw_bo *bo, const char *bo_name)
-{
-   if (likely(!brw->perf_debug) || !drm_intel_bo_busy(bo))
-      return drm_intel_gem_bo_map_gtt(bo);
-
-   double start_time = get_time();
-
-   int ret = drm_intel_gem_bo_map_gtt(bo);
-
-   perf_debug("GTT mapping a busy %s BO stalled and took %.03f ms.\n",
-              bo_name, (get_time() - start_time) * 1000);
-
-   return ret;
-}
-
 static void
 mark_buffer_gpu_usage(struct intel_buffer_object *intel_obj,
                                uint32_t offset, uint32_t size)
@@ -89,6 +49,9 @@ mark_buffer_gpu_usage(struct intel_buffer_object *intel_obj,
 static void
 mark_buffer_inactive(struct intel_buffer_object *intel_obj)
 {
+   if (brw_bo_busy(intel_obj->buffer, BUSY_WRITE, NULL))
+      return;
+
    intel_obj->gpu_active_start = ~0;
    intel_obj->gpu_active_end = 0;
 }
@@ -212,7 +175,7 @@ brw_buffer_data(struct gl_context *ctx,
          return false;
 
       if (data != NULL)
-	 drm_intel_bo_subdata(intel_obj->buffer, 0, size, data);
+         brw_bo_write(intel_obj->buffer, 0, data, size, 0, NULL);
    }
 
    return true;
@@ -237,7 +200,6 @@ brw_buffer_subdata(struct gl_context *ctx,
 {
    struct brw_context *brw = brw_context(ctx);
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
-   bool busy;
 
    if (size == 0)
       return;
@@ -255,28 +217,17 @@ brw_buffer_subdata(struct gl_context *ctx,
     */
    if (offset + size <= intel_obj->gpu_active_start ||
        intel_obj->gpu_active_end <= offset) {
-      if (brw->has_llc) {
-         drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
-         memcpy(intel_obj->buffer->virtual + offset, data, size);
-         drm_intel_bo_unmap(intel_obj->buffer);
-
-         if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
-            intel_obj->prefer_stall_to_blit = true;
-         return;
-      } else {
-         perf_debug("BufferSubData could be unsynchronized, but !LLC doesn't support it yet\n");
-      }
+      brw_bo_write(intel_obj->buffer, offset, data, size, MAP_ASYNC, NULL);
+      if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
+         intel_obj->prefer_stall_to_blit = intel_obj->buffer->cache_coherent;
+      return;
    }
 
-   busy =
-      drm_intel_bo_busy(intel_obj->buffer) ||
-      drm_intel_bo_references(brw->batch.bo, intel_obj->buffer);
-
-   if (busy) {
+   if (brw_bo_busy(intel_obj->buffer, BUSY_WRITE | BUSY_FLUSH, NULL)) {
       if (size == intel_obj->Base.Size) {
 	 /* Replace the current busy bo so the subdata doesn't stall. */
          brw_bo_put(intel_obj->buffer);
-	 alloc_buffer_object(brw, intel_obj);
+         alloc_buffer_object(brw, intel_obj);
       } else if (!intel_obj->prefer_stall_to_blit) {
          perf_debug("Using a blit copy to avoid stalling on "
                     "glBufferSubData(%ld, %ld) (%ldkb) to a busy "
@@ -287,12 +238,13 @@ brw_buffer_subdata(struct gl_context *ctx,
          brw_bo *temp_bo =
             brw_bo_create(&brw->batch, "subdata temp", size, 64, 0);
 
-	 drm_intel_bo_subdata(temp_bo, 0, size, data);
+         brw_bo_write(temp_bo, 0, data, size, 0,
+                      PERF_DEBUG(brw, "BufferSubData"));
 
-	 intel_emit_linear_blit(brw,
-				intel_obj->buffer, offset,
-				temp_bo, 0,
-				size);
+         intel_emit_linear_blit(brw,
+                                intel_obj->buffer, offset,
+                                temp_bo, 0,
+                                size);
 
          brw_bo_put(temp_bo);
          return;
@@ -303,11 +255,11 @@ brw_buffer_subdata(struct gl_context *ctx,
                     (long)offset, (long)offset + size, (long)(size/1024),
                     intel_obj->gpu_active_start,
                     intel_obj->gpu_active_end);
-         brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "BufferSubData"));
       }
    }
 
-   drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
+   brw_bo_write(intel_obj->buffer, offset, data, size, 0,
+                PERF_DEBUG(brw, "BufferSubData"));
    mark_buffer_inactive(intel_obj);
 }
 
@@ -326,14 +278,10 @@ brw_get_buffer_subdata(struct gl_context *ctx,
                        struct gl_buffer_object *obj)
 {
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
-   struct brw_context *brw = brw_context(ctx);
 
    assert(intel_obj);
-   if (drm_intel_bo_references(brw->batch.bo, intel_obj->buffer)) {
-      brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "BufferSubData"));
-   }
-   drm_intel_bo_get_subdata(intel_obj->buffer, offset, size, data);
-
+   brw_bo_read(intel_obj->buffer, offset, data, size, 0,
+               PERF_DEBUG(brw_context(ctx), "GetBufferSubData"));
    mark_buffer_inactive(intel_obj);
 }
 
@@ -388,19 +336,11 @@ brw_map_buffer_range(struct gl_context *ctx,
     * achieve the required synchronization.
     */
    if (!(access & GL_MAP_UNSYNCHRONIZED_BIT)) {
-      if (drm_intel_bo_references(brw->batch.bo, intel_obj->buffer)) {
-	 if (access & GL_MAP_INVALIDATE_BUFFER_BIT) {
+      if ((access & GL_MAP_INVALIDATE_BUFFER_BIT)) {
+         if (brw_bo_busy(intel_obj->buffer, BUSY_WRITE | BUSY_FLUSH, NULL)) {
             brw_bo_put(intel_obj->buffer);
-	    alloc_buffer_object(brw, intel_obj);
-	 } else {
-            perf_debug("Stalling on the GPU for mapping a busy buffer "
-                       "object\n");
-            brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "miptree"));
-	 }
-      } else if (drm_intel_bo_busy(intel_obj->buffer) &&
-		 (access & GL_MAP_INVALIDATE_BUFFER_BIT)) {
-         brw_bo_put(intel_obj->buffer);
-	 alloc_buffer_object(brw, intel_obj);
+            alloc_buffer_object(brw, intel_obj);
+         }
       }
    }
 
@@ -415,46 +355,37 @@ brw_map_buffer_range(struct gl_context *ctx,
     */
    if (!(access & (GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_PERSISTENT_BIT)) &&
        (access & GL_MAP_INVALIDATE_RANGE_BIT) &&
-       drm_intel_bo_busy(intel_obj->buffer)) {
+       brw_bo_busy(intel_obj->buffer, BUSY_WRITE | BUSY_FLUSH, NULL)) {
       /* Ensure that the base alignment of the allocation meets the alignment
        * guarantees the driver has advertised to the application.
        */
       const unsigned alignment = ctx->Const.MinMapBufferAlignment;
 
       intel_obj->map_extra[index] = (uintptr_t) offset % alignment;
-      intel_obj->range_map_bo[index] = brw_bo_create(&brw->batch,
-                                                     "BO blit temp",
-                                                     length +
-                                                     intel_obj->map_extra[index],
-                                                     alignment, 0);
-      if (brw->has_llc) {
-         brw_bo_map(brw, intel_obj->range_map_bo[index],
-                    (access & GL_MAP_WRITE_BIT) != 0, "range-map");
-      } else {
-         drm_intel_gem_bo_map_gtt(intel_obj->range_map_bo[index]);
-      }
+      intel_obj->range_map_bo[index] =
+         brw_bo_create(&brw->batch, "BO blit temp",
+                       length + intel_obj->map_extra[index], alignment, 0);
+
       obj->Mappings[index].Pointer =
-         intel_obj->range_map_bo[index]->virtual + intel_obj->map_extra[index];
+         brw_bo_map(intel_obj->range_map_bo[index], MAP_WRITE, NULL) +
+         intel_obj->map_extra[index];
+
       return obj->Mappings[index].Pointer;
    }
 
-   if (access & GL_MAP_UNSYNCHRONIZED_BIT) {
-      if (!brw->has_llc && brw->perf_debug &&
-          drm_intel_bo_busy(intel_obj->buffer)) {
-         perf_debug("MapBufferRange with GL_MAP_UNSYNCHRONIZED_BIT stalling (it's actually synchronized on non-LLC platforms)\n");
-      }
-      drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
-   } else if (!brw->has_llc && (!(access & GL_MAP_READ_BIT) ||
-                              (access & GL_MAP_PERSISTENT_BIT))) {
-      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
-      mark_buffer_inactive(intel_obj);
-   } else {
-      brw_bo_map(brw, intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0,
-                 "MapBufferRange");
-      mark_buffer_inactive(intel_obj);
-   }
+   STATIC_ASSERT(GL_MAP_UNSYNCHRONIZED_BIT == MAP_ASYNC);
+   STATIC_ASSERT(GL_MAP_WRITE_BIT == MAP_WRITE);
+   STATIC_ASSERT(GL_MAP_READ_BIT == MAP_READ);
+   STATIC_ASSERT(GL_MAP_PERSISTENT_BIT == MAP_PERSISTENT);
+   STATIC_ASSERT(GL_MAP_COHERENT_BIT == MAP_COHERENT);
+   assert((access & MAP_INTERNAL_MASK) == 0);
+
+   obj->Mappings[index].Pointer =
+      brw_bo_map(intel_obj->buffer, access,
+                 PERF_DEBUG(brw, "MapBufferRange")) + offset;
+
+   mark_buffer_inactive(intel_obj);
 
-   obj->Mappings[index].Pointer = intel_obj->buffer->virtual + offset;
    return obj->Mappings[index].Pointer;
 }
 
@@ -542,8 +473,6 @@ brw_unmap_buffer(struct gl_context *ctx,
    assert(intel_obj);
    assert(obj->Mappings[index].Pointer);
    if (intel_obj->range_map_bo[index] != NULL) {
-      drm_intel_bo_unmap(intel_obj->range_map_bo[index]);
-
       if (!(obj->Mappings[index].AccessFlags & GL_MAP_FLUSH_EXPLICIT_BIT)) {
          intel_emit_linear_blit(brw,
                                 intel_obj->buffer, obj->Mappings[index].Offset,
@@ -563,8 +492,6 @@ brw_unmap_buffer(struct gl_context *ctx,
 
       brw_bo_put(intel_obj->range_map_bo[index]);
       intel_obj->range_map_bo[index] = NULL;
-   } else if (intel_obj->buffer != NULL) {
-      drm_intel_bo_unmap(intel_obj->buffer);
    }
    obj->Mappings[index].Pointer = NULL;
    obj->Mappings[index].Offset = 0;
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.h b/src/mesa/drivers/dri/i965/intel_buffer_objects.h
index f7fa34d565..2f231856b8 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.h
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.h
@@ -101,8 +101,6 @@ void *intel_upload_space(struct brw_context *brw,
                          brw_bo **out_bo,
                          uint32_t *out_offset);
 
-void intel_upload_finish(struct brw_context *brw);
-
 /* Hook the bufferobject implementation into mesa:
  */
 void intelInitBufferObjectFuncs(struct dd_function_table *functions);
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index f892be4009..8f8f7e5ad5 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -356,13 +356,15 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx,
    irb = intel_renderbuffer(rb);
    intel_miptree_release(&irb->mt);
 
+   struct brw_bo *bo = brw_bo_import(&brw->batch, image->bo, true);
+
    /* Disable creation of the miptree's aux buffers because the driver exposes
     * no EGL API to manage them. That is, there is no API for resolving the aux
     * buffer's content to the main buffer nor for invalidating the aux buffer's
     * content.
     */
    irb->mt = intel_miptree_create_for_bo(brw,
-                                         image->bo,
+                                         bo,
                                          image->format,
                                          image->offset,
                                          image->width,
@@ -370,6 +372,7 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx,
                                          1,
                                          image->pitch,
                                          MIPTREE_LAYOUT_DISABLE_AUX);
+   brw_bo_put(bo);
    if (!irb->mt)
       return;
 
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 40d20cf8c2..76e8923b1b 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -580,7 +580,8 @@ intel_get_yf_ys_bo_size(struct intel_mipmap_tree *mt,
                         uint32_t *alignment, uint32_t *pitch)
 {
    uint32_t tile_width, tile_height;
-   unsigned long stride, size, aligned_y;
+   uint32_t stride, aligned_y;
+   uint64_t size;
 
    assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE);
    intel_get_tile_dims(mt->tiling, mt->tr_mode, mt->cpp,
@@ -589,7 +590,7 @@ intel_get_yf_ys_bo_size(struct intel_mipmap_tree *mt,
    aligned_y = ALIGN(mt->total_height, tile_height);
    stride = mt->total_width * mt->cpp;
    stride = ALIGN(stride, tile_width);
-   size = stride * aligned_y;
+   size = (uint64_t)stride * aligned_y;
 
    if (mt->tr_mode == INTEL_MIPTREE_TRMODE_YF) {
       assert(size % 4096 == 0);
@@ -767,15 +768,12 @@ intel_miptree_create_for_bo(struct brw_context *brw,
                             uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt;
-   uint32_t tiling, swizzle;
    GLenum target;
 
-   drm_intel_bo_get_tiling(bo, &tiling, &swizzle);
-
    /* Nothing will be able to use this miptree with the BO if the offset isn't
     * aligned.
     */
-   if (tiling != I915_TILING_NONE)
+   if (bo->tiling != I915_TILING_NONE)
       assert(offset % 4096 == 0);
 
    /* miptrees can't handle negative pitch.  If you need flipping of images,
@@ -802,7 +800,7 @@ intel_miptree_create_for_bo(struct brw_context *brw,
    mt->bo = brw_bo_get(bo);
    mt->pitch = pitch;
    mt->offset = offset;
-   mt->tiling = tiling;
+   mt->tiling = bo->tiling;
 
    return mt;
 }
@@ -1435,7 +1433,7 @@ intel_miptree_copy_teximage(struct brw_context *brw,
    intel_obj->needs_validate = true;
 }
 
-static void
+static bool
 intel_miptree_init_mcs(struct brw_context *brw,
                        struct intel_mipmap_tree *mt,
                        int init_value)
@@ -1452,16 +1450,12 @@ intel_miptree_init_mcs(struct brw_context *brw,
     *
     * Note: the clear value for MCS buffers is all 1's, so we memset to 0xff.
     */
-   const int ret = brw_bo_map_gtt(brw, mt->mcs_buf->bo, "miptree");
-   if (unlikely(ret)) {
-      fprintf(stderr, "Failed to map mcs buffer into GTT\n");
-      brw_bo_put(mt->mcs_buf->bo);
-      free(mt->mcs_buf);
-      return;
-   }
-   void *data = mt->mcs_buf->bo->virtual;
+   void *data = brw_bo_map(mt->mcs_buf->bo, MAP_WRITE, PERF_DEBUG(brw, "MCS"));
+   if (!data)
+      return false;
+
    memset(data, init_value, mt->mcs_buf->size);
-   drm_intel_bo_unmap(mt->mcs_buf->bo);
+   return true;
 }
 
 static struct intel_miptree_aux_buffer *
@@ -1562,7 +1556,12 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
    if (!mt->mcs_buf)
       return false;
 
-   intel_miptree_init_mcs(brw, mt, 0xFF);
+   if (!intel_miptree_init_mcs(brw, mt, 0xFF)) {
+      brw_bo_put(mt->mcs_buf->bo);
+      free(mt->mcs_buf);
+      mt->mcs_buf = NULL;
+      return false;
+   }
 
    /* Multisampled miptrees are only supported for single level. */
    assert(mt->first_level == 0);
@@ -1611,15 +1610,15 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
    const uint32_t alloc_flags =
       is_lossless_compressed ? 0 : BO_ALLOC_FOR_RENDER;
    uint32_t tiling = I915_TILING_Y;
-   unsigned long pitch;
+   uint32_t pitch;
 
    /* ISL has stricter set of alignment rules then the drm allocator.
     * Therefore one can pass the ISL dimensions in terms of bytes instead of
     * trying to recalculate based on different format block sizes.
     */
-   buf->bo = drm_intel_bo_alloc_tiled(brw->batch.bufmgr, "ccs-miptree",
-                                      buf->pitch, buf->size / buf->pitch,
-                                      1, &tiling, &pitch, alloc_flags);
+   buf->bo = brw_bo_create_tiled(&brw->batch, "ccs-miptree",
+                                 buf->pitch, buf->size / buf->pitch,
+                                 1, &tiling, &pitch, alloc_flags);
    if (buf->bo) {
       assert(pitch == buf->pitch);
       assert(tiling == I915_TILING_Y);
@@ -1644,7 +1643,13 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
        *   "If Software wants to enable Color Compression without Fast clear,
        *    Software needs to initialize MCS with zeros."
        */
-      intel_miptree_init_mcs(brw, mt, 0);
+      if (!intel_miptree_init_mcs(brw, mt, 0)) {
+         brw_bo_put(mt->mcs_buf->bo);
+         free(mt->mcs_buf);
+         mt->mcs_buf = NULL;
+         return false;
+      }
+
       mt->msaa_layout = INTEL_MSAA_LAYOUT_CMS;
    }
 
@@ -2501,24 +2506,12 @@ intel_miptree_map_raw(struct brw_context *brw,
     * resolve any pending fast color clears before we map.
     */
    intel_miptree_all_slices_resolve_color(brw, mt, 0);
-
-   brw_bo *bo = mt->bo;
-
-   if (drm_intel_bo_references(brw->batch.bo, bo))
-      brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "miptree"));
-
-   if (mt->tiling != I915_TILING_NONE)
-      brw_bo_map_gtt(brw, bo, "miptree");
-   else
-      brw_bo_map(brw, bo, mode & GL_MAP_WRITE_BIT, "miptree");
-
-   return bo->virtual;
+   return brw_bo_map(mt->bo, mode, PERF_DEBUG(brw, "TexImage"));
 }
 
 static void
 intel_miptree_unmap_raw(struct intel_mipmap_tree *mt)
 {
-   drm_intel_bo_unmap(mt->bo);
 }
 
 static void
@@ -2569,7 +2562,6 @@ intel_miptree_map_gtt(struct brw_context *brw,
 static void
 intel_miptree_unmap_gtt(struct intel_mipmap_tree *mt)
 {
-   intel_miptree_unmap_raw(mt);
 }
 
 static void
@@ -2668,7 +2660,7 @@ intel_miptree_map_movntdqa(struct brw_context *brw,
    image_x += map->x;
    image_y += map->y;
 
-   void *src = intel_miptree_map_raw(brw, mt, map->mode);
+   void *src = intel_miptree_map_raw(brw, mt, map->mode | GL_MAP_COHERENT_BIT);
    if (!src)
       return;
 
@@ -3051,11 +3043,11 @@ use_intel_mipree_map_blit(struct brw_context *brw,
                           unsigned int level,
                           unsigned int slice)
 {
+   /* It's probably not worth swapping to the blit ring because of
+    * all the overhead involved. But, we must use blitter for the
+    * surfaces with INTEL_MIPTREE_TRMODE_{YF,YS}.
+    */
    if (brw->has_llc &&
-      /* It's probably not worth swapping to the blit ring because of
-       * all the overhead involved. But, we must use blitter for the
-       * surfaces with INTEL_MIPTREE_TRMODE_{YF,YS}.
-       */
        (!(mode & GL_MAP_WRITE_BIT) ||
         mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) &&
        !mt->compressed &&
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index b3a140244d..e11db4c16e 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -48,7 +48,6 @@
 
 #include "main/mtypes.h"
 #include "isl/isl.h"
-#include "intel_bufmgr.h"
 #include "intel_resolve_map.h"
 #include <GL/internal/dri_interface.h>
 
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_copy.c b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
index d1b0c95460..85d6df7d11 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
@@ -145,8 +145,6 @@ do_blit_copypixels(struct gl_context * ctx,
       return false;
    }
 
-   brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "CopyPixels"));
-
    /* Clip to destination buffer. */
    orig_dstx = dstx;
    orig_dsty = dsty;
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index e17b6d8876..b13dad1933 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -80,11 +80,6 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    int dst_pitch;
 
-   /* The miptree's buffer. */
-   brw_bo *bo;
-
-   int error = 0;
-
    uint32_t cpp;
    mem_copy_fn mem_copy = NULL;
 
@@ -134,22 +129,18 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
       return false;
    }
 
+   /* tiled_to_linear() assumes that if the object is swizzled, it
+    * is using I915_BIT6_SWIZZLE_9_10 for X and I915_BIT6_SWIZZLE_9 for Y.
+    * This is only true on gen5 and above.
+    */
+   if (brw->gen < 5 && brw->has_swizzling)
+      return false;
+
    /* Since we are going to read raw data to the miptree, we need to resolve
     * any pending fast color clears before we start.
     */
    intel_miptree_all_slices_resolve_color(brw, irb->mt, 0);
 
-   bo = irb->mt->bo;
-
-   if (drm_intel_bo_references(brw->batch.bo, bo))
-      brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "ReadPixels"));
-
-   error = brw_bo_map(brw, bo, false /* write enable */, "miptree");
-   if (error) {
-      DBG("%s: failed to map bo\n", __func__);
-      return false;
-   }
-
    xoffset += irb->mt->level[irb->mt_level].slice[irb->mt_layer].x_offset;
    yoffset += irb->mt->level[irb->mt_level].slice[irb->mt_layer].y_offset;
 
@@ -181,19 +172,17 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
        pack->Alignment, pack->RowLength, pack->SkipPixels,
        pack->SkipRows);
 
-   tiled_to_linear(
+   return tiled_to_linear(
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
       pixels - (ptrdiff_t) yoffset * dst_pitch - (ptrdiff_t) xoffset * cpp,
-      bo->virtual + irb->mt->offset,
+      brw_bo_map(irb->mt->bo, MAP_READ | MAP_DETILED,
+                 PERF_DEBUG(brw, "ReadPixels")) + irb->mt->offset,
       dst_pitch, irb->mt->pitch,
       brw->has_swizzling,
       irb->mt->tiling,
       mem_copy
    );
-
-   drm_intel_bo_unmap(bo);
-   return true;
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index f65e21eaf0..15eb1d4405 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -96,7 +96,6 @@ DRI_CONF_END
 };
 
 #include "intel_buffers.h"
-#include "intel_bufmgr.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 #include "intel_screen.h"
@@ -359,7 +358,7 @@ intel_setup_image_from_mipmap_tree(struct brw_context *brw, __DRIimage *image,
                                                   &image->tile_y);
 
    drm_intel_bo_unreference(image->bo);
-   image->bo = mt->bo;
+   image->bo = mt->bo->base;
    drm_intel_bo_reference(image->bo);
 }
 
@@ -421,7 +420,7 @@ intel_create_image_from_renderbuffer(__DRIcontext *context,
    image->offset = 0;
    image->data = loaderPrivate;
    drm_intel_bo_unreference(image->bo);
-   image->bo = irb->mt->bo;
+   image->bo = irb->mt->bo->base;
    drm_intel_bo_reference(image->bo);
    image->width = rb->Width;
    image->height = rb->Height;
@@ -1149,7 +1148,7 @@ intel_init_bufmgr(struct intel_screen *screen)
 
    screen->no_hw = getenv("INTEL_NO_HW") != NULL;
 
-   screen->bufmgr = intel_bufmgr_gem_init(dri_screen->fd, BATCH_SZ);
+   screen->bufmgr = intel_bufmgr_gem_init(dri_screen->fd, 4096);
    if (screen->bufmgr == NULL) {
       fprintf(stderr, "[%s:%u] Error initializing buffer manager.\n",
 	      __func__, __LINE__);
@@ -1652,9 +1651,6 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen)
 
    brw_process_intel_debug_variable();
 
-   if (INTEL_DEBUG & DEBUG_BUFMGR)
-      dri_bufmgr_set_debug(screen->bufmgr, true);
-
    if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && screen->devinfo.gen < 7) {
       fprintf(stderr,
               "shader_time debugging requires gen7 (Ivybridge) or better.\n");
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index 08f922365b..4b91ddc3df 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -32,11 +32,12 @@
 #include <GL/internal/dri_interface.h>
 
 #include "dri_util.h"
-#include "intel_bufmgr.h"
 #include "common/gen_device_info.h"
 #include "i915_drm.h"
 #include "xmlconfig.h"
 
+#include <intel_bufmgr.h>
+
 struct intel_screen
 {
    int deviceID;
@@ -107,6 +108,12 @@ struct intel_screen
    int eu_total;
 };
 
+static inline int intel_screen_to_fd(struct intel_screen *scr)
+{
+   __DRIscreen *psp = scr->driScrnPriv;
+   return psp->fd;
+}
+
 extern void intelDestroyContext(__DRIcontext * driContextPriv);
 
 extern GLboolean intelUnbindContext(__DRIcontext * driContextPriv);
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 454766cb9f..2e7f6ab299 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -127,7 +127,9 @@ intelTexImage(struct gl_context * ctx,
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
    bool ok;
 
-   bool tex_busy = intelImage->mt && drm_intel_bo_busy(intelImage->mt->bo);
+   bool tex_busy =
+      intelImage->mt &&
+      brw_bo_busy(intelImage->mt->bo, BUSY_WRITE | BUSY_FLUSH, NULL);
 
    DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
        __func__, _mesa_get_format_name(texImage->TexFormat),
@@ -218,12 +220,14 @@ create_mt_for_planar_dri_image(struct brw_context *brw,
        * resolving the aux buffer's content to the main buffer nor for
        * invalidating the aux buffer's content.
        */
+      struct brw_bo *bo = brw_bo_import(&brw->batch, image->bo, true);
       struct intel_mipmap_tree *mt =
-         intel_miptree_create_for_bo(brw, image->bo, format,
+         intel_miptree_create_for_bo(brw, bo, format,
                                      image->offsets[index],
                                      width, height, 1,
                                      image->strides[index],
                                      MIPTREE_LAYOUT_DISABLE_AUX);
+      brw_bo_put(bo);
       if (mt == NULL)
          return NULL;
 
@@ -257,10 +261,12 @@ create_mt_for_dri_image(struct brw_context *brw,
     * buffer's content to the main buffer nor for invalidating the aux buffer's
     * content.
     */
-   mt = intel_miptree_create_for_bo(brw, image->bo, image->format,
+   struct brw_bo *bo = brw_bo_import(&brw->batch, image->bo, true);
+   mt = intel_miptree_create_for_bo(brw, bo, image->format,
                                     0, image->width, image->height, 1,
                                     image->pitch,
                                     MIPTREE_LAYOUT_DISABLE_AUX);
+   brw_bo_put(bo);
    if (mt == NULL)
       return NULL;
 
@@ -464,11 +470,6 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
    struct intel_texture_image *image = intel_texture_image(texImage);
    int dst_pitch;
 
-   /* The miptree's buffer. */
-   brw_bo *bo;
-
-   int error = 0;
-
    uint32_t cpp;
    mem_copy_fn mem_copy = NULL;
 
@@ -518,21 +519,18 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
       return false;
    }
 
+   /* tiled_to_linear() assumes that if the object is swizzled, it
+    * is using I915_BIT6_SWIZZLE_9_10 for X and I915_BIT6_SWIZZLE_9 for Y.
+    * This is only true on gen5 and above.
+    */
+   if (brw->gen < 5 && brw->has_swizzling)
+      return false;
+
    /* Since we are going to write raw data to the miptree, we need to resolve
     * any pending fast color clears before we start.
     */
    intel_miptree_all_slices_resolve_color(brw, image->mt, 0);
 
-   bo = image->mt->bo;
-
-   if (drm_intel_bo_references(brw->batch.bo, bo))
-      brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "miptree"));
-
-   error = brw_bo_map(brw, bo, false /* write enable */, "miptree");
-   if (error) {
-      DBG("%s: failed to map bo\n", __func__);
-      return false;
-   }
 
    dst_pitch = _mesa_image_row_stride(packing, width, format, type);
 
@@ -550,19 +548,17 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
    xoffset += image->mt->level[level].level_x;
    yoffset += image->mt->level[level].level_y;
 
-   tiled_to_linear(
+   return tiled_to_linear(
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
       pixels - (ptrdiff_t) yoffset * dst_pitch - (ptrdiff_t) xoffset * cpp,
-      bo->virtual,
+      brw_bo_map(image->mt->bo, MAP_READ | MAP_DETILED,
+                 PERF_DEBUG(brw, "TexGetSubImage")),
       dst_pitch, image->mt->pitch,
       brw->has_swizzling,
       image->mt->tiling,
       mem_copy
    );
-
-   drm_intel_bo_unmap(bo);
-   return true;
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index 6cd60002a7..26b937496d 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -82,11 +82,6 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
    struct intel_texture_image *image = intel_texture_image(texImage);
    int src_pitch;
 
-   /* The miptree's buffer. */
-   brw_bo *bo;
-
-   int error = 0;
-
    uint32_t cpp;
    mem_copy_fn mem_copy = NULL;
 
@@ -99,7 +94,7 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
     * with _mesa_image_row_stride. However, before removing the restrictions
     * we need tests.
     */
-   if (!brw->has_llc ||
+   if (!brw->has_llc || /* XXX works for WC as well, but memcpy needs tuning */
        !(type == GL_UNSIGNED_BYTE || type == GL_UNSIGNED_INT_8_8_8_8_REV) ||
        !(texImage->TexObject->Target == GL_TEXTURE_2D ||
          texImage->TexObject->Target == GL_TEXTURE_RECTANGLE) ||
@@ -135,22 +130,18 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
       return false;
    }
 
+   /* linear_to_tiled() assumes that if the object is swizzled, it
+    * is using I915_BIT6_SWIZZLE_9_10 for X and I915_BIT6_SWIZZLE_9 for Y.
+    * This is only true on gen5 and above.
+    */
+   if (brw->gen < 5 && brw->has_swizzling)
+      return false;
+
    /* Since we are going to write raw data to the miptree, we need to resolve
     * any pending fast color clears before we start.
     */
    intel_miptree_all_slices_resolve_color(brw, image->mt, 0);
 
-   bo = image->mt->bo;
-
-   if (drm_intel_bo_references(brw->batch.bo, bo))
-      brw_batch_flush(&brw->batch, PERF_DEBUG(brw, "miptree"));
-
-   error = brw_bo_map(brw, bo, true /* write enable */, "miptree");
-   if (error || bo->virtual == NULL) {
-      DBG("%s: failed to map bo\n", __func__);
-      return false;
-   }
-
    src_pitch = _mesa_image_row_stride(packing, width, format, type);
 
    /* We postponed printing this message until having committed to executing
@@ -171,19 +162,17 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
    xoffset += image->mt->level[level].level_x;
    yoffset += image->mt->level[level].level_y;
 
-   linear_to_tiled(
+   return linear_to_tiled(
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
-      bo->virtual,
+      brw_bo_map(image->mt->bo, MAP_WRITE | MAP_DETILED,
+                 PERF_DEBUG(brw, "TexSubImage")),
       pixels - (ptrdiff_t) yoffset * src_pitch - (ptrdiff_t) xoffset * cpp,
       image->mt->pitch, src_pitch,
       brw->has_swizzling,
       image->mt->tiling,
       mem_copy
    );
-
-   drm_intel_bo_unmap(bo);
-   return true;
 }
 
 static void
@@ -199,7 +188,7 @@ intelTexSubImage(struct gl_context * ctx,
    struct intel_mipmap_tree *mt = intel_texture_image(texImage)->mt;
    bool ok;
 
-   bool tex_busy = mt && drm_intel_bo_busy(mt->bo);
+   bool tex_busy = mt && brw_bo_busy(mt->bo, BUSY_WRITE | BUSY_FLUSH, NULL);
 
    if (mt && mt->format == MESA_FORMAT_S_UINT8)
       mt->r8stencil_needs_update = true;
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index c888e466eb..3026ab9a78 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -627,7 +627,7 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
  * 'dst' is the start of the texture and 'src' is the corresponding
  * address to copy from, though copying begins at (xt1, yt1).
  */
-void
+bool
 linear_to_tiled(uint32_t xt1, uint32_t xt2,
                 uint32_t yt1, uint32_t yt2,
                 char *dst, const char *src,
@@ -643,6 +643,9 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
    uint32_t tw, th, span;
    uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
 
+   if (unlikely(!dst))
+      return false;
+
    if (tiling == I915_TILING_X) {
       tw = xtile_width;
       th = xtile_height;
@@ -705,6 +708,8 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
                    mem_copy);
       }
    }
+
+   return true;
 }
 
 /**
@@ -718,7 +723,7 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
  * 'dst' is the start of the texture and 'src' is the corresponding
  * address to copy from, though copying begins at (xt1, yt1).
  */
-void
+bool
 tiled_to_linear(uint32_t xt1, uint32_t xt2,
                 uint32_t yt1, uint32_t yt2,
                 char *dst, const char *src,
@@ -734,6 +739,9 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
    uint32_t tw, th, span;
    uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
 
+   if (unlikely(!src))
+      return false;
+
    if (tiling == I915_TILING_X) {
       tw = xtile_width;
       th = xtile_height;
@@ -796,6 +804,8 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
                    mem_copy);
       }
    }
+
+   return true;
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
index d9148bb623..7440074d1a 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
@@ -37,7 +37,7 @@
 
 typedef void *(*mem_copy_fn)(void *dest, const void *src, size_t n);
 
-void
+bool
 linear_to_tiled(uint32_t xt1, uint32_t xt2,
                 uint32_t yt1, uint32_t yt2,
                 char *dst, const char *src,
@@ -46,7 +46,7 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
                 uint32_t tiling,
                 mem_copy_fn mem_copy);
 
-void
+bool
 tiled_to_linear(uint32_t xt1, uint32_t xt2,
                 uint32_t yt1, uint32_t yt2,
                 char *dst, const char *src,
diff --git a/src/mesa/drivers/dri/i965/intel_upload.c b/src/mesa/drivers/dri/i965/intel_upload.c
index 4ace69271c..9f34c358ba 100644
--- a/src/mesa/drivers/dri/i965/intel_upload.c
+++ b/src/mesa/drivers/dri/i965/intel_upload.c
@@ -43,13 +43,9 @@
 
 #define INTEL_UPLOAD_SIZE (64*1024)
 
-void
+static void
 intel_upload_finish(struct brw_context *brw)
 {
-   if (!brw->upload.bo)
-      return;
-
-   drm_intel_bo_unmap(brw->upload.bo);
    brw_bo_put(brw->upload.bo);
    brw->upload.bo = NULL;
    brw->upload.next_offset = 0;
@@ -96,10 +92,6 @@ intel_upload_space(struct brw_context *brw,
    if (!brw->upload.bo) {
       brw->upload.bo = brw_bo_create(&brw->batch, "streamed data",
                                      MAX2(INTEL_UPLOAD_SIZE, size), 4096, 0);
-      if (brw->has_llc)
-         drm_intel_bo_map(brw->upload.bo, true);
-      else
-         drm_intel_gem_bo_map_gtt(brw->upload.bo);
    }
 
    brw->upload.next_offset = offset + size;
@@ -110,7 +102,7 @@ intel_upload_space(struct brw_context *brw,
       *out_bo = brw_bo_get(brw->upload.bo);
    }
 
-   return brw->upload.bo->virtual + offset;
+   return brw_bo_map(brw->upload.bo, MAP_WRITE | MAP_ASYNC, NULL) + offset;
 }
 
 /**
-- 
2.11.0