[Mesa-dev] [PATCH 04/18] i965: Introduce a context-local batch manager

Martin Peres martin.peres at linux.intel.com
Tue Jul 7 03:40:14 PDT 2015


On 06/07/15 13:33, Chris Wilson wrote:
> When submitting commands to the GPU, every cycle of latency counts;
> mutexes, spinlocks, even atomics quickly add up to substantial overhead.
>
> This "batch manager" acts as thread-local shim over the buffer manager
> (drm_intel_bufmgr_gem). As we are only ever used from within a single
> context, we can rely on the upper layers providing thread safety.
> This allows us to import buffers from the shared screen (sharing buffers
> between multiple contexts, threads and users) and wrap that handle in
> our own. Similarly, we want to share the buffer cache between all
> users on the file and so allocate from the global threadsafe buffer
> manager, with a very small and transient local cache of active buffers.
>
> The batch manager provides cheap busyness tracking and very
> efficient batch construction and kernel submission.
>
> The restrictions over and above the generic submission engine in
> intel_bufmgr_gem are:
>       - not thread-safe
>       - flat relocations, only the batch buffer itself carries
>         relocations. Relocations relative to auxiliary buffers
>         must be performed via STATE_BASE
>       - direct mapping of the batch for writes, expect reads
>         from the batch to be slow
>       - the batch is a fixed 64k in size
>       - access to the batch must be wrapped by brw_batch_begin/_end
>       - all relocations must be immediately written into the batch
>
> The importance of the flat relocation tree with local offset handling is
> that it allows us to use the "relocation-less" execbuffer interfaces,
> dramatically reducing the overhead of batch submission. However, that
> can be relaxed to allow other buffers than the batch buffer to carry
> relocations, if need be.
>
> ivb/bdw OglBatch7 improves by ~20% above and beyond my kernel relocation
> speedups.

We get an 18% improvement on a customer benchmark when using a very small 
resolution so that the workload is entirely CPU-limited.
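
For anyone following along, the contract above is easiest to see in a
small sketch. The brw_batch_begin()/brw_batch_end() signatures are
assumed from the description (the real ones are in brw_batch.h, further
down in the patch); only __brw_batch_reloc() is taken verbatim:

    /* Hypothetical emitter under the new rules: all batch access is
     * wrapped by begin/end, and the address returned by the reloc
     * helper is written into the batch immediately.
     */
    static void emit_surface_base(struct brw_batch *batch, struct brw_bo *bo)
    {
       if (brw_batch_begin(batch, 2) < 0)   /* assumed: reserve 2 dwords */
          return;

       uint32_t *out = batch->map + batch->emit.nbatch;
       out[0] = MI_NOOP;                    /* placeholder command dword */
       /* low 32 bits of the presumed address, at the byte offset of out[1];
        * gen8+ would emit two dwords */
       out[1] = (uint32_t)__brw_batch_reloc(batch,
                                            4 * (batch->emit.nbatch + 1),
                                            bo, 0,
                                            I915_GEM_DOMAIN_RENDER, 0);
       batch->emit.nbatch += 2;

       brw_batch_end(batch);                /* assumed: may flush on demand */
    }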

>
> ISSUES:
> * shared mipmap trees
>    - we instantiate a context local copy on use, but what are the semantics for
>      serializing read/writes between them - do we need automagic flushing of
>      execution on other contexts and common busyness tracking?
>    - we retain references to the bo past the lifetime of its parent
>      batchmgr as the mipmap_tree is retained past the lifetime of its
>      original context, see glx_arb_create_context/default_major_version
> * OglMultithread is nevertheless unhappy, but that looks like undefined
>    behaviour - i.e. a buggy client concurrently executing the same GL
>    context in multiple threads; unpatched Mesa is equally buggy.
> * Add full-ppgtt softpinning support (no more relocations, at least for
>    the first 256TiB), at the moment there is a limited proof-of-principle
>    demonstration
> * polish and move to libdrm; though at the cost of sealing the structs?
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
> Cc: Kristian Høgsberg <krh at bitplanet.net>
> Cc: Kenneth Graunke <kenneth at whitecape.org>
> Cc: Jesse Barnes <jbarnes at virtuousgeek.org>
> Cc: Ian Romanick <ian.d.romanick at intel.com>
> Cc: Abdiel Janulgue <abdiel.janulgue at linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen at intel.com>
> Cc: Martin Peres <martin.peres at linux.intel.com>
> ---
>   src/mesa/drivers/dri/i965/Makefile.sources         |    4 +-
>   src/mesa/drivers/dri/i965/brw_batch.c              | 1946 ++++++++++++++++++++
>   src/mesa/drivers/dri/i965/brw_batch.h              |  377 ++++
>   src/mesa/drivers/dri/i965/brw_binding_tables.c     |    1 -
>   src/mesa/drivers/dri/i965/brw_blorp.cpp            |   46 +-
>   src/mesa/drivers/dri/i965/brw_cc.c                 |   16 +-
>   src/mesa/drivers/dri/i965/brw_clear.c              |    1 -
>   src/mesa/drivers/dri/i965/brw_clip.c               |    2 -
>   src/mesa/drivers/dri/i965/brw_clip_line.c          |    2 -
>   src/mesa/drivers/dri/i965/brw_clip_point.c         |    2 -
>   src/mesa/drivers/dri/i965/brw_clip_state.c         |   14 +-
>   src/mesa/drivers/dri/i965/brw_clip_tri.c           |    2 -
>   src/mesa/drivers/dri/i965/brw_clip_unfilled.c      |    2 -
>   src/mesa/drivers/dri/i965/brw_clip_util.c          |    2 -
>   src/mesa/drivers/dri/i965/brw_compute.c            |   42 +-
>   src/mesa/drivers/dri/i965/brw_conditional_render.c |    2 +-
>   src/mesa/drivers/dri/i965/brw_context.c            |  233 ++-
>   src/mesa/drivers/dri/i965/brw_context.h            |  144 +-
>   src/mesa/drivers/dri/i965/brw_cs.cpp               |    6 +-
>   src/mesa/drivers/dri/i965/brw_curbe.c              |    1 -
>   src/mesa/drivers/dri/i965/brw_draw.c               |  103 +-
>   src/mesa/drivers/dri/i965/brw_draw_upload.c        |   23 +-
>   src/mesa/drivers/dri/i965/brw_ff_gs.c              |    2 -
>   src/mesa/drivers/dri/i965/brw_ff_gs_emit.c         |    1 -
>   src/mesa/drivers/dri/i965/brw_fs.cpp               |    5 +-
>   src/mesa/drivers/dri/i965/brw_meta_fast_clear.c    |   11 +-
>   src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c  |    1 -
>   src/mesa/drivers/dri/i965/brw_meta_updownsample.c  |    1 -
>   src/mesa/drivers/dri/i965/brw_misc_state.c         |   10 +-
>   src/mesa/drivers/dri/i965/brw_object_purgeable.c   |    8 +-
>   .../drivers/dri/i965/brw_performance_monitor.c     |   88 +-
>   src/mesa/drivers/dri/i965/brw_pipe_control.c       |   24 +-
>   src/mesa/drivers/dri/i965/brw_primitive_restart.c  |    2 -
>   src/mesa/drivers/dri/i965/brw_program.c            |   23 +-
>   src/mesa/drivers/dri/i965/brw_queryobj.c           |   68 +-
>   src/mesa/drivers/dri/i965/brw_reset.c              |   14 +-
>   src/mesa/drivers/dri/i965/brw_sampler_state.c      |    8 +-
>   src/mesa/drivers/dri/i965/brw_sf.c                 |    2 -
>   src/mesa/drivers/dri/i965/brw_sf_emit.c            |    2 -
>   src/mesa/drivers/dri/i965/brw_sf_state.c           |   21 +-
>   src/mesa/drivers/dri/i965/brw_state.h              |    2 +-
>   src/mesa/drivers/dri/i965/brw_state_batch.c        |   41 +-
>   src/mesa/drivers/dri/i965/brw_state_cache.c        |   70 +-
>   src/mesa/drivers/dri/i965/brw_state_dump.c         |   77 +-
>   src/mesa/drivers/dri/i965/brw_state_upload.c       |   16 +-
>   src/mesa/drivers/dri/i965/brw_structs.h            |   33 +-
>   src/mesa/drivers/dri/i965/brw_urb.c                |    9 +-
>   src/mesa/drivers/dri/i965/brw_vec4.cpp             |    5 +-
>   src/mesa/drivers/dri/i965/brw_vs_state.c           |   33 +-
>   src/mesa/drivers/dri/i965/brw_vs_surface_state.c   |    4 +-
>   src/mesa/drivers/dri/i965/brw_wm_state.c           |   38 +-
>   src/mesa/drivers/dri/i965/brw_wm_surface_state.c   |   76 +-
>   src/mesa/drivers/dri/i965/gen6_blorp.cpp           |   17 +-
>   src/mesa/drivers/dri/i965/gen6_cc.c                |    1 -
>   src/mesa/drivers/dri/i965/gen6_clip_state.c        |    1 -
>   src/mesa/drivers/dri/i965/gen6_depth_state.c       |    1 -
>   src/mesa/drivers/dri/i965/gen6_depthstencil.c      |    1 -
>   src/mesa/drivers/dri/i965/gen6_gs_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen6_multisample_state.c |    1 -
>   src/mesa/drivers/dri/i965/gen6_queryobj.c          |   56 +-
>   src/mesa/drivers/dri/i965/gen6_sampler_state.c     |    1 -
>   src/mesa/drivers/dri/i965/gen6_scissor_state.c     |    1 -
>   src/mesa/drivers/dri/i965/gen6_sf_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen6_sol.c               |    9 +-
>   src/mesa/drivers/dri/i965/gen6_surface_state.c     |   13 +-
>   src/mesa/drivers/dri/i965/gen6_urb.c               |    1 -
>   src/mesa/drivers/dri/i965/gen6_viewport_state.c    |    1 -
>   src/mesa/drivers/dri/i965/gen6_vs_state.c          |    2 +-
>   src/mesa/drivers/dri/i965/gen6_wm_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen7_blorp.cpp           |   16 +-
>   src/mesa/drivers/dri/i965/gen7_disable.c           |    1 -
>   src/mesa/drivers/dri/i965/gen7_gs_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen7_misc_state.c        |    3 +-
>   src/mesa/drivers/dri/i965/gen7_sf_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen7_sol_state.c         |   49 +-
>   src/mesa/drivers/dri/i965/gen7_urb.c               |    1 -
>   src/mesa/drivers/dri/i965/gen7_viewport_state.c    |    1 -
>   src/mesa/drivers/dri/i965/gen7_vs_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen7_wm_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen7_wm_surface_state.c  |   60 +-
>   src/mesa/drivers/dri/i965/gen8_blend_state.c       |    1 -
>   src/mesa/drivers/dri/i965/gen8_depth_state.c       |   16 +-
>   src/mesa/drivers/dri/i965/gen8_disable.c           |    1 -
>   src/mesa/drivers/dri/i965/gen8_draw_upload.c       |    1 -
>   src/mesa/drivers/dri/i965/gen8_gs_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen8_misc_state.c        |    1 -
>   src/mesa/drivers/dri/i965/gen8_multisample_state.c |    1 -
>   src/mesa/drivers/dri/i965/gen8_ps_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen8_sf_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen8_sol_state.c         |    3 +-
>   src/mesa/drivers/dri/i965/gen8_surface_state.c     |   73 +-
>   src/mesa/drivers/dri/i965/gen8_viewport_state.c    |    1 -
>   src/mesa/drivers/dri/i965/gen8_vs_state.c          |    1 -
>   src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c  |    1 -
>   src/mesa/drivers/dri/i965/intel_batchbuffer.c      |  480 -----
>   src/mesa/drivers/dri/i965/intel_batchbuffer.h      |  179 --
>   src/mesa/drivers/dri/i965/intel_blit.c             |   68 +-
>   src/mesa/drivers/dri/i965/intel_blit.h             |   10 +-
>   src/mesa/drivers/dri/i965/intel_buffer_objects.c   |  222 +--
>   src/mesa/drivers/dri/i965/intel_buffer_objects.h   |   18 +-
>   src/mesa/drivers/dri/i965/intel_debug.c            |    6 -
>   src/mesa/drivers/dri/i965/intel_extensions.c       |   48 +-
>   src/mesa/drivers/dri/i965/intel_fbo.c              |   46 +-
>   src/mesa/drivers/dri/i965/intel_fbo.h              |    4 -
>   src/mesa/drivers/dri/i965/intel_image.h            |    6 +-
>   src/mesa/drivers/dri/i965/intel_mipmap_tree.c      |   98 +-
>   src/mesa/drivers/dri/i965/intel_mipmap_tree.h      |   11 +-
>   src/mesa/drivers/dri/i965/intel_pixel_bitmap.c     |    3 +-
>   src/mesa/drivers/dri/i965/intel_pixel_copy.c       |    3 -
>   src/mesa/drivers/dri/i965/intel_pixel_draw.c       |    2 +-
>   src/mesa/drivers/dri/i965/intel_pixel_read.c       |   28 +-
>   src/mesa/drivers/dri/i965/intel_screen.c           |   68 +-
>   src/mesa/drivers/dri/i965/intel_screen.h           |   16 +-
>   src/mesa/drivers/dri/i965/intel_syncobj.c          |   86 +-
>   src/mesa/drivers/dri/i965/intel_tex.c              |    6 +-
>   src/mesa/drivers/dri/i965/intel_tex_image.c        |   35 +-
>   src/mesa/drivers/dri/i965/intel_tex_subimage.c     |   33 +-
>   src/mesa/drivers/dri/i965/intel_tiled_memcpy.c     |   14 +-
>   src/mesa/drivers/dri/i965/intel_tiled_memcpy.h     |    4 +-
>   src/mesa/drivers/dri/i965/intel_upload.c           |   33 +-
>   120 files changed, 3341 insertions(+), 2199 deletions(-)
>   create mode 100644 src/mesa/drivers/dri/i965/brw_batch.c
>   create mode 100644 src/mesa/drivers/dri/i965/brw_batch.h
>   delete mode 100644 src/mesa/drivers/dri/i965/intel_batchbuffer.c
>   delete mode 100644 src/mesa/drivers/dri/i965/intel_batchbuffer.h
>
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
> index 5a33aac..92e2ad0 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -1,4 +1,6 @@
>   i965_FILES = \
> +	brw_batch.c \
> +	brw_batch.h \
>   	brw_binding_tables.c \
>   	brw_blorp_blit.cpp \
>   	brw_blorp_blit_eu.cpp \
> @@ -185,8 +187,6 @@ i965_FILES = \
>   	gen8_wm_depth_stencil.c \
>   	intel_asm_annotation.c \
>   	intel_asm_annotation.h \
> -	intel_batchbuffer.c \
> -	intel_batchbuffer.h \
>   	intel_blit.c \
>   	intel_blit.h \
>   	intel_buffer_objects.c \
> diff --git a/src/mesa/drivers/dri/i965/brw_batch.c b/src/mesa/drivers/dri/i965/brw_batch.c
> new file mode 100644
> index 0000000..24e96c6
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_batch.c
> @@ -0,0 +1,1946 @@
> +/*
> + * Copyright (c) 2015 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + *
> + * Authors:
> + *    Chris Wilson <chris at chris-wilson.co.uk>
> + *
> + */
> +#include "brw_batch.h"
> +#include "brw_context.h" /* XXX brw_finish_batch() */
> +
> +#include <sys/types.h>
> +#include <sys/mman.h>
> +#include <stdlib.h>
> +#include <setjmp.h>
> +
> +#include <intel_bufmgr.h>
> +#include <i915_drm.h>
> +#include <xf86drm.h>
> +#include <errno.h>
> +
> +#include "intel_screen.h"
> +
> +/*
> + * When submitting commands to the GPU, every cycle of latency counts;
> + * mutexes, spinlocks, even atomics quickly add up to substantial overhead.
> + *
> + * This "batch manager" acts as a thread-local shim over the buffer manager
> + * (drm_intel_bufmgr_gem). As we are only ever used from within a single
> + * context, we can rely on the upper layers providing thread safety. This
> + * allows us to import buffers from the shared screen (sharing buffers
> + * between multiple contexts, threads and users) and wrap that handle in
> + * our own. Similarly, we want to share the buffer cache between all users
> + * on the file and so allocate from the global threadsafe buffer manager,
> + * with a very small and transient local cache of active buffers.
> + *
> + * The batch manager provides cheap busyness tracking and very
> + * efficient batch construction and kernel submission.
> + *
> + * The restrictions over and above the generic submission engine in
> + * intel_bufmgr_gem are:
> + * 	- not thread-safe
> + * 	- flat relocations, only the batch buffer itself carries
> + * 	  relocations. Relocations relative to auxiliary buffers
> + * 	  must be performed via STATE_BASE
> + * 	- direct mapping of the batch for writes, expect reads
> + * 	  from the batch to be slow
> + * 	- the batch is a fixed 64k in size
> + * 	- access to the batch must be wrapped by brw_batch_begin/_end
> + * 	- all relocations must be immediately written into the batch
> + */
> +
> +/**
> + * Number of bytes to reserve for commands necessary to complete a batch.
> + *
> + * This includes:
> + * - MI_BATCH_BUFFER_END (4 bytes)
> + * - Optional MI_NOOP for ensuring the batch length is qword aligned (4 bytes)
> + * - Any state emitted by vtbl->finish_batch():
> + *   - Gen4-5 record ending occlusion query values (4 * 4 = 16 bytes)
> + *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
> + *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
> + *     - Two sets of PIPE_CONTROLs, which become 3 PIPE_CONTROLs each on SNB,
> + *       which are 4 DWords each ==> 2 * 3 * 4 * 4 = 96 bytes
> + *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
> + *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
> + *       Sandybridge PIPE_CONTROL madness.
> + *
> + * Total: 140 bytes
> + */
> +#define BATCH_RESERVED 140
> +
> +/* Surface offsets are limited to a maximum of 64k from the surface base */
> +#define BATCH_SIZE (64 << 10)
> +
> +/* XXX Temporary home until kernel patches land */
> +#define I915_PARAM_HAS_EXEC_SOFTPIN 37
> +#define EXEC_OBJECT_PINNED	(1<<4)
> +#define I915_PARAM_HAS_EXEC_BATCH_FIRST 38
> +#define I915_EXEC_BATCH_FIRST (1<<16)
> +
> +#define DBG_NO_FAST_RELOC 0
> +#define DBG_NO_HANDLE_LUT 0
> +#define DBG_NO_BATCH_FIRST 0
> +#define DBG_NO_SOFTPIN 0
> +
> +#define PERF_IDLE 0 /* ring mask */
> +
> +inline static void list_move(struct list_head *from, struct list_head *to)
> +{
> +   list_del(from);
> +   list_add(from, to);
> +}
> +
> +inline static void list_move_tail(struct list_head *from, struct list_head *to)
> +{
> +   list_del(from);
> +   list_addtail(from, to);
> +}
> +
> +static const unsigned hw_ring[] = {
> +   [RENDER_RING] = I915_EXEC_RENDER,
> +   [BLT_RING] = I915_EXEC_BLT,
> +};
> +
> +/*
> + * The struct brw_request is central to efficiently tracking GPU activity,
> + * and the busyness of all buffers. It serves as both a read and a write
> + * fence on the buffers (and as the external GL fence). This is done by
> + * associating each relocation (i.e. every use of a buffer by the GPU within
> + * a batch) with the request as a read fence (for a read-only relocation)
> + * or as both the read/write fences (for a writeable relocation).
> + *
> + * Then if we ever need to query whether a particular buffer is active,
> + * we can look at the appropriate fence and see whether it has expired.
> + * If not, we can then ask the kernel whether it has just retired and report back.
> + * If the request is still under construction and has not been submitted,
> + * we have that information immediately available and can report busyness
> + * without having to search.
> + *
> + * Periodically (after every new request) we poll for request completion,
> + * asking if the oldest is complete. This allows us to then maintain the
> + * busyness state of all buffers without having to query every buffer
> + * every time.
> + *
> + * After certain events (such as mapping or waiting on a buffer), we know that
> + * the buffer is idle, and so is the associated fence and all older fences.
> + *
> + * A nice side-effect of tracking requests and buffer busyness is that we
> + * can also track a reasonable measure of how much of the aperture is filled
> + * by active buffers (a resident set size). This is useful for predicting
> + * when the kernel will start evicting our buffers, for example.
> + */
> +struct brw_request {
> +   struct brw_bo *bo;
> +   struct list_head link, write, read, fence;
> +};
> +#define RQ_MARK_RING(rq, ring) ((struct brw_bo *)((uintptr_t)((rq)->bo) | (ring)))
> +#define RQ_BO(rq) ((struct brw_bo *)((uintptr_t)(rq)->bo & ~3))
> +#define RQ_RING(rq) (((unsigned)(uintptr_t)(rq)->bo & 3))
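The RQ_MARK_RING()/RQ_BO()/RQ_RING() trio packs the ring id into the two
low bits of the request's bo pointer, which are free because the pointer
is at least 4-byte aligned. In isolation the trick looks like this
(struct obj standing in for struct brw_bo):

    #include <assert.h>
    #include <stdint.h>

    struct obj { int payload; };

    static void *mark_ring(struct obj *p, unsigned ring)
    {
       /* malloc'ed storage is suitably aligned, so bits 0-1 are clear */
       assert(((uintptr_t)p & 3) == 0 && ring <= 3);
       return (void *)((uintptr_t)p | ring);
    }

    static struct obj *strip_ring(void *tagged)
    {
       return (struct obj *)((uintptr_t)tagged & ~(uintptr_t)3);
    }

    static unsigned ring_of(void *tagged)
    {
       return (uintptr_t)tagged & 3;
    }

The flush path applies the tag when moving the request onto the ring's
request list, and every other consumer strips it first - hence the RQ_BO()
calls sprinkled through the retire code below.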
> +
> +static bool __brw_bo_busy(struct brw_bo *bo)
> +{
> +   struct drm_i915_gem_busy busy;
> +
> +   memset(&busy, 0, sizeof(busy));
> +   busy.handle = bo->handle;
> +   busy.busy = ~0;
> +   drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
> +   /* If an error occurs here, it can only be due to flushing the
> +    * buffer on the hardware, i.e. the buffer itself is still busy.
> +    * Possible errors are:
> +    * 	-ENOENT: the buffer didn't exist, impossible!
> +    * 	-ENOMEM: the kernel failed to flush due to allocation failures
> +    * 	         scary, but the buffer is busy.
> +    * 	-EIO:    the kernel should have marked the buffer as idle during
> +    * 	         the reset; if it hasn't, it never will, and the buffer
> +    * 	         itself will never become idle.
> +    * 	(-EINTR, -EAGAIN eaten by drmIoctl()).
> +    */
> +   return busy.busy;
> +}
> +
> +/*
> + * Retire this and all older requests.
> + */
> +static void __brw_request_retire(struct brw_request *rq)
> +{
> +   struct brw_batch *batch = RQ_BO(rq)->batch;
> +   struct list_head *list = &batch->requests[RQ_RING(rq)];
> +   struct brw_request *tmp;
> +
> +   assert(!__brw_bo_busy(RQ_BO(rq)) || batch->fini);
> +
> +   if (PERF_IDLE & (1 << RQ_RING(rq)) && rq->link.next == list)
> +      batch->idle_time[RQ_RING(rq)] = -get_time();
> +
> +   do {
> +      tmp = list_last_entry(list, struct brw_request, link);
> +      assert(!__brw_bo_busy(RQ_BO(tmp)) || batch->fini);
> +      list_del(&tmp->link);
> +      assert(RQ_BO(tmp)->exec == NULL);
> +
> +      while (!list_empty(&tmp->write)) {
> +         struct brw_bo *bo;
> +
> +         bo = list_first_entry(&tmp->write,
> +                               struct brw_bo,
> +                               write.link);
> +
> +         assert(bo->write.rq == tmp);
> +         assert(bo->read.rq);
> +
> +         list_delinit(&bo->write.link);
> +         bo->write.rq = NULL;
> +      }
> +      assert(RQ_BO(tmp)->write.rq == NULL);
> +
> +      while (!list_empty(&tmp->read)) {
> +         struct brw_bo *bo;
> +
> +         bo = list_first_entry(&tmp->read,
> +                               struct brw_bo,
> +                               read.link);
> +         assert(bo->exec == NULL);
> +         assert(bo->read.rq == tmp);
> +         if (bo->write.rq) {
> +            assert(RQ_RING(bo->write.rq) != RQ_RING(rq));
> +            __brw_request_retire(bo->write.rq);
> +         }
> +         assert(bo->write.rq == NULL);
> +
> +         list_delinit(&bo->read.link);
> +         bo->read.rq = NULL;
> +
> +         assert(batch->rss >= bo->size);
> +         batch->rss -= bo->size;
> +
> +         if (bo->reusable)
> +            list_move(&bo->link, &batch->inactive);
> +
> +         if (!bo->refcnt)
> +            __brw_bo_free(bo);
> +      }
> +      assert(RQ_BO(tmp)->read.rq == NULL);
> +
> +      while (!list_empty(&tmp->fence)) {
> +         struct brw_fence *fence;
> +
> +         fence = list_first_entry(&tmp->fence,
> +                                  struct brw_fence,
> +                                  link);
> +         assert(fence->rq == tmp);
> +         list_del(&fence->link);
> +         fence->rq = NULL;
> +
> +         if (fence->signal)
> +            fence->signal(fence);
> +      }
> +
> +      if (tmp == batch->throttle)
> +         batch->throttle = NULL;
> +
> +      tmp->bo = RQ_BO(tmp); /* strip off the ring id */
> +      tmp->link.next = (struct list_head *)batch->freed_rq;
> +      batch->freed_rq = tmp;
> +   } while (tmp != rq);
> +}
> +
> +/*
> + * Is the request busy? First we can see if this request
> + * has already been retired (idle), or if this request is still under
> + * construction (busy). Failing that, to the best of our knowledge it is
> + * still being processed by the GPU, so we must ask the kernel whether the
> + * request is now idle. If we find it is idle, we now know this and all
> + * older requests are also idle.
> + */
> +bool __brw_request_busy(struct brw_request *rq, unsigned flags)
> +{
> +   struct brw_bo *bo;
> +   if (rq == NULL)
> +      return false;
> +
> +   bo = RQ_BO(rq);
> +   if (bo->read.rq == NULL)
> +      return false;
> +
> +   assert(bo->read.rq == rq);
> +
> +   if (bo->exec != NULL) {
> +      if (flags & BUSY_FLUSH)
> +         brw_batch_flush(bo->batch);
> +      return true;
> +   }
> +
> +   if (__brw_bo_busy(bo))
> +      return true;
> +
> +   __brw_request_retire(rq);
> +   return false;
> +}
> +
> +/*
> + * Update the cache domain tracked by the kernel. This can have a number
> + * of side-effects but is essential in order to make coherency and
> + * serialisation between the GPU and CPU. If there is conflicting GPU access
> + * then set-domain will wait until the GPU has finished accessing the buffer
> + * before proceeding to change the domain. If the buffer is not cache coherent
> + * and we request CPU access, the kernel will clflush that buffer to make it
> + * coherent with the CPU access. Both of these imply delays and overhead, so
> + * we do our best to avoid moving buffers to the GTT/CPU domains. However,
> + * if we do, we know the buffer and its request are idle so we can update
> + * our request tracking after a blocking call.
> + */
> +static void __brw_bo_set_domain(struct brw_bo *bo, unsigned domain, bool write)
> +{
> +   struct drm_i915_gem_set_domain set_domain;
> +   struct brw_request *rq;
> +
> +   if (bo->exec) /* flush failed, pretend we are ASYNC | INCOHERENT */
> +      return;
> +
> +   memset(&set_domain, 0, sizeof(set_domain));
> +   set_domain.handle = bo->handle;
> +   set_domain.read_domains = domain;
> +   if (write)
> +      set_domain.write_domain = domain;
> +
> +   domain = domain == I915_GEM_DOMAIN_CPU ? DOMAIN_CPU : DOMAIN_GTT;
> +   if (bo->domain == domain)
> +      return;
> +
> +   if (drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain))
> +      return;
> +
> +   rq = write ? bo->read.rq : bo->write.rq;
> +   if (rq)
> +      __brw_request_retire(rq);
> +
> +   bo->domain = write ? domain : DOMAIN_NONE;
> +   assert(bo->refcnt);
> +}
> +
> +/*
> + * Wait for the buffer to become completely idle, i.e. not being accessed by
> + * the GPU at all (no outstanding reads or writes).
> + * This is equivalent to setting the buffer write domain to GTT, but the
> + * wait ioctl avoids the set-domain side-effects (e.g. clflushing in
> + * some circumstances).
> + */
> +static int __brw_bo_wait(struct brw_bo *bo, int64_t timeout)
> +{
> +   struct drm_i915_gem_wait wait;
> +
> +   assert(bo->exec == NULL);
> +
> +   if (!brw_bo_busy(bo, BUSY_WRITE | BUSY_RETIRE))
> +      return 0;
> +
> +   memset(&wait, 0, sizeof(wait));
> +   wait.bo_handle = bo->handle;
> +   wait.timeout_ns = timeout;
> +   wait.flags = 0;
> +
> +   if (drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_WAIT, &wait)) {
> +      int ret = -errno;
> +      if (timeout < 0) {
> +         __brw_bo_set_domain(bo, I915_GEM_DOMAIN_GTT, true);
> +         ret = 0;
> +      }
> +      return ret;
> +   }
> +
> +   assert(bo->read.rq);
> +   __brw_request_retire(bo->read.rq);
> +   return 0;
> +}
> +
> +static inline uint32_t hash_32(uint32_t hash, unsigned bits)
> +{
> +   return (hash * 0x9e37001) >> (32 - bits);
> +}
> +
> +static inline struct list_head *borrowed(struct brw_batch *batch, uint32_t handle)
> +{
> +   return &batch->borrowed[hash_32(handle, BORROWED_BITS)];
> +}
> +
> +/*
> + * We have context-local bos, but these may be shared between contexts by
> + * shared mipmaps and other buffers. If we find we are dealing with a bo
> + * belonging to another batch, we need to translate that into a local bo
> + * for associating with our fences.
> + */
> +static struct brw_bo *__brw_batch_lookup_handle(struct brw_batch *batch,
> +                                                uint32_t handle)
> +{
> +   /* XXX may need a resizable ht? */
> +   struct list_head *hlist = borrowed(batch, handle);
> +
> +   list_for_each_entry(struct brw_bo, bo, hlist, link)
> +      if (bo->handle == handle)
> +         return bo;
> +
> +   return NULL;
> +}
> +
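hash_32() above is a multiplicative hash: the handle is multiplied by a
large odd constant (presumably a golden-ratio variant) and the top `bits`
bits of the 32-bit product select the bucket. A tiny standalone demo,
assuming BORROWED_BITS is 4 (its real value is in brw_batch.h, not quoted
here):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t hash_32(uint32_t hash, unsigned bits)
    {
       return (hash * 0x9e37001) >> (32 - bits);
    }

    int main(void)
    {
       /* consecutive handles scatter across the 16 buckets because the
        * multiply diffuses low bits into the bits kept by the shift */
       for (uint32_t handle = 1; handle <= 8; handle++)
          printf("handle %u -> bucket %u\n", handle, hash_32(handle, 4));
       return 0;
    }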
> +inline static bool has_lut(struct brw_batch *batch)
> +{
> +   return batch->batch_base_flags & I915_EXEC_HANDLE_LUT;
> +}
> +
> +/*
> + * Prepare the batch manager for constructing a new batch/request.
> + *
> + * Reset all the accounting we do per-batch, and allocate ourselves a new
> + * batch bo.
> + */
> +static int __brw_batch_reset(struct brw_batch *batch)
> +{
> +   struct brw_request *rq;
> +
> +retry:
> +   rq = batch->freed_rq;
> +   if (unlikely(rq == NULL)) {
> +      rq = malloc(sizeof(*rq));
> +      if (unlikely(rq == NULL))
> +         goto oom;
> +
> +      rq->bo = brw_bo_create(batch, "batch", BATCH_SIZE, 0, 0);
> +      if (unlikely(rq->bo == NULL)) {
> +         free(rq);
> +         goto oom;
> +      }
> +      rq->bo->target_handle = -1;
> +
> +      /* We are inheriting a foreign buffer, so call set-domain */
> +      brw_bo_map(rq->bo, MAP_WRITE);
> +
> +      list_inithead(&rq->read);
> +      list_inithead(&rq->write);
> +      list_inithead(&rq->fence);
> +   } else {
> +      assert(list_empty(&rq->read));
> +      assert(list_empty(&rq->write));
> +      assert(list_empty(&rq->fence));
> +      batch->freed_rq = (struct brw_request *)rq->link.next;
> +   }
> +
> +   assert(RQ_BO(rq) == rq->bo);
> +   batch->map = brw_bo_map(rq->bo, MAP_WRITE | MAP_ASYNC);
> +   if (unlikely(batch->map == NULL)) {
> +      brw_bo_put(rq->bo);
> +      free(rq);
> +
> +oom:
> +      if (list_empty(&batch->requests[batch->ring])) {
> +         batch->next_request = NULL;
> +         return -ENOMEM;
> +      }
> +
> +      /* force the synchronization to recover some memory */
> +      rq = list_last_entry(&batch->requests[batch->ring],
> +                           struct brw_request, link);
> +      __brw_bo_wait(RQ_BO(rq), -1);
> +      goto retry;
> +   }
> +
> +   batch->bo = rq->bo;
> +   memset(&batch->emit, 0, sizeof(batch->emit));
> +   batch->next_request = rq;
> +
> +   batch->reserved = BATCH_RESERVED / 4;
> +   batch->state = BATCH_SIZE / 4;
> +   batch->aperture = 0;
> +   batch->batch_flags = batch->batch_base_flags;
> +
> +   list_add(&rq->bo->read.link, &rq->read);
> +   if (batch->batch_base_flags & I915_EXEC_BATCH_FIRST) {
> +      rq->bo->target_handle =
> +	 has_lut(batch) ? batch->emit.nexec : rq->bo->handle;
> +      rq->bo->exec =
> +	 memset(&batch->exec[batch->emit.nexec++], 0, sizeof(*rq->bo->exec));
> +   } else
> +      rq->bo->exec = (void *)1;
> +   rq->bo->read.rq = rq;
> +   batch->rss += BATCH_SIZE;
> +   return 0;
> +}
> +
> +static int gem_param(int fd, int name)
> +{
> +   drm_i915_getparam_t gp;
> +   int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
> +
> +   memset(&gp, 0, sizeof(gp));
> +   gp.param = name;
> +   gp.value = &v;
> +   if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
> +      return -1;
> +
> +   return v;
> +}
> +
> +static bool test_has_fast_reloc(int fd)
> +{
> +   if (DBG_NO_FAST_RELOC)
> +      return DBG_NO_FAST_RELOC < 0;
> +
> +   return gem_param(fd, I915_PARAM_HAS_EXEC_NO_RELOC) > 0;
> +}
> +
> +static bool test_has_handle_lut(int fd)
> +{
> +   if (DBG_NO_HANDLE_LUT)
> +      return DBG_NO_HANDLE_LUT < 0;
> +
> +   return gem_param(fd, I915_PARAM_HAS_EXEC_HANDLE_LUT) > 0;
> +}
> +
> +static bool test_has_batch_first(int fd)
> +{
> +   if (DBG_NO_BATCH_FIRST)
> +      return DBG_NO_BATCH_FIRST < 0;
> +
> +   return gem_param(fd, I915_PARAM_HAS_EXEC_BATCH_FIRST) > 0;
> +}
> +
> +static bool test_has_softpin(int fd)
> +{
> +   if (DBG_NO_SOFTPIN)
> +      return DBG_NO_SOFTPIN < 0;
> +
> +   if (gem_param(fd, I915_PARAM_HAS_ALIASING_PPGTT) < 2)
> +      return false;
> +
> +   return gem_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN) > 0;
> +}
> +
> +static uint64_t __get_max_aperture(int fd)
> +{
> +	struct drm_i915_gem_get_aperture aperture;
> +
> +	if (gem_param(fd, I915_PARAM_HAS_ALIASING_PPGTT) > 2)
> +		return (uint64_t)1 << 48;
> +
> +	memset(&aperture, 0, sizeof(aperture));
> +	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture))
> +		return 512 << 20; /* Minimum found on gen4+ */
> +
> +	return aperture.aper_size;
> +}
> +
> +static uint64_t get_max_aperture(int fd)
> +{
> +	static uint64_t max_aperture;
> +
> +	if (max_aperture == 0)
> +		max_aperture = __get_max_aperture(fd);
> +
> +	return max_aperture;
> +}
> +
> +/*
> + * Initialise the batch-manager for the context.
> + *
> + * We use the devinfo and settings found in intel_screen to set ourselves up
> + * for the hardware environment, and supplement that with our own feature
> + * tests. (These too should probably move to intel_screen and shared between
> + * all contexts.)
> + */
> +int brw_batch_init(struct brw_batch *batch,
> +		   struct intel_screen *screen)
> +{
> +   const struct brw_device_info *devinfo;
> +   int ret;
> +   int n;
> +
> +   batch->fd = intel_screen_to_fd(screen);
> +   batch->bufmgr = screen->bufmgr;
> +   batch->screen = screen;
> +
> +   devinfo = screen->devinfo;
> +
> +   batch->no_hw = screen->no_hw;
> +
> +   batch->needs_pipecontrol_ggtt_wa = devinfo->gen == 6;
> +   batch->reloc_size = 512;
> +   batch->exec_size = 256;
> +   batch->reloc = malloc(sizeof(batch->reloc[0])*batch->reloc_size);
> +   batch->exec = malloc(sizeof(batch->exec[0])*batch->exec_size);
> +   if (unlikely(batch->reloc == NULL || batch->exec == NULL)) {
> +      ret = -ENOMEM;
> +      goto err;
> +   }
> +
> +   for (n = 0; n < 1 << BORROWED_BITS; n++)
> +      list_inithead(&batch->borrowed[n]);
> +   list_inithead(&batch->active);
> +   list_inithead(&batch->inactive);
> +   for (n = 0; n < __BRW_NUM_RINGS; n++)
> +      list_inithead(&batch->requests[n]);
> +
> +   batch->actual_ring[RENDER_RING] = RENDER_RING;
> +   batch->actual_ring[BLT_RING] = BLT_RING;
> +   if (devinfo->gen < 6)
> +      batch->actual_ring[BLT_RING] = RENDER_RING;
> +
> +   batch->has_mmap_wc = true; /* tested on first use */
> +   batch->has_llc = devinfo->has_llc;
> +   batch->has_softpin = test_has_softpin(batch->fd);
> +   batch->max_aperture = 3*get_max_aperture(batch->fd)/4;
> +
> +   if (test_has_fast_reloc(batch->fd))
> +      batch->batch_base_flags |= I915_EXEC_NO_RELOC;
> +   if (test_has_handle_lut(batch->fd))
> +      batch->batch_base_flags |= I915_EXEC_HANDLE_LUT;
> +   if (test_has_batch_first(batch->fd))
> +      batch->batch_base_flags |= I915_EXEC_BATCH_FIRST;
> +
> +   if (devinfo->gen >= 6) {
> +      /* Create a new hardware context.  Using a hardware context means that
> +       * our GPU state will be saved/restored on context switch, allowing us
> +       * to assume that the GPU is in the same state we left it in.
> +       *
> +       * This is required for transform feedback buffer offsets, query objects,
> +       * and also allows us to reduce how much state we have to emit.
> +       */
> +      struct drm_i915_gem_context_create create;
> +
> +      memset(&create, 0, sizeof(create));
> +      drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
> +      batch->hw_ctx = create.ctx_id;
> +      if (!batch->hw_ctx) {
> +	 ret = -errno;
> +	 fprintf(stderr, "Gen6+ requires Kernel 3.6 or later.\n");
> +	 goto err;
> +      }
> +   }
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_AUB))
> +      drm_intel_aub_enable(&batch->aub, batch->fd, devinfo->gen);
> +
> +   ret = __brw_batch_reset(batch);
> +   if (ret)
> +      goto err;
> +
> +   return 0;
> +
> +err:
> +   drm_intel_aub_enable(&batch->aub, -1, 0);
> +   free(batch->reloc);
> +   free(batch->exec);
> +   return ret;
> +}
> +
> +/*
> + * Mark a bo as being written to by this batch.
> + *
> + * We frequently dirty a buffer and then emit a global cache flush
> + * cleaning all the dirty buffers within a batch. Afterwards, we may
> + * then write to the same buffer, but may not re-emit a relocation and
> + * so we need to notify that the buffer is now dirty again. Normally
> + * we can rely on the relocation marking the write buffers as dirty.
> + *
> + * All caches are flushed by the kernel between batches, so at the end
> + * of each batch we can mark all buffers as clean again. (Before we can
> + * access the buffer, either by the GPU in the next batch or by the CPU
> + * following a set-domain call, that access will be after the flush has
> + * finished.)
> + */
> +void brw_bo_mark_dirty(struct brw_batch *batch, struct brw_bo *bo)
> +{
> +   if (unlikely(bo->batch != batch)) {
> +      bo = __brw_batch_lookup_handle(batch, bo->handle);
> +      assert(bo);
> +   }
> +   assert(bo->batch == batch);
> +   assert(bo != bo->batch->bo);
> +
> +   /* We should only be called on objects already in the batch for writing */
> +   assert(bo->exec);
> +   assert(bo->read.rq == batch->next_request);
> +   assert(bo->write.rq == batch->next_request);
> +   assert(bo->domain == DOMAIN_GPU);
> +
> +   if (bo->dirty)
> +      return;
> +
> +   list_move(&bo->write.link, &batch->next_request->write);
> +   bo->dirty = true;
> +}
> +
> +/*
> + * At the end of each batch and when explicitly flushing caches within
> + * a batch, we can mark all the buffers within that batch as now clean.
> + */
> +void brw_batch_clear_dirty(struct brw_batch *batch)
> +{
> +   struct list_head *write = &batch->next_request->write;
> +
> +   list_for_each_entry(struct brw_request_node, node, write, link) {
> +      struct brw_bo *bo = container_of(node, bo, write);
> +      if (!bo->dirty)
> +         break;
> +
> +      bo->dirty = false;
> +   }
> +}
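To spell out the cycle these two functions implement (a sketch; OUT_RELOC
here stands for whichever emission macro ends up calling
__brw_batch_reloc() with a write domain):

    OUT_RELOC(bo, ..., I915_GEM_DOMAIN_RENDER); /* bo->dirty = true   */
    /* ... a flushing PIPE_CONTROL is emitted ... */
    brw_batch_clear_dirty(batch);               /* every bo now clean */
    /* ... the GPU writes bo again, with no new relocation emitted,
     * so the tracker must be told by hand ... */
    brw_bo_mark_dirty(batch, bo);               /* dirty once more    */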
> +
> +static void __brw_batch_grow_exec(struct brw_batch *batch)
> +{
> +   struct drm_i915_gem_exec_object2 *new_exec;
> +   uint16_t new_size;
> +
> +   new_size = batch->exec_size * 2;
> +   new_exec = NULL;
> +   if (likely(new_size > batch->exec_size))
> +      new_exec = realloc(batch->exec, new_size*sizeof(new_exec[0]));
> +   if (unlikely(new_exec == NULL))
> +      longjmp(batch->jmpbuf, -ENOMEM);
> +
> +   if (new_exec != batch->exec) {
> +      struct list_head *read = &batch->next_request->read;
> +
> +      list_for_each_entry(struct brw_bo, bo, read, read.link)
> +         bo->exec = new_exec + (bo->exec - batch->exec);
> +
> +      batch->exec = new_exec;
> +   }
> +
> +   batch->exec_size = new_size;
> +}
> +
> +static void __brw_batch_grow_reloc(struct brw_batch *batch)
> +{
> +   struct drm_i915_gem_relocation_entry *new_reloc;
> +   uint16_t new_size;
> +
> +   new_size = batch->reloc_size * 2;
> +   new_reloc = NULL;
> +   if (likely(new_size > batch->reloc_size))
> +      new_reloc = realloc(batch->reloc, new_size*sizeof(new_reloc[0]));
> +   if (unlikely(new_reloc == NULL))
> +      longjmp(batch->jmpbuf, -ENOMEM);
> +
> +   batch->reloc = new_reloc;
> +   batch->reloc_size = new_size;
> +}
> +
> +/*
> + * Add a relocation entry for the target buffer into the current batch.
> + *
> + * This is the heart of performing fast relocations, both here and in
> + * the corresponding kernel relocation routines.
> + *
> + * - Instead of passing in handles for the kernel to convert back into
> + *   buffers for every relocation, we tell the kernel which
> + *   execobject slot corresponds with the relocation. The kernel is
> + *   able to use a simple LUT, constructed as it first looks up each buffer
> + *   for the batch, rather than searching a small, overfull hashtable. As both
> + *   the number of relocations and buffers in a batch grow, the simple
> + *   LUT is much more efficient (though the LUT itself is less cache
> + *   friendly).
> + *   However, as the batch buffer is by definition the last object in
> + *   the execbuffer array we have to perform a pass to relabel the
> + *   target of all relocations pointing to the batch. (Except when
> + *   the kernel supports batch-first, in which case we can do the relocation
> + *   target processing for the batch inline.)
> + *
> + * - If the kernel has not moved the buffer, it will still be in the same
> + *   location as last time we used it. If we tell the kernel that all the
> + *   relocation entries are the same as the offset for the buffer, then
> + *   the kernel need only check that all the buffers are still in the same
> + *   location and then skip performing relocations entirely. A huge win.
> + *
> + * - As a consequence of telling the kernel to skip processing the relocations,
> + *   we need to tell the kernel about the read/write domains and special needs
> + *   of the buffers.
> + *
> + * - Alternatively, we can request the kernel place the buffer exactly
> + *   where we want it and forgo all relocations to that buffer entirely.
> + *   The buffer is effectively pinned for its lifetime (if the kernel
> + *   does have to move it, for example to swap it out to recover memory,
> + *   the kernel will return it back to our requested location at the start
> + *   of the next batch.) This of course imposes a lot of constraints on where
> + *   we can say the buffers are, they must meet all the alignment constraints
> + *   and not overlap.
> + *
> + * - Essential to all these techniques is that we always use the same
> + *   presumed_offset for the relocations as for submitting the execobject.
> + *   That value must be written into the batch and it must match the value
> + *   we tell the kernel. (This breaks down when using relocation trees shared
> + *   between multiple contexts, hence the need for context-local batch
> + *   management.)
> + *
> + * In contrast to libdrm, we can build the execbuffer array along with
> + * the batch by forgoing the ability to handle general relocation trees.
> + * This avoids having multiple passes to build the execbuffer parameter,
> + * and also gives us a means to cheaply track when a buffer has been
> + * referenced by the batch.
> + */
> +uint64_t __brw_batch_reloc(struct brw_batch *batch,
> +                           uint32_t batch_offset,
> +                           struct brw_bo *target_bo,
> +                           uint64_t target_offset,
> +                           unsigned read_domains,
> +                           unsigned write_domain)
> +{
> +   assert(target_bo->refcnt);
> +   if (unlikely(target_bo->batch != batch)) {
> +      /* XXX legal sharing between contexts/threads? */
> +      target_bo = brw_bo_import(batch, target_bo->base, true);
> +      if (unlikely(target_bo == NULL))
> +         longjmp(batch->jmpbuf, -ENOMEM);
> +      target_bo->refcnt--; /* kept alive by the implicit active reference */
> +   }
> +   assert(target_bo->batch == batch);
> +
> +   if (target_bo->exec == NULL) {
> +      int n;
> +
> +      /* reserve one exec entry for the batch */
> +      if (unlikely(batch->emit.nexec + 1 == batch->exec_size))
> +         __brw_batch_grow_exec(batch);
> +
> +      n = batch->emit.nexec++;
> +      target_bo->target_handle = has_lut(batch) ? n : target_bo->handle;
> +      target_bo->exec = memset(batch->exec + n, 0, sizeof(*target_bo->exec));
> +      target_bo->exec->handle = target_bo->handle;
> +      target_bo->exec->alignment = target_bo->alignment;
> +      target_bo->exec->offset = target_bo->offset;
> +      if (target_bo->pinned)
> +         target_bo->exec->flags = EXEC_OBJECT_PINNED;
> +
> +      /* Track the total amount of memory in use by all active requests */
> +      if (target_bo->read.rq == NULL) {
> +	      batch->rss += target_bo->size;
> +	      if (batch->rss > batch->peak_rss)
> +		      batch->peak_rss = batch->rss;
> +      }
> +      target_bo->read.rq = batch->next_request;
> +      list_move_tail(&target_bo->read.link, &batch->next_request->read);
> +
> +      batch->aperture += target_bo->size;
> +   }
> +
> +   if (!target_bo->pinned) {
> +      int n;
> +
> +      if (unlikely(batch->emit.nreloc == batch->reloc_size))
> +         __brw_batch_grow_reloc(batch);
> +
> +      n = batch->emit.nreloc++;
> +      batch->reloc[n].offset = batch_offset;
> +      batch->reloc[n].delta = target_offset;
> +      batch->reloc[n].target_handle = target_bo->target_handle;
> +      batch->reloc[n].presumed_offset = target_bo->offset;
> +      batch->reloc[n].read_domains = read_domains;
> +      batch->reloc[n].write_domain = write_domain;
> +
> +      /* If we haven't added the batch to the execobject array yet, we
> +       * will have to process all the relocations pointing to the
> +       * batch when finalizing the request for submission.
> +       */
> +      if (target_bo->target_handle == -1) {
> +         int m = batch->emit.nself++;
> +         if (m < 256)
> +            batch->self_reloc[m] = n;
> +      }
> +   }
> +
> +   if (write_domain && !target_bo->dirty) {
> +      assert(target_bo != batch->bo);
> +      target_bo->write.rq = batch->next_request;
> +      list_move(&target_bo->write.link, &batch->next_request->write);
> +      assert(target_bo->write.rq == target_bo->read.rq);
> +      target_bo->dirty = true;
> +      target_bo->domain = DOMAIN_GPU;
> +      if (has_lut(batch)) {
> +         target_bo->exec->flags |= EXEC_OBJECT_WRITE;
> +         if (write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
> +             batch->needs_pipecontrol_ggtt_wa)
> +            target_bo->exec->flags |= EXEC_OBJECT_NEEDS_GTT;
> +      }
> +   }
> +
> +   return target_bo->offset + target_offset;
> +}
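A consistency check makes the fast-reloc contract explicit: every
relocation's presumed_offset must agree with the offset announced in the
execobject array, otherwise I915_EXEC_NO_RELOC would let the kernel skip
a relocation that actually needed patching. This holds by construction in
the function above; a hypothetical debug-only validator might look like:

    static void validate_relocs(const struct brw_batch *batch)
    {
       /* Only valid once the batch itself has an execobject slot, i.e.
        * after __brw_batch_fixup_self_relocations() has run; assumes
        * I915_EXEC_HANDLE_LUT so target_handle indexes batch->exec.
        */
       for (unsigned n = 0; n < batch->emit.nreloc; n++) {
          const struct drm_i915_gem_relocation_entry *r = &batch->reloc[n];
          assert(r->target_handle < batch->emit.nexec);
          assert(r->presumed_offset == batch->exec[r->target_handle].offset);
       }
    }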
> +
> +/*
> + * Close the batch by writing all the tail commands (to store register
> + * values between batches, disable profiling, etc.) and then terminate it
> + * with MI_BATCH_BUFFER_END.
> + */
> +static uint32_t __brw_batch_finish(struct brw_batch *batch)
> +{
> +   batch->reserved = 0;
> +
> +   /* Catch any final allocation errors; rolling back is marginally safer */
> +   batch->saved = batch->emit;
> +   if (setjmp(batch->jmpbuf) == 0)
> +	   brw_finish_batch(batch);
> +   else
> +	   batch->emit = batch->saved;
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
> +      int bytes_for_commands = 4 * batch->emit.nbatch;
> +      int bytes_for_state = batch->bo->size - 4*batch->state;
> +      int total_bytes = bytes_for_commands + bytes_for_state;
> +      fprintf(stderr, "Batchbuffer flush on ring %d with %4db (pkt) + "
> +              "%4db (state) = %4db (%0.1f%%), with %d buffers and %d relocations [%d self], RSS %d KiB (cap %dKiB)\n",
> +              batch->ring, bytes_for_commands, bytes_for_state,
> +              total_bytes, 100.0f * total_bytes / BATCH_SIZE,
> +              batch->emit.nexec, batch->emit.nreloc, batch->emit.nself,
> +              (int)(batch->aperture>>10), (int)(batch->max_aperture>>10));
> +   }
> +
> +   batch->map[batch->emit.nbatch] = 0xa << 23;
> +   return 4*((batch->emit.nbatch + 2) & ~1);
> +}
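The setjmp here pairs with the longjmp in __brw_batch_grow_exec() and
__brw_batch_grow_reloc(): deep inside a state emitter there is no error
path to return through, so an allocation failure unwinds straight back to
this point and the snapshot in batch->saved rolls the half-written
commands back. The pattern in miniature:

    #include <errno.h>
    #include <setjmp.h>

    static jmp_buf jmpbuf;
    static unsigned nbatch;      /* stands in for batch->emit */

    static void grow_or_die(int fail)
    {
       if (fail)                 /* no way to return an error: unwind */
          longjmp(jmpbuf, -ENOMEM);
    }

    static int finish_with_rollback(int fail)
    {
       unsigned saved = nbatch;  /* snapshot, cf. batch->saved */

       if (setjmp(jmpbuf) == 0) {
          nbatch += 2;           /* speculative emission */
          grow_or_die(fail);     /* may longjmp on OOM */
          return 0;
       }

       nbatch = saved;           /* roll back the partial emission */
       return -ENOMEM;
    }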
> +
> +static void
> +__brw_batch_throttle(struct brw_batch *batch, struct brw_request *rq)
> +{
> +   /* Wait for the swapbuffers before the one we just emitted, so we
> +    * don't get too many swaps outstanding for apps that are GPU-heavy
> +    * but not CPU-heavy.
> +    *
> +    * We're using intelDRI2Flush (called from the loader before
> +    * swapbuffer) and glFlush (for front buffer rendering) as the
> +    * indicator that a frame is done and then throttle when we get
> +    * here as we prepare to render the next frame.  At this point the
> +    * round trips for swap/copy and getting new buffers are done and
> +    * we'll spend less time waiting on the GPU.
> +    *
> +    * Unfortunately, we don't have a handle to the batch containing
> +    * the swap, and getting our hands on that doesn't seem worth it,
> +    * so we just use the first batch we emitted after the last swap.
> +    */
> +   if (batch->need_swap_throttle) {
> +      if (batch->throttle && !batch->disable_throttling)
> +         __brw_bo_wait(RQ_BO(batch->throttle), -1);
> +      batch->throttle = rq;
> +      batch->need_flush_throttle = false;
> +      batch->need_swap_throttle = false;
> +   }
> +
> +   if (batch->need_flush_throttle) {
> +      drmCommandNone(batch->fd, DRM_I915_GEM_THROTTLE);
> +      batch->need_flush_throttle = false;
> +   }
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
> +      fprintf(stderr, "waiting for idle\n");
> +      __brw_bo_wait(RQ_BO(rq), -1);
> +   }
> +}
> +
> +/*
> + * If we added relocations pointing to the batch before we knew
> + * its final index (the kernel assumes that the batch is last unless
> + * told otherwise), then we have to go through all the relocations
> + * and point them back to the batch.
> + */
> +static void __brw_batch_fixup_self_relocations(struct brw_batch *batch)
> +{
> +   uint32_t target = batch->bo->target_handle;
> +   int n, count;
> +
> +   count = MIN2(batch->emit.nself, 256);
> +   for (n = 0; n < count; n++)
> +      batch->reloc[batch->self_reloc[n]].target_handle = target;
> +   if (n == 256) {
> +      for (n = batch->self_reloc[255] + 1; n < batch->emit.nself; n++) {
> +         if (batch->reloc[n].target_handle == -1)
> +            batch->reloc[n].target_handle = target;
> +      }
> +   }
> +}
> +
> +static void
> +__brw_batch_dump(struct brw_batch *batch)
> +{
> +   struct drm_intel_decode *decode;
> +
> +   decode = drm_intel_decode_context_alloc(batch->screen->deviceID);
> +   if (!decode)
> +      return;
> +
> +   drm_intel_decode_set_batch_pointer(decode,
> +                                      batch->map, batch->bo->offset,
> +                                      batch->emit.nbatch + 1);
> +
> +   drm_intel_decode_set_output_file(decode, stderr);
> +   drm_intel_decode(decode);
> +
> +   drm_intel_decode_context_free(decode);
> +
> +   brw_debug_batch(batch);
> +}
> +
> +static void
> +__brw_batch_aub(struct brw_batch *batch)
> +{
> +   drm_intel_bo **bos;
> +   struct brw_request *rq = batch->next_request;
> +   int count;
> +
> +   bos = malloc(sizeof(drm_intel_bo *) * batch->emit.nexec);
> +   if (unlikely(bos == NULL))
> +      return;
> +
> +   count = 0;
> +   list_for_each_entry(struct brw_bo, bo, &rq->read, read.link)
> +      bos[count++] = bo->base;
> +   assert(count == batch->emit.nexec);
> +
> +   drm_intel_aub_exec(&batch->aub, batch->bo->base, bos, count,
> +                      hw_ring[batch->ring], batch->emit.nbatch*4);
> +
> +   free(bos);
> +}
> +
> +/*
> + * Check to see if the oldest requests have completed and retire them.
> + */
> +static void __brw_batch_retire(struct brw_batch *batch)
> +{
> +   struct list_head *list = &batch->requests[batch->ring];
> +
> +   do {
> +      struct brw_request *rq;
> +
> +      rq = list_last_entry(list, struct brw_request, link);
> +      if (__brw_bo_busy(RQ_BO(rq)))
> +	 break;
> +
> +      __brw_request_retire(rq);
> +   } while (!list_empty(list));
> +}
> +
> +/*
> + * Finalize the batch, submit it to hardware, and start a new batch/request.
> + */
> +int brw_batch_flush(struct brw_batch *batch)
> +{
> +   struct drm_i915_gem_execbuffer2 execbuf;
> +   struct drm_i915_gem_exec_object2 *exec;
> +   struct brw_request *rq = batch->next_request;
> +
> +   if (unlikely(batch->emit.nbatch == 0))
> +      return 0;
> +
> +   if (unlikely(rq == NULL))
> +      return -ENOMEM;
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_AUB))
> +      brw_annotate_batch(batch);
> +
> +   memset(&execbuf, 0, sizeof(execbuf));
> +   execbuf.batch_len = __brw_batch_finish(batch);
> +
> +   assert(rq->bo == batch->bo);
> +   assert(rq->bo->write.rq == NULL);
> +   assert(rq->bo->read.rq == rq);
> +   assert(rq->bo->exec != NULL);
> +   assert(rq->bo->dirty);
> +
> +   /* Must be done after __brw_batch_finish(), as the callbacks may add relocs! */
> +   if (rq->bo->target_handle == -1) {
> +      rq->bo->target_handle =
> +	 has_lut(batch) ? batch->emit.nexec : rq->bo->handle;
> +      rq->bo->exec =
> +	 memset(&batch->exec[batch->emit.nexec++], 0, sizeof(*exec));
> +
> +      __brw_batch_fixup_self_relocations(batch);
> +   }
> +
> +   exec = rq->bo->exec;
> +   exec->handle = rq->bo->handle;
> +   exec->offset = rq->bo->offset;
> +   exec->alignment = rq->bo->alignment;
> +   exec->relocation_count = batch->emit.nreloc;
> +   exec->relocs_ptr = (uintptr_t)batch->reloc;
> +   if (rq->bo->pinned)
> +      exec->flags |= EXEC_OBJECT_PINNED;
> +   assert((exec->flags & EXEC_OBJECT_WRITE) == 0);
> +
> +   execbuf.buffers_ptr = (uintptr_t)batch->exec;
> +   execbuf.buffer_count = batch->emit.nexec;
> +   if (batch->ring == RENDER_RING || batch->has_softpin)
> +      execbuf.rsvd1 = batch->hw_ctx;
> +   execbuf.flags = hw_ring[batch->ring] | batch->batch_flags;
> +
> +   if (unlikely(batch->no_hw)) {
> +      /* submit a dummy execbuf to keep the fences accurate */
> +      batch->map[0] = 0xa << 23;
> +      execbuf.batch_len = 8;
> +   }
> +
> +   if (unlikely(drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))){
> +      if (errno == ENOSPC)
> +         return -ENOSPC;
> +
> +      fprintf(stderr,
> +              "Failed to submit batch buffer, rendering will be incorrect: %s [%d]\n",
> +              strerror(errno), errno);
> +
> +      /* submit a dummy execbuf to keep the fences accurate */
> +      batch->map[0] = 0xa << 23;
> +      execbuf.batch_len = 8;
> +
> +      if (drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
> +         return -errno;
> +   }
> +
> +   if (PERF_IDLE && batch->idle_time[batch->ring] < 0) {
> +      batch->idle_time[batch->ring] += get_time();
> +      fprintf(stderr, "GPU command queue %d idle for %.3fms\n",
> +	      batch->ring, batch->idle_time[batch->ring] * 1000);
> +   }
> +
> +   list_for_each_entry(struct brw_bo, bo, &rq->read, read.link) {
> +      assert(bo->exec);
> +      assert(bo->read.rq == rq);
> +      bo->offset = bo->exec->offset;
> +      bo->exec = NULL;
> +      bo->dirty = false;
> +      bo->target_handle = -1;
> +      if (bo->domain != DOMAIN_GPU)
> +         bo->domain = DOMAIN_NONE;
> +   }
> +   assert(!rq->bo->dirty);
> +   list_add(&rq->link, &batch->requests[batch->ring]);
> +   rq->bo->pinned = batch->has_softpin;
> +   rq->bo = RQ_MARK_RING(rq, batch->ring);
> +
> +   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
> +      __brw_batch_dump(batch);
> +
> +   if (unlikely(batch->aub.file))
> +      __brw_batch_aub(batch);
> +
> +   __brw_batch_throttle(batch, rq);
> +   __brw_batch_retire(batch);
> +
> +   return __brw_batch_reset(batch);
> +}
> +
> +/*
> + * Is the GPU still processing the most recent batch submitted?
> + * (Note this does not include the batch currently being constructed.)
> + */
> +bool brw_batch_busy(struct brw_batch *batch)
> +{
> +   struct brw_request *rq;
> +
> +   if (list_empty(&batch->requests[batch->ring]))
> +      return false;
> +
> +   rq = list_first_entry(&batch->requests[batch->ring],
> +                         struct brw_request, link);
> +
> +   return __brw_request_busy(rq, 0);
> +}
> +
> +/*
> + * Wait for all GPU processing to complete.
> + */
> +void brw_batch_wait(struct brw_batch *batch)
> +{
> +   int n;
> +
> +   for (n = 0; n < __BRW_NUM_RINGS; n++) {
> +      struct brw_request *rq;
> +
> +      if (list_empty(&batch->requests[n]))
> +         continue;
> +
> +      rq = list_first_entry(&batch->requests[n],
> +                            struct brw_request, link);
> +
> +      __brw_bo_wait(rq->bo, -1);
> +   }
> +}
> +
> +static bool __is_uncached(int fd, uint32_t handle)
> +{
> +   struct drm_i915_gem_caching arg;
> +
> +   memset(&arg, 0, sizeof(arg));
> +   arg.handle = handle;
> +   drmIoctl(fd, DRM_IOCTL_I915_GEM_GET_CACHING, &arg);
> +   /* There is no right answer if an error occurs here. Fortunately, the
> +    * only error is ENOENT and that's impossible!
> +    */
> +   return arg.caching != I915_CACHING_CACHED;
> +}
> +
> +/*
> + * Wrap a drm_intel_bo reference in a struct brw_bo. Ownership
> + * of that reference is transferred to the struct brw_bo.
> + */
> +struct brw_bo *brw_bo_import(struct brw_batch *batch,
> +			     drm_intel_bo *base,
> +			     bool borrow)
> +{
> +   struct brw_bo *bo;
> +   uint32_t tiling, swizzling;
> +
> +   if (unlikely(base == NULL))
> +      return NULL;
> +
> +   assert(base->handle);
> +   assert(base->size);
> +
> +   if (borrow) {
> +      bo = __brw_batch_lookup_handle(batch, base->handle);
> +      if (bo) {
> +         bo->refcnt++;
> +         return bo;
> +      }
> +   }
> +
> +   if (batch->freed_bo) {
> +      bo = batch->freed_bo;
> +      batch->freed_bo = (struct brw_bo *)bo->base;
> +   } else {
> +      bo = malloc(sizeof(*bo));
> +      if (unlikely(bo == NULL))
> +         return NULL;
> +   }
> +
> +   memset(bo, 0, sizeof(*bo));
> +
> +   bo->handle = base->handle;
> +   bo->batch = batch;
> +   bo->refcnt = 1;
> +   bo->offset = base->offset64;
> +   bo->alignment = base->align;
> +   bo->size = base->size;
> +
> +   drm_intel_bo_get_tiling(base, &tiling, &swizzling);
> +   bo->tiling = tiling;
> +   bo->swizzling = swizzling;
> +   bo->reusable = !borrow;
> +   bo->cache_coherent = batch->has_llc; /* XXX libdrm bookkeeping */
> +
> +   batch->vmsize += bo->size;
> +
> +   list_inithead(&bo->read.link);
> +   list_inithead(&bo->write.link);
> +
> +   bo->base = base;
> +   if (borrow) {
> +      list_add(&bo->link, borrowed(batch, bo->handle));
> +      drm_intel_bo_reference(base);
> +      if (bo->cache_coherent)
> +         bo->cache_coherent = !__is_uncached(batch->fd, bo->handle);
> +   } else {
> +      list_add(&bo->link, &batch->inactive);
> +      /* If the buffer hasn't been used on the GPU before, presume it is a
> +       * new buffer in the CPU write domain. (A buffer may have been mapped
> +       * and then left unused, but that should be rare compared to the
> +       * optimisation opportunity of making the first write through the CPU.)
> +       */
> +      if (bo->offset == 0)
> +         __brw_bo_set_domain(bo, I915_GEM_DOMAIN_CPU, true);
> +   }
> +
> +   return bo;
> +}
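
To make the ownership rule concrete, a sketch of the two import modes
(mirroring what brw_bo_create() and brw_bo_create_from_name() do below):

    /* borrow=false: our reference is transferred into the brw_bo */
    struct brw_bo *bo = brw_bo_import(batch, base, false);
    if (!bo)
       drm_intel_bo_unreference(base); /* only on failure */

    /* borrow=true: an extra reference is taken, we keep our own */
    bo = brw_bo_import(batch, base, true);
    drm_intel_bo_unreference(base);
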
> +
> +/*
> + * Search the list of active buffers (a local short lived cache) for
> + * something of the right size to reuse for the allocation request.
> + */
> +static struct brw_bo *__brw_bo_create__cached(struct brw_batch *batch,
> +					      uint64_t size)
> +{
> +   list_for_each_entry(struct brw_bo, bo, &batch->active, link) {
> +      assert(bo->batch == batch);
> +      assert(bo->read.rq != NULL);
> +
> +      /* Only reuse a buffer that is no more than 4/3 the requested size */
> +      if (bo->size < size || 3*bo->size > 4*size)
> +	 continue;
> +
> +      list_move(&bo->link, &batch->inactive);
> +      bo->refcnt++;
> +      return bo;
> +   }
> +
> +   return NULL;
> +}
> +
> +struct brw_bo *brw_bo_create(struct brw_batch *batch,
> +			     const char *name,
> +			     uint64_t size,
> +			     uint64_t alignment,
> +			     unsigned flags)
> +{
> +   drm_intel_bo *base;
> +   struct brw_bo *bo;
> +
> +   if (flags & BO_ALLOC_FOR_RENDER) {
> +      bo = __brw_bo_create__cached(batch, size);
> +      if (bo) {
> +	 /* XXX rename */
> +	 uint32_t none = I915_TILING_NONE;
> +	 bo->alignment = alignment;
> +	 drm_intel_bo_set_tiling(bo->base, &none, 0); /* takes a pointer */
> +	 bo->tiling = none;
> +	 return bo;
> +      }
> +   }
> +
> +   base = drm_intel_bo_alloc(batch->bufmgr, name, size, alignment);
> +   if (unlikely(base == NULL))
> +      return NULL;
> +
> +   bo = brw_bo_import(batch, base, false);
> +   if (unlikely(bo == NULL)) {
> +      drm_intel_bo_unreference(base);
> +      return NULL;
> +   }
> +
> +   return bo;
> +}
> +
> +static uint64_t brw_surface_size(int cpp,
> +				 uint32_t width,
> +				 uint32_t height,
> +				 uint32_t tiling,
> +				 uint32_t *pitch)
> +{
> +   uint32_t tile_width, tile_height;
> +
> +   switch (tiling) {
> +   default:
> +   case I915_TILING_NONE:
> +      tile_width = 64;
> +      tile_height = 2;
> +      break;
> +   case I915_TILING_X:
> +      tile_width = 512;
> +      tile_height = 8;
> +      break;
> +   case I915_TILING_Y:
> +      tile_width = 128;
> +      tile_height = 32;
> +      break;
> +   }
> +
> +   *pitch = ALIGN(width * cpp, tile_width);
> +   height = ALIGN(height, tile_height);
> +   height *= *pitch;
> +   return ALIGN(height, 4096);
> +}
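
A worked example of the size computation, assuming 32bpp and X-tiling:

    uint32_t pitch;
    uint64_t size = brw_surface_size(4, 1920, 1080, I915_TILING_X, &pitch);
    /* pitch  = ALIGN(1920 * 4, 512)     = 7680
     * height = ALIGN(1080, 8)           = 1080
     * size   = ALIGN(7680 * 1080, 4096) = 8294400 bytes (~7.9 MiB)
     */
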
> +
> +struct brw_bo *
> +brw_bo_create_tiled(struct brw_batch *batch,
> +		    const char *name,
> +		    uint32_t width,
> +		    uint32_t height,
> +		    int cpp,
> +		    uint32_t *tiling,
> +		    uint32_t *pitch,
> +		    unsigned flags)
> +{
> +   unsigned long __pitch;
> +   drm_intel_bo *base;
> +   struct brw_bo *bo;
> +
> +   if (flags & BO_ALLOC_FOR_RENDER) {
> +      uint64_t size = brw_surface_size(cpp, width, height, *tiling, pitch);
> +
> +      bo = __brw_bo_create__cached(batch, size);
> +      if (bo) {
> +	 /* XXX rename */
> +	 bo->alignment = 0;
> +         drm_intel_bo_set_tiling(bo->base, tiling, *pitch);
> +         bo->tiling = *tiling;
> +	 return bo;
> +      }
> +   }
> +
> +   base = drm_intel_bo_alloc_tiled(batch->bufmgr, name,
> +                                   width, height, cpp,
> +                                   tiling, &__pitch, flags);
> +   if (unlikely(base == NULL))
> +      return NULL;
> +
> +   *pitch = __pitch;
> +   bo = brw_bo_import(batch, base, false);
> +   if (unlikely(bo == NULL)) {
> +      drm_intel_bo_unreference(base);
> +      return NULL;
> +   }
> +
> +   return bo;
> +}
> +
> +/*
> + * Import a foreign buffer from another process using the global
> + * (flinked) name.
> + */
> +struct brw_bo *brw_bo_create_from_name(struct brw_batch *batch,
> +				       const char *name,
> +				       uint32_t global_name)
> +{
> +   drm_intel_bo *base;
> +   struct brw_bo *bo;
> +
> +   base = drm_intel_bo_gem_create_from_name(batch->bufmgr, name, global_name);
> +   if (unlikely(base == NULL))
> +      return NULL;
> +
> +   bo = brw_bo_import(batch, base, true);
> +   drm_intel_bo_unreference(base);
> +
> +   return bo;
> +}
> +
> +bool brw_batch_create_fence(struct brw_batch *batch,
> +                            struct brw_fence *fence)
> +{
> +   brw_batch_flush(batch);
> +
> +   if (list_empty(&batch->requests[batch->ring]))
> +      return false;
> +
> +   fence->rq = list_first_entry(&batch->requests[batch->ring],
> +                                struct brw_request, link);
> +   list_add(&fence->link, &fence->rq->fence);
> +   return true;
> +}
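
For reference, the intended fence usage looks like this (a sketch; the
signal callback is a caller-supplied hook, assumed here to be invoked when
the request retires):

    struct brw_fence fence = { .signal = on_signal }; /* hypothetical cb */
    if (brw_batch_create_fence(&brw->batch, &fence)) {
       brw_fence_wait(&fence, -1); /* negative timeout waits forever */
       brw_fence_destroy(&fence);
    }
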
> +
> +int brw_fence_wait(struct brw_fence *fence, int64_t timeout)
> +{
> +   if (fence->rq == NULL)
> +      return 0;
> +
> +   return __brw_bo_wait(fence->rq->bo, timeout);
> +}
> +
> +void brw_fence_destroy(struct brw_fence *fence)
> +{
> +   if (fence->rq == NULL)
> +      return;
> +
> +   list_del(&fence->link);
> +   fence->rq = NULL;
> +}
> +
> +/*
> + * Write a portion of the *linear* buffer using the pointer provided.
> + *
> + * This is conceptually equivalent to calling
> + *   memcpy(brw_bo_map(MAP_WRITE | MAP_DETILED | flags) + offset, data, size)
> + * but can be much more efficient as it will try to avoid cache domain
> + * side-effects (if any).
> + */
> +void brw_bo_write(struct brw_bo *bo,
> +		  uint64_t offset,
> +		  const void *data,
> +		  uint64_t length,
> +		  unsigned flags)
> +{
> +   struct drm_i915_gem_pwrite pwrite;
> +   void *map;
> +
> +   assert(offset < bo->size);
> +   assert(length <= bo->size - offset);
> +
> +   map = brw_bo_map(bo, MAP_WRITE | MAP_DETILED | flags);
> +   if (map) {
> +      memcpy(map + offset, data, length);
> +      return;
> +   }
> +
> +   memset(&pwrite, 0, sizeof(pwrite));
> +   pwrite.handle = bo->handle;
> +   pwrite.offset = offset;
> +   pwrite.size = length;
> +   pwrite.data_ptr = (uintptr_t)data;
> +   if (drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite))
> +      return;
> +
> +   if (bo->read.rq)
> +      __brw_request_retire(bo->read.rq);
> +
> +   assert(bo->refcnt);
> +   bo->domain = DOMAIN_GTT;
> +}
> +
> +/*
> + * Read a portion of the *linear* buffer into the pointer provided.
> + *
> + * This is conceptually equivalent to calling
> + *   memcpy(data, brw_bo_map(MAP_READ | MAP_DETILED | flags) + offset, size)
> + * but can be much more efficient as it will try to avoid cache domain
> + * side-effects (if any).
> + */
> +void brw_bo_read(struct brw_bo *bo,
> +		 uint64_t offset,
> +		 void *data,
> +		 uint64_t length,
> +		 unsigned flags)
> +{
> +   struct drm_i915_gem_pread pread;
> +   void *map;
> +
> +   assert(offset < bo->size);
> +   assert(length <= bo->size - offset);
> +
> +   if (bo->cache_coherent) {
> +      map = brw_bo_map(bo, MAP_READ | MAP_DETILED | flags);
> +      if (map) {
> +         memcpy(data, map + offset, length);
> +         return;
> +      }
> +   } else {
> +      if ((flags & MAP_ASYNC) == 0) {
> +         struct brw_request *rq = bo->write.rq;
> +         if (rq && rq->bo->exec)
> +            brw_batch_flush(bo->batch);
> +      }
> +   }
> +
> +   memset(&pread, 0, sizeof(pread));
> +   pread.handle = bo->handle;
> +   pread.offset = offset;
> +   pread.size = length;
> +   pread.data_ptr = (uintptr_t)data;
> +   if (drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_PREAD, &pread))
> +      return;
> +
> +   if (bo->write.rq)
> +      __brw_request_retire(bo->write.rq);
> +
> +   assert(bo->refcnt);
> +   if (bo->domain != DOMAIN_CPU)
> +      bo->domain = DOMAIN_NONE;
> +}
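
Together these two give a pread/pwrite-style transfer interface; e.g. a
round-trip sketch on a linear buffer:

    uint32_t in = 0xdeadbeef, out = 0;
    brw_bo_write(bo, 0, &in, sizeof(in), 0);
    brw_bo_read(bo, 0, &out, sizeof(out), 0); /* serialises against the GPU */
    assert(in == out);
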
> +
> +/*
> + * Provide a WC mmapping of the buffer. Coherent everywhere, but
> + * reads are very slow (as they are uncached). Fenced, so automatically
> + * detiled by hardware and constrained to fit in the aperture.
> + */
> +static void *brw_bo_map__gtt(struct brw_bo *bo, unsigned flags)
> +{
> +   if (flags & MAP_DETILED && bo->tiling)
> +      return NULL;
> +
> +   if (bo->map__gtt == NULL)
> +      bo->map__gtt = drm_intel_gem_bo_map__gtt(bo->base);
> +
> +   if ((flags & MAP_ASYNC) == 0)
> +      __brw_bo_set_domain(bo, I915_GEM_DOMAIN_GTT, flags & MAP_WRITE);
> +
> +   return bo->map__gtt;
> +}
> +
> +/*
> + * Provide a WC mmapping of the buffer. Coherent everywhere, but
> + * reads are very slow (as they are uncached). Unfenced, not
> + * constrained by the mappable aperture.
> + */
> +static void *brw_bo_map__wc(struct brw_bo *bo, unsigned flags)
> +{
> +   if (!bo->batch->has_mmap_wc)
> +      return brw_bo_map__gtt(bo, flags);
> +
> +   if (bo->map__wc == NULL) {
> +      bo->map__wc = drm_intel_gem_bo_map__wc(bo->base);
> +      if (bo->map__wc == NULL) {
> +         bo->batch->has_mmap_wc = false;
> +         return brw_bo_map__gtt(bo, flags);
> +      }
> +   }
> +   assert(bo->map__wc);
> +
> +   if ((flags & MAP_ASYNC) == 0)
> +      __brw_bo_set_domain(bo, I915_GEM_DOMAIN_GTT, flags & MAP_WRITE);
> +
> +   return bo->map__wc;
> +}
> +
> +/*
> + * Provide a WB mmapping of the buffer. Incoherent on non-LLC platforms
> + * and will trigger clflushes of the entire buffer. Unfenced, not
> + * constrained by the mappable aperture.
> + */
> +static void *brw_bo_map__cpu(struct brw_bo *bo, unsigned flags)
> +{
> +   if (bo->map__cpu == NULL)
> +      bo->map__cpu = drm_intel_gem_bo_map__cpu(bo->base);
> +   assert(bo->map__cpu);
> +
> +   if ((flags & MAP_ASYNC) == 0)
> +      __brw_bo_set_domain(bo, I915_GEM_DOMAIN_CPU, flags & MAP_WRITE);
> +
> +   return bo->map__cpu;
> +}
> +
> +static bool can_map__cpu(struct brw_bo *bo, unsigned flags)
> +{
> +   if (bo->cache_coherent)
> +      return true;
> +
> +   if (flags & MAP_COHERENT)
> +      return false;
> +
> +   if (bo->domain == DOMAIN_CPU)
> +      return true;
> +
> +   return (flags & MAP_WRITE) == 0;
> +}
> +
> +/*
> + * Map the buffer for access by the CPU, either for writing or reading,
> + * and return a pointer for that access.
> + *
> + * If the async flag is not set, any previous writing by the GPU is
> + * waited upon, and if write access is required all GPU reads as well.
> + *
> + * If the async flag is set, the kernel is not informed of the access
> + * and the access may be concurrent with GPU access. Also importantly,
> + * cache domain tracking for the buffer is *not* maintained and so access
> + * modes are limited to coherent modes (taking into account the current
> + * cache domain).
> + *
> + * If the detiled flag is set, the caller will perform manual detiling
> + * through the mapping, and so we do not allocate a fence for the operation.
> + * This can return NULL on failure, for example if the kernel doesn't support
> + * such an operation.
> + *
> + * The method for mapping the buffer is chosen based on the hardware
> + * architecture (LLC has fast coherent reads and writes, non-LLC has fast
> + * coherent writes, slow coherent reads but faster incoherent reads)
> + * and mode of operation. In theory, for every desired access mode, the
> + * returned pointer gives the fastest direct CPU access to the buffer.
> + * However, direct CPU access to this buffer may not always be the fastest
> + * way for the CPU to get at the data held within it!
> + *
> + * Returns NULL on error.
> + */
> +void *brw_bo_map(struct brw_bo *bo, unsigned flags)
> +{
> +   assert(bo->refcnt);
> +
> +   if ((flags & MAP_ASYNC) == 0) {
> +      struct brw_request *rq;
> +
> +      rq = flags & MAP_WRITE ? bo->read.rq : bo->write.rq;
> +      if (rq && RQ_BO(rq)->dirty)
> +         brw_batch_flush(bo->batch);
> +   }
> +
> +   if (bo->tiling && (flags & MAP_DETILED) == 0)
> +      return brw_bo_map__gtt(bo, flags);
> +   else if (can_map__cpu(bo, flags))
> +      return brw_bo_map__cpu(bo, flags);
> +   else
> +      return brw_bo_map__wc(bo, flags);
> +}
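
To illustrate the flag semantics described above (a sketch, not from the
patch):

    /* Synchronous write: waits for all GPU access, updates domain tracking */
    uint32_t *map = brw_bo_map(bo, MAP_WRITE);

    /* Asynchronous write: no stall and no domain tracking, so only valid
     * through a mapping that is already coherent for this buffer */
    uint32_t *async = brw_bo_map(bo, MAP_WRITE | MAP_ASYNC);
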
> +
> +/*
> + * After the final reference to a bo is released, free the buffer.
> + *
> + * If the buffer is still active, and it is reusable, the buffer is
> + * transferred to the local active cache and may be reallocated on the
> + * next call to brw_bo_create() or brw_bo_create_tiled(). Otherwise the
> + * buffer is returned back to the shared screen bufmgr pool.
> + */
> +void  __brw_bo_free(struct brw_bo *bo)
> +{
> +   struct brw_batch *batch;
> +
> +   assert(bo->refcnt == 0);
> +
> +   if (bo->read.rq) {
> +      assert(bo->batch);
> +      if (bo->reusable)
> +         list_move(&bo->link, &bo->batch->active);
> +      return;
> +   }
> +
> +   assert(!bo->write.rq);
> +   list_del(&bo->link);
> +
> +   if (bo->offset)
> +      bo->base->offset64 = bo->offset;
> +   drm_intel_bo_unreference(bo->base);
> +
> +   batch = bo->batch;
> +   if (batch == NULL) {
> +      free(bo);
> +      return;
> +   }
> +
> +   batch->vmsize -= bo->size;
> +   if (batch->vmsize < batch->peak_rss)
> +      batch->peak_rss = batch->vmsize;
> +
> +   bo->base = (drm_intel_bo *)batch->freed_bo;
> +   batch->freed_bo = bo;
> +}
> +
> +/*
> + * Mark the beginning of a batch construction critical section, during which
> + * the batch is not allowed to be flushed. Access to the batch prior to this
> + * call is invalid. Access after this call but with instructions for another
> + * ring is also invalid. All batch emission (BEGIN_BATCH/OUT_BATCH) must be
> + * inside a brw_batch_begin()/brw_batch_end() pairing - the exceptions to
> + * this rule are the brw_start_batch() and brw_finish_batch() callbacks.
> + *
> + * Control returns to the caller of brw_batch_begin() if an error is
> + * encountered whilst inside the critical section. If the return code
> + * is negative, a fatal error occurred. If the return code is positive,
> + * the batch had to be flushed and the critical section needs to be restarted.
> + *
> + * On success 0 is returned.
> + *
> + * Must be paired with brw_batch_end().
> + */
> +int brw_batch_begin(struct brw_batch *batch,
> +		    uint32_t bytes,
> +		    enum brw_gpu_ring ring)
> +{
> +   uint16_t space;
> +
> +   if (unlikely(batch->next_request == NULL))
> +      return -ENOMEM;
> +
> +   ring = batch->actual_ring[ring];
> +   if (ring != batch->ring)
> +      space = 0;
> +   else
> +      space = batch->state - batch->reserved - batch->emit.nbatch;
> +   if (unlikely(bytes/4 > space)) {
> +      int ret = brw_batch_flush(batch);
> +      if (ret)
> +         return ret;
> +   }
> +
> +   batch->ring = ring;
> +   batch->bo->dirty = true;
> +
> +   if (batch->emit.nbatch == 0)
> +      /* An early allocation error should be impossible */
> +      brw_start_batch(batch);
> +
> +   assert(batch->ring == ring);
> +   batch->saved = batch->emit;
> +   return setjmp(batch->jmpbuf);
> +}
> +
> +/*
> + * Mark the end of a batch construction critical section. After this call
> + * the batch is inaccessible until the next brw_batch_begin().
> + *
> + * We may flush the batch to hardware if it exceeds the aperture
> + * high water mark. If the batch submission fails, we rollback to the
> + * end of the previous critical section and try flushing again. If that
> + * should fail, we report the error back to the caller. If the rollback
> + * succeeds, we jump back to the brw_batch_begin() with a fresh request
> + * and run through the critical section again.
> + *
> + * Returns 0 on success and no errors have occurred.
> + *
> + * Must be paired with brw_batch_begin().
> + */
> +int brw_batch_end(struct brw_batch *batch)
> +{
> +   int ret;
> +
> +   if (batch->aperture < batch->max_aperture)
> +      return 0;
> +
> +   ret = brw_batch_flush(batch);
> +   if (likely(ret == 0))
> +      return 0;
> +
> +   if (batch->saved.nbatch == batch->emit.nbatch)
> +      return ret;
> +
> +   batch->emit = batch->saved;
> +
> +   ret = brw_batch_flush(batch);
> +   if (ret)
> +      return ret;
> +
> +   longjmp(batch->jmpbuf, 1);
> +}
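
From the caller's side the setjmp/longjmp dance reduces to the pattern
already used by brw_dispatch_compute() below (emit_state/emit_primitive
are stand-ins for the real emitters):

    int ret = brw_batch_begin(&brw->batch, 1024, RENDER_RING);
    if (ret < 0)
       return ret; /* fatal, nothing was emitted */

    /* re-entered here (begin returns > 0) if end had to roll back */
    emit_state(brw);
    emit_primitive(brw);

    ret = brw_batch_end(&brw->batch);
    if (ret)
       return ret; /* flush failed even after rollback */
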
> +
> +/*
> + * How much of the batch is used, both by the commands emitted at the
> + * beginning of the batch and by the indirect state allocated from the end?
> + */
> +inline static int __brw_batch_count(struct brw_batch *batch)
> +{
> +   return batch->emit.nbatch + BATCH_SIZE/4 - batch->state;
> +}
> +
> +/*
> + * After a high-level draw command, check to see if we want to flush
> + * the batch to the hardware for either debug reasons or for sanity.
> + */
> +int brw_batch_maybe_flush(struct brw_batch *batch)
> +{
> +   if (batch->always_flush)
> +      goto flush;
> +
> +   /* If the working set exceeds the GTT's limits, we will need to evict
> +    * textures in order to execute batches. As we have no method for predicting
> +    * when we need to evict, we need to frequently flush the batch so that any
> +    * stalls are minimised.
> +    */
> +   if (batch->peak_rss > batch->max_aperture && __brw_batch_count(batch) > 2048)
> +      goto flush;
> +
> +   return 0;
> +
> +flush:
> +   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
> +      fprintf(stderr, "Forcing batchbuffer flush after %d: debug.always_flush?=%d, rss=%d [cap %d], vmasize=%d\n",
> +	      batch->emit.nbatch,
> +	      batch->always_flush,
> +              (int)(batch->peak_rss >> 20), (int)(batch->max_aperture >> 20),
> +	      (int)(batch->vmsize >> 20));
> +   }
> +   return brw_batch_flush(batch);
> +}
> +
> +/*
> + * Query the kernel for the number of times our hardware context has
> + * been implicated in a reset event - either guilty or just a victim,
> + * and the number of resets that have occurred overall.
> + */
> +int brw_batch_get_reset_stats(struct brw_batch *batch,
> +			      uint32_t *reset_count,
> +			      uint32_t *active,
> +			      uint32_t *pending)
> +{
> +   struct drm_i915_reset_stats stats;
> +
> +   if (!batch->hw_ctx)
> +      return -ENODEV;
> +
> +   memset(&stats, 0, sizeof(stats));
> +   stats.ctx_id = batch->hw_ctx;
> +   if (drmIoctl(batch->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
> +      return -errno;
> +
> +   *reset_count = stats.reset_count;
> +   *active = stats.batch_active;
> +   *pending = stats.batch_pending;
> +   return 0;
> +}
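
This maps directly onto GL_ARB_robustness reporting; a sketch of the
expected consumer (the report() helper is illustrative only):

    uint32_t reset_count, active, pending;
    if (brw_batch_get_reset_stats(&brw->batch,
                                  &reset_count, &active, &pending) == 0) {
       if (active)
          report(GL_GUILTY_CONTEXT_RESET_ARB);   /* we caused a hang */
       else if (pending)
          report(GL_INNOCENT_CONTEXT_RESET_ARB); /* we were a victim */
    }
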
> +
> +/*
> + * Mark the buffers as being invalid to prevent stale dereferences when
> + * tearing down shared resources.
> + */
> +static void __brw_bo_list_fini(struct list_head *list)
> +{
> +   while (!list_empty(list)) {
> +      struct brw_bo *bo = list_first_entry(list, struct brw_bo, link);
> +
> +      assert(bo->batch);
> +      assert(bo->read.rq == NULL);
> +
> +      bo->batch = NULL;
> +      list_delinit(&bo->link);
> +   }
> +}
> +
> +/* Normally we never free a request as they get recycled between batches -
> + * except when we tear down the batch manager and need to free everything.
> + */
> + */
> +static void __brw_request_free(struct brw_request *rq)
> +{
> +   /* Opencode the free(bo) here to handle batch->next_request */
> +   assert(RQ_BO(rq) == rq->bo);
> +   list_delinit(&rq->bo->link);
> +   free(rq->bo);
> +   free(rq);
> +}
> +
> +/*
> + * Teardown the batch manager and free all associated memory and resources.
> + */
> +void brw_batch_fini(struct brw_batch *batch)
> +{
> +   int n;
> +
> +   /* All bos should have been released before the destructor is called */
> +   batch->fini = true;
> +
> +   drm_intel_aub_enable(&batch->aub, -1, 0);
> +
> +   for (n = 0; n < __BRW_NUM_RINGS; n++) {
> +      struct brw_request *rq;
> +
> +      if (list_empty(&batch->requests[n]))
> +         continue;
> +
> +      /* Note that the request and buffers are not truly idle here. It is
> +       * safe as the kernel will keep a reference whilst the buffers are
> +       * active (so we can shut down ahead of time), but we need to disable
> +       * our runtime assertions that the request is idle at the time of
> +       * retiring.
> +       */
> +      rq = list_first_entry(&batch->requests[n], struct brw_request, link);
> +      __brw_request_retire(rq);
> +
> +      assert(list_empty(&batch->requests[n]));
> +   }
> +
> +   while (batch->freed_rq) {
> +      struct brw_request *rq = batch->freed_rq;
> +      batch->freed_rq = (struct brw_request *)rq->link.next;
> +      __brw_request_free(rq);
> +   }
> +   __brw_request_free(batch->next_request);
> +
> +   assert(list_empty(&batch->active));
> +   for (n = 0; n < 1 << BORROWED_BITS; n++)
> +      __brw_bo_list_fini(&batch->borrowed[n]);
> +   __brw_bo_list_fini(&batch->inactive);
> +
> +   while (batch->freed_bo) {
> +      struct brw_bo *bo = batch->freed_bo;
> +      batch->freed_bo = (struct brw_bo *)bo->base;
> +      free(bo);
> +   }
> +
> +   free(batch->exec);
> +   free(batch->reloc);
> +
> +   if (batch->hw_ctx) {
> +      struct drm_i915_gem_context_destroy destroy;
> +
> +      memset(&destroy, 0, sizeof(destroy));
> +      destroy.ctx_id = batch->hw_ctx;
> +      drmIoctl(batch->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy);
> +   }
> +}
> diff --git a/src/mesa/drivers/dri/i965/brw_batch.h b/src/mesa/drivers/dri/i965/brw_batch.h
> new file mode 100644
> index 0000000..0b5468b
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_batch.h
> @@ -0,0 +1,377 @@
> +#ifndef BRW_BATCH_H
> +#define BRW_BATCH_H
> +
> +#include <stdbool.h>
> +#include <stdint.h>
> +#include <string.h>
> +#include <setjmp.h>
> +#include <assert.h>
> +
> +#include <intel_aub.h>
> +#include <intel_bufmgr.h>
> +
> +#include "util/list.h"
> +
> +#ifdef __GNUC__
> +#define HAS_GCC(major, minor) \
> +   (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
> +#else
> +#define HAS_GCC(major, minor) 0
> +#endif
> +
> +#if HAS_GCC(3, 4)
> +#define must_check  __attribute__((warn_unused_result))
> +#else
> +#define must_check
> +#endif
> +
> +struct _drm_intel_bufmgr;
> +struct _drm_intel_bo;
> +
> +struct intel_screen;
> +
> +enum brw_gpu_ring {
> +   RENDER_RING = 0,
> +   BLT_RING,
> +   __BRW_NUM_RINGS,
> +};
> +
> +struct brw_batch;
> +struct brw_bo;
> +struct brw_request;
> +
> +enum brw_bo_domain { DOMAIN_NONE, DOMAIN_CPU, DOMAIN_GTT, DOMAIN_GPU };
> +
> +struct brw_bo {
> +   struct brw_batch *batch;
> +   struct drm_i915_gem_exec_object2 *exec;
> +   struct brw_request_node {
> +      struct brw_request *rq;
> +      struct list_head link;
> +   } read, write;
> +
> +   unsigned dirty : 1;
> +   unsigned domain : 2;
> +   unsigned tiling : 4;
> +   unsigned swizzling : 4;
> +   unsigned pinned : 1;
> +   unsigned cache_coherent : 1;
> +   unsigned reusable : 1;
> +
> +   unsigned refcnt;
> +   uint32_t handle;
> +   uint32_t target_handle;
> +   uint64_t size;
> +   uint64_t alignment;
> +   uint64_t offset;
> +
> +   struct _drm_intel_bo *base;
> +   struct list_head link;
> +
> +   void *map__cpu;
> +   void *map__gtt;
> +   void *map__wc;
> +};
> +
> +struct brw_batch {
> +   int fd;
> +
> +   struct brw_bo *bo;
> +   uint32_t *map;
> +
> +   uint32_t batch_flags;
> +   uint32_t batch_base_flags;
> +
> +   enum brw_gpu_ring ring;
> +   uint32_t hw_ctx;
> +
> +   uint16_t reserved;
> +   uint16_t state;
> +
> +   struct brw_batch_state {
> +      uint16_t nbatch;
> +      uint16_t nexec;
> +      uint16_t nreloc;
> +      uint16_t nself;
> +   } emit, saved;
> +
> +   uint64_t aperture;
> +   uint64_t max_aperture;
> +   uint64_t rss, peak_rss, vmsize;
> +
> +   bool has_softpin : 1;
> +   bool has_llc : 1;
> +   bool has_mmap_wc : 1;
> +   bool needs_pipecontrol_ggtt_wa : 1;
> +
> +   bool always_flush : 1;
> +
> +   /** Framerate throttling: @{ */
> +   /* Limit the number of outstanding SwapBuffers by waiting for an earlier
> +    * frame of rendering to complete. This gives a very precise cap to the
> +    * latency between input and output such that rendering never gets more
> +    * than a frame behind the user. (With the caveat that we technically are
> +    * not using the SwapBuffers itself as a barrier but the first batch
> +    * submitted afterwards, which may be immediately prior to the next
> +    * SwapBuffers.)
> +    */
> +   bool need_swap_throttle : 1;
> +
> +   /** General throttling, not caught by throttling between SwapBuffers */
> +   bool need_flush_throttle : 1;
> +   bool disable_throttling : 1;
> +   /** @} */
> +
> +   bool no_hw : 1;
> +   bool fini : 1;
> +
> +   jmp_buf jmpbuf;
> +
> +   uint16_t exec_size;
> +   uint16_t reloc_size;
> +
> +   struct drm_i915_gem_exec_object2 *exec;
> +   struct drm_i915_gem_relocation_entry *reloc;
> +   uint16_t self_reloc[256];
> +
> +   int actual_ring[__BRW_NUM_RINGS];
> +   struct list_head requests[__BRW_NUM_RINGS];
> +   struct brw_request *throttle;
> +   struct brw_request *next_request;
> +   struct brw_request *freed_rq;
> +
> +   double idle_time[__BRW_NUM_RINGS];
> +
> +   struct intel_screen *screen;
> +   struct _drm_intel_bufmgr *bufmgr;
> +   struct list_head active, inactive;
> +
> +#define BORROWED_BITS 3
> +   struct list_head borrowed[1<<BORROWED_BITS];
> +
> +   struct brw_bo *freed_bo;
> +
> +   drm_intel_aub aub;
> +};
> +
> +int brw_batch_init(struct brw_batch *batch,
> +		   struct intel_screen *screen);
> +
> +void brw_batch_clear_dirty(struct brw_batch *batch);
> +
> +/** Add a relocation entry to the current batch
> + * XXX worth specialising 32bit variant?
> + */
> +uint64_t __brw_batch_reloc(struct brw_batch *batch,
> +                           uint32_t batch_offset,
> +                           struct brw_bo *target_bo,
> +                           uint64_t target_offset,
> +                           unsigned read_domains,
> +                           unsigned write_domain);
> +must_check static inline uint64_t brw_batch_reloc(struct brw_batch *batch,
> +						  uint32_t batch_offset,
> +						  struct brw_bo *target_bo,
> +						  uint64_t target_offset,
> +						  unsigned read_domains,
> +						  unsigned write_domain)
> +{
> +   if (target_bo == NULL)
> +      return target_offset;
> +
> +   return __brw_batch_reloc(batch, batch_offset,
> +                            target_bo, target_offset,
> +                            read_domains, write_domain);
> +}
> +
> +int brw_batch_get_reset_stats(struct brw_batch *batch,
> +			      uint32_t *reset_count,
> +			      uint32_t *active,
> +			      uint32_t *pending);
> +
> +bool brw_batch_busy(struct brw_batch *batch);
> +/** Wait for the last submitted rendering to complete */
> +void brw_batch_wait(struct brw_batch *batch);
> +
> +void brw_batch_fini(struct brw_batch *batch);
> +
> +/* Wrap a drm_intel_bo within a local struct brw_bo */
> +struct brw_bo *
> +brw_bo_import(struct brw_batch *batch,
> +	      struct _drm_intel_bo *base,
> +	      bool borrow);
> +
> +/* Create a local brw_bo for a linear/unfenced buffer and allocate the buffer */
> +struct brw_bo *
> +brw_bo_create(struct brw_batch *batch,
> +	      const char *name,
> +	      uint64_t size,
> +	      uint64_t alignment,
> +	      unsigned flags);
> +
> +/* Create a local brw_bo for a tiled buffer and allocate the buffer */
> +struct brw_bo *
> +brw_bo_create_tiled(struct brw_batch *batch,
> +		    const char *name,
> +		    uint32_t width,
> +		    uint32_t height,
> +		    int cpp,
> +		    uint32_t *tiling,
> +		    uint32_t *pitch,
> +		    unsigned flags);
> +
> +/* Create a local brw_bo for a foreign buffer using its global flinked name */
> +struct brw_bo *brw_bo_create_from_name(struct brw_batch *batch,
> +				       const char *name,
> +				       uint32_t global_name);
> +
> +void brw_bo_mark_dirty(struct brw_batch *batch, struct brw_bo *bo);
> +
> +inline static int brw_bo_madvise(struct brw_bo *bo, int state)
> +{
> +   return drm_intel_bo_madvise(bo->base, state);
> +}
> +
> +inline static uint32_t brw_bo_flink(struct brw_bo *bo)
> +{
> +   uint32_t name = 0;
> +   drm_intel_bo_flink(bo->base, &name);
> +   return name;
> +}
> +
> +int brw_bo_wait(struct brw_bo *bo, int64_t timeout);
> +
> +void brw_bo_write(struct brw_bo *bo, uint64_t offset,
> +		  const void *data, uint64_t length,
> +		  unsigned flags);
> +void brw_bo_read(struct brw_bo *bo, uint64_t offset,
> +		 void *data, uint64_t length,
> +		 unsigned flags);
> +
> +#define BUSY_READ 0
> +#define BUSY_WRITE 1
> +#define BUSY_FLUSH 2
> +#define BUSY_RETIRE 4
> +
> +bool __brw_request_busy(struct brw_request *rq, unsigned flags);
> +static inline bool brw_bo_busy(struct brw_bo *bo, unsigned flags)
> +{
> +   struct brw_request *rq;
> +
> +   if (!bo)
> +      return false;
> +
> +   assert(bo->refcnt);
> +   rq = flags & BUSY_WRITE ? bo->read.rq : bo->write.rq;
> +   if (!rq) {
> +      assert(!bo->exec);
> +      return false;
> +   }
> +
> +   if (flags & (BUSY_FLUSH | BUSY_RETIRE))
> +      return __brw_request_busy(rq, flags);
> +
> +   return true;
> +}
> +
> +void *brw_bo_map(struct brw_bo *bo, unsigned flags);
> +#define MAP_READ 0x0
> +#define MAP_WRITE 0x1
> +#define MAP_ASYNC 0x2
> +#define MAP_COHERENT 0x4
> +#define MAP_DETILED 0x8
> +
> +/* Take a new reference to the brw_bo */
> +static inline struct brw_bo *brw_bo_get(struct brw_bo *bo)
> +{
> +   assert(bo != NULL && bo->refcnt > 0);
> +   bo->refcnt++;
> +   return bo;
> +}
> +
> +/* Release a reference to the brw_bo */
> +void  __brw_bo_free(struct brw_bo *bo);
> +static inline void brw_bo_put(struct brw_bo *bo)
> +{
> +   assert(bo == NULL || bo->refcnt > 0);
> +   if (bo && --bo->refcnt == 0)
> +      __brw_bo_free(bo);
> +}
> +
> +/* A fence is created at the current point on the ordered batch timeline.
> + * When the GPU passes that point the fence will be signalled, or you can
> + * wait for the fence to complete.
> + */
> +struct brw_fence {
> +   struct brw_request *rq;
> +   struct list_head link;
> +   void (*signal)(struct brw_fence *);
> +};
> +
> +bool
> +brw_batch_create_fence(struct brw_batch *batch,
> +                       struct brw_fence *fence);
> +
> +static inline bool
> +brw_fence_busy(struct brw_fence *fence)
> +{
> +   return fence->rq && __brw_request_busy(fence->rq, BUSY_FLUSH | BUSY_RETIRE);
> +}
> +
> +int brw_fence_wait(struct brw_fence *fence, int64_t timeout);
> +void brw_fence_destroy(struct brw_fence *fence);
> +
> +/* Control batch command insertion and submission to hw */
> +must_check int brw_batch_begin(struct brw_batch *batch,
> +			       uint32_t estimated_bytes,
> +			       enum brw_gpu_ring ring);
> +int brw_batch_end(struct brw_batch *batch);
> +int brw_batch_flush(struct brw_batch *batch);
> +int brw_batch_maybe_flush(struct brw_batch *batch);
> +
> +/* Interfaces for writing commands into the batch */
> +static inline void __brw_batch_check(struct brw_batch *batch, int count, enum brw_gpu_ring ring)
> +{
> +   assert(batch->emit.nbatch + count < batch->state - batch->reserved);
> +   assert(batch->ring == batch->actual_ring[ring]);
> +}
> +
> +static inline void brw_batch_emit(struct brw_batch *batch, uint32_t dw)
> +{
> +   batch->map[batch->emit.nbatch++] = dw;
> +}
> +
> +static inline void brw_batch_data(struct brw_batch *batch,
> +				  const void *data,
> +				  int bytes)
> +{
> +   assert(batch->emit.nbatch + bytes/4 < batch->state - batch->reserved);
> +   assert((bytes & 3) == 0);
> +   memcpy(batch->map + batch->emit.nbatch, data, bytes);
> +   batch->emit.nbatch += bytes / 4;
> +}
> +
> +static inline uint32_t float_as_int(float f)
> +{
> +   union {
> +      float f;
> +      uint32_t dw;
> +   } fi;
> +
> +   fi.f = f;
> +   return fi.dw;
> +}
> +
> +static inline void brw_batch_emit64(struct brw_batch *batch, uint64_t qw)
> +{
> +   *(uint64_t *)(batch->map + batch->emit.nbatch) = qw;
> +   batch->emit.nbatch += 2;
> +}
> +
> +#define BEGIN_BATCH(n) __brw_batch_check(&brw->batch, n, RENDER_RING)
> +#define BEGIN_BATCH_BLT(n) __brw_batch_check(&brw->batch, n, BLT_RING)
> +#define OUT_BATCH(dw) brw_batch_emit(&brw->batch, dw)
> +#define OUT_BATCH_F(f) brw_batch_emit(&brw->batch, float_as_int(f))
> +#define OUT_RELOC(bo, read_domains, write_domain, delta) \
> +	OUT_BATCH(brw_batch_reloc(&brw->batch, brw->batch.emit.nbatch*4, \
> +				  bo, delta, read_domains, write_domain))
> +#define OUT_BATCH64(qw) brw_batch_emit64(&brw->batch, qw)
> +#define OUT_RELOC64(bo, read_domains, write_domain, delta) \
> +	OUT_BATCH64(brw_batch_reloc(&brw->batch, brw->batch.emit.nbatch*4,\
> +				    bo, delta, read_domains, write_domain))
> +#define ADVANCE_BATCH()
> +
> +#endif /* BRW_BATCH_H */
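
The emission macros then behave like the old intel_batchbuffer ones, with
the extra rule that they must sit inside a brw_batch_begin()/_end()
critical section. A sketch of a three-dword command with one relocation
(the opcode is a placeholder):

    BEGIN_BATCH(3);
    OUT_BATCH(CMD_EXAMPLE | (3 - 2));            /* hypothetical opcode */
    OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, 0, 0); /* target address */
    OUT_BATCH(0);                                /* payload */
    ADVANCE_BATCH();
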
> diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c
> index 98ff0dd..697b4c7 100644
> --- a/src/mesa/drivers/dri/i965/brw_binding_tables.c
> +++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c
> @@ -42,7 +42,6 @@
>   #include "brw_context.h"
>   #include "brw_defines.h"
>   #include "brw_state.h"
> -#include "intel_batchbuffer.h"
>   
>   /**
>    * Upload a shader stage's binding table as indirect state.
> diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
> index 2ccfae1..e1a9f56 100644
> --- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
> @@ -22,7 +22,6 @@
>    */
>   
>   #include <errno.h>
> -#include "intel_batchbuffer.h"
>   #include "intel_fbo.h"
>   
>   #include "brw_blorp.h"
> @@ -211,7 +210,9 @@ brw_blorp_exec(struct brw_context *brw, const brw_blorp_params *params)
>   {
>      struct gl_context *ctx = &brw->ctx;
>      uint32_t estimated_max_batch_usage = 1500;
> -   bool check_aperture_failed_once = false;
> +
> +   if (brw_batch_begin(&brw->batch, estimated_max_batch_usage, RENDER_RING) < 0)
> +      return;
>   
>      /* Flush the sampler and render caches.  We definitely need to flush the
>       * sampler cache so that we get updated contents from the render cache for
> @@ -222,13 +223,6 @@ brw_blorp_exec(struct brw_context *brw, const brw_blorp_params *params)
>       */
>      brw_emit_mi_flush(brw);
>   
> -retry:
> -   intel_batchbuffer_require_space(brw, estimated_max_batch_usage, RENDER_RING);
> -   intel_batchbuffer_save_state(brw);
> -   drm_intel_bo *saved_bo = brw->batch.bo;
> -   uint32_t saved_used = brw->batch.used;
> -   uint32_t saved_state_batch_offset = brw->batch.state_batch_offset;
> -
>      switch (brw->gen) {
>      case 6:
>         gen6_blorp_exec(brw, params);
> @@ -241,37 +235,18 @@ retry:
>         unreachable("not reached");
>      }
>   
> -   /* Make sure we didn't wrap the batch unintentionally, and make sure we
> -    * reserved enough space that a wrap will never happen.
> -    */
> -   assert(brw->batch.bo == saved_bo);
> -   assert((brw->batch.used - saved_used) * 4 +
> -          (saved_state_batch_offset - brw->batch.state_batch_offset) <
> -          estimated_max_batch_usage);
> -   /* Shut up compiler warnings on release build */
> -   (void)saved_bo;
> -   (void)saved_used;
> -   (void)saved_state_batch_offset;
> +   brw_emit_mi_flush(brw);
>   
>      /* Check if the blorp op we just did would make our batch likely to fail to
>       * map all the BOs into the GPU at batch exec time later.  If so, flush the
>       * batch and try again with nothing else in the batch.
>       */
> -   if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
> -      if (!check_aperture_failed_once) {
> -         check_aperture_failed_once = true;
> -         intel_batchbuffer_reset_to_saved(brw);
> -         intel_batchbuffer_flush(brw);
> -         goto retry;
> -      } else {
> -         int ret = intel_batchbuffer_flush(brw);
> -         WARN_ONCE(ret == -ENOSPC,
> -                   "i965: blorp emit exceeded available aperture space\n");
> -      }
> +   if (brw_batch_end(&brw->batch)) {
> +      WARN_ONCE(1, "i965: blorp emit exceeded available aperture space\n");
> +      return;
>      }
>   
> -   if (unlikely(brw->always_flush_batch))
> -      intel_batchbuffer_flush(brw);
> +   brw_batch_maybe_flush(&brw->batch);
>   
>      /* We've smashed all state compared to what the normal 3D pipeline
>       * rendering tracks for GL.
> @@ -279,11 +254,6 @@ retry:
>      brw->ctx.NewDriverState = ~0ull;
>      brw->no_depth_or_stencil = false;
>      brw->ib.type = -1;
> -
> -   /* Flush the sampler cache so any texturing from the destination is
> -    * coherent.
> -    */
> -   brw_emit_mi_flush(brw);
>   }
>   
>   brw_hiz_op_params::brw_hiz_op_params(struct intel_mipmap_tree *mt,
> diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
> index 354c733..4f62b29 100644
> --- a/src/mesa/drivers/dri/i965/brw_cc.c
> +++ b/src/mesa/drivers/dri/i965/brw_cc.c
> @@ -36,7 +36,6 @@
>   #include "brw_util.h"
>   #include "main/macros.h"
>   #include "main/stencil.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   brw_upload_cc_vp(struct brw_context *brw)
> @@ -227,17 +226,14 @@ static void upload_cc_unit(struct brw_context *brw)
>         cc->cc5.statistics_enable = 1;
>   
>      /* BRW_NEW_CC_VP */
> -   cc->cc4.cc_viewport_state_offset = (brw->batch.bo->offset64 +
> -				       brw->cc.vp_offset) >> 5; /* reloc */
> +   cc->cc4.cc_viewport_state_offset =
> +      brw_batch_reloc(&brw->batch,
> +		      (brw->cc.state_offset +
> +		       offsetof(struct brw_cc_unit_state, cc4)),
> +		      brw->batch.bo, brw->cc.vp_offset,
> +		      I915_GEM_DOMAIN_INSTRUCTION, 0) >> 5;
>   
>      brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
> -
> -   /* Emit CC viewport relocation */
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -			   (brw->cc.state_offset +
> -			    offsetof(struct brw_cc_unit_state, cc4)),
> -			   brw->batch.bo, brw->cc.vp_offset,
> -			   I915_GEM_DOMAIN_INSTRUCTION, 0);
>   }
>   
>   const struct brw_tracked_state brw_cc_unit = {
> diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c
> index f981388..571e692 100644
> --- a/src/mesa/drivers/dri/i965/brw_clear.c
> +++ b/src/mesa/drivers/dri/i965/brw_clear.c
> @@ -32,7 +32,6 @@
>   #include "swrast/swrast.h"
>   #include "drivers/common/meta.h"
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_blit.h"
>   #include "intel_fbo.h"
>   #include "intel_mipmap_tree.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
> index 3a73c64..e044375 100644
> --- a/src/mesa/drivers/dri/i965/brw_clip.c
> +++ b/src/mesa/drivers/dri/i965/brw_clip.c
> @@ -33,8 +33,6 @@
>   #include "main/macros.h"
>   #include "main/enums.h"
>   
> -#include "intel_batchbuffer.h"
> -
>   #include "brw_defines.h"
>   #include "brw_context.h"
>   #include "brw_eu.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
> index 8e34f7c..65db789 100644
> --- a/src/mesa/drivers/dri/i965/brw_clip_line.c
> +++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
> @@ -34,8 +34,6 @@
>   #include "main/enums.h"
>   #include "program/program.h"
>   
> -#include "intel_batchbuffer.h"
> -
>   #include "brw_defines.h"
>   #include "brw_context.h"
>   #include "brw_eu.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_clip_point.c b/src/mesa/drivers/dri/i965/brw_clip_point.c
> index 81487d3..9c886ff 100644
> --- a/src/mesa/drivers/dri/i965/brw_clip_point.c
> +++ b/src/mesa/drivers/dri/i965/brw_clip_point.c
> @@ -34,8 +34,6 @@
>   #include "main/enums.h"
>   #include "program/program.h"
>   
> -#include "intel_batchbuffer.h"
> -
>   #include "brw_defines.h"
>   #include "brw_context.h"
>   #include "brw_eu.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
> index dee74db..8307ecd 100644
> --- a/src/mesa/drivers/dri/i965/brw_clip_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
> @@ -133,16 +133,14 @@ brw_upload_clip_unit(struct brw_context *brw)
>          ctx->ViewportArray[0].Width == fb_width &&
>          ctx->ViewportArray[0].Height == fb_height)
>      {
> +      /* emit clip viewport relocation */
>         clip->clip5.guard_band_enable = 1;
>         clip->clip6.clipper_viewport_state_ptr =
> -         (brw->batch.bo->offset64 + brw->clip.vp_offset) >> 5;
> -
> -      /* emit clip viewport relocation */
> -      drm_intel_bo_emit_reloc(brw->batch.bo,
> -                              (brw->clip.state_offset +
> -                               offsetof(struct brw_clip_unit_state, clip6)),
> -                              brw->batch.bo, brw->clip.vp_offset,
> -                              I915_GEM_DOMAIN_INSTRUCTION, 0);
> +	 brw_batch_reloc(&brw->batch,
> +			 (brw->clip.state_offset +
> +			  offsetof(struct brw_clip_unit_state, clip6)),
> +			 brw->batch.bo, brw->clip.vp_offset,
> +			 I915_GEM_DOMAIN_INSTRUCTION, 0) >> 5;
>      }
>   
>      /* _NEW_TRANSFORM */
> diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
> index cca7eb1..64db7e4 100644
> --- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
> +++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
> @@ -34,8 +34,6 @@
>   #include "main/enums.h"
>   #include "program/program.h"
>   
> -#include "intel_batchbuffer.h"
> -
>   #include "brw_defines.h"
>   #include "brw_context.h"
>   #include "brw_eu.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
> index 6baf620..48c2648 100644
> --- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
> +++ b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
> @@ -34,8 +34,6 @@
>   #include "main/enums.h"
>   #include "program/program.h"
>   
> -#include "intel_batchbuffer.h"
> -
>   #include "brw_defines.h"
>   #include "brw_context.h"
>   #include "brw_eu.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c
> index 40ad144..7b953b2 100644
> --- a/src/mesa/drivers/dri/i965/brw_clip_util.c
> +++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
> @@ -35,8 +35,6 @@
>   #include "main/enums.h"
>   #include "program/program.h"
>   
> -#include "intel_batchbuffer.h"
> -
>   #include "brw_defines.h"
>   #include "brw_context.h"
>   #include "brw_eu.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c
> index 5693ab5..7dfcd6b 100644
> --- a/src/mesa/drivers/dri/i965/brw_compute.c
> +++ b/src/mesa/drivers/dri/i965/brw_compute.c
> @@ -30,7 +30,6 @@
>   #include "brw_context.h"
>   #include "brw_draw.h"
>   #include "brw_state.h"
> -#include "intel_batchbuffer.h"
>   #include "brw_defines.h"
>   
>   
> @@ -87,7 +86,7 @@ brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups)
>   {
>      struct brw_context *brw = brw_context(ctx);
>      int estimated_buffer_space_needed;
> -   bool fail_next = false;
> +   int ret;
>   
>      if (!_mesa_check_conditional_render(ctx))
>         return;
> @@ -105,48 +104,23 @@ brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups)
>      estimated_buffer_space_needed += 1024; /* push constants */
>      estimated_buffer_space_needed += 512; /* misc. pad */
>   
> -   /* Flush the batch if it's approaching full, so that we don't wrap while
> -    * we've got validated state that needs to be in the same batch as the
> -    * primitives.
> -    */
> -   intel_batchbuffer_require_space(brw, estimated_buffer_space_needed,
> -                                   RENDER_RING);
> -   intel_batchbuffer_save_state(brw);
> +   ret = brw_batch_begin(&brw->batch, estimated_buffer_space_needed, RENDER_RING);
> +   if (ret < 0)
> +      return;
>   
> - retry:
> -   brw->no_batch_wrap = true;
>      brw_upload_compute_state(brw);
> -
>      brw_emit_gpgpu_walker(brw, num_groups);
>   
> -   brw->no_batch_wrap = false;
> -
> -   if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
> -      if (!fail_next) {
> -         intel_batchbuffer_reset_to_saved(brw);
> -         intel_batchbuffer_flush(brw);
> -         fail_next = true;
> -         goto retry;
> -      } else {
> -         if (intel_batchbuffer_flush(brw) == -ENOSPC) {
> -            static bool warned = false;
> -
> -            if (!warned) {
> -               fprintf(stderr, "i965: Single compute shader dispatch "
> -                       "exceeded available aperture space\n");
> -               warned = true;
> -            }
> -         }
> -      }
> -   }
> +   ret = brw_batch_end(&brw->batch);
> +   if (ret)
> +      return;
>   
>      /* Now that we know we haven't run out of aperture space, we can safely
>       * reset the dirty bits.
>       */
>      brw_compute_state_finished(brw);
>   
> -   if (brw->always_flush_batch)
> -      intel_batchbuffer_flush(brw);
> +   brw_batch_maybe_flush(&brw->batch);
>   
>      brw_state_cache_check_size(brw);
>   
> diff --git a/src/mesa/drivers/dri/i965/brw_conditional_render.c b/src/mesa/drivers/dri/i965/brw_conditional_render.c
> index 6d37c3b..ffd10a6 100644
> --- a/src/mesa/drivers/dri/i965/brw_conditional_render.c
> +++ b/src/mesa/drivers/dri/i965/brw_conditional_render.c
> @@ -35,7 +35,7 @@
>   
>   #include "brw_context.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
> +#include "intel_reg.h"
>   
>   static void
>   set_predicate_enable(struct brw_context *brw,
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index 8150b94..a8ed0b8 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -54,13 +54,13 @@
>   #include "brw_draw.h"
>   #include "brw_state.h"
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_buffer_objects.h"
>   #include "intel_buffers.h"
>   #include "intel_fbo.h"
>   #include "intel_mipmap_tree.h"
>   #include "intel_pixel.h"
>   #include "intel_image.h"
> +#include "intel_reg.h"
>   #include "intel_tex.h"
>   #include "intel_tex_obj.h"
>   
> @@ -186,7 +186,8 @@ intel_update_state(struct gl_context * ctx, GLuint new_state)
>   	 continue;
>         intel_miptree_all_slices_resolve_depth(brw, tex_obj->mt);
>         intel_miptree_resolve_color(brw, tex_obj->mt);
> -      brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
> +      if (tex_obj->mt->bo->dirty)
> +	 brw_emit_mi_flush(brw);
>      }
>   
>      _mesa_lock_context_textures(ctx);
> @@ -214,7 +215,7 @@ intel_flush_front(struct gl_context *ctx)
>             * performance.
>             */
>            intel_resolve_for_dri2_flush(brw, driDrawable);
> -         intel_batchbuffer_flush(brw);
> +         brw_batch_flush(&brw->batch);
>   
>            flushFront(screen)(driDrawable, driDrawable->loaderPrivate);
>   
> @@ -224,6 +225,7 @@ intel_flush_front(struct gl_context *ctx)
>            brw->front_buffer_dirty = false;
>         }
>      }
> +   brw_batch_flush(&brw->batch);
>   }
>   
>   static void
> @@ -231,10 +233,9 @@ intel_glFlush(struct gl_context *ctx)
>   {
>      struct brw_context *brw = brw_context(ctx);
>   
> -   intel_batchbuffer_flush(brw);
>      intel_flush_front(ctx);
>   
> -   brw->need_flush_throttle = true;
> +   brw->batch.need_flush_throttle = true;
>   }
>   
>   static void
> @@ -244,8 +245,7 @@ intel_finish(struct gl_context * ctx)
>   
>      intel_glFlush(ctx);
>   
> -   if (brw->batch.last_bo)
> -      drm_intel_bo_wait_rendering(brw->batch.last_bo);
> +   brw_batch_wait(&brw->batch);
>   }
>   
>   static void
> @@ -623,15 +623,6 @@ brw_process_driconf_options(struct brw_context *brw)
>      driParseConfigFiles(options, &brw->intelScreen->optionCache,
>                          brw->driContext->driScreenPriv->myNum, "i965");
>   
> -   int bo_reuse_mode = driQueryOptioni(options, "bo_reuse");
> -   switch (bo_reuse_mode) {
> -   case DRI_CONF_BO_REUSE_DISABLED:
> -      break;
> -   case DRI_CONF_BO_REUSE_ALL:
> -      intel_bufmgr_gem_enable_reuse(brw->bufmgr);
> -      break;
> -   }
> -
>      if (!driQueryOptionb(options, "hiz")) {
>          brw->has_hiz = false;
>          /* On gen6, you can only do separate stencil with HIZ. */
> @@ -641,7 +632,7 @@ brw_process_driconf_options(struct brw_context *brw)
>   
>      if (driQueryOptionb(options, "always_flush_batch")) {
>         fprintf(stderr, "flushing batchbuffer before/after each draw call\n");
> -      brw->always_flush_batch = true;
> +      brw->batch.always_flush = true;
>      }
>   
>      if (driQueryOptionb(options, "always_flush_cache")) {
> @@ -651,7 +642,7 @@ brw_process_driconf_options(struct brw_context *brw)
>   
>      if (driQueryOptionb(options, "disable_throttling")) {
>         fprintf(stderr, "disabling flush throttling\n");
> -      brw->disable_throttling = true;
> +      brw->batch.disable_throttling = true;
>      }
>   
>      brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile");
> @@ -707,7 +698,18 @@ brwCreateContext(gl_api api,
>      driContextPriv->driverPrivate = brw;
>      brw->driContext = driContextPriv;
>      brw->intelScreen = screen;
> -   brw->bufmgr = screen->bufmgr;
> +
> +   if (brw_batch_init(&brw->batch, screen)) {
> +      fprintf(stderr, "%s: failed to alloc batch\n", __func__);
> +      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
> +      return false;
> +   }
> +
> +   if (brw_init_pipe_control(brw, devinfo)) {
> +      fprintf(stderr, "%s: failed to alloc workarounds\n", __func__);
> +      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
> +      return false;
> +   }
>   
>      brw->gen = devinfo->gen;
>      brw->gt = devinfo->gt;
> @@ -800,31 +802,6 @@ brwCreateContext(gl_api api,
>   
>      intel_fbo_init(brw);
>   
> -   intel_batchbuffer_init(brw);
> -
> -   if (brw->gen >= 6) {
> -      /* Create a new hardware context.  Using a hardware context means that
> -       * our GPU state will be saved/restored on context switch, allowing us
> -       * to assume that the GPU is in the same state we left it in.
> -       *
> -       * This is required for transform feedback buffer offsets, query objects,
> -       * and also allows us to reduce how much state we have to emit.
> -       */
> -      brw->hw_ctx = drm_intel_gem_context_create(brw->bufmgr);
> -
> -      if (!brw->hw_ctx) {
> -         fprintf(stderr, "Gen6+ requires Kernel 3.6 or later.\n");
> -         intelDestroyContext(driContextPriv);
> -         return false;
> -      }
> -   }
> -
> -   if (brw_init_pipe_control(brw, devinfo)) {
> -      *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
> -      intelDestroyContext(driContextPriv);
> -      return false;
> -   }
> -
>      brw_init_state(brw);
>   
>      intelInitExtensions(ctx);
> @@ -911,8 +888,8 @@ intelDestroyContext(__DRIcontext * driContextPriv)
>   
>      /* Dump a final BMP in case the application doesn't call SwapBuffers */
>      if (INTEL_DEBUG & DEBUG_AUB) {
> -      intel_batchbuffer_flush(brw);
> -      aub_dump_bmp(&brw->ctx);
> +      brw_batch_flush(&brw->batch);
> +      aub_dump_bmp(brw);
>      }
>   
>      _mesa_meta_free(&brw->ctx);
> @@ -929,15 +906,10 @@ intelDestroyContext(__DRIcontext * driContextPriv)
>      brw_destroy_state(brw);
>      brw_draw_destroy(brw);
>   
> -   drm_intel_bo_unreference(brw->curbe.curbe_bo);
> -   if (brw->vs.base.scratch_bo)
> -      drm_intel_bo_unreference(brw->vs.base.scratch_bo);
> -   if (brw->gs.base.scratch_bo)
> -      drm_intel_bo_unreference(brw->gs.base.scratch_bo);
> -   if (brw->wm.base.scratch_bo)
> -      drm_intel_bo_unreference(brw->wm.base.scratch_bo);
> -
> -   drm_intel_gem_context_destroy(brw->hw_ctx);
> +   brw_bo_put(brw->curbe.curbe_bo);
> +   brw_bo_put(brw->vs.base.scratch_bo);
> +   brw_bo_put(brw->gs.base.scratch_bo);
> +   brw_bo_put(brw->wm.base.scratch_bo);
>   
>      if (ctx->swrast_context) {
>         _swsetup_DestroyContext(&brw->ctx);
> @@ -948,19 +920,14 @@ intelDestroyContext(__DRIcontext * driContextPriv)
>      if (ctx->swrast_context)
>         _swrast_DestroyContext(&brw->ctx);
>   
> -   brw_fini_pipe_control(brw);
> -   intel_batchbuffer_free(brw);
> +   /* free the Mesa context */
> +   _mesa_free_context_data(&brw->ctx);
>   
> -   drm_intel_bo_unreference(brw->throttle_batch[1]);
> -   drm_intel_bo_unreference(brw->throttle_batch[0]);
> -   brw->throttle_batch[1] = NULL;
> -   brw->throttle_batch[0] = NULL;
> +   brw_fini_pipe_control(brw);
> +   brw_batch_fini(&brw->batch);
>   
>      driDestroyOptionCache(&brw->optionCache);
>   
> -   /* free the Mesa context */
> -   _mesa_free_context_data(&brw->ctx);
> -
>      ralloc_free(brw);
>      driContextPriv->driverPrivate = NULL;
>   }
> @@ -1293,7 +1260,6 @@ intel_query_dri2_buffers(struct brw_context *brw,
>          * query, we need to make sure all the pending drawing has landed in the
>          * real front buffer.
>          */
> -      intel_batchbuffer_flush(brw);
>         intel_flush_front(&brw->ctx);
>   
>         attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
> @@ -1305,7 +1271,6 @@ intel_query_dri2_buffers(struct brw_context *brw,
>          * So before doing the query, make sure all the pending drawing has
>          * landed in the real front buffer.
>          */
> -      intel_batchbuffer_flush(brw);
>         intel_flush_front(&brw->ctx);
>      }
>   
> @@ -1346,7 +1311,7 @@ intel_process_dri2_buffer(struct brw_context *brw,
>                             const char *buffer_name)
>   {
>      struct gl_framebuffer *fb = drawable->driverPrivate;
> -   drm_intel_bo *bo;
> +   struct brw_bo *bo;
>   
>      if (!rb)
>         return;
> @@ -1370,7 +1335,7 @@ intel_process_dri2_buffer(struct brw_context *brw,
>   	* name, then drm_intel_bo_flink() is a low-cost getter.  It does not
>   	* create a new name.
>   	*/
> -      drm_intel_bo_flink(last_mt->bo, &old_name);
> +      old_name = brw_bo_flink(last_mt->bo);
>      }
>   
>      if (old_name == buffer->name)
> @@ -1383,9 +1348,7 @@ intel_process_dri2_buffer(struct brw_context *brw,
>                 buffer->cpp, buffer->pitch);
>      }
>   
> -   intel_miptree_release(&rb->mt);
> -   bo = drm_intel_bo_gem_create_from_name(brw->bufmgr, buffer_name,
> -                                          buffer->name);
> +   bo = brw_bo_create_from_name(&brw->batch, buffer_name, buffer->name);
>      if (!bo) {
>         fprintf(stderr,
>                 "Failed to open BO for returned DRI2 buffer "
> @@ -1396,9 +1359,11 @@ intel_process_dri2_buffer(struct brw_context *brw,
>         return;
>      }
>   
> +   //intel_miptree_release(&rb->mt);
>      intel_update_winsys_renderbuffer_miptree(brw, rb, bo,
>                                               drawable->w, drawable->h,
>                                               buffer->pitch);
> +   brw_bo_put(bo);
>   
>      if (brw_is_front_buffer_drawing(fb) &&
>          (buffer->attachment == __DRI_BUFFER_FRONT_LEFT ||
> @@ -1408,8 +1373,6 @@ intel_process_dri2_buffer(struct brw_context *brw,
>      }
>   
>      assert(rb->mt);
> -
> -   drm_intel_bo_unreference(bo);
>   }
>   
>   /**
> @@ -1451,12 +1414,14 @@ intel_update_image_buffer(struct brw_context *intel,
>      else
>         last_mt = rb->singlesample_mt;
>   
> -   if (last_mt && last_mt->bo == buffer->bo)
> +   if (last_mt && last_mt->bo->handle == buffer->bo->handle)
>         return;
>   
> -   intel_update_winsys_renderbuffer_miptree(intel, rb, buffer->bo,
> +   struct brw_bo *bo = brw_bo_import(&intel->batch, buffer->bo, true);
> +   intel_update_winsys_renderbuffer_miptree(intel, rb, bo,
>                                               buffer->width, buffer->height,
>                                               buffer->pitch);
> +   brw_bo_put(bo);
>   
>      if (brw_is_front_buffer_drawing(fb) &&
>          buffer_type == __DRI_IMAGE_BUFFER_FRONT &&
> @@ -1520,3 +1485,121 @@ intel_update_image_buffers(struct brw_context *brw, __DRIdrawable *drawable)
>                                   __DRI_IMAGE_BUFFER_BACK);
>      }
>   }
> +
> +/**
> + * Called when starting a new batch buffer.
> + */
> +void
> +brw_start_batch(struct brw_batch *batch)
> +{
> +   struct brw_context *brw = container_of(batch, brw, batch);
> +
> +   if (batch->ring != RENDER_RING)
> +      return;
> +
> +   /* If the kernel supports hardware contexts, then most hardware state is
> +    * preserved between batches; we only need to re-emit state that is required
> +    * to be in every batch.  Otherwise we need to re-emit all the state that
> +    * would otherwise be stored in the context (which for all intents and
> +    * purposes means everything).
> +    */
> +   if (!batch->hw_ctx)
> +      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
> +
> +   brw->ctx.NewDriverState |= BRW_NEW_BATCH;
> +
> +   brw->emitted_mi_flush = 0;
> +   brw->state_batch_count = 0;
> +
> +   brw->ib.type = -1;
> +
> +   /* We need to periodically reap the shader time results, because rollover
> +    * happens every few seconds.  We also want to see results every once in a
> +    * while, because many programs won't cleanly destroy our context, so the
> +    * end-of-run printout may not happen.
> +    */
> +   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> +      brw_collect_and_report_shader_time(brw);
> +
> +   if (INTEL_DEBUG & DEBUG_PERFMON)
> +      brw_dump_perf_monitors(brw);
> +
> +   brw_perf_monitor_new_batch(brw);
> +}
> +
> +/**
> + * Called from brw_batch_flush before emitting MI_BATCHBUFFER_END and sending
> + * it off.
> + *
> + * This function can emit state (say, to preserve registers that aren't saved
> + * between batches).  All of this state MUST fit in the reserved space at the
> + * end of the batchbuffer.  If you add more GPU state, increase the reserved
> + * space by updating the BATCH_RESERVED macro.
> + */
> +void brw_finish_batch(struct brw_batch *batch)
> +{
> +   struct brw_context *brw = container_of(batch, brw, batch);
> +
> +   if (batch->ring != RENDER_RING)
> +      return;
> +
> +   /* Capture the closing pipeline statistics register values necessary to
> +    * support query objects (in the non-hardware context world).
> +    */
> +   brw_emit_query_end(brw);
> +
> +   /* We may also need to snapshot and disable OA counters. */
> +   brw_perf_monitor_finish_batch(brw);
> +}
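
So the lifecycle, as I understand it: brw_batch_begin() reserves space
and fires brw_start_batch() whenever a fresh batch is opened, while
brw_batch_flush() calls brw_finish_batch() just before
MI_BATCHBUFFER_END and submission. A minimal emitter then looks like
this (sketch; the 60-dword reservation is copied from other callers in
this patch):

   if (brw_batch_begin(&brw->batch, 60, RENDER_RING) < 0)
      return;                         /* no batch could be opened */
   /* ... OUT_BATCH()/OUT_RELOC() ... */
   ret = brw_batch_end(&brw->batch);  /* may flush; -ENOSPC on overflow */
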
> +
> +static void
> +load_sized_register_mem(struct brw_context *brw,
> +                        uint32_t reg,
> +                        struct brw_bo *bo,
> +                        uint32_t read_domains, uint32_t write_domain,
> +                        uint32_t offset,
> +                        int size)
> +{
> +   int i;
> +
> +   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
> +   assert(brw->gen >= 7);
> +
> +   if (brw->gen >= 8) {
> +      BEGIN_BATCH(4 * size);
> +      for (i = 0; i < size; i++) {
> +         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
> +         OUT_BATCH(reg + i * 4);
> +         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
> +      }
> +      ADVANCE_BATCH();
> +   } else {
> +      BEGIN_BATCH(3 * size);
> +      for (i = 0; i < size; i++) {
> +         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
> +         OUT_BATCH(reg + i * 4);
> +         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
> +      }
> +      ADVANCE_BATCH();
> +   }
> +}
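
The 4-vs-3 dword split took me a second: on Gen8 the relocation is a
64-bit address, so OUT_RELOC64() emits two dwords and each load is
(header, reg, addr low, addr high), versus a single 32-bit address on
Gen7; the (4 - 2)/(3 - 2) length fields match that. Callers are
unchanged, e.g. something like the indirect-draw loads in brw_draw.c:

   /* load the indirect vertex count into the 3DPRIM register */
   brw_load_register_mem(brw, GEN7_3DPRIM_VERTEX_COUNT, bo,
                         I915_GEM_DOMAIN_VERTEX, 0,
                         prim->indirect_offset);
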
> +
> +void
> +brw_load_register_mem(struct brw_context *brw,
> +                      uint32_t reg,
> +                      struct brw_bo *bo,
> +                      uint32_t read_domains, uint32_t write_domain,
> +                      uint32_t offset)
> +{
> +   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
> +}
> +
> +void
> +brw_load_register_mem64(struct brw_context *brw,
> +                        uint32_t reg,
> +                        struct brw_bo *bo,
> +                        uint32_t read_domains, uint32_t write_domain,
> +                        uint32_t offset)
> +{
> +   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
> +}
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index db0fc48..e4fded3 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -50,7 +50,6 @@ extern "C" {
>   #endif
>   
>   #include <drm.h>
> -#include <intel_bufmgr.h>
>   #include <i915_drm.h>
>   #ifdef __cplusplus
>   	#undef virtual
> @@ -65,6 +64,8 @@ extern "C" {
>   #include "intel_tex_obj.h"
>   #include "intel_resolve_map.h"
>   
> +#include "brw_batch.h"
> +
>   /* Glossary:
>    *
>    * URB - uniform resource buffer.  A mid-sized buffer which is
> @@ -790,11 +791,10 @@ struct brw_cache {
>      struct brw_context *brw;
>   
>      struct brw_cache_item **items;
> -   drm_intel_bo *bo;
> +   struct brw_bo *bo;
>      GLuint size, n_items;
>   
>      uint32_t next_offset;
> -   bool bo_used_by_gpu;
>   
>      /**
>       * Optional functions used in determining whether the prog_data for a new
> @@ -829,7 +829,7 @@ enum shader_time_shader_type {
>   
>   struct brw_vertex_buffer {
>      /** Buffer object containing the uploaded vertex data */
> -   drm_intel_bo *bo;
> +   struct brw_bo *bo;
>      uint32_t offset;
>      /** Byte stride between elements in the uploaded array */
>      GLuint stride;
> @@ -848,41 +848,10 @@ struct brw_query_object {
>      struct gl_query_object Base;
>   
>      /** Last query BO associated with this query. */
> -   drm_intel_bo *bo;
> +   struct brw_bo *bo;
>   
>      /** Last index in bo with query data for this object. */
>      int last_index;
> -
> -   /** True if we know the batch has been flushed since we ended the query. */
> -   bool flushed;
> -};
> -
> -enum brw_gpu_ring {
> -   UNKNOWN_RING,
> -   RENDER_RING,
> -   BLT_RING,
> -};
> -
> -struct intel_batchbuffer {
> -   /** Current batchbuffer being queued up. */
> -   drm_intel_bo *bo;
> -   /** Last BO submitted to the hardware.  Used for glFinish(). */
> -   drm_intel_bo *last_bo;
> -
> -   uint16_t emit, total;
> -   uint16_t used, reserved_space;
> -   uint32_t *map;
> -   uint32_t *cpu_map;
> -#define BATCH_SZ (8192*sizeof(uint32_t))
> -
> -   uint32_t state_batch_offset;
> -   enum brw_gpu_ring ring;
> -   bool needs_sol_reset;
> -
> -   struct {
> -      uint16_t used;
> -      int reloc_count;
> -   } saved;
>   };
>   
>   #define BRW_MAX_XFB_STREAMS 4
> @@ -891,7 +860,7 @@ struct brw_transform_feedback_object {
>      struct gl_transform_feedback_object base;
>   
>      /** A buffer to hold SO_WRITE_OFFSET(n) values while paused. */
> -   drm_intel_bo *offset_bo;
> +   struct brw_bo *offset_bo;
>   
>      /** If true, SO_WRITE_OFFSET(n) should be reset to zero at next use. */
>      bool zero_offsets;
> @@ -904,7 +873,7 @@ struct brw_transform_feedback_object {
>       *  @{
>       */
>      uint64_t prims_generated[BRW_MAX_XFB_STREAMS];
> -   drm_intel_bo *prim_count_bo;
> +   struct brw_bo *prim_count_bo;
>      unsigned prim_count_buffer_index; /**< in number of uint64_t units */
>      /** @} */
>   
> @@ -930,7 +899,7 @@ struct brw_stage_state
>       * Optional scratch buffer used to store spilled register values and
>       * variably-indexed GRF arrays.
>       */
> -   drm_intel_bo *scratch_bo;
> +   struct brw_bo *scratch_bo;
>   
>      /** Offset in the program cache to the program */
>      uint32_t prog_offset;
> @@ -998,7 +967,7 @@ struct brw_context
>                                            bool rw, bool for_gather);
>         void (*emit_buffer_surface_state)(struct brw_context *brw,
>                                           uint32_t *out_offset,
> -                                        drm_intel_bo *bo,
> +                                        struct brw_bo *bo,
>                                           unsigned buffer_offset,
>                                           unsigned surface_format,
>                                           unsigned buffer_size,
> @@ -1026,22 +995,13 @@ struct brw_context
>   
>      } vtbl;
>   
> -   dri_bufmgr *bufmgr;
> -
> -   drm_intel_context *hw_ctx;
> +   struct brw_batch batch;
>   
>      /** BO for post-sync nonzero writes for gen6 workaround. */
> -   drm_intel_bo *workaround_bo;
> +   struct brw_bo *workaround_bo;
>      uint8_t pipe_controls_since_last_cs_stall;
>   
>      /**
> -    * Set of drm_intel_bo * that have been rendered to within this batchbuffer
> -    * and would need flushing before being used from another cache domain that
> -    * isn't coherent with it (i.e. the sampler).
> -    */
> -   struct set *render_cache;
> -
> -   /**
>       * Number of resets observed in the system at context creation.
>       *
>       * This is tracked in the context so that we can determine that another
> @@ -1049,11 +1009,8 @@ struct brw_context
>       */
>      uint32_t reset_count;
>   
> -   struct intel_batchbuffer batch;
> -   bool no_batch_wrap;
> -
>      struct {
> -      drm_intel_bo *bo;
> +      struct brw_bo *bo;
>         uint32_t next_offset;
>      } upload;
>   
> @@ -1065,23 +1022,6 @@ struct brw_context
>       */
>      bool front_buffer_dirty;
>   
> -   /** Framerate throttling: @{ */
> -   drm_intel_bo *throttle_batch[2];
> -
> -   /* Limit the number of outstanding SwapBuffers by waiting for an earlier
> -    * frame of rendering to complete. This gives a very precise cap to the
> -    * latency between input and output such that rendering never gets more
> -    * than a frame behind the user. (With the caveat that we technically are
> -    * not using the SwapBuffers itself as a barrier but the first batch
> -    * submitted afterwards, which may be immediately prior to the next
> -    * SwapBuffers.)
> -    */
> -   bool need_swap_throttle;
> -
> -   /** General throttling, not caught by throttling between SwapBuffers */
> -   bool need_flush_throttle;
> -   /** @} */
> -
>      GLuint stats_wm;
>   
>      /**
> @@ -1089,9 +1029,7 @@ struct brw_context
>       * @{
>       */
>      bool no_rast;
> -   bool always_flush_batch;
>      bool always_flush_cache;
> -   bool disable_throttling;
>      bool precompile;
>   
>      driOptionCache optionCache;
> @@ -1169,7 +1107,7 @@ struct brw_context
>          * Buffer and offset used for GL_ARB_shader_draw_parameters
>          * (for now, only gl_BaseVertex).
>          */
> -      drm_intel_bo *draw_params_bo;
> +      struct brw_bo *draw_params_bo;
>         uint32_t draw_params_offset;
>      } draw;
>   
> @@ -1209,7 +1147,7 @@ struct brw_context
>         const struct _mesa_index_buffer *ib;
>   
>         /* Updates are signaled by BRW_NEW_INDEX_BUFFER. */
> -      drm_intel_bo *bo;
> +      struct brw_bo *bo;
>         GLuint type;
>   
>         /* Offset to index buffer index to use in CMD_3D_PRIM so that we can
> @@ -1294,7 +1232,7 @@ struct brw_context
>          * Pointer to the (intel_upload.c-generated) BO containing the uniforms
>          * for upload to the CURBE.
>          */
> -      drm_intel_bo *curbe_bo;
> +      struct brw_bo *curbe_bo;
>         /** Offset within curbe_bo of space for current curbe entry */
>         GLuint curbe_offset;
>      } curbe;
> @@ -1384,7 +1322,7 @@ struct brw_context
>          * Buffer object used in place of multisampled null render targets on
>          * Gen6.  See brw_emit_null_surface_state().
>          */
> -      drm_intel_bo *multisampled_null_render_target_bo;
> +      struct brw_bo *multisampled_null_render_target_bo;
>         uint32_t fast_clear_op;
>      } wm;
>   
> @@ -1421,7 +1359,7 @@ struct brw_context
>          * A buffer object storing OA counter snapshots taken at the start and
>          * end of each batch (creating "bookends" around the batch).
>          */
> -      drm_intel_bo *bookend_bo;
> +      struct brw_bo *bookend_bo;
>   
>         /** The number of snapshots written to bookend_bo. */
>         int bookend_snapshots;
> @@ -1459,6 +1397,7 @@ struct brw_context
>         int index;
>      } *state_batch_list;
>      int state_batch_count;
> +   int emitted_mi_flush;
>   
>      uint32_t render_target_format[MESA_FORMAT_COUNT];
>      bool format_supported_as_render_target[MESA_FORMAT_COUNT];
> @@ -1492,7 +1431,7 @@ struct brw_context
>      int basevertex;
>   
>      struct {
> -      drm_intel_bo *bo;
> +      struct brw_bo *bo;
>         const char **names;
>         int *ids;
>         enum shader_time_shader_type *types;
> @@ -1508,6 +1447,14 @@ struct brw_context
>      struct intel_screen *intelScreen;
>   };
>   
> +static inline int brw_to_fd(struct brw_context *brw)
> +{
> +   return intel_screen_to_fd(brw->intelScreen);
> +}
> +
> +void brw_start_batch(struct brw_batch *batch);
> +void brw_finish_batch(struct brw_batch *batch);
> +
>   /*======================================================================
>    * brw_vtbl.c
>    */
> @@ -1605,10 +1552,10 @@ void brw_emit_query_end(struct brw_context *brw);
>   
>   /** gen6_queryobj.c */
>   void gen6_init_queryobj_functions(struct dd_function_table *functions);
> -void brw_write_timestamp(struct brw_context *brw, drm_intel_bo *bo, int idx);
> -void brw_write_depth_count(struct brw_context *brw, drm_intel_bo *bo, int idx);
> +void brw_write_timestamp(struct brw_context *brw, struct brw_bo *bo, int idx);
> +void brw_write_depth_count(struct brw_context *brw, struct brw_bo *bo, int idx);
>   void brw_store_register_mem64(struct brw_context *brw,
> -                              drm_intel_bo *bo, uint32_t reg, int idx);
> +                              struct brw_bo *bo, uint32_t reg, int idx);
>   
>   /** brw_conditional_render.c */
>   void brw_init_conditional_render_functions(struct dd_function_table *functions);
> @@ -1617,20 +1564,21 @@ bool brw_check_conditional_render(struct brw_context *brw);
>   /** intel_batchbuffer.c */
>   void brw_load_register_mem(struct brw_context *brw,
>                              uint32_t reg,
> -                           drm_intel_bo *bo,
> +                           struct brw_bo *bo,
>                              uint32_t read_domains, uint32_t write_domain,
>                              uint32_t offset);
>   void brw_load_register_mem64(struct brw_context *brw,
>                                uint32_t reg,
> -                             drm_intel_bo *bo,
> +                             struct brw_bo *bo,
>                                uint32_t read_domains, uint32_t write_domain,
>                                uint32_t offset);
>   
>   /*======================================================================
>    * brw_state_dump.c
>    */
> -void brw_debug_batch(struct brw_context *brw);
> -void brw_annotate_aub(struct brw_context *brw);
> +void brw_debug_batch(struct brw_batch *batch);
> +void brw_annotate_batch(struct brw_batch *batch);
> +void aub_dump_bmp(struct brw_context *brw);
>   
>   /*======================================================================
>    * brw_tex.c
> @@ -1645,7 +1593,7 @@ void brwInitFragProgFuncs( struct dd_function_table *functions );
>   
>   int brw_get_scratch_size(int size);
>   void brw_get_scratch_bo(struct brw_context *brw,
> -			drm_intel_bo **scratch_bo, int size);
> +			struct brw_bo **scratch_bo, int size);
>   void brw_init_shader_time(struct brw_context *brw);
>   int brw_get_shader_time_index(struct brw_context *brw,
>                                 struct gl_shader_program *shader_prog,
> @@ -1705,7 +1653,7 @@ void brw_prepare_vertices(struct brw_context *brw);
>   /* brw_wm_surface_state.c */
>   void brw_init_surface_formats(struct brw_context *brw);
>   void brw_create_constant_surface(struct brw_context *brw,
> -                                 drm_intel_bo *bo,
> +                                 struct brw_bo *bo,
>                                    uint32_t offset,
>                                    uint32_t size,
>                                    uint32_t *out_offset,
> @@ -1739,12 +1687,6 @@ void brw_dump_perf_monitors(struct brw_context *brw);
>   void brw_perf_monitor_new_batch(struct brw_context *brw);
>   void brw_perf_monitor_finish_batch(struct brw_context *brw);
>   
> -/* intel_buffer_objects.c */
> -int brw_bo_map(struct brw_context *brw, drm_intel_bo *bo, int write_enable,
> -               const char *bo_name);
> -int brw_bo_map_gtt(struct brw_context *brw, drm_intel_bo *bo,
> -                   const char *bo_name);
> -
>   /* intel_extensions.c */
>   extern void intelInitExtensions(struct gl_context *ctx);
>   
> @@ -1913,13 +1855,9 @@ brw_program_reloc(struct brw_context *brw, uint32_t state_offset,
>         return prog_offset;
>      }
>   
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -			   state_offset,
> -			   brw->cache.bo,
> -			   prog_offset,
> -			   I915_GEM_DOMAIN_INSTRUCTION, 0);
> -
> -   return brw->cache.bo->offset64 + prog_offset;
> +   return brw_batch_reloc(&brw->batch, state_offset,
> +			  brw->cache.bo, prog_offset,
> +			  I915_GEM_DOMAIN_INSTRUCTION, 0);
>   }
>   
>   bool brw_do_cubemap_normalize(struct exec_list *instructions);
> @@ -2006,7 +1944,7 @@ void brw_fini_pipe_control(struct brw_context *brw);
>   
>   void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags);
>   void brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
> -                                 drm_intel_bo *bo, uint32_t offset,
> +                                 struct brw_bo *bo, uint32_t offset,
>                                    uint32_t imm_lower, uint32_t imm_upper);
>   void brw_emit_mi_flush(struct brw_context *brw);
>   void brw_emit_post_sync_nonzero_flush(struct brw_context *brw);
> diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
> index 4c5082c..ee1f481 100644
> --- a/src/mesa/drivers/dri/i965/brw_cs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
> @@ -30,7 +30,6 @@
>   #include "brw_wm.h"
>   #include "intel_mipmap_tree.h"
>   #include "brw_state.h"
> -#include "intel_batchbuffer.h"
>   
>   extern "C"
>   bool
> @@ -68,8 +67,7 @@ brw_cs_emit(struct brw_context *brw,
>      double start_time = 0;
>   
>      if (unlikely(brw->perf_debug)) {
> -      start_busy = (brw->batch.last_bo &&
> -                    drm_intel_bo_busy(brw->batch.last_bo));
> +      start_busy = brw_batch_busy(&brw->batch);
>         start_time = get_time();
>      }
>   
> @@ -152,7 +150,7 @@ brw_cs_emit(struct brw_context *brw,
>         }
>         shader->compiled_once = true;
>   
> -      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
> +      if (start_busy && !brw_batch_busy(&brw->batch)) {
>            perf_debug("CS compile took %.03f ms and stalled the GPU\n",
>                       (get_time() - start_time) * 1000);
>         }
> diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
> index befd7a9..29b75bc 100644
> --- a/src/mesa/drivers/dri/i965/brw_curbe.c
> +++ b/src/mesa/drivers/dri/i965/brw_curbe.c
> @@ -57,7 +57,6 @@
>   #include "program/prog_parameter.h"
>   #include "program/prog_print.h"
>   #include "program/prog_statevars.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_buffer_objects.h"
>   #include "brw_context.h"
>   #include "brw_defines.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
> index 69ad4d4..1e967da 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw.c
> @@ -48,11 +48,11 @@
>   #include "brw_state.h"
>   #include "brw_vs.h"
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_buffers.h"
>   #include "intel_fbo.h"
>   #include "intel_mipmap_tree.h"
>   #include "intel_buffer_objects.h"
> +#include "intel_reg.h"
>   
>   #define FILE_DEBUG_FLAG DEBUG_PRIMS
>   
> @@ -172,6 +172,37 @@ static GLuint trim(GLenum prim, GLuint length)
>         return length;
>   }
>   
> +static void mark_fb_dirty(struct brw_context *brw)
> +{
> +   const struct gl_framebuffer *fb = brw->ctx.DrawBuffer;
> +   struct intel_renderbuffer *irb;
> +
> +   if (!brw->emitted_mi_flush)
> +      return;
> +
> +   for (int i = 0; i < fb->_NumColorDrawBuffers; i++) {
> +      irb = intel_renderbuffer(fb->_ColorDrawBuffers[i]);
> +      if (irb)
> +         brw_bo_mark_dirty(&brw->batch, irb->mt->bo);
> +   }
> +
> +   irb = intel_renderbuffer(fb->Attachment[BUFFER_DEPTH].Renderbuffer);
> +   if (irb)
> +      brw_bo_mark_dirty(&brw->batch, irb->mt->bo);
> +
> +   if (brw->ctx.Stencil._Enabled) {
> +      irb = intel_renderbuffer(fb->Attachment[BUFFER_STENCIL].Renderbuffer);
> +      if (irb) {
> +         struct intel_mipmap_tree *mt = irb->mt;
> +         if (mt && mt->stencil_mt)
> +            mt = mt->stencil_mt;
> +         if (mt)
> +            brw_bo_mark_dirty(&brw->batch, mt->bo);
> +      }
> +   }
> +
> +   brw->emitted_mi_flush = 0;
> +}
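
If I follow the new dirty tracking: brw_emit_mi_flush() clears every
dirty flag (brw_batch_clear_dirty()) and sets brw->emitted_mi_flush,
and this function then re-marks just the current draw targets, so
bo->dirty effectively means "rendered to since the last flush". The
consumer side becomes a one-liner, as in brw_emit_depthbuffer() below:

   if (depth_mt && depth_mt->bo->dirty)
      brw_emit_mi_flush(brw);

Two nits: emitted_mi_flush is only ever 0/1, so bool would read better
than int, and a comment on mark_fb_dirty() stating that invariant
would help the next reader.
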
>   
>   static void brw_emit_prim(struct brw_context *brw,
>   			  const struct _mesa_prim *prim,
> @@ -223,9 +254,10 @@ static void brw_emit_prim(struct brw_context *brw,
>      /* If indirect, emit a bunch of loads from the indirect BO. */
>      if (prim->is_indirect) {
>         struct gl_buffer_object *indirect_buffer = brw->ctx.DrawIndirectBuffer;
> -      drm_intel_bo *bo = intel_bufferobj_buffer(brw,
> -            intel_buffer_object(indirect_buffer),
> -            prim->indirect_offset, 5 * sizeof(GLuint));
> +      struct brw_bo *bo =
> +	 intel_bufferobj_buffer(brw,
> +				intel_buffer_object(indirect_buffer),
> +				prim->indirect_offset, 5 * sizeof(GLuint));
>   
>         indirect_flag = GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE;
>   
> @@ -283,6 +315,8 @@ static void brw_emit_prim(struct brw_context *brw,
>      OUT_BATCH(base_vertex_location);
>      ADVANCE_BATCH();
>   
> +   mark_fb_dirty(brw);
> +
>      if (brw->always_flush_cache) {
>         brw_emit_mi_flush(brw);
>      }
> @@ -296,7 +330,7 @@ static void brw_merge_inputs( struct brw_context *brw,
>      GLuint i;
>   
>      for (i = 0; i < brw->vb.nr_buffers; i++) {
> -      drm_intel_bo_unreference(brw->vb.buffers[i].bo);
> +      brw_bo_put(brw->vb.buffers[i].bo);
>         brw->vb.buffers[i].bo = NULL;
>      }
>      brw->vb.nr_buffers = 0;
> @@ -367,7 +401,6 @@ static void brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
>      struct intel_renderbuffer *front_irb = NULL;
>      struct intel_renderbuffer *back_irb = intel_get_renderbuffer(fb, BUFFER_BACK_LEFT);
>      struct intel_renderbuffer *depth_irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
> -   struct intel_renderbuffer *stencil_irb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
>      struct gl_renderbuffer_attachment *depth_att = &fb->Attachment[BUFFER_DEPTH];
>   
>      if (brw_is_front_buffer_drawing(fb))
> @@ -379,20 +412,6 @@ static void brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
>         back_irb->need_downsample = true;
>      if (depth_irb && ctx->Depth.Mask) {
>         intel_renderbuffer_att_set_needs_depth_resolve(depth_att);
> -      brw_render_cache_set_add_bo(brw, depth_irb->mt->bo);
> -   }
> -
> -   if (ctx->Extensions.ARB_stencil_texturing &&
> -       stencil_irb && ctx->Stencil._WriteEnabled) {
> -      brw_render_cache_set_add_bo(brw, stencil_irb->mt->bo);
> -   }
> -
> -   for (int i = 0; i < fb->_NumColorDrawBuffers; i++) {
> -      struct intel_renderbuffer *irb =
> -         intel_renderbuffer(fb->_ColorDrawBuffers[i]);
> -
> -      if (irb)
> -         brw_render_cache_set_add_bo(brw, irb->mt->bo);
>      }
>   }
>   
> @@ -410,7 +429,6 @@ static void brw_try_draw_prims( struct gl_context *ctx,
>   {
>      struct brw_context *brw = brw_context(ctx);
>      GLuint i;
> -   bool fail_next = false;
>   
>      if (ctx->NewState)
>         _mesa_update_state( ctx );
> @@ -457,6 +475,7 @@ static void brw_try_draw_prims( struct gl_context *ctx,
>      for (i = 0; i < nr_prims; i++) {
>         int estimated_max_prim_size;
>         const int sampler_state_size = 16;
> +      int ret;
>   
>         estimated_max_prim_size = 512; /* batchbuffer commands */
>         estimated_max_prim_size += BRW_MAX_TEX_UNIT *
> @@ -469,8 +488,9 @@ static void brw_try_draw_prims( struct gl_context *ctx,
>          * we've got validated state that needs to be in the same batch as the
>          * primitives.
>          */
> -      intel_batchbuffer_require_space(brw, estimated_max_prim_size, RENDER_RING);
> -      intel_batchbuffer_save_state(brw);
> +      ret = brw_batch_begin(&brw->batch, estimated_max_prim_size, RENDER_RING);
> +      if (ret < 0)
> +	 break;
>   
>         if (brw->num_instances != prims[i].num_instances ||
>             brw->basevertex != prims[i].basevertex) {
> @@ -485,13 +505,12 @@ static void brw_try_draw_prims( struct gl_context *ctx,
>         brw->draw.gl_basevertex =
>            prims[i].indexed ? prims[i].basevertex : prims[i].start;
>   
> -      drm_intel_bo_unreference(brw->draw.draw_params_bo);
> +      brw_bo_put(brw->draw.draw_params_bo);
>   
>         if (prims[i].is_indirect) {
>            /* Point draw_params_bo at the indirect buffer. */
>            brw->draw.draw_params_bo =
> -            intel_buffer_object(ctx->DrawIndirectBuffer)->buffer;
> -         drm_intel_bo_reference(brw->draw.draw_params_bo);
> +            brw_bo_get(intel_buffer_object(ctx->DrawIndirectBuffer)->buffer);
>            brw->draw.draw_params_offset =
>               prims[i].indirect_offset + (prims[i].indexed ? 12 : 8);
>         } else {
> @@ -507,35 +526,20 @@ static void brw_try_draw_prims( struct gl_context *ctx,
>         else
>   	 gen6_set_prim(brw, &prims[i]);
>   
> -retry:
> -
>         /* Note that before the loop, brw->ctx.NewDriverState was set to != 0, and
>          * that the state updated in the loop outside of this block is that in
> -       * *_set_prim or intel_batchbuffer_flush(), which only impacts
> -       * brw->ctx.NewDriverState.
> +       * *_set_prim, which only impacts brw->ctx.NewDriverState.
>          */
>         if (brw->ctx.NewDriverState) {
> -	 brw->no_batch_wrap = true;
>   	 brw_upload_render_state(brw);
>         }
>   
>         brw_emit_prim(brw, &prims[i], brw->primitive);
>   
> -      brw->no_batch_wrap = false;
> -
> -      if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) {
> -	 if (!fail_next) {
> -	    intel_batchbuffer_reset_to_saved(brw);
> -	    intel_batchbuffer_flush(brw);
> -	    fail_next = true;
> -	    goto retry;
> -	 } else {
> -            int ret = intel_batchbuffer_flush(brw);
> -            WARN_ONCE(ret == -ENOSPC,
> -                      "i965: Single primitive emit exceeded "
> -                      "available aperture space\n");
> -	 }
> -      }
> +      ret = brw_batch_end(&brw->batch);
> +      WARN_ONCE(ret == -ENOSPC,
> +		"i965: Single primitive emit exceeded "
> +		"available aperture space\n");
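
Good riddance to the no_batch_wrap/retry goto. As I read it, the
aperture bookkeeping moves into the batch manager: brw_batch_begin()
either guarantees the reservation fits or fails up front, and
brw_batch_end() returns -ENOSPC only when a single primitive genuinely
cannot fit, which the WARN_ONCE catches:

   ret = brw_batch_begin(&brw->batch, estimated_max_prim_size,
                         RENDER_RING);
   if (ret < 0)
      break;                          /* couldn't open a batch */
   /* ... upload state, emit the primitive ... */
   ret = brw_batch_end(&brw->batch);  /* wraps/flushes internally */

On brw_batch_begin() failure we silently break out of the prim loop,
though - should that raise GL_OUT_OF_MEMORY somewhere?
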
>   
>         /* Now that we know we haven't run out of aperture space, we can safely
>          * reset the dirty bits.
> @@ -544,8 +548,7 @@ retry:
>            brw_render_state_finished(brw);
>      }
>   
> -   if (brw->always_flush_batch)
> -      intel_batchbuffer_flush(brw);
> +   brw_batch_maybe_flush(&brw->batch);
>   
>      brw_state_cache_check_size(brw);
>      brw_postdraw_set_buffers_need_resolve(brw);
> @@ -628,7 +631,7 @@ void brw_draw_destroy( struct brw_context *brw )
>      int i;
>   
>      for (i = 0; i < brw->vb.nr_buffers; i++) {
> -      drm_intel_bo_unreference(brw->vb.buffers[i].bo);
> +      brw_bo_put(brw->vb.buffers[i].bo);
>         brw->vb.buffers[i].bo = NULL;
>      }
>      brw->vb.nr_buffers = 0;
> @@ -638,6 +641,6 @@ void brw_draw_destroy( struct brw_context *brw )
>      }
>      brw->vb.nr_enabled = 0;
>   
> -   drm_intel_bo_unreference(brw->ib.bo);
> +   brw_bo_put(brw->ib.bo);
>      brw->ib.bo = NULL;
>   }
> diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> index 320e40e..bc2e8fa 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> @@ -37,7 +37,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_buffer_objects.h"
>   
>   static GLuint double_types[5] = {
> @@ -361,7 +360,7 @@ copy_array_to_vbo_array(struct brw_context *brw,
>         intel_upload_data(brw, element->glarray->Ptr,
>                           element->glarray->_ElementSize,
>                           element->glarray->_ElementSize,
> -			&buffer->bo, &buffer->offset);
> +                        &buffer->bo, &buffer->offset);
>   
>         buffer->stride = 0;
>         return;
> @@ -480,9 +479,8 @@ brw_prepare_vertices(struct brw_context *brw)
>                             glarray->_ElementSize);
>                  }
>               }
> -            buffer->bo = intel_bufferobj_buffer(brw, intel_buffer,
> -                                                offset, size);
> -            drm_intel_bo_reference(buffer->bo);
> +            buffer->bo = brw_bo_get(intel_bufferobj_buffer(brw, intel_buffer,
> +							   offset, size));
>   
>   	    input->buffer = j++;
>   	    input->offset = 0;
> @@ -596,7 +594,7 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw)
>      /* For non-indirect draws, upload gl_BaseVertex. */
>      if (brw->vs.prog_data->uses_vertexid && brw->draw.draw_params_bo == NULL) {
>         intel_upload_data(brw, &brw->draw.gl_basevertex, 4, 4,
> -			&brw->draw.draw_params_bo,
> +                        &brw->draw.draw_params_bo,
>                           &brw->draw.draw_params_offset);
>      }
>   }
> @@ -607,7 +605,7 @@ brw_prepare_shader_draw_parameters(struct brw_context *brw)
>   static void
>   emit_vertex_buffer_state(struct brw_context *brw,
>                            unsigned buffer_nr,
> -                         drm_intel_bo *bo,
> +                         struct brw_bo *bo,
>                            unsigned bo_ending_address,
>                            unsigned bo_offset,
>                            unsigned stride,
> @@ -860,7 +858,7 @@ static void brw_upload_indices(struct brw_context *brw)
>      struct gl_context *ctx = &brw->ctx;
>      const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
>      GLuint ib_size;
> -   drm_intel_bo *old_bo = brw->ib.bo;
> +   struct brw_bo *old_bo = brw->ib.bo;
>      struct gl_buffer_object *bufferobj;
>      GLuint offset;
>      GLuint ib_type_size;
> @@ -878,7 +876,7 @@ static void brw_upload_indices(struct brw_context *brw)
>         /* Get new bufferobj, offset:
>          */
>         intel_upload_data(brw, index_buffer->ptr, ib_size, ib_type_size,
> -			&brw->ib.bo, &offset);
> +                        &brw->ib.bo, &offset);
>      } else {
>         offset = (GLuint) (unsigned long) index_buffer->ptr;
>   
> @@ -901,13 +899,12 @@ static void brw_upload_indices(struct brw_context *brw)
>   
>            ctx->Driver.UnmapBuffer(ctx, bufferobj, MAP_INTERNAL);
>         } else {
> -         drm_intel_bo *bo =
> +         struct brw_bo *bo =
>               intel_bufferobj_buffer(brw, intel_buffer_object(bufferobj),
>                                      offset, ib_size);
>            if (bo != brw->ib.bo) {
> -            drm_intel_bo_unreference(brw->ib.bo);
> -            brw->ib.bo = bo;
> -            drm_intel_bo_reference(bo);
> +            brw_bo_put(brw->ib.bo);
> +            brw->ib.bo = brw_bo_get(bo);
>            }
>         }
>      }
> diff --git a/src/mesa/drivers/dri/i965/brw_ff_gs.c b/src/mesa/drivers/dri/i965/brw_ff_gs.c
> index f72f37f..bd452c3 100644
> --- a/src/mesa/drivers/dri/i965/brw_ff_gs.c
> +++ b/src/mesa/drivers/dri/i965/brw_ff_gs.c
> @@ -34,8 +34,6 @@
>   #include "main/enums.h"
>   #include "main/transformfeedback.h"
>   
> -#include "intel_batchbuffer.h"
> -
>   #include "brw_defines.h"
>   #include "brw_context.h"
>   #include "brw_eu.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c b/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c
> index 50bda61..56a29b4 100644
> --- a/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c
> +++ b/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c
> @@ -35,7 +35,6 @@
>   #include "main/enums.h"
>   
>   #include "program/program.h"
> -#include "intel_batchbuffer.h"
>   
>   #include "brw_defines.h"
>   #include "brw_context.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 189da1d..5a0fef8 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -3928,8 +3928,7 @@ brw_wm_fs_emit(struct brw_context *brw,
>      double start_time = 0;
>   
>      if (unlikely(brw->perf_debug)) {
> -      start_busy = (brw->batch.last_bo &&
> -                    drm_intel_bo_busy(brw->batch.last_bo));
> +      start_busy = brw_batch_busy(&brw->batch);
>         start_time = get_time();
>      }
>   
> @@ -4015,7 +4014,7 @@ brw_wm_fs_emit(struct brw_context *brw,
>            brw_wm_debug_recompile(brw, prog, key);
>         shader->compiled_once = true;
>   
> -      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
> +      if (start_busy && !brw_batch_busy(&brw->batch)) {
>            perf_debug("FS compile took %.03f ms and stalled the GPU\n",
>                       (get_time() - start_time) * 1000);
>         }
> diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
> index 5b8191c..a571a74 100644
> --- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
> +++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
> @@ -49,7 +49,6 @@
>   #include "brw_context.h"
>   #include "brw_draw.h"
>   #include "intel_fbo.h"
> -#include "intel_batchbuffer.h"
>   
>   #include "brw_blorp.h"
>   
> @@ -623,7 +622,10 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
>       *     write-flush must be issued before sending any DRAW commands on that
>       *     render target.
>       */
> -   brw_emit_mi_flush(brw);
> +   if (fast_clear_buffers && brw_batch_begin(&brw->batch, 60, RENDER_RING) >= 0) {
> +      brw_emit_mi_flush(brw);
> +      brw_batch_end(&brw->batch);
> +   }
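
Same idiom as the other standalone flushes in this patch: every
emitter now brackets its commands, so a bare brw_emit_mi_flush() grows
a begin/end wrapper:

   if (brw_batch_begin(&brw->batch, 60, RENDER_RING) >= 0) {
      brw_emit_mi_flush(brw);
      brw_batch_end(&brw->batch);
   }

If brw_batch_begin() fails, though, we skip a flush that the spec
quote above says must precede any DRAW to the fast-cleared target -
is dropping it on error safe, or should the error propagate?
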
>   
>      /* If we had to fall back to plain clear for any buffers, clear those now
>       * by calling into meta.
> @@ -677,7 +679,10 @@ brw_meta_resolve_color(struct brw_context *brw,
>      GLuint fbo, rbo;
>      struct rect rect;
>   
> -   brw_emit_mi_flush(brw);
> +   if (brw_batch_begin(&brw->batch, 60, RENDER_RING) >= 0) {
> +      brw_emit_mi_flush(brw);
> +      brw_batch_end(&brw->batch);
> +   }
>   
>      _mesa_meta_begin(ctx, MESA_META_ALL);
>   
> diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
> index d4abfe6..6a35163 100644
> --- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
> +++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
> @@ -43,7 +43,6 @@
>    */
>   
>   #include "brw_context.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_fbo.h"
>   
>   #include "main/blit.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_meta_updownsample.c b/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
> index f39d50a..37a0968 100644
> --- a/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
> +++ b/src/mesa/drivers/dri/i965/brw_meta_updownsample.c
> @@ -22,7 +22,6 @@
>    */
>   
>   #include "brw_context.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_fbo.h"
>   
>   #include "main/blit.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
> index 1bbb16c..5dd45da 100644
> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
> @@ -31,9 +31,9 @@
>   
>   
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_fbo.h"
>   #include "intel_mipmap_tree.h"
> +#include "intel_reg.h"
>   
>   #include "brw_context.h"
>   #include "brw_state.h"
> @@ -543,10 +543,10 @@ brw_emit_depthbuffer(struct brw_context *brw)
>         height = stencil_irb->Base.Base.Height;
>      }
>   
> -   if (depth_mt)
> -      brw_render_cache_set_check_flush(brw, depth_mt->bo);
> -   if (stencil_mt)
> -      brw_render_cache_set_check_flush(brw, stencil_mt->bo);
> +   if (depth_mt && depth_mt->bo->dirty)
> +      brw_emit_mi_flush(brw);
> +   if (stencil_mt && stencil_mt->bo->dirty)
> +      brw_emit_mi_flush(brw);
>   
>      brw->vtbl.emit_depth_stencil_hiz(brw, depth_mt, depth_offset,
>                                       depthbuffer_format, depth_surface_type,
> diff --git a/src/mesa/drivers/dri/i965/brw_object_purgeable.c b/src/mesa/drivers/dri/i965/brw_object_purgeable.c
> index 20f66f2..3240ee3 100644
> --- a/src/mesa/drivers/dri/i965/brw_object_purgeable.c
> +++ b/src/mesa/drivers/dri/i965/brw_object_purgeable.c
> @@ -38,12 +38,12 @@
>   #include "intel_mipmap_tree.h"
>   
>   static GLenum
> -intel_buffer_purgeable(drm_intel_bo *buffer)
> +intel_buffer_purgeable(struct brw_bo *buffer)
>   {
>      int retained = 0;
>   
>      if (buffer != NULL)
> -      retained = drm_intel_bo_madvise(buffer, I915_MADV_DONTNEED);
> +      retained = brw_bo_madvise(buffer, I915_MADV_DONTNEED);
>   
>      return retained ? GL_VOLATILE_APPLE : GL_RELEASED_APPLE;
>   }
> @@ -101,13 +101,13 @@ intel_render_object_purgeable(struct gl_context * ctx,
>   }
>   
>   static GLenum
> -intel_buffer_unpurgeable(drm_intel_bo *buffer)
> +intel_buffer_unpurgeable(struct brw_bo *buffer)
>   {
>      int retained;
>   
>      retained = 0;
>      if (buffer != NULL)
> -      retained = drm_intel_bo_madvise(buffer, I915_MADV_WILLNEED);
> +      retained = brw_bo_madvise(buffer, I915_MADV_WILLNEED);
>   
>      return retained ? GL_RETAINED_APPLE : GL_UNDEFINED_APPLE;
>   }
> diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
> index 0a12375..fd94348 100644
> --- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
> +++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
> @@ -54,7 +54,8 @@
>   
>   #include "brw_context.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
> +
> +#include "intel_reg.h"
>   
>   #define FILE_DEBUG_FLAG DEBUG_PERFMON
>   
> @@ -69,7 +70,7 @@ struct brw_perf_monitor_object
>      /**
>       * BO containing OA counter snapshots at monitor Begin/End time.
>       */
> -   drm_intel_bo *oa_bo;
> +   struct brw_bo *oa_bo;
>   
>      /** Indexes into bookend_bo (snapshot numbers) for various segments. */
>      int oa_head_end;
> @@ -90,7 +91,7 @@ struct brw_perf_monitor_object
>       * BO containing starting and ending snapshots for any active pipeline
>       * statistics counters.
>       */
> -   drm_intel_bo *pipeline_stats_bo;
> +   struct brw_bo *pipeline_stats_bo;
>   
>      /**
>       * Storage for final pipeline statistics counter results.
> @@ -615,15 +616,13 @@ gather_statistics_results(struct brw_context *brw,
>         return;
>      }
>   
> -   drm_intel_bo_map(monitor->pipeline_stats_bo, false);
> -   uint64_t *start = monitor->pipeline_stats_bo->virtual;
> +   uint64_t *start = brw_bo_map(monitor->pipeline_stats_bo, MAP_READ);
>      uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t));
>   
>      for (int i = 0; i < num_counters; i++) {
>         monitor->pipeline_stats_results[i] = end[i] - start[i];
>      }
> -   drm_intel_bo_unmap(monitor->pipeline_stats_bo);
> -   drm_intel_bo_unreference(monitor->pipeline_stats_bo);
> +   brw_bo_put(monitor->pipeline_stats_bo);
>      monitor->pipeline_stats_bo = NULL;
>   }
>   
> @@ -701,16 +700,21 @@ stop_oa_counters(struct brw_context *brw)
>    */
>   static void
>   emit_mi_report_perf_count(struct brw_context *brw,
> -                          drm_intel_bo *bo,
> +                          struct brw_bo *bo,
>                             uint32_t offset_in_bytes,
>                             uint32_t report_id)
>   {
>      assert(offset_in_bytes % 64 == 0);
>   
>      /* Make sure the commands to take a snapshot fits in a single batch. */
> -   intel_batchbuffer_require_space(brw, MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
> -                                   RENDER_RING);
> -   int batch_used = brw->batch.used;
> +   if (brw_batch_begin(&brw->batch,
> +                       MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
> +                       RENDER_RING) < 0)
> +      return;
> +
> +   /* If the OA counters aren't already on, enable them. */
> +   if (brw->perfmon.oa_users == 0)
> +      start_oa_counters(brw);
>   
>      /* Reports apparently don't always get written unless we flush first. */
>      brw_emit_mi_flush(brw);
> @@ -752,9 +756,7 @@ emit_mi_report_perf_count(struct brw_context *brw,
>   
>      /* Reports apparently don't always get written unless we flush after. */
>      brw_emit_mi_flush(brw);
> -
> -   (void) batch_used;
> -   assert(brw->batch.used - batch_used <= MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4);
> +   brw_batch_end(&brw->batch);
>   }
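
Pulling start_oa_counters() inside the begin/end section neatly
replaces the old require_space trick: the OACONTROL enable and the
snapshot are now emitted in one section, so they can no longer be
split across a batch wrap. But the early return means a monitor can
silently lose its snapshot:

   if (brw_batch_begin(&brw->batch,
                       MI_REPORT_PERF_COUNT_BATCH_DWORDS * 4,
                       RENDER_RING) < 0)
      return;   /* snapshot dropped - perf_debug() here, maybe? */
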
>   
>   /**
> @@ -892,8 +894,7 @@ gather_oa_results(struct brw_context *brw,
>      struct gl_perf_monitor_object *m = &monitor->base;
>      assert(monitor->oa_bo != NULL);
>   
> -   drm_intel_bo_map(monitor->oa_bo, false);
> -   uint32_t *monitor_buffer = monitor->oa_bo->virtual;
> +   uint32_t *monitor_buffer = brw_bo_map(monitor->oa_bo, MAP_READ);
>   
>      /* If monitoring was entirely contained within a single batch, then the
>       * bookend BO is irrelevant.  Just subtract monitor->bo's two snapshots.
> @@ -903,7 +904,6 @@ gather_oa_results(struct brw_context *brw,
>                    monitor_buffer,
>                    monitor_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES /
>                                      sizeof(uint32_t)));
> -      drm_intel_bo_unmap(monitor->oa_bo);
>         return;
>      }
>   
> @@ -950,13 +950,11 @@ gather_oa_results(struct brw_context *brw,
>                                      sizeof(uint32_t)));
>      }
>   
> -   drm_intel_bo_unmap(monitor->oa_bo);
> -
>      /* If the monitor has ended, then we've gathered all the results, and
>       * can free the monitor's OA BO.
>       */
>      if (m->Ended) {
> -      drm_intel_bo_unreference(monitor->oa_bo);
> +      brw_bo_put(monitor->oa_bo);
>         monitor->oa_bo = NULL;
>   
>         /* The monitor's OA result is now resolved. */
> @@ -989,8 +987,7 @@ wrap_bookend_bo(struct brw_context *brw)
>       */
>      assert(brw->perfmon.oa_users > 0);
>   
> -   drm_intel_bo_map(brw->perfmon.bookend_bo, false);
> -   uint32_t *bookend_buffer = brw->perfmon.bookend_bo->virtual;
> +   uint32_t *bookend_buffer = brw_bo_map(brw->perfmon.bookend_bo, MAP_READ);
>      for (int i = 0; i < brw->perfmon.unresolved_elements; i++) {
>         struct brw_perf_monitor_object *monitor = brw->perfmon.unresolved[i];
>         struct gl_perf_monitor_object *m = &monitor->base;
> @@ -1011,7 +1008,6 @@ wrap_bookend_bo(struct brw_context *brw)
>            assert(monitor->oa_tail_start == -1);
>         }
>      }
> -   drm_intel_bo_unmap(brw->perfmon.bookend_bo);
>   
>      brw->perfmon.bookend_snapshots = 0;
>   }
> @@ -1060,7 +1056,7 @@ reinitialize_perf_monitor(struct brw_context *brw,
>                             struct brw_perf_monitor_object *monitor)
>   {
>      if (monitor->oa_bo) {
> -      drm_intel_bo_unreference(monitor->oa_bo);
> +      brw_bo_put(monitor->oa_bo);
>         monitor->oa_bo = NULL;
>      }
>   
> @@ -1077,7 +1073,7 @@ reinitialize_perf_monitor(struct brw_context *brw,
>      monitor->oa_results = NULL;
>   
>      if (monitor->pipeline_stats_bo) {
> -      drm_intel_bo_unreference(monitor->pipeline_stats_bo);
> +      brw_bo_put(monitor->pipeline_stats_bo);
>         monitor->pipeline_stats_bo = NULL;
>      }
>   
> @@ -1105,34 +1101,25 @@ brw_begin_perf_monitor(struct gl_context *ctx,
>          * wasting memory for contexts that don't use performance monitors.
>          */
>         if (!brw->perfmon.bookend_bo) {
> -         brw->perfmon.bookend_bo = drm_intel_bo_alloc(brw->bufmgr,
> -                                                      "OA bookend BO",
> -                                                      BOOKEND_BO_SIZE_BYTES, 64);
> +	 brw->perfmon.bookend_bo = brw_bo_create(&brw->batch,
> +						 "OA bookend BO",
> +						 BOOKEND_BO_SIZE_BYTES, 0, 0);
>         }
>   
>         monitor->oa_bo =
> -         drm_intel_bo_alloc(brw->bufmgr, "perf. monitor OA bo", 4096, 64);
> +         brw_bo_create(&brw->batch, "perf. monitor OA bo", 4096, 0, 0);
>   #ifdef DEBUG
>         /* Pre-filling the BO helps debug whether writes landed. */
> -      drm_intel_bo_map(monitor->oa_bo, true);
> -      memset((char *) monitor->oa_bo->virtual, 0xff, 4096);
> -      drm_intel_bo_unmap(monitor->oa_bo);
> +      memset(brw_bo_map(monitor->oa_bo, MAP_WRITE), 0xff, 4096);
>   #endif
>   
>         /* Allocate storage for accumulated OA counter values. */
>         monitor->oa_results =
>            calloc(brw->perfmon.entries_per_oa_snapshot, sizeof(uint32_t));
>   
> -      /* If the OA counters aren't already on, enable them. */
> -      if (brw->perfmon.oa_users == 0) {
> -         /* Ensure the OACONTROL enable and snapshot land in the same batch. */
> -         int space = (MI_REPORT_PERF_COUNT_BATCH_DWORDS + 3) * 4;
> -         intel_batchbuffer_require_space(brw, space, RENDER_RING);
> -         start_oa_counters(brw);
> -      }
> -
>         /* Take a starting OA counter snapshot. */
>         emit_mi_report_perf_count(brw, monitor->oa_bo, 0, REPORT_ID);
> +      brw->perfmon.oa_users++;
>   
>         monitor->oa_head_end = brw->perfmon.bookend_snapshots;
>         monitor->oa_middle_start = brw->perfmon.bookend_snapshots + 1;
> @@ -1140,13 +1127,11 @@ brw_begin_perf_monitor(struct gl_context *ctx,
>   
>         /* Add the monitor to the unresolved list. */
>         add_to_unresolved_monitor_list(brw, monitor);
> -
> -      ++brw->perfmon.oa_users;
>      }
>   
>      if (monitor_needs_statistics_registers(brw, m)) {
>         monitor->pipeline_stats_bo =
> -         drm_intel_bo_alloc(brw->bufmgr, "perf. monitor stats bo", 4096, 64);
> +         brw_bo_create(&brw->batch, "perf. monitor stats bo", 4096, 0, 0);
>   
>         /* Take starting snapshots. */
>         snapshot_statistics_registers(brw, monitor, 0);
> @@ -1238,15 +1223,11 @@ brw_is_perf_monitor_result_available(struct gl_context *ctx,
>      bool stats_available = true;
>   
>      if (monitor_needs_oa(brw, m)) {
> -      oa_available = !monitor->oa_bo ||
> -         (!drm_intel_bo_references(brw->batch.bo, monitor->oa_bo) &&
> -          !drm_intel_bo_busy(monitor->oa_bo));
> +      oa_available = !brw_bo_busy(monitor->oa_bo, BUSY_READ);
>      }
>   
>      if (monitor_needs_statistics_registers(brw, m)) {
> -      stats_available = !monitor->pipeline_stats_bo ||
> -         (!drm_intel_bo_references(brw->batch.bo, monitor->pipeline_stats_bo) &&
> -          !drm_intel_bo_busy(monitor->pipeline_stats_bo));
> +      stats_available = !brw_bo_busy(monitor->pipeline_stats_bo, BUSY_READ);
>      }
>   
>      return oa_available && stats_available;
> @@ -1293,11 +1274,9 @@ brw_get_perf_monitor_result(struct gl_context *ctx,
>             * Using an unsynchronized mapping avoids stalling for an
>             * indeterminate amount of time.
>             */
> -         drm_intel_gem_bo_map_unsynchronized(brw->perfmon.bookend_bo);
> -
> -         gather_oa_results(brw, monitor, brw->perfmon.bookend_bo->virtual);
> -
> -         drm_intel_bo_unmap(brw->perfmon.bookend_bo);
> +         gather_oa_results(brw, monitor,
> +			   brw_bo_map(brw->perfmon.bookend_bo,
> +				      MAP_READ | MAP_ASYNC));
>         }
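
The mapping change is my favourite part of this hunk: brw_bo_map(bo,
MAP_READ | MAP_ASYNC) replaces drm_intel_gem_bo_map_unsynchronized()
plus the map/unmap pair, and no unmap remains anywhere in the patch,
so I assume mappings are cached for the lifetime of the bo. The three
variants I can see in use (sketch):

   uint64_t *r = brw_bo_map(bo, MAP_READ);             /* may stall */
   void *w     = brw_bo_map(bo, MAP_WRITE);            /* for filling */
   void *a     = brw_bo_map(bo, MAP_READ | MAP_ASYNC); /* no stall */

Can brw_bo_map() return NULL? gather_statistics_results() above
dereferences the result unconditionally.
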
>   
>         for (int i = 0; i < brw->perfmon.entries_per_oa_snapshot; i++) {
> @@ -1386,7 +1365,6 @@ void
>   brw_perf_monitor_new_batch(struct brw_context *brw)
>   {
>      assert(brw->batch.ring == RENDER_RING);
> -   assert(brw->gen < 6 || brw->batch.used == 0);
>   
>      if (brw->perfmon.oa_users == 0)
>         return;
> diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
> index 05e14cd..c64fbb1 100644
> --- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
> +++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
> @@ -22,7 +22,7 @@
>    */
>   
>   #include "brw_context.h"
> -#include "intel_batchbuffer.h"
> +
>   #include "intel_fbo.h"
>   #include "intel_reg.h"
>   
> @@ -135,7 +135,7 @@ brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
>    */
>   void
>   brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
> -                            drm_intel_bo *bo, uint32_t offset,
> +                            struct brw_bo *bo, uint32_t offset,
>                               uint32_t imm_lower, uint32_t imm_upper)
>   {
>      if (brw->gen >= 8) {
> @@ -289,7 +289,15 @@ brw_emit_post_sync_nonzero_flush(struct brw_context *brw)
>   void
>   brw_emit_mi_flush(struct brw_context *brw)
>   {
> -   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
> +   if (brw->batch.emit.nbatch == 0)
> +      return;
> +
> +   if (brw->batch.state - brw->batch.emit.nbatch < 20) {
> +      brw_batch_flush(&brw->batch);
> +      return;
> +   }
> +
> +   if (brw->batch.ring == BLT_RING) {
>         BEGIN_BATCH_BLT(4);
>         OUT_BATCH(MI_FLUSH_DW);
>         OUT_BATCH(0);
> @@ -327,7 +335,8 @@ brw_emit_mi_flush(struct brw_context *brw)
>         brw_emit_pipe_control_flush(brw, flags);
>      }
>   
> -   brw_render_cache_set_clear(brw);
> +   brw_batch_clear_dirty(&brw->batch);
> +   brw->emitted_mi_flush = 1;
>   }
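
Two things here. The early-out plus flush-if-nearly-full check reads
as: skip the flush when nothing has been emitted yet, and if fewer
than 20 dwords remain between the command stream and the state
allocations, submit instead of emitting inline. If that's right, a
named constant plus a comment would help:

   /* dwords left between commands (growing up) and indirect state
    * (allocated from the top) - my reading of the new layout */
   if (brw->batch.state - brw->batch.emit.nbatch < 20) /* magic 20? */
      brw_batch_flush(&brw->batch);

And dropping the gen >= 6 guard on the BLT_RING path looks deliberate
(pre-gen6 has no BLT ring, so the batch manager presumably never
selects it there), but a line in the commit message would save the
next archaeologist some digging.
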
>   
>   int
> @@ -341,12 +350,11 @@ brw_init_pipe_control(struct brw_context *brw,
>       * the gen6 workaround because it involves actually writing to
>       * the buffer, and the kernel doesn't let us write to the batch.
>       */
> -   brw->workaround_bo = brw->intelScreen->workaround_bo;
> +   brw->workaround_bo =
> +      brw_bo_import(&brw->batch, brw->intelScreen->workaround_bo, true);
>      if (brw->workaround_bo == NULL)
>         return -ENOMEM;
>   
> -   drm_intel_bo_reference(brw->workaround_bo);
> -
>      brw->pipe_controls_since_last_cs_stall = 0;
>   
>      return 0;
> @@ -355,5 +363,5 @@ brw_init_pipe_control(struct brw_context *brw,
>   void
>   brw_fini_pipe_control(struct brw_context *brw)
>   {
> -   drm_intel_bo_unreference(brw->workaround_bo);
> +   brw_bo_put(brw->workaround_bo);
>   }
> diff --git a/src/mesa/drivers/dri/i965/brw_primitive_restart.c b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
> index 2c7a7e8..163d8a2 100644
> --- a/src/mesa/drivers/dri/i965/brw_primitive_restart.c
> +++ b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
> @@ -33,8 +33,6 @@
>   #include "brw_defines.h"
>   #include "brw_draw.h"
>   
> -#include "intel_batchbuffer.h"
> -
>   /**
>    * Check if the hardware's cut index support can handle the primitive
>    * restart index value (pre-Haswell only).
> diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
> index 85e271d..3c047e2 100644
> --- a/src/mesa/drivers/dri/i965/brw_program.c
> +++ b/src/mesa/drivers/dri/i965/brw_program.c
> @@ -45,7 +45,8 @@
>   #include "brw_shader.h"
>   #include "brw_nir.h"
>   #include "brw_wm.h"
> -#include "intel_batchbuffer.h"
> +
> +#include "intel_reg.h"
>   
>   static unsigned
>   get_new_program_id(struct intel_screen *screen)
> @@ -259,17 +260,17 @@ brw_get_scratch_size(int size)
>   
>   void
>   brw_get_scratch_bo(struct brw_context *brw,
> -		   drm_intel_bo **scratch_bo, int size)
> +		   struct brw_bo **scratch_bo, int size)
>   {
> -   drm_intel_bo *old_bo = *scratch_bo;
> +   struct brw_bo *old_bo = *scratch_bo;
>   
>      if (old_bo && old_bo->size < size) {
> -      drm_intel_bo_unreference(old_bo);
> +      brw_bo_put(old_bo);
>         old_bo = NULL;
>      }
>   
>      if (!old_bo) {
> -      *scratch_bo = drm_intel_bo_alloc(brw->bufmgr, "scratch bo", size, 4096);
> +      *scratch_bo = brw_bo_create(&brw->batch, "scratch bo", size, 4096, 0);
>      }
>   }
>   
> @@ -297,9 +298,9 @@ void
>   brw_init_shader_time(struct brw_context *brw)
>   {
>      const int max_entries = 2048;
> -   brw->shader_time.bo =
> -      drm_intel_bo_alloc(brw->bufmgr, "shader time",
> -                         max_entries * SHADER_TIME_STRIDE * 3, 4096);
> +   brw->shader_time.bo = brw_bo_create(&brw->batch, "shader time",
> +				       max_entries * SHADER_TIME_STRIDE * 3,
> +				       4096, 0);
>      brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
>      brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
>      brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
> @@ -462,8 +463,7 @@ brw_collect_shader_time(struct brw_context *brw)
>       * delaying reading the reports, but it doesn't look like it's a big
>       * overhead compared to the cost of tracking the time in the first place.
>       */
> -   drm_intel_bo_map(brw->shader_time.bo, true);
> -   void *bo_map = brw->shader_time.bo->virtual;
> +   void *bo_map = brw_bo_map(brw->shader_time.bo, MAP_WRITE);
>   
>      for (int i = 0; i < brw->shader_time.num_entries; i++) {
>         uint32_t *times = bo_map + i * 3 * SHADER_TIME_STRIDE;
> @@ -476,7 +476,6 @@ brw_collect_shader_time(struct brw_context *brw)
>      /* Zero the BO out to clear it out for our next collection.
>       */
>      memset(bo_map, 0, brw->shader_time.bo->size);
> -   drm_intel_bo_unmap(brw->shader_time.bo);
>   }
>   
>   void
> @@ -529,7 +528,7 @@ brw_get_shader_time_index(struct brw_context *brw,
>   void
>   brw_destroy_shader_time(struct brw_context *brw)
>   {
> -   drm_intel_bo_unreference(brw->shader_time.bo);
> +   brw_bo_put(brw->shader_time.bo);
>      brw->shader_time.bo = NULL;
>   }
>   
> diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
> index aea4d9b..4d248d7 100644
> --- a/src/mesa/drivers/dri/i965/brw_queryobj.c
> +++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
> @@ -40,15 +40,17 @@
>   #include "brw_context.h"
>   #include "brw_defines.h"
>   #include "brw_state.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_reg.h"
>   
>   /**
>    * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer.
>    */
>   void
> -brw_write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
> +brw_write_timestamp(struct brw_context *brw, struct brw_bo *query_bo, int idx)
>   {
> +   if (brw_batch_begin(&brw->batch, 60, RENDER_RING) < 0)
> +      return;
> +
>      if (brw->gen == 6) {
>         /* Emit Sandybridge workaround flush: */
>         brw_emit_pipe_control_flush(brw,
> @@ -58,13 +60,15 @@ brw_write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
>   
>      brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_TIMESTAMP,
>                                  query_bo, idx * sizeof(uint64_t), 0, 0);
> +
> +   brw_batch_end(&brw->batch);
>   }
>   
>   /**
>    * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer.
>    */
>   void
> -brw_write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
> +brw_write_depth_count(struct brw_context *brw, struct brw_bo *query_bo, int idx)
>   {
>      uint32_t flags;
>   
> @@ -78,8 +82,13 @@ brw_write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
>      if (brw->predicate.supported)
>         flags |= PIPE_CONTROL_FLUSH_ENABLE;
>   
> +   if (brw_batch_begin(&brw->batch, 40, RENDER_RING) < 0)
> +      return;
> +
>      brw_emit_pipe_control_write(brw, flags, query_bo,
>                                  idx * sizeof(uint64_t), 0, 0);
> +
> +   brw_batch_end(&brw->batch);
>   }
>   
>   /**
> @@ -89,12 +98,10 @@ static void
>   brw_queryobj_get_results(struct gl_context *ctx,
>   			 struct brw_query_object *query)
>   {
> -   struct brw_context *brw = brw_context(ctx);
> -
>      int i;
>      uint64_t *results;
>   
> -   assert(brw->gen < 6);
> +   assert(brw_context(ctx)->gen < 6);
>   
>      if (query->bo == NULL)
>         return;
> @@ -103,17 +110,7 @@ brw_queryobj_get_results(struct gl_context *ctx,
>       * still contributing to it, flush it now so the results will be present
>       * when mapped.
>       */
> -   if (drm_intel_bo_references(brw->batch.bo, query->bo))
> -      intel_batchbuffer_flush(brw);
> -
> -   if (unlikely(brw->perf_debug)) {
> -      if (drm_intel_bo_busy(query->bo)) {
> -         perf_debug("Stalling on the GPU waiting for a query object.\n");
> -      }
> -   }
> -
> -   drm_intel_bo_map(query->bo, false);
> -   results = query->bo->virtual;
> +   results = brw_bo_map(query->bo, MAP_READ);
>      switch (query->Base.Target) {
>      case GL_TIME_ELAPSED_EXT:
>         /* The query BO contains the starting and ending timestamps.
> @@ -159,12 +156,11 @@ brw_queryobj_get_results(struct gl_context *ctx,
>      default:
>         unreachable("Unrecognized query target in brw_queryobj_get_results()");
>      }
> -   drm_intel_bo_unmap(query->bo);
>   
>      /* Now that we've processed the data stored in the query's buffer object,
>       * we can release it.
>       */
> -   drm_intel_bo_unreference(query->bo);
> +   brw_bo_put(query->bo);
>      query->bo = NULL;
>   }
>   
> @@ -196,7 +192,7 @@ brw_delete_query(struct gl_context *ctx, struct gl_query_object *q)
>   {
>      struct brw_query_object *query = (struct brw_query_object *)q;
>   
> -   drm_intel_bo_unreference(query->bo);
> +   brw_bo_put(query->bo);
>      free(query);
>   }
>   
> @@ -235,8 +231,8 @@ brw_begin_query(struct gl_context *ctx, struct gl_query_object *q)
>          * obtain the time elapsed.  Notably, this includes time elapsed while
>          * the system was doing other work, such as running other applications.
>          */
> -      drm_intel_bo_unreference(query->bo);
> -      query->bo = drm_intel_bo_alloc(brw->bufmgr, "timer query", 4096, 4096);
> +      brw_bo_put(query->bo);
> +      query->bo = brw_bo_create(&brw->batch, "timer query", 4096, 4096, 0);
>         brw_write_timestamp(brw, query->bo, 0);
>         break;
>   
> @@ -250,7 +246,7 @@ brw_begin_query(struct gl_context *ctx, struct gl_query_object *q)
>          * Since we're starting a new query, we need to be sure to throw away
>          * any previous occlusion query results.
>          */
> -      drm_intel_bo_unreference(query->bo);
> +      brw_bo_put(query->bo);
>         query->bo = NULL;
>         query->last_index = -1;
>   
> @@ -350,10 +346,9 @@ static void brw_wait_query(struct gl_context *ctx, struct gl_query_object *q)
>    */
>   static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q)
>   {
> -   struct brw_context *brw = brw_context(ctx);
>      struct brw_query_object *query = (struct brw_query_object *)q;
>   
> -   assert(brw->gen < 6);
> +   assert(brw_context(ctx)->gen < 6);
>   
>      /* From the GL_ARB_occlusion_query spec:
>       *
> @@ -362,10 +357,7 @@ static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q)
>       *      not ready yet on the first time it is queried.  This ensures that
>       *      the async query will return true in finite time.
>       */
> -   if (query->bo && drm_intel_bo_references(brw->batch.bo, query->bo))
> -      intel_batchbuffer_flush(brw);
> -
> -   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
> +   if (!brw_bo_busy(query->bo, BUSY_READ | BUSY_FLUSH)) {
>         brw_queryobj_get_results(ctx, query);
>         query->Base.Ready = true;
>      }
> @@ -394,7 +386,7 @@ ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query)
>            brw_queryobj_get_results(ctx, query);
>         }
>   
> -      query->bo = drm_intel_bo_alloc(brw->bufmgr, "query", 4096, 1);
> +      query->bo = brw_bo_create(&brw->batch, "query", 4096, 0, 0);
>         query->last_index = 0;
>      }
>   }
> @@ -425,7 +417,7 @@ brw_emit_query_begin(struct brw_context *brw)
>      struct gl_context *ctx = &brw->ctx;
>      struct brw_query_object *query = brw->query.obj;
>   
> -   if (brw->hw_ctx)
> +   if (brw->batch.hw_ctx)
>         return;
>   
>      /* Skip if we're not doing any queries, or we've already recorded the
> @@ -452,7 +444,7 @@ brw_emit_query_end(struct brw_context *brw)
>   {
>      struct brw_query_object *query = brw->query.obj;
>   
> -   if (brw->hw_ctx)
> +   if (brw->batch.hw_ctx)
>         return;
>   
>      if (!brw->query.begin_emitted)
> @@ -479,11 +471,10 @@ brw_query_counter(struct gl_context *ctx, struct gl_query_object *q)
>   
>      assert(q->Target == GL_TIMESTAMP);
>   
> -   drm_intel_bo_unreference(query->bo);
> -   query->bo = drm_intel_bo_alloc(brw->bufmgr, "timestamp query", 4096, 4096);
> -   brw_write_timestamp(brw, query->bo, 0);
> +   brw_bo_put(query->bo);
> +   query->bo = brw_bo_create(&brw->batch, "timestamp query", 4096, 0, 0);
>   
> -   query->flushed = false;
> +   brw_write_timestamp(brw, query->bo, 0);
>   }
>   
>   /**
> @@ -495,9 +486,10 @@ static uint64_t
>   brw_get_timestamp(struct gl_context *ctx)
>   {
>      struct brw_context *brw = brw_context(ctx);
> -   uint64_t result = 0;
> +   uint64_t result;
>   
> -   drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result);
> +   if (drm_intel_reg_read(brw->intelScreen->bufmgr, TIMESTAMP, &result))
> +      return 0;
>   
>      /* See logic in brw_queryobj_get_results() */
>      result = result >> 32;
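
The brw_batch_begin()/brw_batch_end() bracketing around
brw_emit_pipe_control_write() reads well. For other reviewers: as far
as I can tell from this file and brw_upload_initial_gpu_state() later
in the patch, the idiom the batch manager expects is (my sketch; the
40 is the dword estimate reserved for the commands):

   if (brw_batch_begin(&brw->batch, 40, RENDER_RING) < 0)
      return;  /* reservation failed, nothing was emitted */

   /* ... emit commands, all relocations written immediately ... */

   if (brw_batch_end(&brw->batch) < 0)
      return;  /* submission failed */

Two questions on this file: brw_bo_busy(bo, BUSY_READ | BUSY_FLUSH)
presumably treats a NULL bo as idle and flushes the batch if it still
references the bo, replacing the old drm_intel_bo_references() +
intel_batchbuffer_flush() dance - correct? And the perf_debug()
warning about stalling on a query object is gone; is that stall now
reported somewhere inside brw_bo_map(), or do we lose it?
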
> diff --git a/src/mesa/drivers/dri/i965/brw_reset.c b/src/mesa/drivers/dri/i965/brw_reset.c
> index e3182b1..e9b280b 100644
> --- a/src/mesa/drivers/dri/i965/brw_reset.c
> +++ b/src/mesa/drivers/dri/i965/brw_reset.c
> @@ -36,12 +36,6 @@ brw_get_graphics_reset_status(struct gl_context *ctx)
>      uint32_t active;
>      uint32_t pending;
>   
> -   /* If hardware contexts are not being used (or
> -    * DRM_IOCTL_I915_GET_RESET_STATS is not supported), this function should
> -    * not be accessible.
> -    */
> -   assert(brw->hw_ctx != NULL);
> -
>      /* A reset status other than NO_ERROR was returned last time. I915 returns
>       * nonzero active/pending only if reset has been encountered and completed.
>       * Return NO_ERROR from now on.
> @@ -49,8 +43,12 @@ brw_get_graphics_reset_status(struct gl_context *ctx)
>      if (brw->reset_count != 0)
>         return GL_NO_ERROR;
>   
> -   err = drm_intel_get_reset_stats(brw->hw_ctx, &reset_count, &active,
> -                                   &pending);
> +   /* If hardware contexts are not being used (or
> +    * DRM_IOCTL_I915_GET_RESET_STATS is not supported), this function should
> +    * not be accessible.
> +    */
> +   err = brw_batch_get_reset_stats(&brw->batch,
> +				   &reset_count, &active, &pending);
>      if (err)
>         return GL_NO_ERROR;
>   
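
With the assert gone, the comment above the call now hangs in the air:
nothing here enforces that a hardware context exists. I assume
brw_batch_get_reset_stats() fails cleanly in that case, something
along the lines of (hypothetical body, inferred from the call site):

   int
   brw_batch_get_reset_stats(struct brw_batch *batch,
                             uint32_t *reset_count,
                             uint32_t *active, uint32_t *pending)
   {
      if (!batch->hw_ctx)
         return -EINVAL;  /* covers the old assert's intent */

      return drm_intel_get_reset_stats(batch->hw_ctx, reset_count,
                                       active, pending);
   }

If so, maybe reword the comment to say the error path handles the
no-hw-ctx case rather than "should not be accessible".
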
> diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
> index 22ccbfe..a56356b 100644
> --- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
> @@ -39,7 +39,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_mipmap_tree.h"
>   
>   #include "main/macros.h"
> @@ -99,14 +98,13 @@ brw_emit_sampler_state(struct brw_context *brw,
>              SET_FIELD(mag_filter, BRW_SAMPLER_MAG_FILTER) |
>              SET_FIELD(min_filter, BRW_SAMPLER_MIN_FILTER);
>   
> -   ss[2] = border_color_offset;
>      if (brw->gen < 6) {
> -      ss[2] += brw->batch.bo->offset64; /* reloc */
> -      drm_intel_bo_emit_reloc(brw->batch.bo,
> +      ss[2] = brw_batch_reloc(&brw->batch,
>                                 batch_offset_for_sampler_state + 8,
>                                 brw->batch.bo, border_color_offset,
>                                 I915_GEM_DOMAIN_SAMPLER, 0);
> -   }
> +   } else
> +      ss[2] = border_color_offset;
>   
>      ss[3] = SET_FIELD(max_anisotropy, BRW_SAMPLER_MAX_ANISOTROPY) |
>              SET_FIELD(address_rounding, BRW_SAMPLER_ADDRESS_ROUNDING);
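
I like that brw_batch_reloc() folds the old two-step "write the
presumed address now, let execbuf fix it up later" pattern into a
single call. From the call sites it behaves as if (sketch of the
inferred semantics, assuming struct brw_bo keeps a presumed offset64
like drm_intel_bo does; record_relocation() is a made-up name, not the
real implementation):

   uint64_t
   brw_batch_reloc(struct brw_batch *batch, uint32_t batch_offset,
                   struct brw_bo *target, uint64_t delta,
                   unsigned read_domains, unsigned write_domain)
   {
      /* queue the fixup for submission... */
      record_relocation(batch, batch_offset, target, delta,
                        read_domains, write_domain);

      /* ...and return the value to write into the batch right now */
      return target->offset64 + delta;
   }

That also explains why the gen < 6 path no longer needs the explicit
ss[2] += brw->batch.bo->offset64 correction.
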
> diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
> index 872464c..52deb57 100644
> --- a/src/mesa/drivers/dri/i965/brw_sf.c
> +++ b/src/mesa/drivers/dri/i965/brw_sf.c
> @@ -36,8 +36,6 @@
>   #include "main/enums.h"
>   #include "main/fbobject.h"
>   
> -#include "intel_batchbuffer.h"
> -
>   #include "brw_defines.h"
>   #include "brw_context.h"
>   #include "brw_eu.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_sf_emit.c b/src/mesa/drivers/dri/i965/brw_sf_emit.c
> index b3ee5c1..28e2e56 100644
> --- a/src/mesa/drivers/dri/i965/brw_sf_emit.c
> +++ b/src/mesa/drivers/dri/i965/brw_sf_emit.c
> @@ -34,8 +34,6 @@
>   #include "main/macros.h"
>   #include "main/enums.h"
>   
> -#include "intel_batchbuffer.h"
> -
>   #include "brw_defines.h"
>   #include "brw_context.h"
>   #include "brw_eu.h"
> diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
> index 5d98922..076166b 100644
> --- a/src/mesa/drivers/dri/i965/brw_sf_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
> @@ -133,7 +133,6 @@ static void upload_sf_unit( struct brw_context *brw )
>   {
>      struct gl_context *ctx = &brw->ctx;
>      struct brw_sf_unit_state *sf;
> -   drm_intel_bo *bo = brw->batch.bo;
>      int chipset_max_threads;
>      bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
>   
> @@ -179,9 +178,6 @@ static void upload_sf_unit( struct brw_context *brw )
>         sf->thread4.stats_enable = 1;
>   
>      /* BRW_NEW_SF_VP */
> -   sf->sf5.sf_viewport_state_offset = (brw->batch.bo->offset64 +
> -				       brw->sf.vp_offset) >> 5; /* reloc */
> -
>      sf->sf5.viewport_transform = 1;
>   
>      /* _NEW_SCISSOR */
> @@ -200,6 +196,15 @@ static void upload_sf_unit( struct brw_context *brw )
>       */
>      sf->sf5.front_winding ^= render_to_fbo;
>   
> +   sf->sf5.sf_viewport_state_offset =
> +      brw_batch_reloc(&brw->batch,
> +		      (brw->sf.state_offset + offsetof(struct brw_sf_unit_state, sf5)),
> +		      brw->batch.bo,
> +		      brw->sf.vp_offset | sf->dw5,
> +		      I915_GEM_DOMAIN_INSTRUCTION, 0) >> 5;
> +
>      /* _NEW_POLYGON */
>      switch (ctx->Polygon.CullFlag ? ctx->Polygon.CullFaceMode : GL_NONE) {
>      case GL_FRONT:
> @@ -290,14 +295,6 @@ static void upload_sf_unit( struct brw_context *brw )
>       * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
>       */
>   
> -   /* Emit SF viewport relocation */
> -   drm_intel_bo_emit_reloc(bo, (brw->sf.state_offset +
> -				offsetof(struct brw_sf_unit_state, sf5)),
> -			   brw->batch.bo, (brw->sf.vp_offset |
> -					     sf->sf5.front_winding |
> -					     (sf->sf5.viewport_transform << 1)),
> -			   I915_GEM_DOMAIN_INSTRUCTION, 0);
> -
>      brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
>   }
>   
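
Nice use of the new dw5 union from the brw_structs.h hunk below: the
SF viewport state is 32-byte aligned, so its offset has the low five
bits free, and ORing the already-written flag bits into the reloc
delta reproduces the old explicit packing:

   /* dw5 layout: [ sf_viewport_state_offset:27 | pad:3 | vt:1 | fw:1 ]
    *
    * brw->sf.vp_offset | sf->dw5
    *    == vp_offset | front_winding | (viewport_transform << 1)
    *
    * because vp_offset's low 5 bits are zero.
    */
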
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index 987672f..02c83db 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -227,7 +227,7 @@ void brw_destroy_caches( struct brw_context *brw );
>    * brw_state_batch.c
>    */
>   #define BRW_BATCH_STRUCT(brw, s) \
> -   intel_batchbuffer_data(brw, (s), sizeof(*(s)), RENDER_RING)
> +   brw_batch_data(&brw->batch, (s), sizeof(*(s)))
>   
>   void *__brw_state_batch(struct brw_context *brw,
>                           enum aub_state_struct_type type,
> diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
> index a405a80..d79cb48 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_batch.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
> @@ -30,7 +30,7 @@
>     */
>   
>   #include "brw_state.h"
> -#include "intel_batchbuffer.h"
> +
>   #include "main/imports.h"
>   #include "util/ralloc.h"
>   
> @@ -41,14 +41,12 @@ brw_track_state_batch(struct brw_context *brw,
>                         int size,
>                         int index)
>   {
> -   struct intel_batchbuffer *batch = &brw->batch;
> -
>      if (!brw->state_batch_list) {
>         /* Our structs are always aligned to at least 32 bytes, so
>          * our array doesn't need to be any larger
>          */
>         brw->state_batch_list = ralloc_size(brw, sizeof(*brw->state_batch_list) *
> -					  batch->bo->size / 32);
> +					  brw->batch.bo->size / 32);
>      }
>   
>      brw->state_batch_list[brw->state_batch_count].offset = offset;
> @@ -81,13 +79,14 @@ make_annotation(drm_intel_aub_annotation *annotation, uint32_t type,
>    * is annotated according to the type of each data structure.
>    */
>   void
> -brw_annotate_aub(struct brw_context *brw)
> +brw_annotate_batch(struct brw_batch *batch)
>   {
> +   struct brw_context *brw = container_of(batch, brw, batch);
>      unsigned annotation_count = 2 * brw->state_batch_count + 1;
>      drm_intel_aub_annotation annotations[annotation_count];
>      int a = 0;
>      make_annotation(&annotations[a++], AUB_TRACE_TYPE_BATCH, 0,
> -                   4*brw->batch.used);
> +                   4*brw->batch.emit.nbatch);
>      for (int i = brw->state_batch_count; i-- > 0; ) {
>         uint32_t type = brw->state_batch_list[i].type;
>         uint32_t start_offset = brw->state_batch_list[i].offset;
> @@ -98,8 +97,8 @@ brw_annotate_aub(struct brw_context *brw)
>                         AUB_TRACE_SUBTYPE(type), end_offset);
>      }
>      assert(a == annotation_count);
> -   drm_intel_bufmgr_gem_set_aub_annotations(brw->batch.bo, annotations,
> -                                            annotation_count);
> +   drm_intel_bufmgr_gem_set_aub_annotations(brw->batch.bo->base,
> +					    annotations, annotation_count);
>   }
>   
>   /**
> @@ -125,27 +124,13 @@ __brw_state_batch(struct brw_context *brw,
>                     uint32_t *out_offset)
>   
>   {
> -   struct intel_batchbuffer *batch = &brw->batch;
> -   uint32_t offset;
> -
> -   assert(size < batch->bo->size);
> -   offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
> -
> -   /* If allocating from the top would wrap below the batchbuffer, or
> -    * if the batch's used space (plus the reserved pad) collides with our
> -    * space, then flush and try again.
> -    */
> -   if (batch->state_batch_offset < size ||
> -       offset < 4*batch->used + batch->reserved_space) {
> -      intel_batchbuffer_flush(brw);
> -      offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment);
> -   }
> -
> -   batch->state_batch_offset = offset;
> +   assert(size < brw->batch.bo->size);
> +   brw->batch.state = ROUND_DOWN_TO(4*brw->batch.state - size, alignment)/4;
> +   assert(brw->batch.state > brw->batch.emit.nbatch);
>   
>      if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_AUB)))
> -      brw_track_state_batch(brw, type, offset, size, index);
> +      brw_track_state_batch(brw, type, 4*brw->batch.state, size, index);
>   
> -   *out_offset = offset;
> -   return batch->map + (offset>>2);
> +   *out_offset = 4*brw->batch.state;
> +   return brw->batch.map + brw->batch.state;
>   }
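
So brw->batch.state and brw->batch.emit.nbatch are both counted in
dwords now, hence all the 4* conversions. With the fixed 64KiB batch
from the commit message, the accounting works out like this (my
reading of the hunk):

   /* state grows down from the top, emit.nbatch grows up from 0,
    * both in dwords:
    *
    *   state = 65536 / 4 = 16384 initially
    *   a 64-byte alloc at 32-byte alignment:
    *     ROUND_DOWN_TO(4*16384 - 64, 32) / 4 = 65472 / 4 = 16368
    *
    * and assert(state > emit.nbatch) is the overflow check.
    */

One behavioural change worth flagging: the old code flushed and
retried when the two ends collided, the new code just asserts. I take
it the state space is meant to be covered by the dword estimate passed
to brw_batch_begin(), so the assert should never fire in a correctly
annotated caller?
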
> diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
> index 157b33d..1210bb7 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_cache.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
> @@ -45,7 +45,6 @@
>    */
>   
>   #include "main/imports.h"
> -#include "intel_batchbuffer.h"
>   #include "brw_state.h"
>   #include "brw_vs.h"
>   #include "brw_wm.h"
> @@ -169,29 +168,19 @@ static void
>   brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
>   {
>      struct brw_context *brw = cache->brw;
> -   drm_intel_bo *new_bo;
> +   struct brw_bo *new_bo;
>   
> -   new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
> -   if (brw->has_llc)
> -      drm_intel_gem_bo_map_unsynchronized(new_bo);
> +   new_bo = brw_bo_create(&brw->batch, "program cache", new_size, 64, 0);
>   
>      /* Copy any existing data that needs to be saved. */
>      if (cache->next_offset != 0) {
> -      if (brw->has_llc) {
> -         memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
> -      } else {
> -         drm_intel_bo_map(cache->bo, false);
> -         drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
> -                              cache->bo->virtual);
> -         drm_intel_bo_unmap(cache->bo);
> -      }
> +      brw_bo_read(cache->bo, 0,
> +                  brw_bo_map(new_bo, MAP_WRITE), cache->next_offset,
> +                  MAP_ASYNC);
>      }
>   
> -   if (brw->has_llc)
> -      drm_intel_bo_unmap(cache->bo);
> -   drm_intel_bo_unreference(cache->bo);
> +   brw_bo_put(cache->bo);
>      cache->bo = new_bo;
> -   cache->bo_used_by_gpu = false;
>   
>      /* Since we have a new BO in place, we need to signal the units
>       * that depend on it (state base address on gen5+, or unit state before).
> @@ -209,7 +198,6 @@ brw_try_upload_using_copy(struct brw_cache *cache,
>   			  const void *data,
>   			  const void *aux)
>   {
> -   struct brw_context *brw = cache->brw;
>      int i;
>      struct brw_cache_item *item;
>   
> @@ -231,11 +219,9 @@ brw_try_upload_using_copy(struct brw_cache *cache,
>   	    continue;
>   	 }
>   
> -         if (!brw->has_llc)
> -            drm_intel_bo_map(cache->bo, false);
> -	 ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
> -         if (!brw->has_llc)
> -            drm_intel_bo_unmap(cache->bo);
> +	 ret = memcmp(brw_bo_map(cache->bo, MAP_READ | MAP_ASYNC) + item->offset,
> +		      data,
> +		      item->size);
>   	 if (ret)
>   	    continue;
>   
> @@ -253,8 +239,6 @@ brw_upload_item_data(struct brw_cache *cache,
>   		     struct brw_cache_item *item,
>   		     const void *data)
>   {
> -   struct brw_context *brw = cache->brw;
> -
>      /* Allocate space in the cache BO for our new program. */
>      if (cache->next_offset + item->size > cache->bo->size) {
>         uint32_t new_size = cache->bo->size * 2;
> @@ -265,16 +249,11 @@ brw_upload_item_data(struct brw_cache *cache,
>         brw_cache_new_bo(cache, new_size);
>      }
>   
> -   /* If we would block on writing to an in-use program BO, just
> -    * recreate it.
> -    */
> -   if (!brw->has_llc && cache->bo_used_by_gpu) {
> -      perf_debug("Copying busy program cache buffer.\n");
> -      brw_cache_new_bo(cache, cache->bo->size);
> -   }
> -
>      item->offset = cache->next_offset;
>   
> +   /* Copy data to the buffer */
> +   brw_bo_write(cache->bo, item->offset, data, item->size, MAP_ASYNC);
> +
>      /* Programs are always 64-byte aligned, so set up the next one now */
>      cache->next_offset = ALIGN(item->offset + item->size, 64);
>   }
> @@ -291,7 +270,6 @@ brw_upload_cache(struct brw_cache *cache,
>   		 uint32_t *out_offset,
>   		 void *out_aux)
>   {
> -   struct brw_context *brw = cache->brw;
>      struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
>      GLuint hash;
>      void *tmp;
> @@ -331,13 +309,6 @@ brw_upload_cache(struct brw_cache *cache,
>      cache->items[hash] = item;
>      cache->n_items++;
>   
> -   /* Copy data to the buffer */
> -   if (brw->has_llc) {
> -      memcpy((char *) cache->bo->virtual + item->offset, data, data_size);
> -   } else {
> -      drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
> -   }
> -
>      *out_offset = item->offset;
>      *(void **)out_aux = (void *)((char *)item->key + item->key_size);
>      cache->brw->ctx.NewDriverState |= 1 << cache_id;
> @@ -355,11 +326,7 @@ brw_init_caches(struct brw_context *brw)
>      cache->items =
>         calloc(cache->size, sizeof(struct brw_cache_item *));
>   
> -   cache->bo = drm_intel_bo_alloc(brw->bufmgr,
> -				  "program cache",
> -				  4096, 64);
> -   if (brw->has_llc)
> -      drm_intel_gem_bo_map_unsynchronized(cache->bo);
> +   cache->bo = brw_bo_create(&brw->batch, "program cache", 4096, 64, 0);
>   
>      cache->aux_compare[BRW_CACHE_VS_PROG] = brw_vs_prog_data_compare;
>      cache->aux_compare[BRW_CACHE_GS_PROG] = brw_gs_prog_data_compare;
> @@ -394,6 +361,9 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
>   
>      cache->n_items = 0;
>   
> +   brw_bo_put(cache->bo);
> +   cache->bo = brw_bo_create(&brw->batch, "program cache", 4096, 64, 0);
> +
>      /* Start putting programs into the start of the BO again, since
>       * we'll never find the old results.
>       */
> @@ -404,7 +374,6 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
>       */
>      brw->NewGLState |= ~0;
>      brw->ctx.NewDriverState |= ~0ull;
> -   intel_batchbuffer_flush(brw);
>   }
>   
>   void
> @@ -427,11 +396,10 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
>   
>      DBG("%s\n", __func__);
>   
> -   if (brw->has_llc)
> -      drm_intel_bo_unmap(cache->bo);
> -   drm_intel_bo_unreference(cache->bo);
> -   cache->bo = NULL;
>      brw_clear_cache(brw, cache);
> +   brw_bo_put(cache->bo);
> +   cache->bo = NULL;
> +
>      free(cache->items);
>      cache->items = NULL;
>      cache->size = 0;
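
The has_llc special cases collapsing into brw_bo_map() flags is a
clear win. For my own understanding, the flag semantics inferred from
these call sites (please correct me if wrong):

   /* may block until the GPU is done with the buffer */
   void *w = brw_bo_map(bo, MAP_WRITE);

   /* no implicit synchronisation; the caller either knows the GPU is
    * idle or accepts reading in-flight data (debug dumps, compares
    * against immutable cache entries, ...) */
   void *r = brw_bo_map(bo, MAP_READ | MAP_ASYNC);

One note: the "Copying busy program cache buffer" perf_debug path is
gone. Does brw_bo_write() with MAP_ASYNC handle a busy non-llc buffer
internally, or can we now scribble over a cache BO the GPU is still
reading from?
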
> diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
> index b6f4d59..efa0cb0 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_dump.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
> @@ -26,7 +26,6 @@
>    */
>   
>   #include "main/mtypes.h"
> -#include "intel_batchbuffer.h"
>   
>   #include "brw_context.h"
>   #include "brw_defines.h"
> @@ -67,6 +66,11 @@ static const char *surface_tiling[] = {
>      "Y-tiled"
>   };
>   
> +static void *batch_in(struct brw_context *brw, unsigned offset)
> +{
> +	return (void *)brw->batch.map + offset;
> +}
> +
>   static void
>   batch_out(struct brw_context *brw, const char *name, uint32_t offset,
>   	  int index, char *fmt, ...) PRINTFLIKE(5, 6);
> @@ -75,7 +79,7 @@ static void
>   batch_out(struct brw_context *brw, const char *name, uint32_t offset,
>   	  int index, char *fmt, ...)
>   {
> -   uint32_t *data = brw->batch.bo->virtual + offset;
> +   uint32_t *data = batch_in(brw, offset);
>      va_list va;
>   
>      fprintf(stderr, "0x%08x:      0x%08x: %8s: ",
> @@ -89,7 +93,7 @@ static void
>   batch_out64(struct brw_context *brw, const char *name, uint32_t offset,
>               int index, char *fmt, ...)
>   {
> -   uint32_t *tmp = brw->batch.bo->virtual + offset;
> +   uint32_t *tmp = batch_in(brw, offset);
>   
>      /* Swap the dwords since we want to handle this as a 64b value, but the data
>       * is typically emitted as dwords.
> @@ -121,7 +125,7 @@ get_965_surfacetype(unsigned int surfacetype)
>   static void dump_vs_state(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "VS_STATE";
> -   struct brw_vs_unit_state *vs = brw->batch.bo->virtual + offset;
> +   struct brw_vs_unit_state *vs = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0, "thread0\n");
>      batch_out(brw, name, offset, 1, "thread1\n");
> @@ -136,7 +140,7 @@ static void dump_vs_state(struct brw_context *brw, uint32_t offset)
>   static void dump_gs_state(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "GS_STATE";
> -   struct brw_gs_unit_state *gs = brw->batch.bo->virtual + offset;
> +   struct brw_gs_unit_state *gs = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0, "thread0\n");
>      batch_out(brw, name, offset, 1, "thread1\n");
> @@ -151,7 +155,7 @@ static void dump_gs_state(struct brw_context *brw, uint32_t offset)
>   static void dump_clip_state(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "CLIP_STATE";
> -   struct brw_clip_unit_state *clip = brw->batch.bo->virtual + offset;
> +   struct brw_clip_unit_state *clip = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0, "thread0\n");
>      batch_out(brw, name, offset, 1, "thread1\n");
> @@ -170,7 +174,7 @@ static void dump_clip_state(struct brw_context *brw, uint32_t offset)
>   static void dump_sf_state(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "SF_STATE";
> -   struct brw_sf_unit_state *sf = brw->batch.bo->virtual + offset;
> +   struct brw_sf_unit_state *sf = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0, "thread0\n");
>      batch_out(brw, name, offset, 1, "thread1\n");
> @@ -186,7 +190,7 @@ static void dump_sf_state(struct brw_context *brw, uint32_t offset)
>   static void dump_wm_state(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "WM_STATE";
> -   struct brw_wm_unit_state *wm = brw->batch.bo->virtual + offset;
> +   struct brw_wm_unit_state *wm = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0, "thread0\n");
>      batch_out(brw, name, offset, 1, "thread1\n");
> @@ -213,7 +217,7 @@ static void dump_wm_state(struct brw_context *brw, uint32_t offset)
>   static void dump_surface_state(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "SURF";
> -   uint32_t *surf = brw->batch.bo->virtual + offset;
> +   uint32_t *surf = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0, "%s %s\n",
>   	     get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
> @@ -237,7 +241,7 @@ static void dump_surface_state(struct brw_context *brw, uint32_t offset)
>   static void dump_gen7_surface_state(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "SURF";
> -   uint32_t *surf = brw->batch.bo->virtual + offset;
> +   uint32_t *surf = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0, "%s %s %s\n",
>                get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
> @@ -276,7 +280,7 @@ static float q_to_float(uint32_t data, int integer_end, int integer_start,
>   static void
>   dump_gen8_surface_state(struct brw_context *brw, uint32_t offset, int index)
>   {
> -   uint32_t *surf = brw->batch.bo->virtual + offset;
> +   uint32_t *surf = batch_in(brw, offset);
>      int aux_mode = surf[6] & INTEL_MASK(2, 0);
>      const char *aux_str;
>      char *name;
> @@ -350,8 +354,7 @@ dump_sdc(struct brw_context *brw, uint32_t offset)
>      const char *name = "SDC";
>   
>      if (brw->gen >= 5 && brw->gen <= 6) {
> -      struct gen5_sampler_default_color *sdc = (brw->batch.bo->virtual +
> -                                                offset);
> +      struct gen5_sampler_default_color *sdc = batch_in(brw, offset);
>         batch_out(brw, name, offset, 0, "unorm rgba\n");
>         batch_out(brw, name, offset, 1, "r %f\n", sdc->f[0]);
>         batch_out(brw, name, offset, 2, "b %f\n", sdc->f[1]);
> @@ -365,7 +368,7 @@ dump_sdc(struct brw_context *brw, uint32_t offset)
>         batch_out(brw, name, offset, 10, "s16 ba\n");
>         batch_out(brw, name, offset, 11, "s8 rgba\n");
>      } else {
> -      float *sdc = brw->batch.bo->virtual + offset;
> +      float *sdc = batch_in(brw, offset);
>         batch_out(brw, name, offset, 0, "r %f\n", sdc[0]);
>         batch_out(brw, name, offset, 1, "g %f\n", sdc[1]);
>         batch_out(brw, name, offset, 2, "b %f\n", sdc[2]);
> @@ -377,7 +380,7 @@ static void dump_sampler_state(struct brw_context *brw,
>   			       uint32_t offset, uint32_t size)
>   {
>      int i;
> -   uint32_t *samp = brw->batch.bo->virtual + offset;
> +   uint32_t *samp = batch_in(brw, offset);
>   
>      for (i = 0; i < size / 16; i++) {
>         char name[20];
> @@ -396,7 +399,7 @@ static void dump_sampler_state(struct brw_context *brw,
>   static void gen7_dump_sampler_state(struct brw_context *brw,
>                                       uint32_t offset, uint32_t size)
>   {
> -   const uint32_t *samp = brw->batch.bo->virtual + offset;
> +   const uint32_t *samp = batch_in(brw, offset);
>      char name[20];
>   
>      for (int i = 0; i < size / 16; i++) {
> @@ -436,7 +439,7 @@ static void dump_sf_viewport_state(struct brw_context *brw,
>   				   uint32_t offset)
>   {
>      const char *name = "SF VP";
> -   struct brw_sf_viewport *vp = brw->batch.bo->virtual + offset;
> +   struct brw_sf_viewport *vp = batch_in(brw, offset);
>   
>      assert(brw->gen < 7);
>   
> @@ -457,7 +460,7 @@ static void dump_clip_viewport_state(struct brw_context *brw,
>   				     uint32_t offset)
>   {
>      const char *name = "CLIP VP";
> -   struct brw_clipper_viewport *vp = brw->batch.bo->virtual + offset;
> +   struct brw_clipper_viewport *vp = batch_in(brw, offset);
>   
>      assert(brw->gen < 7);
>   
> @@ -471,7 +474,7 @@ static void dump_sf_clip_viewport_state(struct brw_context *brw,
>   					uint32_t offset)
>   {
>      const char *name = "SF_CLIP VP";
> -   struct gen7_sf_clip_viewport *vp = brw->batch.bo->virtual + offset;
> +   struct gen7_sf_clip_viewport *vp = batch_in(brw, offset);
>   
>      assert(brw->gen >= 7);
>   
> @@ -486,7 +489,7 @@ static void dump_sf_clip_viewport_state(struct brw_context *brw,
>      batch_out(brw, name, offset, 9, "guardband ymin = %f\n", vp->guardband.ymin);
>      batch_out(brw, name, offset, 10, "guardband ymax = %f\n", vp->guardband.ymax);
>      if (brw->gen >= 8) {
> -      float *cc_vp = brw->batch.bo->virtual + offset;
> +      float *cc_vp = batch_in(brw, offset);
>         batch_out(brw, name, offset, 12, "Min extents: %.2fx%.2f\n",
>                   cc_vp[12], cc_vp[14]);
>         batch_out(brw, name, offset, 14, "Max extents: %.2fx%.2f\n",
> @@ -498,7 +501,7 @@ static void dump_sf_clip_viewport_state(struct brw_context *brw,
>   static void dump_cc_viewport_state(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "CC VP";
> -   struct brw_cc_viewport *vp = brw->batch.bo->virtual + offset;
> +   struct brw_cc_viewport *vp = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0, "min_depth = %f\n", vp->min_depth);
>      batch_out(brw, name, offset, 1, "max_depth = %f\n", vp->max_depth);
> @@ -507,7 +510,7 @@ static void dump_cc_viewport_state(struct brw_context *brw, uint32_t offset)
>   static void dump_depth_stencil_state(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "D_S";
> -   struct gen6_depth_stencil_state *ds = brw->batch.bo->virtual + offset;
> +   struct gen6_depth_stencil_state *ds = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0,
>   	     "stencil %sable, func %d, write %sable\n",
> @@ -541,7 +544,7 @@ static void dump_cc_state_gen4(struct brw_context *brw, uint32_t offset)
>   static void dump_cc_state_gen6(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "CC";
> -   struct gen6_color_calc_state *cc = brw->batch.bo->virtual + offset;
> +   struct gen6_color_calc_state *cc = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0,
>   	     "alpha test format %s, round disable %d, stencil ref %d, "
> @@ -568,7 +571,7 @@ static void dump_blend_state(struct brw_context *brw, uint32_t offset)
>   static void
>   gen8_dump_blend_state(struct brw_context *brw, uint32_t offset, uint32_t size)
>   {
> -   const uint32_t *blend = brw->batch.bo->virtual + offset;
> +   const uint32_t *blend = batch_in(brw, offset);
>      const char *logicop[] =
>      {
>           "LOGICOP_CLEAR (BLACK)",
> @@ -655,7 +658,7 @@ static void
>   dump_scissor(struct brw_context *brw, uint32_t offset)
>   {
>      const char *name = "SCISSOR";
> -   struct gen6_scissor_rect *scissor = brw->batch.bo->virtual + offset;
> +   struct gen6_scissor_rect *scissor = batch_in(brw, offset);
>   
>      batch_out(brw, name, offset, 0, "xmin %d, ymin %d\n",
>   	     scissor->xmin, scissor->ymin);
> @@ -667,8 +670,8 @@ static void
>   dump_vs_constants(struct brw_context *brw, uint32_t offset, uint32_t size)
>   {
>      const char *name = "VS_CONST";
> -   uint32_t *as_uint = brw->batch.bo->virtual + offset;
> -   float *as_float = brw->batch.bo->virtual + offset;
> +   uint32_t *as_uint = batch_in(brw, offset);
> +   float *as_float = batch_in(brw, offset);
>      int i;
>   
>      for (i = 0; i < size / 4; i += 4) {
> @@ -683,8 +686,8 @@ static void
>   dump_wm_constants(struct brw_context *brw, uint32_t offset, uint32_t size)
>   {
>      const char *name = "WM_CONST";
> -   uint32_t *as_uint = brw->batch.bo->virtual + offset;
> -   float *as_float = brw->batch.bo->virtual + offset;
> +   uint32_t *as_uint = batch_in(brw, offset);
> +   float *as_float = batch_in(brw, offset);
>      int i;
>   
>      for (i = 0; i < size / 4; i += 4) {
> @@ -700,7 +703,7 @@ static void dump_binding_table(struct brw_context *brw, uint32_t offset,
>   {
>      char name[20];
>      int i;
> -   uint32_t *data = brw->batch.bo->virtual + offset;
> +   uint32_t *data = batch_in(brw, offset);
>   
>      for (i = 0; i < size / 4; i++) {
>         if (data[i] == 0)
> @@ -717,8 +720,6 @@ dump_prog_cache(struct brw_context *brw)
>      struct brw_cache *cache = &brw->cache;
>      unsigned int b;
>   
> -   drm_intel_bo_map(brw->cache.bo, false);
> -
>      for (b = 0; b < cache->size; b++) {
>         struct brw_cache_item *item;
>   
> @@ -753,12 +754,11 @@ dump_prog_cache(struct brw_context *brw)
>   	 }
>   
>            fprintf(stderr, "%s:\n", name);
> -         brw_disassemble(brw->intelScreen->devinfo, brw->cache.bo->virtual,
> +         brw_disassemble(brw->intelScreen->devinfo,
> +			 brw_bo_map(brw->cache.bo, MAP_READ | MAP_ASYNC),
>                            item->offset, item->size, stderr);
>         }
>      }
> -
> -   drm_intel_bo_unmap(brw->cache.bo);
>   }
>   
>   static void
> @@ -861,12 +861,11 @@ dump_state_batch(struct brw_context *brw)
>    * The buffer offsets printed rely on the buffer containing the last offset
>    * it was validated at.
>    */
> -void brw_debug_batch(struct brw_context *brw)
> +void brw_debug_batch(struct brw_batch *batch)
>   {
> -   drm_intel_bo_map(brw->batch.bo, false);
> -   dump_state_batch(brw);
> -   drm_intel_bo_unmap(brw->batch.bo);
> +   struct brw_context *brw = container_of(batch, brw, batch);
>   
> +   dump_state_batch(brw);
>      if (0)
>         dump_prog_cache(brw);
>   }
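
batch_in() reading through the persistent batch map instead of the
map/unmap pairs is tidy, and the "reads from the batch are slow"
caveat from the commit message is fine for a debug dump. Tiny nit: it
does arithmetic on a void pointer, which is a gccism. If we care:

   static void *batch_in(struct brw_context *brw, unsigned offset)
   {
      return (char *)brw->batch.map + offset;
   }
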
> diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
> index 7662c3b..285c6c1 100644
> --- a/src/mesa/drivers/dri/i965/brw_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
> @@ -34,8 +34,8 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "drivers/common/meta.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_buffers.h"
> +#include "intel_reg.h"
>   #include "brw_vs.h"
>   #include "brw_ff_gs.h"
>   #include "brw_gs.h"
> @@ -338,15 +338,21 @@ static const struct brw_tracked_state *gen8_compute_atoms[] =
>      &brw_cs_state,
>   };
>   
> -static void
> +static int
>   brw_upload_initial_gpu_state(struct brw_context *brw)
>   {
> +   int ret;
> +
>      /* On platforms with hardware contexts, we can set our initial GPU state
>       * right away rather than doing it via state atoms.  This saves a small
>       * amount of overhead on every draw call.
>       */
> -   if (!brw->hw_ctx)
> -      return;
> +   if (!brw->batch.hw_ctx)
> +      return 0;
> +
> +   ret = brw_batch_begin(&brw->batch, 200, RENDER_RING);
> +   if (ret < 0)
> +      return ret;
>   
>      if (brw->gen == 6)
>         brw_emit_post_sync_nonzero_flush(brw);
> @@ -366,6 +372,8 @@ brw_upload_initial_gpu_state(struct brw_context *brw)
>      if (brw->gen >= 8) {
>         gen8_emit_3dstate_sample_pattern(brw);
>      }
> +
> +   return brw_batch_end(&brw->batch);
>   }
>   
>   static inline const struct brw_tracked_state *
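
brw_upload_initial_gpu_state() now returns the error from
brw_batch_end(), but I do not see the hunk that updates its caller in
this patch. If the caller still ignores the return value, something
like this is needed (hypothetical, depends on where it is called
from):

   if (brw_upload_initial_gpu_state(brw) < 0)
      return false;   /* or propagate up through context creation */

Otherwise the int return is dead code.
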
> diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
> index 55338c0..e167254 100644
> --- a/src/mesa/drivers/dri/i965/brw_structs.h
> +++ b/src/mesa/drivers/dri/i965/brw_structs.h
> @@ -391,13 +391,16 @@ struct brw_sf_unit_state
>         unsigned pad3:1;
>      } thread4;
>   
> -   struct
> +   union
>      {
> -      unsigned front_winding:1;
> -      unsigned viewport_transform:1;
> -      unsigned pad0:3;
> -      unsigned sf_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */
> -   } sf5;
> +      struct {
> +	 unsigned front_winding:1;
> +	 unsigned viewport_transform:1;
> +	 unsigned pad0:3;
> +	 unsigned sf_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */
> +      } sf5;
> +      uint32_t dw5;
> +   };
>   
>      struct
>      {
> @@ -525,15 +528,17 @@ struct brw_wm_unit_state
>      struct thread2 thread2;
>      struct thread3 thread3;
>   
> -   struct {
> -      unsigned stats_enable:1;
> -      unsigned depth_buffer_clear:1;
> -      unsigned sampler_count:3;
> -      unsigned sampler_state_pointer:27;
> -   } wm4;
> +   union {
> +      struct {
> +	 unsigned stats_enable:1;
> +	 unsigned depth_buffer_clear:1;
> +	 unsigned sampler_count:3;
> +	 unsigned sampler_state_pointer:27;
> +      } wm4;
> +      uint32_t dw4;
> +   };
>   
> -   struct
> -   {
> +   struct {
>         unsigned enable_8_pix:1;
>         unsigned enable_16_pix:1;
>         unsigned enable_32_pix:1;
> diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
> index 6fcf1b0..fa79fba 100644
> --- a/src/mesa/drivers/dri/i965/brw_urb.c
> +++ b/src/mesa/drivers/dri/i965/brw_urb.c
> @@ -31,11 +31,12 @@
>   
>   
>   
> -#include "intel_batchbuffer.h"
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
>   
> +#include "intel_reg.h"
> +
>   #define VS 0
>   #define GS 1
>   #define CLP 2
> @@ -249,10 +250,10 @@ void brw_upload_urb_fence(struct brw_context *brw)
>      uf.bits1.cs_fence  = brw->urb.size;
>   
>      /* erratum: URB_FENCE must not cross a 64byte cacheline */
> -   if ((brw->batch.used & 15) > 12) {
> -      int pad = 16 - (brw->batch.used & 15);
> +   if ((brw->batch.emit.nbatch & 15) > 12) {
> +      int pad = 16 - (brw->batch.emit.nbatch & 15);
>         do
> -	 brw->batch.map[brw->batch.used++] = MI_NOOP;
> +	 brw_batch_emit(&brw->batch, MI_NOOP);
>         while (--pad);
>      }
>   
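
The URB_FENCE padding still works because emit.nbatch is in dwords,
just like the old batch.used. For the record, the arithmetic:

   /* 16 dwords == one 64-byte cacheline; if the fence would start in
    * the last few dwords of a line, pad with MI_NOOP to the next one:
    *
    *   nbatch & 15 == 14  ->  pad = 16 - 14 = 2 NOOPs
    *
    * so the fence never straddles a cacheline, per the erratum. */
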
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> index a5c686c..8902729 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
> @@ -1868,8 +1868,7 @@ brw_vs_emit(struct brw_context *brw,
>      const unsigned *assembly = NULL;
>   
>      if (unlikely(brw->perf_debug)) {
> -      start_busy = (brw->batch.last_bo &&
> -                    drm_intel_bo_busy(brw->batch.last_bo));
> +      start_busy = brw_batch_busy(&brw->batch);
>         start_time = get_time();
>      }
>   
> @@ -1965,7 +1964,7 @@ brw_vs_emit(struct brw_context *brw,
>         if (shader->compiled_once) {
>            brw_vs_debug_recompile(brw, prog, &c->key);
>         }
> -      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
> +      if (start_busy && !brw_batch_busy(&brw->batch)) {
>            perf_debug("VS compile took %.03f ms and stalled the GPU\n",
>                       (get_time() - start_time) * 1000);
>         }
> diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
> index b9b97a7..59d8c8c 100644
> --- a/src/mesa/drivers/dri/i965/brw_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
> @@ -80,10 +80,16 @@ brw_upload_vs_unit(struct brw_context *brw)
>         brw->vs.prog_data->base.base.binding_table.size_bytes / 4;
>   
>      if (brw->vs.prog_data->base.base.total_scratch != 0) {
> -      vs->thread2.scratch_space_base_pointer =
> -	 stage_state->scratch_bo->offset64 >> 10; /* reloc */
>         vs->thread2.per_thread_scratch_space =
>   	 ffs(brw->vs.prog_data->base.base.total_scratch) - 11;
> +
> +      vs->thread2.scratch_space_base_pointer =
> +	 brw_batch_reloc(&brw->batch,
> +			 stage_state->state_offset + offsetof(struct brw_vs_unit_state, thread2),
> +			 stage_state->scratch_bo,
> +			 vs->thread2.per_thread_scratch_space,
> +			 I915_GEM_DOMAIN_RENDER,
> +			 I915_GEM_DOMAIN_RENDER) >> 10;
>      } else {
>         vs->thread2.scratch_space_base_pointer = 0;
>         vs->thread2.per_thread_scratch_space = 0;
> @@ -158,24 +164,11 @@ brw_upload_vs_unit(struct brw_context *brw)
>      if (stage_state->sampler_count) {
>         /* BRW_NEW_SAMPLER_STATE_TABLE - reloc */
>         vs->vs5.sampler_state_pointer =
> -         (brw->batch.bo->offset64 + stage_state->sampler_offset) >> 5;
> -      drm_intel_bo_emit_reloc(brw->batch.bo,
> -                              stage_state->state_offset +
> -                              offsetof(struct brw_vs_unit_state, vs5),
> -                              brw->batch.bo,
> -                              (stage_state->sampler_offset |
> -                               vs->vs5.sampler_count),
> -                              I915_GEM_DOMAIN_INSTRUCTION, 0);
> -   }
> -
> -   /* Emit scratch space relocation */
> -   if (brw->vs.prog_data->base.base.total_scratch != 0) {
> -      drm_intel_bo_emit_reloc(brw->batch.bo,
> -			      stage_state->state_offset +
> -			      offsetof(struct brw_vs_unit_state, thread2),
> -			      stage_state->scratch_bo,
> -			      vs->thread2.per_thread_scratch_space,
> -			      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
> +	 brw_batch_reloc(&brw->batch,
> +			 stage_state->state_offset + offsetof(struct brw_vs_unit_state, vs5),
> +			 brw->batch.bo,
> +			 (stage_state->sampler_offset | vs->vs5.sampler_count),
> +			 I915_GEM_DOMAIN_INSTRUCTION, 0) >> 5;
>      }
>   
>      brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
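
Moving the reloc next to the field assignment makes the "all
relocations must be immediately written into the batch" rule from the
commit message concrete. Note the ordering constraint it introduces:
the reloc delta carries the neighbouring bitfield (here
per_thread_scratch_space; in the wm hunk below it is dw4), so those
bits must be final before brw_batch_reloc() snapshots them:

   /* 1. finalise the bits that share the dword with the pointer */
   vs->thread2.per_thread_scratch_space =
      ffs(brw->vs.prog_data->base.base.total_scratch) - 11;

   /* 2. only then record the reloc, with those bits in the delta */
   vs->thread2.scratch_space_base_pointer =
      brw_batch_reloc(&brw->batch,
                      stage_state->state_offset +
                      offsetof(struct brw_vs_unit_state, thread2),
                      stage_state->scratch_bo,
                      vs->thread2.per_thread_scratch_space,
                      I915_GEM_DOMAIN_RENDER,
                      I915_GEM_DOMAIN_RENDER) >> 10;

Maybe worth a comment in the code so nobody reorders these later.
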
> diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
> index b2f91bd..b6e0ad8 100644
> --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
> @@ -74,7 +74,7 @@ brw_upload_pull_constants(struct brw_context *brw,
>   
>      /* BRW_NEW_*_PROG_DATA | _NEW_PROGRAM_CONSTANTS */
>      uint32_t size = prog_data->nr_pull_params * 4;
> -   drm_intel_bo *const_bo = NULL;
> +   struct brw_bo *const_bo = NULL;
>      uint32_t const_offset;
>      gl_constant_value *constants = intel_upload_space(brw, size, 64,
>                                                        &const_bo, &const_offset);
> @@ -96,7 +96,7 @@ brw_upload_pull_constants(struct brw_context *brw,
>      brw_create_constant_surface(brw, const_bo, const_offset, size,
>                                  &stage_state->surf_offset[surf_index],
>                                  dword_pitch);
> -   drm_intel_bo_unreference(const_bo);
> +   brw_bo_put(const_bo);
>   
>      brw->ctx.NewDriverState |= brw_new_constbuf;
>   }
> diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
> index 0cd4390..1195f55 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
> @@ -140,10 +140,15 @@ brw_upload_wm_unit(struct brw_context *brw)
>         prog_data->base.binding_table.size_bytes / 4;
>   
>      if (prog_data->base.total_scratch != 0) {
> -      wm->thread2.scratch_space_base_pointer =
> -	 brw->wm.base.scratch_bo->offset64 >> 10; /* reloc */
>         wm->thread2.per_thread_scratch_space =
>   	 ffs(prog_data->base.total_scratch) - 11;
> +
> +      wm->thread2.scratch_space_base_pointer =
> +	 brw_batch_reloc(&brw->batch,
> +			 brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, thread2),
> +			 brw->wm.base.scratch_bo,
> +			 wm->thread2.per_thread_scratch_space,
> +			 I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER) >> 10;
>      } else {
>         wm->thread2.scratch_space_base_pointer = 0;
>         wm->thread2.per_thread_scratch_space = 0;
> @@ -167,8 +172,12 @@ brw_upload_wm_unit(struct brw_context *brw)
>   
>      if (brw->wm.base.sampler_count) {
>         /* BRW_NEW_SAMPLER_STATE_TABLE - reloc */
> -      wm->wm4.sampler_state_pointer = (brw->batch.bo->offset64 +
> -				       brw->wm.base.sampler_offset) >> 5;
> +      wm->wm4.sampler_state_pointer =
> +	 brw_batch_reloc(&brw->batch,
> +			 brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, wm4),
> +			 brw->batch.bo,
> +			 brw->wm.base.sampler_offset | wm->dw4,
> +			 I915_GEM_DOMAIN_INSTRUCTION, 0) >> 5;
>      } else {
>         wm->wm4.sampler_state_pointer = 0;
>      }
> @@ -229,27 +238,6 @@ brw_upload_wm_unit(struct brw_context *brw)
>      if (unlikely(INTEL_DEBUG & DEBUG_STATS) || brw->stats_wm)
>         wm->wm4.stats_enable = 1;
>   
> -   /* Emit scratch space relocation */
> -   if (prog_data->base.total_scratch != 0) {
> -      drm_intel_bo_emit_reloc(brw->batch.bo,
> -			      brw->wm.base.state_offset +
> -			      offsetof(struct brw_wm_unit_state, thread2),
> -			      brw->wm.base.scratch_bo,
> -			      wm->thread2.per_thread_scratch_space,
> -			      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
> -   }
> -
> -   /* Emit sampler state relocation */
> -   if (brw->wm.base.sampler_count != 0) {
> -      drm_intel_bo_emit_reloc(brw->batch.bo,
> -			      brw->wm.base.state_offset +
> -			      offsetof(struct brw_wm_unit_state, wm4),
> -			      brw->batch.bo, (brw->wm.base.sampler_offset |
> -                                              wm->wm4.stats_enable |
> -                                              (wm->wm4.sampler_count << 2)),
> -			      I915_GEM_DOMAIN_INSTRUCTION, 0);
> -   }
> -
>      brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
>   }
>   
> diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> index 72aad96..f488557 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> @@ -38,7 +38,6 @@
>   #include "main/framebuffer.h"
>   
>   #include "intel_mipmap_tree.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_tex.h"
>   #include "intel_fbo.h"
>   #include "intel_buffer_objects.h"
> @@ -243,7 +242,7 @@ brw_get_texture_swizzle(const struct gl_context *ctx,
>   static void
>   gen4_emit_buffer_surface_state(struct brw_context *brw,
>                                  uint32_t *out_offset,
> -                               drm_intel_bo *bo,
> +                               struct brw_bo *bo,
>                                  unsigned buffer_offset,
>                                  unsigned surface_format,
>                                  unsigned buffer_size,
> @@ -257,7 +256,10 @@ gen4_emit_buffer_surface_state(struct brw_context *brw,
>      surf[0] = BRW_SURFACE_BUFFER << BRW_SURFACE_TYPE_SHIFT |
>                surface_format << BRW_SURFACE_FORMAT_SHIFT |
>                (brw->gen >= 6 ? BRW_SURFACE_RC_READ_WRITE : 0);
> -   surf[1] = (bo ? bo->offset64 : 0) + buffer_offset; /* reloc */
> +   surf[1] = brw_batch_reloc(&brw->batch, *out_offset + 4,
> +                             bo, buffer_offset,
> +                             I915_GEM_DOMAIN_SAMPLER,
> +                             (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
>      surf[2] = (buffer_size & 0x7f) << BRW_SURFACE_WIDTH_SHIFT |
>                ((buffer_size >> 7) & 0x1fff) << BRW_SURFACE_HEIGHT_SHIFT;
>      surf[3] = ((buffer_size >> 20) & 0x7f) << BRW_SURFACE_DEPTH_SHIFT |
> @@ -268,10 +270,6 @@ gen4_emit_buffer_surface_state(struct brw_context *brw,
>       * physical cache.  It is mapped in hardware to the sampler cache."
>       */
>      if (bo) {
> -      drm_intel_bo_emit_reloc(brw->batch.bo, *out_offset + 4,
> -                              bo, buffer_offset,
> -                              I915_GEM_DOMAIN_SAMPLER,
> -                              (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
>      }
>   }
>   
> @@ -285,7 +283,7 @@ brw_update_buffer_texture_surface(struct gl_context *ctx,
>      struct intel_buffer_object *intel_obj =
>         intel_buffer_object(tObj->BufferObject);
>      uint32_t size = tObj->BufferSize;
> -   drm_intel_bo *bo = NULL;
> +   struct brw_bo *bo = NULL;
>      mesa_format format = tObj->_BufferObjectFormat;
>      uint32_t brw_format = brw_format_for_mesa_format(format);
>      int texel_size = _mesa_get_format_bytes(format);
> @@ -367,7 +365,11 @@ brw_update_texture_surface(struct gl_context *ctx,
>   	      BRW_SURFACE_CUBEFACE_ENABLES |
>   	      tex_format << BRW_SURFACE_FORMAT_SHIFT);
>   
> -   surf[1] = mt->bo->offset64 + mt->offset; /* reloc */
> +   surf[1] = brw_batch_reloc(&brw->batch,
> +			     *surf_offset + 4,
> +			     mt->bo,
> +			     mt->offset,
> +			     I915_GEM_DOMAIN_SAMPLER, 0);
>   
>      surf[2] = ((intelObj->_MaxLevel - tObj->BaseLevel) << BRW_SURFACE_LOD_SHIFT |
>   	      (mt->logical_width0 - 1) << BRW_SURFACE_WIDTH_SHIFT |
> @@ -381,13 +383,6 @@ brw_update_texture_surface(struct gl_context *ctx,
>                 SET_FIELD(tObj->BaseLevel - mt->first_level, BRW_SURFACE_MIN_LOD));
>   
>      surf[5] = mt->align_h == 4 ? BRW_SURFACE_VERTICAL_ALIGN_ENABLE : 0;
> -
> -   /* Emit relocation to surface contents */
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -                           *surf_offset + 4,
> -                           mt->bo,
> -                           surf[1] - mt->bo->offset64,
> -                           I915_GEM_DOMAIN_SAMPLER, 0);
>   }
>   
>   /**
> @@ -396,7 +391,7 @@ brw_update_texture_surface(struct gl_context *ctx,
>    */
>   void
>   brw_create_constant_surface(struct brw_context *brw,
> -			    drm_intel_bo *bo,
> +			    struct brw_bo *bo,
>   			    uint32_t offset,
>   			    uint32_t size,
>   			    uint32_t *out_offset,
> @@ -424,7 +419,7 @@ brw_update_sol_surface(struct brw_context *brw,
>   {
>      struct intel_buffer_object *intel_bo = intel_buffer_object(buffer_obj);
>      uint32_t offset_bytes = 4 * offset_dwords;
> -   drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_bo,
> +   struct brw_bo *bo = intel_bufferobj_buffer(brw, intel_bo,
>                                                offset_bytes,
>                                                buffer_obj->Size - offset_bytes);
>      uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
> @@ -480,19 +475,17 @@ brw_update_sol_surface(struct brw_context *brw,
>         BRW_SURFACE_MIPMAPLAYOUT_BELOW << BRW_SURFACE_MIPLAYOUT_SHIFT |
>         surface_format << BRW_SURFACE_FORMAT_SHIFT |
>         BRW_SURFACE_RC_READ_WRITE;
> -   surf[1] = bo->offset64 + offset_bytes; /* reloc */
> +   surf[1] = brw_batch_reloc(&brw->batch,
> +			     *out_offset + 4,
> +			     bo, offset_bytes,
> +			     I915_GEM_DOMAIN_RENDER,
> +			     I915_GEM_DOMAIN_RENDER);
>      surf[2] = (width << BRW_SURFACE_WIDTH_SHIFT |
>   	      height << BRW_SURFACE_HEIGHT_SHIFT);
>      surf[3] = (depth << BRW_SURFACE_DEPTH_SHIFT |
>                 pitch_minus_1 << BRW_SURFACE_PITCH_SHIFT);
>      surf[4] = 0;
>      surf[5] = 0;
> -
> -   /* Emit relocation to surface contents. */
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -			   *out_offset + 4,
> -			   bo, offset_bytes,
> -			   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
>   }
>   
>   /* Creates a new WM constant buffer reflecting the current fragment program's
> @@ -560,7 +553,7 @@ brw_emit_null_surface_state(struct brw_context *brw,
>       *     - Surface Format must be R8G8B8A8_UNORM.
>       */
>      unsigned surface_type = BRW_SURFACE_NULL;
> -   drm_intel_bo *bo = NULL;
> +   struct brw_bo *bo = NULL;
>      unsigned pitch_minus_1 = 0;
>      uint32_t multisampling_state = 0;
>      uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
> @@ -600,7 +593,10 @@ brw_emit_null_surface_state(struct brw_context *brw,
>   		  1 << BRW_SURFACE_WRITEDISABLE_B_SHIFT |
>   		  1 << BRW_SURFACE_WRITEDISABLE_A_SHIFT);
>      }
> -   surf[1] = bo ? bo->offset64 : 0;
> +   surf[1] = brw_batch_reloc(&brw->batch, *out_offset + 4,
> +                             bo, 0,
> +                             I915_GEM_DOMAIN_RENDER,
> +                             I915_GEM_DOMAIN_RENDER);
>      surf[2] = ((width - 1) << BRW_SURFACE_WIDTH_SHIFT |
>                 (height - 1) << BRW_SURFACE_HEIGHT_SHIFT);
>   
> @@ -613,13 +609,6 @@ brw_emit_null_surface_state(struct brw_context *brw,
>                 pitch_minus_1 << BRW_SURFACE_PITCH_SHIFT);
>      surf[4] = multisampling_state;
>      surf[5] = 0;
> -
> -   if (bo) {
> -      drm_intel_bo_emit_reloc(brw->batch.bo,
> -                              *out_offset + 4,
> -                              bo, 0,
> -                              I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
> -   }
>   }
>   
>   /**
> @@ -676,8 +665,12 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
>   
>      /* reloc */
>      assert(mt->offset % mt->cpp == 0);
> -   surf[1] = (intel_renderbuffer_get_tile_offsets(irb, &tile_x, &tile_y) +
> -	      mt->bo->offset64 + mt->offset);
> +   surf[1] = brw_batch_reloc(&brw->batch, offset + 4,
> +			     mt->bo,
> +			     mt->offset +
> +			     intel_renderbuffer_get_tile_offsets(irb, &tile_x, &tile_y),
> +			     I915_GEM_DOMAIN_RENDER,
> +			     I915_GEM_DOMAIN_RENDER);
>   
>      surf[2] = ((rb->Width - 1) << BRW_SURFACE_WIDTH_SHIFT |
>   	      (rb->Height - 1) << BRW_SURFACE_HEIGHT_SHIFT);
> @@ -719,13 +712,6 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
>         }
>      }
>   
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -                           offset + 4,
> -                           mt->bo,
> -                           surf[1] - mt->bo->offset64,
> -                           I915_GEM_DOMAIN_RENDER,
> -                           I915_GEM_DOMAIN_RENDER);
> -
>      return offset;
>   }
>   
> @@ -904,7 +890,7 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
>   
>         binding = &ctx->UniformBufferBindings[shader->UniformBlocks[i].Binding];
>         intel_bo = intel_buffer_object(binding->BufferObject);
> -      drm_intel_bo *bo =
> +      struct brw_bo *bo =
>            intel_bufferobj_buffer(brw, intel_bo,
>                                   binding->Offset,
>                                   binding->BufferObject->Size - binding->Offset);
> @@ -963,7 +949,7 @@ brw_upload_abo_surfaces(struct brw_context *brw,
>            &ctx->AtomicBufferBindings[prog->AtomicBuffers[i].Binding];
>         struct intel_buffer_object *intel_bo =
>            intel_buffer_object(binding->BufferObject);
> -      drm_intel_bo *bo = intel_bufferobj_buffer(
> +      struct brw_bo *bo = intel_bufferobj_buffer(
>            brw, intel_bo, binding->Offset, intel_bo->Base.Size - binding->Offset);
>   
>         brw->vtbl.emit_buffer_surface_state(brw, &surf_offsets[i], bo,
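
In gen4_emit_buffer_surface_state() this hunk leaves an empty

   if (bo) {
   }

behind; please drop it. I take it brw_batch_reloc() with a NULL target
simply returns the delta and records nothing, given that
brw_emit_null_surface_state() now calls it unconditionally with
bo == NULL and expects 0 back.
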
> diff --git a/src/mesa/drivers/dri/i965/gen6_blorp.cpp b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
> index 54c4a6d..6aa772c 100644
> --- a/src/mesa/drivers/dri/i965/gen6_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/gen6_blorp.cpp
> @@ -23,7 +23,6 @@
>   
>   #include <assert.h>
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_fbo.h"
>   #include "intel_mipmap_tree.h"
>   
> @@ -386,9 +385,12 @@ gen6_blorp_emit_surface_state(struct brw_context *brw,
>                 BRW_SURFACE_CUBEFACE_ENABLES |
>                 surface->brw_surfaceformat << BRW_SURFACE_FORMAT_SHIFT);
>   
> -   /* reloc */
> -   surf[1] = (surface->compute_tile_offsets(&tile_x, &tile_y) +
> -              mt->bo->offset64);
> +   surf[1] = brw_batch_reloc(&brw->batch,
> +			     wm_surf_offset + 4,
> +			     mt->bo,
> +			     surface->compute_tile_offsets(&tile_x, &tile_y),
> +			     read_domains, write_domain);
> +
>   
>      surf[2] = (0 << BRW_SURFACE_LOD_SHIFT |
>                 (width - 1) << BRW_SURFACE_WIDTH_SHIFT |
> @@ -416,13 +418,6 @@ gen6_blorp_emit_surface_state(struct brw_context *brw,
>                 (surface->mt->align_h == 4 ?
>                  BRW_SURFACE_VERTICAL_ALIGN_ENABLE : 0));
>   
> -   /* Emit relocation to surface contents */
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -                           wm_surf_offset + 4,
> -                           mt->bo,
> -                           surf[1] - mt->bo->offset64,
> -                           read_domains, write_domain);
> -
>      return wm_surf_offset;
>   }
>   
> diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
> index 2b76e24..2fbb075 100644
> --- a/src/mesa/drivers/dri/i965/gen6_cc.c
> +++ b/src/mesa/drivers/dri/i965/gen6_cc.c
> @@ -29,7 +29,6 @@
>   #include "brw_state.h"
>   #include "brw_defines.h"
>   #include "brw_util.h"
> -#include "intel_batchbuffer.h"
>   #include "main/macros.h"
>   #include "main/enums.h"
>   #include "main/glformats.h"
> diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
> index 9a29366..d7fe872 100644
> --- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
> @@ -29,7 +29,6 @@
>   #include "brw_state.h"
>   #include "brw_defines.h"
>   #include "brw_util.h"
> -#include "intel_batchbuffer.h"
>   #include "main/fbobject.h"
>   #include "main/framebuffer.h"
>   
> diff --git a/src/mesa/drivers/dri/i965/gen6_depth_state.c b/src/mesa/drivers/dri/i965/gen6_depth_state.c
> index 8f0d7dc..b1a9dd1 100644
> --- a/src/mesa/drivers/dri/i965/gen6_depth_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_depth_state.c
> @@ -22,7 +22,6 @@
>    */
>   
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_fbo.h"
>   #include "intel_mipmap_tree.h"
>   
> diff --git a/src/mesa/drivers/dri/i965/gen6_depthstencil.c b/src/mesa/drivers/dri/i965/gen6_depthstencil.c
> index 2c625c9..ed731c5 100644
> --- a/src/mesa/drivers/dri/i965/gen6_depthstencil.c
> +++ b/src/mesa/drivers/dri/i965/gen6_depthstencil.c
> @@ -25,7 +25,6 @@
>    *
>    */
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_fbo.h"
>   #include "brw_context.h"
>   #include "brw_defines.h"
> diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
> index eb4c586..3d4bb68 100644
> --- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
> @@ -28,7 +28,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   gen6_upload_gs_push_constants(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
> index 36734f5..878c13a 100644
> --- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
> @@ -21,7 +21,6 @@
>    * IN THE SOFTWARE.
>    */
>   
> -#include "intel_batchbuffer.h"
>   
>   #include "brw_context.h"
>   #include "brw_defines.h"
> diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> index 9f4a5db..120aedb 100644
> --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
> +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> @@ -36,7 +36,6 @@
>   #include "brw_context.h"
>   #include "brw_defines.h"
>   #include "brw_state.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_reg.h"
>   
>   /*
> @@ -50,7 +49,7 @@
>    */
>   void
>   brw_store_register_mem64(struct brw_context *brw,
> -                         drm_intel_bo *bo, uint32_t reg, int idx)
> +                         struct brw_bo *bo, uint32_t reg, int idx)
>   {
>      assert(brw->gen >= 6);
>   
> @@ -84,7 +83,7 @@ brw_store_register_mem64(struct brw_context *brw,
>   
>   static void
>   write_primitives_generated(struct brw_context *brw,
> -                           drm_intel_bo *query_bo, int stream, int idx)
> +                           struct brw_bo *query_bo, int stream, int idx)
>   {
>      brw_emit_mi_flush(brw);
>   
> @@ -98,7 +97,7 @@ write_primitives_generated(struct brw_context *brw,
>   
>   static void
>   write_xfb_primitives_written(struct brw_context *brw,
> -                             drm_intel_bo *bo, int stream, int idx)
> +                             struct brw_bo *bo, int stream, int idx)
>   {
>      brw_emit_mi_flush(brw);
>   
> @@ -119,7 +118,7 @@ pipeline_target_to_index(int target)
>   }
>   
>   static void
> -emit_pipeline_stat(struct brw_context *brw, drm_intel_bo *bo,
> +emit_pipeline_stat(struct brw_context *brw, struct brw_bo *bo,
>                      int stream, int target, int idx)
>   {
>      /* One source of confusion is the tessellation shader statistics. The
> @@ -175,8 +174,7 @@ gen6_queryobj_get_results(struct gl_context *ctx,
>      if (query->bo == NULL)
>         return;
>   
> -   brw_bo_map(brw, query->bo, false, "query object");
> -   uint64_t *results = query->bo->virtual;
> +   uint64_t *results = brw_bo_map(query->bo, MAP_READ);
>      switch (query->Base.Target) {
>      case GL_TIME_ELAPSED:
>         /* The query BO contains the starting and ending timestamps.
> @@ -255,12 +253,11 @@ gen6_queryobj_get_results(struct gl_context *ctx,
>      default:
>         unreachable("Unrecognized query target in brw_queryobj_get_results()");
>      }
> -   drm_intel_bo_unmap(query->bo);
>   
>      /* Now that we've processed the data stored in the query's buffer object,
>       * we can release it.
>       */
> -   drm_intel_bo_unreference(query->bo);
> +   brw_bo_put(query->bo);
>      query->bo = NULL;
>   
>      query->Base.Ready = true;
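
Nice, the map/virtual/unmap triple collapses into a single call here. To check
my understanding of the new contract (an assumption from the call sites, not
something this patch states): brw_bo_map(MAP_READ) flushes any batch still
referencing the bo, waits for rendering, and the mapping lives with the bo so
no explicit unmap is needed. A minimal sketch of the result path as I read it:

  /* Sketch only; brw_bo_map(MAP_READ) is assumed to flush + stall and
   * to return a mapping that persists with the bo (no unmap).
   */
  static uint64_t
  read_query_qword(struct brw_bo *bo, unsigned idx)
  {
     uint64_t *results = brw_bo_map(bo, MAP_READ);
     return results[idx];
  }

If that holds, it also explains why drm_intel_bo_unmap() simply disappears
from this function.
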
> @@ -279,8 +276,11 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
>      struct brw_query_object *query = (struct brw_query_object *)q;
>   
>      /* Since we're starting a new query, we need to throw away old results. */
> -   drm_intel_bo_unreference(query->bo);
> -   query->bo = drm_intel_bo_alloc(brw->bufmgr, "query results", 4096, 4096);
> +   brw_bo_put(query->bo);
> +   query->bo = brw_bo_create(&brw->batch, "query results", 4096, 4096, 0);
> +
> +   if (brw_batch_begin(&brw->batch, 120, RENDER_RING) < 0)
> +      return;
>   
>      switch (query->Base.Target) {
>      case GL_TIME_ELAPSED:
> @@ -337,6 +337,8 @@ gen6_begin_query(struct gl_context *ctx, struct gl_query_object *q)
>      default:
>         unreachable("Unrecognized query target in brw_begin_query()");
>      }
> +
> +   brw_batch_end(&brw->batch);
>   }
>   
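So every GL entry point that emits commands now brackets its batch access.
Spelling out the contract I infer, as a hedged sketch (the dword estimate and
the error handling are my reading, not the actual implementation):

  /* The 120/300/1000 values in this patch look like per-call-site
   * upper bounds on the dwords about to be emitted.
   */
  if (brw_batch_begin(&brw->batch, estimated_dwords, RENDER_RING) < 0)
     return;   /* no space could be reserved; nothing was emitted */

  /* ... BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() as before ... */

  if (brw_batch_end(&brw->batch))
     ;         /* aperture overflow; see gen8_hiz_exec() below */

One ordering nit: query->bo is reallocated before brw_batch_begin() can fail,
so an aborted begin leaves behind a fresh bo that nothing has written.
Harmless, I think, but worth a comment.
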
>   /**
> @@ -353,6 +355,9 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
>      struct brw_context *brw = brw_context(ctx);
>      struct brw_query_object *query = (struct brw_query_object *)q;
>   
> +   if (brw_batch_begin(&brw->batch, 120, RENDER_RING) < 0)
> +      return;
> +
>      switch (query->Base.Target) {
>      case GL_TIME_ELAPSED:
>         brw_write_timestamp(brw, query->bo, 1);
> @@ -391,26 +396,7 @@ gen6_end_query(struct gl_context *ctx, struct gl_query_object *q)
>         unreachable("Unrecognized query target in brw_end_query()");
>      }
>   
> -   /* The current batch contains the commands to handle EndQuery(),
> -    * but they won't actually execute until it is flushed.
> -    */
> -   query->flushed = false;
> -}
> -
> -/**
> - * Flush the batch if it still references the query object BO.
> - */
> -static void
> -flush_batch_if_needed(struct brw_context *brw, struct brw_query_object *query)
> -{
> -   /* If the batch doesn't reference the BO, it must have been flushed
> -    * (for example, due to being full).  Record that it's been flushed.
> -    */
> -   query->flushed = query->flushed ||
> -      !drm_intel_bo_references(brw->batch.bo, query->bo);
> -
> -   if (!query->flushed)
> -      intel_batchbuffer_flush(brw);
> +   brw_batch_end(&brw->batch);
>   }
>   
>   /**
> @@ -421,15 +407,12 @@ flush_batch_if_needed(struct brw_context *brw, struct brw_query_object *query)
>    */
>   static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
>   {
> -   struct brw_context *brw = brw_context(ctx);
>      struct brw_query_object *query = (struct brw_query_object *)q;
>   
>      /* If the application has requested the query result, but this batch is
>       * still contributing to it, flush it now to finish that work so the
>       * result will become available (eventually).
>       */
> -   flush_batch_if_needed(brw, query);
> -
>      gen6_queryobj_get_results(ctx, query);
>   }
>   
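The comment above gen6_wait_query() is now stale: flush_batch_if_needed() is
gone and the synchronisation moved into gen6_queryobj_get_results(). Assuming
brw_bo_map(MAP_READ) flushes and waits as discussed above, something like this
would keep the comment honest:

  /* If the application has requested the query result, gather it now;
   * brw_bo_map(MAP_READ) in gen6_queryobj_get_results() flushes any
   * batch still referencing query->bo and waits for the GPU.
   */
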
> @@ -441,7 +424,6 @@ static void gen6_wait_query(struct gl_context *ctx, struct gl_query_object *q)
>    */
>   static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
>   {
> -   struct brw_context *brw = brw_context(ctx);
>      struct brw_query_object *query = (struct brw_query_object *)q;
>   
>      /* If query->bo is NULL, we've already gathered the results - this is a
> @@ -457,9 +439,7 @@ static void gen6_check_query(struct gl_context *ctx, struct gl_query_object *q)
>       *      not ready yet on the first time it is queried.  This ensures that
>       *      the async query will return true in finite time.
>       */
> -   flush_batch_if_needed(brw, query);
> -
> -   if (!drm_intel_bo_busy(query->bo)) {
> +   if (!brw_bo_busy(query->bo, BUSY_READ | BUSY_FLUSH)) {
>         gen6_queryobj_get_results(ctx, query);
>      }
>   }
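
Here BUSY_FLUSH is doing what flush_batch_if_needed() used to: if the only
thing keeping query->bo busy is the not-yet-submitted batch, kick it so the
async query becomes ready in finite time, as the comment above requires. My
mental model, with the flag semantics assumed from this call site alone:

  /* Poll without stalling; BUSY_FLUSH is assumed to submit any batch
   * that still references the bo before reporting busyness.
   */
  static bool
  query_ready(const struct brw_query_object *query)
  {
     return !brw_bo_busy(query->bo, BUSY_READ | BUSY_FLUSH);
  }

If BUSY_FLUSH is cheaper than the old drm_intel_bo_references() walk, this
should also help the CheckQuery() hot path.
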
> diff --git a/src/mesa/drivers/dri/i965/gen6_sampler_state.c b/src/mesa/drivers/dri/i965/gen6_sampler_state.c
> index 9e7da58..ecc6b21 100644
> --- a/src/mesa/drivers/dri/i965/gen6_sampler_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_sampler_state.c
> @@ -28,7 +28,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   upload_sampler_state_pointers(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
> index 17b4a7f..4a343f4 100644
> --- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
> @@ -28,7 +28,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   #include "main/fbobject.h"
>   #include "main/framebuffer.h"
>   
> diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
> index b00517e..026fee3 100644
> --- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
> @@ -32,7 +32,6 @@
>   #include "main/macros.h"
>   #include "main/fbobject.h"
>   #include "main/framebuffer.h"
> -#include "intel_batchbuffer.h"
>   
>   /**
>    * Determine the appropriate attribute override value to store into the
> diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
> index 3899ce9..39bf8e9 100644
> --- a/src/mesa/drivers/dri/i965/gen6_sol.c
> +++ b/src/mesa/drivers/dri/i965/gen6_sol.c
> @@ -29,7 +29,6 @@
>   #include "main/bufferobj.h"
>   #include "main/macros.h"
>   #include "brw_context.h"
> -#include "intel_batchbuffer.h"
>   #include "brw_defines.h"
>   #include "brw_state.h"
>   #include "main/transformfeedback.h"
> @@ -205,9 +204,9 @@ brw_new_transform_feedback(struct gl_context *ctx, GLuint name)
>      _mesa_init_transform_feedback_object(&brw_obj->base, name);
>   
>      brw_obj->offset_bo =
> -      drm_intel_bo_alloc(brw->bufmgr, "transform feedback offsets", 16, 64);
> +      brw_bo_create(&brw->batch, "transform feedback offsets", 16, 64, 0);
>      brw_obj->prim_count_bo =
> -      drm_intel_bo_alloc(brw->bufmgr, "xfb primitive counts", 4096, 64);
> +      brw_bo_create(&brw->batch, "xfb primitive counts", 4096, 64, 0);
>   
>      return &brw_obj->base;
>   }
> @@ -223,8 +222,8 @@ brw_delete_transform_feedback(struct gl_context *ctx,
>         _mesa_reference_buffer_object(ctx, &obj->Buffers[i], NULL);
>      }
>   
> -   drm_intel_bo_unreference(brw_obj->offset_bo);
> -   drm_intel_bo_unreference(brw_obj->prim_count_bo);
> +   brw_bo_put(brw_obj->offset_bo);
> +   brw_bo_put(brw_obj->prim_count_bo);
>   
>      free(brw_obj);
>   }
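
For anyone else reviewing, the bo lifecycle translation used throughout the
patch is mechanical (the trailing 0 is the new flags argument, whose possible
values this hunk doesn't show; the usage line below is illustrative only):

  /* drm_intel_bo_alloc(bufmgr, name, size, align)
   *    -> brw_bo_create(&brw->batch, name, size, align, flags)
   * drm_intel_bo_unreference(bo)
   *    -> brw_bo_put(bo)
   */
  struct brw_bo *bo =
     brw_bo_create(&brw->batch, "example", 4096, 64, 0 /* flags */);
  brw_bo_put(bo);
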
> diff --git a/src/mesa/drivers/dri/i965/gen6_surface_state.c b/src/mesa/drivers/dri/i965/gen6_surface_state.c
> index 03e913a..53abbba 100644
> --- a/src/mesa/drivers/dri/i965/gen6_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_surface_state.c
> @@ -30,7 +30,6 @@
>   #include "program/prog_parameter.h"
>   
>   #include "intel_mipmap_tree.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_tex.h"
>   #include "intel_fbo.h"
>   #include "intel_buffer_objects.h"
> @@ -95,7 +94,10 @@ gen6_update_renderbuffer_surface(struct brw_context *brw,
>   
>      /* reloc */
>      assert(mt->offset % mt->cpp == 0);
> -   surf[1] = mt->bo->offset64 + mt->offset;
> +   surf[1] = brw_batch_reloc(&brw->batch, offset + 4,
> +			     mt->bo, mt->offset,
> +			     I915_GEM_DOMAIN_RENDER,
> +			     I915_GEM_DOMAIN_RENDER);
>   
>      /* In the gen6 PRM Volume 1 Part 1: Graphics Core, Section 7.18.3.7.1
>       * (Surface Arrays For all surfaces other than separate stencil buffer):
> @@ -127,13 +129,6 @@ gen6_update_renderbuffer_surface(struct brw_context *brw,
>   
>      surf[5] = (mt->align_h == 4 ? BRW_SURFACE_VERTICAL_ALIGN_ENABLE : 0);
>   
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -                           offset + 4,
> -                           mt->bo,
> -                           surf[1] - mt->bo->offset64,
> -                           I915_GEM_DOMAIN_RENDER,
> -                           I915_GEM_DOMAIN_RENDER);
> -
>      return offset;
>   }
>   
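brw_batch_reloc() folding the relocation record and the presumed-address write
into one value-returning call is what enables the flat, immediately-resolved
relocation scheme described in the commit message. Roughly what I assume it is
equivalent to; names and fields here are invented for illustration:

  static uint64_t
  batch_reloc_sketch(struct brw_batch *batch, uint32_t batch_offset,
                     struct brw_bo *bo, uint64_t delta,
                     unsigned read_domains, unsigned write_domain)
  {
     if (!bo)
        return 0;   /* a NULL target writes 0, see the gen8 aux paths */
     /* record (batch_offset, bo, delta, domains) for execbuf ... */
     return bo->presumed_offset + delta;   /* value to write now */
  }

Returning the presumed address keeps the kernel's relocation short-circuit
working, exactly as the comment in the deleted intel_batchbuffer_emit_reloc()
explains further down.
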
> diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c
> index c7311fd..78cb973 100644
> --- a/src/mesa/drivers/dri/i965/gen6_urb.c
> +++ b/src/mesa/drivers/dri/i965/gen6_urb.c
> @@ -26,7 +26,6 @@
>    */
>   
>   #include "main/macros.h"
> -#include "intel_batchbuffer.h"
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
> index 7c8d884..2845f3b 100644
> --- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
> @@ -28,7 +28,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   #include "main/fbobject.h"
>   #include "main/framebuffer.h"
>   #include "main/viewport.h"
> diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
> index 35d10ef..89bb426 100644
> --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
> @@ -29,9 +29,9 @@
>   #include "brw_state.h"
>   #include "brw_defines.h"
>   #include "brw_util.h"
> +#include "intel_reg.h"
>   #include "program/prog_parameter.h"
>   #include "program/prog_statevars.h"
> -#include "intel_batchbuffer.h"
>   #include "glsl/glsl_parser_extras.h"
>   
>   /**
> diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
> index d1748ba..e3b6054 100644
> --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
> @@ -34,7 +34,6 @@
>   #include "program/prog_parameter.h"
>   #include "program/prog_statevars.h"
>   #include "main/framebuffer.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   gen6_upload_wm_push_constants(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> index abace6d..62d735e 100644
> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> @@ -23,7 +23,6 @@
>   
>   #include <assert.h>
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_fbo.h"
>   #include "intel_mipmap_tree.h"
>   
> @@ -168,9 +167,11 @@ gen7_blorp_emit_surface_state(struct brw_context *brw,
>      else
>         surf[0] |= GEN7_SURFACE_ARYSPC_FULL;
>   
> -   /* reloc */
> -   surf[1] =
> -      surface->compute_tile_offsets(&tile_x, &tile_y) + mt->bo->offset64;
> +   surf[1] = brw_batch_reloc(&brw->batch,
> +			     wm_surf_offset + 4,
> +			     mt->bo,
> +			     surface->compute_tile_offsets(&tile_x, &tile_y),
> +			     read_domains, write_domain);
>   
>      /* Note that the low bits of these fields are missing, so
>       * there's the possibility of getting in trouble.
> @@ -204,13 +205,6 @@ gen7_blorp_emit_surface_state(struct brw_context *brw,
>                     SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A));
>      }
>   
> -   /* Emit relocation to surface contents */
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -                           wm_surf_offset + 4,
> -                           mt->bo,
> -                           surf[1] - mt->bo->offset64,
> -                           read_domains, write_domain);
> -
>      gen7_check_surface_setup(surf, is_render_target);
>   
>      return wm_surf_offset;
> diff --git a/src/mesa/drivers/dri/i965/gen7_disable.c b/src/mesa/drivers/dri/i965/gen7_disable.c
> index 2c43cd7..6d0be45 100644
> --- a/src/mesa/drivers/dri/i965/gen7_disable.c
> +++ b/src/mesa/drivers/dri/i965/gen7_disable.c
> @@ -24,7 +24,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   disable_stages(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
> index 8d6d3fe..02add4c 100644
> --- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
> @@ -24,7 +24,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   upload_gs_state(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen7_misc_state.c b/src/mesa/drivers/dri/i965/gen7_misc_state.c
> index a14d4a0..fb20b22 100644
> --- a/src/mesa/drivers/dri/i965/gen7_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_misc_state.c
> @@ -22,7 +22,6 @@
>    */
>   
>   #include "main/mtypes.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_mipmap_tree.h"
>   #include "intel_fbo.h"
>   #include "brw_context.h"
> @@ -53,7 +52,7 @@ gen7_emit_depth_stencil_hiz(struct brw_context *brw,
>   
>      /* Skip repeated NULL depth/stencil emits (think 2D rendering). */
>      if (!mt && brw->no_depth_or_stencil) {
> -      assert(brw->hw_ctx);
> +      assert(brw->batch.hw_ctx);
>         return;
>      }
>   
> diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
> index 4fa46a8..3742cb7 100644
> --- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
> @@ -28,7 +28,6 @@
>   #include "main/macros.h"
>   #include "main/fbobject.h"
>   #include "main/framebuffer.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   upload_sbe_state(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
> index 41573a8..cabd77a 100644
> --- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
> @@ -31,8 +31,8 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_buffer_objects.h"
> +#include "intel_reg.h"
>   #include "main/transformfeedback.h"
>   
>   static void
> @@ -52,7 +52,7 @@ upload_3dstate_so_buffers(struct brw_context *brw)
>      for (i = 0; i < 4; i++) {
>         struct intel_buffer_object *bufferobj =
>   	 intel_buffer_object(xfb_obj->Buffers[i]);
> -      drm_intel_bo *bo;
> +      struct brw_bo *bo;
>         uint32_t start, end;
>         uint32_t stride;
>   
> @@ -314,14 +314,7 @@ gen7_tally_prims_generated(struct brw_context *brw,
>      /* If the current batch is still contributing to the number of primitives
>       * generated, flush it now so the results will be present when mapped.
>       */
> -   if (drm_intel_bo_references(brw->batch.bo, obj->prim_count_bo))
> -      intel_batchbuffer_flush(brw);
> -
> -   if (unlikely(brw->perf_debug && drm_intel_bo_busy(obj->prim_count_bo)))
> -      perf_debug("Stalling for # of transform feedback primitives written.\n");
> -
> -   drm_intel_bo_map(obj->prim_count_bo, false);
> -   uint64_t *prim_counts = obj->prim_count_bo->virtual;
> +   uint64_t *prim_counts = brw_bo_map(obj->prim_count_bo, MAP_READ);
>   
>      assert(obj->prim_count_buffer_index % (2 * BRW_MAX_XFB_STREAMS) == 0);
>      int pairs = obj->prim_count_buffer_index / (2 * BRW_MAX_XFB_STREAMS);
> @@ -334,8 +327,6 @@ gen7_tally_prims_generated(struct brw_context *brw,
>         prim_counts += 2 * BRW_MAX_XFB_STREAMS; /* move to the next pair */
>      }
>   
> -   drm_intel_bo_unmap(obj->prim_count_bo);
> -
>      /* We've already gathered up the old data; we can safely overwrite it now. */
>      obj->prim_count_buffer_index = 0;
>   }
> @@ -446,9 +437,6 @@ gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
>      /* Reset the SO buffer offsets to 0. */
>      if (brw->gen >= 8) {
>         brw_obj->zero_offsets = true;
> -   } else {
> -      intel_batchbuffer_flush(brw);
> -      brw->batch.needs_sol_reset = true;
>      }
>   
>      /* We're about to lose the information needed to compute the number of
> @@ -457,14 +445,30 @@ gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
>       */
>      brw_compute_xfb_vertices_written(brw, brw_obj);
>   
> +   if (brw_batch_begin(&brw->batch, 300, RENDER_RING) < 0)
> +      return;
> +
>      /* No primitives have been generated yet. */
>      for (int i = 0; i < BRW_MAX_XFB_STREAMS; i++) {
>         brw_obj->prims_generated[i] = 0;
>      }
>   
> +   /* Reset the SOL buffer offset registers. */
> +   if (brw->gen < 8) {
> +      for (int i = 0; i < 4; i++) {
> +         BEGIN_BATCH(3);
> +         OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
> +         OUT_BATCH(GEN7_SO_WRITE_OFFSET(i));
> +         OUT_BATCH(0);
> +         ADVANCE_BATCH();
> +      }
> +   }
> +
>      /* Store the starting value of the SO_NUM_PRIMS_WRITTEN counters. */
>      gen7_save_primitives_written_counters(brw, brw_obj);
>   
> +   brw_batch_end(&brw->batch);
> +
>      brw_obj->primitive_mode = mode;
>   }
>   
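Replacing needs_sol_reset (and the I915_EXEC_GEN7_SOL_RESET execbuf flag
deleted from do_flush_locked() below) with inline MI_LOAD_REGISTER_IMM writes
also drops the forced flush at BeginTransformFeedback, which should be a nice
win. For contrast, the deleted mechanism was:

  /* At BeginTransformFeedback (deleted above): */
  brw->batch.needs_sol_reset = true;

  /* In do_flush_locked() (deleted below): */
  if (batch->needs_sol_reset)
     flags |= I915_EXEC_GEN7_SOL_RESET;

One question though: is an MI_LOAD_REGISTER_IMM to GEN7_SO_WRITE_OFFSET(i)
from a userspace batch actually honoured on ivb, or was the execbuf flag
introduced precisely because it is not?
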
> @@ -482,9 +486,14 @@ gen7_end_transform_feedback(struct gl_context *ctx,
>      struct brw_transform_feedback_object *brw_obj =
>         (struct brw_transform_feedback_object *) obj;
>   
> +   if (brw_batch_begin(&brw->batch, 300, RENDER_RING) < 0)
> +      return;
> +
>      /* Store the ending value of the SO_NUM_PRIMS_WRITTEN counters. */
>      gen7_save_primitives_written_counters(brw, brw_obj);
>   
> +   brw_batch_end(&brw->batch);
> +
>      /* EndTransformFeedback() means that we need to update the number of
>       * vertices written.  Since it's only necessary if DrawTransformFeedback()
>       * is called and it means mapping a buffer object, we delay computing it
> @@ -501,6 +510,9 @@ gen7_pause_transform_feedback(struct gl_context *ctx,
>      struct brw_transform_feedback_object *brw_obj =
>         (struct brw_transform_feedback_object *) obj;
>   
> +   if (brw_batch_begin(&brw->batch, 300, RENDER_RING) < 0)
> +      return;
> +
>      /* Flush any drawing so that the counters have the right values. */
>      brw_emit_mi_flush(brw);
>   
> @@ -523,6 +535,8 @@ gen7_pause_transform_feedback(struct gl_context *ctx,
>       * from our counts.
>       */
>      gen7_save_primitives_written_counters(brw, brw_obj);
> +
> +   brw_batch_end(&brw->batch);
>   }
>   
>   void
> @@ -533,6 +547,9 @@ gen7_resume_transform_feedback(struct gl_context *ctx,
>      struct brw_transform_feedback_object *brw_obj =
>         (struct brw_transform_feedback_object *) obj;
>   
> +   if (brw_batch_begin(&brw->batch, 300, RENDER_RING) < 0)
> +      return;
> +
>      /* Reload the SOL buffer offset registers. */
>      if (brw->gen < 8) {
>         for (int i = 0; i < 4; i++) {
> @@ -548,4 +565,6 @@ gen7_resume_transform_feedback(struct gl_context *ctx,
>   
>      /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */
>      gen7_save_primitives_written_counters(brw, brw_obj);
> +
> +   brw_batch_end(&brw->batch);
>   }
> diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
> index d371c19..bc631b1 100644
> --- a/src/mesa/drivers/dri/i965/gen7_urb.c
> +++ b/src/mesa/drivers/dri/i965/gen7_urb.c
> @@ -22,7 +22,6 @@
>    */
>   
>   #include "main/macros.h"
> -#include "intel_batchbuffer.h"
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
> index b655205..a32e25f 100644
> --- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
> @@ -24,7 +24,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   #include "main/fbobject.h"
>   #include "main/framebuffer.h"
>   #include "main/viewport.h"
> diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> index 4b17d06..b6e90bc 100644
> --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> @@ -27,7 +27,6 @@
>   #include "brw_util.h"
>   #include "program/prog_parameter.h"
>   #include "program/prog_statevars.h"
> -#include "intel_batchbuffer.h"
>   
>   
>   void
> diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
> index ea11ae8..56400e3 100644
> --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
> @@ -31,7 +31,6 @@
>   #include "program/prog_parameter.h"
>   #include "program/prog_statevars.h"
>   #include "main/framebuffer.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   upload_wm_state(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
> index 15ab2b0..991e7bc 100644
> --- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
> @@ -28,7 +28,6 @@
>   #include "program/prog_parameter.h"
>   
>   #include "intel_mipmap_tree.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_tex.h"
>   #include "intel_fbo.h"
>   #include "intel_buffer_objects.h"
> @@ -116,19 +115,14 @@ gen7_set_surface_mcs_info(struct brw_context *brw,
>       * thus have their lower 12 bits zero), we can use an ordinary reloc to do
>       * the necessary address translation.
>       */
> -   assert ((mcs_mt->bo->offset64 & 0xfff) == 0);
> -
> -   surf[6] = GEN7_SURFACE_MCS_ENABLE |
> -             SET_FIELD(pitch_tiles - 1, GEN7_SURFACE_MCS_PITCH) |
> -             mcs_mt->bo->offset64;
> -
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -                           surf_offset + 6 * 4,
> -                           mcs_mt->bo,
> -                           surf[6] & 0xfff,
> -                           is_render_target ? I915_GEM_DOMAIN_RENDER
> -                           : I915_GEM_DOMAIN_SAMPLER,
> -                           is_render_target ? I915_GEM_DOMAIN_RENDER : 0);
> +   surf[6] = brw_batch_reloc(&brw->batch,
> +			     surf_offset + 6 * 4,
> +			     mcs_mt->bo,
> +			     GEN7_SURFACE_MCS_ENABLE |
> +			     SET_FIELD(pitch_tiles - 1, GEN7_SURFACE_MCS_PITCH),
> +			     is_render_target ? I915_GEM_DOMAIN_RENDER
> +			     : I915_GEM_DOMAIN_SAMPLER,
> +			     is_render_target ? I915_GEM_DOMAIN_RENDER : 0);
>   }
>   
>   
> @@ -221,7 +215,7 @@ gen7_check_surface_setup(uint32_t *surf, bool is_render_target)
>   static void
>   gen7_emit_buffer_surface_state(struct brw_context *brw,
>                                  uint32_t *out_offset,
> -                               drm_intel_bo *bo,
> +                               struct brw_bo *bo,
>                                  unsigned buffer_offset,
>                                  unsigned surface_format,
>                                  unsigned buffer_size,
> @@ -235,7 +229,9 @@ gen7_emit_buffer_surface_state(struct brw_context *brw,
>      surf[0] = BRW_SURFACE_BUFFER << BRW_SURFACE_TYPE_SHIFT |
>                surface_format << BRW_SURFACE_FORMAT_SHIFT |
>                BRW_SURFACE_RC_READ_WRITE;
> -   surf[1] = (bo ? bo->offset64 : 0) + buffer_offset; /* reloc */
> +   surf[1] = brw_batch_reloc(&brw->batch, *out_offset + 4,
> +			     bo, buffer_offset, I915_GEM_DOMAIN_SAMPLER,
> +			     (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
>      surf[2] = SET_FIELD((buffer_size - 1) & 0x7f, GEN7_SURFACE_WIDTH) |
>                SET_FIELD(((buffer_size - 1) >> 7) & 0x3fff, GEN7_SURFACE_HEIGHT);
>      if (surface_format == BRW_SURFACEFORMAT_RAW)
> @@ -253,13 +249,6 @@ gen7_emit_buffer_surface_state(struct brw_context *brw,
>                     SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A));
>      }
>   
> -   /* Emit relocation to surface contents */
> -   if (bo) {
> -      drm_intel_bo_emit_reloc(brw->batch.bo, *out_offset + 4,
> -                              bo, buffer_offset, I915_GEM_DOMAIN_SAMPLER,
> -                              (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
> -   }
> -
>      gen7_check_surface_setup(surf, false /* is_render_target */);
>   }
>   
> @@ -299,7 +288,10 @@ gen7_emit_texture_surface_state(struct brw_context *brw,
>      if (mt->array_layout == ALL_SLICES_AT_EACH_LOD)
>         surf[0] |= GEN7_SURFACE_ARYSPC_LOD0;
>   
> -   surf[1] = mt->bo->offset64 + mt->offset; /* reloc */
> +   surf[1] = brw_batch_reloc(&brw->batch, *surf_offset + 4,
> +			     mt->bo, mt->offset,
> +			     I915_GEM_DOMAIN_SAMPLER,
> +			     (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
>   
>      surf[2] = SET_FIELD(mt->logical_width0 - 1, GEN7_SURFACE_WIDTH) |
>                SET_FIELD(mt->logical_height0 - 1, GEN7_SURFACE_HEIGHT);
> @@ -336,14 +328,6 @@ gen7_emit_texture_surface_state(struct brw_context *brw,
>                                   mt->mcs_mt, false /* is RT */);
>      }
>   
> -   /* Emit relocation to surface contents */
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -                           *surf_offset + 4,
> -                           mt->bo,
> -                           surf[1] - mt->bo->offset64,
> -                           I915_GEM_DOMAIN_SAMPLER,
> -                           (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
> -
>      gen7_check_surface_setup(surf, false /* is_render_target */);
>   }
>   
> @@ -519,7 +503,10 @@ gen7_update_renderbuffer_surface(struct brw_context *brw,
>      }
>   
>      assert(mt->offset % mt->cpp == 0);
> -   surf[1] = mt->bo->offset64 + mt->offset;
> +   surf[1] = brw_batch_reloc(&brw->batch, offset + 4,
> +			     mt->bo, mt->offset,
> +			     I915_GEM_DOMAIN_RENDER,
> +			     I915_GEM_DOMAIN_RENDER);
>   
>      assert(brw->has_surface_tile_offset);
>   
> @@ -550,13 +537,6 @@ gen7_update_renderbuffer_surface(struct brw_context *brw,
>                     SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A));
>      }
>   
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -                           offset + 4,
> -                           mt->bo,
> -                           surf[1] - mt->bo->offset64,
> -                           I915_GEM_DOMAIN_RENDER,
> -                           I915_GEM_DOMAIN_RENDER);
> -
>      gen7_check_surface_setup(surf, true /* is_render_target */);
>   
>      return offset;
> diff --git a/src/mesa/drivers/dri/i965/gen8_blend_state.c b/src/mesa/drivers/dri/i965/gen8_blend_state.c
> index 786c79a..31a3deb 100644
> --- a/src/mesa/drivers/dri/i965/gen8_blend_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_blend_state.c
> @@ -26,7 +26,6 @@
>   #include "brw_defines.h"
>   #include "brw_util.h"
>   #include "brw_wm.h"
> -#include "intel_batchbuffer.h"
>   #include "main/macros.h"
>   #include "main/enums.h"
>   #include "main/glformats.h"
> diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
> index 8f23702..739e974 100644
> --- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
> @@ -21,9 +21,9 @@
>    * IN THE SOFTWARE.
>    */
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_mipmap_tree.h"
>   #include "intel_fbo.h"
> +#include "intel_reg.h"
>   #include "intel_resolve_map.h"
>   #include "brw_context.h"
>   #include "brw_state.h"
> @@ -52,7 +52,7 @@ emit_depth_packets(struct brw_context *brw,
>   
>      /* Skip repeated NULL depth/stencil emits (think 2D rendering). */
>      if (!depth_mt && !stencil_mt && brw->no_depth_or_stencil) {
> -      assert(brw->hw_ctx);
> +      assert(brw->batch.hw_ctx);
>         return;
>      }
>   
> @@ -400,6 +400,9 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
>      if (op == GEN6_HIZ_OP_NONE)
>         return;
>   
> +   if (brw_batch_begin(&brw->batch, 1000, RENDER_RING) < 0)
> +      return;
> +
>      /* Disable the PMA stall fix since we're about to do a HiZ operation. */
>      if (brw->gen == 8)
>         write_pma_stall_bits(brw, 0);
> @@ -508,7 +511,14 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
>      ADVANCE_BATCH();
>   
>      /* Mark this buffer as needing a TC flush, as we've rendered to it. */
> -   brw_render_cache_set_add_bo(brw, mt->bo);
> +   assert(mt->bo->dirty);
> +
> +   if (brw_batch_end(&brw->batch)) {
> +      struct gl_context *ctx = &brw->ctx;
> +      WARN_ONCE(1, "i965: blorp emit exceeded available aperture space\n");
> +   }
> +
> +   brw_batch_maybe_flush(&brw->batch);
>   
>      /* We've clobbered all of the depth packets, and the drawing rectangle,
>       * so we need to ensure those packets are re-emitted before the next
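
Two notes on this hunk. The assert(mt->bo->dirty) replacing
brw_render_cache_set_add_bo() suggests that a relocation with a non-zero write
domain now marks the target bo dirty as a side effect, subsuming the
render-cache set. Assuming that is the intended contract, the call sites read
like this (illustrative, using names from this patch):

  /* Assumed contract, not spelled out in this hunk: a write-domain
   * reloc marks the target bo dirty, so the TC-flush bookkeeping
   * happens automatically.
   */
  surf[1] = brw_batch_reloc(&brw->batch, offset + 4, mt->bo, 0,
                            I915_GEM_DOMAIN_RENDER,    /* read  */
                            I915_GEM_DOMAIN_RENDER);   /* write */
  assert(mt->bo->dirty);   /* set by the write reloc above */

And a nit: the WARN_ONCE text says "blorp emit" but this is the HiZ path.
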
> diff --git a/src/mesa/drivers/dri/i965/gen8_disable.c b/src/mesa/drivers/dri/i965/gen8_disable.c
> index da0d4a5..ec86fee 100644
> --- a/src/mesa/drivers/dri/i965/gen8_disable.c
> +++ b/src/mesa/drivers/dri/i965/gen8_disable.c
> @@ -24,7 +24,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   disable_stages(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen8_draw_upload.c b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
> index 1af90ec..dc5e915 100644
> --- a/src/mesa/drivers/dri/i965/gen8_draw_upload.c
> +++ b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
> @@ -32,7 +32,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_buffer_objects.h"
>   
>   static void
> diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
> index 26a02d3..e5c3d23 100644
> --- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
> @@ -24,7 +24,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   gen8_upload_gs_state(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen8_misc_state.c b/src/mesa/drivers/dri/i965/gen8_misc_state.c
> index b20038e..83376cd 100644
> --- a/src/mesa/drivers/dri/i965/gen8_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_misc_state.c
> @@ -21,7 +21,6 @@
>    * IN THE SOFTWARE.
>    */
>   
> -#include "intel_batchbuffer.h"
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> diff --git a/src/mesa/drivers/dri/i965/gen8_multisample_state.c b/src/mesa/drivers/dri/i965/gen8_multisample_state.c
> index 75cbe06..da5b32b 100644
> --- a/src/mesa/drivers/dri/i965/gen8_multisample_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_multisample_state.c
> @@ -21,7 +21,6 @@
>    * IN THE SOFTWARE.
>    */
>   
> -#include "intel_batchbuffer.h"
>   
>   #include "brw_context.h"
>   #include "brw_defines.h"
> diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
> index a88f109..0c66e91 100644
> --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
> @@ -25,7 +25,6 @@
>   #include "program/program.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   
>   void
>   gen8_upload_ps_extra(struct brw_context *brw,
> diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
> index c2b585d..d0174a1 100644
> --- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c
> @@ -27,7 +27,6 @@
>   #include "brw_util.h"
>   #include "main/macros.h"
>   #include "main/fbobject.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   upload_sbe(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen8_sol_state.c b/src/mesa/drivers/dri/i965/gen8_sol_state.c
> index 58ead68..07212ab 100644
> --- a/src/mesa/drivers/dri/i965/gen8_sol_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_sol_state.c
> @@ -31,7 +31,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_buffer_objects.h"
>   #include "main/transformfeedback.h"
>   
> @@ -70,7 +69,7 @@ gen8_upload_3dstate_so_buffers(struct brw_context *brw)
>         uint32_t start = xfb_obj->Offset[i];
>         assert(start % 4 == 0);
>         uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
> -      drm_intel_bo *bo =
> +      struct brw_bo *bo =
>            intel_bufferobj_buffer(brw, bufferobj, start, end - start);
>         assert(end <= bo->size);
>   
> diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
> index bd3eb00..2a815d7 100644
> --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
> @@ -29,7 +29,6 @@
>   #include "program/prog_parameter.h"
>   
>   #include "intel_mipmap_tree.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_tex.h"
>   #include "intel_fbo.h"
>   #include "intel_buffer_objects.h"
> @@ -145,7 +144,7 @@ allocate_surface_state(struct brw_context *brw, uint32_t *out_offset, int index)
>   static void
>   gen8_emit_buffer_surface_state(struct brw_context *brw,
>                                  uint32_t *out_offset,
> -                               drm_intel_bo *bo,
> +                               struct brw_bo *bo,
>                                  unsigned buffer_offset,
>                                  unsigned surface_format,
>                                  unsigned buffer_size,
> @@ -171,15 +170,11 @@ gen8_emit_buffer_surface_state(struct brw_context *brw,
>                SET_FIELD(HSW_SCS_GREEN, GEN7_SURFACE_SCS_G) |
>                SET_FIELD(HSW_SCS_BLUE,  GEN7_SURFACE_SCS_B) |
>                SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A);
> -   /* reloc */
> -   *((uint64_t *) &surf[8]) = (bo ? bo->offset64 : 0) + buffer_offset;
> -
>      /* Emit relocation to surface contents. */
> -   if (bo) {
> -      drm_intel_bo_emit_reloc(brw->batch.bo, *out_offset + 8 * 4,
> -                              bo, buffer_offset, I915_GEM_DOMAIN_SAMPLER,
> -                              rw ? I915_GEM_DOMAIN_SAMPLER : 0);
> -   }
> +   *((uint64_t *)&surf[8]) =
> +	   brw_batch_reloc(&brw->batch, *out_offset + 8 * 4,
> +			   bo, buffer_offset, I915_GEM_DOMAIN_SAMPLER,
> +			   rw ? I915_GEM_DOMAIN_SAMPLER : 0);
>   }
>   
>   static void
> @@ -275,27 +270,18 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
>         SET_FIELD(swizzle_to_scs(GET_SWZ(swizzle, 2)), GEN7_SURFACE_SCS_B) |
>         SET_FIELD(swizzle_to_scs(GET_SWZ(swizzle, 3)), GEN7_SURFACE_SCS_A);
>   
> -   *((uint64_t *) &surf[8]) = mt->bo->offset64 + mt->offset; /* reloc */
> -
> -   if (aux_mt) {
> -      *((uint64_t *) &surf[10]) = aux_mt->bo->offset64;
> -      drm_intel_bo_emit_reloc(brw->batch.bo, *surf_offset + 10 * 4,
> -                              aux_mt->bo, 0,
> -                              I915_GEM_DOMAIN_SAMPLER,
> -                              (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
> -   } else {
> -      surf[10] = 0;
> -      surf[11] = 0;
> -   }
> -   surf[12] = 0;
> -
> -   /* Emit relocation to surface contents */
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -                           *surf_offset + 8 * 4,
> -                           mt->bo,
> -                           mt->offset,
> +   *((uint64_t *)&surf[8]) =
> +	   brw_batch_reloc(&brw->batch, *surf_offset + 8 * 4,
> +			   mt->bo, mt->offset,
>                              I915_GEM_DOMAIN_SAMPLER,
>                              (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
> +
> +   *((uint64_t *)&surf[10]) =
> +	   brw_batch_reloc(&brw->batch, *surf_offset + 10 * 4,
> +			   aux_mt ? aux_mt->bo : NULL, 0,
> +			   I915_GEM_DOMAIN_SAMPLER,
> +			   (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
> +   surf[12] = 0;
>   }
>   
>   static void
> @@ -502,27 +488,18 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
>                SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A);
>   
>      assert(mt->offset % mt->cpp == 0);
> -   *((uint64_t *) &surf[8]) = mt->bo->offset64 + mt->offset; /* reloc */
> -
> -   if (aux_mt) {
> -      *((uint64_t *) &surf[10]) = aux_mt->bo->offset64;
> -      drm_intel_bo_emit_reloc(brw->batch.bo,
> -                              offset + 10 * 4,
> -                              aux_mt->bo, 0,
> -                              I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
> -   } else {
> -      surf[10] = 0;
> -      surf[11] = 0;
> -   }
> +   *((uint64_t *) &surf[8]) =
> +      brw_batch_reloc(&brw->batch, offset + 8*4,
> +		      mt->bo, mt->offset,
> +		      I915_GEM_DOMAIN_RENDER,
> +		      I915_GEM_DOMAIN_RENDER);
> +
> +   *((uint64_t *)&surf[10]) =
> +      brw_batch_reloc(&brw->batch, offset + 10 * 4,
> +		      aux_mt ? aux_mt->bo : NULL, 0,
> +		      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
>      surf[12] = 0;
>   
> -   drm_intel_bo_emit_reloc(brw->batch.bo,
> -                           offset + 8 * 4,
> -                           mt->bo,
> -                           mt->offset,
> -                           I915_GEM_DOMAIN_RENDER,
> -                           I915_GEM_DOMAIN_RENDER);
> -
>      return offset;
>   }
>   
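The unconditional 64-bit stores with a possibly-NULL aux bo read much better
than before, relying on brw_batch_reloc() returning 0 for a NULL target
(consistent with the gen7 buffer path above). The deleted equivalent, for
contrast:

  if (aux_mt) {
     *((uint64_t *) &surf[10]) = aux_mt->bo->offset64;
     drm_intel_bo_emit_reloc(brw->batch.bo,
                             offset + 10 * 4,
                             aux_mt->bo, 0,
                             I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
  } else {
     surf[10] = 0;
     surf[11] = 0;
  }

Does the NULL-bo case still record a relocation entry, or is it skipped
entirely? Skipping it would keep the execbuf object list minimal.
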
> diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
> index 2d8eeb1..2cc6f61 100644
> --- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
> @@ -24,7 +24,6 @@
>   #include "brw_context.h"
>   #include "brw_state.h"
>   #include "brw_defines.h"
> -#include "intel_batchbuffer.h"
>   #include "main/fbobject.h"
>   #include "main/framebuffer.h"
>   #include "main/viewport.h"
> diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c
> index 28f5add..f14951d 100644
> --- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
> @@ -27,7 +27,6 @@
>   #include "brw_util.h"
>   #include "program/prog_parameter.h"
>   #include "program/prog_statevars.h"
> -#include "intel_batchbuffer.h"
>   
>   static void
>   upload_vs_state(struct brw_context *brw)
> diff --git a/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c b/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c
> index 2c843b2..0f1128a 100644
> --- a/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c
> +++ b/src/mesa/drivers/dri/i965/gen8_wm_depth_stencil.c
> @@ -21,7 +21,6 @@
>    * IN THE SOFTWARE.
>    */
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_fbo.h"
>   #include "brw_context.h"
>   #include "brw_defines.h"
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> deleted file mode 100644
> index 969d92c..0000000
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> +++ /dev/null
> @@ -1,480 +0,0 @@
> -/**************************************************************************
> - *
> - * Copyright 2006 VMware, Inc.
> - * All Rights Reserved.
> - *
> - * Permission is hereby granted, free of charge, to any person obtaining a
> - * copy of this software and associated documentation files (the
> - * "Software"), to deal in the Software without restriction, including
> - * without limitation the rights to use, copy, modify, merge, publish,
> - * distribute, sub license, and/or sell copies of the Software, and to
> - * permit persons to whom the Software is furnished to do so, subject to
> - * the following conditions:
> - *
> - * The above copyright notice and this permission notice (including the
> - * next paragraph) shall be included in all copies or substantial portions
> - * of the Software.
> - *
> - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
> - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
> - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
> - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
> - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
> - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
> - *
> - **************************************************************************/
> -
> -#include "intel_batchbuffer.h"
> -#include "intel_buffer_objects.h"
> -#include "intel_reg.h"
> -#include "intel_bufmgr.h"
> -#include "intel_buffers.h"
> -#include "intel_fbo.h"
> -#include "brw_context.h"
> -
> -#include <xf86drm.h>
> -#include <i915_drm.h>
> -
> -static void
> -intel_batchbuffer_reset(struct brw_context *brw);
> -
> -void
> -intel_batchbuffer_init(struct brw_context *brw)
> -{
> -   intel_batchbuffer_reset(brw);
> -
> -   if (!brw->has_llc) {
> -      brw->batch.cpu_map = malloc(BATCH_SZ);
> -      brw->batch.map = brw->batch.cpu_map;
> -   }
> -}
> -
> -static void
> -intel_batchbuffer_reset(struct brw_context *brw)
> -{
> -   if (brw->batch.last_bo != NULL) {
> -      drm_intel_bo_unreference(brw->batch.last_bo);
> -      brw->batch.last_bo = NULL;
> -   }
> -   brw->batch.last_bo = brw->batch.bo;
> -
> -   brw_render_cache_set_clear(brw);
> -
> -   brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
> -					BATCH_SZ, 4096);
> -   if (brw->has_llc) {
> -      drm_intel_bo_map(brw->batch.bo, true);
> -      brw->batch.map = brw->batch.bo->virtual;
> -   }
> -
> -   brw->batch.reserved_space = BATCH_RESERVED;
> -   brw->batch.state_batch_offset = brw->batch.bo->size;
> -   brw->batch.used = 0;
> -   brw->batch.needs_sol_reset = false;
> -
> -   /* We don't know what ring the new batch will be sent to until we see the
> -    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
> -    */
> -   brw->batch.ring = UNKNOWN_RING;
> -}
> -
> -void
> -intel_batchbuffer_save_state(struct brw_context *brw)
> -{
> -   brw->batch.saved.used = brw->batch.used;
> -   brw->batch.saved.reloc_count =
> -      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
> -}
> -
> -void
> -intel_batchbuffer_reset_to_saved(struct brw_context *brw)
> -{
> -   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);
> -
> -   brw->batch.used = brw->batch.saved.used;
> -   if (brw->batch.used == 0)
> -      brw->batch.ring = UNKNOWN_RING;
> -}
> -
> -void
> -intel_batchbuffer_free(struct brw_context *brw)
> -{
> -   free(brw->batch.cpu_map);
> -   drm_intel_bo_unreference(brw->batch.last_bo);
> -   drm_intel_bo_unreference(brw->batch.bo);
> -}
> -
> -static void
> -do_batch_dump(struct brw_context *brw)
> -{
> -   struct drm_intel_decode *decode;
> -   struct intel_batchbuffer *batch = &brw->batch;
> -   int ret;
> -
> -   decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
> -   if (!decode)
> -      return;
> -
> -   ret = drm_intel_bo_map(batch->bo, false);
> -   if (ret == 0) {
> -      drm_intel_decode_set_batch_pointer(decode,
> -					 batch->bo->virtual,
> -					 batch->bo->offset64,
> -					 batch->used);
> -   } else {
> -      fprintf(stderr,
> -	      "WARNING: failed to map batchbuffer (%s), "
> -	      "dumping uploaded data instead.\n", strerror(ret));
> -
> -      drm_intel_decode_set_batch_pointer(decode,
> -					 batch->map,
> -					 batch->bo->offset64,
> -					 batch->used);
> -   }
> -
> -   drm_intel_decode_set_output_file(decode, stderr);
> -   drm_intel_decode(decode);
> -
> -   drm_intel_decode_context_free(decode);
> -
> -   if (ret == 0) {
> -      drm_intel_bo_unmap(batch->bo);
> -
> -      brw_debug_batch(brw);
> -   }
> -}
> -
> -void
> -intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
> -{
> -   /* We may need to enable and snapshot OA counters. */
> -   brw_perf_monitor_new_batch(brw);
> -}
> -
> -/**
> - * Called when starting a new batch buffer.
> - */
> -static void
> -brw_new_batch(struct brw_context *brw)
> -{
> -   /* Create a new batchbuffer and reset the associated state: */
> -   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
> -   intel_batchbuffer_reset(brw);
> -
> -   /* If the kernel supports hardware contexts, then most hardware state is
> -    * preserved between batches; we only need to re-emit state that is required
> -    * to be in every batch.  Otherwise we need to re-emit all the state that
> -    * would otherwise be stored in the context (which for all intents and
> -    * purposes means everything).
> -    */
> -   if (brw->hw_ctx == NULL)
> -      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
> -
> -   brw->ctx.NewDriverState |= BRW_NEW_BATCH;
> -
> -   brw->state_batch_count = 0;
> -
> -   brw->ib.type = -1;
> -
> -   /* We need to periodically reap the shader time results, because rollover
> -    * happens every few seconds.  We also want to see results every once in a
> -    * while, because many programs won't cleanly destroy our context, so the
> -    * end-of-run printout may not happen.
> -    */
> -   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
> -      brw_collect_and_report_shader_time(brw);
> -
> -   if (INTEL_DEBUG & DEBUG_PERFMON)
> -      brw_dump_perf_monitors(brw);
> -}
> -
> -/**
> - * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
> - * sending it off.
> - *
> - * This function can emit state (say, to preserve registers that aren't saved
> - * between batches).  All of this state MUST fit in the reserved space at the
> - * end of the batchbuffer.  If you add more GPU state, increase the reserved
> - * space by updating the BATCH_RESERVED macro.
> - */
> -static void
> -brw_finish_batch(struct brw_context *brw)
> -{
> -   /* Capture the closing pipeline statistics register values necessary to
> -    * support query objects (in the non-hardware context world).
> -    */
> -   brw_emit_query_end(brw);
> -
> -   /* We may also need to snapshot and disable OA counters. */
> -   if (brw->batch.ring == RENDER_RING)
> -      brw_perf_monitor_finish_batch(brw);
> -
> -   /* Mark that the current program cache BO has been used by the GPU.
> -    * It will be reallocated if we need to put new programs in for the
> -    * next batch.
> -    */
> -   brw->cache.bo_used_by_gpu = true;
> -}
> -
> -static void
> -throttle(struct brw_context *brw)
> -{
> -   /* Wait for the swapbuffers before the one we just emitted, so we
> -    * don't get too many swaps outstanding for apps that are GPU-heavy
> -    * but not CPU-heavy.
> -    *
> -    * We're using intelDRI2Flush (called from the loader before
> -    * swapbuffer) and glFlush (for front buffer rendering) as the
> -    * indicator that a frame is done and then throttle when we get
> -    * here as we prepare to render the next frame.  At this point the
> -    * round trips for swap/copy and getting new buffers are done and
> -    * we'll spend less time waiting on the GPU.
> -    *
> -    * Unfortunately, we don't have a handle to the batch containing
> -    * the swap, and getting our hands on that doesn't seem worth it,
> -    * so we just use the first batch we emitted after the last swap.
> -    */
> -   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
> -      if (brw->throttle_batch[1]) {
> -         if (!brw->disable_throttling)
> -            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
> -         drm_intel_bo_unreference(brw->throttle_batch[1]);
> -      }
> -      brw->throttle_batch[1] = brw->throttle_batch[0];
> -      brw->throttle_batch[0] = NULL;
> -      brw->need_swap_throttle = false;
> -      /* Throttling here is more precise than the throttle ioctl, so skip it */
> -      brw->need_flush_throttle = false;
> -   }
> -
> -   if (brw->need_flush_throttle) {
> -      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
> -      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
> -      brw->need_flush_throttle = false;
> -   }
> -}
> -
> -/* TODO: Push this whole function into bufmgr.
> - */
> -static int
> -do_flush_locked(struct brw_context *brw)
> -{
> -   struct intel_batchbuffer *batch = &brw->batch;
> -   int ret = 0;
> -
> -   if (brw->has_llc) {
> -      drm_intel_bo_unmap(batch->bo);
> -   } else {
> -      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
> -      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
> -	 ret = drm_intel_bo_subdata(batch->bo,
> -				    batch->state_batch_offset,
> -				    batch->bo->size - batch->state_batch_offset,
> -				    (char *)batch->map + batch->state_batch_offset);
> -      }
> -   }
> -
> -   if (!brw->intelScreen->no_hw) {
> -      int flags;
> -
> -      if (brw->gen >= 6 && batch->ring == BLT_RING) {
> -         flags = I915_EXEC_BLT;
> -      } else {
> -         flags = I915_EXEC_RENDER;
> -      }
> -      if (batch->needs_sol_reset)
> -	 flags |= I915_EXEC_GEN7_SOL_RESET;
> -
> -      if (ret == 0) {
> -         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
> -            brw_annotate_aub(brw);
> -
> -	 if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
> -	    ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
> -					flags);
> -	 } else {
> -	    ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
> -						4 * batch->used, flags);
> -	 }
> -      }
> -
> -      throttle(brw);
> -   }
> -
> -   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
> -      do_batch_dump(brw);
> -
> -   if (ret != 0) {
> -      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
> -      exit(1);
> -   }
> -
> -   return ret;
> -}
> -
> -int
> -_intel_batchbuffer_flush(struct brw_context *brw,
> -			 const char *file, int line)
> -{
> -   int ret;
> -
> -   if (brw->batch.used == 0)
> -      return 0;
> -
> -   if (brw->throttle_batch[0] == NULL) {
> -      brw->throttle_batch[0] = brw->batch.bo;
> -      drm_intel_bo_reference(brw->throttle_batch[0]);
> -   }
> -
> -   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
> -      int bytes_for_commands = 4 * brw->batch.used;
> -      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
> -      int total_bytes = bytes_for_commands + bytes_for_state;
> -      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
> -              "%4db (state) = %4db (%0.1f%%)\n", file, line,
> -              bytes_for_commands, bytes_for_state,
> -              total_bytes,
> -              100.0f * total_bytes / BATCH_SZ);
> -   }
> -
> -   brw->batch.reserved_space = 0;
> -
> -   brw_finish_batch(brw);
> -
> -   /* Mark the end of the buffer. */
> -   intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
> -   if (brw->batch.used & 1) {
> -      /* Round batchbuffer usage to 2 DWORDs. */
> -      intel_batchbuffer_emit_dword(brw, MI_NOOP);
> -   }
> -
> -   intel_upload_finish(brw);
> -
> -   /* Check that we didn't just wrap our batchbuffer at a bad time. */
> -   assert(!brw->no_batch_wrap);
> -
> -   ret = do_flush_locked(brw);
> -
> -   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
> -      fprintf(stderr, "waiting for idle\n");
> -      drm_intel_bo_wait_rendering(brw->batch.bo);
> -   }
> -
> -   /* Start a new batch buffer. */
> -   brw_new_batch(brw);
> -
> -   return ret;
> -}
> -
> -
> -/*  This is the only way buffers get added to the validate list.
> - */
> -bool
> -intel_batchbuffer_emit_reloc(struct brw_context *brw,
> -                             drm_intel_bo *buffer,
> -                             uint32_t read_domains, uint32_t write_domain,
> -			     uint32_t delta)
> -{
> -   int ret;
> -
> -   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
> -				 buffer, delta,
> -				 read_domains, write_domain);
> -   assert(ret == 0);
> -   (void)ret;
> -
> -   /* Using the old buffer offset, write in what the right data would be, in
> -    * case the buffer doesn't move and we can short-circuit the relocation
> -    * processing in the kernel
> -    */
> -   intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);
> -
> -   return true;
> -}
> -
> -bool
> -intel_batchbuffer_emit_reloc64(struct brw_context *brw,
> -                               drm_intel_bo *buffer,
> -                               uint32_t read_domains, uint32_t write_domain,
> -			       uint32_t delta)
> -{
> -   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
> -                                     buffer, delta,
> -                                     read_domains, write_domain);
> -   assert(ret == 0);
> -   (void) ret;
> -
> -   /* Using the old buffer offset, write in what the right data would be, in
> -    * case the buffer doesn't move and we can short-circuit the relocation
> -    * processing in the kernel
> -    */
> -   uint64_t offset = buffer->offset64 + delta;
> -   intel_batchbuffer_emit_dword(brw, offset);
> -   intel_batchbuffer_emit_dword(brw, offset >> 32);
> -
> -   return true;
> -}
> -
> -
> -void
> -intel_batchbuffer_data(struct brw_context *brw,
> -                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
> -{
> -   assert((bytes & 3) == 0);
> -   intel_batchbuffer_require_space(brw, bytes, ring);
> -   memcpy(brw->batch.map + brw->batch.used, data, bytes);
> -   brw->batch.used += bytes >> 2;
> -}
> -
> -static void
> -load_sized_register_mem(struct brw_context *brw,
> -                        uint32_t reg,
> -                        drm_intel_bo *bo,
> -                        uint32_t read_domains, uint32_t write_domain,
> -                        uint32_t offset,
> -                        int size)
> -{
> -   int i;
> -
> -   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
> -   assert(brw->gen >= 7);
> -
> -   if (brw->gen >= 8) {
> -      BEGIN_BATCH(4 * size);
> -      for (i = 0; i < size; i++) {
> -         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
> -         OUT_BATCH(reg + i * 4);
> -         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
> -      }
> -      ADVANCE_BATCH();
> -   } else {
> -      BEGIN_BATCH(3 * size);
> -      for (i = 0; i < size; i++) {
> -         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
> -         OUT_BATCH(reg + i * 4);
> -         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
> -      }
> -      ADVANCE_BATCH();
> -   }
> -}
> -
> -void
> -brw_load_register_mem(struct brw_context *brw,
> -                      uint32_t reg,
> -                      drm_intel_bo *bo,
> -                      uint32_t read_domains, uint32_t write_domain,
> -                      uint32_t offset)
> -{
> -   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
> -}
> -
> -void
> -brw_load_register_mem64(struct brw_context *brw,
> -                        uint32_t reg,
> -                        drm_intel_bo *bo,
> -                        uint32_t read_domains, uint32_t write_domain,
> -                        uint32_t offset)
> -{
> -   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
> -}
> diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> deleted file mode 100644
> index ef8a6ff..0000000
> --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
> +++ /dev/null
> @@ -1,179 +0,0 @@
> -#ifndef INTEL_BATCHBUFFER_H
> -#define INTEL_BATCHBUFFER_H
> -
> -#include "main/mtypes.h"
> -
> -#include "brw_context.h"
> -#include "intel_bufmgr.h"
> -#include "intel_reg.h"
> -
> -#ifdef __cplusplus
> -extern "C" {
> -#endif
> -
> -/**
> - * Number of bytes to reserve for commands necessary to complete a batch.
> - *
> - * This includes:
> - * - MI_BATCHBUFFER_END (4 bytes)
> - * - Optional MI_NOOP for ensuring the batch length is qword aligned (4 bytes)
> - * - Any state emitted by vtbl->finish_batch():
> - *   - Gen4-5 record ending occlusion query values (4 * 4 = 16 bytes)
> - *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
> - *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
> - *     - Two sets of PIPE_CONTROLs, which become 3 PIPE_CONTROLs each on SNB,
> - *       which are 4 DWords each ==> 2 * 3 * 4 * 4 = 96 bytes
> - *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
> - *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
> - *       Sandybridge PIPE_CONTROL madness.
> - */
> -#define BATCH_RESERVED 146
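
Nitpick on the way out: the itemized costs above sum to
4 + 4 + 16 + 12 + 96 + 12 = 144 bytes by my count, so 146 presumably
includes two spare bytes. Moot now that the header is deleted, but it
confused me for a minute.
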
> -
> -struct intel_batchbuffer;
> -
> -void intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw);
> -void intel_batchbuffer_init(struct brw_context *brw);
> -void intel_batchbuffer_free(struct brw_context *brw);
> -void intel_batchbuffer_save_state(struct brw_context *brw);
> -void intel_batchbuffer_reset_to_saved(struct brw_context *brw);
> -
> -int _intel_batchbuffer_flush(struct brw_context *brw,
> -			     const char *file, int line);
> -
> -#define intel_batchbuffer_flush(intel) \
> -	_intel_batchbuffer_flush(intel, __FILE__, __LINE__)
> -
> -
> -
> -/* Unlike bmBufferData, this currently requires the buffer be mapped.
> - * Consider it a convenience function wrapping multple
> - * intel_buffer_dword() calls.
> - */
> -void intel_batchbuffer_data(struct brw_context *brw,
> -                            const void *data, GLuint bytes,
> -                            enum brw_gpu_ring ring);
> -
> -bool intel_batchbuffer_emit_reloc(struct brw_context *brw,
> -                                       drm_intel_bo *buffer,
> -				       uint32_t read_domains,
> -				       uint32_t write_domain,
> -				       uint32_t offset);
> -bool intel_batchbuffer_emit_reloc64(struct brw_context *brw,
> -                                    drm_intel_bo *buffer,
> -                                    uint32_t read_domains,
> -                                    uint32_t write_domain,
> -                                    uint32_t offset);
> -static inline uint32_t float_as_int(float f)
> -{
> -   union {
> -      float f;
> -      uint32_t d;
> -   } fi;
> -
> -   fi.f = f;
> -   return fi.d;
> -}
> -
> -/* Inline functions - might actually be better off with these
> - * non-inlined.  Certainly better off switching all command packets to
> - * be passed as structs rather than dwords, but that's a little bit of
> - * work...
> - */
> -static inline unsigned
> -intel_batchbuffer_space(struct brw_context *brw)
> -{
> -   return (brw->batch.state_batch_offset - brw->batch.reserved_space)
> -      - brw->batch.used*4;
> -}
> -
> -
> -static inline void
> -intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
> -{
> -#ifdef DEBUG
> -   assert(intel_batchbuffer_space(brw) >= 4);
> -#endif
> -   brw->batch.map[brw->batch.used++] = dword;
> -   assert(brw->batch.ring != UNKNOWN_RING);
> -}
> -
> -static inline void
> -intel_batchbuffer_emit_float(struct brw_context *brw, float f)
> -{
> -   intel_batchbuffer_emit_dword(brw, float_as_int(f));
> -}
> -
> -static inline void
> -intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
> -                                enum brw_gpu_ring ring)
> -{
> -   /* If we're switching rings, implicitly flush the batch. */
> -   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
> -       brw->gen >= 6) {
> -      intel_batchbuffer_flush(brw);
> -   }
> -
> -#ifdef DEBUG
> -   assert(sz < BATCH_SZ - BATCH_RESERVED);
> -#endif
> -   if (intel_batchbuffer_space(brw) < sz)
> -      intel_batchbuffer_flush(brw);
> -
> -   enum brw_gpu_ring prev_ring = brw->batch.ring;
> -   /* The intel_batchbuffer_flush() calls above might have changed
> -    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
> -    */
> -   brw->batch.ring = ring;
> -
> -   if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
> -      intel_batchbuffer_emit_render_ring_prelude(brw);
> -}
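
For other reviewers: as far as I can tell this implicit
flush-on-ring-switch is absorbed by brw_batch_begin() in the new API,
which is why it takes the target ring as a parameter.
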
> -
> -static inline void
> -intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
> -{
> -   intel_batchbuffer_require_space(brw, n * 4, ring);
> -
> -   brw->batch.emit = brw->batch.used;
> -#ifdef DEBUG
> -   brw->batch.total = n;
> -#endif
> -}
> -
> -static inline void
> -intel_batchbuffer_advance(struct brw_context *brw)
> -{
> -#ifdef DEBUG
> -   struct intel_batchbuffer *batch = &brw->batch;
> -   unsigned int _n = batch->used - batch->emit;
> -   assert(batch->total != 0);
> -   if (_n != batch->total) {
> -      fprintf(stderr, "ADVANCE_BATCH: %d of %d dwords emitted\n",
> -	      _n, batch->total);
> -      abort();
> -   }
> -   batch->total = 0;
> -#endif
> -}
> -
> -#define BEGIN_BATCH(n) intel_batchbuffer_begin(brw, n, RENDER_RING)
> -#define BEGIN_BATCH_BLT(n) intel_batchbuffer_begin(brw, n, BLT_RING)
> -#define OUT_BATCH(d) intel_batchbuffer_emit_dword(brw, d)
> -#define OUT_BATCH_F(f) intel_batchbuffer_emit_float(brw, f)
> -#define OUT_RELOC(buf, read_domains, write_domain, delta) do {		\
> -   intel_batchbuffer_emit_reloc(brw, buf,			\
> -				read_domains, write_domain, delta);	\
> -} while (0)
> -
> -/* Handle 48-bit address relocations for Gen8+ */
> -#define OUT_RELOC64(buf, read_domains, write_domain, delta) do { \
> -   intel_batchbuffer_emit_reloc64(brw, buf, read_domains, write_domain, delta);	\
> -} while (0)
> -
> -#define ADVANCE_BATCH() intel_batchbuffer_advance(brw);
> -
> -#ifdef __cplusplus
> -}
> -#endif
> -
> -#endif
> diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
> index bc39053..1acbace 100644
> --- a/src/mesa/drivers/dri/i965/intel_blit.c
> +++ b/src/mesa/drivers/dri/i965/intel_blit.c
> @@ -39,7 +39,6 @@
>   #include "intel_buffers.h"
>   #include "intel_fbo.h"
>   #include "intel_reg.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_mipmap_tree.h"
>   
>   #define FILE_DEBUG_FLAG DEBUG_BLIT
> @@ -397,11 +396,11 @@ alignment_valid(struct brw_context *brw, unsigned offset, uint32_t tiling)
>   
>   static bool
>   can_fast_copy_blit(struct brw_context *brw,
> -		   drm_intel_bo *src_buffer,
> +		   struct brw_bo *src_buffer,
>                      int16_t src_x, int16_t src_y,
>                      uintptr_t src_offset, uint32_t src_pitch,
>                      uint32_t src_tiling, uint32_t src_tr_mode,
> -		   drm_intel_bo *dst_buffer,
> +		   struct brw_bo *dst_buffer,
>                      int16_t dst_x, int16_t dst_y,
>                      uintptr_t dst_offset, uint32_t dst_pitch,
>                      uint32_t dst_tiling, uint32_t dst_tr_mode,
> @@ -507,12 +506,12 @@ bool
>   intelEmitCopyBlit(struct brw_context *brw,
>   		  GLuint cpp,
>   		  GLshort src_pitch,
> -		  drm_intel_bo *src_buffer,
> +		  struct brw_bo *src_buffer,
>   		  GLuint src_offset,
>   		  uint32_t src_tiling,
>   		  uint32_t src_tr_mode,
>   		  GLshort dst_pitch,
> -		  drm_intel_bo *dst_buffer,
> +		  struct brw_bo *dst_buffer,
>   		  GLuint dst_offset,
>   		  uint32_t dst_tiling,
>   		  uint32_t dst_tr_mode,
> @@ -521,10 +520,9 @@ intelEmitCopyBlit(struct brw_context *brw,
>   		  GLshort w, GLshort h,
>   		  GLenum logic_op)
>   {
> -   GLuint CMD, BR13, pass = 0;
> +   GLuint CMD, BR13;
>      int dst_y2 = dst_y + h;
>      int dst_x2 = dst_x + w;
> -   drm_intel_bo *aper_array[3];
>      bool dst_y_tiled = dst_tiling == I915_TILING_Y;
>      bool src_y_tiled = src_tiling == I915_TILING_Y;
>      bool use_fast_copy_blit = false;
> @@ -532,25 +530,8 @@ intelEmitCopyBlit(struct brw_context *brw,
>      if ((dst_y_tiled || src_y_tiled) && brw->gen < 6)
>         return false;
>   
> -   /* do space check before going any further */
> -   do {
> -       aper_array[0] = brw->batch.bo;
> -       aper_array[1] = dst_buffer;
> -       aper_array[2] = src_buffer;
> -
> -       if (dri_bufmgr_check_aperture_space(aper_array, 3) != 0) {
> -           intel_batchbuffer_flush(brw);
> -           pass++;
> -       } else
> -           break;
> -   } while (pass < 2);
> -
> -   if (pass >= 2)
> -      return false;
> -
>      unsigned length = brw->gen >= 8 ? 10 : 8;
>   
> -   intel_batchbuffer_require_space(brw, length * 4, BLT_RING);
>      DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
>          __func__,
>          src_buffer, src_pitch, src_offset, src_x, src_y,
> @@ -661,6 +642,9 @@ intelEmitCopyBlit(struct brw_context *brw,
>      assert(dst_offset + (dst_y + h - 1) * abs(dst_pitch) +
>             (w * cpp) <= dst_buffer->size);
>   
> +   if (brw_batch_begin(&brw->batch, 20, BLT_RING) < 0)
> +      return false;
> +
>      BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, src_y_tiled);
>      OUT_BATCH(CMD | (length - 2));
>      OUT_BATCH(BR13 | (uint16_t)dst_pitch);
> @@ -688,10 +672,7 @@ intelEmitCopyBlit(struct brw_context *brw,
>      }
>   
>      ADVANCE_BATCH_TILED(dst_y_tiled, src_y_tiled);
> -
> -   brw_emit_mi_flush(brw);
> -
> -   return true;
> +   return brw_batch_end(&brw->batch) == 0;
>   }
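
The begin/end conversion reads nicely. For anyone else reviewing, a
minimal sketch of the new idiom as I understand it (signatures taken
from this patch; the "nothing is emitted on failure" rollback semantics
are my assumption):

    if (brw_batch_begin(&brw->batch, 20 /* dword estimate */, BLT_RING) < 0)
       return false;    /* could not reserve space; no partial state */

    BEGIN_BATCH_BLT(2);
    OUT_BATCH(MI_NOOP); /* stand-in for the real commands */
    OUT_BATCH(MI_NOOP);
    ADVANCE_BATCH();

    return brw_batch_end(&brw->batch) == 0;    /* 0 == queued */

I assume the 20-dword estimate (vs length <= 10) deliberately
over-reserves to cover the extra dwords BEGIN_BATCH_BLT_TILED emits for
the BCS_SWCTRL tiling setup?
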
>   
>   bool
> @@ -700,7 +681,7 @@ intelEmitImmediateColorExpandBlit(struct brw_context *brw,
>   				  GLubyte *src_bits, GLuint src_size,
>   				  GLuint fg_color,
>   				  GLshort dst_pitch,
> -				  drm_intel_bo *dst_buffer,
> +				  struct brw_bo *dst_buffer,
>   				  GLuint dst_offset,
>   				  uint32_t dst_tiling,
>   				  GLshort x, GLshort y,
> @@ -728,9 +709,6 @@ intelEmitImmediateColorExpandBlit(struct brw_context *brw,
>          dst_buffer, dst_pitch, dst_offset, x, y, w, h, src_size, dwords);
>   
>      unsigned xy_setup_blt_length = brw->gen >= 8 ? 10 : 8;
> -   intel_batchbuffer_require_space(brw, (xy_setup_blt_length * 4) +
> -                                        (3 * 4) + dwords * 4, BLT_RING);
> -
>      opcode = XY_SETUP_BLT_CMD;
>      if (cpp == 4)
>         opcode |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
> @@ -746,6 +724,9 @@ intelEmitImmediateColorExpandBlit(struct brw_context *brw,
>      if (dst_tiling != I915_TILING_NONE)
>         blit_cmd |= XY_DST_TILED;
>   
> +   if (brw_batch_begin(&brw->batch, 20 + dwords, BLT_RING) < 0)
> +      return false;
> +
>      BEGIN_BATCH_BLT(xy_setup_blt_length + 3);
>      OUT_BATCH(opcode | (xy_setup_blt_length - 2));
>      OUT_BATCH(br13);
> @@ -771,11 +752,9 @@ intelEmitImmediateColorExpandBlit(struct brw_context *brw,
>      OUT_BATCH(SET_FIELD(y + h, BLT_Y) | SET_FIELD(x + w, BLT_X));
>      ADVANCE_BATCH();
>   
> -   intel_batchbuffer_data(brw, src_bits, dwords * 4, BLT_RING);
> +   brw_batch_data(&brw->batch, src_bits, dwords * 4);
>   
> -   brw_emit_mi_flush(brw);
> -
> -   return true;
> +   return brw_batch_end(&brw->batch) == 0;
>   }
>   
>   /* We don't have a memmove-type blit like some other hardware, so we'll do a
> @@ -784,9 +763,9 @@ intelEmitImmediateColorExpandBlit(struct brw_context *brw,
>    */
>   void
>   intel_emit_linear_blit(struct brw_context *brw,
> -		       drm_intel_bo *dst_bo,
> +		       struct brw_bo *dst_bo,
>   		       unsigned int dst_offset,
> -		       drm_intel_bo *src_bo,
> +		       struct brw_bo *src_bo,
>   		       unsigned int src_offset,
>   		       unsigned int size)
>   {
> @@ -853,7 +832,6 @@ intel_miptree_set_alpha_to_one(struct brw_context *brw,
>   {
>      uint32_t BR13, CMD;
>      int pitch, cpp;
> -   drm_intel_bo *aper_array[2];
>   
>      pitch = mt->pitch;
>      cpp = mt->cpp;
> @@ -871,14 +849,8 @@ intel_miptree_set_alpha_to_one(struct brw_context *brw,
>      }
>      BR13 |= pitch;
>   
> -   /* do space check before going any further */
> -   aper_array[0] = brw->batch.bo;
> -   aper_array[1] = mt->bo;
> -
> -   if (drm_intel_bufmgr_check_aperture_space(aper_array,
> -					     ARRAY_SIZE(aper_array)) != 0) {
> -      intel_batchbuffer_flush(brw);
> -   }
> +   if (brw_batch_begin(&brw->batch, 20, BLT_RING) < 0)
> +      return;
>   
>      unsigned length = brw->gen >= 8 ? 7 : 6;
>      bool dst_y_tiled = mt->tiling == I915_TILING_Y;
> @@ -900,5 +872,5 @@ intel_miptree_set_alpha_to_one(struct brw_context *brw,
>      OUT_BATCH(0xffffffff); /* white, but only alpha gets written */
>      ADVANCE_BATCH_TILED(dst_y_tiled, false);
>   
> -   brw_emit_mi_flush(brw);
> +   brw_batch_end(&brw->batch);
>   }
> diff --git a/src/mesa/drivers/dri/i965/intel_blit.h b/src/mesa/drivers/dri/i965/intel_blit.h
> index c3d19a5..a0e4e2b 100644
> --- a/src/mesa/drivers/dri/i965/intel_blit.h
> +++ b/src/mesa/drivers/dri/i965/intel_blit.h
> @@ -34,12 +34,12 @@ bool
>   intelEmitCopyBlit(struct brw_context *brw,
>                     GLuint cpp,
>                     GLshort src_pitch,
> -                  drm_intel_bo *src_buffer,
> +                  struct brw_bo *src_buffer,
>                     GLuint src_offset,
>                     uint32_t src_tiling,
>                     uint32_t src_tr_mode,
>                     GLshort dst_pitch,
> -                  drm_intel_bo *dst_buffer,
> +                  struct brw_bo *dst_buffer,
>                     GLuint dst_offset,
>                     uint32_t dst_tiling,
>                     uint32_t dst_tr_mode,
> @@ -66,16 +66,16 @@ intelEmitImmediateColorExpandBlit(struct brw_context *brw,
>   				  GLubyte *src_bits, GLuint src_size,
>   				  GLuint fg_color,
>   				  GLshort dst_pitch,
> -				  drm_intel_bo *dst_buffer,
> +				  struct brw_bo *dst_buffer,
>   				  GLuint dst_offset,
>   				  uint32_t dst_tiling,
>   				  GLshort x, GLshort y,
>   				  GLshort w, GLshort h,
>   				  GLenum logic_op);
>   void intel_emit_linear_blit(struct brw_context *brw,
> -			    drm_intel_bo *dst_bo,
> +			    struct brw_bo *dst_bo,
>   			    unsigned int dst_offset,
> -			    drm_intel_bo *src_bo,
> +			    struct brw_bo *src_bo,
>   			    unsigned int src_offset,
>   			    unsigned int size);
>   
> diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
> index ff05b5c..fda5c9f 100644
> --- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
> +++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
> @@ -39,47 +39,6 @@
>   #include "brw_context.h"
>   #include "intel_blit.h"
>   #include "intel_buffer_objects.h"
> -#include "intel_batchbuffer.h"
> -
> -/**
> - * Map a buffer object; issue performance warnings if mapping causes stalls.
> - *
> - * This matches the drm_intel_bo_map API, but takes an additional human-readable
> - * name for the buffer object to use in the performance debug message.
> - */
> -int
> -brw_bo_map(struct brw_context *brw,
> -           drm_intel_bo *bo, int write_enable,
> -           const char *bo_name)
> -{
> -   if (likely(!brw->perf_debug) || !drm_intel_bo_busy(bo))
> -      return drm_intel_bo_map(bo, write_enable);
> -
> -   double start_time = get_time();
> -
> -   int ret = drm_intel_bo_map(bo, write_enable);
> -
> -   perf_debug("CPU mapping a busy %s BO stalled and took %.03f ms.\n",
> -              bo_name, (get_time() - start_time) * 1000);
> -
> -   return ret;
> -}
> -
> -int
> -brw_bo_map_gtt(struct brw_context *brw, drm_intel_bo *bo, const char *bo_name)
> -{
> -   if (likely(!brw->perf_debug) || !drm_intel_bo_busy(bo))
> -      return drm_intel_gem_bo_map_gtt(bo);
> -
> -   double start_time = get_time();
> -
> -   int ret = drm_intel_gem_bo_map_gtt(bo);
> -
> -   perf_debug("GTT mapping a busy %s BO stalled and took %.03f ms.\n",
> -              bo_name, (get_time() - start_time) * 1000);
> -
> -   return ret;
> -}
>   
>   static void
>   mark_buffer_gpu_usage(struct intel_buffer_object *intel_obj,
> @@ -92,17 +51,20 @@ mark_buffer_gpu_usage(struct intel_buffer_object *intel_obj,
>   static void
>   mark_buffer_inactive(struct intel_buffer_object *intel_obj)
>   {
> +   if (brw_bo_busy(intel_obj->buffer, BUSY_WRITE))
> +      return;
> +
>      intel_obj->gpu_active_start = ~0;
>      intel_obj->gpu_active_end = 0;
>   }
>   
> -/** Allocates a new drm_intel_bo to store the data for the buffer object. */
> +/** Allocates a new brw_bo to store the data for the buffer object. */
>   static void
>   alloc_buffer_object(struct brw_context *brw,
>                       struct intel_buffer_object *intel_obj)
>   {
> -   intel_obj->buffer = drm_intel_bo_alloc(brw->bufmgr, "bufferobj",
> -					  intel_obj->Base.Size, 64);
> +   intel_obj->buffer =
> +      brw_bo_create(&brw->batch, "bufferobj", intel_obj->Base.Size, 64, 0);
>   
>      /* the buffer might be bound as a uniform buffer, need to update it
>       */
> @@ -119,7 +81,7 @@ alloc_buffer_object(struct brw_context *brw,
>   static void
>   release_buffer(struct intel_buffer_object *intel_obj)
>   {
> -   drm_intel_bo_unreference(intel_obj->buffer);
> +   brw_bo_put(intel_obj->buffer);
>      intel_obj->buffer = NULL;
>   }
>   
> @@ -166,7 +128,7 @@ brw_delete_buffer(struct gl_context * ctx, struct gl_buffer_object *obj)
>       */
>      _mesa_buffer_unmap_all_mappings(ctx, obj);
>   
> -   drm_intel_bo_unreference(intel_obj->buffer);
> +   brw_bo_put(intel_obj->buffer);
>      free(intel_obj);
>   }
>   
> @@ -213,7 +175,7 @@ brw_buffer_data(struct gl_context *ctx,
>            return false;
>   
>         if (data != NULL)
> -	 drm_intel_bo_subdata(intel_obj->buffer, 0, size, data);
> +	 brw_bo_write(intel_obj->buffer, 0, data, size, 0);
>      }
>   
>      return true;
> @@ -238,64 +200,49 @@ brw_buffer_subdata(struct gl_context *ctx,
>   {
>      struct brw_context *brw = brw_context(ctx);
>      struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
> -   bool busy;
>   
>      if (size == 0)
>         return;
>   
>      assert(intel_obj);
>   
> -   /* See if we can unsynchronized write the data into the user's BO. This
> -    * avoids GPU stalls in unfortunately common user patterns (uploading
> -    * sequentially into a BO, with draw calls in between each upload).
> -    *
> -    * Once we've hit this path, we mark this GL BO as preferring stalling to
> -    * blits, so that we can hopefully hit this path again in the future
> -    * (otherwise, an app that might occasionally stall but mostly not will end
> -    * up with blitting all the time, at the cost of bandwidth)
> -    */
> -   if (offset + size <= intel_obj->gpu_active_start ||
> -       intel_obj->gpu_active_end <= offset) {
> -      if (brw->has_llc) {
> -         drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
> -         memcpy(intel_obj->buffer->virtual + offset, data, size);
> -         drm_intel_bo_unmap(intel_obj->buffer);
> -
> -         if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
> -            intel_obj->prefer_stall_to_blit = true;
> -         return;
> -      } else {
> -         perf_debug("BufferSubData could be unsynchronized, but !LLC doesn't support it yet\n");
> -      }
> -   }
> -
> -   busy =
> -      drm_intel_bo_busy(intel_obj->buffer) ||
> -      drm_intel_bo_references(brw->batch.bo, intel_obj->buffer);
> -
> -   if (busy) {
> +   if (brw_bo_busy(intel_obj->buffer, BUSY_WRITE | BUSY_RETIRE)) {
>         if (size == intel_obj->Base.Size) {
>   	 /* Replace the current busy bo so the subdata doesn't stall. */
> -	 drm_intel_bo_unreference(intel_obj->buffer);
> +	 brw_bo_put(intel_obj->buffer);
>   	 alloc_buffer_object(brw, intel_obj);
> +      } else if (offset + size <= intel_obj->gpu_active_start ||
> +                 intel_obj->gpu_active_end <= offset) {
> +         /* See if we can do an unsynchronized write of the data into
> +          * the user's BO. This avoids GPU stalls in unfortunately common
> +          * user patterns (uploading sequentially into a BO, with draw
> +          * calls in between each upload).
> +          *
> +          * Once we've hit this path, we mark this GL BO as preferring
> +          * stalling to blits, so that we can hopefully hit this path
> +          * again in the future (otherwise, an app that only occasionally
> +          * stalls would end up blitting all the time, at the cost of
> +          * bandwidth).
> +          */
> +         brw_bo_write(intel_obj->buffer, offset, data, size, MAP_ASYNC);
> +         if (intel_obj->gpu_active_end > intel_obj->gpu_active_start)
> +            intel_obj->prefer_stall_to_blit = intel_obj->buffer->cache_coherent;
> +	 return;
>         } else if (!intel_obj->prefer_stall_to_blit) {
> +	 uint32_t upload;
>            perf_debug("Using a blit copy to avoid stalling on "
>                       "glBufferSubData(%ld, %ld) (%ldkb) to a busy "
>                       "(%d-%d) buffer object.\n",
>                       (long)offset, (long)offset + size, (long)(size/1024),
>                       intel_obj->gpu_active_start,
>                       intel_obj->gpu_active_end);
> -	 drm_intel_bo *temp_bo =
> -	    drm_intel_bo_alloc(brw->bufmgr, "subdata temp", size, 64);
> -
> -	 drm_intel_bo_subdata(temp_bo, 0, size, data);
> -
> +	 struct brw_bo *bo = NULL;
> +         intel_upload_data(brw, data, size, 64, &bo, &upload);
>   	 intel_emit_linear_blit(brw,
>   				intel_obj->buffer, offset,
> -				temp_bo, 0,
> +				bo, upload,
>   				size);
> -
> -	 drm_intel_bo_unreference(temp_bo);
> +	 brw_bo_put(bo);
>            return;
>         } else {
>            perf_debug("Stalling on glBufferSubData(%ld, %ld) (%ldkb) to a busy "
> @@ -304,11 +251,10 @@ brw_buffer_subdata(struct gl_context *ctx,
>                       (long)offset, (long)offset + size, (long)(size/1024),
>                       intel_obj->gpu_active_start,
>                       intel_obj->gpu_active_end);
> -         intel_batchbuffer_flush(brw);
>         }
>      }
>   
> -   drm_intel_bo_subdata(intel_obj->buffer, offset, size, data);
> +   brw_bo_write(intel_obj->buffer, offset, data, size, 0);
>      mark_buffer_inactive(intel_obj);
>   }
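
To summarize the new busy-buffer ladder as I read it: a whole-buffer
update swaps in a fresh bo; an update that misses the GPU-active range
is written unsynchronized; otherwise we either stage through
intel_upload_data() plus a blit, or take the stall in the final
brw_bo_write(). The range test, pulled out for clarity (just a sketch,
not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>

    /* The write may proceed unsynchronized iff it does not overlap the
     * [active_start, active_end) range still in flight on the GPU. */
    static bool
    write_misses_active_range(uint32_t offset, uint32_t size,
                              uint32_t active_start, uint32_t active_end)
    {
       return offset + size <= active_start || active_end <= offset;
    }

Also good to see prefer_stall_to_blit gated on cache_coherent now.
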
>   
> @@ -327,14 +273,9 @@ brw_get_buffer_subdata(struct gl_context *ctx,
>                          struct gl_buffer_object *obj)
>   {
>      struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
> -   struct brw_context *brw = brw_context(ctx);
>   
>      assert(intel_obj);
> -   if (drm_intel_bo_references(brw->batch.bo, intel_obj->buffer)) {
> -      intel_batchbuffer_flush(brw);
> -   }
> -   drm_intel_bo_get_subdata(intel_obj->buffer, offset, size, data);
> -
> +   brw_bo_read(intel_obj->buffer, offset, data, size, 0);
>      mark_buffer_inactive(intel_obj);
>   }
>   
> @@ -365,6 +306,7 @@ brw_map_buffer_range(struct gl_context *ctx,
>   {
>      struct brw_context *brw = brw_context(ctx);
>      struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
> +   unsigned map_flags;
>   
>      assert(intel_obj);
>   
> @@ -389,19 +331,11 @@ brw_map_buffer_range(struct gl_context *ctx,
>       * achieve the required synchronization.
>       */
>      if (!(access & GL_MAP_UNSYNCHRONIZED_BIT)) {
> -      if (drm_intel_bo_references(brw->batch.bo, intel_obj->buffer)) {
> -	 if (access & GL_MAP_INVALIDATE_BUFFER_BIT) {
> -	    drm_intel_bo_unreference(intel_obj->buffer);
> +      if ((access & GL_MAP_INVALIDATE_BUFFER_BIT)) {
> +	 if (brw_bo_busy(intel_obj->buffer, BUSY_WRITE | BUSY_RETIRE)) {
> +	    brw_bo_put(intel_obj->buffer);
>   	    alloc_buffer_object(brw, intel_obj);
> -	 } else {
> -            perf_debug("Stalling on the GPU for mapping a busy buffer "
> -                       "object\n");
> -	    intel_batchbuffer_flush(brw);
>   	 }
> -      } else if (drm_intel_bo_busy(intel_obj->buffer) &&
> -		 (access & GL_MAP_INVALIDATE_BUFFER_BIT)) {
> -	 drm_intel_bo_unreference(intel_obj->buffer);
> -	 alloc_buffer_object(brw, intel_obj);
>         }
>      }
>   
> @@ -416,46 +350,41 @@ brw_map_buffer_range(struct gl_context *ctx,
>       */
>      if (!(access & (GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_PERSISTENT_BIT)) &&
>          (access & GL_MAP_INVALIDATE_RANGE_BIT) &&
> -       drm_intel_bo_busy(intel_obj->buffer)) {
> +       brw_bo_busy(intel_obj->buffer, BUSY_WRITE | BUSY_RETIRE)) {
>         /* Ensure that the base alignment of the allocation meets the alignment
>          * guarantees the driver has advertised to the application.
>          */
>         const unsigned alignment = ctx->Const.MinMapBufferAlignment;
>   
>         intel_obj->map_extra[index] = (uintptr_t) offset % alignment;
> -      intel_obj->range_map_bo[index] = drm_intel_bo_alloc(brw->bufmgr,
> -                                                          "BO blit temp",
> -                                                          length +
> -                                                          intel_obj->map_extra[index],
> -                                                          alignment);
> -      if (brw->has_llc) {
> -         brw_bo_map(brw, intel_obj->range_map_bo[index],
> -                    (access & GL_MAP_WRITE_BIT) != 0, "range-map");
> -      } else {
> -         drm_intel_gem_bo_map_gtt(intel_obj->range_map_bo[index]);
> -      }
> +      intel_obj->range_map_bo[index] =
> +	      brw_bo_create(&brw->batch,
> +			    "BO blit temp",
> +			    length + intel_obj->map_extra[index],
> +			    alignment, 0);
> +
>         obj->Mappings[index].Pointer =
> -         intel_obj->range_map_bo[index]->virtual + intel_obj->map_extra[index];
> +	 brw_bo_map(intel_obj->range_map_bo[index], MAP_WRITE) +
> +	 intel_obj->map_extra[index];
> +
>         return obj->Mappings[index].Pointer;
>      }
>   
> -   if (access & GL_MAP_UNSYNCHRONIZED_BIT) {
> -      if (!brw->has_llc && brw->perf_debug &&
> -          drm_intel_bo_busy(intel_obj->buffer)) {
> -         perf_debug("MapBufferRange with GL_MAP_UNSYNCHRONIZED_BIT stalling (it's actually synchronized on non-LLC platforms)\n");
> -      }
> -      drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer);
> -   } else if (!brw->has_llc && (!(access & GL_MAP_READ_BIT) ||
> -                              (access & GL_MAP_PERSISTENT_BIT))) {
> -      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
> -      mark_buffer_inactive(intel_obj);
> -   } else {
> -      brw_bo_map(brw, intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0,
> -                 "MapBufferRange");
> -      mark_buffer_inactive(intel_obj);
> -   }
> +   map_flags = 0;
> +   if (access & GL_MAP_UNSYNCHRONIZED_BIT)
> +      map_flags |= MAP_ASYNC;
> +   if (access & GL_MAP_WRITE_BIT)
> +      map_flags |= MAP_WRITE;
> +   if (access & GL_MAP_READ_BIT)
> +      map_flags |= MAP_READ;
> +   if (access & GL_MAP_PERSISTENT_BIT)
> +      map_flags |= MAP_COHERENT;
> +
> +   obj->Mappings[index].Pointer =
> +      brw_bo_map(intel_obj->buffer, map_flags) + offset;
> +
> +   mark_buffer_inactive(intel_obj);
>   
> -   obj->Mappings[index].Pointer = intel_obj->buffer->virtual + offset;
>      return obj->Mappings[index].Pointer;
>   }
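
One question on the flag translation: GL_MAP_PERSISTENT_BIT is mapped
to MAP_COHERENT, but GL has a separate GL_MAP_COHERENT_BIT, and a
persistent-but-incoherent mapping only needs explicit flushes rather
than coherency. Is collapsing the two intentional, or should
GL_MAP_COHERENT_BIT be checked as well?
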
>   
> @@ -543,8 +472,6 @@ brw_unmap_buffer(struct gl_context *ctx,
>      assert(intel_obj);
>      assert(obj->Mappings[index].Pointer);
>      if (intel_obj->range_map_bo[index] != NULL) {
> -      drm_intel_bo_unmap(intel_obj->range_map_bo[index]);
> -
>         if (!(obj->Mappings[index].AccessFlags & GL_MAP_FLUSH_EXPLICIT_BIT)) {
>            intel_emit_linear_blit(brw,
>                                   intel_obj->buffer, obj->Mappings[index].Offset,
> @@ -555,18 +482,10 @@ brw_unmap_buffer(struct gl_context *ctx,
>                                  obj->Mappings[index].Length);
>         }
>   
> -      /* Since we've emitted some blits to buffers that will (likely) be used
> -       * in rendering operations in other cache domains in this batch, emit a
> -       * flush.  Once again, we wish for a domain tracker in libdrm to cover
> -       * usage inside of a batchbuffer.
> -       */
> -      brw_emit_mi_flush(brw);
> -
> -      drm_intel_bo_unreference(intel_obj->range_map_bo[index]);
> +      brw_bo_put(intel_obj->range_map_bo[index]);
>         intel_obj->range_map_bo[index] = NULL;
> -   } else if (intel_obj->buffer != NULL) {
> -      drm_intel_bo_unmap(intel_obj->buffer);
>      }
> +
>      obj->Mappings[index].Pointer = NULL;
>      obj->Mappings[index].Offset = 0;
>      obj->Mappings[index].Length = 0;
> @@ -581,7 +500,7 @@ brw_unmap_buffer(struct gl_context *ctx,
>    * Anywhere that uses buffer objects in the pipeline should be using this to
>    * mark the range of the buffer that is being accessed by the pipeline.
>    */
> -drm_intel_bo *
> +struct brw_bo *
>   intel_bufferobj_buffer(struct brw_context *brw,
>                          struct intel_buffer_object *intel_obj,
>                          uint32_t offset, uint32_t size)
> @@ -615,7 +534,7 @@ brw_copy_buffer_subdata(struct gl_context *ctx,
>      struct brw_context *brw = brw_context(ctx);
>      struct intel_buffer_object *intel_src = intel_buffer_object(src);
>      struct intel_buffer_object *intel_dst = intel_buffer_object(dst);
> -   drm_intel_bo *src_bo, *dst_bo;
> +   struct brw_bo *src_bo, *dst_bo;
>   
>      if (size == 0)
>         return;
> @@ -626,13 +545,6 @@ brw_copy_buffer_subdata(struct gl_context *ctx,
>      intel_emit_linear_blit(brw,
>   			  dst_bo, write_offset,
>   			  src_bo, read_offset, size);
> -
> -   /* Since we've emitted some blits to buffers that will (likely) be used
> -    * in rendering operations in other cache domains in this batch, emit a
> -    * flush.  Once again, we wish for a domain tracker in libdrm to cover
> -    * usage inside of a batchbuffer.
> -    */
> -   brw_emit_mi_flush(brw);
>   }
>   
>   void
> diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.h b/src/mesa/drivers/dri/i965/intel_buffer_objects.h
> index 5eaf9dc..179c0576 100644
> --- a/src/mesa/drivers/dri/i965/intel_buffer_objects.h
> +++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.h
> @@ -40,9 +40,9 @@ struct gl_buffer_object;
>   struct intel_buffer_object
>   {
>      struct gl_buffer_object Base;
> -   drm_intel_bo *buffer;     /* the low-level buffer manager's buffer handle */
> +   struct brw_bo *buffer; /* the low-level buffer manager's buffer handle */
>   
> -   drm_intel_bo *range_map_bo[MAP_COUNT];
> +   struct brw_bo *range_map_bo[MAP_COUNT];
>   
>      /**
>       * Alignment offset from the range_map_bo temporary mapping to the returned
> @@ -84,26 +84,24 @@ struct intel_buffer_object
>   
>   /* Get the bm buffer associated with a GL bufferobject:
>    */
> -drm_intel_bo *intel_bufferobj_buffer(struct brw_context *brw,
> -                                     struct intel_buffer_object *obj,
> -                                     uint32_t offset,
> -                                     uint32_t size);
> +struct brw_bo *intel_bufferobj_buffer(struct brw_context *brw,
> +				      struct intel_buffer_object *obj,
> +				      uint32_t offset,
> +				      uint32_t size);
>   
>   void intel_upload_data(struct brw_context *brw,
>                          const void *data,
>                          uint32_t size,
>                          uint32_t alignment,
> -                       drm_intel_bo **out_bo,
> +                       struct brw_bo **out_bo,
>                          uint32_t *out_offset);
>   
>   void *intel_upload_space(struct brw_context *brw,
>                            uint32_t size,
>                            uint32_t alignment,
> -                         drm_intel_bo **out_bo,
> +                         struct brw_bo **out_bo,
>                            uint32_t *out_offset);
>   
> -void intel_upload_finish(struct brw_context *brw);
> -
>   /* Hook the bufferobject implementation into mesa:
>    */
>   void intelInitBufferObjectFuncs(struct dd_function_table *functions);
> diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
> index b68c212..be786ec 100644
> --- a/src/mesa/drivers/dri/i965/intel_debug.c
> +++ b/src/mesa/drivers/dri/i965/intel_debug.c
> @@ -93,17 +93,11 @@ brw_process_intel_debug_variable(struct intel_screen *screen)
>      uint64_t intel_debug = driParseDebugString(getenv("INTEL_DEBUG"), debug_control);
>      (void) p_atomic_cmpxchg(&INTEL_DEBUG, 0, intel_debug);
>   
> -   if (INTEL_DEBUG & DEBUG_BUFMGR)
> -      dri_bufmgr_set_debug(screen->bufmgr, true);
> -
>      if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && screen->devinfo->gen < 7) {
>         fprintf(stderr,
>                 "shader_time debugging requires gen7 (Ivybridge) or better.\n");
>         INTEL_DEBUG &= ~DEBUG_SHADER_TIME;
>      }
> -
> -   if (INTEL_DEBUG & DEBUG_AUB)
> -      drm_intel_bufmgr_gem_set_aub_dump(screen->bufmgr, true);
>   }
>   
>   /**
> diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
> index 6b3bd12..c9a2007 100644
> --- a/src/mesa/drivers/dri/i965/intel_extensions.c
> +++ b/src/mesa/drivers/dri/i965/intel_extensions.c
> @@ -28,7 +28,6 @@
>   #include "main/version.h"
>   
>   #include "brw_context.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_reg.h"
>   #include "utils.h"
>   
> @@ -50,12 +49,15 @@ can_do_pipelined_register_writes(struct brw_context *brw)
>      if (result != -1)
>         return result;
>   
> +   result = false;
> +
>      /* We use SO_WRITE_OFFSET0 since you're supposed to write it (unlike the
>       * statistics registers), and we already reset it to zero before using it.
>       */
>      const int reg = GEN7_SO_WRITE_OFFSET(0);
>      const int expected_value = 0x1337d0d0;
>      const int offset = 100;
> +   int ret;
>   
>      /* The register we picked only exists on Gen7+. */
>      assert(brw->gen == 7);
> @@ -64,10 +66,12 @@ can_do_pipelined_register_writes(struct brw_context *brw)
>      /* Set a value in a BO to a known quantity.  The workaround BO already
>       * exists and doesn't contain anything important, so we may as well use it.
>       */
> -   drm_intel_bo_map(brw->workaround_bo, true);
> -   data = brw->workaround_bo->virtual;
> +   data = brw_bo_map(brw->workaround_bo, MAP_WRITE);
>      data[offset] = 0xffffffff;
> -   drm_intel_bo_unmap(brw->workaround_bo);
> +
> +   ret = brw_batch_begin(&brw->batch, 60, RENDER_RING);
> +   if (ret < 0)
> +      return result;
>   
>      /* Write the register. */
>      BEGIN_BATCH(3);
> @@ -87,17 +91,13 @@ can_do_pipelined_register_writes(struct brw_context *brw)
>                offset * sizeof(uint32_t));
>      ADVANCE_BATCH();
>   
> -   intel_batchbuffer_flush(brw);
> +   if (brw_batch_end(&brw->batch))
> +      return result;
>   
>      /* Check whether the value got written. */
> -   drm_intel_bo_map(brw->workaround_bo, false);
> -   data = brw->workaround_bo->virtual;
> -   bool success = data[offset] == expected_value;
> -   drm_intel_bo_unmap(brw->workaround_bo);
> -
> -   result = success;
> -
> -   return success;
> +   data = brw_bo_map(brw->workaround_bo, MAP_READ);
> +   result = data[offset] == expected_value;
> +   return result;
>   }
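
Minor behaviour change worth double-checking: result is the cached
static, so assigning false before brw_batch_begin() means a transient
failure here permanently disables pipelined register writes, whereas
the old code ignored the result of intel_batchbuffer_flush(). If that
is intended, fine; the same pattern appears in can_write_oacontrol()
below. I am also assuming brw_bo_map(..., MAP_READ) now implies the
flush-and-wait that the explicit flush + map used to provide.
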
>   
>   static bool
> @@ -120,10 +120,12 @@ can_write_oacontrol(struct brw_context *brw)
>      /* Set a value in a BO to a known quantity.  The workaround BO already
>       * exists and doesn't contain anything important, so we may as well use it.
>       */
> -   drm_intel_bo_map(brw->workaround_bo, true);
> -   data = brw->workaround_bo->virtual;
> +   data = brw_bo_map(brw->workaround_bo, MAP_WRITE);
>      data[offset] = 0xffffffff;
> -   drm_intel_bo_unmap(brw->workaround_bo);
> +
> +   result = false;
> +   if (brw_batch_begin(&brw->batch, 60, RENDER_RING) < 0)
> +      return result;
>   
>      /* Write OACONTROL. */
>      BEGIN_BATCH(3);
> @@ -152,17 +154,13 @@ can_write_oacontrol(struct brw_context *brw)
>      OUT_BATCH(0);
>      ADVANCE_BATCH();
>   
> -   intel_batchbuffer_flush(brw);
> +   if (brw_batch_end(&brw->batch))
> +      return result;
>   
>      /* Check whether the value got written. */
> -   drm_intel_bo_map(brw->workaround_bo, false);
> -   data = brw->workaround_bo->virtual;
> -   bool success = data[offset] == expected_value;
> -   drm_intel_bo_unmap(brw->workaround_bo);
> -
> -   result = success;
> -
> -   return success;
> +   data = brw_bo_map(brw->workaround_bo, MAP_READ);
> +   result = data[offset] == expected_value;
> +   return result;
>   }
>   
>   /**
> diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
> index 05e3f8b..f5395c1 100644
> --- a/src/mesa/drivers/dri/i965/intel_fbo.c
> +++ b/src/mesa/drivers/dri/i965/intel_fbo.c
> @@ -43,7 +43,6 @@
>   #include "swrast/swrast.h"
>   #include "drivers/common/meta.h"
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_buffers.h"
>   #include "intel_blit.h"
>   #include "intel_fbo.h"
> @@ -377,13 +376,15 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx,
>      irb = intel_renderbuffer(rb);
>      intel_miptree_release(&irb->mt);
>   
> +   struct brw_bo *bo = brw_bo_import(&brw->batch, image->bo, true);
> +
>      /* Disable creation of the miptree's aux buffers because the driver exposes
>       * no EGL API to manage them. That is, there is no API for resolving the aux
>       * buffer's content to the main buffer nor for invalidating the aux buffer's
>       * content.
>       */
>      irb->mt = intel_miptree_create_for_bo(brw,
> -                                         image->bo,
> +                                         bo,
>                                            image->format,
>                                            image->offset,
>                                            image->width,
> @@ -391,6 +392,7 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx,
>                                            1,
>                                            image->pitch,
>                                            MIPTREE_LAYOUT_DISABLE_AUX);
> +   brw_bo_put(bo);
>      if (!irb->mt)
>         return;
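
Is there a reason brw_bo_import() takes a bare boolean? A named flag
(or at least a comment) would help; I had to guess whether true means
the import borrows the screen bo or takes ownership of it.
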
>   
> @@ -1044,43 +1046,6 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
>      intel_miptree_release(&new_mt);
>   }
>   
> -void
> -brw_render_cache_set_clear(struct brw_context *brw)
> -{
> -   struct set_entry *entry;
> -
> -   set_foreach(brw->render_cache, entry) {
> -      _mesa_set_remove(brw->render_cache, entry);
> -   }
> -}
> -
> -void
> -brw_render_cache_set_add_bo(struct brw_context *brw, drm_intel_bo *bo)
> -{
> -   _mesa_set_add(brw->render_cache, bo);
> -}
> -
> -/**
> - * Emits an appropriate flush for a BO if it has been rendered to within the
> - * same batchbuffer as a read that's about to be emitted.
> - *
> - * The GPU has separate, incoherent caches for the render cache and the
> - * sampler cache, along with other caches.  Usually data in the different
> - * caches don't interact (e.g. we don't render to our driver-generated
> - * immediate constant data), but for render-to-texture in FBOs we definitely
> - * do.  When a batchbuffer is flushed, the kernel will ensure that everything
> - * necessary is flushed before another use of that BO, but for reuse from
> - * different caches within a batchbuffer, it's all our responsibility.
> - */
> -void
> -brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo)
> -{
> -   if (!_mesa_set_search(brw->render_cache, bo))
> -      return;
> -
> -   brw_emit_mi_flush(brw);
> -}
> -
>   /**
>    * Do one-time context initializations related to GL_EXT_framebuffer_object.
>    * Hook in device driver functions.
> @@ -1101,7 +1066,4 @@ intel_fbo_init(struct brw_context *brw)
>         dd->BlitFramebuffer = gen4_blit_framebuffer;
>      dd->EGLImageTargetRenderbufferStorage =
>         intel_image_target_renderbuffer_storage;
> -
> -   brw->render_cache = _mesa_set_create(brw, _mesa_hash_pointer,
> -                                        _mesa_key_pointer_equal);
>   }
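
With the brw_render_cache_set_* tracking gone, I assume the batch
manager's per-bo write-domain tracking now decides when to emit the
flush between render-target writes and sampler reads within a batch.
That deserves an explicit mention in the commit message; this was a
subtle source of bugs in the past.
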
> diff --git a/src/mesa/drivers/dri/i965/intel_fbo.h b/src/mesa/drivers/dri/i965/intel_fbo.h
> index c7cc570..6c761e6 100644
> --- a/src/mesa/drivers/dri/i965/intel_fbo.h
> +++ b/src/mesa/drivers/dri/i965/intel_fbo.h
> @@ -240,10 +240,6 @@ void
>   intel_renderbuffer_upsample(struct brw_context *brw,
>                               struct intel_renderbuffer *irb);
>   
> -void brw_render_cache_set_clear(struct brw_context *brw);
> -void brw_render_cache_set_add_bo(struct brw_context *brw, drm_intel_bo *bo);
> -void brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo);
> -
>   unsigned
>   intel_quantize_num_samples(struct intel_screen *intel, unsigned num_samples);
>   
> diff --git a/src/mesa/drivers/dri/i965/intel_image.h b/src/mesa/drivers/dri/i965/intel_image.h
> index a82cf3b..c464a17 100644
> --- a/src/mesa/drivers/dri/i965/intel_image.h
> +++ b/src/mesa/drivers/dri/i965/intel_image.h
> @@ -42,7 +42,6 @@
>   #include <xf86drm.h>
>   
>   #include "main/mtypes.h"
> -#include "intel_bufmgr.h"
>   #include <GL/internal/dri_interface.h>
>   
>   #ifdef __cplusplus
> @@ -66,8 +65,11 @@ struct intel_image_format {
>      } planes[3];
>   };
>   
> +struct _drm_intel_bo;
> +
>   struct __DRIimageRec {
> -   drm_intel_bo *bo;
> +   struct _drm_intel_bo *bo;
> +
>      uint32_t pitch; /**< in bytes */
>      GLenum internal_format;
>      uint32_t dri_format;
> diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
> index fb896a9..e8bbc04 100644
> --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
> +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
> @@ -28,7 +28,6 @@
>   #include <GL/gl.h>
>   #include <GL/internal/dri_interface.h>
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_mipmap_tree.h"
>   #include "intel_resolve_map.h"
>   #include "intel_tex.h"
> @@ -561,12 +560,12 @@ intel_lower_compressed_format(struct brw_context *brw, mesa_format format)
>   /* This function computes Yf/Ys tiled bo size, alignment and pitch. */
>   static uint64_t
>   intel_get_yf_ys_bo_size(struct intel_mipmap_tree *mt, unsigned *alignment,
> -                        uint64_t *pitch)
> +                        uint32_t *pitch)
>   {
>      const uint32_t bpp = mt->cpp * 8;
>      const uint32_t aspect_ratio = (bpp == 16 || bpp == 64) ? 2 : 1;
> -   uint32_t tile_width, tile_height;
> -   uint64_t stride, size, aligned_y;
> +   uint32_t tile_width, tile_height, stride;
> +   uint64_t size, aligned_y;
>   
>      assert(mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE);
>   
> @@ -662,21 +661,21 @@ intel_miptree_create(struct brw_context *brw,
>      if (layout_flags & MIPTREE_LAYOUT_ACCELERATED_UPLOAD)
>         alloc_flags |= BO_ALLOC_FOR_RENDER;
>   
> -   unsigned long pitch;
> +   uint32_t pitch;
>      mt->etc_format = etc_format;
>   
>      if (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
> -      unsigned alignment = 0;
> -      unsigned long size;
> +      uint32_t alignment;
> +      uint64_t size;
>         size = intel_get_yf_ys_bo_size(mt, &alignment, &pitch);
>         assert(size);
> -      mt->bo = drm_intel_bo_alloc_for_render(brw->bufmgr, "miptree",
> -                                             size, alignment);
> +      mt->bo = brw_bo_create(&brw->batch, "miptree",
> +			     size, alignment,
> +			     alloc_flags);
>      } else {
> -      mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree",
> -                                        total_width, total_height, mt->cpp,
> -                                        &mt->tiling, &pitch,
> -                                        alloc_flags);
> +      mt->bo = brw_bo_create_tiled(&brw->batch, "miptree",
> +				   total_width, total_height, mt->cpp,
> +				   &mt->tiling, &pitch, alloc_flags);
>      }
>   
>      mt->pitch = pitch;
> @@ -690,10 +689,10 @@ intel_miptree_create(struct brw_context *brw,
>                    mt->total_width, mt->total_height);
>   
>         mt->tiling = I915_TILING_X;
> -      drm_intel_bo_unreference(mt->bo);
> -      mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree",
> -                                  total_width, total_height, mt->cpp,
> -                                  &mt->tiling, &pitch, alloc_flags);
> +      brw_bo_put(mt->bo);
> +      mt->bo = brw_bo_create_tiled(&brw->batch, "miptree",
> +				   total_width, total_height, mt->cpp,
> +				   &mt->tiling, &pitch, alloc_flags);
>         mt->pitch = pitch;
>      }
>   
> @@ -729,7 +728,7 @@ intel_miptree_create(struct brw_context *brw,
>   
>   struct intel_mipmap_tree *
>   intel_miptree_create_for_bo(struct brw_context *brw,
> -                            drm_intel_bo *bo,
> +                            struct brw_bo *bo,
>                               mesa_format format,
>                               uint32_t offset,
>                               uint32_t width,
> @@ -739,15 +738,12 @@ intel_miptree_create_for_bo(struct brw_context *brw,
>                               uint32_t layout_flags)
>   {
>      struct intel_mipmap_tree *mt;
> -   uint32_t tiling, swizzle;
>      GLenum target;
>   
> -   drm_intel_bo_get_tiling(bo, &tiling, &swizzle);
> -
>      /* Nothing will be able to use this miptree with the BO if the offset isn't
>       * aligned.
>       */
> -   if (tiling != I915_TILING_NONE)
> +   if (bo->tiling != I915_TILING_NONE)
>         assert(offset % 4096 == 0);
>   
>      /* miptrees can't handle negative pitch.  If you need flipping of images,
> @@ -772,11 +768,10 @@ intel_miptree_create_for_bo(struct brw_context *brw,
>      if (!mt)
>         return NULL;
>   
> -   drm_intel_bo_reference(bo);
> -   mt->bo = bo;
> +   mt->bo = brw_bo_get(bo);
>      mt->pitch = pitch;
>      mt->offset = offset;
> -   mt->tiling = tiling;
> +   mt->tiling = bo->tiling;
>   
>      return mt;
>   }
> @@ -794,7 +789,7 @@ intel_miptree_create_for_bo(struct brw_context *brw,
>   void
>   intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
>                                            struct intel_renderbuffer *irb,
> -                                         drm_intel_bo *bo,
> +                                         struct brw_bo *bo,
>                                            uint32_t width, uint32_t height,
>                                            uint32_t pitch)
>   {
> @@ -926,13 +921,13 @@ intel_miptree_release(struct intel_mipmap_tree **mt)
>   
>         DBG("%s deleting %p\n", __func__, *mt);
>   
> -      drm_intel_bo_unreference((*mt)->bo);
> +      brw_bo_put((*mt)->bo);
>         intel_miptree_release(&(*mt)->stencil_mt);
>         if ((*mt)->hiz_buf) {
>            if ((*mt)->hiz_buf->mt)
>               intel_miptree_release(&(*mt)->hiz_buf->mt);
>            else
> -            drm_intel_bo_unreference((*mt)->hiz_buf->bo);
> +            brw_bo_put((*mt)->hiz_buf->bo);
>            free((*mt)->hiz_buf);
>         }
>         intel_miptree_release(&(*mt)->mcs_mt);
> @@ -1570,17 +1565,17 @@ intel_gen7_hiz_buf_create(struct brw_context *brw,
>         }
>      }
>   
> -   unsigned long pitch;
> +   uint32_t pitch;
>      uint32_t tiling = I915_TILING_Y;
> -   buf->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "hiz",
> -                                      hz_width, hz_height, 1,
> -                                      &tiling, &pitch,
> -                                      BO_ALLOC_FOR_RENDER);
> +   buf->bo = brw_bo_create_tiled(&brw->batch, "hiz",
> +				 hz_width, hz_height, 1,
> +				 &tiling, &pitch,
> +				 BO_ALLOC_FOR_RENDER);
>      if (!buf->bo) {
>         free(buf);
>         return NULL;
>      } else if (tiling != I915_TILING_Y) {
> -      drm_intel_bo_unreference(buf->bo);
> +      brw_bo_put(buf->bo);
>         free(buf);
>         return NULL;
>      }
> @@ -1673,17 +1668,17 @@ intel_gen8_hiz_buf_create(struct brw_context *brw,
>         }
>      }
>   
> -   unsigned long pitch;
> +   uint32_t pitch;
>      uint32_t tiling = I915_TILING_Y;
> -   buf->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "hiz",
> -                                      hz_width, hz_height, 1,
> -                                      &tiling, &pitch,
> -                                      BO_ALLOC_FOR_RENDER);
> +   buf->bo = brw_bo_create_tiled(&brw->batch, "hiz",
> +				 hz_width, hz_height, 1,
> +				 &tiling, &pitch,
> +				 BO_ALLOC_FOR_RENDER);
>      if (!buf->bo) {
>         free(buf);
>         return NULL;
>      } else if (tiling != I915_TILING_Y) {
> -      drm_intel_bo_unreference(buf->bo);
> +      brw_bo_put(buf->bo);
>         free(buf);
>         return NULL;
>      }
> @@ -2064,25 +2059,13 @@ intel_miptree_map_raw(struct brw_context *brw, struct intel_mipmap_tree *mt)
>       * resolve any pending fast color clears before we map.
>       */
>      intel_miptree_resolve_color(brw, mt);
> -
> -   drm_intel_bo *bo = mt->bo;
> -
> -   if (drm_intel_bo_references(brw->batch.bo, bo))
> -      intel_batchbuffer_flush(brw);
> -
> -   if (mt->tiling != I915_TILING_NONE)
> -      brw_bo_map_gtt(brw, bo, "miptree");
> -   else
> -      brw_bo_map(brw, bo, true, "miptree");
> -
> -   return bo->virtual;
> +   return brw_bo_map(mt->bo, MAP_WRITE);
>   }
>   
>   void
>   intel_miptree_unmap_raw(struct brw_context *brw,
>                           struct intel_mipmap_tree *mt)
>   {
> -   drm_intel_bo_unmap(mt->bo);
>   }
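
Keeping the empty intel_miptree_unmap_raw() for symmetry is fine, but
a one-line comment explaining why there is nothing to do (mappings
being cached by the new bo layer, I assume) would save the next reader
a trip through the batch manager.
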
>   
>   static void
> @@ -2622,11 +2605,10 @@ use_intel_mipree_map_blit(struct brw_context *brw,
>                             unsigned int level,
>                             unsigned int slice)
>   {
> -   if (brw->has_llc &&
> -      /* It's probably not worth swapping to the blit ring because of
> -       * all the overhead involved.
> -       */
> -       !(mode & GL_MAP_WRITE_BIT) &&
> +   /* It's probably not worth swapping to the blit ring because of
> +    * all the overhead involved.
> +    */
> +   if (!(mode & GL_MAP_WRITE_BIT) &&
>          !mt->compressed &&
>          (mt->tiling == I915_TILING_X ||
>           /* Prior to Sandybridge, the blitter can't handle Y tiling */
> diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
> index bde6daa..7e91c97 100644
> --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
> +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
> @@ -33,7 +33,7 @@
>    * The hardware has a fixed layout of a texture depending on parameters such
>    * as the target/type (2D, 3D, CUBE), width, height, pitch, and number of
>    * mipmap levels.  The individual level/layer slices are each 2D rectangles of
> - * pixels at some x/y offset from the start of the drm_intel_bo.
> + * pixels at some x/y offset from the start of the brw_bo.
>    *
>    * Original OpenGL allowed texture miplevels to be specified in arbitrary
>    * order, and a texture may change size over time.  Thus, each
> @@ -49,7 +49,6 @@
>   #include <assert.h>
>   
>   #include "main/mtypes.h"
> -#include "intel_bufmgr.h"
>   #include "intel_resolve_map.h"
>   #include <GL/internal/dri_interface.h>
>   
> @@ -321,7 +320,7 @@ enum miptree_array_layout {
>   struct intel_miptree_aux_buffer
>   {
>      /** Buffer object containing the pixel data. */
> -   drm_intel_bo *bo;
> +   struct brw_bo *bo;
>   
>      uint32_t pitch; /**< pitch in bytes. */
>   
> @@ -340,7 +339,7 @@ enum intel_miptree_tr_mode {
>   struct intel_mipmap_tree
>   {
>      /** Buffer object containing the pixel data. */
> -   drm_intel_bo *bo;
> +   struct brw_bo *bo;
>   
>      uint32_t pitch; /**< pitch in bytes. */
>   
> @@ -557,7 +556,7 @@ struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
>   
>   struct intel_mipmap_tree *
>   intel_miptree_create_for_bo(struct brw_context *brw,
> -                            drm_intel_bo *bo,
> +                            struct brw_bo *bo,
>                               mesa_format format,
>                               uint32_t offset,
>                               uint32_t width,
> @@ -569,7 +568,7 @@ intel_miptree_create_for_bo(struct brw_context *brw,
>   void
>   intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
>                                            struct intel_renderbuffer *irb,
> -                                         drm_intel_bo *bo,
> +					 struct brw_bo *bo,
>                                            uint32_t width, uint32_t height,
>                                            uint32_t pitch);
>   
> diff --git a/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c b/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
> index 224dc65..bd40a92 100644
> --- a/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
> +++ b/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
> @@ -44,7 +44,6 @@
>   
>   #include "brw_context.h"
>   #include "intel_screen.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_blit.h"
>   #include "intel_fbo.h"
>   #include "intel_image.h"
> @@ -314,7 +313,7 @@ do_blit_bitmap( struct gl_context *ctx,
>   out:
>   
>      if (unlikely(INTEL_DEBUG & DEBUG_SYNC))
> -      intel_batchbuffer_flush(brw);
> +      brw_batch_flush(&brw->batch);
>   
>      if (_mesa_is_bufferobj(unpack->BufferObj)) {
>         /* done with PBO so unmap it now */
> diff --git a/src/mesa/drivers/dri/i965/intel_pixel_copy.c b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
> index ce053ed..4313588 100644
> --- a/src/mesa/drivers/dri/i965/intel_pixel_copy.c
> +++ b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
> @@ -39,7 +39,6 @@
>   #include "intel_pixel.h"
>   #include "intel_fbo.h"
>   #include "intel_blit.h"
> -#include "intel_batchbuffer.h"
>   
>   #define FILE_DEBUG_FLAG DEBUG_PIXEL
>   
> @@ -149,8 +148,6 @@ do_blit_copypixels(struct gl_context * ctx,
>         return false;
>      }
>   
> -   intel_batchbuffer_flush(brw);
> -
>      /* Clip to destination buffer. */
>      orig_dstx = dstx;
>      orig_dsty = dsty;
> diff --git a/src/mesa/drivers/dri/i965/intel_pixel_draw.c b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
> index 6c6bd86..09eea3e 100644
> --- a/src/mesa/drivers/dri/i965/intel_pixel_draw.c
> +++ b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
> @@ -60,7 +60,7 @@ do_blit_drawpixels(struct gl_context * ctx,
>      struct brw_context *brw = brw_context(ctx);
>      struct intel_buffer_object *src = intel_buffer_object(unpack->BufferObj);
>      GLuint src_offset;
> -   drm_intel_bo *src_buffer;
> +   struct brw_bo *src_buffer;
>   
>      DBG("%s\n", __func__);
>   
> diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
> index 3fe506e..1789023 100644
> --- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
> +++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
> @@ -39,7 +39,6 @@
>   
>   #include "brw_context.h"
>   #include "intel_screen.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_blit.h"
>   #include "intel_buffers.h"
>   #include "intel_fbo.h"
> @@ -84,11 +83,6 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
>      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
>      int dst_pitch;
>   
> -   /* The miptree's buffer. */
> -   drm_intel_bo *bo;
> -
> -   int error = 0;
> -
>      uint32_t cpp;
>      mem_copy_fn mem_copy = NULL;
>   
> @@ -96,7 +90,7 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
>       * a 2D BGRA, RGBA, L8 or A8 texture. It could be generalized to support
>       * more types.
>       */
> -   if (!brw->has_llc ||
> +   if (!irb->mt->bo->cache_coherent ||
>          !(type == GL_UNSIGNED_BYTE || type == GL_UNSIGNED_INT_8_8_8_8_REV) ||
>          pixels == NULL ||
>          _mesa_is_bufferobj(pack->BufferObj) ||
> @@ -155,19 +149,6 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
>       */
>      intel_miptree_resolve_color(brw, irb->mt);
>   
> -   bo = irb->mt->bo;
> -
> -   if (drm_intel_bo_references(brw->batch.bo, bo)) {
> -      perf_debug("Flushing before mapping a referenced bo.\n");
> -      intel_batchbuffer_flush(brw);
> -   }
> -
> -   error = brw_bo_map(brw, bo, false /* write enable */, "miptree");
> -   if (error) {
> -      DBG("%s: failed to map bo\n", __func__);
> -      return false;
> -   }
> -
>      dst_pitch = _mesa_image_row_stride(pack, width, format, type);
>   
>      /* For a window-system renderbuffer, the buffer is actually flipped
> @@ -196,19 +177,16 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
>          pack->Alignment, pack->RowLength, pack->SkipPixels,
>          pack->SkipRows);
>   
> -   tiled_to_linear(
> +   return tiled_to_linear(
>         xoffset * cpp, (xoffset + width) * cpp,
>         yoffset, yoffset + height,
>         pixels - (ptrdiff_t) yoffset * dst_pitch - (ptrdiff_t) xoffset * cpp,
> -      bo->virtual,
> +      brw_bo_map(irb->mt->bo, MAP_READ | MAP_DETILED),
>         dst_pitch, irb->mt->pitch,
>         brw->has_swizzling,
>         irb->mt->tiling,
>         mem_copy
>      );
> -
> -   drm_intel_bo_unmap(bo);
> -   return true;
>   }
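
Two notes here: this now relies on tiled_to_linear() returning a
value, where upstream it returns void, so I assume another patch in
the series changes its signature; and I am reading MAP_DETILED as
"give me the raw tiled pages, skip the GTT detiling fence", which is
what a manual tiled_to_linear() copy wants. Both could use a comment.
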
>   
>   void
> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
> index cd8e6eb..fcf50fc 100644
> --- a/src/mesa/drivers/dri/i965/intel_screen.c
> +++ b/src/mesa/drivers/dri/i965/intel_screen.c
> @@ -44,6 +44,8 @@
>   #include "utils.h"
>   #include "xmlpool.h"
>   
> +#include "intel_reg.h"
> +
>   static const __DRIconfigOptionsExtension brw_config_options = {
>      .base = { __DRI_CONFIG_OPTIONS, 1 },
>      .xml =
> @@ -91,9 +93,7 @@ DRI_CONF_BEGIN
>   DRI_CONF_END
>   };
>   
> -#include "intel_batchbuffer.h"
>   #include "intel_buffers.h"
> -#include "intel_bufmgr.h"
>   #include "intel_fbo.h"
>   #include "intel_mipmap_tree.h"
>   #include "intel_screen.h"
> @@ -118,9 +118,9 @@ get_time(void)
>   }
>   
>   void
> -aub_dump_bmp(struct gl_context *ctx)
> +aub_dump_bmp(struct brw_context *brw)
>   {
> -   struct gl_framebuffer *fb = ctx->DrawBuffer;
> +   struct gl_framebuffer *fb = brw->ctx.DrawBuffer;
>   
>      for (int i = 0; i < fb->_NumColorDrawBuffers; i++) {
>         struct intel_renderbuffer *irb =
> @@ -138,14 +138,16 @@ aub_dump_bmp(struct gl_context *ctx)
>   	    continue;
>   	 }
>   
> -         drm_intel_gem_bo_aub_dump_bmp(irb->mt->bo,
> -				       irb->draw_x,
> -				       irb->draw_y,
> -				       irb->Base.Base.Width,
> -				       irb->Base.Base.Height,
> -				       format,
> -				       irb->mt->pitch,
> -				       0);
> +	 drm_intel_aub_bo_bmp(&brw->batch.aub,
> +                              irb->mt->bo->base,
> +                              irb->draw_x,
> +                              irb->draw_y,
> +                              irb->Base.Base.Width,
> +                              irb->Base.Base.Height,
> +                              format,
> +                              irb->mt->bo->tiling,
> +                              irb->mt->pitch,
> +                              0);
>         }
>      }
>   }
> @@ -176,15 +178,15 @@ intel_dri2_flush_with_flags(__DRIcontext *cPriv,
>      if (flags & __DRI2_FLUSH_DRAWABLE)
>         intel_resolve_for_dri2_flush(brw, dPriv);
>   
> +   brw_batch_flush(&brw->batch);
> +
>      if (reason == __DRI2_THROTTLE_SWAPBUFFER)
> -      brw->need_swap_throttle = true;
> +      brw->batch.need_swap_throttle = true;
>      if (reason == __DRI2_THROTTLE_FLUSHFRONT)
> -      brw->need_flush_throttle = true;
> -
> -   intel_batchbuffer_flush(brw);
> +      brw->batch.need_flush_throttle = true;
>   
>      if (INTEL_DEBUG & DEBUG_AUB) {
> -      aub_dump_bmp(ctx);
> +      aub_dump_bmp(brw);
>      }
>   }
>   
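
The ordering changed here: the throttle flags used to be set before
intel_batchbuffer_flush() and so applied to this flush; now
brw_batch_flush() runs first and the flags only bite on the next
batch. If deferring the throttle by one batch is intentional, please
say so in a comment; otherwise I would have expected:

   if (reason == __DRI2_THROTTLE_SWAPBUFFER)
      brw->batch.need_swap_throttle = true;
   if (reason == __DRI2_THROTTLE_FLUSHFRONT)
      brw->batch.need_flush_throttle = true;

   brw_batch_flush(&brw->batch);
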
> @@ -359,8 +361,8 @@ intel_setup_image_from_mipmap_tree(struct brw_context *brw, __DRIimage *image,
>                                                     &image->tile_y);
>   
>      drm_intel_bo_unreference(image->bo);
> -   image->bo = mt->bo;
> -   drm_intel_bo_reference(mt->bo);
> +   image->bo = mt->bo->base;
> +   drm_intel_bo_reference(image->bo);
>   }
>   
>   static __DRIimage *
> @@ -421,8 +423,8 @@ intel_create_image_from_renderbuffer(__DRIcontext *context,
>      image->offset = 0;
>      image->data = loaderPrivate;
>      drm_intel_bo_unreference(image->bo);
> -   image->bo = irb->mt->bo;
> -   drm_intel_bo_reference(irb->mt->bo);
> +   image->bo = irb->mt->bo->base;
> +   drm_intel_bo_reference(image->bo);
>      image->width = rb->Width;
>      image->height = rb->Height;
>      image->pitch = irb->mt->pitch;
> @@ -526,7 +528,7 @@ intel_create_image(__DRIscreen *screen,
>      if (image == NULL)
>         return NULL;
>   
> -
> +
>      cpp = _mesa_get_format_bytes(image->format);
>      image->bo = drm_intel_bo_alloc_tiled(intelScreen->bufmgr, "image",
>                                           width, height, cpp, &tiling,
> @@ -553,7 +555,7 @@ intel_query_image(__DRIimage *image, int attrib, int *value)
>         *value = image->bo->handle;
>         return true;
>      case __DRI_IMAGE_ATTRIB_NAME:
> -      return !drm_intel_bo_flink(image->bo, (uint32_t *) value);
> +      return drm_intel_bo_flink(image->bo, (uint32_t *)value) == 0;
>      case __DRI_IMAGE_ATTRIB_FORMAT:
>         *value = image->dri_format;
>         return true;
> @@ -569,9 +571,7 @@ intel_query_image(__DRIimage *image, int attrib, int *value)
>         *value = image->planar_format->components;
>         return true;
>      case __DRI_IMAGE_ATTRIB_FD:
> -      if (drm_intel_bo_gem_export_to_prime(image->bo, value) == 0)
> -         return true;
> -      return false;
> +      return drm_intel_bo_gem_export_to_prime(image->bo, value) == 0;
>      case __DRI_IMAGE_ATTRIB_FOURCC:
>         if (intel_lookup_fourcc(image->dri_format, value))
>            return true;
> @@ -1083,13 +1083,27 @@ intel_init_bufmgr(struct intel_screen *intelScreen)
>   
>      intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL;
>   
> -   intelScreen->bufmgr = intel_bufmgr_gem_init(spriv->fd, BATCH_SZ);
> +   intelScreen->bufmgr = intel_bufmgr_gem_init(spriv->fd, 0);
>      if (intelScreen->bufmgr == NULL) {
>         fprintf(stderr, "[%s:%u] Error initializing buffer manager.\n",
>   	      __func__, __LINE__);
>         return false;
>      }
>   
> +#if 0 /* XXX */
> +   driParseConfigFiles(options, &brw->intelScreen->optionCache,
> +                       brw->driContext->driScreenPriv->myNum, "i965");
> +   switch (driQueryOptioni(options, "bo_reuse")) {
> +   case DRI_CONF_BO_REUSE_DISABLED:
> +      break;
> +   case DRI_CONF_BO_REUSE_ALL:
> +      drm_intel_bufmgr_gem_enable_reuse(intelScreen->bufmgr);
> +      break;
> +   }
> +#else
> +   drm_intel_bufmgr_gem_enable_reuse(intelScreen->bufmgr);
> +#endif
> +
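
The #if 0 block wouldn't compile if enabled - it references brw, which
doesn't exist at screen-init time. Since the screen already carries an
optionCache (IIRC), the XXX could be resolved along these lines
(sketch, assuming the cache is parsed before intel_init_bufmgr()):

   switch (driQueryOptioni(&intelScreen->optionCache, "bo_reuse")) {
   case DRI_CONF_BO_REUSE_DISABLED:
      break;
   case DRI_CONF_BO_REUSE_ALL:
      drm_intel_bufmgr_gem_enable_reuse(intelScreen->bufmgr);
      break;
   }

By the way, the hunk in intel_create_image() above is a
whitespace-only change (trailing spaces added) - probably unintended.
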
>      drm_intel_bufmgr_gem_enable_fenced_relocs(intelScreen->bufmgr);
>   
>      if (!intel_get_boolean(spriv, I915_PARAM_HAS_RELAXED_DELTA)) {
> diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
> index e55fddb..733654c 100644
> --- a/src/mesa/drivers/dri/i965/intel_screen.h
> +++ b/src/mesa/drivers/dri/i965/intel_screen.h
> @@ -34,16 +34,20 @@
>   #include <GL/internal/dri_interface.h>
>   
>   #include "dri_util.h"
> -#include "intel_bufmgr.h"
>   #include "brw_device_info.h"
>   #include "i915_drm.h"
>   #include "xmlconfig.h"
>   
> +#include <intel_bufmgr.h>
> +
>   struct intel_screen
>   {
>      int deviceID;
>      const struct brw_device_info *devinfo;
>   
> +   drm_intel_bufmgr *bufmgr;
> +   drm_intel_bo *workaround_bo;
> +
>      __DRIscreen *driScrnPriv;
>   
>      bool no_hw;
> @@ -59,9 +63,6 @@ struct intel_screen
>       */
>      bool has_context_reset_notification;
>   
> -   dri_bufmgr *bufmgr;
> -   drm_intel_bo *workaround_bo;
> -
>      /**
>       * A unique ID for shader programs.
>       */
> @@ -83,6 +84,12 @@ struct intel_screen
>      int cmd_parser_version;
>    };
>   
> +static inline int intel_screen_to_fd(struct intel_screen *scr)
> +{
> +   __DRIscreen *psp = scr->driScrnPriv;
> +   return psp->fd;
> +}
> +
>   extern void intelDestroyContext(__DRIcontext * driContextPriv);
>   
>   extern GLboolean intelUnbindContext(__DRIcontext * driContextPriv);
> @@ -96,7 +103,6 @@ intelMakeCurrent(__DRIcontext * driContextPriv,
>                    __DRIdrawable * driReadPriv);
>   
>   double get_time(void);
> -void aub_dump_bmp(struct gl_context *ctx);
>   
>   const int*
>   intel_supported_msaa_modes(const struct intel_screen  *screen);
> diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
> index c44c4be..544380e 100644
> --- a/src/mesa/drivers/dri/i965/intel_syncobj.c
> +++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
> @@ -41,56 +41,13 @@
>   #include "main/imports.h"
>   
>   #include "brw_context.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_reg.h"
>   
> -struct brw_fence {
> -   /** The fence waits for completion of this batch. */
> -   drm_intel_bo *batch_bo;
> -
> -   bool signalled;
> -};
> -
>   struct intel_gl_sync_object {
>      struct gl_sync_object Base;
>      struct brw_fence fence;
>   };
>   
> -static void
> -brw_fence_finish(struct brw_fence *fence)
> -{
> -   if (fence->batch_bo)
> -      drm_intel_bo_unreference(fence->batch_bo);
> -}
> -
> -static void
> -brw_fence_insert(struct brw_context *brw, struct brw_fence *fence)
> -{
> -   assert(!fence->batch_bo);
> -   assert(!fence->signalled);
> -
> -   brw_emit_mi_flush(brw);
> -   fence->batch_bo = brw->batch.bo;
> -   drm_intel_bo_reference(fence->batch_bo);
> -   intel_batchbuffer_flush(brw);
> -}
> -
> -static bool
> -brw_fence_has_completed(struct brw_fence *fence)
> -{
> -   if (fence->signalled)
> -      return true;
> -
> -   if (fence->batch_bo && !drm_intel_bo_busy(fence->batch_bo)) {
> -      drm_intel_bo_unreference(fence->batch_bo);
> -      fence->batch_bo = NULL;
> -      fence->signalled = true;
> -      return true;
> -   }
> -
> -   return false;
> -}
> -
>   /**
>    * Return true if the function successfully signals or has already signalled.
>    * (This matches the behavior expected from __DRI2fence::client_wait_sync).
> @@ -99,27 +56,15 @@ static bool
>   brw_fence_client_wait(struct brw_context *brw, struct brw_fence *fence,
>                         uint64_t timeout)
>   {
> -   if (fence->signalled)
> -      return true;
> -
> -   assert(fence->batch_bo);
> -
>      /* DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and returns
> -    * immediately for timeouts <= 0.  The best we can do is to clamp the
> -    * timeout to INT64_MAX.  This limits the maximum timeout from 584 years to
> -    * 292 years - likely not a big deal.
> +    * immediately for timeout == 0, and indefinitely if timeout is negative.
> +    * The best we can do is to clamp the timeout to INT64_MAX.  This limits
> +    * the maximum timeout from 584 years to 292 years - likely not a big deal.
>       */
>      if (timeout > INT64_MAX)
>         timeout = INT64_MAX;
>   
> -   if (drm_intel_gem_bo_wait(fence->batch_bo, timeout) != 0)
> -      return false;
> -
> -   fence->signalled = true;
> -   drm_intel_bo_unreference(fence->batch_bo);
> -   fence->batch_bo = NULL;
> -
> -   return true;
> +   return brw_fence_wait(fence, timeout) == 0;
>   }
>   
>   static void
> @@ -149,18 +94,27 @@ intel_gl_delete_sync_object(struct gl_context *ctx, struct gl_sync_object *s)
>   {
>      struct intel_gl_sync_object *sync = (struct intel_gl_sync_object *)s;
>   
> -   brw_fence_finish(&sync->fence);
> +   brw_fence_destroy(&sync->fence);
>      free(sync);
>   }
>   
>   static void
> +__intel_fence_signal(struct brw_fence *fence)
> +{
> +   struct intel_gl_sync_object *sync = container_of(fence, sync, fence);
> +
> +   sync->Base.StatusFlag = 1;
> +}
> +
> +static void
>   intel_gl_fence_sync(struct gl_context *ctx, struct gl_sync_object *s,
>                       GLenum condition, GLbitfield flags)
>   {
>      struct brw_context *brw = brw_context(ctx);
>      struct intel_gl_sync_object *sync = (struct intel_gl_sync_object *)s;
>   
> -   brw_fence_insert(brw, &sync->fence);
> +   sync->fence.signal = __intel_fence_signal;
> +   s->StatusFlag = !brw_batch_create_fence(&brw->batch, &sync->fence);
>   }
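
If brw_batch_create_fence() follows the usual 0-on-success convention,
the negation here marks a *successfully* created fence as already
signalled, which looks inverted. Did you mean something like:

   sync->fence.signal = __intel_fence_signal;
   if (brw_batch_create_fence(&brw->batch, &sync->fence) == 0)
      s->StatusFlag = 0;   /* pending; __intel_fence_signal() flips it */
   else
      s->StatusFlag = 1;   /* no fence created: don't stall waiters */

Or does it return true on success? Either way a comment would help.
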
>   
>   static void
> @@ -170,8 +124,7 @@ intel_gl_client_wait_sync(struct gl_context *ctx, struct gl_sync_object *s,
>      struct brw_context *brw = brw_context(ctx);
>      struct intel_gl_sync_object *sync = (struct intel_gl_sync_object *)s;
>   
> -   if (brw_fence_client_wait(brw, &sync->fence, timeout))
> -      s->StatusFlag = 1;
> +   brw_fence_client_wait(brw, &sync->fence, timeout);
>   }
>   
>   static void
> @@ -189,8 +142,7 @@ intel_gl_check_sync(struct gl_context *ctx, struct gl_sync_object *s)
>   {
>      struct intel_gl_sync_object *sync = (struct intel_gl_sync_object *)s;
>   
> -   if (brw_fence_has_completed(&sync->fence))
> -      s->StatusFlag = 1;
> +   brw_fence_busy(&sync->fence);
>   }
>   
>   void
> @@ -214,7 +166,7 @@ intel_dri_create_fence(__DRIcontext *ctx)
>      if (!fence)
>         return NULL;
>   
> -   brw_fence_insert(brw, fence);
> +   brw_batch_create_fence(&brw->batch, fence);
>   
>      return fence;
>   }
> @@ -224,7 +176,7 @@ intel_dri_destroy_fence(__DRIscreen *screen, void *driver_fence)
>   {
>      struct brw_fence *fence = driver_fence;
>   
> -   brw_fence_finish(fence);
> +   brw_fence_destroy(fence);
>      free(fence);
>   }
>   
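
For other reviewers, this is the fence contract I reconstructed from
the call sites in this file (names and semantics are my reading; the
brw_batch header isn't quoted in this mail):

   /* queue a fence behind the current batch; fence->signal() is
    * invoked once the batch retires */
   int brw_batch_create_fence(struct brw_batch *batch,
                              struct brw_fence *fence);
   bool brw_fence_busy(struct brw_fence *fence);
   int brw_fence_wait(struct brw_fence *fence, int64_t timeout_ns);
   void brw_fence_destroy(struct brw_fence *fence);

The signal callback replaces the old explicit 'signalled' bookkeeping:
intel_gl_check_sync() and intel_gl_client_wait_sync() can discard the
return value because __intel_fence_signal() updates Base.StatusFlag as
a side effect. I like it.
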
> diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
> index b0181ad..831f104 100644
> --- a/src/mesa/drivers/dri/i965/intel_tex.c
> +++ b/src/mesa/drivers/dri/i965/intel_tex.c
> @@ -330,9 +330,9 @@ intel_set_texture_storage_for_buffer_object(struct gl_context *ctx,
>   
>      assert(intel_texobj->mt == NULL);
>   
> -   drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_buffer_obj,
> -                                             buffer_offset,
> -                                             row_stride * image->Height);
> +   struct brw_bo *bo = intel_bufferobj_buffer(brw, intel_buffer_obj,
> +					      buffer_offset,
> +					      row_stride * image->Height);
>      intel_texobj->mt =
>         intel_miptree_create_for_bo(brw, bo,
>                                     image->TexFormat,
> diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
> index e077d5e..50f3352 100644
> --- a/src/mesa/drivers/dri/i965/intel_tex_image.c
> +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
> @@ -19,7 +19,6 @@
>   
>   #include "intel_mipmap_tree.h"
>   #include "intel_buffer_objects.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_tex.h"
>   #include "intel_blit.h"
>   #include "intel_fbo.h"
> @@ -94,7 +93,9 @@ intelTexImage(struct gl_context * ctx,
>      struct intel_texture_image *intelImage = intel_texture_image(texImage);
>      bool ok;
>   
> -   bool tex_busy = intelImage->mt && drm_intel_bo_busy(intelImage->mt->bo);
> +   bool tex_busy =
> +      intelImage->mt &&
> +      brw_bo_busy(intelImage->mt->bo, BUSY_WRITE | BUSY_RETIRE);
>   
>      DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
>          __func__, _mesa_get_format_name(texImage->TexFormat),
> @@ -146,7 +147,7 @@ intelTexImage(struct gl_context * ctx,
>   static void
>   intel_set_texture_image_bo(struct gl_context *ctx,
>                              struct gl_texture_image *image,
> -                           drm_intel_bo *bo,
> +                           struct brw_bo *bo,
>                              GLenum target,
>                              GLenum internalFormat,
>                              mesa_format format,
> @@ -341,13 +342,15 @@ intel_image_target_texture_2d(struct gl_context *ctx, GLenum target,
>       * buffer's content to the main buffer nor for invalidating the aux buffer's
>       * content.
>       */
> -   intel_set_texture_image_bo(ctx, texImage, image->bo,
> +   struct brw_bo *bo = brw_bo_import(&brw->batch, image->bo, true);
> +   intel_set_texture_image_bo(ctx, texImage, bo,
>                                 target, image->internal_format,
>                                 image->format, image->offset,
>                                 image->width,  image->height,
>                                 image->pitch,
>                                 image->tile_x, image->tile_y,
>                                 MIPTREE_LAYOUT_DISABLE_AUX);
> +   brw_bo_put(bo);
>   }
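
brw_bo_import() wrapping the shared __DRIimage bo in a context-local
handle matches the cover letter. From this call site I take the
signature to be (the bool being "hold a reference"):

   /* sketch: wrap a shared drm_intel_bo in a context-local brw_bo;
    * with reference=true the wrapper keeps its own ref until
    * brw_bo_put() */
   struct brw_bo *brw_bo_import(struct brw_batch *batch,
                                drm_intel_bo *bo,
                                bool reference);

Nit: the hunk below leaves a double blank line where the map/flush
block was removed.
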
>   
>   /**
> @@ -368,11 +371,6 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
>      struct intel_texture_image *image = intel_texture_image(texImage);
>      int dst_pitch;
>   
> -   /* The miptree's buffer. */
> -   drm_intel_bo *bo;
> -
> -   int error = 0;
> -
>      uint32_t cpp;
>      mem_copy_fn mem_copy = NULL;
>   
> @@ -427,18 +425,6 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
>       */
>      intel_miptree_resolve_color(brw, image->mt);
>   
> -   bo = image->mt->bo;
> -
> -   if (drm_intel_bo_references(brw->batch.bo, bo)) {
> -      perf_debug("Flushing before mapping a referenced bo.\n");
> -      intel_batchbuffer_flush(brw);
> -   }
> -
> -   error = brw_bo_map(brw, bo, false /* write enable */, "miptree");
> -   if (error) {
> -      DBG("%s: failed to map bo\n", __func__);
> -      return false;
> -   }
>   
>      dst_pitch = _mesa_image_row_stride(packing, width, format, type);
>   
> @@ -456,19 +442,16 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
>      xoffset += image->mt->level[level].level_x;
>      yoffset += image->mt->level[level].level_y;
>   
> -   tiled_to_linear(
> +   return tiled_to_linear(
>         xoffset * cpp, (xoffset + width) * cpp,
>         yoffset, yoffset + height,
>         pixels - (ptrdiff_t) yoffset * dst_pitch - (ptrdiff_t) xoffset * cpp,
> -      bo->virtual,
> +      brw_bo_map(image->mt->bo, MAP_READ | MAP_DETILED),
>         dst_pitch, image->mt->pitch,
>         brw->has_swizzling,
>         image->mt->tiling,
>         mem_copy
>      );
> -
> -   drm_intel_bo_unmap(bo);
> -   return true;
>   }
>   
>   static void
> diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
> index 7507f76..6b1bff5 100644
> --- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
> +++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
> @@ -38,7 +38,6 @@
>   #include "drivers/common/meta.h"
>   
>   #include "brw_context.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_tex.h"
>   #include "intel_mipmap_tree.h"
>   #include "intel_blit.h"
> @@ -86,11 +85,6 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
>      struct intel_texture_image *image = intel_texture_image(texImage);
>      int src_pitch;
>   
> -   /* The miptree's buffer. */
> -   drm_intel_bo *bo;
> -
> -   int error = 0;
> -
>      uint32_t cpp;
>      mem_copy_fn mem_copy = NULL;
>   
> @@ -103,8 +97,7 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
>       * with _mesa_image_row_stride. However, before removing the restrictions
>       * we need tests.
>       */
> -   if (!brw->has_llc ||
> -       !(type == GL_UNSIGNED_BYTE || type == GL_UNSIGNED_INT_8_8_8_8_REV) ||
> +   if (!(type == GL_UNSIGNED_BYTE || type == GL_UNSIGNED_INT_8_8_8_8_REV) ||
>          !(texImage->TexObject->Target == GL_TEXTURE_2D ||
>            texImage->TexObject->Target == GL_TEXTURE_RECTANGLE) ||
>          pixels == NULL ||
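
Here the has_llc check is dropped outright rather than replaced with
cache_coherent as in intel_pixel_read.c. I assume that's deliberate -
this path only streams writes into the map, and write-combined maps on
!llc parts are fine for that, whereas reads through WC are what the
readpixels path must avoid:

   /* download (tiled_to_linear): reads the map, needs coherency */
   src = brw_bo_map(bo, MAP_READ | MAP_DETILED);
   /* upload (linear_to_tiled): streaming writes, any mapping is ok */
   dst = brw_bo_map(bo, MAP_WRITE | MAP_DETILED);

A one-line comment in the code would save the next reader the
archaeology.
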
> @@ -141,19 +134,6 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
>       */
>      intel_miptree_resolve_color(brw, image->mt);
>   
> -   bo = image->mt->bo;
> -
> -   if (drm_intel_bo_references(brw->batch.bo, bo)) {
> -      perf_debug("Flushing before mapping a referenced bo.\n");
> -      intel_batchbuffer_flush(brw);
> -   }
> -
> -   error = brw_bo_map(brw, bo, true /* write enable */, "miptree");
> -   if (error || bo->virtual == NULL) {
> -      DBG("%s: failed to map bo\n", __func__);
> -      return false;
> -   }
> -
>      src_pitch = _mesa_image_row_stride(packing, width, format, type);
>   
>      /* We postponed printing this message until having committed to executing
> @@ -174,19 +154,16 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
>      xoffset += image->mt->level[level].level_x;
>      yoffset += image->mt->level[level].level_y;
>   
> -   linear_to_tiled(
> +   return linear_to_tiled(
>         xoffset * cpp, (xoffset + width) * cpp,
>         yoffset, yoffset + height,
> -      bo->virtual,
> +      brw_bo_map(image->mt->bo, MAP_WRITE | MAP_DETILED),
>         pixels - (ptrdiff_t) yoffset * src_pitch - (ptrdiff_t) xoffset * cpp,
>         image->mt->pitch, src_pitch,
>         brw->has_swizzling,
>         image->mt->tiling,
>         mem_copy
>      );
> -
> -   drm_intel_bo_unmap(bo);
> -   return true;
>   }
>   
>   static void
> @@ -202,7 +179,9 @@ intelTexSubImage(struct gl_context * ctx,
>      struct intel_texture_image *intelImage = intel_texture_image(texImage);
>      bool ok;
>   
> -   bool tex_busy = intelImage->mt && drm_intel_bo_busy(intelImage->mt->bo);
> +   bool tex_busy =
> +      intelImage->mt &&
> +      brw_bo_busy(intelImage->mt->bo, BUSY_WRITE | BUSY_RETIRE);
>   
>      DBG("%s mesa_format %s target %s format %s type %s level %d %dx%dx%d\n",
>          __func__, _mesa_get_format_name(texImage->TexFormat),
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> index dcf0462..404bc2f 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> @@ -552,7 +552,7 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>    * 'dst' is the start of the texture and 'src' is the corresponding
>    * address to copy from, though copying begins at (xt1, yt1).
>    */
> -void
> +bool
>   linear_to_tiled(uint32_t xt1, uint32_t xt2,
>                   uint32_t yt1, uint32_t yt2,
>                   char *dst, const char *src,
> @@ -568,6 +568,9 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
>      uint32_t tw, th, span;
>      uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
>   
> +   if (!dst)
> +      return false;
> +
>      if (tiling == I915_TILING_X) {
>         tw = xtile_width;
>         th = xtile_height;
> @@ -630,6 +633,8 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
>                      mem_copy);
>         }
>      }
> +
> +   return true;
>   }
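
Returning bool and tolerating a NULL pointer lets the callers
tail-call straight through brw_bo_map(), which is tidy. Note the old
DBG("failed to map bo") message is lost though; a debug print inside
brw_bo_map() would preserve it, or callers that want the old behaviour
can still do (sketch):

   void *src = brw_bo_map(bo, MAP_READ | MAP_DETILED);
   if (!src) {
      DBG("%s: failed to map bo\n", __func__);
      return false;
   }
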
>   
>   /**
> @@ -643,7 +648,7 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
>    * 'dst' is the start of the texture and 'src' is the corresponding
>    * address to copy from, though copying begins at (xt1, yt1).
>    */
> -void
> +bool
>   tiled_to_linear(uint32_t xt1, uint32_t xt2,
>                   uint32_t yt1, uint32_t yt2,
>                   char *dst, const char *src,
> @@ -659,6 +664,9 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
>      uint32_t tw, th, span;
>      uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
>   
> +   if (!src)
> +      return false;
> +
>      if (tiling == I915_TILING_X) {
>         tw = xtile_width;
>         th = xtile_height;
> @@ -721,6 +729,8 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
>                      mem_copy);
>         }
>      }
> +
> +   return true;
>   }
>   
>   
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
> index 9dc1088..a64e516 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
> @@ -37,7 +37,7 @@
>   
>   typedef void *(*mem_copy_fn)(void *dest, const void *src, size_t n);
>   
> -void
> +bool
>   linear_to_tiled(uint32_t xt1, uint32_t xt2,
>                   uint32_t yt1, uint32_t yt2,
>                   char *dst, const char *src,
> @@ -46,7 +46,7 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
>                   uint32_t tiling,
>                   mem_copy_fn mem_copy);
>   
> -void
> +bool
>   tiled_to_linear(uint32_t xt1, uint32_t xt2,
>                   uint32_t yt1, uint32_t yt2,
>                   char *dst, const char *src,
> diff --git a/src/mesa/drivers/dri/i965/intel_upload.c b/src/mesa/drivers/dri/i965/intel_upload.c
> index 870aabc..0da5936 100644
> --- a/src/mesa/drivers/dri/i965/intel_upload.c
> +++ b/src/mesa/drivers/dri/i965/intel_upload.c
> @@ -36,7 +36,6 @@
>   #include "brw_context.h"
>   #include "intel_blit.h"
>   #include "intel_buffer_objects.h"
> -#include "intel_batchbuffer.h"
>   #include "intel_fbo.h"
>   #include "intel_mipmap_tree.h"
>   
> @@ -50,14 +49,10 @@
>   #define ALIGN_NPOT(value, alignment) \
>      (((value) + (alignment) - 1) / (alignment) * (alignment))
>   
> -void
> +static void
>   intel_upload_finish(struct brw_context *brw)
>   {
> -   if (!brw->upload.bo)
> -      return;
> -
> -   drm_intel_bo_unmap(brw->upload.bo);
> -   drm_intel_bo_unreference(brw->upload.bo);
> +   brw_bo_put(brw->upload.bo);
>      brw->upload.bo = NULL;
>      brw->upload.next_offset = 0;
>   }
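
This relies on brw_bo_put() being NULL-safe now that the early return
is gone. Presumably it follows the free() convention:

   /* sketch of the assumed contract: releases a reference,
    * NULL-safe like free() */
   void brw_bo_put(struct brw_bo *bo);
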
> @@ -89,7 +84,7 @@ void *
>   intel_upload_space(struct brw_context *brw,
>                      uint32_t size,
>                      uint32_t alignment,
> -                   drm_intel_bo **out_bo,
> +                   struct brw_bo **out_bo,
>                      uint32_t *out_offset)
>   {
>      uint32_t offset;
> @@ -101,24 +96,20 @@ intel_upload_space(struct brw_context *brw,
>      }
>   
>      if (!brw->upload.bo) {
> -      brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "streamed data",
> -                                          MAX2(INTEL_UPLOAD_SIZE, size), 4096);
> -      if (brw->has_llc)
> -         drm_intel_bo_map(brw->upload.bo, true);
> -      else
> -         drm_intel_gem_bo_map_gtt(brw->upload.bo);
> +      brw->upload.bo = brw_bo_create(&brw->batch, "streamed data",
> +				     MAX2(INTEL_UPLOAD_SIZE, size), 4096, 0);
>      }
>   
>      brw->upload.next_offset = offset + size;
>   
>      *out_offset = offset;
> +
>      if (*out_bo != brw->upload.bo) {
> -      drm_intel_bo_unreference(*out_bo);
> -      *out_bo = brw->upload.bo;
> -      drm_intel_bo_reference(brw->upload.bo);
> +      brw_bo_put(*out_bo);
> +      *out_bo = brw_bo_get(brw->upload.bo);
>      }
>   
> -   return brw->upload.bo->virtual + offset;
> +   return brw_bo_map(brw->upload.bo, MAP_WRITE | MAP_ASYNC) + offset;
>   }
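
MAP_ASYNC presumably means "don't wait for the GPU" - safe here since
the upload bo is append-only and next_offset never hands out a busy
range. The void-pointer arithmetic on the return value is a GNU
extension, which Mesa already relies on, so fine. Callers keep the
same shape as before, e.g. (illustrative values):

   uint32_t offset;
   struct brw_bo *bo = NULL;

   intel_upload_data(brw, vertices, size, 64 /* alignment */,
                     &bo, &offset);
   /* ... emit a relocation against bo at offset ... */
   brw_bo_put(bo);
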
>   
>   /**
> @@ -131,9 +122,9 @@ intel_upload_data(struct brw_context *brw,
>                     const void *data,
>                     uint32_t size,
>                     uint32_t alignment,
> -                  drm_intel_bo **out_bo,
> +		  struct brw_bo **out_bo,
>                     uint32_t *out_offset)
>   {
> -   void *dst = intel_upload_space(brw, size, alignment, out_bo, out_offset);
> -   memcpy(dst, data, size);
> +   memcpy(intel_upload_space(brw, size, alignment, out_bo, out_offset),
> +	  data, size);
>   }


