Mesa (master): i965: Use state streaming on programs, and state base address on gen5+.

Eric Anholt anholt at kemper.freedesktop.org
Sat Jun 18 23:50:22 UTC 2011


Module: Mesa
Branch: master
Commit: c173541d9769d41a85cc899bc49699a3587df4bf
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=c173541d9769d41a85cc899bc49699a3587df4bf

Author: Eric Anholt <eric at anholt.net>
Date:   Wed Apr 27 13:33:10 2011 -0700

i965: Use state streaming on programs, and state base address on gen5+.

There will be a little bit of thrashing of the program cache BO as the
cache warms up, but once the application is in steady state, this
reduces relocations on gen5 and later.

On my T420 laptop, cairogl firefox-talos-gfx performance improves 2.6%
+/- 1.3% (n=6).  No statistically significant performance difference
on nexuiz (n=5).

---

 src/mesa/drivers/dri/i965/brw_clip.c         |   24 ++--
 src/mesa/drivers/dri/i965/brw_clip_state.c   |   19 ++--
 src/mesa/drivers/dri/i965/brw_context.h      |   55 ++++++---
 src/mesa/drivers/dri/i965/brw_fs.cpp         |    6 +-
 src/mesa/drivers/dri/i965/brw_gs.c           |   24 ++---
 src/mesa/drivers/dri/i965/brw_gs_state.c     |   19 ++--
 src/mesa/drivers/dri/i965/brw_misc_state.c   |   10 +-
 src/mesa/drivers/dri/i965/brw_sf.c           |   22 ++--
 src/mesa/drivers/dri/i965/brw_sf_state.c     |   15 ++-
 src/mesa/drivers/dri/i965/brw_state.h        |   28 ++--
 src/mesa/drivers/dri/i965/brw_state_cache.c  |  164 ++++++++++++--------------
 src/mesa/drivers/dri/i965/brw_state_dump.c   |   26 ++--
 src/mesa/drivers/dri/i965/brw_state_upload.c |   17 ++--
 src/mesa/drivers/dri/i965/brw_vs.c           |   22 ++--
 src/mesa/drivers/dri/i965/brw_vs_state.c     |   18 ++--
 src/mesa/drivers/dri/i965/brw_vtbl.c         |   12 +-
 src/mesa/drivers/dri/i965/brw_wm.c           |   21 ++--
 src/mesa/drivers/dri/i965/brw_wm_state.c     |   40 +++----
 src/mesa/drivers/dri/i965/gen6_gs_state.c    |    2 +-
 src/mesa/drivers/dri/i965/gen6_urb.c         |    2 +-
 src/mesa/drivers/dri/i965/gen6_vs_state.c    |    2 +-
 src/mesa/drivers/dri/i965/gen6_wm_state.c    |   10 +-
 src/mesa/drivers/dri/i965/gen7_disable.c     |    2 +-
 src/mesa/drivers/dri/i965/gen7_urb.c         |    2 +-
 src/mesa/drivers/dri/i965/gen7_vs_state.c    |    2 +-
 src/mesa/drivers/dri/i965/gen7_wm_state.c    |    9 +-
 26 files changed, 276 insertions(+), 297 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index c7d428b..d82206b 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -146,15 +146,12 @@ static void compile_clip_prog( struct brw_context *brw,
       printf("\n");
    }
 
-   /* Upload
-    */
-   drm_intel_bo_unreference(brw->clip.prog_bo);
-   brw->clip.prog_bo = brw_upload_cache(&brw->cache,
-					BRW_CLIP_PROG,
-					&c.key, sizeof(c.key),
-					program, program_size,
-					&c.prog_data, sizeof(c.prog_data),
-					&brw->clip.prog_data);
+   brw_upload_cache(&brw->cache,
+		    BRW_CLIP_PROG,
+		    &c.key, sizeof(c.key),
+		    program, program_size,
+		    &c.prog_data, sizeof(c.prog_data),
+		    &brw->clip.prog_offset, &brw->clip.prog_data);
    ralloc_free(mem_ctx);
 }
 
@@ -271,12 +268,11 @@ static void upload_clip_prog(struct brw_context *brw)
       }
    }
 
-   drm_intel_bo_unreference(brw->clip.prog_bo);
-   brw->clip.prog_bo = brw_search_cache(&brw->cache, BRW_CLIP_PROG,
-					&key, sizeof(key),
-					&brw->clip.prog_data);
-   if (brw->clip.prog_bo == NULL)
+   if (!brw_search_cache(&brw->cache, BRW_CLIP_PROG,
+			 &key, sizeof(key),
+			 &brw->clip.prog_offset, &brw->clip.prog_data)) {
       compile_clip_prog( brw, &key );
+   }
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 6015c8c..b9efbb7 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -43,11 +43,15 @@ brw_prepare_clip_unit(struct brw_context *brw)
    clip = brw_state_batch(brw, sizeof(*clip), 32, &brw->clip.state_offset);
    memset(clip, 0, sizeof(*clip));
 
-   /* CACHE_NEW_CLIP_PROG */
+   /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_CLIP_PROG */
    clip->thread0.grf_reg_count = (ALIGN(brw->clip.prog_data->total_grf, 16) /
 				 16 - 1);
-   /* reloc */
-   clip->thread0.kernel_start_pointer = brw->clip.prog_bo->offset >> 6;
+   clip->thread0.kernel_start_pointer =
+      brw_program_reloc(brw,
+			brw->clip.state_offset +
+			offsetof(struct brw_clip_unit_state, thread0),
+			brw->clip.prog_offset +
+			(clip->thread0.grf_reg_count << 1)) >> 6;
 
    clip->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
    clip->thread1.single_program_flow = 1;
@@ -110,14 +114,6 @@ brw_prepare_clip_unit(struct brw_context *brw)
    clip->viewport_ymin = -1;
    clip->viewport_ymax = 1;
 
-   /* Emit clip program relocation */
-   assert(brw->clip.prog_bo);
-   drm_intel_bo_emit_reloc(intel->batch.bo,
-			   (brw->clip.state_offset +
-			    offsetof(struct brw_clip_unit_state, thread0)),
-			   brw->clip.prog_bo, clip->thread0.grf_reg_count << 1,
-			   I915_GEM_DOMAIN_INSTRUCTION, 0);
-
    brw->state.dirty.cache |= CACHE_NEW_CLIP_UNIT;
 }
 
@@ -125,6 +121,7 @@ const struct brw_tracked_state brw_clip_unit = {
    .dirty = {
       .mesa  = _NEW_TRANSFORM,
       .brw   = (BRW_NEW_BATCH |
+		BRW_NEW_PROGRAM_CACHE |
 		BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_CLIP_PROG
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 621b6f8..16b71f6 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -142,7 +142,8 @@ enum brw_state_id {
    BRW_STATE_NR_VS_SURFACES,
    BRW_STATE_INDEX_BUFFER,
    BRW_STATE_VS_CONSTBUF,
-   BRW_STATE_WM_CONSTBUF
+   BRW_STATE_WM_CONSTBUF,
+   BRW_STATE_PROGRAM_CACHE,
 };
 
 #define BRW_NEW_URB_FENCE               (1 << BRW_STATE_URB_FENCE)
@@ -172,6 +173,7 @@ enum brw_state_id {
 #define BRW_NEW_INDEX_BUFFER           (1 << BRW_STATE_INDEX_BUFFER)
 #define BRW_NEW_VS_CONSTBUF            (1 << BRW_STATE_VS_CONSTBUF)
 #define BRW_NEW_WM_CONSTBUF            (1 << BRW_STATE_WM_CONSTBUF)
+#define BRW_NEW_PROGRAM_CACHE		(1 << BRW_STATE_PROGRAM_CACHE)
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
@@ -365,7 +367,8 @@ struct brw_cache_item {
    GLuint key_size;		/* for variable-sized keys */
    const void *key;
 
-   drm_intel_bo *bo;
+   uint32_t offset;
+   uint32_t size;
 
    struct brw_cache_item *next;
 };   
@@ -376,14 +379,11 @@ struct brw_cache {
    struct brw_context *brw;
 
    struct brw_cache_item **items;
+   drm_intel_bo *bo;
    GLuint size, n_items;
 
-   char *name[BRW_MAX_CACHE];
-
-   /* Record of the last BOs chosen for each cache_id.  Used to set
-    * brw->state.dirty.cache when a new cache item is chosen.
-    */
-   drm_intel_bo *last_bo[BRW_MAX_CACHE];
+   uint32_t next_offset;
+   bool bo_used_by_gpu;
 };
 
 
@@ -634,8 +634,9 @@ struct brw_context
       struct brw_vs_prog_data *prog_data;
       int8_t *constant_map; /* variable array following prog_data */
 
-      drm_intel_bo *prog_bo;
       drm_intel_bo *const_bo;
+      /** Offset in the program cache to the VS program */
+      uint32_t prog_offset;
       uint32_t state_offset;
 
       /** Binding table of pointers to surf_bo entries */
@@ -651,14 +652,16 @@ struct brw_context
       struct brw_gs_prog_data *prog_data;
 
       GLboolean prog_active;
+      /** Offset in the program cache to the GS program */
+      uint32_t prog_offset;
       uint32_t state_offset;
-      drm_intel_bo *prog_bo;
    } gs;
 
    struct {
       struct brw_clip_prog_data *prog_data;
 
-      drm_intel_bo *prog_bo;
+      /** Offset in the program cache to the CLIP program pre-gen6 */
+      uint32_t prog_offset;
 
       /* Offset in the batch to the CLIP state on pre-gen6. */
       uint32_t state_offset;
@@ -673,7 +676,8 @@ struct brw_context
    struct {
       struct brw_sf_prog_data *prog_data;
 
-      drm_intel_bo *prog_bo;
+      /** Offset in the program cache to the SF program pre-gen6 */
+      uint32_t prog_offset;
       uint32_t state_offset;
       uint32_t vp_offset;
    } sf;
@@ -700,12 +704,14 @@ struct brw_context
       GLuint sampler_count;
       uint32_t sampler_offset;
 
+      /** Offset in the program cache to the WM program */
+      uint32_t prog_offset;
+
       /** Binding table of pointers to surf_bo entries */
       uint32_t bind_bo_offset;
       uint32_t surf_offset[BRW_WM_MAX_SURF];
       uint32_t state_offset; /* offset in batchbuffer to pre-gen6 WM state */
 
-      drm_intel_bo *prog_bo;
       drm_intel_bo *const_bo; /* pull constant buffer. */
       /**
        * This is offset in the batch to the push constants on gen6.
@@ -717,9 +723,6 @@ struct brw_context
 
 
    struct {
-      /* gen4 */
-      drm_intel_bo *prog_bo;
-
       uint32_t state_offset;
       uint32_t blend_state_offset;
       uint32_t depth_stencil_state_offset;
@@ -874,6 +877,26 @@ brw_register_blocks(int reg_count)
    return ALIGN(reg_count, 16) / 16 - 1;
 }
 
+static inline uint32_t
+brw_program_reloc(struct brw_context *brw, uint32_t state_offset,
+		  uint32_t prog_offset)
+{
+   struct intel_context *intel = &brw->intel;
+
+   if (intel->gen >= 5) {
+      /* Using state base address. */
+      return prog_offset;
+   }
+
+   drm_intel_bo_emit_reloc(intel->batch.bo,
+			   state_offset,
+			   brw->cache.bo,
+			   prog_offset,
+			   I915_GEM_DOMAIN_INSTRUCTION, 0);
+
+   return brw->cache.bo->offset + prog_offset;
+}
+
 GLboolean brw_do_cubemap_normalize(struct exec_list *instructions);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 7c73a8f..8580c78 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1697,14 +1697,12 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
 
    key.program_string_id = bfp->id;
 
-   drm_intel_bo *old_prog_bo = brw->wm.prog_bo;
+   uint32_t old_prog_offset = brw->wm.prog_offset;
    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
-   brw->wm.prog_bo = NULL;
 
    bool success = do_wm_prog(brw, prog, bfp, &key);
 
-   drm_intel_bo_unreference(brw->wm.prog_bo);
-   brw->wm.prog_bo = old_prog_bo;
+   brw->wm.prog_offset = old_prog_offset;
    brw->wm.prog_data = old_prog_data;
 
    return success;
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 001cd62..3171e97 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -121,14 +121,11 @@ static void compile_gs_prog( struct brw_context *brw,
       printf("\n");
     }
 
-   /* Upload
-    */
-   drm_intel_bo_unreference(brw->gs.prog_bo);
-   brw->gs.prog_bo = brw_upload_cache(&brw->cache, BRW_GS_PROG,
-				      &c.key, sizeof(c.key),
-				      program, program_size,
-				      &c.prog_data, sizeof(c.prog_data),
-				      &brw->gs.prog_data);
+   brw_upload_cache(&brw->cache, BRW_GS_PROG,
+		    &c.key, sizeof(c.key),
+		    program, program_size,
+		    &c.prog_data, sizeof(c.prog_data),
+		    &brw->gs.prog_offset, &brw->gs.prog_data);
    ralloc_free(mem_ctx);
 }
 
@@ -189,15 +186,12 @@ static void prepare_gs_prog(struct brw_context *brw)
       brw->gs.prog_active = key.need_gs_prog;
    }
 
-   drm_intel_bo_unreference(brw->gs.prog_bo);
-   brw->gs.prog_bo = NULL;
-
    if (brw->gs.prog_active) {
-      brw->gs.prog_bo = brw_search_cache(&brw->cache, BRW_GS_PROG,
-					 &key, sizeof(key),
-					 &brw->gs.prog_data);
-      if (brw->gs.prog_bo == NULL)
+      if (!brw_search_cache(&brw->cache, BRW_GS_PROG,
+			    &key, sizeof(key),
+			    &brw->gs.prog_offset, &brw->gs.prog_data)) {
 	 compile_gs_prog( brw, &key );
+      }
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index 542874b..bbfefcd 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -45,12 +45,17 @@ brw_prepare_gs_unit(struct brw_context *brw)
 
    memset(gs, 0, sizeof(*gs));
 
-   /* CACHE_NEW_GS_PROG */
+   /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_GS_PROG */
    if (brw->gs.prog_active) {
       gs->thread0.grf_reg_count = (ALIGN(brw->gs.prog_data->total_grf, 16) /
 				   16 - 1);
-      /* reloc */
-      gs->thread0.kernel_start_pointer = brw->gs.prog_bo->offset >> 6;
+
+      gs->thread0.kernel_start_pointer =
+	 brw_program_reloc(brw,
+			   brw->gs.state_offset +
+			   offsetof(struct brw_gs_unit_state, thread0),
+			   brw->gs.prog_offset +
+			   (gs->thread0.grf_reg_count << 1)) >> 6;
 
       gs->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
       gs->thread1.single_program_flow = 1;
@@ -69,13 +74,6 @@ brw_prepare_gs_unit(struct brw_context *brw)
 	 gs->thread4.max_threads = 1;
       else
 	 gs->thread4.max_threads = 0;
-
-      /* Emit GS program relocation */
-      drm_intel_bo_emit_reloc(intel->batch.bo,
-			      (brw->gs.state_offset +
-			       offsetof(struct brw_gs_unit_state, thread0)),
-			      brw->gs.prog_bo, gs->thread0.grf_reg_count << 1,
-			      I915_GEM_DOMAIN_INSTRUCTION, 0);
    }
 
    if (intel->gen == 5)
@@ -91,6 +89,7 @@ const struct brw_tracked_state brw_gs_unit = {
    .dirty = {
       .mesa  = 0,
       .brw   = (BRW_NEW_BATCH |
+		BRW_NEW_PROGRAM_CACHE |
 		BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_GS_PROG
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 1f3b64f..b0f95dd 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -706,7 +706,9 @@ static void upload_state_base_address( struct brw_context *brw )
 				   I915_GEM_DOMAIN_INSTRUCTION), 0, 1);
 
        OUT_BATCH(1); /* Indirect object base address: MEDIA_OBJECT data */
-       OUT_BATCH(1); /* Instruction base address: shader kernels (incl. SIP) */
+       OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+		 1); /* Instruction base address: shader kernels (incl. SIP) */
+
        OUT_BATCH(1); /* General state upper bound */
        OUT_BATCH(1); /* Dynamic state upper bound */
        OUT_BATCH(1); /* Indirect object upper bound */
@@ -719,7 +721,8 @@ static void upload_state_base_address( struct brw_context *brw )
        OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0,
 		 1); /* Surface state base address */
        OUT_BATCH(1); /* Indirect object base address */
-       OUT_BATCH(1); /* Instruction base address */
+       OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
+		 1); /* Instruction base address */
        OUT_BATCH(1); /* General state upper bound */
        OUT_BATCH(1); /* Indirect object upper bound */
        OUT_BATCH(1); /* Instruction access upper bound */
@@ -740,7 +743,8 @@ static void upload_state_base_address( struct brw_context *brw )
 const struct brw_tracked_state brw_state_base_address = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_BATCH,
+      .brw = (BRW_NEW_BATCH |
+	      BRW_NEW_PROGRAM_CACHE),
       .cache = 0,
    },
    .emit = upload_state_base_address
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index c222777..fca30a7 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -120,14 +120,11 @@ static void compile_sf_prog( struct brw_context *brw,
       printf("\n");
    }
 
-   /* Upload
-    */
-   drm_intel_bo_unreference(brw->sf.prog_bo);
-   brw->sf.prog_bo = brw_upload_cache(&brw->cache, BRW_SF_PROG,
-				      &c.key, sizeof(c.key),
-				      program, program_size,
-				      &c.prog_data, sizeof(c.prog_data),
-				      &brw->sf.prog_data);
+   brw_upload_cache(&brw->cache, BRW_SF_PROG,
+		    &c.key, sizeof(c.key),
+		    program, program_size,
+		    &c.prog_data, sizeof(c.prog_data),
+		    &brw->sf.prog_offset, &brw->sf.prog_data);
    ralloc_free(mem_ctx);
 }
 
@@ -191,12 +188,11 @@ static void upload_sf_prog(struct brw_context *brw)
       key.frontface_ccw = (ctx->Polygon.FrontFace == GL_CCW) ^ (ctx->DrawBuffer->Name != 0);
    }
 
-   drm_intel_bo_unreference(brw->sf.prog_bo);
-   brw->sf.prog_bo = brw_search_cache(&brw->cache, BRW_SF_PROG,
-				      &key, sizeof(key),
-				      &brw->sf.prog_data);
-   if (brw->sf.prog_bo == NULL)
+   if (!brw_search_cache(&brw->cache, BRW_SF_PROG,
+			 &key, sizeof(key),
+			 &brw->sf.prog_offset, &brw->sf.prog_data)) {
       compile_sf_prog( brw, &key );
+   }
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 78b22c4..eb3d103 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -133,9 +133,14 @@ static void upload_sf_unit( struct brw_context *brw )
 
    memset(sf, 0, sizeof(*sf));
 
-   /* CACHE_NEW_SF_PROG */
+   /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_SF_PROG */
    sf->thread0.grf_reg_count = ALIGN(brw->sf.prog_data->total_grf, 16) / 16 - 1;
-   sf->thread0.kernel_start_pointer = brw->sf.prog_bo->offset >> 6; /* reloc */
+   sf->thread0.kernel_start_pointer =
+      brw_program_reloc(brw,
+			brw->sf.state_offset +
+			offsetof(struct brw_sf_unit_state, thread0),
+			brw->sf.prog_offset +
+			(sf->thread0.grf_reg_count << 1)) >> 6;
 
    sf->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
 
@@ -282,11 +287,6 @@ static void upload_sf_unit( struct brw_context *brw )
    /* STATE_PREFETCH command description describes this state as being
     * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
     */
-   /* Emit SF program relocation */
-   drm_intel_bo_emit_reloc(bo, (brw->sf.state_offset +
-				offsetof(struct brw_sf_unit_state, thread0)),
-			   brw->sf.prog_bo, sf->thread0.grf_reg_count << 1,
-			   I915_GEM_DOMAIN_INSTRUCTION, 0);
 
    /* Emit SF viewport relocation */
    drm_intel_bo_emit_reloc(bo, (brw->sf.state_offset +
@@ -308,6 +308,7 @@ const struct brw_tracked_state brw_sf_unit = {
 		_NEW_SCISSOR |
 		_NEW_BUFFERS),
       .brw   = (BRW_NEW_BATCH |
+		BRW_NEW_PROGRAM_CACHE |
 		BRW_NEW_URB_FENCE),
       .cache = (CACHE_NEW_SF_VP |
 		CACHE_NEW_SF_PROG)
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 544ef7d..b384651 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -145,21 +145,21 @@ void brw_clear_validated_bos(struct brw_context *brw);
  * brw_state_cache.c
  */
 
-drm_intel_bo *brw_upload_cache(struct brw_cache *cache,
-			       enum brw_cache_id cache_id,
-			       const void *key,
-			       GLuint key_sz,
-			       const void *data,
-			       GLuint data_sz,
-			       const void *aux,
-			       GLuint aux_sz,
-			       void *aux_return);
-
-drm_intel_bo *brw_search_cache( struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *key,
-			  GLuint key_size,
-			  void *aux_return);
+void brw_upload_cache(struct brw_cache *cache,
+		      enum brw_cache_id cache_id,
+		      const void *key,
+		      GLuint key_sz,
+		      const void *data,
+		      GLuint data_sz,
+		      const void *aux,
+		      GLuint aux_sz,
+		      uint32_t *out_offset, void *out_aux);
+
+bool brw_search_cache(struct brw_cache *cache,
+		      enum brw_cache_id cache_id,
+		      const void *key,
+		      GLuint key_size,
+		      uint32_t *inout_offset, void *out_aux);
 void brw_state_cache_check_size( struct brw_context *brw );
 
 void brw_init_caches( struct brw_context *brw );
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index f13a41f..d13711b 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -45,6 +45,7 @@
  */
 
 #include "main/imports.h"
+#include "intel_batchbuffer.h"
 #include "brw_state.h"
 
 #define FILE_DEBUG_FLAG DEBUG_STATE
@@ -67,23 +68,6 @@ hash_key(struct brw_cache_item *item)
    return hash;
 }
 
-
-/**
- * Marks a new buffer as being chosen for the given cache id.
- */
-static void
-update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
-		  drm_intel_bo *bo)
-{
-   if (bo == cache->last_bo[cache_id])
-      return; /* no change */
-
-   drm_intel_bo_unreference(cache->last_bo[cache_id]);
-   cache->last_bo[cache_id] = bo;
-   drm_intel_bo_reference(cache->last_bo[cache_id]);
-   cache->brw->state.dirty.cache |= 1 << cache_id;
-}
-
 static int
 brw_cache_item_equals(const struct brw_cache_item *a,
 		      const struct brw_cache_item *b)
@@ -145,12 +129,13 @@ rehash(struct brw_cache *cache)
 /**
  * Returns the buffer object matching cache_id and key, or NULL.
  */
-drm_intel_bo *
+bool
 brw_search_cache(struct brw_cache *cache,
                  enum brw_cache_id cache_id,
                  const void *key, GLuint key_size,
-                 void *aux_return)
+                 uint32_t *inout_offset, void *out_aux)
 {
+   struct brw_context *brw = cache->brw;
    struct brw_cache_item *item;
    struct brw_cache_item lookup;
    GLuint hash;
@@ -164,19 +149,45 @@ brw_search_cache(struct brw_cache *cache,
    item = search_cache(cache, hash, &lookup);
 
    if (item == NULL)
-      return NULL;
+      return false;
 
-   if (aux_return)
-      *(void **)aux_return = (void *)((char *)item->key + item->key_size);
+   *(void **)out_aux = ((char *)item->key + item->key_size);
 
-   update_cache_last(cache, cache_id, item->bo);
+   if (item->offset != *inout_offset) {
+      brw->state.dirty.cache |= (1 << cache_id);
+      *inout_offset = item->offset;
+   }
 
-   drm_intel_bo_reference(item->bo);
-   return item->bo;
+   return true;
 }
 
+static void
+brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
+{
+   struct brw_context *brw = cache->brw;
+   struct intel_context *intel = &brw->intel;
+   drm_intel_bo *new_bo;
+
+   new_bo = drm_intel_bo_alloc(intel->bufmgr, "program cache", new_size, 64);
 
-drm_intel_bo *
+   /* Copy any existing data that needs to be saved. */
+   if (cache->next_offset != 0) {
+      drm_intel_bo_map(cache->bo, false);
+      drm_intel_bo_subdata(new_bo, 0, cache->next_offset, cache->bo->virtual);
+      drm_intel_bo_unmap(cache->bo);
+   }
+
+   drm_intel_bo_unreference(cache->bo);
+   cache->bo = new_bo;
+   cache->bo_used_by_gpu = false;
+
+   /* Since we have a new BO in place, we need to signal the units
+    * that depend on it (state base address on gen5+, or unit state before).
+    */
+   brw->state.dirty.brw |= BRW_NEW_PROGRAM_CACHE;
+}
+
+void
 brw_upload_cache(struct brw_cache *cache,
 		 enum brw_cache_id cache_id,
 		 const void *key,
@@ -185,12 +196,12 @@ brw_upload_cache(struct brw_cache *cache,
 		 GLuint data_size,
 		 const void *aux,
 		 GLuint aux_size,
-		 void *aux_return)
+		 uint32_t *out_offset,
+		 void *out_aux)
 {
    struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
    GLuint hash;
    void *tmp;
-   drm_intel_bo *bo;
 
    item->cache_id = cache_id;
    item->key = key;
@@ -198,10 +209,28 @@ brw_upload_cache(struct brw_cache *cache,
    hash = hash_key(item);
    item->hash = hash;
 
-   /* Create the buffer object to contain the data */
-   bo = drm_intel_bo_alloc(cache->brw->intel.bufmgr,
-			   cache->name[cache_id], data_size, 1 << 6);
+   /* Allocate space in the cache BO for our new program. */
+   if (cache->next_offset + data_size > cache->bo->size) {
+      uint32_t new_size = cache->bo->size * 2;
+
+      while (cache->next_offset + data_size > new_size)
+	 new_size *= 2;
+
+      brw_cache_new_bo(cache, new_size);
+   }
+
+   /* If we would block on writing to an in-use program BO, just
+    * recreate it.
+    */
+   if (cache->bo_used_by_gpu) {
+      brw_cache_new_bo(cache, cache->bo->size);
+   }
+
+   item->offset = cache->next_offset;
+   item->size = data_size;
 
+   /* Programs are always 64-byte aligned, so set up the next one now */
+   cache->next_offset = ALIGN(item->offset + data_size, 64);
 
    /* Set up the memory containing the key and aux_data */
    tmp = malloc(key_size + aux_size);
@@ -211,9 +240,6 @@ brw_upload_cache(struct brw_cache *cache,
 
    item->key = tmp;
 
-   item->bo = bo;
-   drm_intel_bo_reference(bo);
-
    if (cache->n_items > cache->size * 1.5)
       rehash(cache);
 
@@ -222,34 +248,18 @@ brw_upload_cache(struct brw_cache *cache,
    cache->items[hash] = item;
    cache->n_items++;
 
-   if (aux_return) {
-      *(void **)aux_return = (void *)((char *)item->key + item->key_size);
-   }
-
-   DBG("upload %s: %d bytes to cache id %d\n",
-       cache->name[cache_id],
-       data_size, cache_id);
-
    /* Copy data to the buffer */
-   drm_intel_bo_subdata(bo, 0, data_size, data);
-
-   update_cache_last(cache, cache_id, bo);
+   drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
 
-   return bo;
-}
-
-static void
-brw_init_cache_id(struct brw_cache *cache,
-                  const char *name,
-                  enum brw_cache_id id)
-{
-   cache->name[id] = strdup(name);
+   *out_offset = item->offset;
+   *(void **)out_aux = (void *)((char *)item->key + item->key_size);
+   cache->brw->state.dirty.cache |= 1 << cache_id;
 }
 
-
 void
 brw_init_caches(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_cache *cache = &brw->cache;
 
    cache->brw = brw;
@@ -259,36 +269,15 @@ brw_init_caches(struct brw_context *brw)
    cache->items = (struct brw_cache_item **)
       calloc(1, cache->size * sizeof(struct brw_cache_item));
 
-   brw_init_cache_id(cache, "CC_VP", BRW_CC_VP);
-   brw_init_cache_id(cache, "CC_UNIT", BRW_CC_UNIT);
-   brw_init_cache_id(cache, "WM_PROG", BRW_WM_PROG);
-   brw_init_cache_id(cache, "SAMPLER", BRW_SAMPLER);
-   brw_init_cache_id(cache, "WM_UNIT", BRW_WM_UNIT);
-   brw_init_cache_id(cache, "SF_PROG", BRW_SF_PROG);
-   brw_init_cache_id(cache, "SF_VP", BRW_SF_VP);
-
-   brw_init_cache_id(cache, "SF_UNIT", BRW_SF_UNIT);
-
-   brw_init_cache_id(cache, "VS_UNIT", BRW_VS_UNIT);
-
-   brw_init_cache_id(cache, "VS_PROG", BRW_VS_PROG);
-
-   brw_init_cache_id(cache, "CLIP_UNIT", BRW_CLIP_UNIT);
-
-   brw_init_cache_id(cache, "CLIP_PROG", BRW_CLIP_PROG);
-   brw_init_cache_id(cache, "CLIP_VP", BRW_CLIP_VP);
-
-   brw_init_cache_id(cache, "GS_UNIT", BRW_GS_UNIT);
-
-   brw_init_cache_id(cache, "GS_PROG", BRW_GS_PROG);
-   brw_init_cache_id(cache, "BLEND_STATE", BRW_BLEND_STATE);
-   brw_init_cache_id(cache, "COLOR_CALC_STATE", BRW_COLOR_CALC_STATE);
-   brw_init_cache_id(cache, "DEPTH_STENCIL_STATE", BRW_DEPTH_STENCIL_STATE);
+   cache->bo = drm_intel_bo_alloc(intel->bufmgr,
+				  "program cache",
+				  4096, 64);
 }
 
 static void
 brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 {
+   struct intel_context *intel = &brw->intel;
    struct brw_cache_item *c, *next;
    GLuint i;
 
@@ -297,7 +286,6 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
    for (i = 0; i < cache->size; i++) {
       for (c = cache->items[i]; c; c = next) {
 	 next = c->next;
-	 drm_intel_bo_unreference(c->bo);
 	 free((void *)c->key);
 	 free(c);
       }
@@ -306,9 +294,18 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 
    cache->n_items = 0;
 
+   /* Start putting programs into the start of the BO again, since
+    * we'll never find the old results.
+    */
+   cache->next_offset = 0;
+
+   /* We need to make sure that the programs get regenerated, since
+    * any offsets leftover in brw_context will no longer be valid.
+    */
    brw->state.dirty.mesa |= ~0;
    brw->state.dirty.brw |= ~0;
    brw->state.dirty.cache |= ~0;
+   intel_batchbuffer_flush(intel);
 }
 
 void
@@ -325,15 +322,10 @@ brw_state_cache_check_size(struct brw_context *brw)
 static void
 brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 {
-   GLuint i;
 
    DBG("%s\n", __FUNCTION__);
 
    brw_clear_cache(brw, cache);
-   for (i = 0; i < BRW_MAX_CACHE; i++) {
-      drm_intel_bo_unreference(cache->last_bo[i]);
-      free(cache->name[i]);
-   }
    free(cache->items);
    cache->items = NULL;
    cache->size = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index ff06cb3..7a3a88f 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -459,21 +459,19 @@ static void dump_blend_state(struct brw_context *brw)
 
 }
 
-static void brw_debug_prog(const char *name, drm_intel_bo *prog)
+static void brw_debug_prog(struct brw_context *brw,
+			   const char *name, uint32_t prog_offset)
 {
    unsigned int i;
    uint32_t *data;
 
-   if (prog == NULL)
-      return;
-
-   drm_intel_bo_map(prog, GL_FALSE);
+   drm_intel_bo_map(brw->cache.bo, false);
 
-   data = prog->virtual;
+   data = brw->cache.bo->virtual + prog_offset;
 
-   for (i = 0; i < prog->size / 4 / 4; i++) {
+   for (i = 0; i < brw->cache.bo->size / 4 / 4; i++) {
       fprintf(stderr, "%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n",
-	      name, (unsigned int)prog->offset + i * 4 * 4,
+	      name, (unsigned int)brw->cache.bo->offset + i * 4 * 4,
 	      data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3]);
       /* Stop at the end of the program.  It'd be nice to keep track of the actual
        * intended program size instead of guessing like this.
@@ -485,7 +483,7 @@ static void brw_debug_prog(const char *name, drm_intel_bo *prog)
 	 break;
    }
 
-   drm_intel_bo_unmap(prog);
+   drm_intel_bo_unmap(brw->cache.bo);
 }
 
 
@@ -518,17 +516,19 @@ void brw_debug_batch(struct intel_context *intel)
    if (intel->gen < 6)
        state_struct_out("VS", intel->batch.bo, brw->vs.state_offset,
 			sizeof(struct brw_vs_unit_state));
-   brw_debug_prog("VS prog", brw->vs.prog_bo);
+   brw_debug_prog(brw, "VS prog", brw->vs.prog_offset);
 
    if (intel->gen < 6)
        state_struct_out("GS", intel->batch.bo, brw->gs.state_offset,
 			sizeof(struct brw_gs_unit_state));
-   brw_debug_prog("GS prog", brw->gs.prog_bo);
+   if (brw->gs.prog_active) {
+      brw_debug_prog(brw, "GS prog", brw->gs.prog_offset);
+   }
 
    if (intel->gen < 6) {
       state_struct_out("SF", intel->batch.bo, brw->sf.state_offset,
 		       sizeof(struct brw_sf_unit_state));
-      brw_debug_prog("SF prog", brw->sf.prog_bo);
+      brw_debug_prog(brw, "SF prog", brw->sf.prog_offset);
    }
    if (intel->gen >= 7)
       dump_sf_clip_viewport_state(brw);
@@ -540,7 +540,7 @@ void brw_debug_batch(struct intel_context *intel)
    if (intel->gen < 6)
        state_struct_out("WM", intel->batch.bo, brw->wm.state_offset,
 			sizeof(struct brw_wm_unit_state));
-   brw_debug_prog("WM prog", brw->wm.prog_bo);
+   brw_debug_prog(brw, "WM prog", brw->wm.prog_offset);
 
    if (intel->gen >= 6) {
 	dump_cc_viewport_state(brw);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 6a4c112..50ab490 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -47,11 +47,11 @@ static const struct brw_tracked_state *gen4_atoms[] =
    &brw_check_fallback,
 
    &brw_wm_input_sizes,
-   &brw_vs_prog,
-   &brw_gs_prog, 
-   &brw_clip_prog, 
-   &brw_sf_prog,
-   &brw_wm_prog,
+   &brw_vs_prog, /* must do before GS prog, state base address. */
+   &brw_gs_prog, /* must do before state base address */
+   &brw_clip_prog, /* must do before state base address */
+   &brw_sf_prog, /* must do before state base address */
+   &brw_wm_prog, /* must do before state base address */
 
    /* Once all the programs are done, we know how large urb entry
     * sizes need to be and can decide if we need to change the urb
@@ -110,9 +110,9 @@ static const struct brw_tracked_state *gen6_atoms[] =
    &brw_check_fallback,
 
    &brw_wm_input_sizes,
-   &brw_vs_prog,
-   &brw_gs_prog,
-   &brw_wm_prog,
+   &brw_vs_prog, /* must do before state base address */
+   &brw_gs_prog, /* must do before state base address */
+   &brw_wm_prog, /* must do before state base address */
 
    &gen6_clip_vp,
    &gen6_sf_vp,
@@ -365,6 +365,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_PRIMITIVE),
    DEFINE_BIT(BRW_NEW_CONTEXT),
    DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
+   DEFINE_BIT(BRW_NEW_PROGRAM_CACHE),
    DEFINE_BIT(BRW_NEW_PSP),
    DEFINE_BIT(BRW_NEW_WM_SURFACES),
    DEFINE_BIT(BRW_NEW_INDICES),
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 80d5e78..a9ad531 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -105,12 +105,11 @@ static void do_vs_prog( struct brw_context *brw,
    /* constant_map */
    aux_size += c.vp->program.Base.Parameters->NumParameters;
 
-   drm_intel_bo_unreference(brw->vs.prog_bo);
-   brw->vs.prog_bo = brw_upload_cache(&brw->cache, BRW_VS_PROG,
-				      &c.key, sizeof(c.key),
-				      program, program_size,
-				      &c.prog_data, aux_size,
-				      &brw->vs.prog_data);
+   brw_upload_cache(&brw->cache, BRW_VS_PROG,
+		    &c.key, sizeof(c.key),
+		    program, program_size,
+		    &c.prog_data, aux_size,
+		    &brw->vs.prog_offset, &brw->vs.prog_data);
    ralloc_free(mem_ctx);
 }
 
@@ -153,14 +152,11 @@ static void brw_upload_vs_prog(struct brw_context *brw)
       }
    }
 
-   /* Make an early check for the key.
-    */
-   drm_intel_bo_unreference(brw->vs.prog_bo);
-   brw->vs.prog_bo = brw_search_cache(&brw->cache, BRW_VS_PROG,
-				      &key, sizeof(key),
-				      &brw->vs.prog_data);
-   if (brw->vs.prog_bo == NULL)
+   if (!brw_search_cache(&brw->cache, BRW_VS_PROG,
+			 &key, sizeof(key),
+			 &brw->vs.prog_offset, &brw->vs.prog_data)) {
       do_vs_prog(brw, vp, &key);
+   }
    brw->vs.constant_map = ((int8_t *)brw->vs.prog_data +
 			   sizeof(*brw->vs.prog_data));
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index 1eee5b7..185020c 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -58,8 +58,14 @@ brw_prepare_vs_unit(struct brw_context *brw)
    vs = brw_state_batch(brw, sizeof(*vs), 32, &brw->vs.state_offset);
    memset(vs, 0, sizeof(*vs));
 
-   /* CACHE_NEW_VS_PROG */
-   vs->thread0.kernel_start_pointer = brw->vs.prog_bo->offset >> 6; /* reloc */
+   /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_VS_PROG */
+   vs->thread0.kernel_start_pointer =
+      brw_program_reloc(brw,
+			brw->vs.state_offset +
+			offsetof(struct brw_vs_unit_state, thread0),
+			brw->vs.prog_offset +
+			(vs->thread0.grf_reg_count << 1)) >> 6;
+
    vs->thread0.grf_reg_count = ALIGN(brw->vs.prog_data->total_grf, 16) / 16 - 1;
    vs->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
    /* Choosing multiple program flow means that we may get 2-vertex threads,
@@ -152,13 +158,6 @@ brw_prepare_vs_unit(struct brw_context *brw)
     */
    vs->vs6.vs_enable = 1;
 
-   /* Emit VS program relocation */
-   drm_intel_bo_emit_reloc(intel->batch.bo, (brw->vs.state_offset +
-					     offsetof(struct brw_vs_unit_state,
-						      thread0)),
-			   brw->vs.prog_bo, vs->thread0.grf_reg_count << 1,
-			   I915_GEM_DOMAIN_INSTRUCTION, 0);
-
    brw->state.dirty.cache |= CACHE_NEW_VS_UNIT;
 }
 
@@ -166,6 +165,7 @@ const struct brw_tracked_state brw_vs_unit = {
    .dirty = {
       .mesa  = _NEW_TRANSFORM,
       .brw   = (BRW_NEW_BATCH |
+		BRW_NEW_PROGRAM_CACHE |
 		BRW_NEW_CURBE_OFFSETS |
                 BRW_NEW_NR_VS_SURFACES |
 		BRW_NEW_URB_FENCE),
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 236c4d2..0f73148 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -69,14 +69,8 @@ static void brw_destroy_context( struct intel_context *intel )
    ralloc_free(brw->wm.compile_data);
 
    dri_bo_release(&brw->curbe.curbe_bo);
-   dri_bo_release(&brw->vs.prog_bo);
    dri_bo_release(&brw->vs.const_bo);
-   dri_bo_release(&brw->gs.prog_bo);
-   dri_bo_release(&brw->clip.prog_bo);
-   dri_bo_release(&brw->sf.prog_bo);
-   dri_bo_release(&brw->wm.prog_bo);
    dri_bo_release(&brw->wm.const_bo);
-   dri_bo_release(&brw->cc.prog_bo);
 
    free(brw->curbe.last_buf);
    free(brw->curbe.next_buf);
@@ -125,6 +119,12 @@ static void brw_new_batch( struct intel_context *intel )
    brw->state.dirty.brw |= BRW_NEW_CONTEXT | BRW_NEW_BATCH;
 
    brw->vb.nr_current_buffers = 0;
+
+   /* Mark that the current program cache BO has been used by the GPU.
+    * It will be reallocated if we need to put new programs in for the
+    * next batch.
+    */
+   brw->cache.bo_used_by_gpu = true;
 }
 
 static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 1aebd12..f1c9985 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -273,12 +273,11 @@ bool do_wm_prog(struct brw_context *brw,
     */
    program = brw_get_program(&c->func, &program_size);
 
-   drm_intel_bo_unreference(brw->wm.prog_bo);
-   brw->wm.prog_bo = brw_upload_cache(&brw->cache, BRW_WM_PROG,
-				      &c->key, sizeof(c->key),
-				      program, program_size,
-				      &c->prog_data, sizeof(c->prog_data),
-				      &brw->wm.prog_data);
+   brw_upload_cache(&brw->cache, BRW_WM_PROG,
+		    &c->key, sizeof(c->key),
+		    program, program_size,
+		    &c->prog_data, sizeof(c->prog_data),
+		    &brw->wm.prog_offset, &brw->wm.prog_data);
 
    return true;
 }
@@ -477,13 +476,9 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
 
    brw_wm_populate_key(brw, &key);
 
-   /* Make an early check for the key.
-    */
-   drm_intel_bo_unreference(brw->wm.prog_bo);
-   brw->wm.prog_bo = brw_search_cache(&brw->cache, BRW_WM_PROG,
-				      &key, sizeof(key),
-				      &brw->wm.prog_data);
-   if (brw->wm.prog_bo == NULL) {
+   if (!brw_search_cache(&brw->cache, BRW_WM_PROG,
+			 &key, sizeof(key),
+			 &brw->wm.prog_offset, &brw->wm.prog_data)) {
       bool success = do_wm_prog(brw, ctx->Shader.CurrentFragmentProgram, fp,
 				&key);
       assert(success);
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index ef98f81..506e2bd 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -90,13 +90,25 @@ brw_prepare_wm_unit(struct brw_context *brw)
 	     brw->wm.prog_data->first_curbe_grf_16);
    }
 
-   /* CACHE_NEW_WM_PROG */
+   /* BRW_NEW_PROGRAM_CACHE | CACHE_NEW_WM_PROG */
    wm->thread0.grf_reg_count = brw->wm.prog_data->reg_blocks;
    wm->wm9.grf_reg_count_2 = brw->wm.prog_data->reg_blocks_16;
-   wm->thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
-   /* reloc */
-   wm->wm9.kernel_start_pointer_2 = (brw->wm.prog_bo->offset +
-				     brw->wm.prog_data->prog_offset_16) >> 6;
+
+   wm->thread0.kernel_start_pointer =
+      brw_program_reloc(brw,
+			brw->wm.state_offset +
+			offsetof(struct brw_wm_unit_state, thread0),
+			brw->wm.prog_offset +
+			(wm->thread0.grf_reg_count << 1)) >> 6;
+
+   wm->wm9.kernel_start_pointer_2 =
+      brw_program_reloc(brw,
+			brw->wm.state_offset +
+			offsetof(struct brw_wm_unit_state, wm9),
+			brw->wm.prog_offset +
+			brw->wm.prog_data->prog_offset_16 +
+			(wm->wm9.grf_reg_count_2 << 1)) >> 6;
+
    wm->thread1.depth_coef_urb_read_offset = 1;
    wm->thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
 
@@ -214,23 +226,6 @@ brw_prepare_wm_unit(struct brw_context *brw)
    if (unlikely(INTEL_DEBUG & DEBUG_STATS) || intel->stats_wm)
       wm->wm4.stats_enable = 1;
 
-   /* Emit WM program relocation */
-   drm_intel_bo_emit_reloc(intel->batch.bo,
-			   brw->wm.state_offset +
-			   offsetof(struct brw_wm_unit_state, thread0),
-			   brw->wm.prog_bo, wm->thread0.grf_reg_count << 1,
-			   I915_GEM_DOMAIN_INSTRUCTION, 0);
-
-   if (brw->wm.prog_data->prog_offset_16) {
-      drm_intel_bo_emit_reloc(intel->batch.bo,
-			      brw->wm.state_offset +
-			      offsetof(struct brw_wm_unit_state, wm9),
-			      brw->wm.prog_bo,
-			      ((wm->wm9.grf_reg_count_2 << 1) +
-			       brw->wm.prog_data->prog_offset_16),
-			      I915_GEM_DOMAIN_INSTRUCTION, 0);
-   }
-
    /* Emit scratch space relocation */
    if (brw->wm.prog_data->total_scratch != 0) {
       drm_intel_bo_emit_reloc(intel->batch.bo,
@@ -265,6 +260,7 @@ const struct brw_tracked_state brw_wm_unit = {
 	       _NEW_BUFFERS),
 
       .brw = (BRW_NEW_BATCH |
+	      BRW_NEW_PROGRAM_CACHE |
 	      BRW_NEW_FRAGMENT_PROGRAM |
 	      BRW_NEW_CURBE_OFFSETS |
 	      BRW_NEW_NR_WM_SURFACES),
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c
index c1d0a73..e73e782 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c
@@ -45,7 +45,7 @@ upload_gs_state(struct brw_context *brw)
    ADVANCE_BATCH();
 
    // GS should never be used on Gen6.  Disable it.
-   assert(brw->gs.prog_bo == NULL);
+   assert(!brw->gs.prog_active);
    BEGIN_BATCH(7);
    OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
    OUT_BATCH(0); /* prog_bo */
diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c
index 62645a6..b410511 100644
--- a/src/mesa/drivers/dri/i965/gen6_urb.c
+++ b/src/mesa/drivers/dri/i965/gen6_urb.c
@@ -64,7 +64,7 @@ upload_urb(struct brw_context *brw)
    assert(brw->urb.nr_vs_entries % 4 == 0);
    assert(brw->urb.nr_gs_entries % 4 == 0);
    /* GS requirement */
-   assert(!brw->gs.prog_bo || brw->urb.vs_size < 5);
+   assert(!brw->gs.prog_active || brw->urb.vs_size < 5);
 
    BEGIN_BATCH(3);
    OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2));
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index b46368e..7838a91 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -147,7 +147,7 @@ upload_vs_state(struct brw_context *brw)
 
    BEGIN_BATCH(6);
    OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
-   OUT_RELOC(brw->vs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_BATCH(brw->vs.prog_offset);
    OUT_BATCH((0 << GEN6_VS_SAMPLER_COUNT_SHIFT) |
 	     GEN6_VS_FLOATING_POINT_MODE_ALT |
 	     (brw->vs.nr_surfaces << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 43e651d..024a1d8 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -183,7 +183,7 @@ upload_wm_state(struct brw_context *brw)
 
    BEGIN_BATCH(9);
    OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2));
-   OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_BATCH(brw->wm.prog_offset);
    OUT_BATCH(dw2);
    if (brw->wm.prog_data->total_scratch) {
       OUT_RELOC(brw->wm.scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
@@ -195,12 +195,8 @@ upload_wm_state(struct brw_context *brw)
    OUT_BATCH(dw5);
    OUT_BATCH(dw6);
    OUT_BATCH(0); /* kernel 1 pointer */
-   if (brw->wm.prog_data->prog_offset_16) {
-      OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
-		brw->wm.prog_data->prog_offset_16);
-   } else {
-      OUT_BATCH(0); /* kernel 2 pointer */
-   }
+   /* kernel 2 pointer */
+   OUT_BATCH(brw->wm.prog_offset + brw->wm.prog_data->prog_offset_16);
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen7_disable.c b/src/mesa/drivers/dri/i965/gen7_disable.c
index 4e94617..a44d315 100644
--- a/src/mesa/drivers/dri/i965/gen7_disable.c
+++ b/src/mesa/drivers/dri/i965/gen7_disable.c
@@ -31,7 +31,7 @@ disable_stages(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
 
-   assert(brw->gs.prog_bo == NULL);
+   assert(!brw->gs.prog_active);
 
    /* Disable the Geometry Shader (GS) Unit */
    BEGIN_BATCH(7);
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index 3a61469..b36d780 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -78,7 +78,7 @@ upload_urb(struct brw_context *brw)
    assert(brw->urb.nr_vs_entries % 8 == 0);
    assert(brw->urb.nr_gs_entries % 8 == 0);
    /* GS requirement */
-   assert(!brw->gs.prog_bo);
+   assert(!brw->gs.prog_active);
 
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2));
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index ae7a1d6..0fad3d2 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -67,7 +67,7 @@ upload_vs_state(struct brw_context *brw)
 
    BEGIN_BATCH(6);
    OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
-   OUT_RELOC(brw->vs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_BATCH(brw->vs.prog_offset);
    OUT_BATCH((0 << GEN6_VS_SAMPLER_COUNT_SHIFT) |
 	     GEN6_VS_FLOATING_POINT_MODE_ALT |
 	     (brw->vs.nr_surfaces << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index 6a64eb8..ac6ba2f 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -227,18 +227,13 @@ upload_ps_state(struct brw_context *brw)
 
    BEGIN_BATCH(8);
    OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
-   OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+   OUT_BATCH(brw->wm.prog_offset);
    OUT_BATCH(dw2);
    OUT_BATCH(0); /* scratch space base offset */
    OUT_BATCH(dw4);
    OUT_BATCH(dw5);
    OUT_BATCH(0); /* kernel 1 pointer */
-   if (brw->wm.prog_data->prog_offset_16) {
-      OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
-	        brw->wm.prog_data->prog_offset_16);
-   } else {
-      OUT_BATCH(0); /* kernel 2 pointer */
-   }
+   OUT_BATCH(brw->wm.prog_offset + brw->wm.prog_data->prog_offset_16);
    ADVANCE_BATCH();
 }
 




More information about the mesa-commit mailing list