xf86-video-intel: Branch 'intel-batchbuffer' - 76 commits - configure.ac src/brw_defines.h src/common.h src/i810_reg.h src/i830_batchbuffer.h src/i830_display.c src/i830_dri.c src/i830_driver.c src/i830_exa.c src/i830.h src/i830_memory.c src/i830_quirks.c src/i965_render.c src/i965_render.h src/intel_batchbuffer.c src/intel_batchbuffer.h src/intel_bufmgr_ttm.c src/reg_dumper/idle.c src/reg_dumper/Makefile.am

Kristian Høgsberg krh at kemper.freedesktop.org
Fri Feb 22 07:36:45 PST 2008


 configure.ac               |    4 
 src/brw_defines.h          |    1 
 src/common.h               |    2 
 src/i810_reg.h             |   36 +
 src/i830.h                 |   19 
 src/i830_batchbuffer.h     |   44 -
 src/i830_display.c         |   90 ++
 src/i830_dri.c             |  155 +---
 src/i830_driver.c          |   72 +-
 src/i830_exa.c             |   57 +
 src/i830_memory.c          |   56 +
 src/i830_quirks.c          |    5 
 src/i965_render.c          | 1471 ++++++++++++++++++++++++---------------------
 src/i965_render.h          |   20 
 src/intel_batchbuffer.c    |   10 
 src/intel_batchbuffer.h    |    7 
 src/intel_bufmgr_ttm.c     |   18 
 src/reg_dumper/Makefile.am |   13 
 src/reg_dumper/idle.c      |  177 +++++
 19 files changed, 1368 insertions(+), 889 deletions(-)

New commits:
commit ef1e421194a27445435ff4f505555de9c734ff13
Merge: 34118dd... c9be596...
Author: Kristian Høgsberg <krh at redhat.com>
Date:   Thu Feb 21 14:42:55 2008 -0500

    Merge commit 'anholt/intel-batchbuffer' into intel-batchbuffer
    
    Conflicts:
    
    	src/dri_bufmgr.h
    	src/i830_dri.c
    	src/i830_driver.c
    	src/i830_exa.c
    	src/i965_render.c
    	src/intel_batchbuffer.c
    	src/intel_batchbuffer.h
    	src/intel_bufmgr_ttm.c

diff --cc src/i830.h
index f66b5a7,df5892b..c2a1268
--- a/src/i830.h
+++ b/src/i830.h
@@@ -530,11 -527,6 +534,12 @@@ typedef struct _I830Rec 
     I830ConfigPrivPtr pVisualConfigsPriv;
     drm_handle_t buffer_map;
     drm_handle_t ring_map;
 +
-     drm_hw_lock_t		*lock;
-     int				 lockRefCount;
-     int				 lockingContext;
-     drm_context_t		 context;
++   drm_hw_lock_t		*lock;
++   int				 lockRefCount;
++   int				 lockingContext;
++   drm_context_t		 context;
++   int irq;
  #endif
  
     /* Broken-out options. */
@@@ -726,17 -717,11 +730,19 @@@ extern Bool I830DRIFinishScreenInit(Scr
  extern void I830DRIUnlock(ScrnInfoPtr pScrn);
  extern Bool I830DRILock(ScrnInfoPtr pScrn);
  extern Bool I830DRISetVBlankInterrupt (ScrnInfoPtr pScrn, Bool on);
- Bool i830_update_dri_buffers(ScrnInfoPtr pScrn);
+ extern Bool i830_update_dri_buffers(ScrnInfoPtr pScrn);
+ extern Bool I830DRISetHWS(ScrnInfoPtr pScrn);
+ extern Bool I830DRIInstIrqHandler(ScrnInfoPtr pScrn);
  #endif
  
 +#ifdef DRI2
 +extern void I830DRI2Prepare(ScreenPtr pScreen);
 +extern void I830DRI2ScreenInit(ScreenPtr pScreen);
 +extern void I830DRI2CloseScreen(ScreenPtr pScreen);
 +extern void I830DRI2Lock(ScreenPtr pScrn);
 +extern void I830DRI2Unlock(ScreenPtr pScrn);
 +#endif
 +
  unsigned long intel_get_pixmap_offset(PixmapPtr pPix);
  unsigned long intel_get_pixmap_pitch(PixmapPtr pPix);
  extern Bool I830AccelInit(ScreenPtr pScreen);
diff --cc src/i830_dri.c
index 114b5f6,2b345c2..a2a810e
--- a/src/i830_dri.c
+++ b/src/i830_dri.c
@@@ -918,46 -909,52 +913,57 @@@ I830DRIDoMappings(ScreenPtr pScreen
  }
  
  Bool
- I830DRIResume(ScreenPtr pScreen)
+ I830DRIInstIrqHandler(ScrnInfoPtr pScrn)
  {
-    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
     I830Ptr pI830 = I830PTR(pScrn);
--   I830DRIPtr pI830DRI = (I830DRIPtr) pI830->pDRIInfo->devPrivate;
- 
-    DPRINTF(PFX, "I830DRIResume\n");
- 
-    I830ResumeDma(pScrn);
++   I830DRIPtr pI830DRI;
  
-    {
-       pI830DRI->irq = drmGetInterruptFromBusID(pI830->drmSubFD,
 -   pI830DRI->irq = drmGetInterruptFromBusID(pI830->drmSubFD,
++   pI830->irq = drmGetInterruptFromBusID(pI830->drmSubFD,
  #if XSERVER_LIBPCIACCESS
--					       ((pI830->PciInfo->domain << 8) |
--						pI830->PciInfo->bus),
--					       pI830->PciInfo->dev,
--					       pI830->PciInfo->func
++					 ((pI830->PciInfo->domain << 8) |
++					  pI830->PciInfo->bus),
++					 pI830->PciInfo->dev,
++					 pI830->PciInfo->func
  #else
--					       ((pciConfigPtr) pI830->
--						PciInfo->thisCard)->busnum,
--					       ((pciConfigPtr) pI830->
--						PciInfo->thisCard)->devnum,
--					       ((pciConfigPtr) pI830->
--						PciInfo->thisCard)->funcnum
++					 ((pciConfigPtr) pI830->
++					  PciInfo->thisCard)->busnum,
++					 ((pciConfigPtr) pI830->
++					  PciInfo->thisCard)->devnum,
++					 ((pciConfigPtr) pI830->
++					  PciInfo->thisCard)->funcnum
  #endif
--					       );
++					 );
  
-       if (drmCtlInstHandler(pI830->drmSubFD, pI830DRI->irq)) {
- 	 xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
- 		    "[drm] failure adding irq handler\n");
- 	 pI830DRI->irq = 0;
- 	 return FALSE;
-       }
-       else
- 	 xf86DrvMsg(pScrn->scrnIndex, X_INFO,
- 		    "[drm] dma control initialized, using IRQ %d\n",
- 		    pI830DRI->irq);
 -   if (drmCtlInstHandler(pI830->drmSubFD, pI830DRI->irq)) {
++   if (drmCtlInstHandler(pI830->drmSubFD, pI830->irq)) {
+        xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+ 	       "[drm] failure adding irq handler\n");
 -       pI830DRI->irq = 0;
++       pI830->irq = 0;
+        return FALSE;
+    } else
+        xf86DrvMsg(pScrn->scrnIndex, X_INFO,
+ 	       "[drm] dma control initialized, using IRQ %d\n",
 -	       pI830DRI->irq);
++	       pI830->irq);
++
++   if (pI830->pDRIInfo) {
++      pI830DRI = (I830DRIPtr) pI830->pDRIInfo->devPrivate;
++      pI830DRI->irq = pI830->irq;
 +   }
  
-    return FALSE;
+    return TRUE;
+ }
+ 
+ Bool
+ I830DRIResume(ScreenPtr pScreen)
+ {
+    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
+ 
+    DPRINTF(PFX, "I830DRIResume\n");
+ 
+    I830ResumeDma(pScrn);
+ 
+    I830DRIInstIrqHandler(pScrn);
+ 
+    return TRUE;
  }
  
  void
@@@ -973,9 -970,9 +979,9 @@@ I830DRICloseScreen(ScreenPtr pScreen
     REGION_UNINIT(pScreen, &pI830->driRegion);
  #endif
  
--   if (pI830DRI->irq) {
++   if (pI830->irq) {
         drmCtlUninstHandler(pI830->drmSubFD);
--       pI830DRI->irq = 0;
++       pI830->irq = 0;
     }
  
     I830CleanupDma(pScrn);
@@@ -1843,235 -1799,8 +1809,222 @@@ I830DRIUnlock(ScrnInfoPtr pScrn
  {
     I830Ptr pI830 = I830PTR(pScrn);
  
 -   if (pI830->directRenderingEnabled && pI830->LockHeld) {
 +   if (!pI830->LockHeld || pI830->directRendering == DRI_TYPE_NONE)
 +      return;
 +   else if (pI830->directRendering == DRI_TYPE_XF86DRI)
        DRIUnlock(screenInfo.screens[pScrn->scrnIndex]);
 -      pI830->LockHeld = 0;
 -   }
 +   else if (pI830->directRendering == DRI_TYPE_DRI2)
 +      I830DRI2Unlock(screenInfo.screens[pScrn->scrnIndex]);
 +
 +   pI830->LockHeld = 0;
 +}
 +
 +void
 +I830DRI2Lock(ScreenPtr pScreen)
 +{
 +    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
 +    I830Ptr pI830 = I830PTR(pScrn);
 +
 +    if (pI830->lockRefCount == 0) {
 +       DRM_LOCK(pI830->drmSubFD, pI830->lock, pI830->context, 0);
 +       pI830->lockingContext = pI830->context;
 +    } else if (pI830->lockingContext != pI830->context) {
 +	xf86DrvMsg(pScreen->myNum, X_ERROR,
 +		   "[DRI] Locking deadlock.\n"
 +		   "\tAlready locked with context %d,\n"
 +		   "\ttrying to lock with context %d.\n",
 +		   pI830->lockingContext, pI830->context);
 +    }
 +
 +    pI830->lockRefCount++;
 +}
 +
 +void
 +I830DRI2Unlock(ScreenPtr pScreen)
 +{
 +    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
 +    I830Ptr pI830 = I830PTR(pScrn);
 +
 +    if (pI830 == NULL)
 +	return;
 +
 +    if (pI830->lockRefCount > 0) {
 +	if (pI830->lockingContext != pI830->context) {
 +	    xf86DrvMsg(pScreen->myNum, X_ERROR,
 +		      "[DRI] Unlocking inconsistency:\n"
 +		      "\tContext %d trying to unlock lock held by context %d\n",
 +		       pI830->context, pI830->lockingContext);
 +	}
 +	pI830->lockRefCount--;
 +    } else {
 +	xf86DrvMsg(pScreen->myNum, X_ERROR,
 +		   "DRIUnlock called when not locked.\n");
 +        return;
 +    }
 +
 +    if (pI830->lockRefCount == 0)
 +       DRM_UNLOCK(pI830->drmSubFD, pI830->lock, pI830->context);
 +}
 +
 +static void
 +I830DRI2BeginClipNotify(ScreenPtr pScreen)
 +{
 +    I830DRI2Lock(pScreen);
 +}
 +
 +static void
 +I830DRI2EndClipNotify(ScreenPtr pScreen)
 +{
 +    I830EmitFlush(xf86Screens[pScreen->myNum]);
 +    I830DRI2Unlock(pScreen);
 +}
 +
 +void
 +I830DRI2Prepare(ScreenPtr pScreen)
 +{
 +    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
 +    I830Ptr pI830 = I830PTR(pScrn);
 +    char busId[64];
 +    drmVersionPtr version;
 +
 +#if XSERVER_LIBPCIACCESS
 +    sprintf(busId, "pci:%04x:%02x:%02x.%d",
 +	    pI830->PciInfo->domain, pI830->PciInfo->bus,
 +	    pI830->PciInfo->dev, pI830->PciInfo->func);
 +#else
 +    snprintf(busId, "PCI:%d:%d:%d",
 +	     ((pciConfigPtr) pI830->PciInfo->thisCard)->busnum,
 +	     ((pciConfigPtr) pI830->PciInfo->thisCard)->devnum,
 +	     ((pciConfigPtr) pI830->PciInfo->thisCard)->funcnum);
 +#endif
 +
 +    /* Low level DRM open */
 +    pI830->drmSubFD = drmOpen("i915", busId);
 +    if (pI830->drmSubFD < 0) {
 +	xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "[DRI2] drmOpen failed\n");
 +	return;
 +    }
 +
 +    version = drmGetVersion(pI830->drmSubFD);
 +    if (!version) {
 +	xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
 +		   "[DRI2] Failed to get DRM version\n");
 +	return;
 +    }
 +       
 +    if (version->version_major != 1 || version->version_minor < 3) {
 +	xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
 +		   "[DRI2] Need at least version 1.3 for DRI2\n");
 +	drmClose(pI830->drmSubFD);
 +	return;
 +    }	       
 +
 +    pI830->LockHeld = 0;
 +    pI830->drmMinor = version->version_minor;
 +    drmFreeVersion(version);
 +
 +    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
 +	       "[DRI2] Opened DRM device successfully\n");
 +
 +    I830InitBufMgr(pScreen);
 +    if (!pI830->use_ttm_batch)
 +	return;
 +
 +    pI830->directRendering = DRI_TYPE_DRI2;
 +}
 +
 +struct __DRILock {
 +    unsigned int block_header;
 +    drm_hw_lock_t lock;
 +};
 +
 +#define DRI2_SAREA_BLOCK_HEADER(type, size) (((type) << 16) | (size))
 +#define DRI2_SAREA_BLOCK_LOCK		0x0001
 +
 +void
 +I830DRI2ScreenInit(ScreenPtr pScreen)
 +{
 +    DRI2InfoRec dri2info;
 +    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
 +    I830Ptr pI830 = I830PTR(pScrn);
 +    I830RingBuffer *ring = pI830->LpRing;
 +    drm_i915_init_t info;
-     int irq;
 +    int fd, major, minor, patch;
 +    const char *driverName;
 +    unsigned int sarea_handle;
 +    void *p;
 +    struct __DRILock *driLock;
 +
 +    dri2info.version = 1;
 +    dri2info.fd = pI830->drmSubFD;
 +    dri2info.driverSareaSize = sizeof *driLock;
 +    dri2info.driverName = IS_I965G(pI830) ? "i965" : "i915";
 +    dri2info.ddxVersionMajor = I830_MAJOR_VERSION;
 +    dri2info.ddxVersionMinor = I830_MINOR_VERSION;
 +    dri2info.ddxVersionPatch = I830_PATCHLEVEL;
 +    dri2info.getPixmapHandle = I830EXAGetPixmapHandle;
 +    dri2info.beginClipNotify = I830DRI2BeginClipNotify;
 +    dri2info.endClipNotify   = I830DRI2EndClipNotify;
 +
 +    p = DRI2ScreenInit(pScreen, &dri2info);
 +    if (!p) {
 +	pI830->directRendering = DRI_TYPE_NONE;
 +	return;
 +    }
 +
 +    driLock = p;
 +    driLock->block_header =
 +	DRI2_SAREA_BLOCK_HEADER(DRI2_SAREA_BLOCK_LOCK, sizeof *driLock);
 +    pI830->lock = &driLock->lock;
 +    pI830->lockRefCount = 0;
 +    pI830->lockingContext = 0;
 +    
 +    if (drmCreateContext(pI830->drmSubFD, &pI830->context)) {
 +	pI830->directRendering = DRI_TYPE_NONE;
 +	return;
 +    }
 +
 +    I830DRI2Lock(pScreen);
 +
 +    /* Get sarea BO handle... maybe we need a dedicated function for
 +     * that or maybe a DRI2 info struct that it fills out. */
 +    DRI2Connect(pScreen, &fd, &driverName, &major, &minor, &patch,
 +		&sarea_handle);
 +
 +    memset(&info, 0, sizeof(info));
 +    info.func = I915_INIT_DMA2;
 +    info.ring_start = ring->mem->offset + pI830->LinearAddr;
 +    info.ring_end = ring->mem->end + pI830->LinearAddr;
 +    info.ring_size = ring->mem->size;
 +    info.mmio_offset = 0;
 +    info.sarea_priv_offset = 0;
 +    info.sarea_handle = sarea_handle;
 +
 +    if (drmCommandWrite(pI830->drmSubFD, DRM_I830_INIT,
 +			&info, sizeof(info))) {
 +	xf86DrvMsg(pScreen->myNum, X_ERROR,
 +		   "I830 Dma Initialization Failed\n");
 +	pI830->directRendering = DRI_TYPE_NONE;
 +      
 +	return;
 +    }
 +
-     if (IS_G33CLASS(pI830)) {
- 	if (!I830SetHWS(pScrn, pI830->hw_status->offset)) {
- 	    pI830->directRendering = DRI_TYPE_NONE;
- 	    return;
- 	}
-     }
- 
 +    if (DEVICE_ID(pI830->PciInfo) != PCI_CHIP_845_G &&
 +	DEVICE_ID(pI830->PciInfo) != PCI_CHIP_I830_M) {
 +	I830SetParam(pScrn, I830_SETPARAM_USE_MI_BATCHBUFFER_START, 1);
 +    }
- 
-     if (!I830InitializeIrq(pScreen, &irq)) {
- 	pI830->directRendering = DRI_TYPE_NONE;
- 	return;
-     }
 +}
 +
 +void
 +I830DRI2CloseScreen(ScreenPtr pScreen)
 +{
 +    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
 +    I830Ptr pI830 = I830PTR(pScrn);
 +
 +    drmCtlUninstHandler(pI830->drmSubFD);
 +    I830CleanupDma(pScrn);
 +    drmDestroyContext(pI830->drmSubFD, pI830->context);
 +    I830DRI2Unlock(pScreen);
 +    DRI2CloseScreen(pScreen);
 +    drmClose(pI830->drmSubFD);
  }
diff --cc src/i830_driver.c
index ca8fa16,11e0b07..48d7e01
--- a/src/i830_driver.c
+++ b/src/i830_driver.c
@@@ -2945,16 -2924,14 +2946,20 @@@ I830ScreenInit(int scrnIndex, ScreenPt
     /* Must be called before EnterVT, so we can acquire the DRI lock when
      * binding our memory.
      */
 -   if (pI830->directRenderingEnabled)
 -      pI830->directRenderingEnabled = I830DRIFinishScreenInit(pScreen);
 +   if (pI830->directRendering == DRI_TYPE_XF86DRI)
 +      pI830->directRendering =
 +	  I830DRIFinishScreenInit(pScreen) ? DRI_TYPE_XF86DRI : DRI_TYPE_NONE;
 +#endif
 +
 +#ifdef DRI2
 +   if (pI830->directRendering == DRI_TYPE_DRI2)
 +       I830DRI2ScreenInit(pScreen);
  #endif
  
+    /* Must force it before EnterVT, so we are in control of VT and
+     * later memory should be bound when allocating, e.g rotate_mem */
+    pScrn->vtSema = TRUE;
+ 
     if (!I830EnterVT(scrnIndex, 0))
        return FALSE;
  
@@@ -3210,24 -3181,24 +3216,56 @@@ I830EnterVT(int scrnIndex, int flags
     i830_stop_ring(pScrn, TRUE);
     SetHWOperatingState(pScrn);
  
 +#ifdef DRI2
 +   if (pI830->directRendering == DRI_TYPE_DRI2) {
 +      I830DRISetVBlankInterrupt (pScrn, TRUE);
 +
++      if (pI830->starting) {
++	 if (HWS_NEED_GFX(pI830) && !I830DRISetHWS(pScrn)) {
++	    if (!I830SetHWS(pScrn, pI830->hw_status->offset)) {
++	       pI830->directRendering = DRI_TYPE_NONE;
++	       return FALSE;
++	    }
++	 }
++
++	 if (!I830DRIInstIrqHandler(pScrn)) {
++	    pI830->directRendering = DRI_TYPE_NONE;
++	    return FALSE;
++	 }
++      }
++
++      I830DRISetVBlankInterrupt (pScrn, TRUE);
++
 +      if (!pI830->starting) {
- 	      I830DRIResume(screenInfo.screens[scrnIndex]);
- 	      i830_refresh_ring(pScrn);
- 	      I830Sync(pScrn);
- 	      DO_RING_IDLE();
++	 I830DRIResume(screenInfo.screens[scrnIndex]);
++	 i830_refresh_ring(pScrn);
++	 I830Sync(pScrn);
++	 DO_RING_IDLE();
 +
- 	      DPRINTF(PFX, "calling dri unlock\n");
- 	      I830DRIUnlock(pScrn);
++	 DPRINTF(PFX, "calling dri unlock\n");
++	 I830DRIUnlock(pScrn);
 +      }
 +   }
 +#endif
 +
  #ifdef XF86DRI
 -   if (pI830->directRenderingEnabled) {
 +   if (pI830->directRendering == DRI_TYPE_XF86DRI) {
+        /* HW status is fixed, we need to set it up before any drm
+ 	* operation which accessing that page, like irq install, etc.
+ 	*/
+        if (pI830->starting) {
+ 	   if (HWS_NEED_GFX(pI830) && !I830DRISetHWS(pScrn)) {
+ 		   xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+ 			   "Fail to setup hardware status page.\n");
+ 		   I830DRICloseScreen(pScrn->pScreen);
+ 		   return FALSE;
+ 	   }
+ 	   if (!I830DRIInstIrqHandler(pScrn)) {
+ 	       I830DRICloseScreen(pScrn->pScreen);
+ 	       return FALSE;
+ 	   }
+        }
+ 
        /* Update buffer offsets in sarea and mappings, since buffer offsets
         * may have changed.
         */
diff --cc src/i830_exa.c
index 22055fd,cccd6bb..daf47e9
--- a/src/i830_exa.c
+++ b/src/i830_exa.c
@@@ -491,28 -444,41 +490,46 @@@ static Bool I830EXAPrepareAccess(Pixmap
-     /* TODO : make this more conditional */
-     intelddx_batchbuffer_flush(pI830->batch);
-     dri_fence_wait(pI830->batch->last_fence);
- 
      if (driver_priv->bo) {
  	mmDebug("mapping %p %d %dx%d\n", pPix, driver_priv->flags, pPix->drawable.width, pPix->drawable.height);
  
- 	if (!(driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED)) {
- 	    ret = dri_bo_map(driver_priv->bo, 1);
- 	    if (ret)
- 		return FALSE;
+ 	intelddx_batchbuffer_flush(pI830->batch);
  
- 	    driver_priv->flags |= I830_EXA_PIXMAP_IS_MAPPED;
- 	    pPix->devPrivate.ptr = driver_priv->bo->virtual;
+ 	ret = dri_bo_map(driver_priv->bo, TRUE);
+ 	if (ret) {
+ 	    FatalError("Failed to map pixmap: %s\n", strerror(-ret));
+ 	    return FALSE;
  	}
+ 
+ 	pPix->devPrivate.ptr = driver_priv->bo->virtual;
      }
  
 +    dri2Lock(pPix);
 +
      return TRUE;
  }
  
- static void I830ExaFinishAccess(PixmapPtr pPix, int index)
+ static void I830EXAFinishAccess(PixmapPtr pPix, int index)
  {
+     struct i830_exa_pixmap_priv *driver_priv;
+     int ret;
+ 
+     driver_priv = exaGetPixmapDriverPrivate(pPix);
+ 
+     if (!driver_priv)
+ 	return;
+ 
+     if (driver_priv->bo) {
+ 	mmDebug("numapping %p %d %dx%d\n", pPix, driver_priv->flags, pPix->drawable.width, pPix->drawable.height);
+ 
+ 	ret = dri_bo_unmap(driver_priv->bo);
+ 	if (ret) {
+ 	    FatalError("Failed to unmap pixmap: %s\n", strerror(-ret));
++	    dri2Unlock(pPix);
+ 	    return;
+ 	}
+ 
+ 	pPix->devPrivate.ptr = driver_priv->bo->virtual;
+     }
++
 +    dri2Unlock(pPix);
  }
  
  static Bool I830EXAModifyPixmapHeader(PixmapPtr pPixmap, int width, int height,
commit c9be59674a4eb5b0db0cf757972841fa9a75b5f7
Author: Eric Anholt <eric at anholt.net>
Date:   Tue Jan 29 08:24:27 2008 -0800

    Fail if we can't map/unmap pixmap BOs.

diff --git a/src/i830_exa.c b/src/i830_exa.c
index 1422839..cccd6bb 100644
--- a/src/i830_exa.c
+++ b/src/i830_exa.c
@@ -447,8 +447,10 @@ static Bool I830EXAPrepareAccess(PixmapPtr pPix, int index)
 	intelddx_batchbuffer_flush(pI830->batch);
 
 	ret = dri_bo_map(driver_priv->bo, TRUE);
-	if (ret)
+	if (ret) {
+	    FatalError("Failed to map pixmap: %s\n", strerror(-ret));
 	    return FALSE;
+	}
 
 	pPix->devPrivate.ptr = driver_priv->bo->virtual;
     }
@@ -470,8 +472,10 @@ static void I830EXAFinishAccess(PixmapPtr pPix, int index)
 	mmDebug("numapping %p %d %dx%d\n", pPix, driver_priv->flags, pPix->drawable.width, pPix->drawable.height);
 
 	ret = dri_bo_unmap(driver_priv->bo);
-	if (ret)
+	if (ret) {
+	    FatalError("Failed to unmap pixmap: %s\n", strerror(-ret));
 	    return;
+	}
 
 	pPix->devPrivate.ptr = driver_priv->bo->virtual;
     }
commit f66276d813b502c0e7e8ff255923b2090c5c47d3
Author: Eric Anholt <eric at anholt.net>
Date:   Tue Jan 29 05:38:05 2008 -0800

    Allow recursive BO mappings, add assertions for mapping handling, and fix users.
    
    This lets us remove a gratuitous flush in the 965 render acceleration, and
    should be backported to Mesa where it'll fix a segfault bug.

diff --git a/src/i830.h b/src/i830.h
index 0ba85f6..df5892b 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -645,7 +645,6 @@ typedef struct _I830Rec {
 #define I830_SELECT_THIRD	3
 
 #define I830_EXA_PIXMAP_IS_FRONTBUFFER 1
-#define I830_EXA_PIXMAP_IS_MAPPED 2
 
 /* i830 pixmap private for TTM */
 struct i830_exa_pixmap_priv {
diff --git a/src/i830_exa.c b/src/i830_exa.c
index 71601e4..1422839 100644
--- a/src/i830_exa.c
+++ b/src/i830_exa.c
@@ -411,9 +411,6 @@ static void I830EXADestroyPixmap(ScreenPtr pScreen, void *driverPriv)
 {
     struct i830_exa_pixmap_priv *driver_priv = driverPriv;
 
-    if (driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED)
-        dri_bo_unmap(driver_priv->bo);
-
     dri_bo_unreference(driver_priv->bo);
     xfree(driverPriv);
 }
@@ -449,20 +446,37 @@ static Bool I830EXAPrepareAccess(PixmapPtr pPix, int index)
 
 	intelddx_batchbuffer_flush(pI830->batch);
 
-	if ((driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED))
-	    return TRUE;
-
-	ret = dri_bo_map(driver_priv->bo, 1);
+	ret = dri_bo_map(driver_priv->bo, TRUE);
 	if (ret)
 	    return FALSE;
 
-	driver_priv->flags |= I830_EXA_PIXMAP_IS_MAPPED;
 	pPix->devPrivate.ptr = driver_priv->bo->virtual;
     }
 
     return TRUE;
 }
 
+static void I830EXAFinishAccess(PixmapPtr pPix, int index)
+{
+    struct i830_exa_pixmap_priv *driver_priv;
+    int ret;
+
+    driver_priv = exaGetPixmapDriverPrivate(pPix);
+
+    if (!driver_priv)
+	return;
+
+    if (driver_priv->bo) {
+	mmDebug("numapping %p %d %dx%d\n", pPix, driver_priv->flags, pPix->drawable.width, pPix->drawable.height);
+
+	ret = dri_bo_unmap(driver_priv->bo);
+	if (ret)
+	    return;
+
+	pPix->devPrivate.ptr = driver_priv->bo->virtual;
+    }
+}
+
 static Bool I830EXAModifyPixmapHeader(PixmapPtr pPixmap, int width, int height,
 				      int depth, int bitsPerPixel, int devKind,
 				      pointer pPixData)
@@ -537,6 +551,7 @@ I830EXAInit(ScreenPtr pScreen)
     } else {
 	pI830->EXADriverPtr->flags = EXA_OFFSCREEN_PIXMAPS | EXA_HANDLES_PIXMAPS;
 	pI830->EXADriverPtr->PrepareAccess = I830EXAPrepareAccess;
+	pI830->EXADriverPtr->FinishAccess = I830EXAFinishAccess;
     }
 
     DPRINTF(PFX, "EXA Mem: memoryBase 0x%x, end 0x%x, offscreen base 0x%x, "
diff --git a/src/i965_render.c b/src/i965_render.c
index 833cb41..307acf2 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -786,6 +786,7 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
 
     /* Then the surface state buffer */
     if (state->surface_buf != NULL && state->num_ops >= GEN4_MAX_OPS) {
+	dri_bo_unmap(state->surface_buf);
 	dri_bo_unreference(state->surface_buf);
 	state->surface_buf = NULL;
     }
@@ -1449,9 +1450,6 @@ void i965_done_composite(PixmapPtr pDst)
     }
 
     pI830->exa965->num_ops++;
-    if (pI830->exa965->num_ops >= GEN4_MAX_OPS) {
-	intelddx_batchbuffer_flush(pI830->batch);
-    }
 }
 
 static struct i965_exastate_buffer *
diff --git a/src/intel_batchbuffer.c b/src/intel_batchbuffer.c
index 42409e8..e15c6ce 100644
--- a/src/intel_batchbuffer.c
+++ b/src/intel_batchbuffer.c
@@ -274,10 +274,6 @@ uint32_t intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap,
 {
     struct i830_exa_pixmap_priv *driver_priv = exaGetPixmapDriverPrivate(pPixmap);
 
-    if (driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED) {
-	dri_bo_unmap(driver_priv->bo);
-	driver_priv->flags &= ~I830_EXA_PIXMAP_IS_MAPPED;
-    }
     dri_emit_reloc(reloc_buf, flags, delta, offset, driver_priv->bo);
     return driver_priv->bo->offset;
 }
diff --git a/src/intel_bufmgr_ttm.c b/src/intel_bufmgr_ttm.c
index 32b407c..f6c8253 100644
--- a/src/intel_bufmgr_ttm.c
+++ b/src/intel_bufmgr_ttm.c
@@ -101,6 +101,7 @@ typedef struct _dri_bo_ttm {
     int refcount;
     drmBO drm_bo;
     const char *name;
+    unsigned int map_count;
 
     uint64_t last_flags;
 
@@ -186,6 +187,8 @@ intel_add_validate_buffer(dri_bo *buf,
 	ttm_buf->delayed_unmap = GL_FALSE;
     }
 
+    assert(ttm_buf->map_count == 0);
+
     if (ttm_buf->validate_index == -1) {
 	struct intel_validate_entry *entry;
 	struct drm_i915_op_arg *arg;
@@ -369,6 +372,7 @@ dri_ttm_alloc(dri_bufmgr *bufmgr, const char *name,
     ttm_buf->shared = GL_FALSE;
     ttm_buf->delayed_unmap = GL_FALSE;
     ttm_buf->validate_index = -1;
+    ttm_buf->map_count = 0;
 
     DBG("bo_create: %p (%s) %ldb\n", &ttm_buf->bo, ttm_buf->name, size);
 
@@ -424,6 +428,7 @@ intel_ttm_bo_create_from_handle(dri_bufmgr *bufmgr, const char *name,
     ttm_buf->shared = GL_TRUE;
     ttm_buf->delayed_unmap = GL_FALSE;
     ttm_buf->validate_index = -1;
+    ttm_buf->map_count = 0;
 
     DBG("bo_create_from_handle: %p %08x (%s)\n",
 	&ttm_buf->bo, handle, ttm_buf->name);
@@ -472,6 +477,8 @@ dri_ttm_bo_unreference(dri_bo *buf)
 	    }
 	}
 
+	assert(ttm_buf->map_count == 0);
+
 	if (ttm_buf->delayed_unmap)
 	   drmBOUnmap(bufmgr_ttm->fd, &ttm_buf->drm_bo);
 
@@ -500,10 +507,11 @@ dri_ttm_bo_map(dri_bo *buf, GLboolean write_enable)
     if (write_enable)
 	flags |= DRM_BO_FLAG_WRITE;
 
-    assert(buf->virtual == NULL);
-
     DBG("bo_map: %p (%s)\n", &ttm_buf->bo, ttm_buf->name);
 
+    if (ttm_buf->map_count++ != 0)
+	return 0;
+
     /* XXX: What about if we're upgrading from READ to WRITE? */
     if (ttm_buf->delayed_unmap) {
 	buf->virtual = ttm_buf->saved_virtual;
@@ -524,10 +532,12 @@ dri_ttm_bo_unmap(dri_bo *buf)
 
     bufmgr_ttm = (dri_bufmgr_ttm *)buf->bufmgr;
 
-    assert(buf->virtual != NULL);
-
     DBG("bo_unmap: %p (%s)\n", &ttm_buf->bo, ttm_buf->name);
 
+    if (--ttm_buf->map_count > 0)
+	return 0;
+    assert(ttm_buf->map_count >= 0);
+
     if (!ttm_buf->shared) {
 	ttm_buf->saved_virtual = buf->virtual;
 	ttm_buf->delayed_unmap = GL_TRUE;
commit 70692f4fb85c1a3c3c33bacddb44c077aa59b702
Merge: 958fa85... b1e1c7c...
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 30 23:10:41 2008 -0800

    Merge branch 'master' into intel-batchbuffer

diff --cc src/i830_driver.c
index e2d1e6f,a3c64de..11e0b07
--- a/src/i830_driver.c
+++ b/src/i830_driver.c
@@@ -2342,10 -2353,9 +2342,11 @@@ I830BlockHandler(int i
  
      (*pScreen->BlockHandler) (i, blockData, pTimeout, pReadmask);
  
+     pI830->BlockHandler = pScreen->BlockHandler;
      pScreen->BlockHandler = I830BlockHandler;
  
 +    if (pI830->batch)
 +    	intelddx_batchbuffer_flush(pI830->batch);
      I830VideoBlockHandler(i, blockData, pTimeout, pReadmask);
  }
  
commit b1e1c7cd993ab4f1935d750eb8852e74be757e53
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 30 23:01:04 2008 -0800

    Add detail on different units to intel_idle.

diff --git a/src/i810_reg.h b/src/i810_reg.h
index 7902366..2893bc8 100644
--- a/src/i810_reg.h
+++ b/src/i810_reg.h
@@ -351,6 +351,18 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define IPEIR_I965                  0x2064 /* i965 */
 #define IPEHR_I965                  0x2068 /* i965 */
 #define INST_DONE_I965              0x206c
+# define I965_SF_DONE			(1 << 23)
+# define I965_SE_DONE			(1 << 22)
+# define I965_WM_DONE			(1 << 21)
+# define I965_TEXTURE_FETCH_DONE	(1 << 14)
+# define I965_SAMPLER_CACHE_DONE	(1 << 12)
+# define I965_FILTER_DONE		(1 << 11)
+# define I965_PS_DONE			(1 << 9)
+# define I965_CC_DONE			(1 << 8)
+# define I965_MAP_FILTER_DONE		(1 << 7)
+# define I965_MAP_L2_IDLE		(1 << 6)
+# define I965_CP_DONE			(1 << 1)
+# define I965_RING_0_ENABLE		(1 << 0)
 #define INST_PS_I965                0x2070
 
 /* Current active ring head address: 
diff --git a/src/reg_dumper/idle.c b/src/reg_dumper/idle.c
index ec083fd..0077bf4 100644
--- a/src/reg_dumper/idle.c
+++ b/src/reg_dumper/idle.c
@@ -36,13 +36,52 @@
 #include "reg_dumper.h"
 #include "../i810_reg.h"
 
+struct idle_flags {
+    uint32_t instdone_flag;
+    char *name;
+    unsigned int count;
+};
+
+struct idle_flags i965_idle_flags[] = {
+    {I965_SF_DONE, "SF"},
+    {I965_SE_DONE, "SE"},
+    {I965_WM_DONE, "WM"},
+    {I965_TEXTURE_FETCH_DONE, "texture fetch"},
+    {I965_SAMPLER_CACHE_DONE, "sampler cache"},
+    {I965_FILTER_DONE, "filter"},
+    {I965_PS_DONE, "PS"},
+    {I965_CC_DONE, "CC"},
+    {I965_MAP_FILTER_DONE, "map filter"},
+    {I965_MAP_L2_IDLE, "map L2"},
+    {I965_CP_DONE, "CP"},
+    {0, "other"},
+};
+
+/* Fills in the "other" field's idle flags */
+static void
+setup_other_flags(struct idle_flags *idle_flags, int idle_flag_count)
+{
+    uint32_t other_idle_flags;
+    int i;
+
+    other_idle_flags = ~(I965_RING_0_ENABLE);
+    for (i = 0; i < idle_flag_count - 1; i++) {
+	other_idle_flags &= ~idle_flags[i].instdone_flag;
+    }
+    idle_flags[i].instdone_flag = other_idle_flags;
+
+}
+
 int main(int argc, char **argv)
 {
     struct pci_device *dev;
     I830Rec i830;
+    I830Ptr pI830 = &i830;
     ScrnInfoRec scrn;
     int err, mmio_bar;
     void *mmio;
+    struct idle_flags *idle_flags;
+    int idle_flag_count;
 
     err = pci_system_init();
     if (err != 0) {
@@ -76,7 +115,7 @@ int main(int argc, char **argv)
 				dev->regions[mmio_bar].size, 
 				PCI_DEV_MAP_FLAG_WRITABLE,
 				&mmio);
-    
+
     if (err != 0) {
 	fprintf(stderr, "Couldn't map MMIO region: %s\n", strerror(err));
 	exit(1);
@@ -86,23 +125,31 @@ int main(int argc, char **argv)
     scrn.scrnIndex = 0;
     scrn.pI830 = &i830;
 
-    {
-        I830Ptr pI830 = I830PTR((&scrn));
+    /* if (IS_I965) { */
+    idle_flags = i965_idle_flags;
+    idle_flag_count = sizeof(i965_idle_flags) / sizeof(i965_idle_flags[0]);
+
+    setup_other_flags(idle_flags, idle_flag_count);
 
-	CARD32  idle_value = 0xffe5fafe;
+    for (;;) {
+	int i, j;
 
-	for (;;)
-	{
-	    CARD32	busy = 0;
-	    int		i;
+	for (i = 0; i < 100; i++) {
+	    uint32_t instdone = INREG(INST_DONE_I965);
 
-	    for (i = 0; i < 100; i++) {
-		if (INREG (INST_DONE_I965) != idle_value)
-		    busy++;
-		usleep (10000);
+	    for (j = 0; j < idle_flag_count; j++) {
+		if ((instdone & idle_flags[j].instdone_flag) == 0)
+		    idle_flags[j].count++;
 	    }
-	    printf ("load: %d\n", busy);
+
+	    usleep (10000);
+	}
+
+	for (j = 0; j < idle_flag_count; j++) {
+	    printf("%15s: %3d\n", idle_flags[j].name, idle_flags[j].count);
+	    idle_flags[j].count = 0;
 	}
+	printf("\n");
     }
 
     return 0;
commit aa3ac79759581b5eb05293a8cbcf89eb5b76712c
Author: Zhenyu Wang <zhenyu.z.wang at intel.com>
Date:   Thu Jan 31 18:26:46 2008 +0800

    Don't crash when a SW cursor is in use
    
    The SW cursor is used when selected by the device option or when hw cursor allocation fails.

diff --git a/src/i830_memory.c b/src/i830_memory.c
index d97ca0b..06c21ac 100644
--- a/src/i830_memory.c
+++ b/src/i830_memory.c
@@ -1946,7 +1946,8 @@ i830_bind_all_memory(ScrnInfoPtr pScrn)
 	}
 #endif
     }
-    i830_update_cursor_offsets(pScrn);
+    if (!pI830->SWCursor)
+	i830_update_cursor_offsets(pScrn);
 
     return TRUE;
 }
commit 958fa8527a49e281b4b275c4a9141de8e5066c93
Author: Eric Anholt <eric at anholt.net>
Date:   Mon Jan 28 20:19:07 2008 -0800

    Assert that 965 render object offsets match their alignment requirements.

diff --git a/src/i830.h b/src/i830.h
index ab3f760..4beefe0 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -64,6 +64,10 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "xf86RandR12.h"
 
 #include "xorg-server.h"
+/* The X Server tries to disable our assert()s. Knock that off. */
+#undef NDEBUG
+#include <assert.h>
+
 #ifdef XSERVER_LIBPCIACCESS
 #include <pciaccess.h>
 #endif
diff --git a/src/i965_render.c b/src/i965_render.c
index efcd81b..833cb41 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -508,6 +508,7 @@ sf_state_init (struct brw_sf_unit_state *sf_state, int kernel_offset)
     sf_state->sf6.dest_org_vbias = 0x8;
     sf_state->sf6.dest_org_hbias = 0x8;
 
+    assert((kernel_offset & 63) == 0);
     sf_state->thread0.kernel_start_pointer = kernel_offset >> 6;
 }
 
@@ -549,6 +550,7 @@ sampler_state_init (struct brw_sampler_state *sampler_state,
 	break;
     }
 
+    assert((default_color_offset & 31) == 0);
     sampler_state->ss2.default_color_pointer = default_color_offset >> 5;
 
     sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
@@ -565,6 +567,7 @@ wm_state_init (struct brw_wm_unit_state *wm_state,
     wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
     wm_state->thread1.single_program_flow = 1;
 
+    assert((scratch_offset & 1023) == 0);
     wm_state->thread2.scratch_space_base_pointer = scratch_offset >> 10;
 
     wm_state->thread2.per_thread_scratch_space = 0;
@@ -576,6 +579,7 @@ wm_state_init (struct brw_wm_unit_state *wm_state,
     wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
 
     wm_state->wm4.stats_enable = 1;  /* statistic */
+    assert((sampler_state_offset & 31) == 0);
     wm_state->wm4.sampler_state_pointer = sampler_state_offset >> 5;
     wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
     wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
@@ -588,6 +592,7 @@ wm_state_init (struct brw_wm_unit_state *wm_state,
     wm_state->wm5.enable_8_pix = 0;
     wm_state->wm5.early_depth_test = 1;
 
+    assert((kernel_offset & 63) == 0);
     wm_state->thread0.kernel_start_pointer = kernel_offset >> 6;
 
     /* Each pair of attributes (src/mask coords) is one URB entry */
@@ -614,6 +619,7 @@ cc_state_init (struct brw_cc_unit_state *cc_state,
     cc_state->cc3.blend_enable = 1;     /* enable color blend */
     cc_state->cc3.alpha_test = 0;       /* disable alpha test */
 
+    assert((cc_viewport_offset & 31) == 0);
     cc_state->cc4.cc_viewport_state_offset = cc_viewport_offset >> 5;
 
     cc_state->cc5.dither_enable = 0;    /* disable dither */
@@ -887,6 +893,7 @@ gen4_emit_batch_header (ScrnInfoPtr pScrn)
 
 	/* Set system instruction pointer */
 	OUT_BATCH(BRW_STATE_SIP | 0);
+	assert((sip_kernel_offset & 63) == 0);
 	OUT_BATCH(sip_kernel_offset);
 
 	/* URB fence */
@@ -966,7 +973,10 @@ i965_set_picture_surface_state(ScrnInfoPtr pScrn, unsigned int index,
 					 (is_dst ? DRM_BO_FLAG_WRITE : 0) |
 					 DRM_BO_FLAG_READ,
 					 pI830->exa965->surface_buf,
-					 offset + 4, 0);
+					 offset +
+					 offsetof(struct brw_surface_state,
+						  ss1),
+					 0);
     ss->ss2.mip_count = 0;
     ss->ss2.render_target_rotation = 0;
     ss->ss2.height = pPixmap->drawable.height - 1;
@@ -988,7 +998,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     Bool rotation_program = FALSE;
     int wm_state_offset;
     int sf_state_offset, cc_state_offset;
-    char *surface_start_base;
     void *surface_map;
     sampler_state_filter_t src_filter, mask_filter;
     sampler_state_extend_t src_extend, mask_extend;
@@ -1019,8 +1028,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     i965_exastate_reset(pI830->exa965);
     surface_map = pI830->exa965->surface_map;
 
-    surface_start_base = surface_map;
-
     IntelEmitInvarientState(pScrn);
     *pI830->last_3d = LAST_3D_RENDER;
 
@@ -1058,8 +1065,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     binding_table_offset = (offsetof (gen4_surface_state_t, binding_table) +
 			    sizeof (CARD32) * GEN4_BINDING_TABLE_PER_OP *
 			    pI830->exa965->num_ops);
-    binding_table = (void *)(surface_start_base +
-			     binding_table_offset);
+    binding_table = (void *)((char *)surface_map + binding_table_offset);
 
     /* Set up and bind the state buffer for the destination surface */
     binding_table[0] = i965_set_picture_surface_state(pScrn, 0,
@@ -1171,6 +1177,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
    	OUT_BATCH(0); /* clip */
    	OUT_BATCH(0); /* sf */
 	/* Only the PS uses the binding table */
+	assert((binding_table_offset & 31) == 0);
    	OUT_BATCH(binding_table_offset); /* ps */
 
 	/* The drawing rectangle clipping is always on.  Set it to values that
@@ -1189,11 +1196,15 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
 	/* Set the pointers to the 3d pipeline state */
    	OUT_BATCH(BRW_3DSTATE_PIPELINED_POINTERS | 5);
+	assert((offsetof (gen4_state_t, vs_state) & 31) == 0);
 	OUT_BATCH(offsetof (gen4_state_t, vs_state));  /* 32 byte aligned */
    	OUT_BATCH(BRW_GS_DISABLE);   /* disable GS, resulting in passthrough */
    	OUT_BATCH(BRW_CLIP_DISABLE); /* disable CLIP, resulting in passthrough */
+	assert((sf_state_offset & 31) == 0);
 	OUT_BATCH(sf_state_offset); /* 32 byte aligned */
+	assert((wm_state_offset & 31) == 0);
 	OUT_BATCH(wm_state_offset); /* 32 byte aligned */
+	assert((cc_state_offset & 63) == 0);
 	OUT_BATCH(cc_state_offset); /* 64 byte aligned */
 	ADVANCE_BATCH();
     }
commit 2e43bec8731ba1b172f7a0bf867bbb5c1adbda2d
Author: Jesse Barnes <jesse.barnes at intel.com>
Date:   Wed Jan 30 18:59:12 2008 +0800

    Frame buffer compression support on new chipset

diff --git a/src/i810_reg.h b/src/i810_reg.h
index bed3901..7902366 100644
--- a/src/i810_reg.h
+++ b/src/i810_reg.h
@@ -2634,4 +2634,28 @@ typedef enum {
 #define FBC_LL_SIZE		(1536)
 #define FBC_LL_PAD		(32)
 
+/* Framebuffer compression version 2 */
+#define DPFC_CB_BASE		0x3200
+#define DPFC_CONTROL		0x3208
+#define   DPFC_CTL_EN		(1<<31)
+#define   DPFC_CTL_PLANEA	(0<<30)
+#define   DPFC_CTL_PLANEB	(1<<30)
+#define   DPFC_CTL_FENCE_EN	(1<<29)
+#define   DPFC_CTL_LIMIT_1X	(0<<6)
+#define   DPFC_CTL_LIMIT_2X	(1<<6)
+#define   DPFC_CTL_LIMIT_4X	(2<<6)
+#define DPFC_RECOMP_CTL		0x320c
+#define   DPFC_RECOMP_STALL_EN	(1<<27)
+#define   DPFC_RECOMP_STALL_WM_SHIFT (16)
+#define   DPFC_RECOMP_STALL_WM_MASK (0x07ff0000)
+#define   DPFC_RECOMP_TIMER_COUNT_SHIFT (0)
+#define   DPFC_RECOMP_TIMER_COUNT_MASK (0x0000003f)
+#define DPFC_STATUS		0x3210
+#define   DPFC_INVAL_SEG_SHIFT  (16)
+#define   DPFC_INVAL_SEG_MASK	(0x07ff0000)
+#define   DPFC_COMP_SEG_SHIFT	(0)
+#define   DPFC_COMP_SEG_MASK	(0x000003ff)
+#define DPFC_STATUS2		0x3214
+#define DPFC_FENCE_YOFF		0x3218
+
 #endif /* _I810_REG_H */
diff --git a/src/i830_display.c b/src/i830_display.c
index f61d3c4..39f3637 100644
--- a/src/i830_display.c
+++ b/src/i830_display.c
@@ -571,9 +571,11 @@ i830_use_fb_compression(xf86CrtcPtr crtc)
  *   - SR display watermarks must be equal between 16bpp and 32bpp?
  *
  * FIXME: verify above conditions are true
+ *
+ * Enable 8xx style FB compression
  */
 static void
-i830_enable_fb_compression(xf86CrtcPtr crtc)
+i830_enable_fb_compression_8xx(xf86CrtcPtr crtc)
 {
     ScrnInfoPtr pScrn = crtc->scrn;
     I830Ptr pI830 = I830PTR(pScrn);
@@ -629,8 +631,11 @@ i830_enable_fb_compression(xf86CrtcPtr crtc)
 	       'b' : 'a');
 }
 
+/*
+ * Disable 8xx style FB compression
+ */
 static void
-i830_disable_fb_compression(xf86CrtcPtr crtc)
+i830_disable_fb_compression_8xx(xf86CrtcPtr crtc)
 {
     ScrnInfoPtr pScrn = crtc->scrn;
     I830Ptr pI830 = I830PTR(pScrn);
@@ -648,6 +653,86 @@ i830_disable_fb_compression(xf86CrtcPtr crtc)
     xf86DrvMsg(pScrn->scrnIndex, X_INFO, "fbc disabled on plane %c\n", plane);
 }
 
+static void
+i830_disable_fb_compression2(xf86CrtcPtr crtc)
+{
+    ScrnInfoPtr pScrn = crtc->scrn;
+    I830Ptr pI830 = I830PTR(pScrn);
+    uint32_t dpfc_ctl;
+    char plane = (INREG(DPFC_CONTROL) & DPFC_CTL_PLANEB) ? 'b' : 'a'; /* for the log message only */
+
+    /* Disable fbc2: clear the enable bit in DPFC_CONTROL, then wait for a vblank */
+    dpfc_ctl = INREG(DPFC_CONTROL);
+    dpfc_ctl &= ~DPFC_CTL_EN;
+    OUTREG(DPFC_CONTROL, dpfc_ctl);
+    i830WaitForVblank(pScrn);
+
+    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "fbc2 disabled on plane %c\n", plane);
+}
+
+static void
+i830_enable_fb_compression2(xf86CrtcPtr crtc)
+{
+    ScrnInfoPtr pScrn = crtc->scrn;
+    I830Ptr pI830 = I830PTR(pScrn);
+    I830CrtcPrivatePtr	intel_crtc = crtc->driver_private;
+    int plane = (intel_crtc->plane == 0 ? DPFC_CTL_PLANEA : DPFC_CTL_PLANEB);
+    unsigned long stall_watermark = 200, frames = 50; /* written to DPFC_RECOMP_CTL below */
+
+    if (INREG(DPFC_CONTROL) & DPFC_CTL_EN) {
+	char cur_plane = (INREG(DPFC_CONTROL) & DPFC_CTL_PLANEB) ? 'b' : 'a';
+	xf86DrvMsg(pScrn->scrnIndex, X_WARNING, "fbc2 already enabled on "
+		   "plane %c, not enabling on plane %c\n", cur_plane,
+		   plane ? 'b' : 'a');
+	return; /* leave the current fbc2 configuration untouched */
+    }
+
+    /* Program buffer base, control, recompression and fence offset with the enable bit still clear */
+    i830_disable_fb_compression2(crtc);
+    OUTREG(DPFC_CB_BASE, pI830->compressed_front_buffer->offset);
+    /* Update i830_memory.c too if compression ratio (DPFC_CTL_LIMIT_4X) changes */
+    OUTREG(DPFC_CONTROL, plane | DPFC_CTL_FENCE_EN | DPFC_CTL_LIMIT_4X |
+	   pI830->front_buffer->fence_nr);
+    OUTREG(DPFC_RECOMP_CTL, DPFC_RECOMP_STALL_EN |
+	   (stall_watermark << DPFC_RECOMP_STALL_WM_SHIFT) |
+	   (frames << DPFC_RECOMP_TIMER_COUNT_SHIFT));
+    OUTREG(DPFC_FENCE_YOFF, crtc->y);
+
+    /* Zero the whole compressed front buffer, addressed as FbBase + offset */
+    memset(pI830->FbBase + pI830->compressed_front_buffer->offset, 0,
+	   pI830->compressed_front_buffer->size);
+
+    /* Finally set the enable bit, keeping the control bits written above */
+    OUTREG(DPFC_CONTROL, INREG(DPFC_CONTROL) | DPFC_CTL_EN);
+
+    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "fbc2 enabled on plane %c\n", plane ?
+	       'b' : 'a');
+}
+
+/* Enable FB compression: DPFC ("fbc2") path on IGD_GM, 8xx style path otherwise. */
+static void
+i830_enable_fb_compression(xf86CrtcPtr crtc)
+{
+    I830Ptr pI830 = I830PTR(crtc->scrn);
+
+    if (IS_IGD_GM(pI830))
+	i830_enable_fb_compression2(crtc);
+    else
+	i830_enable_fb_compression_8xx(crtc);
+}
+
+/* Disable FB compression: DPFC ("fbc2") path on IGD_GM, 8xx style path otherwise. */
+static void
+i830_disable_fb_compression(xf86CrtcPtr crtc)
+{
+    I830Ptr pI830 = I830PTR(crtc->scrn);
+
+    if (IS_IGD_GM(pI830))
+	i830_disable_fb_compression2(crtc);
+    else
+	i830_disable_fb_compression_8xx(crtc);
+}
+
 /**
  * Sets the power management mode of the pipe and plane.
  *
diff --git a/src/i830_memory.c b/src/i830_memory.c
index 07e4010..d97ca0b 100644
--- a/src/i830_memory.c
+++ b/src/i830_memory.c
@@ -1274,6 +1274,13 @@ i830_allocate_cursor_buffers(ScrnInfoPtr pScrn)
 static void i830_setup_fb_compression(ScrnInfoPtr pScrn)
 {
     I830Ptr pI830 = I830PTR(pScrn);
+    unsigned long compressed_size;
+    unsigned long fb_height;
+
+    if (pScrn->virtualX > pScrn->virtualY)
+	fb_height = pScrn->virtualX;
+    else
+	fb_height = pScrn->virtualY;
 
     /* Only mobile chips since 845 support this feature */
     if (!IS_MOBILE(pI830)) {
@@ -1281,11 +1288,12 @@ static void i830_setup_fb_compression(ScrnInfoPtr pScrn)
 	goto out;
     }
 
-    /* Clear out any stale state */
-    OUTREG(FBC_CFB_BASE, 0);
-    OUTREG(FBC_LL_BASE, 0);
-    OUTREG(FBC_CONTROL2, 0);
-    OUTREG(FBC_CONTROL, 0);
+    if (IS_IGD_GM(pI830)) {
+	/* Update i830_display.c too if compression ratio changes */
+	compressed_size = fb_height * (pScrn->displayWidth / 4);
+    } else {
+	compressed_size = MB(6);
+    }
 
     /*
      * Compressed framebuffer limitations:
@@ -1300,21 +1308,23 @@ static void i830_setup_fb_compression(ScrnInfoPtr pScrn)
      */
     pI830->compressed_front_buffer =
 	i830_allocate_memory(pScrn, "compressed frame buffer",
-			     MB(6), KB(4), NEED_PHYSICAL_ADDR);
+			     compressed_size, KB(4), NEED_PHYSICAL_ADDR);
 
     if (!pI830->compressed_front_buffer) {
 	pI830->fb_compression = FALSE;
 	goto out;
     }
 
-    pI830->compressed_ll_buffer =
-	i830_allocate_memory(pScrn, "compressed ll buffer",
-			     FBC_LL_SIZE + FBC_LL_PAD, KB(4),
-			     NEED_PHYSICAL_ADDR);
-    if (!pI830->compressed_ll_buffer) {
-	i830_free_memory(pScrn, pI830->compressed_front_buffer);
-	pI830->fb_compression = FALSE;
-	goto out;
+    if (!IS_IGD_GM(pI830)) {
+	pI830->compressed_ll_buffer =
+	    i830_allocate_memory(pScrn, "compressed ll buffer",
+				 FBC_LL_SIZE + FBC_LL_PAD, KB(4),
+				 NEED_PHYSICAL_ADDR);
+	if (!pI830->compressed_ll_buffer) {
+	    i830_free_memory(pScrn, pI830->compressed_front_buffer);
+	    pI830->fb_compression = FALSE;
+	    goto out;
+	}
     }
 
 out:
commit bf629466a46c4037ec7b7cc5ee16be947618bd68
Author: Zhenyu Wang <zhenyu.z.wang at intel.com>
Date:   Wed Jan 30 18:55:20 2008 +0800

    hardware status page initialization rework
    
    Move hardware status page setup to a more reasonable point, after
    all memory has been bound, in case a new chipset requires a
    non-stolen page, which can only be bound at that point.
    
    Also clean up the drm irq handler install function, and do the
    first install during startup, after status page setup, so the
    device does not complain about an uninitialized status page.

diff --git a/src/i830.h b/src/i830.h
index 9adbaf7..e55e110 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -695,7 +695,9 @@ extern Bool I830DRIFinishScreenInit(ScreenPtr pScreen);
 extern void I830DRIUnlock(ScrnInfoPtr pScrn);
 extern Bool I830DRILock(ScrnInfoPtr pScrn);
 extern Bool I830DRISetVBlankInterrupt (ScrnInfoPtr pScrn, Bool on);
-Bool i830_update_dri_buffers(ScrnInfoPtr pScrn);
+extern Bool i830_update_dri_buffers(ScrnInfoPtr pScrn);
+extern Bool I830DRISetHWS(ScrnInfoPtr pScrn);
+extern Bool I830DRIInstIrqHandler(ScrnInfoPtr pScrn);
 #endif
 
 unsigned long intel_get_pixmap_offset(PixmapPtr pPix);
diff --git a/src/i830_dri.c b/src/i830_dri.c
index 3400b38..141b970 100644
--- a/src/i830_dri.c
+++ b/src/i830_dri.c
@@ -236,18 +236,18 @@ I830SetParam(ScrnInfoPtr pScrn, int param, int value)
    return TRUE;
 }
 
-static Bool
-I830SetHWS(ScrnInfoPtr pScrn, int addr)
+Bool
+I830DRISetHWS(ScrnInfoPtr pScrn)
 {
     I830Ptr pI830 = I830PTR(pScrn);
     drmI830HWS hws;
 
-    hws.addr = addr;
+    hws.addr = pI830->hw_status->offset;
 
     if (drmCommandWrite(pI830->drmSubFD, DRM_I830_HWS_PAGE_ADDR,
 		&hws, sizeof(drmI830HWS))) {
 	xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
-		"G33 status page initialization Failed\n");
+		"hw status page initialization Failed\n");
 	return FALSE;
     }
     return TRUE;
@@ -813,12 +813,6 @@ I830DRIDoMappings(ScreenPtr pScreen)
       return FALSE;
    }
 
-   if (HWS_NEED_GFX(pI830)) {
-       if (!I830SetHWS(pScrn, pI830->hw_status->offset)) {
-	   DRICloseScreen(pScreen);
-	   return FALSE;
-       }
-   }
    /* init to zero to be safe */
    sarea->front_handle = 0;
    sarea->back_handle = 0;
@@ -881,18 +875,12 @@ I830DRIDoMappings(ScreenPtr pScreen)
 }
 
 Bool
-I830DRIResume(ScreenPtr pScreen)
+I830DRIInstIrqHandler(ScrnInfoPtr pScrn)
 {
-   ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
    I830Ptr pI830 = I830PTR(pScrn);
    I830DRIPtr pI830DRI = (I830DRIPtr) pI830->pDRIInfo->devPrivate;
 
-   DPRINTF(PFX, "I830DRIResume\n");
-
-   I830ResumeDma(pScrn);
-
-   {
-      pI830DRI->irq = drmGetInterruptFromBusID(pI830->drmSubFD,
+   pI830DRI->irq = drmGetInterruptFromBusID(pI830->drmSubFD,
 #if XSERVER_LIBPCIACCESS
 					       ((pI830->PciInfo->domain << 8) |
 						pI830->PciInfo->bus),
@@ -908,19 +896,31 @@ I830DRIResume(ScreenPtr pScreen)
 #endif
 					       );
 
-      if (drmCtlInstHandler(pI830->drmSubFD, pI830DRI->irq)) {
-	 xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
-		    "[drm] failure adding irq handler\n");
-	 pI830DRI->irq = 0;
-	 return FALSE;
-      }
-      else
-	 xf86DrvMsg(pScrn->scrnIndex, X_INFO,
-		    "[drm] dma control initialized, using IRQ %d\n",
-		    pI830DRI->irq);
-   }
+   if (drmCtlInstHandler(pI830->drmSubFD, pI830DRI->irq)) {
+       xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+	       "[drm] failure adding irq handler\n");
+       pI830DRI->irq = 0;
+       return FALSE;
+   } else
+       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
+	       "[drm] dma control initialized, using IRQ %d\n",
+	       pI830DRI->irq);
 
-   return FALSE;
+   return TRUE;
+}
+
+/* Resume DRI: call I830ResumeDma(), then reinstall the DRM IRQ handler. */
+Bool
+I830DRIResume(ScreenPtr pScreen)
+{
+   ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
+
+   DPRINTF(PFX, "I830DRIResume\n");
+
+   I830ResumeDma(pScrn);
+
+   /* Report an IRQ handler install failure instead of always returning TRUE */
+   return I830DRIInstIrqHandler(pScrn);
 }
 
 void
@@ -976,47 +976,16 @@ I830DestroyContext(ScreenPtr pScreen, drm_context_t hwContext,
 Bool
 I830DRIFinishScreenInit(ScreenPtr pScreen)
 {
-   ScrnInfoPtr        pScrn = xf86Screens[pScreen->myNum];
-   I830Ptr pI830 = I830PTR(pScrn);
-
    DPRINTF(PFX, "I830DRIFinishScreenInit\n");
 
    if (!DRIFinishScreenInit(pScreen))
       return FALSE;
 
-   /* Okay now initialize the dma engine */
-   {
-      I830DRIPtr pI830DRI = (I830DRIPtr) pI830->pDRIInfo->devPrivate;
-
-      pI830DRI->irq = drmGetInterruptFromBusID(pI830->drmSubFD,
-#if XSERVER_LIBPCIACCESS
-					       ((pI830->PciInfo->domain << 8) |
-						pI830->PciInfo->bus),
-					       pI830->PciInfo->dev,
-					       pI830->PciInfo->func
-#else
-					       ((pciConfigPtr) pI830->
-						PciInfo->thisCard)->busnum,
-					       ((pciConfigPtr) pI830->
-						PciInfo->thisCard)->devnum,
-					       ((pciConfigPtr) pI830->
-						PciInfo->thisCard)->funcnum
-#endif
-					       );
-
-      if (drmCtlInstHandler(pI830->drmSubFD, pI830DRI->irq)) {
-	 xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
-		    "[drm] failure adding irq handler\n");
-	 pI830DRI->irq = 0;
-	 DRICloseScreen(pScreen);
-	 return FALSE;
-      }
-      else
-	 xf86DrvMsg(pScrn->scrnIndex, X_INFO,
-		    "[drm] dma control initialized, using IRQ %d\n",
-		    pI830DRI->irq);
-	 return TRUE;
-   }
+   /* move irq initialize later in EnterVT, as then we
+    * would finish binding possible hw status page, which
+    * requires irq ctrl ioctl not be called that early.
+    */
+   return TRUE;
 }
 
 #ifdef DAMAGE
diff --git a/src/i830_driver.c b/src/i830_driver.c
index dffc630..a3c64de 100644
--- a/src/i830_driver.c
+++ b/src/i830_driver.c
@@ -3184,6 +3184,22 @@ I830EnterVT(int scrnIndex, int flags)
 
 #ifdef XF86DRI
    if (pI830->directRenderingEnabled) {
+       /* HW status is fixed, we need to set it up before any drm
+	* operation which accessing that page, like irq install, etc.
+	*/
+       if (pI830->starting) {
+	   if (HWS_NEED_GFX(pI830) && !I830DRISetHWS(pScrn)) {
+		   xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+			   "Fail to setup hardware status page.\n");
+		   I830DRICloseScreen(pScrn->pScreen);
+		   return FALSE;
+	   }
+	   if (!I830DRIInstIrqHandler(pScrn)) {
+	       I830DRICloseScreen(pScrn->pScreen);
+	       return FALSE;
+	   }
+       }
+
       /* Update buffer offsets in sarea and mappings, since buffer offsets
        * may have changed.
        */
diff --git a/src/i830_memory.c b/src/i830_memory.c
index 8a5262f..07e4010 100644
--- a/src/i830_memory.c
+++ b/src/i830_memory.c
@@ -468,6 +468,9 @@ i830_allocator_init(ScrnInfoPtr pScrn, unsigned long offset, unsigned long size)
 	/* Can't do TTM on stolen memory */
 	mmsize -= pI830->stolen_size;
 
+	if (HWS_NEED_GFX(pI830) && IS_IGD_GM(pI830))
+	    mmsize -= HWSTATUS_PAGE_SIZE;
+
 	/* Create the aperture allocation */
 	pI830->memory_manager =
 	    i830_allocate_aperture(pScrn, "DRI memory manager",
@@ -1630,13 +1633,17 @@ static Bool
 i830_allocate_hwstatus(ScrnInfoPtr pScrn)
 {
     I830Ptr pI830 = I830PTR(pScrn);
+    int flags;
 
     /* The current DRM will leak the HWS mapping if we update the address
      * after init (at best), so allocate it fixed for its lifetime
      * (i.e. not through buffer objects).
      */
+    flags = NEED_LIFETIME_FIXED;
+    if (IS_IGD_GM(pI830))
+	    flags |= NEED_NON_STOLEN;
     pI830->hw_status = i830_allocate_memory(pScrn, "HW status",
-	    HWSTATUS_PAGE_SIZE, GTT_PAGE_SIZE, NEED_LIFETIME_FIXED);
+	    HWSTATUS_PAGE_SIZE, GTT_PAGE_SIZE, flags);
     if (pI830->hw_status == NULL) {
 	xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
 		"Failed to allocate hw status page.\n");
commit 04032dad28baab80131edbe8fe58aade8149bb71
Author: Zhenyu Wang <zhenyu.z.wang at intel.com>
Date:   Wed Jan 30 18:52:32 2008 +0800

    Wrap up chipsets which needs graphics address for status page
    
    Also add support on new chipset.

diff --git a/src/common.h b/src/common.h
index 3a11e59..c0af1ad 100644
--- a/src/common.h
+++ b/src/common.h
@@ -441,6 +441,8 @@ extern int I810_DEBUG;
 #define IS_MOBILE(pI810) (IS_I830(pI810) || IS_I85X(pI810) || IS_I915GM(pI810) || IS_I945GM(pI810) || IS_I965GM(pI810) || IS_IGD_GM(pI810))
 /* mark chipsets for using gfx VM offset for overlay */
 #define OVERLAY_NOPHYSICAL(pI810) (IS_G33CLASS(pI810))
+/* chipsets require graphics mem for hardware status page */
+#define HWS_NEED_GFX(pI810) (IS_G33CLASS(pI810) || IS_IGD_GM(pI810))
 
 #define GTT_PAGE_SIZE			KB(4)
 #define ROUND_TO(x, y)			(((x) + (y) - 1) / (y) * (y))
diff --git a/src/i830_dri.c b/src/i830_dri.c
index f52a7c3..3400b38 100644
--- a/src/i830_dri.c
+++ b/src/i830_dri.c
@@ -813,7 +813,7 @@ I830DRIDoMappings(ScreenPtr pScreen)
       return FALSE;
    }
 
-   if (IS_G33CLASS(pI830)) {
+   if (HWS_NEED_GFX(pI830)) {
        if (!I830SetHWS(pScrn, pI830->hw_status->offset)) {
 	   DRICloseScreen(pScreen);
 	   return FALSE;
diff --git a/src/i830_memory.c b/src/i830_memory.c
index 85b6528..8a5262f 100644
--- a/src/i830_memory.c
+++ b/src/i830_memory.c
@@ -1635,11 +1635,11 @@ i830_allocate_hwstatus(ScrnInfoPtr pScrn)
      * after init (at best), so allocate it fixed for its lifetime
      * (i.e. not through buffer objects).
      */
-    pI830->hw_status = i830_allocate_memory(pScrn, "G33 hw status",
+    pI830->hw_status = i830_allocate_memory(pScrn, "HW status",
 	    HWSTATUS_PAGE_SIZE, GTT_PAGE_SIZE, NEED_LIFETIME_FIXED);
     if (pI830->hw_status == NULL) {
 	xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
-		"Failed to allocate hw status page for G33.\n");
+		"Failed to allocate hw status page.\n");
 	return FALSE;
     }
     return TRUE;
@@ -1652,7 +1652,7 @@ i830_allocate_3d_memory(ScrnInfoPtr pScrn)
 
     DPRINTF(PFX, "i830_allocate_3d_memory\n");
 
-    if (IS_G33CLASS(pI830)) {
+    if (HWS_NEED_GFX(pI830)) {
 	if (!i830_allocate_hwstatus(pScrn))
 	    return FALSE;
     }
commit b9c3fa79d9fd48c34536f92e6c1c70fe3a619410
Author: Keith Packard <keithp at keithp.com>
Date:   Fri Jan 11 15:51:00 2008 -0800

    Add intel_idle to measure GPU load ratio.
    
    Samples GPU idle register to detect when the GPU is busy; prints out the
    number of busy samples per 100 total samples once per second.

diff --git a/src/reg_dumper/Makefile.am b/src/reg_dumper/Makefile.am
index aee26d0..bba388f 100644
--- a/src/reg_dumper/Makefile.am
+++ b/src/reg_dumper/Makefile.am
@@ -1,4 +1,4 @@
-noinst_PROGRAMS = intel_reg_dumper
+noinst_PROGRAMS = intel_reg_dumper intel_idle
 
 intel_reg_dumper_SOURCES = \
 	main.c \
@@ -6,7 +6,16 @@ intel_reg_dumper_SOURCES = \
 	xprintf.c \
 	../i830_debug.c
 
+intel_idle_SOURCES = \
+	idle.c \
+	reg_dumper.h \
+	xprintf.c \
+	../i830_debug.c
+
 intel_reg_dumper_LDADD = $(PCIACCESS_LIBS)
 
-intel_reg_dumper_CFLAGS = $(PCIACCESS_CFLAGS) $(WARN_CFLAGS) \
+intel_idle_LDADD = $(PCIACCESS_LIBS)
+
+AM_CFLAGS = $(PCIACCESS_CFLAGS) $(WARN_CFLAGS) \
 	-I$(srcdir)/.. -DREG_DUMPER
+
diff --git a/src/reg_dumper/idle.c b/src/reg_dumper/idle.c
new file mode 100644
index 0000000..ec083fd
--- /dev/null
+++ b/src/reg_dumper/idle.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright © 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric at anholt.net>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <pciaccess.h>
+#include <err.h>
+#include <unistd.h>
+
+#include "reg_dumper.h"
+#include "../i810_reg.h"
+
+/* Poll INST_DONE_I965 and print, forever, how many of each 100 samples were busy. */
+int main(int argc, char **argv)
+{
+    struct pci_device *dev;
+    I830Rec i830;
+    ScrnInfoRec scrn;
+    int err, mmio_bar;
+    void *mmio;
+
+    err = pci_system_init();
+    if (err != 0) {
+	fprintf(stderr, "Couldn't initialize PCI system: %s\n", strerror(err));
+	exit(1);
+    }
+
+    /* Grab the graphics card */
+    dev = pci_device_find_by_slot(0, 0, 2, 0);
+    if (dev == NULL)
+	errx(1, "Couldn't find graphics card");
+
+    err = pci_device_probe(dev);
+    if (err != 0) {
+	fprintf(stderr, "Couldn't probe graphics card: %s\n", strerror(err));
+	exit(1);
+    }
+
+    if (dev->vendor_id != 0x8086)
+	errx(1, "Graphics card is non-intel");
+
+    i830.PciInfo = &i830.pci_info_rec;
+    i830.PciInfo->chipType = dev->device_id;
+    i830.pci_dev = dev;
+
+    mmio_bar = IS_I9XX((&i830)) ? 0 : 1;
+
+    err = pci_device_map_range (dev,
+				dev->regions[mmio_bar].base_addr,
+				dev->regions[mmio_bar].size,
+				PCI_DEV_MAP_FLAG_WRITABLE,
+				&mmio);
+    if (err != 0) {
+	fprintf(stderr, "Couldn't map MMIO region: %s\n", strerror(err));
+	exit(1);
+    }
+    i830.mmio = mmio;
+
+    scrn.scrnIndex = 0;
+    scrn.pI830 = &i830;
+
+    {
+        I830Ptr pI830 = I830PTR((&scrn));
+	/* INST_DONE_I965 value that is treated as "GPU idle" */
+	CARD32  idle_value = 0xffe5fafe;
+
+	for (;;)
+	{
+	    CARD32	busy = 0;
+	    int		i;
+
+	    for (i = 0; i < 100; i++) {
+		if (INREG (INST_DONE_I965) != idle_value)
+		    busy++;
+		usleep (10000);
+	    }
+	    /* busy is an unsigned CARD32: print it with %u, not %d */
+	    printf ("load: %u\n", (unsigned int) busy);
+	}
+    }
+
+    return 0;
+}
+
+/* xf86DrvMsg() for this standalone tool: severity tag, then the message, on stdout. */
+void xf86DrvMsg(int scrnIndex, int severity, const char *format, ...)
+{
+    const char *tag = NULL;
+    va_list args;
+
+    if (severity == X_INFO)
+	tag = "(II): ";
+    else if (severity == X_WARNING)
+	tag = "(WW): ";
+    else if (severity == X_ERROR)
+	tag = "(EE): ";
+
+    if (tag != NULL)	/* any other severity is printed untagged */
+	printf("%s", tag);
+
+    va_start(args, format);
+    vprintf(format, args);
+    va_end(args);
+}
commit 4eda765ba577d0b06048012945aa73b64ee08664
Author: Eric Anholt <eric at anholt.net>
Date:   Mon Jan 28 19:36:18 2008 -0800

    Emit invarient 3D state before we mark that we've done it.

diff --git a/src/i965_render.c b/src/i965_render.c
index df50d6a..efcd81b 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -839,8 +839,6 @@ gen4_emit_batch_header (ScrnInfoPtr pScrn)
     urb_cs_start = urb_sf_start + urb_sf_size;
     urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
 
-    IntelEmitInvarientState(pScrn);
-
     sip_kernel_offset = offsetof (gen4_state_t, sip_kernel);
 
     /* Begin the long sequence of commands needed to set up the 3D
@@ -1023,6 +1021,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
     surface_start_base = surface_map;
 
+    IntelEmitInvarientState(pScrn);
     *pI830->last_3d = LAST_3D_RENDER;
 
     pI830->scale_units[0][0] = pSrc->drawable.width;
@@ -1153,8 +1152,18 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     if (pI830->exa965->num_ops == 0)
 	gen4_emit_batch_header (pScrn);
 
-    {
-	BEGIN_BATCH(18);
+     {
+	BEGIN_BATCH(19);
+	/* Flush the map (texture) cache.  The rendering cache covers the blit
+	 * and 3D destination parts of the engine and automatically flushes
+	 * between them, but the map cache has to be flushed separately.
+	 *
+	 * The remaining caches (in particular vertex and instruction) only
+	 * need to be flushed at the start of the batchbuffer, which we do.
+	 * A bare MI_FLUSH does happen to flush vertex cache anyway.
+	 */
+	OUT_BATCH(MI_FLUSH);
+
 	/* Binding table pointers */
    	OUT_BATCH(BRW_3DSTATE_BINDING_TABLE_POINTERS | 4);
    	OUT_BATCH(0); /* vs */
commit 5d9e67aec3ce458d22b7febd3819542abb360534
Author: Erik Andren <erik.andren at gmail.com>
Date:   Fri Jan 25 11:06:01 2008 +0800

    Clevo M720R has no TV-out

diff --git a/src/i830_quirks.c b/src/i830_quirks.c
index cb43603..853a33e 100644
--- a/src/i830_quirks.c
+++ b/src/i830_quirks.c
@@ -69,6 +69,9 @@ static i830_quirk i830_quirk_list[] = {
     /* Apple Mac mini has no lvds, but macbook pro does */
     { PCI_CHIP_I945_GM, 0x8086, 0x7270, quirk_mac_mini },
 
+    /* Clevo M720R has no tv output */
+    { PCI_CHIP_I965_GM, 0x1558, 0x0721, quirk_ignore_tv },
+
     /* Dell Latitude X1 */
     { PCI_CHIP_I915_GM, 0x1028, 0x01a3, quirk_ignore_tv },
     /* Dell XPS 1330 */
commit 6bf53eb48f40ad0c8ea9679ee634447410821b4f
Author: Zhenyu Wang <zhenyu.z.wang at intel.com>
Date:   Fri Jan 25 16:59:13 2008 +0800

    Set vtSema before EnterVT
    
    This was missing in our ScreenInit and initial EnterVT.
    It not only causes failure of initial rotation with TTM,
    as we won't bind the rotate_mem allocation in this case, but also
    hides another bug: we call a randr12 function in I830LoadPalette
    before we call xf86RandR12Init.

diff --git a/src/i830_driver.c b/src/i830_driver.c
index 7077456..dffc630 100644
--- a/src/i830_driver.c
+++ b/src/i830_driver.c
@@ -2934,9 +2934,25 @@ I830ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
       pI830->directRenderingEnabled = I830DRIFinishScreenInit(pScreen);
 #endif
 
+   /* Must force it before EnterVT, so we are in control of VT and
+    * later memory should be bound when allocating, e.g rotate_mem */
+   pScrn->vtSema = TRUE;
+
    if (!I830EnterVT(scrnIndex, 0))
       return FALSE;
 
+   pI830->BlockHandler = pScreen->BlockHandler;
+   pScreen->BlockHandler = I830BlockHandler;
+
+   pScreen->SaveScreen = xf86SaveScreen;
+   pI830->CloseScreen = pScreen->CloseScreen;
+   pScreen->CloseScreen = I830CloseScreen;
+   pI830->CreateScreenResources = pScreen->CreateScreenResources;
+   pScreen->CreateScreenResources = i830CreateScreenResources;
+
+   if (!xf86CrtcScreenInit (pScreen))
+       return FALSE;
+
    DPRINTF(PFX, "assert( if(!miCreateDefColormap(pScreen)) )\n");
    if (!miCreateDefColormap(pScreen))
       return FALSE;
@@ -2973,18 +2989,7 @@ I830ScreenInit(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "direct rendering: Not available\n");
 #endif
 
-   pI830->BlockHandler = pScreen->BlockHandler;
-   pScreen->BlockHandler = I830BlockHandler;
 
-   pScreen->SaveScreen = xf86SaveScreen;
-   pI830->CloseScreen = pScreen->CloseScreen;
-   pScreen->CloseScreen = I830CloseScreen;
-   pI830->CreateScreenResources = pScreen->CreateScreenResources;
-   pScreen->CreateScreenResources = i830CreateScreenResources;
-
-   if (!xf86CrtcScreenInit (pScreen))
-       return FALSE;
-       
    /* Wrap pointer motion to flip touch screen around */
    pI830->PointerMoved = pScrn->PointerMoved;
    pScrn->PointerMoved = I830PointerMoved;
commit 2c5f73feb7fc16400fa3f13c5b0d455264ae721d
Author: Eric Anholt <eric at anholt.net>
Date:   Thu Jan 24 16:49:52 2008 -0800

    Actually disable state max addresses instead of setting a max of 256MB.

diff --git a/src/i965_render.c b/src/i965_render.c
index e60c375..df50d6a 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -883,9 +883,9 @@ gen4_emit_batch_header (ScrnInfoPtr pScrn)
 
 	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
 	/* general state max addr, disabled */
-	OUT_BATCH(0x10000000 | BASE_ADDRESS_MODIFY);
+	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
 	/* media object state max addr, disabled */
-	OUT_BATCH(0x10000000 | BASE_ADDRESS_MODIFY);
+	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
 
 	/* Set system instruction pointer */
 	OUT_BATCH(BRW_STATE_SIP | 0);
commit 746c7f27fcb818775710763c6e181d0bc72be361
Author: Eric Anholt <eric at anholt.net>
Date:   Thu Jan 24 16:47:54 2008 -0800

    Refactor surface state setup into common code for dst/src/mask.
    
    The format code could probably be merged together as well.

diff --git a/src/i965_render.c b/src/i965_render.c
index 6f725a2..e60c375 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -166,7 +166,7 @@ static void i965_get_blend_cntl(int op, PicturePtr pMask, CARD32 dst_format,
 
 }
 
-static Bool i965_get_dest_format(PicturePtr pDstPicture, CARD32 *dst_format)
+static Bool i965_get_dest_format(PicturePtr pDstPicture, uint32_t *dst_format)
 {
     switch (pDstPicture->format) {
     case PICT_a8r8g8b8:
@@ -233,7 +233,7 @@ Bool
 i965_check_composite(int op, PicturePtr pSrcPicture, PicturePtr pMaskPicture,
 		     PicturePtr pDstPicture)
 {
-    CARD32 tmp1;
+    uint32_t tmp1;
 
     /* Check for unsupported compositing operations. */
     if (op >= sizeof(i965_blend_op) / sizeof(i965_blend_op[0]))
@@ -269,10 +269,6 @@ i965_check_composite(int op, PicturePtr pSrcPicture, PicturePtr pMaskPicture,
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define BRW_GRF_BLOCKS(nreg)    ((nreg + 15) / 16 - 1)
 
-/* these offsets will remain the same for all buffers post allocation */
-static int dest_surf_offset, src_surf_offset, mask_surf_offset;
-static int binding_table_offset;
-
 static const CARD32 sip_kernel_static[][4] = {
 /*    wait (1) a0<1>UW a145<0,1,0>UW { align1 +  } */
     { 0x00000030, 0x20000108, 0x00001220, 0x00000000 },
@@ -738,74 +734,6 @@ gen4_state_init (gen4_state_t *state)
 	    sizeof (ps_kernel_rotation_static));
 }
 
-static void
-gen4_surface_state_init (unsigned char *start_base,
-			 struct i965_exastate_buffer *state)
-{
-    struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
-    unsigned int surf_state_offset = offsetof (gen4_surface_state_t,
-					       surface_state);
-
-    binding_table_offset = (offsetof (gen4_surface_state_t, binding_table) +
-			    sizeof (CARD32) * GEN4_BINDING_TABLE_PER_OP *
-			    state->num_ops);
-
-    /* destination surface state */
-    dest_surf_offset = (surf_state_offset +
-			sizeof (brw_surface_state_padded) *
-			(GEN4_SURFACE_STATE_PER_OP * state->num_ops + 0));
-    dest_surf_state = (void *)(start_base + dest_surf_offset);
-    dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
-    dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
-    dest_surf_state->ss0.writedisable_alpha = 0;
-    dest_surf_state->ss0.writedisable_red = 0;
-    dest_surf_state->ss0.writedisable_green = 0;
-    dest_surf_state->ss0.writedisable_blue = 0;
-    dest_surf_state->ss0.color_blend = 1;
-    dest_surf_state->ss0.vert_line_stride = 0;
-    dest_surf_state->ss0.vert_line_stride_ofs = 0;
-    dest_surf_state->ss0.mipmap_layout_mode = 0;
-    dest_surf_state->ss0.render_cache_read_mode = 0;
-    dest_surf_state->ss2.mip_count = 0;
-    dest_surf_state->ss2.render_target_rotation = 0;
-
-    /* source surface state */
-    src_surf_offset = (surf_state_offset +
-		       sizeof (brw_surface_state_padded) *
-		       (GEN4_SURFACE_STATE_PER_OP * state->num_ops + 1));
-    src_surf_state = (void *)(start_base + src_surf_offset);
-    src_surf_state->ss0.surface_type = BRW_SURFACE_2D;
-    src_surf_state->ss0.writedisable_alpha = 0;
-    src_surf_state->ss0.writedisable_red = 0;
-    src_surf_state->ss0.writedisable_green = 0;
-    src_surf_state->ss0.writedisable_blue = 0;
-    src_surf_state->ss0.color_blend = 1;
-    src_surf_state->ss0.vert_line_stride = 0;
-    src_surf_state->ss0.vert_line_stride_ofs = 0;
-    src_surf_state->ss0.mipmap_layout_mode = 0;
-    src_surf_state->ss0.render_cache_read_mode = 0;
-    src_surf_state->ss2.mip_count = 0;
-    src_surf_state->ss2.render_target_rotation = 0;
-
-    /* mask surface state */
-    mask_surf_offset = (surf_state_offset +
-			sizeof (brw_surface_state_padded) *
-			(GEN4_SURFACE_STATE_PER_OP * state->num_ops + 2));
-    mask_surf_state = (void *)(start_base + mask_surf_offset);
-    mask_surf_state->ss0.surface_type = BRW_SURFACE_2D;
-    mask_surf_state->ss0.writedisable_alpha = 0;
-    mask_surf_state->ss0.writedisable_red = 0;
-    mask_surf_state->ss0.writedisable_green = 0;
-    mask_surf_state->ss0.writedisable_blue = 0;
-    mask_surf_state->ss0.color_blend = 1;
-    mask_surf_state->ss0.vert_line_stride = 0;
-    mask_surf_state->ss0.vert_line_stride_ofs = 0;
-    mask_surf_state->ss0.mipmap_layout_mode = 0;
-    mask_surf_state->ss0.render_cache_read_mode = 0;
-    mask_surf_state->ss2.mip_count = 0;
-    mask_surf_state->ss2.render_target_rotation = 0;
-}
-
 /**
  * Called from intel_batchbuffer_flush when we're about to flush a batch
  * buffer and start a new one.
@@ -995,6 +923,63 @@ gen4_emit_batch_header (ScrnInfoPtr pScrn)
     }
 }
 
+/**
+ * Sets up the common fields for a surface state buffer for the given picture
+ * in the surface state buffer at index, and returns the offset within the
+ * surface state buffer for this entry.
+ */
+static unsigned int
+i965_set_picture_surface_state(ScrnInfoPtr pScrn, unsigned int index,
+			       PicturePtr pPicture, PixmapPtr pPixmap,
+			       Bool is_dst)
+{
+    I830Ptr pI830 = I830PTR(pScrn);
+    struct brw_surface_state *ss;
+    unsigned int offset;
+
+
+    offset = offsetof(gen4_surface_state_t,
+		      surface_state[GEN4_SURFACE_STATE_PER_OP *
+				    pI830->exa965->num_ops + index]);
+
+    ss = (void *)((char *)pI830->exa965->surface_buf->virtual + offset);
+    ss->ss0.surface_type = BRW_SURFACE_2D;
+    if (is_dst) {
+	uint32_t dst_format;
+
+	i965_get_dest_format(pPicture, &dst_format);
+	ss->ss0.surface_format = dst_format;
+	ss->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
+    } else {
+	ss->ss0.surface_format = i965_get_card_format(pPicture);
+    }
+    ss->ss0.writedisable_alpha = 0;
+    ss->ss0.writedisable_red = 0;
+    ss->ss0.writedisable_green = 0;
+    ss->ss0.writedisable_blue = 0;
+    ss->ss0.color_blend = 1;
+    ss->ss0.vert_line_stride = 0;
+    ss->ss0.vert_line_stride_ofs = 0;
+    ss->ss0.mipmap_layout_mode = 0;
+    ss->ss0.render_cache_read_mode = 0;
+    ss->ss1.base_addr =
+	intelddx_batchbuffer_emit_pixmap(pPixmap,
+					 DRM_BO_FLAG_MEM_TT |
+					 (is_dst ? DRM_BO_FLAG_WRITE : 0) |
+					 DRM_BO_FLAG_READ,
+					 pI830->exa965->surface_buf,
+					 offset + 4, 0);
+    ss->ss2.mip_count = 0;
+    ss->ss2.render_target_rotation = 0;
+    ss->ss2.height = pPixmap->drawable.height - 1;
+    ss->ss2.width = pPixmap->drawable.width - 1;
+    ss->ss3.pitch = intel_get_pixmap_pitch(pPixmap) - 1;
+    ss->ss3.tile_walk = 0; /* Tiled X */
+    ss->ss3.tiled_surface = i830_pixmap_tiled(pPixmap);
+
+    return offset;
+}
+
 Bool
 i965_prepare_composite(int op, PicturePtr pSrcPicture,
 		       PicturePtr pMaskPicture, PicturePtr pDstPicture,
@@ -1002,9 +987,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 {
     ScrnInfoPtr pScrn = xf86Screens[pSrcPicture->pDrawable->pScreen->myNum];
     I830Ptr pI830 = I830PTR(pScrn);
-    CARD32 src_pitch, src_tile_format = 0, src_tiled = 0;
-    CARD32 mask_pitch = 0, mask_tile_format = 0, mask_tiled = 0;
-    CARD32 dst_format, dst_pitch, dst_tile_format = 0, dst_tiled = 0;
     Bool rotation_program = FALSE;
     int wm_state_offset;
     int sf_state_offset, cc_state_offset;
@@ -1012,9 +994,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     void *surface_map;
     sampler_state_filter_t src_filter, mask_filter;
     sampler_state_extend_t src_extend, mask_extend;
-    struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
     CARD32 *binding_table;
     CARD32 src_blend, dst_blend;
+    int binding_table_offset;
 
     /* We cannot handle a flush occuring anytime during the
      * prepare_composite/composite/done_composite handling. So make
@@ -1038,31 +1020,11 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
     i965_exastate_reset(pI830->exa965);
     surface_map = pI830->exa965->surface_map;
-    gen4_surface_state_init (surface_map, pI830->exa965);
 
     surface_start_base = surface_map;
 
     *pI830->last_3d = LAST_3D_RENDER;
 
-    src_pitch = intel_get_pixmap_pitch(pSrc);
-    if (i830_pixmap_tiled(pSrc)) {
-        src_tiled = 1;
-	src_tile_format = 0; /* Tiled X */
-    }
-
-    dst_pitch = intel_get_pixmap_pitch(pDst);
-    if (i830_pixmap_tiled(pDst)) {
-        dst_tiled = 1;
-	dst_tile_format = 0; /* Tiled X */
-    }
-
-    if (pMask) {
-	mask_pitch = intel_get_pixmap_pitch(pMask);
-	if (i830_pixmap_tiled(pMask)) {
-  	    mask_tiled = 1;
-	    mask_tile_format = 0;
-	}
-    }
     pI830->scale_units[0][0] = pSrc->drawable.width;
     pI830->scale_units[0][1] = pSrc->drawable.height;
 
@@ -1094,68 +1056,28 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     i965_get_blend_cntl(op, pMaskPicture, pDstPicture->format,
 			&src_blend, &dst_blend);
 
-    /* Set up the state buffer for the destination surface */
-    dest_surf_state = (void *)(surface_start_base + dest_surf_offset);
-    i965_get_dest_format(pDstPicture, &dst_format);
-    dest_surf_state->ss0.surface_format = dst_format;
-
-    dest_surf_state->ss1.base_addr =
-	intelddx_batchbuffer_emit_pixmap(pDst,
-					 DRM_BO_FLAG_MEM_TT |
-					 DRM_BO_FLAG_WRITE |
-					 DRM_BO_FLAG_READ,
-					 pI830->exa965->surface_buf,
-					 dest_surf_offset + 4, 0);
-
-    dest_surf_state->ss2.height = pDst->drawable.height - 1;
-    dest_surf_state->ss2.width = pDst->drawable.width - 1;
-    dest_surf_state->ss3.pitch = dst_pitch - 1;
-    dest_surf_state->ss3.tile_walk = dst_tile_format;
-    dest_surf_state->ss3.tiled_surface = dst_tiled;
-
-    /* Set up the source surface state buffer */
-    src_surf_state = (void *)(surface_start_base + src_surf_offset);
-    src_surf_state->ss0.surface_format = i965_get_card_format(pSrcPicture);
+    binding_table_offset = (offsetof (gen4_surface_state_t, binding_table) +
+			    sizeof (CARD32) * GEN4_BINDING_TABLE_PER_OP *
+			    pI830->exa965->num_ops);
+    binding_table = (void *)(surface_start_base +
+			     binding_table_offset);
 
-    src_surf_state->ss1.base_addr =
-	intelddx_batchbuffer_emit_pixmap(pSrc,
-					 DRM_BO_FLAG_MEM_TT |
-					 DRM_BO_FLAG_READ,
-					 pI830->exa965->surface_buf,
-					 src_surf_offset + 4, 0);
+    /* Set up and bind the state buffer for the destination surface */
+    binding_table[0] = i965_set_picture_surface_state(pScrn, 0,
+						      pDstPicture, pDst, TRUE);
 
-    src_surf_state->ss2.width = pSrc->drawable.width - 1;
-    src_surf_state->ss2.height = pSrc->drawable.height - 1;
-    src_surf_state->ss3.pitch = src_pitch - 1;
-    src_surf_state->ss3.tile_walk = src_tile_format;
-    src_surf_state->ss3.tiled_surface = src_tiled;
+    /* Set up and bind the source surface state buffer */
+    binding_table[1] = i965_set_picture_surface_state(pScrn, 1,
+						      pSrcPicture, pSrc, FALSE);
 
-    /* setup mask surface */
     if (pMask) {
-	mask_surf_state = (void *)(surface_start_base + mask_surf_offset);
-   	mask_surf_state->ss0.surface_format = i965_get_card_format(pMaskPicture);
-	mask_surf_state->ss1.base_addr =
-	    intelddx_batchbuffer_emit_pixmap(pMask,
-					     DRM_BO_FLAG_MEM_TT |
-					     DRM_BO_FLAG_READ,
-					     pI830->exa965->surface_buf,
-					     mask_surf_offset + 4, 0);
-   	mask_surf_state->ss2.width = pMask->drawable.width - 1;
-   	mask_surf_state->ss2.height = pMask->drawable.height - 1;
-   	mask_surf_state->ss3.pitch = mask_pitch - 1;
-	mask_surf_state->ss3.tile_walk = mask_tile_format;
-	mask_surf_state->ss3.tiled_surface = mask_tiled;
-    }
-
-    binding_table = (void *)(surface_start_base +
-			     binding_table_offset);
-    /* Set up a binding table for our surfaces.  Only the PS will use it */
-    binding_table[0] = dest_surf_offset;
-    binding_table[1] = src_surf_offset;
-    if (pMask)
-   	binding_table[2] = mask_surf_offset;
-    else
+	/* Set up and bind the mask surface state buffer */
+	binding_table[2] = i965_set_picture_surface_state(pScrn, 2,
+							  pMaskPicture, pMask,
+							  FALSE);
+    } else {
 	binding_table[2] = 0;
+    }
 
     src_filter = sampler_state_filter_from_picture (pSrcPicture->filter);
     if (src_filter < 0)
commit 34eb9c2847766d155446041535ec6df62901ed57
Author: Eric Anholt <eric at anholt.net>
Date:   Thu Jan 24 14:45:25 2008 -0800

    Explain why I830EXASync is unnecessary and clean up PrepareAccess path.
    
    The PrepareAccess path doesn't need to do a full finish, since dri_bo_map
    will wait on at least the last fence to involve rendering to the BO.  And in
    the case of software rendering (no driver_priv->bo), there's no rendering
    possible to the object anyway.

diff --git a/src/i830_exa.c b/src/i830_exa.c
index 6368fff..71601e4 100644
--- a/src/i830_exa.c
+++ b/src/i830_exa.c
@@ -155,16 +155,20 @@ i830_exa_pixmap_is_offscreen(PixmapPtr pPixmap)
  * @pScreen: current screen
  * @marker: marker command to wait for
  *
- * Wait for the command specified by @marker to finish, then return.  We don't
- * actually do marker waits, though we might in the future.  For now, just
- * wait for a full idle.
+ * Wait for the command specified by @marker to finish, then return.
+ *
+ * Since this is only called through EXA's PrepareAccess/FinishAccess path
+ * and dri_bo_map handles waiting when necessary, we no longer need to
+ * implement this one.
  */
 static void
 I830EXASync(ScreenPtr pScreen, int marker)
 {
     ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
 
-//    I830Sync(pScrn);
+#if 0
+    I830Sync(pScrn);
+#endif
 }
 
 /**
@@ -440,13 +444,11 @@ static Bool I830EXAPrepareAccess(PixmapPtr pPix, int index)
     if (!driver_priv)
 	return FALSE;
 
-    /* TODO : make this more conditional */
-    intelddx_batchbuffer_flush(pI830->batch);
-    dri_fence_wait(pI830->batch->last_fence);
-
     if (driver_priv->bo) {
 	mmDebug("mapping %p %d %dx%d\n", pPix, driver_priv->flags, pPix->drawable.width, pPix->drawable.height);
 
+	intelddx_batchbuffer_flush(pI830->batch);
+
 	if ((driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED))
 	    return TRUE;
 
commit 83b5bd04e413a049a0f26c8044fa5a968563af35
Author: Eric Anholt <eric at anholt.net>
Date:   Thu Jan 24 14:25:18 2008 -0800

    Revert "Align the batchbuffer emits to double-dword automatically."
    
    This reverts commit a47b9aee4265702058ab9c2c9451be67944a8d2a.
    
    Since we're using intel_batchbuffer.c now, this is done at batch submit anyway
    (which is much better than wasting padding per command).

diff --git a/src/intel_batchbuffer.h b/src/intel_batchbuffer.h
index de3a804..5acdb28 100644
--- a/src/intel_batchbuffer.h
+++ b/src/intel_batchbuffer.h
@@ -94,19 +94,12 @@ extern uint32_t intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap,
  */
 #define BATCH_LOCALS
 
-#define BEGIN_BATCH(n)  						\
-    RING_LOCALS 							\
-    if (pI830->use_ttm_batch) {						\
-	intelddx_batchbuffer_require_space(pI830->batch,		\
-					   (((n) + 1) & ~1) * 4, 0);	\
-    } else {								\
-	DO_LP_RING((((n) + 1) & ~1));					\
-    }									\
-    if ((n) & 1) {							\
-	OUT_BATCH(MI_NOOP);						\
-    }
-
-
+#define BEGIN_BATCH(n)  							\
+	RING_LOCALS 								\
+	if (pI830->use_ttm_batch)						\
+   		intelddx_batchbuffer_require_space(pI830->batch, (n)*4, 0);	\
+	 else { \
+   DO_LP_RING(n) ; }
 
 #define OUT_BATCH(d) \
 	 if (pI830->use_ttm_batch) \
commit 3ebb19fec2f1304fda9e5d3d20d4037f33a282cb
Author: Eric Anholt <eric at anholt.net>
Date:   Thu Jan 24 14:23:22 2008 -0800

    Remove dead i830_batchbuffer.h

diff --git a/src/i830_batchbuffer.h b/src/i830_batchbuffer.h
deleted file mode 100644
index cb151b2..0000000
--- a/src/i830_batchbuffer.h
+++ /dev/null
@@ -1,44 +0,0 @@
-
-/*
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * Copyright © 2006 Intel Corporation
- * Copyright 2007 Red Hat
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Dave Airlie <airlied at linux.ie> - derived from Mesa code.
- */
-
-#ifndef I830_BATCHBUFFER_H
-#define I830_BATCHBUFFER_H
-
-#define BEGIN_BATCH(n) BEGIN_LP_RING(n)
-
-#define OUT_BATCH(d) OUT_RING(d)
-
-#define OUT_BATCH_F(f) OUT_RING_F(f)
-
-#define OUT_PIXMAP_RELOC(pixmap, flags, mask, delta) OUT_RING(intel_get_pixmap_offset(pixmap) + delta)
-
-#define ADVANCE_BATCH() ADVANCE_LP_RING()
-
-#endif
-
commit 4d96fd60f0cad51638a989ad2f43e52dc2942fda
Merge: 989c0da... 5972aaf...
Author: Eric Anholt <eric at anholt.net>
Date:   Thu Jan 24 14:11:59 2008 -0800

    Merge commit '5972aaf4351db300172cc8c79713c2dcf13144d0' into intel-batchbuffer
    
    This brings in cworth's require_space fix and non-use_ttm deletion.

commit 989c0dad21d1c91fc2239284ec5e2033dc8aeb21
Author: Eric Anholt <eric at anholt.net>
Date:   Thu Jan 24 13:08:30 2008 -0800

    Update to i915-ttm-cfu DRM changes (copied from Mesa)

diff --git a/src/intel_bufmgr_ttm.c b/src/intel_bufmgr_ttm.c
index 252c128..32b407c 100644
--- a/src/intel_bufmgr_ttm.c
+++ b/src/intel_bufmgr_ttm.c
@@ -35,18 +35,20 @@
  */
 
 #include <xf86drm.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <assert.h>
-#include <stdio.h>
+
 #include "errno.h"
 #include "dri_bufmgr.h"
-#include "intel_bufmgr_ttm.h"
 #include "string.h"
 
 #include "i915_drm.h"
 
+#include "intel_bufmgr_ttm.h"
+
 #define DBG(...) do {					\
    if (bufmgr_ttm->bufmgr.debug)			\
       fprintf(stderr, __VA_ARGS__);			\
@@ -81,7 +83,6 @@ typedef struct _dri_bufmgr_ttm {
     int validate_array_size;
     int validate_count;
 
-    drmBO *cached_reloc_buf;
     uint32_t *cached_reloc_buf_data;
 } dri_bufmgr_ttm;
 
@@ -110,7 +111,6 @@ typedef struct _dri_bo_ttm {
     int validate_index;
 
     /** DRM buffer object containing relocation list */
-    drmBO *reloc_buf;
     uint32_t *reloc_buf_data;
     struct dri_ttm_reloc *relocs;
 
@@ -151,7 +151,7 @@ static void dri_ttm_dump_validation_list(dri_bufmgr_ttm *bufmgr_ttm)
 		    bufmgr_ttm->validate_array[reloc_entry[2]].bo;
 		dri_bo_ttm *target_ttm = (dri_bo_ttm *)target_bo;
 
-		DBG("%2d: %s at 0x%08x -> %s at 0x%08x + 0x%08x\n",
+		DBG("%2d: %s at 0x%08x -> %s at 0x%08lx + 0x%08x\n",
 		    i,
 		    bo_ttm->name, reloc_entry[0],
 		    target_ttm->name, target_bo->offset,
@@ -237,10 +237,10 @@ intel_add_validate_buffer(dri_bo *buf,
 	req->bo_req.mask = INTEL_BO_MASK;
 	req->bo_req.fence_class = 0; /* Backwards compat. */
 
-	if (ttm_buf->reloc_buf != NULL)
-	    arg->reloc_handle = ttm_buf->reloc_buf->handle;
+	if (ttm_buf->reloc_buf_data != NULL)
+ 	    arg->reloc_ptr = (unsigned long)(void *)ttm_buf->reloc_buf_data;
 	else
-	    arg->reloc_handle = 0;
+	    arg->reloc_ptr = 0;
 
 	/* Hook up the linked list of args for the kernel */
 	arg->next = 0;
@@ -286,52 +286,25 @@ intel_setup_reloc_list(dri_bo *bo)
 {
     dri_bo_ttm *bo_ttm = (dri_bo_ttm *)bo;
     dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)bo->bufmgr;
-    int ret;
 
     bo_ttm->relocs = malloc(sizeof(struct dri_ttm_reloc) *
 			    bufmgr_ttm->max_relocs);
 
-    if (bufmgr_ttm->cached_reloc_buf != NULL) {
-       bo_ttm->reloc_buf = bufmgr_ttm->cached_reloc_buf;
+    if (bufmgr_ttm->cached_reloc_buf_data != NULL) {
        bo_ttm->reloc_buf_data = bufmgr_ttm->cached_reloc_buf_data;
 
-       bufmgr_ttm->cached_reloc_buf = NULL;
        bufmgr_ttm->cached_reloc_buf_data = NULL;
     } else {
-       bo_ttm->reloc_buf = malloc(sizeof(bo_ttm->drm_bo));
-       ret = drmBOCreate(bufmgr_ttm->fd,
-			 RELOC_BUF_SIZE(bufmgr_ttm->max_relocs), 0,
-			 NULL,
-			 DRM_BO_FLAG_MEM_LOCAL |
-			 DRM_BO_FLAG_READ |
-			 DRM_BO_FLAG_WRITE |
-			 DRM_BO_FLAG_MAPPABLE |
-			 DRM_BO_FLAG_CACHED,
-			 0, bo_ttm->reloc_buf);
-       if (ret) {
-	  fprintf(stderr, "Failed to create relocation BO: %s\n",
-		  strerror(-ret));
-	  return ret;
-       }
-
-       ret = drmBOMap(bufmgr_ttm->fd, bo_ttm->reloc_buf,
-		      DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE,
-		      0, (void **)&bo_ttm->reloc_buf_data);
-       if (ret) {
-	  fprintf(stderr, "Failed to map relocation BO: %s\n",
-		  strerror(-ret));
-	  return ret;
-       }
+       bo_ttm->reloc_buf_data = calloc(1, RELOC_BUF_SIZE(bufmgr_ttm->max_relocs));
     }
 
     /* Initialize the relocation list with the header:
-     * DWORD 0: relocation type, relocation count
-     * DWORD 1: handle to next relocation list (currently none)
-     * DWORD 2: unused
-     * DWORD 3: unused
+     * DWORD 0: relocation count
+     * DWORD 1: relocation type  
+     * DWORD 2+3: handle to next relocation list (currently none) 64-bits
      */
-    bo_ttm->reloc_buf_data[0] = I915_RELOC_TYPE_0 << 16;
-    bo_ttm->reloc_buf_data[1] = 0;
+    bo_ttm->reloc_buf_data[0] = 0;
+    bo_ttm->reloc_buf_data[1] = I915_RELOC_TYPE_0;
     bo_ttm->reloc_buf_data[2] = 0;
     bo_ttm->reloc_buf_data[3] = 0;
 
@@ -390,7 +363,6 @@ dri_ttm_alloc(dri_bufmgr *bufmgr, const char *name,
     ttm_buf->bo.bufmgr = bufmgr;
     ttm_buf->name = name;
     ttm_buf->refcount = 1;
-    ttm_buf->reloc_buf = NULL;
     ttm_buf->reloc_buf_data = NULL;
     ttm_buf->relocs = NULL;
     ttm_buf->last_flags = ttm_buf->drm_bo.flags;
@@ -398,7 +370,7 @@ dri_ttm_alloc(dri_bufmgr *bufmgr, const char *name,
     ttm_buf->delayed_unmap = GL_FALSE;
     ttm_buf->validate_index = -1;
 
-    DBG("bo_create: %p (%s) %db\n", &ttm_buf->bo, ttm_buf->name, size);
+    DBG("bo_create: %p (%s) %ldb\n", &ttm_buf->bo, ttm_buf->name, size);
 
     return &ttm_buf->bo;
 }
@@ -446,7 +418,6 @@ intel_ttm_bo_create_from_handle(dri_bufmgr *bufmgr, const char *name,
     ttm_buf->bo.bufmgr = bufmgr;
     ttm_buf->name = name;
     ttm_buf->refcount = 1;
-    ttm_buf->reloc_buf = NULL;
     ttm_buf->reloc_buf_data = NULL;
     ttm_buf->relocs = NULL;
     ttm_buf->last_flags = ttm_buf->drm_bo.flags;
@@ -480,7 +451,7 @@ dri_ttm_bo_unreference(dri_bo *buf)
     if (--ttm_buf->refcount == 0) {
 	int ret;
 
-	if (ttm_buf->reloc_buf) {
+	if (ttm_buf->reloc_buf_data) {
 	    int i;
 
 	    /* Unreference all the target buffers */
@@ -488,18 +459,16 @@ dri_ttm_bo_unreference(dri_bo *buf)
 		 dri_bo_unreference(ttm_buf->relocs[i].target_buf);
 	    free(ttm_buf->relocs);
 
-	    if (bufmgr_ttm->cached_reloc_buf == NULL) {
+	    if (bufmgr_ttm->cached_reloc_buf_data == NULL) {
 	       /* Cache a single relocation buffer allocation to avoid
 		* repeated create/map/unmap/destroy for batchbuffer
 		* relocations.
 		*/
-	       bufmgr_ttm->cached_reloc_buf = ttm_buf->reloc_buf;
 	       bufmgr_ttm->cached_reloc_buf_data = ttm_buf->reloc_buf_data;
 	    } else {
 	       /* Free the kernel BO containing relocation entries */
-	       drmBOUnmap(bufmgr_ttm->fd, ttm_buf->reloc_buf);
-	       drmBOUnreference(bufmgr_ttm->fd, ttm_buf->reloc_buf);
-	       free(ttm_buf->reloc_buf);
+	       free(ttm_buf->reloc_buf_data);
+	       ttm_buf->reloc_buf_data = NULL;
 	    }
 	}
 
@@ -651,7 +620,7 @@ dri_ttm_fence_wait(dri_fence *fence)
 
     ret = drmFenceWait(bufmgr_ttm->fd, DRM_FENCE_FLAG_WAIT_LAZY, &fence_ttm->drm_fence, 0);
     if (ret != 0) {
-	fprintf(stderr, "%s:%d: Error %d waiting for fence %s.\n",
+        fprintf(stderr, "%s:%d: Error %d waiting for fence %s.\n",
 		__FILE__, __LINE__, ret, fence_ttm->name);
 	abort();
     }
@@ -664,11 +633,8 @@ dri_bufmgr_ttm_destroy(dri_bufmgr *bufmgr)
 {
     dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)bufmgr;
 
-    if (bufmgr_ttm->cached_reloc_buf) {
-       /* Free the cached kernel BO containing relocation entries */
-       drmBOUnmap(bufmgr_ttm->fd, bufmgr_ttm->cached_reloc_buf);
-       drmBOUnreference(bufmgr_ttm->fd, bufmgr_ttm->cached_reloc_buf);
-       free(bufmgr_ttm->cached_reloc_buf);
+    if (bufmgr_ttm->cached_reloc_buf_data) {
+       free(bufmgr_ttm->cached_reloc_buf_data);
     }
 
     free(bufmgr_ttm->validate_array);
@@ -691,25 +657,25 @@ dri_ttm_emit_reloc(dri_bo *reloc_buf, uint64_t flags, GLuint delta,
 {
     dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)reloc_buf->bufmgr;
     dri_bo_ttm *reloc_buf_ttm = (dri_bo_ttm *)reloc_buf;
+    dri_bo_ttm *target_buf_ttm = (dri_bo_ttm *)target_buf;
     int num_relocs;
     uint32_t *this_reloc;
 
     /* Create a new relocation list if needed */
-    if (reloc_buf_ttm->reloc_buf == NULL)
+    if (reloc_buf_ttm->reloc_buf_data == NULL)
 	intel_setup_reloc_list(reloc_buf);
 
-    num_relocs = (reloc_buf_ttm->reloc_buf_data[0] & 0xffff);
+    num_relocs = reloc_buf_ttm->reloc_buf_data[0];
 
     /* Check overflow */
-    assert((reloc_buf_ttm->reloc_buf_data[0] & 0xffff) <
-	   bufmgr_ttm->max_relocs);
+    assert(num_relocs < bufmgr_ttm->max_relocs);
 
     this_reloc = reloc_buf_ttm->reloc_buf_data + I915_RELOC_HEADER +
 	num_relocs * I915_RELOC0_STRIDE;
 
     this_reloc[0] = offset;
     this_reloc[1] = delta;
-    this_reloc[2] = -1; /* To be filled in at exec time */
+    this_reloc[2] = target_buf_ttm->drm_bo.handle; /* To be filled in at exec time */
     this_reloc[3] = 0;
 
     reloc_buf_ttm->relocs[num_relocs].validate_flags = flags;
@@ -718,7 +684,7 @@ dri_ttm_emit_reloc(dri_bo *reloc_buf, uint64_t flags, GLuint delta,
 
     reloc_buf_ttm->reloc_buf_data[0]++; /* Increment relocation count */
     /* Check wraparound */
-    assert((reloc_buf_ttm->reloc_buf_data[0] & 0xffff) != 0);
+    assert(reloc_buf_ttm->reloc_buf_data[0] != 0);
 }
 
 /**
@@ -740,19 +706,12 @@ dri_ttm_bo_process_reloc(dri_bo *bo)
 
     for (i = 0; i < nr_relocs; i++) {
 	struct dri_ttm_reloc *r = &bo_ttm->relocs[i];
-	dri_bo_ttm *target_ttm = (dri_bo_ttm *)r->target_buf;
-	uint32_t *reloc_entry;
 
 	/* Continue walking the tree depth-first. */
 	dri_ttm_bo_process_reloc(r->target_buf);
 
 	/* Add the target to the validate list */
 	intel_add_validate_buffer(r->target_buf, r->validate_flags);
-
-	/* Update the index of the target in the relocation entry */
-	reloc_entry = bo_ttm->reloc_buf_data + I915_RELOC_HEADER +
-	    i * I915_RELOC0_STRIDE;
-	reloc_entry[2] = target_ttm->validate_index;
     }
 }
 
@@ -826,8 +785,8 @@ intel_update_buffer_offsets (dri_bufmgr_ttm *bufmgr_ttm)
 	}
 	/* Update the buffer offset */
 	if (rep->bo_info.offset != bo->offset) {
-	    DBG("BO %s migrated: 0x%08x -> 0x%08x\n",
-		bo_ttm->name, bo->offset, rep->bo_info.offset);
+	    DBG("BO %s migrated: 0x%08lx -> 0x%08lx\n",
+		bo_ttm->name, bo->offset, (unsigned long)rep->bo_info.offset);
 	    bo->offset = rep->bo_info.offset;
 	}
     }
@@ -875,7 +834,6 @@ intel_bufmgr_ttm_init(int fd, unsigned int fence_type,
     bufmgr_ttm->fd = fd;
     bufmgr_ttm->fence_type = fence_type;
     bufmgr_ttm->fence_type_flush = fence_type_flush;
-    bufmgr_ttm->cached_reloc_buf = NULL;
     bufmgr_ttm->cached_reloc_buf_data = NULL;
 
     /* Let's go with one relocation per every 2 dwords (but round down a bit
commit 94a18fa1f8141837bdab32e545da7a7aed1cc396
Author: Julien Cristau <jcristau at debian.org>
Date:   Thu Jan 24 15:24:40 2008 +0100

    Don't build reg_dumper if we don't have pciaccess 0.10.0
    
    The pci_device_map_range() function was added in libpciaccess 0.10.0, and
    is used by the reg_dumper tool.  Don't try to build it if we have an older
    libpciaccess.
    Also make sure that util-macros >= 1.1.3 is available when running autoconf,
    because it's required for the PACKAGE_VERSION_* macros.

diff --git a/configure.ac b/configure.ac
index 334a1f4..e707a1a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -117,7 +117,7 @@ CFLAGS="$save_CFLAGS"
 if test x$XSERVER_LIBPCIACCESS = xyes; then
 	PKG_CHECK_MODULES([PCIACCESS], [pciaccess >= 0.10.0])
 else
-	PKG_CHECK_MODULES([PCIACCESS], [pciaccess >= 0.5.0],
+	PKG_CHECK_MODULES([PCIACCESS], [pciaccess >= 0.10.0],
 				       have_libpciaccess=yes,
 				       have_libpciaccess=no)
 fi
@@ -216,6 +216,8 @@ AC_SUBST([moduledir])
 DRIVER_NAME=intel
 AC_SUBST([DRIVER_NAME])
 
+m4_ifndef([XORG_MACROS_VERSION], [AC_FATAL([must install xorg-macros 1.1.3 or later before running autoconf/autogen])])
+XORG_MACROS_VERSION(1.1.3)
 XORG_MANPAGE_SECTIONS
 XORG_RELEASE_VERSION
 
commit 5972aaf4351db300172cc8c79713c2dcf13144d0
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Jan 23 22:14:00 2008 -0800

    Require 2k of batchbuffer space in prepare_composite
    
    This makes our assertion that a batchbuffer flush never occurs
    during composite more than just wishful thinking.

diff --git a/src/i965_render.c b/src/i965_render.c
index bd21a49..6f725a2 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -1016,6 +1016,26 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     CARD32 *binding_table;
     CARD32 src_blend, dst_blend;
 
+    /* We cannot handle a flush occuring anytime during the
+     * prepare_composite/composite/done_composite handling. So make
+     * sure there's plenty of room left in the batch buffer. That is,
+     * if we're going to flush, let's do it *now*.
+     *
+     * The amount of space we need depends on how many composite calls
+     * we might get between prepare_composite and done_composite. That
+     * in turn depends on how many clip rects there might be in an
+     * expose region. Imagining a shaped window, there could be a
+     * lot. With 2kbytes reserved, this should be enough to handle a
+     * region with over 2000 clip rectangles in it, (given 51 dwords
+     * needed in prepare composite and 11 for each vertex buffer of 4k
+     * which means each vertex buffer holds enough for about 50
+     * composite calls).
+     *
+     * And with a 16k batchbuffer, this means we'll be wasting at most
+     * 1/8 of the total batchbuffer.
+     */
+    intelddx_batchbuffer_require_space (pI830->batch, 2048, 0);
+
     i965_exastate_reset(pI830->exa965);
     surface_map = pI830->exa965->surface_map;
     gen4_surface_state_init (surface_map, pI830->exa965);
commit 8c0cba5144a3345ac20bd20f5607ea3a2cec7c41
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Jan 23 21:55:43 2008 -0800

    Remove the non-use_ttm_batch code from i965_render.c
    
    This has most likely been broken for a long time anyway, and
    Eric plans to add this compatibility below instead, in
    dri_bufmgr_exa.c.

diff --git a/src/i965_render.c b/src/i965_render.c
index 238e29e..bd21a49 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -945,18 +945,13 @@ gen4_emit_batch_header (ScrnInfoPtr pScrn)
 	 */
 	OUT_BATCH(BRW_STATE_BASE_ADDRESS | 4);
 
-	if (pI830->use_ttm_batch) {
-	    OUT_RELOC(pI830->exa965->buf,
-		      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-		      BASE_ADDRESS_MODIFY);
-
-	    OUT_RELOC(pI830->exa965->surface_buf,
-		      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-		      BASE_ADDRESS_MODIFY);
-	} else {
-	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
-	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
-	}
+	OUT_RELOC(pI830->exa965->buf,
+		  DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
+		  BASE_ADDRESS_MODIFY);
+
+	OUT_RELOC(pI830->exa965->surface_buf,
+		  DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
+		  BASE_ADDRESS_MODIFY);
 
 	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
 	/* general state max addr, disabled */
@@ -1021,13 +1016,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     CARD32 *binding_table;
     CARD32 src_blend, dst_blend;
 
-    if (pI830->use_ttm_batch) {
-	i965_exastate_reset(pI830->exa965);
-	surface_map = pI830->exa965->surface_map;
-	gen4_surface_state_init (surface_map, pI830->exa965);
-    }else{
-	surface_map = pI830->exa_965_state->offset + pI830->FbBase;
-    }
+    i965_exastate_reset(pI830->exa965);
+    surface_map = pI830->exa965->surface_map;
+    gen4_surface_state_init (surface_map, pI830->exa965);
 
     surface_start_base = surface_map;
 
@@ -1088,17 +1079,13 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     i965_get_dest_format(pDstPicture, &dst_format);
     dest_surf_state->ss0.surface_format = dst_format;
 
-    if (pI830->use_ttm_batch) {
-    	dest_surf_state->ss1.base_addr =
-	    intelddx_batchbuffer_emit_pixmap(pDst,
-					     DRM_BO_FLAG_MEM_TT |
-					     DRM_BO_FLAG_WRITE |
-					     DRM_BO_FLAG_READ,
-					     pI830->exa965->surface_buf,
-					     dest_surf_offset + 4, 0);
-    } else {
-        dest_surf_state->ss1.base_addr = intel_get_pixmap_offset(pDst);
-    }
+    dest_surf_state->ss1.base_addr =
+	intelddx_batchbuffer_emit_pixmap(pDst,
+					 DRM_BO_FLAG_MEM_TT |
+					 DRM_BO_FLAG_WRITE |
+					 DRM_BO_FLAG_READ,
+					 pI830->exa965->surface_buf,
+					 dest_surf_offset + 4, 0);
 
     dest_surf_state->ss2.height = pDst->drawable.height - 1;
     dest_surf_state->ss2.width = pDst->drawable.width - 1;
@@ -1110,16 +1097,13 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     src_surf_state = (void *)(surface_start_base + src_surf_offset);
     src_surf_state->ss0.surface_format = i965_get_card_format(pSrcPicture);
 
-    if (pI830->use_ttm_batch) {
-        src_surf_state->ss1.base_addr =
-	    intelddx_batchbuffer_emit_pixmap(pSrc,
-					     DRM_BO_FLAG_MEM_TT |
-					     DRM_BO_FLAG_READ,
-					     pI830->exa965->surface_buf,
-					     src_surf_offset + 4, 0);
-    } else {
-        src_surf_state->ss1.base_addr = intel_get_pixmap_offset(pSrc);
-    }
+    src_surf_state->ss1.base_addr =
+	intelddx_batchbuffer_emit_pixmap(pSrc,
+					 DRM_BO_FLAG_MEM_TT |
+					 DRM_BO_FLAG_READ,
+					 pI830->exa965->surface_buf,
+					 src_surf_offset + 4, 0);
+
     src_surf_state->ss2.width = pSrc->drawable.width - 1;
     src_surf_state->ss2.height = pSrc->drawable.height - 1;
     src_surf_state->ss3.pitch = src_pitch - 1;
@@ -1130,16 +1114,12 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     if (pMask) {
 	mask_surf_state = (void *)(surface_start_base + mask_surf_offset);
    	mask_surf_state->ss0.surface_format = i965_get_card_format(pMaskPicture);
-        if (pI830->use_ttm_batch) {
-	   mask_surf_state->ss1.base_addr =
-	       intelddx_batchbuffer_emit_pixmap(pMask,
-						DRM_BO_FLAG_MEM_TT |
-						DRM_BO_FLAG_READ,
-						pI830->exa965->surface_buf,
-						mask_surf_offset + 4, 0);
-        } else {
-	    mask_surf_state->ss1.base_addr = intel_get_pixmap_offset(pMask);
-	}
+	mask_surf_state->ss1.base_addr =
+	    intelddx_batchbuffer_emit_pixmap(pMask,
+					     DRM_BO_FLAG_MEM_TT |
+					     DRM_BO_FLAG_READ,
+					     pI830->exa965->surface_buf,
+					     mask_surf_offset + 4, 0);
    	mask_surf_state->ss2.width = pMask->drawable.width - 1;
    	mask_surf_state->ss2.height = pMask->drawable.height - 1;
    	mask_surf_state->ss3.pitch = mask_pitch - 1;
@@ -1507,7 +1487,7 @@ void i965_done_composite(PixmapPtr pDst)
     }
 
     pI830->exa965->num_ops++;
-    if (pI830->use_ttm_batch && pI830->exa965->num_ops >= GEN4_MAX_OPS) {
+    if (pI830->exa965->num_ops >= GEN4_MAX_OPS) {
 	intelddx_batchbuffer_flush(pI830->batch);
     }
 }
@@ -1529,12 +1509,7 @@ i965_init_exa_state(ScrnInfoPtr pScrn)
 {
     I830Ptr pI830 = I830PTR(pScrn);
 
-    if (pI830->use_ttm_batch) {
-	pI830->exa965 = i965_exastate_alloc(pScrn);
-    } else {
-	void *map = pI830->FbBase + pI830->exa_965_state->offset;
-	gen4_state_init ((void *) map);
-    }
+    pI830->exa965 = i965_exastate_alloc(pScrn);
 
     return 0;
 }
commit d341e41c863f1212bf2c6b84782a7e472b6612a1
Author: Zhenyu Wang <zhenyu.z.wang at intel.com>
Date:   Thu Jan 24 10:40:18 2008 +0800

    Clear shadow memory after allocation

diff --git a/src/i830_display.c b/src/i830_display.c
index d16871d..f61d3c4 100644
--- a/src/i830_display.c
+++ b/src/i830_display.c
@@ -1342,6 +1342,7 @@ i830_crtc_shadow_allocate (xf86CrtcPtr crtc, int width, int height)
 		   "Couldn't allocate shadow memory for rotated CRTC\n");
 	return NULL;
     }
+    memset(pI830->FbBase + intel_crtc->rotate_mem->offset, 0, size);
 
     return pI830->FbBase + intel_crtc->rotate_mem->offset;
 }
commit 88a1041c5361964d37107c22d77feaa1b2160656
Author: Zhenyu Wang <zhenyu.z.wang at intel.com>
Date:   Thu Jan 24 10:36:06 2008 +0800

    Fix i830 block handler wrap
    
    The broken wrap was observed as a stack-overflow crash during rotation.

diff --git a/src/i830_driver.c b/src/i830_driver.c
index 32cecff..7077456 100644
--- a/src/i830_driver.c
+++ b/src/i830_driver.c
@@ -2353,6 +2353,7 @@ I830BlockHandler(int i,
 
     (*pScreen->BlockHandler) (i, blockData, pTimeout, pReadmask);
 
+    pI830->BlockHandler = pScreen->BlockHandler;
     pScreen->BlockHandler = I830BlockHandler;
 
     I830VideoBlockHandler(i, blockData, pTimeout, pReadmask);
commit db0a7c569e383436a2725e1e74f35fb426da1196
Author: Andreas Stawinoga <a.stawinoga at gmx.de>
Date:   Thu Jan 24 08:51:09 2008 +0800

    Samsung Q45 has no TV output

diff --git a/src/i830_quirks.c b/src/i830_quirks.c
index 8fbdbfe..cb43603 100644
--- a/src/i830_quirks.c
+++ b/src/i830_quirks.c
@@ -91,6 +91,8 @@ static i830_quirk i830_quirk_list[] = {
 
     /* Samsung Q35 has no TV output */
     { PCI_CHIP_I945_GM, 0x144d, 0xc504, quirk_ignore_tv },
+    /* Samsung Q45 has no TV output */
+    { PCI_CHIP_I965_GM, 0x144d, 0xc510, quirk_ignore_tv },
 
     /* Dell Inspiron 510m needs pipe A force quirk */
     { PCI_CHIP_I855_GM, 0x1028, 0x0164, quirk_pipea_force },
commit 32c120d1b47671a94caf7f001ae50bbbcce70913
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 23 16:15:11 2008 -0800

    Emit vertex buffers pointing at the first vertex rather than a start vertex.
    
    Otherwise, we would miscalculate the start vertex if the vertex size changed
    from one composite call to another.

diff --git a/src/i965_render.c b/src/i965_render.c
index 304b92a..238e29e 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -1341,7 +1341,8 @@ i965_composite_flush_prims(ScrnInfoPtr pScrn)
 	      VB0_VERTEXDATA |
 	      pI830->exa965->vertex_size << VB0_BUFFER_PITCH_SHIFT);
     OUT_RELOC(pI830->exa965->vbo,
-	      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 0);
+	      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
+	      pI830->exa965->vbo_prim_start);
     OUT_BATCH(0xffff); /* set max index */
     OUT_BATCH(0); /* ignore for VERTEXDATA, but still there */
 
@@ -1352,8 +1353,7 @@ i965_composite_flush_prims(ScrnInfoPtr pScrn)
 	      4);
     OUT_BATCH((pI830->exa965->vbo_used - pI830->exa965->vbo_prim_start) /
 	      pI830->exa965->vertex_size); /* vertex count */
-    OUT_BATCH(pI830->exa965->vbo_prim_start /
-	      pI830->exa965->vertex_size); /* start vertex offset */
+    OUT_BATCH(0); /* start vertex offset */
     OUT_BATCH(1); /* single instance - mbz in docs */
     OUT_BATCH(0); /* start instance location */
     OUT_BATCH(0); /* index buffer offset, ignored */
commit 0abc1b5809c791f543ddfc1cd39af0ed8896d7da
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 23 16:03:03 2008 -0800

    Count dwords right in i965_composite_flush.

diff --git a/src/i965_render.c b/src/i965_render.c
index e2a6350..304b92a 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -1332,7 +1332,7 @@ i965_composite_flush_prims(ScrnInfoPtr pScrn)
     if (pI830->exa965->vbo_used == pI830->exa965->vbo_prim_start)
 	return;
 
-    BEGIN_BATCH(9);
+    BEGIN_BATCH(11);
     /* Set up the pointer to our vertex buffer.  We could emit this a lot
      * less often (as long as vertex_size and vbo haven't changed).
      */
commit 1f35a7265d0ce75cadb75cdea0ec8d6833cd48ad
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 23 16:00:42 2008 -0800

    s/element_size/vertex_size/ -- I've been thinking about GL too much recently.

diff --git a/src/i965_render.c b/src/i965_render.c
index 7f2a404..e2a6350 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -1269,7 +1269,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     {
         int nelem = pMask ? 3: 2;
 
-	pI830->exa965->element_size = nelem * 4 * 2;
+	pI830->exa965->vertex_size = nelem * 4 * 2;
 	pI830->exa965->vbo_prim_start = pI830->exa965->vbo_used;
 
 	/* Set up our vertex elements, sourced from the single vertex buffer.
@@ -1334,12 +1334,12 @@ i965_composite_flush_prims(ScrnInfoPtr pScrn)
 
     BEGIN_BATCH(9);
     /* Set up the pointer to our vertex buffer.  We could emit this a lot
-     * less often (as long as element_size and vbo haven't changed).
+     * less often (as long as vertex_size and vbo haven't changed).
      */
     OUT_BATCH(BRW_3DSTATE_VERTEX_BUFFERS | 3);
     OUT_BATCH((0 << VB0_BUFFER_INDEX_SHIFT) |
 	      VB0_VERTEXDATA |
-	      pI830->exa965->element_size << VB0_BUFFER_PITCH_SHIFT);
+	      pI830->exa965->vertex_size << VB0_BUFFER_PITCH_SHIFT);
     OUT_RELOC(pI830->exa965->vbo,
 	      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 0);
     OUT_BATCH(0xffff); /* set max index */
@@ -1351,13 +1351,15 @@ i965_composite_flush_prims(ScrnInfoPtr pScrn)
 	      (0 << 9) |  /* CTG - indirect vertex count */
 	      4);
     OUT_BATCH((pI830->exa965->vbo_used - pI830->exa965->vbo_prim_start) /
-	      pI830->exa965->element_size); /* vertex count */
+	      pI830->exa965->vertex_size); /* vertex count */
     OUT_BATCH(pI830->exa965->vbo_prim_start /
-	      pI830->exa965->element_size); /* start vertex offset */
+	      pI830->exa965->vertex_size); /* start vertex offset */
     OUT_BATCH(1); /* single instance - mbz in docs */
     OUT_BATCH(0); /* start instance location */
     OUT_BATCH(0); /* index buffer offset, ignored */
     ADVANCE_BATCH();
+
+    pI830->exa965->vbo_prim_start = pI830->exa965->vbo_used;
 }
 
 /**
@@ -1436,7 +1438,7 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 					 &mask_x[2], &mask_y[2]);
     }
 
-    vb = i965_composite_get_vbo_space(pScrn, 3 * (has_mask ? 6 : 4) * 4);
+    vb = i965_composite_get_vbo_space(pScrn, 3 * pI830->exa965->vertex_size);
 
     /* rect (x2,y2) */
     vb[i++] = (float)(dstX + w);
diff --git a/src/i965_render.h b/src/i965_render.h
index 038c6c7..fb6d2c8 100644
--- a/src/i965_render.h
+++ b/src/i965_render.h
@@ -7,7 +7,7 @@ struct i965_exastate_buffer {
     dri_bo *vbo;
     unsigned int vbo_prim_start;
     unsigned int vbo_used;
-    unsigned int element_size;
+    unsigned int vertex_size;
 
     Bool no_flush;
 
commit 92f0acd37483b8d5107226dc231e82a55cae7ac2
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 23 15:56:59 2008 -0800

    Remove remains of old vertex limit, and increment ops per op not per cliprect.

diff --git a/src/i965_render.c b/src/i965_render.c
index 7beef39..7f2a404 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -439,8 +439,6 @@ typedef struct _gen4_state {
 /* We only need 3, but we use 8 to get the proper alignment. */
 #define GEN4_BINDING_TABLE_PER_OP	8
 #define GEN4_MAX_BINDING_TABLE		(GEN4_MAX_OPS * GEN4_BINDING_TABLE_PER_OP)
-#define GEN4_VERTICES_PER_OP		24
-#define GEN4_MAX_VERTICES		(GEN4_MAX_OPS * GEN4_VERTICES_PER_OP)
 
 typedef struct _brw_surface_state_padded {
     struct brw_surface_state state;
@@ -1277,7 +1275,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	/* Set up our vertex elements, sourced from the single vertex buffer.
 	 * The vertex buffer will be set up later at primitive emit time.
 	 */
-	BEGIN_BATCH(pMask ? 9 : 7);
+	BEGIN_BATCH(pMask ? 7 : 5);
    	OUT_BATCH(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * nelem) - 1));
 	/* vertex coordinates */
    	OUT_BATCH((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
@@ -1344,8 +1342,8 @@ i965_composite_flush_prims(ScrnInfoPtr pScrn)
 	      pI830->exa965->element_size << VB0_BUFFER_PITCH_SHIFT);
     OUT_RELOC(pI830->exa965->vbo,
 	      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 0);
-    OUT_BATCH(GEN4_MAX_VERTICES); // set max index
-    OUT_BATCH(0); // ignore for VERTEXDATA, but still there
+    OUT_BATCH(0xffff); /* set max index */
+    OUT_BATCH(0); /* ignore for VERTEXDATA, but still there */
 
     OUT_BATCH(BRW_3DPRIMITIVE |
 	      BRW_3DPRIMITIVE_VERTEX_SEQUENTIAL |
@@ -1470,8 +1468,6 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
         vb[i++] = mask_y[0] / pI830->scale_units[1][1];
     }
 
-    pI830->exa965->num_ops++;
-
 #ifdef I830DEBUG
     i965_composite_flush_prims(pScrn);
 
@@ -1508,6 +1504,7 @@ void i965_done_composite(PixmapPtr pDst)
 	ADVANCE_BATCH();
     }
 
+    pI830->exa965->num_ops++;
     if (pI830->use_ttm_batch && pI830->exa965->num_ops >= GEN4_MAX_OPS) {
 	intelddx_batchbuffer_flush(pI830->batch);
     }
commit a47b9aee4265702058ab9c2c9451be67944a8d2a
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 23 15:51:12 2008 -0800

    Align the batchbuffer emits to double-dword automatically.

diff --git a/src/intel_batchbuffer.h b/src/intel_batchbuffer.h
index 5acdb28..de3a804 100644
--- a/src/intel_batchbuffer.h
+++ b/src/intel_batchbuffer.h
@@ -94,12 +94,19 @@ extern uint32_t intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap,
  */
 #define BATCH_LOCALS
 
-#define BEGIN_BATCH(n)  							\
-	RING_LOCALS 								\
-	if (pI830->use_ttm_batch)						\
-   		intelddx_batchbuffer_require_space(pI830->batch, (n)*4, 0);	\
-	 else { \
-   DO_LP_RING(n) ; }
+#define BEGIN_BATCH(n)  						\
+    RING_LOCALS 							\
+    if (pI830->use_ttm_batch) {						\
+	intelddx_batchbuffer_require_space(pI830->batch,		\
+					   (((n) + 1) & ~1) * 4, 0);	\
+    } else {								\
+	DO_LP_RING((((n) + 1) & ~1));					\
+    }									\
+    if ((n) & 1) {							\
+	OUT_BATCH(MI_NOOP);						\
+    }
+
+
 
 #define OUT_BATCH(d) \
 	 if (pI830->use_ttm_batch) \
commit e59a6d88516ae6b75a6f6c5efc657fe5910f4d72
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 23 15:39:17 2008 -0800

    FatalError if we wrap the batch during 965 composite.

diff --git a/src/i965_render.c b/src/i965_render.c
index 27882f7..7beef39 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -816,6 +816,9 @@ void i965_exastate_flush(struct i965_exastate_buffer *state)
 {
     I830Ptr pI830 = I830PTR(state->pScrn);
 
+    if (pI830->exa965->no_flush)
+	FatalError("Flushed batchbuffer during 965 Composite\n");
+
     if (state->surface_buf) {
 	dri_bo_unmap(state->surface_buf);
 	dri_bo_unreference(state->surface_buf);
@@ -1219,6 +1222,11 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     cc_state_offset = offsetof (gen4_state_t,
 				cc_state[src_blend][dst_blend]);
 
+    /* Before emitting any batch commands to set up our batchbuffer, flag that
+     * we may not flush the batchbuffer until donecomposite.
+     */
+    pI830->exa965->no_flush = TRUE;
+
     /* Any commands that don't change from one composite operation to
      * the next we simply emit once at the beginning of the entire
      * batch. */
@@ -1484,6 +1492,8 @@ void i965_done_composite(PixmapPtr pDst)
 
     i965_composite_flush_prims(pScrn);
 
+    pI830->exa965->no_flush = FALSE;
+
     {
 	BEGIN_BATCH(4);
    	OUT_BATCH(BRW_PIPE_CONTROL |
diff --git a/src/i965_render.h b/src/i965_render.h
index f0c1313..038c6c7 100644
--- a/src/i965_render.h
+++ b/src/i965_render.h
@@ -9,6 +9,8 @@ struct i965_exastate_buffer {
     unsigned int vbo_used;
     unsigned int element_size;
 
+    Bool no_flush;
+
     dri_bo *surface_buf;
     unsigned char *surface_map;
     int num_ops;
commit 1563dbe874aa6ee8ca1e0cefef0425402d547d2c
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 23 15:26:58 2008 -0800

    Move i965_render.c to using separate vertex buffers from surface state.
    
    This allows us to handle (almost) arbitrary numbers of Composite calls per
    Prepare/Do/Done set, and batch those primitives into a single 3DPRIMITIVE.

diff --git a/src/i965_render.c b/src/i965_render.c
index 09ad6d9..27882f7 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -36,6 +36,7 @@
 #include "xf86.h"
 #include "i830.h"
 #include "i915_reg.h"
+#include "i965_render.h"
 
 /* bring in brw structs */
 #include "brw_defines.h"
@@ -270,12 +271,8 @@ i965_check_composite(int op, PicturePtr pSrcPicture, PicturePtr pMaskPicture,
 
 /* these offsets will remain the same for all buffers post allocation */
 static int dest_surf_offset, src_surf_offset, mask_surf_offset;
-static int vb_offset;
 static int binding_table_offset;
 
-static float *vb;
-static int vb_index;
-
 static const CARD32 sip_kernel_static[][4] = {
 /*    wait (1) a0<1>UW a145<0,1,0>UW { align1 +  } */
     { 0x00000030, 0x20000108, 0x00001220, 0x00000000 },
@@ -454,8 +451,6 @@ typedef struct _gen4_surface_state {
     brw_surface_state_padded surface_state[GEN4_MAX_SURFACE_STATES];
 
     CARD32 binding_table[GEN4_MAX_BINDING_TABLE];
-
-    float vb[GEN4_MAX_VERTICES];
 } gen4_surface_state_t;
 
 static CARD32 
@@ -753,9 +748,6 @@ gen4_surface_state_init (unsigned char *start_base,
     unsigned int surf_state_offset = offsetof (gen4_surface_state_t,
 					       surface_state);
 
-    vb_offset = (offsetof (gen4_surface_state_t, vb) +
-		 sizeof (float) * GEN4_VERTICES_PER_OP * state->num_ops);
-
     binding_table_offset = (offsetof (gen4_surface_state_t, binding_table) +
 			    sizeof (CARD32) * GEN4_BINDING_TABLE_PER_OP *
 			    state->num_ops);
@@ -816,16 +808,31 @@ gen4_surface_state_init (unsigned char *start_base,
     mask_surf_state->ss2.render_target_rotation = 0;
 }
 
+/**
+ * Called from intel_batchbuffer_flush when we're about to flush a batch
+ * buffer and start a new one.
+ */
 void i965_exastate_flush(struct i965_exastate_buffer *state)
 {
+    I830Ptr pI830 = I830PTR(state->pScrn);
+
     if (state->surface_buf) {
 	dri_bo_unmap(state->surface_buf);
 	dri_bo_unreference(state->surface_buf);
 	state->surface_buf = NULL;
 	state->surface_map = NULL;
+
+	if (pI830->exa965->vbo != NULL) {
+	    dri_bo_unmap(pI830->exa965->vbo);
+	    dri_bo_unreference(pI830->exa965->vbo);
+	    pI830->exa965->vbo = NULL;
+	}
     }
 }
 
+/**
+ * Called at the start of prepare_composite to allocate our state buffers.
+ */
 static void
 i965_exastate_reset(struct i965_exastate_buffer *state)
 {
@@ -1072,12 +1079,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     else
 	sf_state_offset = offsetof (gen4_state_t, sf_state);
 
-    /* Because we only have a single static buffer for our state currently,
-     * we have to sync before updating it every time.
-     */
-    vb = (void *)(surface_start_base + vb_offset);
-    vb_index = 0;
-
     i965_get_blend_cntl(op, pMaskPicture, pDstPicture->format,
 			&src_blend, &dst_blend);
 
@@ -1262,27 +1263,13 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     {
         int nelem = pMask ? 3: 2;
 
-   	BEGIN_BATCH(pMask?12:10);
-	/* Set up the pointer to our vertex buffer */
-   	OUT_BATCH(BRW_3DSTATE_VERTEX_BUFFERS | 3);
-   	OUT_BATCH((0 << VB0_BUFFER_INDEX_SHIFT) |
-	    	 VB0_VERTEXDATA |
-	    	 ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT));
-
-	if (pI830->use_ttm_batch) {
-	    OUT_RELOC(pI830->exa965->surface_buf,
-		      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-		      vb_offset);
-
-	} else {
-	    OUT_BATCH(pI830->exa_965_state->offset + vb_offset);
-	}
-
-        OUT_BATCH(GEN4_MAX_VERTICES); // set max index
-   	OUT_BATCH(0); // ignore for VERTEXDATA, but still there
+	pI830->exa965->element_size = nelem * 4 * 2;
+	pI830->exa965->vbo_prim_start = pI830->exa965->vbo_used;
 
 	/* Set up our vertex elements, sourced from the single vertex buffer.
+	 * The vertex buffer will be set up later at primitive emit time.
 	 */
+	BEGIN_BATCH(pMask ? 9 : 7);
    	OUT_BATCH(BRW_3DSTATE_VERTEX_ELEMENTS | ((2 * nelem) - 1));
 	/* vertex coordinates */
    	OUT_BATCH((0 << VE0_VERTEX_BUFFER_INDEX_SHIFT) |
@@ -1326,7 +1313,87 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 #endif
     return TRUE;
 }
-		       
+
+/**
+ * Flushes the accumulated primitives in the VBO, according to the
+ * setup that had been done in PrepareComposite.
+ */
+static void
+i965_composite_flush_prims(ScrnInfoPtr pScrn)
+{
+    I830Ptr pI830 = I830PTR(pScrn);
+
+    if (pI830->exa965->vbo_used == pI830->exa965->vbo_prim_start)
+	return;
+
+    BEGIN_BATCH(9);
+    /* Set up the pointer to our vertex buffer.  We could emit this a lot
+     * less often (as long as element_size and vbo haven't changed).
+     */
+    OUT_BATCH(BRW_3DSTATE_VERTEX_BUFFERS | 3);
+    OUT_BATCH((0 << VB0_BUFFER_INDEX_SHIFT) |
+	      VB0_VERTEXDATA |
+	      pI830->exa965->element_size << VB0_BUFFER_PITCH_SHIFT);
+    OUT_RELOC(pI830->exa965->vbo,
+	      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 0);
+    OUT_BATCH(GEN4_MAX_VERTICES); // set max index
+    OUT_BATCH(0); // ignore for VERTEXDATA, but still there
+
+    OUT_BATCH(BRW_3DPRIMITIVE |
+	      BRW_3DPRIMITIVE_VERTEX_SEQUENTIAL |
+	      (_3DPRIM_RECTLIST << BRW_3DPRIMITIVE_TOPOLOGY_SHIFT) |
+	      (0 << 9) |  /* CTG - indirect vertex count */
+	      4);
+    OUT_BATCH((pI830->exa965->vbo_used - pI830->exa965->vbo_prim_start) /
+	      pI830->exa965->element_size); /* vertex count */
+    OUT_BATCH(pI830->exa965->vbo_prim_start /
+	      pI830->exa965->element_size); /* start vertex offset */
+    OUT_BATCH(1); /* single instance - mbz in docs */
+    OUT_BATCH(0); /* start instance location */
+    OUT_BATCH(0); /* index buffer offset, ignored */
+    ADVANCE_BATCH();
+}
+
+/**
+ * Allocates space in a VBO for size bytes of vertex data, flushing the
+ * current primitive and allocating a new VBO as necessary.
+ *
+ * Returns a pointer to the space the vertex data should be written to.
+ */
+static void *
+i965_composite_get_vbo_space(ScrnInfoPtr pScrn, int size)
+{
+    I830Ptr pI830 = I830PTR(pScrn);
+
+    /* Check if we would overflow the VBO, and flush if so. */
+    if (pI830->exa965->vbo != NULL) {
+	if (pI830->exa965->vbo_used + size > pI830->exa965->vbo->size) {
+	    i965_composite_flush_prims(pScrn);
+
+	    dri_bo_unmap(pI830->exa965->vbo);
+	    dri_bo_unreference(pI830->exa965->vbo);
+	    pI830->exa965->vbo = NULL;
+	}
+    }
+
+    /* Allocate a new VBO if we don't have one */
+    if (pI830->exa965->vbo == NULL) {
+	pI830->exa965->vbo = dri_bo_alloc(pI830->bufmgr, "exa vertex buffer",
+					  4096, 4096,
+					  DRM_BO_FLAG_MEM_LOCAL |
+					  DRM_BO_FLAG_CACHED |
+					  DRM_BO_FLAG_CACHED_MAPPED);
+	dri_bo_map(pI830->exa965->vbo, TRUE);
+	pI830->exa965->vbo_used = 0;
+	pI830->exa965->vbo_prim_start = 0;
+    }
+
+    pI830->exa965->vbo_used += size;
+
+    return (char *)pI830->exa965->vbo->virtual + pI830->exa965->vbo_used - size;
+}
+
+
 void
 i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 	       int dstX, int dstY, int w, int h)
@@ -1335,7 +1402,8 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     I830Ptr pI830 = I830PTR(pScrn);
     Bool has_mask;
     float src_x[3], src_y[3], mask_x[3], mask_y[3];
-    int i;
+    float *vb;
+    int i = 0;
 
     i830_get_transformed_coordinates(srcX, srcY,
 				     pI830->transform[0],
@@ -1362,15 +1430,8 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 					 &mask_x[2], &mask_y[2]);
     }
 
-    /* Wait for any existing composite rectangles to land before we overwrite
-     * the VB with the next one.
-     */
-    if ((vb_index + 18) > GEN4_MAX_VERTICES) {
-      ErrorF("vb index exceeded maximum bailing...");
-      return;
-    }
+    vb = i965_composite_get_vbo_space(pScrn, 3 * (has_mask ? 6 : 4) * 4);
 
-    i = vb_index;
     /* rect (x2,y2) */
     vb[i++] = (float)(dstX + w);
     vb[i++] = (float)(dstY + h);
@@ -1401,26 +1462,11 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
         vb[i++] = mask_y[0] / pI830->scale_units[1][1];
     }
 
-    {
-      BEGIN_BATCH(6);
-      OUT_BATCH(BRW_3DPRIMITIVE |
-	       BRW_3DPRIMITIVE_VERTEX_SEQUENTIAL |
-	       (_3DPRIM_RECTLIST << BRW_3DPRIMITIVE_TOPOLOGY_SHIFT) |
-	       (0 << 9) |  /* CTG - indirect vertex count */
-	       4);
-      OUT_BATCH(3);  /* vertex count per instance */
-      OUT_BATCH(vb_index); /* start vertex offset */
-      OUT_BATCH(1); /* single instance - mbz in docs */
-      OUT_BATCH(0); /* start instance location */
-      OUT_BATCH(0); /* index buffer offset, ignored */
-      ADVANCE_BATCH();
-    }
-
-    vb_index = i;
-
     pI830->exa965->num_ops++;
 
 #ifdef I830DEBUG
+    i965_composite_flush_prims(pScrn);
+
     ErrorF("sync after 3dprimitive");
     I830Sync(pScrn);
 #endif
@@ -1436,6 +1482,8 @@ void i965_done_composite(PixmapPtr pDst)
     ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
     I830Ptr pI830 = I830PTR(pScrn);
 
+    i965_composite_flush_prims(pScrn);
+
     {
 	BEGIN_BATCH(4);
    	OUT_BATCH(BRW_PIPE_CONTROL |
diff --git a/src/i965_render.h b/src/i965_render.h
new file mode 100644
index 0000000..f0c1313
--- /dev/null
+++ b/src/i965_render.h
@@ -0,0 +1,18 @@
+#define EXASTATE_SZ 48000
+
+struct i965_exastate_buffer {
+    dri_bo *buf;
+    unsigned char *map;
+
+    dri_bo *vbo;
+    unsigned int vbo_prim_start;
+    unsigned int vbo_used;
+    unsigned int element_size;
+
+    dri_bo *surface_buf;
+    unsigned char *surface_map;
+    int num_ops;
+
+    dri_fence *last_fence;
+    ScrnInfoPtr pScrn;
+};
diff --git a/src/intel_batchbuffer.h b/src/intel_batchbuffer.h
index 26e0fe1..5acdb28 100644
--- a/src/intel_batchbuffer.h
+++ b/src/intel_batchbuffer.h
@@ -25,18 +25,6 @@ struct intelddx_batchbuffer
    GLuint dirty_state;
 };
 
-struct i965_exastate_buffer {
-   dri_bo *buf;
-   unsigned char *map;
-
-   dri_bo *surface_buf;
-   unsigned char *surface_map;
-   int num_ops;
-
-   dri_fence *last_fence;
-   ScrnInfoPtr pScrn;
-};
-
 struct intelddx_batchbuffer *intelddx_batchbuffer_alloc(ScrnInfoPtr pScrn);
 
 void intelddx_batchbuffer_free(struct intelddx_batchbuffer *batch);
commit e85365718a65bba3416161712eff505833afa662
Merge: a7189b5... d46c01d...
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 23 13:54:18 2008 -0800

    Merge remote branch 'cworth/master' into intel-batchbuffer

diff --cc src/i965_render.c
index 91f62bf,f8acdfa..09ad6d9
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@@ -522,18 -566,57 +566,58 @@@ sampler_state_init (struct brw_sampler_
  }
  
  static void
- i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
+ wm_state_init (struct brw_wm_unit_state *wm_state,
+ 	       Bool has_mask,
+ 	       int scratch_offset,
+ 	       int kernel_offset,
+ 	       int sampler_state_offset)
  {
-     /* cc viewport */
-     struct brw_cc_viewport *cc_viewport;
-     struct brw_cc_unit_state *cc_state;
-     struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
+     memset(wm_state, 0, sizeof (*wm_state));
+     wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
+     wm_state->thread1.single_program_flow = 1;
  
-     cc_viewport = (void *)(start_base + cc_viewport_offset);
-     cc_viewport->min_depth = -1.e35;
-     cc_viewport->max_depth = 1.e35;
+     wm_state->thread2.scratch_space_base_pointer = scratch_offset >> 10;
+ 
+     wm_state->thread2.per_thread_scratch_space = 0;
+     wm_state->thread3.const_urb_entry_read_length = 0;
+     wm_state->thread3.const_urb_entry_read_offset = 0;
+ 
+     wm_state->thread3.urb_entry_read_offset = 0;
+     /* wm kernel use urb from 3, see wm_program in compiler module */
+     wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
+ 
+     wm_state->wm4.stats_enable = 1;  /* statistic */
+     wm_state->wm4.sampler_state_pointer = sampler_state_offset >> 5;
+     wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
+     wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
++    wm_state->wm5.transposed_urb_read = 0;
+     wm_state->wm5.thread_dispatch_enable = 1;
+     /* just use 16-pixel dispatch (4 subspans), don't need to change kernel
+      * start point
+      */
+     wm_state->wm5.enable_16_pix = 1;
+     wm_state->wm5.enable_8_pix = 0;
+     wm_state->wm5.early_depth_test = 1;
  
-     cc_state = (void *)(start_base + cc_offset);
+     wm_state->thread0.kernel_start_pointer = kernel_offset >> 6;
+ 
+     /* Each pair of attributes (src/mask coords) is one URB entry */
+     if (has_mask) {
+ 	wm_state->thread1.binding_table_entry_count = 3; /* 2 tex and fb */
+ 	wm_state->thread3.urb_entry_read_length = 2;
+     } else {
+ 	wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
+ 	wm_state->thread3.urb_entry_read_length = 1;
+     }
+ }
+ 
+ static void
+ cc_state_init (struct brw_cc_unit_state *cc_state,
+ 	       int src_blend,
+ 	       int dst_blend,
+ 	       int cc_viewport_offset)
+ {
+     memset(cc_state, 0, sizeof(*cc_state));
      cc_state->cc0.stencil_enable = 0;   /* disable stencil */
      cc_state->cc2.depth_test = 0;       /* disable depth test */
      cc_state->cc2.logicop_enable = 0;   /* disable logic op */
@@@ -598,122 -813,174 +814,182 @@@ gen4_surface_state_init (unsigned char 
      mask_surf_state->ss0.render_cache_read_mode = 0;
      mask_surf_state->ss2.mip_count = 0;
      mask_surf_state->ss2.render_target_rotation = 0;
+ }
  
-     /* default color state */
-     default_color_state = (void *)(start_base + default_color_offset);
-     default_color_state->color[0] = 0.0; /* R */
-     default_color_state->color[1] = 0.0; /* G */
-     default_color_state->color[2] = 0.0; /* B */
-     default_color_state->color[3] = 0.0; /* A */
- 
-     /* src sampler state */
-     src_sampler_state = (void *)(start_base + src_sampler_offset);
-     src_sampler_state->ss0.lod_preclamp = 1; /* GL mode */
-     src_sampler_state->ss0.default_color_mode = 0; /* GL mode */
-     src_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
- 
-     /* mask sampler state */
-     mask_sampler_state = (void *)(start_base + mask_sampler_offset);
-     mask_sampler_state->ss0.lod_preclamp = 1; /* GL mode */
-     mask_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
- 
-     /* vertex shader state */
-     /* Set up the vertex shader to be disabled (passthrough) */
-     vs_state = (void *)(start_base + vs_offset);
-     vs_state->thread4.nr_urb_entries = URB_VS_ENTRIES;
-     vs_state->thread4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
-     vs_state->vs6.vs_enable = 0;
-     vs_state->vs6.vert_cache_disable = 1;
+ void i965_exastate_flush(struct i965_exastate_buffer *state)
+ {
+     if (state->surface_buf) {
 -	ddx_bo_unmap(state->surface_buf);
 -	ddx_bo_unreference(state->surface_buf);
++	dri_bo_unmap(state->surface_buf);
++	dri_bo_unreference(state->surface_buf);
+ 	state->surface_buf = NULL;
+ 	state->surface_map = NULL;
+     }
+ }
  
-     /* sf state */
-     sf_state = (void *)(start_base + sf_offset);
-     sf_state->thread0.kernel_start_pointer = sf_kernel_offset >> 6;
-     sf_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
-     sf_state->sf1.single_program_flow = 1;
-     sf_state->sf1.binding_table_entry_count = 0;
-     sf_state->sf1.thread_priority = 0;
-     sf_state->sf1.floating_point_mode = 0; /* Mesa does this */
-     sf_state->sf1.illegal_op_exception_enable = 1;
-     sf_state->sf1.mask_stack_exception_enable = 1;
-     sf_state->sf1.sw_exception_enable = 1;
-     sf_state->thread2.per_thread_scratch_space = 0;
-     /* scratch space is not used in our kernel */
-     sf_state->thread2.scratch_space_base_pointer = 0;
-     sf_state->thread3.const_urb_entry_read_length = 0; /* no const URBs */
-     sf_state->thread3.const_urb_entry_read_offset = 0; /* no const URBs */
-     sf_state->thread3.urb_entry_read_length = 1; /* 1 URB per vertex */
-     /* don't smash vertex header, read start from dw8 */
-     sf_state->thread3.urb_entry_read_offset = 1;
-     sf_state->thread3.dispatch_grf_start_reg = 3;
-     sf_state->thread4.max_threads = SF_MAX_THREADS - 1;
-     sf_state->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
-     sf_state->thread4.nr_urb_entries = URB_SF_ENTRIES;
-     sf_state->thread4.stats_enable = 1;
-     sf_state->sf5.viewport_transform = FALSE; /* skip viewport */
-     sf_state->sf6.cull_mode = BRW_CULLMODE_NONE;
-     sf_state->sf6.scissor = 0;
-     sf_state->sf7.trifan_pv = 2;
-     sf_state->sf6.dest_org_vbias = 0x8;
-     sf_state->sf6.dest_org_hbias = 0x8;
+ static void
+ i965_exastate_reset(struct i965_exastate_buffer *state)
+ {
+     I830Ptr pI830 = I830PTR(state->pScrn);
  
-     /* wm state */
-     wm_state = (void *)(start_base + wm_offset);
-     wm_state->thread0.kernel_start_pointer = ps_kernel_offset >> 6;
-     wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
-     wm_state->thread1.single_program_flow = 1;
-     wm_state->thread2.scratch_space_base_pointer = wm_scratch_offset>>10;
-     wm_state->thread2.per_thread_scratch_space = 0;
-     wm_state->thread3.const_urb_entry_read_length = 0;
-     wm_state->thread3.const_urb_entry_read_offset = 0;
-     /* Each pair of attributes (src/mask coords) is one URB entry */
-     wm_state->thread3.urb_entry_read_offset = 0;
-     /* wm kernel use urb from 3, see wm_program in compiler module */
-     wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
+     /* First the general state buffer. */
+     if (state->buf == NULL) {
 -	state->buf = ddx_bo_alloc(pI830->bufmgr, "exa state buffer",
++	state->buf = dri_bo_alloc(pI830->bufmgr, "exa state buffer",
+ 				  sizeof (gen4_state_t), 4096,
+ 				  DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_CACHED | DRM_BO_FLAG_CACHED_MAPPED);
 -	ddx_bo_map(state->buf, TRUE);
++	dri_bo_map(state->buf, TRUE);
+ 	state->map = state->buf->virtual;
+ 	gen4_state_init ((void *) state->map);
 -	ddx_bo_unmap(state->buf);
++	dri_bo_unmap(state->buf);
+     }
  
-     wm_state->wm4.stats_enable = 1;  /* statistic */
-     wm_state->wm4.sampler_state_pointer = src_sampler_offset >> 5;
-     wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
-     wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
-     wm_state->wm5.thread_dispatch_enable = 1;
-     /* just use 16-pixel dispatch (4 subspans), don't need to change kernel
-      * start point
-      */
-     wm_state->wm5.enable_16_pix = 1;
-     wm_state->wm5.enable_8_pix = 0;
-     wm_state->wm5.early_depth_test = 1;
+     /* Then the surface state buffer */
+     if (state->surface_buf != NULL && state->num_ops >= GEN4_MAX_OPS) {
 -	ddx_bo_unreference(state->surface_buf);
++	dri_bo_unreference(state->surface_buf);
+ 	state->surface_buf = NULL;
+     }
+ 
+     if (state->surface_buf == NULL) {
 -	state->surface_buf = ddx_bo_alloc(pI830->bufmgr, "exa surface state buffer",
++	state->surface_buf = dri_bo_alloc(pI830->bufmgr, "exa surface state buffer",
+ 					  sizeof (gen4_surface_state_t), 4096,
+ 					  DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_CACHED | DRM_BO_FLAG_CACHED_MAPPED);
 -	ddx_bo_map(state->surface_buf, TRUE);
++	dri_bo_map(state->surface_buf, TRUE);
+ 	state->num_ops = 0;
+ 
+ 	state->surface_map = state->surface_buf->virtual;
+     }
  }
  
- static void
- i965_update_sf_kernel(ScrnInfoPtr pScrn, char *start_base,
- 		      int need_sf_kernel)
+ static sampler_state_filter_t
+ sampler_state_filter_from_picture (int filter)
  {
-     memcpy(start_base + sf_kernel_offset, sf_kernels[need_sf_kernel].kernel, sf_kernels[need_sf_kernel].size);
+     switch (filter) {
+     case PictFilterNearest:
+ 	return SAMPLER_STATE_FILTER_NEAREST;
+     case PictFilterBilinear:
+ 	return SAMPLER_STATE_FILTER_BILINEAR;
+     default:
+ 	return -1;
+     }
  }
  
- static void
- i965_update_ps_kernel(ScrnInfoPtr pScrn, char *start_base,
- 		      int need_ps_kernel)
+ static sampler_state_extend_t
+ sampler_state_extend_from_picture (int repeat)
  {
-     memcpy(start_base + ps_kernel_offset, ps_kernels[need_ps_kernel].kernel, ps_kernels[need_ps_kernel].size);
+     if (repeat)
+ 	return SAMPLER_STATE_EXTEND_REPEAT;
+     else
+ 	return SAMPLER_STATE_EXTEND_NONE;
  }
  
  static void
- i965_exastate_reset(struct i965_exastate_buffer *state)
+ gen4_emit_batch_header (ScrnInfoPtr pScrn)
  {
-     I830Ptr pI830 = I830PTR(state->pScrn);
+     I830Ptr pI830 = I830PTR(pScrn);
+     int sip_kernel_offset;
+     int urb_vs_start, urb_vs_size;
+     int urb_gs_start, urb_gs_size;
+     int urb_clip_start, urb_clip_size;
+     int urb_sf_start, urb_sf_size;
+     int urb_cs_start, urb_cs_size;
  
-     if (state->buf != NULL) {
- 	dri_bo_unreference(state->buf);
- 	state->buf = NULL;
+     urb_vs_start = 0;
+     urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
+     urb_gs_start = urb_vs_start + urb_vs_size;
+     urb_gs_size = URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
+     urb_clip_start = urb_gs_start + urb_gs_size;
+     urb_clip_size = URB_CLIP_ENTRIES * URB_CLIP_ENTRY_SIZE;
+     urb_sf_start = urb_clip_start + urb_clip_size;
+     urb_sf_size = URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
+     urb_cs_start = urb_sf_start + urb_sf_size;
+     urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
+ 
+     IntelEmitInvarientState(pScrn);
+ 
+     sip_kernel_offset = offsetof (gen4_state_t, sip_kernel);
+ 
+     /* Begin the long sequence of commands needed to set up the 3D
+      * rendering pipe
+      */
+     {
+ 	BEGIN_BATCH(2);
+ 	OUT_BATCH(MI_FLUSH |
+ 		  MI_STATE_INSTRUCTION_CACHE_FLUSH |
+ 		  BRW_MI_GLOBAL_SNAPSHOT_RESET);
+ 	OUT_BATCH(MI_NOOP);
+ 	ADVANCE_BATCH();
      }
+     {
+ 	BEGIN_BATCH(20);
+ 
+ /* Match Mesa driver setup */
 -	OUT_BATCH(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
++	if (IS_IGD_GM(pI830)) {
++	    OUT_BATCH(NEW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
++	} else {
++	    OUT_BATCH(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
++	}
+ 
+ 	OUT_BATCH(BRW_CS_URB_STATE | 0);
+ 	OUT_BATCH((0 << 4) |  /* URB Entry Allocation Size */
+ 		  (0 << 0));  /* Number of URB Entries */
+ 
+ 	/* Zero out the two base address registers so all offsets are
+ 	 * absolute.
+ 	 */
+ 	OUT_BATCH(BRW_STATE_BASE_ADDRESS | 4);
+ 
+ 	if (pI830->use_ttm_batch) {
 -	    OUT_RELOC(pI830->exa965->buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
++	    OUT_RELOC(pI830->exa965->buf,
++		      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
++		      BASE_ADDRESS_MODIFY);
+ 
 -	    OUT_RELOC(pI830->exa965->surface_buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
++	    OUT_RELOC(pI830->exa965->surface_buf,
++		      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
++		      BASE_ADDRESS_MODIFY);
+ 	} else {
+ 	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
+ 	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
+ 	}
  
-     state->buf = dri_bo_alloc(pI830->bufmgr, "exa state buffer",
- 			      EXASTATE_SZ, 4096,
- 			      DRM_BO_FLAG_MEM_TT);
-     dri_bo_map(state->buf, TRUE);
+ 	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
+ 	/* general state max addr, disabled */
+ 	OUT_BATCH(0x10000000 | BASE_ADDRESS_MODIFY);
+ 	/* media object state max addr, disabled */
+ 	OUT_BATCH(0x10000000 | BASE_ADDRESS_MODIFY);
+ 
+ 	/* Set system instruction pointer */
+ 	OUT_BATCH(BRW_STATE_SIP | 0);
+ 	OUT_BATCH(sip_kernel_offset);
+ 
+ 	/* URB fence */
+ 	OUT_BATCH(BRW_URB_FENCE |
+ 		  UF0_CS_REALLOC |
+ 		  UF0_SF_REALLOC |
+ 		  UF0_CLIP_REALLOC |
+ 		  UF0_GS_REALLOC |
+ 		  UF0_VS_REALLOC |
+ 		  1);
+ 	OUT_BATCH(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
+ 		  ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
+ 		  ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
+ 	OUT_BATCH(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
+ 		  ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));
+ 
+ 	/* Constant buffer state */
+ 	OUT_BATCH(BRW_CS_URB_STATE | 0);
+ 	OUT_BATCH(((URB_CS_ENTRY_SIZE - 1) << 4) |
+ 		  (URB_CS_ENTRIES << 0));
  
-     state->map = state->buf->virtual;
-     i965_init_state_objects(state->pScrn, state->map);
+ 	/* Pipe control */
+ 	OUT_BATCH(BRW_PIPE_CONTROL |
+ 		  BRW_PIPE_CONTROL_NOWRITE |
+ 		  BRW_PIPE_CONTROL_IS_FLUSH |
+ 		  2);
+ 	OUT_BATCH(0);			       /* Destination address */
+ 	OUT_BATCH(0);			       /* Immediate data low DW */
+ 	OUT_BATCH(0);			       /* Immediate data high DW */
+ 
+ 	ADVANCE_BATCH();
+     }
  }
  
  Bool
@@@ -809,13 -1078,10 +1087,13 @@@ i965_prepare_composite(int op, PictureP
      dest_surf_state->ss0.surface_format = dst_format;
  
      if (pI830->use_ttm_batch) {
- 	uint32_t _ret;
- 	_ret = intelddx_batchbuffer_emit_pixmap(pDst,
- 						DRM_BO_FLAG_MEM_TT |
- 						DRM_BO_FLAG_READ |
- 						DRM_BO_FLAG_WRITE,
- 						pI830->exa965->buf, dest_surf_offset + 4, 0);
- 	dest_surf_state->ss1.base_addr = _ret;
 -    	intelddx_batchbuffer_emit_pixmap(pDst,
 -				     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE,
 -				     DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE,
 -				     pI830->exa965->surface_buf, dest_surf_offset + 4, 0);
++    	dest_surf_state->ss1.base_addr =
++	    intelddx_batchbuffer_emit_pixmap(pDst,
++					     DRM_BO_FLAG_MEM_TT |
++					     DRM_BO_FLAG_WRITE |
++					     DRM_BO_FLAG_READ,
++					     pI830->exa965->surface_buf,
++					     dest_surf_offset + 4, 0);
      } else {
          dest_surf_state->ss1.base_addr = intel_get_pixmap_offset(pDst);
      }
@@@ -831,13 -1097,10 +1109,12 @@@
      src_surf_state->ss0.surface_format = i965_get_card_format(pSrcPicture);
  
      if (pI830->use_ttm_batch) {
- 	uint32_t _ret;
-         _ret = intelddx_batchbuffer_emit_pixmap(pSrc,
- 						DRM_BO_FLAG_MEM_TT |
- 						DRM_BO_FLAG_READ,
- 						pI830->exa965->buf,
- 						src_surf_offset + 4, 0);
- 	src_surf_state->ss1.base_addr = _ret;
 -        intelddx_batchbuffer_emit_pixmap(pSrc,
 -				 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
 -				 DRM_BO_MASK_MEM | DRM_BO_FLAG_READ,
 -				 pI830->exa965->surface_buf, src_surf_offset + 4, 0);
++        src_surf_state->ss1.base_addr =
++	    intelddx_batchbuffer_emit_pixmap(pSrc,
++					     DRM_BO_FLAG_MEM_TT |
++					     DRM_BO_FLAG_READ,
++					     pI830->exa965->surface_buf,
++					     src_surf_offset + 4, 0);
      } else {
          src_surf_state->ss1.base_addr = intel_get_pixmap_offset(pSrc);
      }
@@@ -849,16 -1112,13 +1126,15 @@@
  
      /* setup mask surface */
      if (pMask) {
- 	mask_surf_state = (void *)(start_base + mask_surf_offset);
+ 	mask_surf_state = (void *)(surface_start_base + mask_surf_offset);
     	mask_surf_state->ss0.surface_format = i965_get_card_format(pMaskPicture);
          if (pI830->use_ttm_batch) {
- 	  uint32_t _ret;
- 	  _ret = intelddx_batchbuffer_emit_pixmap(pMask,
- 						  DRM_BO_FLAG_MEM_TT |
- 						  DRM_BO_FLAG_READ,
- 						  pI830->exa965->buf,
- 						  mask_surf_offset + 4, 0);
- 	  mask_surf_state->ss1.base_addr = _ret;
 -	   intelddx_batchbuffer_emit_pixmap(pMask, 
 -				     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
 -				     DRM_BO_MASK_MEM | DRM_BO_FLAG_READ,
 -				     pI830->exa965->surface_buf, mask_surf_offset + 4, 0);
++	   mask_surf_state->ss1.base_addr =
++	       intelddx_batchbuffer_emit_pixmap(pMask,
++						DRM_BO_FLAG_MEM_TT |
++						DRM_BO_FLAG_READ,
++						pI830->exa965->surface_buf,
++						mask_surf_offset + 4, 0);
          } else {
  	    mask_surf_state->ss1.base_addr = intel_get_pixmap_offset(pMask);
  	}
@@@ -1105,8 -1254,7 +1270,9 @@@
  	    	 ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT));
  
  	if (pI830->use_ttm_batch) {
- 	    OUT_RELOC(pI830->exa965->buf,
- 		      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, vb_offset);
 -	    OUT_RELOC(pI830->exa965->surface_buf, DRM_BO_FLAG_MEM_TT, vb_offset);
++	    OUT_RELOC(pI830->exa965->surface_buf,
++		      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
++		      vb_offset);
  
  	} else {
  	    OUT_BATCH(pI830->exa_965_state->offset + vb_offset);
diff --cc src/intel_batchbuffer.c
index 636c9eb,c3b5d8e..42409e8
--- a/src/intel_batchbuffer.c
+++ b/src/intel_batchbuffer.c
@@@ -263,15 -264,11 +266,12 @@@ intelddx_batchbuffer_data(struct inteld
     batch->ptr += bytes;
  }
  
 -Bool
 -intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap, unsigned int flags,
 -			      unsigned int mask, ddx_bo *reloc_buf,
 -			      unsigned int offset, unsigned int delta)
 +uint32_t intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap,
 +					  unsigned int flags,
 +					  dri_bo *reloc_buf,
 +					  unsigned int offset,
 +					  unsigned int delta)
  {
-     ScreenPtr pScreen = pPixmap->drawable.pScreen;
-     ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
-     I830Ptr pI830 = I830PTR(pScrn);
      struct i830_exa_pixmap_priv *driver_priv = exaGetPixmapDriverPrivate(pPixmap);
  
      if (driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED) {
diff --cc src/intel_batchbuffer.h
index 0c0a95e,2244414..26e0fe1
--- a/src/intel_batchbuffer.h
+++ b/src/intel_batchbuffer.h
@@@ -26,10 -24,15 +26,15 @@@ struct intelddx_batchbuffe
  };
  
  struct i965_exastate_buffer {
 -   ddx_bo *buf;
 +   dri_bo *buf;
+    unsigned char *map;
+ 
 -   ddx_bo *surface_buf;
++   dri_bo *surface_buf;
+    unsigned char *surface_map;
+    int num_ops;
+ 
     dri_fence *last_fence;
     ScrnInfoPtr pScrn;
-    unsigned char *map;
  };
  
  struct intelddx_batchbuffer *intelddx_batchbuffer_alloc(ScrnInfoPtr pScrn);
commit a7189b5e81f622edaf3180199c957415a27c2aa6
Author: Eric Anholt <eric at anholt.net>
Date:   Wed Jan 23 13:02:52 2008 -0800

    Replace dri_bufmgr code with nearly-straight mesa code and a remapping header.
    
    While here, fix the relocation arguments to not include the useless mask arg
    and always supply at least DRM_BO_FLAG_READ (otherwise, the kernel rejects our
    buffers).
    
    To avoid any future mistakes with renaming, a make check test is added to make
    sure we covered all the symbols.

diff --git a/src/Makefile.am b/src/Makefile.am
index b014c0e..a36eb1b 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -119,7 +119,9 @@ intel_drv_la_SOURCES = \
 	 dri_bufmgr.c \
          dri_bufmgr.h \
          intel_bufmgr_ttm.c \
-         intel_batchbuffer.c
+         intel_bufmgr_ttm.h \
+         intel_batchbuffer.c \
+         intel_batchbuffer.h
 
 INTEL_G4A =				\
 	packed_yuv_sf.g4a		\
@@ -200,3 +202,7 @@ install-data-local: install-intel_drv_laLTLIBRARIES
 
 uninstall-local:
 	(cd $(DESTDIR)$(intel_drv_ladir) && rm -f i810_drv.so)
+
+TESTS_ENVIRONMENT = srcdir="$(srcdir)"
+TESTS = check-remap.sh
+EXTRA_DIST += $(TESTS)
diff --git a/src/check-remap.sh b/src/check-remap.sh
new file mode 100755
index 0000000..1bcde94
--- /dev/null
+++ b/src/check-remap.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+#
+# For .libs/dri_bufmgr.o and .libs/intel_bufmgr_ttm.o (when present), list the
+# text (T) symbols whose names match "intel" or "dri" and fail if any of them
+# does not contain "ddx_".  Exits 0 if nm is missing or nothing bad is found.
+
+LANG=C
+
+if ! which nm 2>/dev/null >/dev/null; then
+	echo "'nm' not found; skipping test"
+	exit 0
+fi
+
+test -z "$srcdir" && srcdir=.
+status=0
+
+objs="dri_bufmgr.o intel_bufmgr_ttm.o"
+for obj in $objs; do
+	obj=.libs/${obj}
+
+	test -f $obj || continue
+	echo Checking $obj
+
+	syms=`nm "$obj" | grep " T " | cut -d" " -f3 | grep -E "intel|dri"`
+	# Quote $syms so each symbol stays on its own line; unquoted, the list
+	# collapses to a single line and grep -v drops it whole as soon as any
+	# one symbol contains "ddx_", hiding the non-remapped ones.
+	bad_syms=`echo "$syms" | grep -v "ddx_"`
+
+	if test "x$bad_syms" != "x"; then
+	    echo "ERROR: $obj contains non-remapped symbols: $bad_syms"
+	    status=1
+	fi
+done
+
+exit $status
diff --git a/src/dri_bufmgr.c b/src/dri_bufmgr.c
index 578eff6..fc9871a 100644
--- a/src/dri_bufmgr.c
+++ b/src/dri_bufmgr.c
@@ -25,6 +25,9 @@
  *
  */
 
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
 #include "dri_bufmgr.h"
 
 /** @file dri_bufmgr.c
@@ -32,8 +35,8 @@
  * Convenience functions for buffer management methods.
  */
 
-ddx_bo *
-ddx_bo_alloc(ddx_bufmgr *bufmgr, const char *name, unsigned long size,
+dri_bo *
+dri_bo_alloc(dri_bufmgr *bufmgr, const char *name, unsigned long size,
 	     unsigned int alignment, uint64_t location_mask)
 {
    assert((location_mask & ~(DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_MEM_TT |
@@ -44,8 +47,8 @@ ddx_bo_alloc(ddx_bufmgr *bufmgr, const char *name, unsigned long size,
    return bufmgr->bo_alloc(bufmgr, name, size, alignment, location_mask);
 }
 
-ddx_bo *
-ddx_bo_alloc_static(ddx_bufmgr *bufmgr, const char *name, unsigned long offset,
+dri_bo *
+dri_bo_alloc_static(dri_bufmgr *bufmgr, const char *name, unsigned long offset,
 		    unsigned long size, void *virtual,
 		    uint64_t location_mask)
 {
@@ -60,13 +63,13 @@ ddx_bo_alloc_static(ddx_bufmgr *bufmgr, const char *name, unsigned long offset,
 }
 
 void
-ddx_bo_reference(ddx_bo *bo)
+dri_bo_reference(dri_bo *bo)
 {
    bo->bufmgr->bo_reference(bo);
 }
 
 void
-ddx_bo_unreference(ddx_bo *bo)
+dri_bo_unreference(dri_bo *bo)
 {
    if (bo == NULL)
       return;
@@ -75,13 +78,13 @@ ddx_bo_unreference(ddx_bo *bo)
 }
 
 int
-ddx_bo_map(ddx_bo *buf, Bool write_enable)
+dri_bo_map(dri_bo *buf, GLboolean write_enable)
 {
    return buf->bufmgr->bo_map(buf, write_enable);
 }
 
 int
-ddx_bo_unmap(ddx_bo *buf)
+dri_bo_unmap(dri_bo *buf)
 {
    return buf->bufmgr->bo_unmap(buf);
 }
@@ -108,53 +111,54 @@ dri_fence_unreference(dri_fence *fence)
 }
 
 void
-ddx_bo_subdata(ddx_bo *bo, unsigned long offset,
+dri_bo_subdata(dri_bo *bo, unsigned long offset,
 	       unsigned long size, const void *data)
 {
    if (size == 0 || data == NULL)
       return;
 
-   ddx_bo_map(bo, TRUE);
+   dri_bo_map(bo, GL_TRUE);
    memcpy((unsigned char *)bo->virtual + offset, data, size);
-   ddx_bo_unmap(bo);
+   dri_bo_unmap(bo);
 }
 
 void
-ddx_bo_get_subdata(ddx_bo *bo, unsigned long offset,
+dri_bo_get_subdata(dri_bo *bo, unsigned long offset,
 		   unsigned long size, void *data)
 {
    if (size == 0 || data == NULL)
       return;
 
-   ddx_bo_map(bo, FALSE);
+   dri_bo_map(bo, GL_FALSE);
    memcpy(data, (unsigned char *)bo->virtual + offset, size);
-   ddx_bo_unmap(bo);
+   dri_bo_unmap(bo);
 }
 
 void
-ddx_bufmgr_destroy(ddx_bufmgr *bufmgr)
+dri_bufmgr_destroy(dri_bufmgr *bufmgr)
 {
    bufmgr->destroy(bufmgr);
 }
 
 
-void dri_emit_reloc(ddx_bo *batch_buf, uint64_t flags, uint32_t delta, uint32_t offset, ddx_bo *relocatee)
+void dri_emit_reloc(dri_bo *reloc_buf, uint64_t flags, GLuint delta,
+		    GLuint offset, dri_bo *target_buf)
 {
-   batch_buf->bufmgr->emit_reloc(batch_buf, flags, delta, offset, relocatee);
+   reloc_buf->bufmgr->emit_reloc(reloc_buf, flags, delta, offset, target_buf);
 }
 
-void *dri_process_relocs(ddx_bo *batch_buf, uint32_t *count)
+void *dri_process_relocs(dri_bo *batch_buf, GLuint *count)
 {
    return batch_buf->bufmgr->process_relocs(batch_buf, count);
 }
 
-void dri_post_submit(ddx_bo *batch_buf, dri_fence **last_fence)
+void dri_post_submit(dri_bo *batch_buf, dri_fence **last_fence)
 {
    batch_buf->bufmgr->post_submit(batch_buf, last_fence);
 }
 
 void
-dri_bufmgr_set_debug(ddx_bufmgr *bufmgr, Bool enable_debug)
+dri_bufmgr_set_debug(dri_bufmgr *bufmgr, GLboolean enable_debug)
 {
    bufmgr->debug = enable_debug;
 }
diff --git a/src/dri_bufmgr.h b/src/dri_bufmgr.h
index e78f8ec..77ef0f6 100644
--- a/src/dri_bufmgr.h
+++ b/src/dri_bufmgr.h
@@ -34,16 +34,16 @@
 
 #ifndef _DRI_BUFMGR_H_
 #define _DRI_BUFMGR_H_
-#include <stdint.h>
 #include <xf86drm.h>
 
-#include "xf86str.h"
+#include <GL/gl.h>
+#include "dri_bufmgr_remap.h"
 
-typedef struct _ddx_bufmgr ddx_bufmgr;
-typedef struct _ddx_bo ddx_bo;
+typedef struct _dri_bufmgr dri_bufmgr;
+typedef struct _dri_bo dri_bo;
 typedef struct _dri_fence dri_fence;
 
-struct _ddx_bo {
+struct _dri_bo {
    /** Size in bytes of the buffer object. */
    unsigned long size;
    /**
@@ -56,7 +56,7 @@ struct _ddx_bo {
     */
    void *virtual;
    /** Buffer manager context associated with this buffer object */
-   ddx_bufmgr *bufmgr;
+   dri_bufmgr *bufmgr;
 };
 
 struct _dri_fence {
@@ -68,7 +68,7 @@ struct _dri_fence {
     */
    unsigned int type;
    /** Buffer manager context associated with this fence */
-   ddx_bufmgr *bufmgr;
+   dri_bufmgr *bufmgr;
 };
 
 /**
@@ -76,7 +76,7 @@ struct _dri_fence {
  *
  * Contains public methods followed by private storage for the buffer manager.
  */
-struct _ddx_bufmgr {
+struct _dri_bufmgr {
    /**
     * Allocate a buffer object.
     *
@@ -85,9 +85,9 @@ struct _ddx_bufmgr {
     * bo_map() to be used by the CPU, and validated for use using bo_validate()
     * to be used from the graphics device.
     */
-   ddx_bo *(*bo_alloc)(ddx_bufmgr *bufmgr_ctx, const char *name,
+   dri_bo *(*bo_alloc)(dri_bufmgr *bufmgr_ctx, const char *name,
 		       unsigned long size, unsigned int alignment,
-		       unsigned int location_mask);
+		       uint64_t location_mask);
 
    /**
     * Allocates a buffer object for a static allocation.
@@ -95,18 +95,18 @@ struct _ddx_bufmgr {
     * Static allocations are ones such as the front buffer that are offered by
     * the X Server, which are never evicted and never moved.
     */
-   ddx_bo *(*bo_alloc_static)(ddx_bufmgr *bufmgr_ctx, const char *name,
+   dri_bo *(*bo_alloc_static)(dri_bufmgr *bufmgr_ctx, const char *name,
 			      unsigned long offset, unsigned long size,
-			      void *virtual, unsigned int location_mask);
+			      void *virtual, uint64_t location_mask);
 
    /** Takes a reference on a buffer object */
-   void (*bo_reference)(ddx_bo *bo);
+   void (*bo_reference)(dri_bo *bo);
 
    /**
     * Releases a reference on a buffer object, freeing the data if
     * rerefences remain.
     */
-   void (*bo_unreference)(ddx_bo *bo);
+   void (*bo_unreference)(dri_bo *bo);
 
    /**
     * Maps the buffer into userspace.
@@ -114,10 +114,10 @@ struct _ddx_bufmgr {
     * This function will block waiting for any existing fence on the buffer to
     * clear, first.  The resulting mapping is available at buf->virtual.
 \    */
-   int (*bo_map)(ddx_bo *buf, Bool write_enable);
+   int (*bo_map)(dri_bo *buf, GLboolean write_enable);
 
    /** Reduces the refcount on the userspace mapping of the buffer object. */
-   int (*bo_unmap)(ddx_bo *buf);
+   int (*bo_unmap)(dri_bo *buf);
 
    /** Takes a reference on a fence object */
    void (*fence_reference)(dri_fence *fence);
@@ -136,55 +136,87 @@ struct _ddx_bufmgr {
    /**
     * Tears down the buffer manager instance.
     */
-   void (*destroy)(ddx_bufmgr *bufmgr);
-   
+   void (*destroy)(dri_bufmgr *bufmgr);
+
    /**
-    * Add relocation
+    * Add relocation entry in reloc_buf, which will be updated with the
+    * target buffer's real offset on command submission.
+    *
+    * Relocations remain in place for the lifetime of the buffer object.
+    *
+    * \param reloc_buf Buffer to write the relocation into.
+    * \param flags BO flags to be used in validating the target buffer.
+    *	     Applicable flags include:
+    *	     - DRM_BO_FLAG_READ: The buffer will be read in the process of
+    *	       command execution.
+    *	     - DRM_BO_FLAG_WRITE: The buffer will be written in the process of
+    *	       command execution.
+    *	     - DRM_BO_FLAG_MEM_TT: The buffer should be validated in TT memory.
+    *	     - DRM_BO_FLAG_MEM_VRAM: The buffer should be validated in video
+    *	       memory.
+    * \param delta Constant value to be added to the relocation target's offset.
+    * \param offset Byte offset within reloc_buf of the relocated pointer.
+    * \param target Buffer whose offset should be written into the relocation
+    *	     entry.
     */
-   void (*emit_reloc)(ddx_bo *batch_buf, uint64_t flags, uint32_t delta, uint32_t offset, ddx_bo *relocatee);
+   void (*emit_reloc)(dri_bo *reloc_buf, uint64_t flags, GLuint delta,
+		      GLuint offset, dri_bo *target);
 
-  void *(*process_relocs)(ddx_bo *batch_buf, uint32_t *count);
+   /**
+    * Processes the relocations, either in userland or by converting the list
+    * for use in batchbuffer submission.
+    *
+    * Kernel-based implementations will return a pointer to the arguments
+    * to be handed with batchbuffer submission to the kernel.  The userland
+    * implementation performs the buffer validation and emits relocations
+    * into them in the appropriate order.
+    *
+    * \param batch_buf buffer at the root of the tree of relocations
+    * \param count returns the number of buffers validated.
+    * \return relocation record for use in command submission.
+    * */
+   void *(*process_relocs)(dri_bo *batch_buf, GLuint *count);
 
-   void (*post_submit)(ddx_bo *batch_buf, dri_fence **fence);
+   void (*post_submit)(dri_bo *batch_buf, dri_fence **fence);
 
-   Bool debug; /**< Enables verbose debugging printouts */
+   GLboolean debug; /**< Enables verbose debugging printouts */
 };
 
-ddx_bo *ddx_bo_alloc(ddx_bufmgr *bufmgr, const char *name, unsigned long size,
+dri_bo *dri_bo_alloc(dri_bufmgr *bufmgr, const char *name, unsigned long size,
 		     unsigned int alignment, uint64_t location_mask);
-ddx_bo *ddx_bo_alloc_static(ddx_bufmgr *bufmgr, const char *name,
+dri_bo *dri_bo_alloc_static(dri_bufmgr *bufmgr, const char *name,
 			    unsigned long offset, unsigned long size,
 			    void *virtual, uint64_t location_mask);
-void ddx_bo_reference(ddx_bo *bo);
-void ddx_bo_unreference(ddx_bo *bo);
-int ddx_bo_map(ddx_bo *buf, Bool write_enable);
-int ddx_bo_unmap(ddx_bo *buf);
+void dri_bo_reference(dri_bo *bo);
+void dri_bo_unreference(dri_bo *bo);
+int dri_bo_map(dri_bo *buf, GLboolean write_enable);
+int dri_bo_unmap(dri_bo *buf);
 void dri_fence_wait(dri_fence *fence);
 void dri_fence_reference(dri_fence *fence);
 void dri_fence_unreference(dri_fence *fence);
 
-void ddx_bo_subdata(ddx_bo *bo, unsigned long offset,
+void dri_bo_subdata(dri_bo *bo, unsigned long offset,
 		    unsigned long size, const void *data);
-void ddx_bo_get_subdata(ddx_bo *bo, unsigned long offset,
+void dri_bo_get_subdata(dri_bo *bo, unsigned long offset,
 			unsigned long size, void *data);
 
-ddx_bufmgr *ddx_bufmgr_ttm_init(int fd, unsigned int fence_type,
-				unsigned int fence_type_flush);
-
-void ddx_bufmgr_fake_contended_lock_take(ddx_bufmgr *bufmgr);
-ddx_bufmgr *ddx_bufmgr_fake_init(unsigned long low_offset, void *low_virtual,
+void dri_bufmgr_fake_contended_lock_take(dri_bufmgr *bufmgr);
+dri_bufmgr *dri_bufmgr_fake_init(unsigned long low_offset, void *low_virtual,
 				 unsigned long size,
 				 unsigned int (*fence_emit)(void *private),
 				 int (*fence_wait)(void *private,
 						   unsigned int cookie),
 				 void *driver_priv);
-void ddx_bufmgr_destroy(ddx_bufmgr *bufmgr);
-ddx_bo *dri_ttm_bo_create_from_handle(ddx_bufmgr *bufmgr, const char *name,
-				      unsigned int handle);
-
-void dri_emit_reloc(ddx_bo *batch_buf, uint64_t flags, uint32_t delta, uint32_t offset, ddx_bo *relocatee);
-void *dri_process_relocs(ddx_bo *batch_buf, uint32_t *count);
-void dri_post_process_relocs(ddx_bo *batch_buf);
-void dri_post_submit(ddx_bo *batch_buf, dri_fence **last_fence);
-void dri_bufmgr_set_debug(ddx_bufmgr *bufmgr, Bool enable_debug);
+void dri_bufmgr_set_debug(dri_bufmgr *bufmgr, GLboolean enable_debug);
+void dri_bo_fake_disable_backing_store(dri_bo *bo,
+				       void (*invalidate_cb)(dri_bo *bo,
+							     void *ptr),
+				       void *ptr);
+void dri_bufmgr_destroy(dri_bufmgr *bufmgr);
+
+void dri_emit_reloc(dri_bo *reloc_buf, uint64_t flags, GLuint delta,
+		    GLuint offset, dri_bo *target_buf);
+void *dri_process_relocs(dri_bo *batch_buf, uint32_t *count);
+void dri_post_process_relocs(dri_bo *batch_buf);
+void dri_post_submit(dri_bo *batch_buf, dri_fence **last_fence);
 #endif
diff --git a/src/dri_bufmgr_compat.h b/src/dri_bufmgr_compat.h
new file mode 100644
index 0000000..2cd2556
--- /dev/null
+++ b/src/dri_bufmgr_compat.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric at anholt.net>
+ *
+ */
+
+/**
+ * @file dri_bufmgr_compat.h
+ *
+ * This file contains typedefs and such to build the dri_bufmgr files from the
+ * 3d driver.
+ */
+
+typedef Bool GLboolean;
+typedef uint32_t GLuint;
+typedef int32_t GLint;
diff --git a/src/dri_bufmgr_remap.h b/src/dri_bufmgr_remap.h
new file mode 100644
index 0000000..a5e231e
--- /dev/null
+++ b/src/dri_bufmgr_remap.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric at anholt.net>
+ *
+ */
+
+/**
+ * @file dri_bufmgr_remap.h
+ *
+ * This file contains renaming macros for dri_bufmgr.c and intel_bufmgr_ttm.c,
+ * to prevent symbol collisions with the AIGLX-loaded 3D driver.
+ */
+
+#define dri_bo_alloc ddx_dri_bo_alloc
+#define dri_bo_alloc_static ddx_dri_bo_alloc_static
+#define dri_bo_reference ddx_dri_bo_reference
+#define dri_bo_unreference ddx_dri_bo_unreference
+#define dri_bo_map ddx_dri_bo_map
+#define dri_bo_unmap ddx_dri_bo_unmap
+#define dri_fence_wait ddx_dri_fence_wait
+#define dri_fence_reference ddx_dri_fence_reference
+#define dri_fence_unreference ddx_dri_fence_unreference
+#define dri_bo_subdata ddx_dri_bo_subdata
+#define dri_bo_get_subdata ddx_dri_bo_get_subdata
+#define dri_bufmgr_destroy ddx_dri_bufmgr_destroy
+#define dri_emit_reloc ddx_dri_emit_reloc
+#define dri_process_relocs ddx_dri_process_relocs
+#define dri_post_process_relocs ddx_dri_post_process_relocs
+#define dri_post_submit ddx_dri_post_submit
+
+#define intel_ttm_bo_create_from_handle ddx_intel_ttm_bo_create_from_handle
+#define intel_ttm_fence_create_from_arg ddx_intel_ttm_fence_create_from_arg
+#define intel_bufmgr_ttm_init ddx_intel_bufmgr_ttm_init
+
diff --git a/src/i830.h b/src/i830.h
index 923704c..e237c42 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -628,7 +628,7 @@ typedef struct _I830Rec {
    /* batchbuffer support */
    struct i965_exastate_buffer *exa965;
    struct intelddx_batchbuffer *batch;
-   ddx_bufmgr *bufmgr;
+   dri_bufmgr *bufmgr;
    unsigned int maxBatchSize;
    Bool use_ttm_batch;
 } I830Rec;
@@ -645,7 +645,7 @@ typedef struct _I830Rec {
 
 /* i830 pixmap private for TTM */
 struct i830_exa_pixmap_priv {
-    ddx_bo *bo;
+    dri_bo *bo;
     dri_fence *fence;
     int flags;
 };
diff --git a/src/i830_dri.c b/src/i830_dri.c
index 4bea2a7..0857603 100644
--- a/src/i830_dri.c
+++ b/src/i830_dri.c
@@ -531,9 +531,10 @@ I830InitBufMgr(ScreenPtr pScreen)
    else
    	pI830->maxBatchSize = BATCH_SZ;
 
-   pI830->bufmgr = intelddx_bufmgr_ttm_init(pI830->drmSubFD, DRM_FENCE_TYPE_EXE,
-			DRM_FENCE_TYPE_EXE | DRM_I915_FENCE_TYPE_RW,
-			BATCH_SZ);
+   pI830->bufmgr = intel_bufmgr_ttm_init(pI830->drmSubFD, DRM_FENCE_TYPE_EXE,
+					 DRM_FENCE_TYPE_EXE |
+					 DRM_I915_FENCE_TYPE_RW,
+					 BATCH_SZ);
 
    if (!pI830->bufmgr)
 	return;
diff --git a/src/i830_exa.c b/src/i830_exa.c
index 9e563e7..f06ac49 100644
--- a/src/i830_exa.c
+++ b/src/i830_exa.c
@@ -35,9 +35,8 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "xaarop.h"
 #include "i830.h"
 #include "i810_reg.h"
-#include <string.h>
-
 #include "intel_bufmgr_ttm.h"
+#include <string.h>
 
 #ifdef I830DEBUG
 #define DEBUG_I830FALLBACK 1
@@ -239,7 +238,7 @@ I830EXASolid(PixmapPtr pPixmap, int x1, int y1, int x2, int y2)
 	OUT_BATCH((y2 << 16) | (x2 & 0xffff));
 	OUT_PIXMAP_RELOC(pPixmap,
 			 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE,
-			 DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE, 0);
+			 0);
 	OUT_BATCH(pI830->BR[16]);
 	ADVANCE_BATCH();
     }
@@ -331,12 +330,12 @@ I830EXACopy(PixmapPtr pDstPixmap, int src_x1, int src_y1, int dst_x1,
 	OUT_BATCH((dst_y2 << 16) | (dst_x2 & 0xffff));
 	OUT_PIXMAP_RELOC(pDstPixmap,
 			 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE,
-			 DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE, 0);
+			 0);
 	OUT_BATCH((src_y1 << 16) | (src_x1 & 0xffff));
 	OUT_BATCH(src_pitch);
 	OUT_PIXMAP_RELOC(pI830->pSrcPixmap,
 			 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-			 DRM_BO_MASK_MEM | DRM_BO_FLAG_READ, 0);
+			 0);
 
 	ADVANCE_BATCH();
     }
@@ -398,7 +397,7 @@ static void *I830EXACreatePixmap(ScreenPtr pScreen, int size, int align)
     if (size == 0)
 	return new_priv;
 
-    new_priv->bo = ddx_bo_alloc(pI830->bufmgr, "pixmap",
+    new_priv->bo = dri_bo_alloc(pI830->bufmgr, "pixmap",
 				size, 4096, DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_CACHED | DRM_BO_FLAG_CACHED_MAPPED);
 
     return new_priv;
@@ -411,9 +410,9 @@ static void I830EXADestroyPixmap(ScreenPtr pScreen, void *driverPriv)
     struct i830_exa_pixmap_priv *driver_priv = driverPriv;
 
     if (driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED)
-        ddx_bo_unmap(driver_priv->bo);
+        dri_bo_unmap(driver_priv->bo);
 
-    ddx_bo_unreference(driver_priv->bo);
+    dri_bo_unreference(driver_priv->bo);
     xfree(driverPriv);
 }
 
@@ -453,7 +452,7 @@ static Bool I830EXAPrepareAccess(PixmapPtr pPix, int index)
 	if ((driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED))
 	    return TRUE;
 
-	ret = ddx_bo_map(driver_priv->bo, 1);
+	ret = dri_bo_map(driver_priv->bo, 1);
 	if (ret)
 	    return FALSE;
 
@@ -480,7 +479,9 @@ static Bool I830EXAModifyPixmapHeader(PixmapPtr pPixmap, int width, int height,
         driver_priv->flags |= I830_EXA_PIXMAP_IS_FRONTBUFFER;
 
 	/* get a reference to the front buffer handle */
-	driver_priv->bo = intelddx_ttm_bo_create_from_handle(pI830->bufmgr, "front", pI830->front_buffer->bo.handle);
+	driver_priv->bo =
+	    intel_ttm_bo_create_from_handle(pI830->bufmgr, "front",
+					    pI830->front_buffer->bo.handle);
 	miModifyPixmapHeader(pPixmap, width, height, depth,
 			     bitsPerPixel, devKind, NULL);
 
diff --git a/src/i830_render.c b/src/i830_render.c
index 2b54a03..24ad262 100644
--- a/src/i830_render.c
+++ b/src/i830_render.c
@@ -319,7 +319,7 @@ i830_texture_setup(PicturePtr pPict, PixmapPtr pPix, int unit)
 	OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_2 | LOAD_TEXTURE_MAP(unit) | 4);
 	OUT_PIXMAP_RELOC(pPix,
 			 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-			 DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE, TM0S0_USE_FENCE);
+			 TM0S0_USE_FENCE);
 	OUT_BATCH(((pPix->drawable.height - 1) << TM0S1_HEIGHT_SHIFT) |
 		((pPix->drawable.width - 1) << TM0S1_WIDTH_SHIFT) | format);
 	OUT_BATCH((pitch/4 - 1) << TM0S2_PITCH_SHIFT | TM0S2_MAP_2D);
@@ -429,8 +429,10 @@ i830_prepare_composite(int op, PicturePtr pSrcPicture,
 	OUT_BATCH(BUF_3D_ID_COLOR_BACK| BUF_3D_USE_FENCE |
 			BUF_3D_PITCH(dst_pitch));
 	OUT_PIXMAP_RELOC(pDst,
-			 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE,
-			 DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE, 0);
+			 DRM_BO_FLAG_MEM_TT |
+			 DRM_BO_FLAG_READ |
+			 DRM_BO_FLAG_WRITE,
+			 0);
 	OUT_BATCH(MI_NOOP);
 
 	OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
diff --git a/src/i915_render.c b/src/i915_render.c
index b753bc1..151f89d 100644
--- a/src/i915_render.c
+++ b/src/i915_render.c
@@ -344,7 +344,7 @@ i915_prepare_composite(int op, PicturePtr pSrcPicture,
 	OUT_BATCH(0x00000001); /* map 0 */
 	OUT_PIXMAP_RELOC(pI830->texture_pixmaps[0],
 			 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-			 DRM_BO_MASK_MEM | DRM_BO_FLAG_READ, 0);
+			 0);
 	OUT_BATCH(pI830->mapstate[1]);
 	OUT_BATCH(pI830->mapstate[2]);
 
@@ -360,12 +360,12 @@ i915_prepare_composite(int op, PicturePtr pSrcPicture,
 	OUT_BATCH(0x00000003); /* map 0,1 */
 	OUT_PIXMAP_RELOC(pI830->texture_pixmaps[0],
 			 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-			 DRM_BO_MASK_MEM | DRM_BO_FLAG_READ, 0);
+			 0);
 	OUT_BATCH(pI830->mapstate[1]);
 	OUT_BATCH(pI830->mapstate[2]);
 	OUT_PIXMAP_RELOC(pI830->texture_pixmaps[1],
 			 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-			 DRM_BO_MASK_MEM | DRM_BO_FLAG_READ, 0);
+			 0);
 	OUT_BATCH(pI830->mapstate[4]);
 	OUT_BATCH(pI830->mapstate[5]);
 
@@ -387,7 +387,7 @@ i915_prepare_composite(int op, PicturePtr pSrcPicture,
 	OUT_BATCH(BUF_3D_ID_COLOR_BACK| BUF_3D_USE_FENCE|
 		BUF_3D_PITCH(dst_pitch));
 	OUT_PIXMAP_RELOC(pDst, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-			 DRM_BO_MASK_MEM | DRM_BO_FLAG_READ, 0);
+			 0);
 	OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
 	OUT_BATCH(dst_format);
 
diff --git a/src/i965_render.c b/src/i965_render.c
index b1c8d12..91f62bf 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -697,20 +697,20 @@ i965_update_ps_kernel(ScrnInfoPtr pScrn, char *start_base,
     memcpy(start_base + ps_kernel_offset, ps_kernels[need_ps_kernel].kernel, ps_kernels[need_ps_kernel].size);
 }
 
-void
+static void
 i965_exastate_reset(struct i965_exastate_buffer *state)
 {
     I830Ptr pI830 = I830PTR(state->pScrn);
 
     if (state->buf != NULL) {
-	ddx_bo_unreference(state->buf);
+	dri_bo_unreference(state->buf);
 	state->buf = NULL;
     }
 
-    state->buf = ddx_bo_alloc(pI830->bufmgr, "exa state buffer",
+    state->buf = dri_bo_alloc(pI830->bufmgr, "exa state buffer",
 			      EXASTATE_SZ, 4096,
 			      DRM_BO_FLAG_MEM_TT);
-    ddx_bo_map(state->buf, TRUE);
+    dri_bo_map(state->buf, TRUE);
 
     state->map = state->buf->virtual;
     i965_init_state_objects(state->pScrn, state->map);
@@ -811,8 +811,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     if (pI830->use_ttm_batch) {
 	uint32_t _ret;
 	_ret = intelddx_batchbuffer_emit_pixmap(pDst,
-						DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE,
-						DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE | DRM_BO_FLAG_CACHED,
+						DRM_BO_FLAG_MEM_TT |
+						DRM_BO_FLAG_READ |
+						DRM_BO_FLAG_WRITE,
 						pI830->exa965->buf, dest_surf_offset + 4, 0);
 	dest_surf_state->ss1.base_addr = _ret;
     } else {
@@ -832,9 +833,10 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     if (pI830->use_ttm_batch) {
 	uint32_t _ret;
         _ret = intelddx_batchbuffer_emit_pixmap(pSrc,
-						DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-						DRM_BO_MASK_MEM | DRM_BO_FLAG_READ | DRM_BO_FLAG_CACHED,
-						pI830->exa965->buf, src_surf_offset + 4, 0);
+						DRM_BO_FLAG_MEM_TT |
+						DRM_BO_FLAG_READ,
+						pI830->exa965->buf,
+						src_surf_offset + 4, 0);
 	src_surf_state->ss1.base_addr = _ret;
     } else {
         src_surf_state->ss1.base_addr = intel_get_pixmap_offset(pSrc);
@@ -851,10 +853,11 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
    	mask_surf_state->ss0.surface_format = i965_get_card_format(pMaskPicture);
         if (pI830->use_ttm_batch) {
 	  uint32_t _ret;
-	  _ret = intelddx_batchbuffer_emit_pixmap(pMask, 
-				     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-				     DRM_BO_MASK_MEM | DRM_BO_FLAG_READ | DRM_BO_FLAG_CACHED,
-				     pI830->exa965->buf, mask_surf_offset + 4, 0);
+	  _ret = intelddx_batchbuffer_emit_pixmap(pMask,
+						  DRM_BO_FLAG_MEM_TT |
+						  DRM_BO_FLAG_READ,
+						  pI830->exa965->buf,
+						  mask_surf_offset + 4, 0);
 	  mask_surf_state->ss1.base_addr = _ret;
         } else {
 	    mask_surf_state->ss1.base_addr = intel_get_pixmap_offset(pMask);
@@ -1004,9 +1007,13 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
    	OUT_BATCH(BRW_STATE_BASE_ADDRESS | 4);
 
 	if (pI830->use_ttm_batch) {
-	    OUT_RELOC(pI830->exa965->buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
+	    OUT_RELOC(pI830->exa965->buf,
+		      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
+		      BASE_ADDRESS_MODIFY);
 
-	    OUT_RELOC(pI830->exa965->buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
+	    OUT_RELOC(pI830->exa965->buf,
+		      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
+		      BASE_ADDRESS_MODIFY);
 	} else {
 	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
 	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
@@ -1098,7 +1105,8 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	    	 ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT));
 
 	if (pI830->use_ttm_batch) {
-	    OUT_RELOC(pI830->exa965->buf, DRM_BO_FLAG_MEM_TT, vb_offset);
+	    OUT_RELOC(pI830->exa965->buf,
+		      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, vb_offset);
 
 	} else {
 	    OUT_BATCH(pI830->exa_965_state->offset + vb_offset);
@@ -1275,7 +1283,7 @@ void i965_done_composite(PixmapPtr pDst)
     }
 
     if (pI830->use_ttm_batch) {
-	ddx_bo_unmap(pI830->exa965->buf);
+	dri_bo_unmap(pI830->exa965->buf);
 	intelddx_batchbuffer_flush(pI830->batch);
     } else {
 	I830Sync(pScrn);
diff --git a/src/intel_batchbuffer.c b/src/intel_batchbuffer.c
index 25ba4be..636c9eb 100644
--- a/src/intel_batchbuffer.c
+++ b/src/intel_batchbuffer.c
@@ -82,14 +82,14 @@ intelddx_batchbuffer_reset(struct intelddx_batchbuffer *batch)
    I830Ptr pI830 = I830PTR(batch->pScrn);
 
    if (batch->buf != NULL) {
-      ddx_bo_unreference(batch->buf);
+      dri_bo_unreference(batch->buf);
       batch->buf = NULL;
    }
 
-   batch->buf = ddx_bo_alloc(pI830->bufmgr, "batchbuffer",
+   batch->buf = dri_bo_alloc(pI830->bufmgr, "batchbuffer",
 			     pI830->maxBatchSize, 4096,
 			     DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_CACHED | DRM_BO_FLAG_CACHED_MAPPED);
-   ddx_bo_map(batch->buf, TRUE);
+   dri_bo_map(batch->buf, TRUE);
    batch->map = batch->buf->virtual;
    batch->size = pI830->maxBatchSize;
    batch->ptr = batch->map;
@@ -116,10 +116,10 @@ intelddx_batchbuffer_free(struct intelddx_batchbuffer *batch)
       batch->last_fence = NULL;
    }
    if (batch->map) {
-      ddx_bo_unmap(batch->buf);
+      dri_bo_unmap(batch->buf);
       batch->map = NULL;
    }
-   ddx_bo_unreference(batch->buf);
+   dri_bo_unreference(batch->buf);
    batch->buf = NULL;
    free(batch);
 }
@@ -157,7 +157,7 @@ intel_exec_ioctl(ScrnInfoPtr pScrn,
       exit(1);
    }
 
-   fo = intelddx_ttm_fence_create_from_arg(pI830->bufmgr, "fence buffers",
+   fo = intel_ttm_fence_create_from_arg(pI830->bufmgr, "fence buffers",
                                         &execbuf.fence_arg);
    if (!fo) {
       fprintf(stderr, "failed to fence handle: %08x\n", execbuf.fence_arg.handle);
@@ -175,7 +175,7 @@ do_flush_locked(struct intelddx_batchbuffer *batch,
    void *start;
    uint32_t count;
 
-   ddx_bo_unmap(batch->buf);
+   dri_bo_unmap(batch->buf);
    start = dri_process_relocs(batch->buf, &count);
 
    batch->map = NULL;
@@ -244,7 +244,7 @@ intelddx_batchbuffer_finish(struct intelddx_batchbuffer *batch)
  */
 Bool
 intelddx_batchbuffer_emit_reloc(struct intelddx_batchbuffer *batch,
-                             ddx_bo *buffer,
+                             dri_bo *buffer,
                              uint32_t flags, uint32_t delta)
 {
    dri_emit_reloc(batch->buf, flags, delta, batch->ptr - batch->map, buffer);
@@ -263,9 +263,11 @@ intelddx_batchbuffer_data(struct intelddx_batchbuffer *batch,
    batch->ptr += bytes;
 }
 
-uint32_t intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap, unsigned int flags,
-			      unsigned int mask, ddx_bo *reloc_buf,
-			      unsigned int offset, unsigned int delta)
+uint32_t intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap,
+					  unsigned int flags,
+					  dri_bo *reloc_buf,
+					  unsigned int offset,
+					  unsigned int delta)
 {
     ScreenPtr pScreen = pPixmap->drawable.pScreen;
     ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
@@ -273,7 +275,7 @@ uint32_t intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap, unsigned int flags,
     struct i830_exa_pixmap_priv *driver_priv = exaGetPixmapDriverPrivate(pPixmap);
 
     if (driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED) {
-	ddx_bo_unmap(driver_priv->bo);
+	dri_bo_unmap(driver_priv->bo);
 	driver_priv->flags &= ~I830_EXA_PIXMAP_IS_MAPPED;
     }
     dri_emit_reloc(reloc_buf, flags, delta, offset, driver_priv->bo);
diff --git a/src/intel_batchbuffer.h b/src/intel_batchbuffer.h
index c1e3937..0c0a95e 100644
--- a/src/intel_batchbuffer.h
+++ b/src/intel_batchbuffer.h
@@ -13,7 +13,7 @@ struct intelddx_batchbuffer
 {
    ScrnInfoPtr pScrn;
 
-   ddx_bo *buf;
+   dri_bo *buf;
    dri_fence *last_fence;
    uint32_t flags;
 
@@ -26,7 +26,7 @@ struct intelddx_batchbuffer
 };
 
 struct i965_exastate_buffer {
-   ddx_bo *buf;
+   dri_bo *buf;
    dri_fence *last_fence;
    ScrnInfoPtr pScrn;
    unsigned char *map;
@@ -55,8 +55,8 @@ void intelddx_batchbuffer_release_space(struct intelddx_batchbuffer *batch,
                                      uint32_t bytes);
 
 Bool intelddx_batchbuffer_emit_reloc(struct intelddx_batchbuffer *batch,
-                                       ddx_bo *buffer,
-                                       uint32_t flags, uint32_t offset);
+				     dri_bo *buffer,
+				     uint32_t flags, uint32_t offset);
 
 /* Inline functions - might actually be better off with these
  * non-inlined.  Certainly better off switching all command packets to
@@ -93,8 +93,7 @@ intelddx_batchbuffer_require_space(struct intelddx_batchbuffer *batch,
 
 extern uint32_t intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap,
 					     unsigned int flags,
-					     unsigned int mask,
-					     ddx_bo *reloc_buf,
+					     dri_bo *reloc_buf,
 					     unsigned int offset,
 					     unsigned int delta);
 
@@ -124,8 +123,8 @@ extern uint32_t intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap,
    intelddx_batchbuffer_emit_reloc(pI830->batch, buf, flags, delta);	\
 } while (0)
 
-#define OUT_PIXMAP_RELOC(pixmap, flags, mask, delta) if (pI830->use_ttm_batch) { \
-    uint32_t _retval = intelddx_batchbuffer_emit_pixmap((pixmap), (flags), (mask),		\
+#define OUT_PIXMAP_RELOC(pixmap, flags, delta) if (pI830->use_ttm_batch) { \
+    uint32_t _retval = intelddx_batchbuffer_emit_pixmap((pixmap), (flags),		\
                                  pI830->batch->buf, (pI830->batch->ptr - pI830->batch->map), (delta)); \
     intelddx_batchbuffer_emit_dword (pI830->batch, _retval + (delta)); \
   } else {								\
diff --git a/src/intel_bufmgr_ttm.c b/src/intel_bufmgr_ttm.c
index 0fb657f..252c128 100644
--- a/src/intel_bufmgr_ttm.c
+++ b/src/intel_bufmgr_ttm.c
@@ -1,10 +1,10 @@
 /**************************************************************************
- * 
+ *
  * Copyright © 2007 Red Hat Inc.
  * Copyright © 2007 Intel Corporation
  * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -12,20 +12,20 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
- * 
+ *
+ *
  **************************************************************************/
 /*
  * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
@@ -36,18 +36,20 @@
 
 #include <xf86drm.h>
 #include <stdlib.h>
+#include <string.h>
 #include <unistd.h>
+#include <assert.h>
+#include <stdio.h>
 #include "errno.h"
 #include "dri_bufmgr.h"
-#include <xf86mm.h>
+#include "intel_bufmgr_ttm.h"
+#include "string.h"
 
 #include "i915_drm.h"
 
-#include "intel_bufmgr_ttm.h"
-
 #define DBG(...) do {					\
    if (bufmgr_ttm->bufmgr.debug)			\
-     ErrorF(__VA_ARGS__);			\
+      fprintf(stderr, __VA_ARGS__);			\
 } while (0)
 
 /*
@@ -62,18 +64,18 @@
 			DRM_BO_FLAG_EXE)
 
 struct intel_validate_entry {
-    ddx_bo *bo;
+    dri_bo *bo;
     struct drm_i915_op_arg bo_arg;
 };
 
-typedef struct _ddx_bufmgr_ttm {
-   ddx_bufmgr bufmgr;
+typedef struct _dri_bufmgr_ttm {
+    dri_bufmgr bufmgr;
 
-   int fd;
-   unsigned int fence_type;
-   unsigned int fence_type_flush;
+    int fd;
+    unsigned int fence_type;
+    unsigned int fence_type_flush;
 
-   uint32_t max_relocs;
+    uint32_t max_relocs;
 
     struct intel_validate_entry *validate_array;
     int validate_array_size;
@@ -81,24 +83,23 @@ typedef struct _ddx_bufmgr_ttm {
 
     drmBO *cached_reloc_buf;
     uint32_t *cached_reloc_buf_data;
-} ddx_bufmgr_ttm;
-
+} dri_bufmgr_ttm;
 
 /**
  * Private information associated with a relocation that isn't already stored
  * in the relocation buffer to be passed to the kernel.
  */
-struct _ddx_ttm_reloc {
-    ddx_bo *target_buf;
+struct dri_ttm_reloc {
+    dri_bo *target_buf;
     uint64_t validate_flags;
 };
 
-typedef struct _ddx_bo_ttm {
-   ddx_bo bo;
+typedef struct _dri_bo_ttm {
+    dri_bo bo;
 
-   int refcount;		/* Protected by bufmgr->mutex */
-   drmBO drm_bo;
-   const char *name;
+    int refcount;
+    drmBO drm_bo;
+    const char *name;
 
     uint64_t last_flags;
 
@@ -111,44 +112,44 @@ typedef struct _ddx_bo_ttm {
     /** DRM buffer object containing relocation list */
     drmBO *reloc_buf;
     uint32_t *reloc_buf_data;
-    struct _ddx_ttm_reloc *relocs;
+    struct dri_ttm_reloc *relocs;
 
     /**
      * Indicates that the buffer may be shared with other processes, so we
      * can't hold maps beyond when the user does.
      */
-    Bool shared;
+    GLboolean shared;
 
-    Bool delayed_unmap;
+    GLboolean delayed_unmap;
     /* Virtual address from the dri_bo_map whose unmap was delayed. */
     void *saved_virtual;
-} ddx_bo_ttm;
+} dri_bo_ttm;
 
 typedef struct _dri_fence_ttm
 {
-   dri_fence fence;
+    dri_fence fence;
 
-   int refcount;		/* Protected by bufmgr->mutex */
-   const char *name;
-   drmFence drm_fence;
+    int refcount;
+    const char *name;
+    drmFence drm_fence;
 } dri_fence_ttm;
 
-static void dri_ttm_dump_validation_list(ddx_bufmgr_ttm *bufmgr_ttm)
+static void dri_ttm_dump_validation_list(dri_bufmgr_ttm *bufmgr_ttm)
 {
     int i, j;
 
     for (i = 0; i < bufmgr_ttm->validate_count; i++) {
-	ddx_bo *bo = bufmgr_ttm->validate_array[i].bo;
-	ddx_bo_ttm *bo_ttm = (ddx_bo_ttm *)bo;
+	dri_bo *bo = bufmgr_ttm->validate_array[i].bo;
+	dri_bo_ttm *bo_ttm = (dri_bo_ttm *)bo;
 
 	if (bo_ttm->reloc_buf_data != NULL) {
 	    for (j = 0; j < (bo_ttm->reloc_buf_data[0] & 0xffff); j++) {
 		uint32_t *reloc_entry = bo_ttm->reloc_buf_data +
 		    I915_RELOC_HEADER +
 		    j * I915_RELOC0_STRIDE;
-		ddx_bo *target_bo =
+		dri_bo *target_bo =
 		    bufmgr_ttm->validate_array[reloc_entry[2]].bo;
-		ddx_bo_ttm *target_ttm = (ddx_bo_ttm *)target_bo;
+		dri_bo_ttm *target_ttm = (dri_bo_ttm *)target_bo;
 
 		DBG("%2d: %s at 0x%08x -> %s at 0x%08x + 0x%08x\n",
 		    i,
@@ -171,18 +172,18 @@ static void dri_ttm_dump_validation_list(ddx_bufmgr_ttm *bufmgr_ttm)
  * access flags.
  */
 static void
-intelddx_add_validate_buffer(ddx_bo *buf,
+intel_add_validate_buffer(dri_bo *buf,
 			  uint64_t flags)
 {
-    ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)buf->bufmgr;
-    ddx_bo_ttm *ttm_buf = (ddx_bo_ttm *)buf;
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)buf->bufmgr;
+    dri_bo_ttm *ttm_buf = (dri_bo_ttm *)buf;
 
     /* If we delayed doing an unmap to mitigate map/unmap syscall thrashing,
      * do that now.
      */
     if (ttm_buf->delayed_unmap) {
 	drmBOUnmap(bufmgr_ttm->fd, &ttm_buf->drm_bo);
-	ttm_buf->delayed_unmap = FALSE;
+	ttm_buf->delayed_unmap = GL_FALSE;
     }
 
     if (ttm_buf->validate_index == -1) {
@@ -218,7 +219,7 @@ intelddx_add_validate_buffer(ddx_bo *buf,
 
 	/* Fill in array entry */
 	entry->bo = buf;
-	ddx_bo_reference(buf);
+	dri_bo_reference(buf);
 
 	/* Fill in kernel arg */
 	arg = &entry->bo_arg;
@@ -276,17 +277,18 @@ intelddx_add_validate_buffer(ddx_bo *buf,
     }
 }
 
+
 #define RELOC_BUF_SIZE(x) ((I915_RELOC_HEADER + x * I915_RELOC0_STRIDE) * \
 	sizeof(uint32_t))
 
 static int
-intelddx_setup_reloc_list(ddx_bo *bo)
+intel_setup_reloc_list(dri_bo *bo)
 {
-    ddx_bo_ttm *bo_ttm = (ddx_bo_ttm *)bo;
-    ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)bo->bufmgr;
+    dri_bo_ttm *bo_ttm = (dri_bo_ttm *)bo;
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)bo->bufmgr;
     int ret;
 
-    bo_ttm->relocs = malloc(sizeof(struct _ddx_ttm_reloc) *
+    bo_ttm->relocs = malloc(sizeof(struct dri_ttm_reloc) *
 			    bufmgr_ttm->max_relocs);
 
     if (bufmgr_ttm->cached_reloc_buf != NULL) {
@@ -340,153 +342,150 @@ intelddx_setup_reloc_list(ddx_bo *bo)
 int
 driFenceSignaled(DriFenceObject * fence, unsigned type)
 {
-   int signaled;
-   int ret;
+    int signaled;
+    int ret;
 
-   if (fence == NULL)
-      return TRUE;
+    if (fence == NULL)
+	return GL_TRUE;
 
-   ret = drmFenceSignaled(bufmgr_ttm->fd, &fence->fence, type, &signaled);
-   BM_CKFATAL(ret);
-   return signaled;
+    ret = drmFenceSignaled(bufmgr_ttm->fd, &fence->fence, type, &signaled);
+    BM_CKFATAL(ret);
+    return signaled;
 }
 #endif
 
-static ddx_bo *
-dri_ttm_alloc(ddx_bufmgr *bufmgr, const char *name,
+static dri_bo *
+dri_ttm_alloc(dri_bufmgr *bufmgr, const char *name,
 	      unsigned long size, unsigned int alignment,
-	      uint64_t  location_mask)
+	      uint64_t location_mask)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)bufmgr;
-   ddx_bo_ttm *ttm_buf;
-   unsigned int pageSize = getpagesize();
-   int ret;
-   unsigned int flags, hint;
-
-   ttm_buf = malloc(sizeof(*ttm_buf));
-   if (!ttm_buf)
-      return NULL;
-
-   /* The mask argument doesn't do anything for us that we want other than
-    * determine which pool (TTM or local) the buffer is allocated into, so just
-    * pass all of the allocation class flags.
-    */
-   flags = location_mask | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE |
-      DRM_BO_FLAG_EXE;
-   /* No hints we want to use. */
-   hint = 0;
-
-   ret = drmBOCreate(bufmgr_ttm->fd, size, alignment / pageSize,
-		     NULL, flags, hint, &ttm_buf->drm_bo);
-   if (ret != 0) {
-      free(ttm_buf);
-      return NULL;
-   }
-   ttm_buf->bo.size = ttm_buf->drm_bo.size;
-   ttm_buf->bo.offset = ttm_buf->drm_bo.offset;
-   ttm_buf->bo.virtual = NULL;
-   ttm_buf->bo.bufmgr = bufmgr;
-   ttm_buf->name = name;
-   ttm_buf->refcount = 1;
-   ttm_buf->reloc_buf = NULL;
-   ttm_buf->reloc_buf_data = NULL;
-   ttm_buf->relocs = NULL;
-   ttm_buf->last_flags = ttm_buf->drm_bo.flags;
-   ttm_buf->shared = FALSE;
-   ttm_buf->delayed_unmap = FALSE;
-   ttm_buf->validate_index = -1;
-
-#if BUFMGR_DEBUG
-   fprintf(stderr, "bo_create: %p (%s)\n", &ttm_buf->bo, ttm_buf->name);
-#endif
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)bufmgr;
+    dri_bo_ttm *ttm_buf;
+    unsigned int pageSize = getpagesize();
+    int ret;
+    unsigned int flags, hint;
 
-   return &ttm_buf->bo;
+    ttm_buf = malloc(sizeof(*ttm_buf));
+    if (!ttm_buf)
+	return NULL;
+
+    /* The mask argument doesn't do anything for us that we want other than
+     * determine which pool (TTM or local) the buffer is allocated into, so
+     * just pass all of the allocation class flags.
+     */
+    flags = location_mask | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE |
+	DRM_BO_FLAG_EXE;
+    /* No hints we want to use. */
+    hint = 0;
+
+    ret = drmBOCreate(bufmgr_ttm->fd, size, alignment / pageSize,
+		      NULL, flags, hint, &ttm_buf->drm_bo);
+    if (ret != 0) {
+	free(ttm_buf);
+	return NULL;
+    }
+    ttm_buf->bo.size = ttm_buf->drm_bo.size;
+    ttm_buf->bo.offset = ttm_buf->drm_bo.offset;
+    ttm_buf->bo.virtual = NULL;
+    ttm_buf->bo.bufmgr = bufmgr;
+    ttm_buf->name = name;
+    ttm_buf->refcount = 1;
+    ttm_buf->reloc_buf = NULL;
+    ttm_buf->reloc_buf_data = NULL;
+    ttm_buf->relocs = NULL;
+    ttm_buf->last_flags = ttm_buf->drm_bo.flags;
+    ttm_buf->shared = GL_FALSE;
+    ttm_buf->delayed_unmap = GL_FALSE;
+    ttm_buf->validate_index = -1;
+
+    DBG("bo_create: %p (%s) %db\n", &ttm_buf->bo, ttm_buf->name, size);
+
+    return &ttm_buf->bo;
 }
 
 /* Our TTM backend doesn't allow creation of static buffers, as that requires
  * privilege for the non-fake case, and the lock in the fake case where we were
  * working around the X Server not creating buffers and passing handles to us.
  */
-static ddx_bo *
-dri_ttm_alloc_static(ddx_bufmgr *bufmgr, const char *name,
+static dri_bo *
+dri_ttm_alloc_static(dri_bufmgr *bufmgr, const char *name,
 		     unsigned long offset, unsigned long size, void *virtual,
 		     uint64_t location_mask)
 {
-   return NULL;
+    return NULL;
 }
 
-/** Returns a ddx_bo wrapping the given buffer object handle.
+/**
+ * Returns a dri_bo wrapping the given buffer object handle.
  *
  * This can be used when one application needs to pass a buffer object
  * to another.
  */
-ddx_bo *
-intelddx_ttm_bo_create_from_handle(ddx_bufmgr *bufmgr, const char *name,
+dri_bo *
+intel_ttm_bo_create_from_handle(dri_bufmgr *bufmgr, const char *name,
 			      unsigned int handle)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm;
-   ddx_bo_ttm *ttm_buf;
-   int ret;
-
-   bufmgr_ttm = (ddx_bufmgr_ttm *)bufmgr;
-
-   ttm_buf = malloc(sizeof(*ttm_buf));
-   if (!ttm_buf)
-      return NULL;
-
-   ret = drmBOReference(bufmgr_ttm->fd, handle, &ttm_buf->drm_bo);
-   if (ret != 0) {
-      free(ttm_buf);
-      return NULL;
-   }
-   ttm_buf->bo.size = ttm_buf->drm_bo.size;
-   ttm_buf->bo.offset = ttm_buf->drm_bo.offset;
-   ttm_buf->bo.virtual = NULL;
-   ttm_buf->bo.bufmgr = bufmgr;
-   ttm_buf->name = name;
-   ttm_buf->refcount = 1;
-   ttm_buf->reloc_buf = NULL;
-   ttm_buf->reloc_buf_data = NULL;
-   ttm_buf->relocs = NULL;
-   ttm_buf->last_flags = ttm_buf->drm_bo.flags;
-   ttm_buf->shared = TRUE;
-   ttm_buf->delayed_unmap = FALSE;
-   ttm_buf->validate_index = -1;
-
-#if BUFMGR_DEBUG
-   fprintf(stderr, "bo_create_from_handle: %p %08x (%s)\n", &ttm_buf->bo, handle,
-	   ttm_buf->name);
-#endif
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)bufmgr;
+    dri_bo_ttm *ttm_buf;
+    int ret;
 
-   return &ttm_buf->bo;
+    ttm_buf = malloc(sizeof(*ttm_buf));
+    if (!ttm_buf)
+	return NULL;
+
+    ret = drmBOReference(bufmgr_ttm->fd, handle, &ttm_buf->drm_bo);
+    if (ret != 0) {
+       fprintf(stderr, "Couldn't reference %s handle 0x%08x: %s\n",
+	       name, handle, strerror(-ret));
+	free(ttm_buf);
+	return NULL;
+    }
+    ttm_buf->bo.size = ttm_buf->drm_bo.size;
+    ttm_buf->bo.offset = ttm_buf->drm_bo.offset;
+    ttm_buf->bo.virtual = NULL;
+    ttm_buf->bo.bufmgr = bufmgr;
+    ttm_buf->name = name;
+    ttm_buf->refcount = 1;
+    ttm_buf->reloc_buf = NULL;
+    ttm_buf->reloc_buf_data = NULL;
+    ttm_buf->relocs = NULL;
+    ttm_buf->last_flags = ttm_buf->drm_bo.flags;
+    ttm_buf->shared = GL_TRUE;
+    ttm_buf->delayed_unmap = GL_FALSE;
+    ttm_buf->validate_index = -1;
+
+    DBG("bo_create_from_handle: %p %08x (%s)\n",
+	&ttm_buf->bo, handle, ttm_buf->name);
+
+    return &ttm_buf->bo;
 }
 
 static void
-dri_ttm_bo_reference(ddx_bo *buf)
+dri_ttm_bo_reference(dri_bo *buf)
 {
-   ddx_bo_ttm *ttm_buf = (ddx_bo_ttm *)buf;
+    dri_bo_ttm *ttm_buf = (dri_bo_ttm *)buf;
 
-   ttm_buf->refcount++;
+    ttm_buf->refcount++;
 }
 
 static void
-dri_ttm_bo_unreference(ddx_bo *buf)
+dri_ttm_bo_unreference(dri_bo *buf)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)buf->bufmgr;
-   ddx_bo_ttm *ttm_buf = (ddx_bo_ttm *)buf;
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)buf->bufmgr;
+    dri_bo_ttm *ttm_buf = (dri_bo_ttm *)buf;
 
-   if (!buf)
-      return;
+    if (!buf)
+	return;
 
-   if (--ttm_buf->refcount == 0) {
-      int ret;
+    if (--ttm_buf->refcount == 0) {
+	int ret;
 
-      	if (ttm_buf->reloc_buf) {
+	if (ttm_buf->reloc_buf) {
 	    int i;
 
 	    /* Unreference all the target buffers */
 	    for (i = 0; i < (ttm_buf->reloc_buf_data[0] & 0xffff); i++)
-		 ddx_bo_unreference(ttm_buf->relocs[i].target_buf);
+		 dri_bo_unreference(ttm_buf->relocs[i].target_buf);
 	    free(ttm_buf->relocs);
 
 	    if (bufmgr_ttm->cached_reloc_buf == NULL) {
@@ -509,32 +508,30 @@ dri_ttm_bo_unreference(ddx_bo *buf)
 
 	ret = drmBOUnreference(bufmgr_ttm->fd, &ttm_buf->drm_bo);
 	if (ret != 0) {
-	  fprintf(stderr, "drmBOUnreference failed (%s): %s\n", ttm_buf->name,
-		  strerror(-ret));
+	    fprintf(stderr, "drmBOUnreference failed (%s): %s\n",
+		    ttm_buf->name, strerror(-ret));
 	}
-#if BUFMGR_DEBUG
-	fprintf(stderr, "bo_unreference final: %p (%s)\n",
-		&ttm_buf->bo, ttm_buf->name);
-#endif
+	DBG("bo_unreference final: %p (%s)\n", &ttm_buf->bo, ttm_buf->name);
+
 	free(buf);
 	return;
-   }
+    }
 }
 
 static int
-dri_ttm_bo_map(ddx_bo *buf, Bool write_enable)
+dri_ttm_bo_map(dri_bo *buf, GLboolean write_enable)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm;
-   ddx_bo_ttm *ttm_buf = (ddx_bo_ttm *)buf;
-   unsigned int flags;
+    dri_bufmgr_ttm *bufmgr_ttm;
+    dri_bo_ttm *ttm_buf = (dri_bo_ttm *)buf;
+    unsigned int flags;
 
-   bufmgr_ttm = (ddx_bufmgr_ttm *)buf->bufmgr;
+    bufmgr_ttm = (dri_bufmgr_ttm *)buf->bufmgr;
 
-   flags = DRM_BO_FLAG_READ;
-   if (write_enable)
-       flags |= DRM_BO_FLAG_WRITE;
+    flags = DRM_BO_FLAG_READ;
+    if (write_enable)
+	flags |= DRM_BO_FLAG_WRITE;
 
-   assert(buf->virtual == NULL);
+    assert(buf->virtual == NULL);
 
     DBG("bo_map: %p (%s)\n", &ttm_buf->bo, ttm_buf->name);
 
@@ -544,137 +541,128 @@ dri_ttm_bo_map(ddx_bo *buf, Bool write_enable)
 	return 0;
     }
 
-   return drmBOMap(bufmgr_ttm->fd, &ttm_buf->drm_bo, flags, 0, &buf->virtual);
+    return drmBOMap(bufmgr_ttm->fd, &ttm_buf->drm_bo, flags, 0, &buf->virtual);
 }
 
 static int
-dri_ttm_bo_unmap(ddx_bo *buf)
+dri_ttm_bo_unmap(dri_bo *buf)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm;
-   ddx_bo_ttm *ttm_buf = (ddx_bo_ttm *)buf;
+    dri_bufmgr_ttm *bufmgr_ttm;
+    dri_bo_ttm *ttm_buf = (dri_bo_ttm *)buf;
+
+    if (buf == NULL)
+	return 0;
 
-   if (buf == NULL)
-      return 0;
+    bufmgr_ttm = (dri_bufmgr_ttm *)buf->bufmgr;
 
-   bufmgr_ttm = (ddx_bufmgr_ttm *)buf->bufmgr;
+    assert(buf->virtual != NULL);
 
-   assert(buf->virtual != NULL);
+    DBG("bo_unmap: %p (%s)\n", &ttm_buf->bo, ttm_buf->name);
 
-   if (!ttm_buf->shared) {
+    if (!ttm_buf->shared) {
 	ttm_buf->saved_virtual = buf->virtual;
-	ttm_buf->delayed_unmap = TRUE;
+	ttm_buf->delayed_unmap = GL_TRUE;
 	buf->virtual = NULL;
-	return 0;
-   }
-   buf->virtual = NULL;
 
+	return 0;
+    }
 
-#if BUFMGR_DEBUG
-   fprintf(stderr, "bo_unmap: %p (%s)\n", &ttm_buf->bo, ttm_buf->name);
-#endif
+    buf->virtual = NULL;
 
-   return drmBOUnmap(bufmgr_ttm->fd, &ttm_buf->drm_bo);
+    return drmBOUnmap(bufmgr_ttm->fd, &ttm_buf->drm_bo);
 }
 
-/* Returns a ddx_bo wrapping the given buffer object handle.
+/**
+ * Returns a dri_bo wrapping the given buffer object handle.
  *
  * This can be used when one application needs to pass a buffer object
  * to another.
  */
 dri_fence *
-intelddx_ttm_fence_create_from_arg(ddx_bufmgr *bufmgr, const char *name,
+intel_ttm_fence_create_from_arg(dri_bufmgr *bufmgr, const char *name,
 				drm_fence_arg_t *arg)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)bufmgr;
-   dri_fence_ttm *ttm_fence;
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)bufmgr;
+    dri_fence_ttm *ttm_fence;
 
     ttm_fence = malloc(sizeof(*ttm_fence));
-   if (!ttm_fence)
-      return NULL;
-
-   ttm_fence->drm_fence.handle = arg->handle;
-   ttm_fence->drm_fence.fence_class = arg->fence_class;
-   ttm_fence->drm_fence.type = arg->type;
-   ttm_fence->drm_fence.flags = arg->flags;
-   ttm_fence->drm_fence.signaled = 0;
-   ttm_fence->drm_fence.sequence = arg->sequence;
-
-   ttm_fence->fence.bufmgr = bufmgr;
-   ttm_fence->name = name;
-   ttm_fence->refcount = 1;
-
-#if BUFMGR_DEBUG
-   fprintf(stderr, "fence_create_from_handle: %p (%s)\n", &ttm_fence->fence,
-	   ttm_fence->name);
-#endif
+    if (!ttm_fence)
+	return NULL;
+
+    ttm_fence->drm_fence.handle = arg->handle;
+    ttm_fence->drm_fence.fence_class = arg->fence_class;
+    ttm_fence->drm_fence.type = arg->type;
+    ttm_fence->drm_fence.flags = arg->flags;
+    ttm_fence->drm_fence.signaled = 0;
+    ttm_fence->drm_fence.sequence = arg->sequence;
+
+    ttm_fence->fence.bufmgr = bufmgr;
+    ttm_fence->name = name;
+    ttm_fence->refcount = 1;
+
+    DBG("fence_create_from_handle: %p (%s)\n",
+	&ttm_fence->fence, ttm_fence->name);
 
-   return &ttm_fence->fence;
+    return &ttm_fence->fence;
 }
 
 
 static void
 dri_ttm_fence_reference(dri_fence *fence)
 {
-   dri_fence_ttm *fence_ttm = (dri_fence_ttm *)fence;
+    dri_fence_ttm *fence_ttm = (dri_fence_ttm *)fence;
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)fence->bufmgr;
 
-   ++fence_ttm->refcount;
-#if BUFMGR_DEBUG
-   fprintf(stderr, "fence_reference: %p (%s)\n", &fence_ttm->fence,
-	   fence_ttm->name);
-#endif
+    ++fence_ttm->refcount;
+    DBG("fence_reference: %p (%s)\n", &fence_ttm->fence, fence_ttm->name);
 }
 
 static void
 dri_ttm_fence_unreference(dri_fence *fence)
 {
-   dri_fence_ttm *fence_ttm = (dri_fence_ttm *)fence;
-   ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)fence->bufmgr;
+    dri_fence_ttm *fence_ttm = (dri_fence_ttm *)fence;
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)fence->bufmgr;
+
+    if (!fence)
+	return;
 
-   if (!fence)
-      return;
+    DBG("fence_unreference: %p (%s)\n", &fence_ttm->fence, fence_ttm->name);
 
-#if BUFMGR_DEBUG
-   fprintf(stderr, "fence_unreference: %d %p (%s)\n", fence_ttm->refcount, &fence_ttm->fence,
-	   fence_ttm->name);
-#endif
-   if (--fence_ttm->refcount == 0) {
-      int ret;
-
-      ret = drmFenceUnreference(bufmgr_ttm->fd, &fence_ttm->drm_fence);
-      if (ret != 0) {
-	 fprintf(stderr, "drmFenceUnreference failed (%s): %s\n",
-		 fence_ttm->name, strerror(-ret));
-      }
-
-      free(fence);
-      return;
-   }
+    if (--fence_ttm->refcount == 0) {
+	int ret;
+
+	ret = drmFenceUnreference(bufmgr_ttm->fd, &fence_ttm->drm_fence);
+	if (ret != 0) {
+	    fprintf(stderr, "drmFenceUnreference failed (%s): %s\n",
+		    fence_ttm->name, strerror(-ret));
+	}
+
+	free(fence);
+	return;
+    }
 }
 
 static void
 dri_ttm_fence_wait(dri_fence *fence)
 {
-   dri_fence_ttm *fence_ttm = (dri_fence_ttm *)fence;
-   ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)fence->bufmgr;
-   int ret;
-
-   ret = drmFenceWait(bufmgr_ttm->fd, DRM_FENCE_FLAG_WAIT_LAZY, &fence_ttm->drm_fence, 0);
-   if (ret != 0) {
-      ErrorF("%s:%d: Error %d waiting for fence %s.\n",
-		   __FILE__, __LINE__, ret, fence_ttm->name);
-      abort();
-   }
-
-#if BUFMGR_DEBUG
-   fprintf(stderr, "fence_wait: %p (%s)\n", &fence_ttm->fence,
-	   fence_ttm->name);
-#endif
+    dri_fence_ttm *fence_ttm = (dri_fence_ttm *)fence;
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)fence->bufmgr;
+    int ret;
+
+    ret = drmFenceWait(bufmgr_ttm->fd, DRM_FENCE_FLAG_WAIT_LAZY, &fence_ttm->drm_fence, 0);
+    if (ret != 0) {
+	fprintf(stderr, "%s:%d: Error %d waiting for fence %s.\n",
+		__FILE__, __LINE__, ret, fence_ttm->name);
+	abort();
+    }
+
+    DBG("fence_wait: %p (%s)\n", &fence_ttm->fence, fence_ttm->name);
 }
 
 static void
-ddx_bufmgr_ttm_destroy(ddx_bufmgr *bufmgr)
+dri_bufmgr_ttm_destroy(dri_bufmgr *bufmgr)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)bufmgr;
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)bufmgr;
 
     if (bufmgr_ttm->cached_reloc_buf) {
        /* Free the cached kernel BO containing relocation entries */
@@ -685,7 +673,7 @@ ddx_bufmgr_ttm_destroy(ddx_bufmgr *bufmgr)
 
     free(bufmgr_ttm->validate_array);
 
-   free(bufmgr);
+    free(bufmgr);
 }
 
 /**
@@ -698,17 +686,17 @@ ddx_bufmgr_ttm_destroy(ddx_bufmgr *bufmgr)
  * last known offset in target_buf.
  */
 static void
-dri_ttm_emit_reloc(ddx_bo *reloc_buf, uint64_t flags, uint32_t delta,
-		   uint32_t offset, ddx_bo *target_buf)
+dri_ttm_emit_reloc(dri_bo *reloc_buf, uint64_t flags, GLuint delta,
+		   GLuint offset, dri_bo *target_buf)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)reloc_buf->bufmgr;
-   ddx_bo_ttm *reloc_buf_ttm = (ddx_bo_ttm *)reloc_buf;
-   int num_relocs;
-   uint32_t *this_reloc;
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)reloc_buf->bufmgr;
+    dri_bo_ttm *reloc_buf_ttm = (dri_bo_ttm *)reloc_buf;
+    int num_relocs;
+    uint32_t *this_reloc;
 
     /* Create a new relocation list if needed */
     if (reloc_buf_ttm->reloc_buf == NULL)
-	intelddx_setup_reloc_list(reloc_buf);
+	intel_setup_reloc_list(reloc_buf);
 
     num_relocs = (reloc_buf_ttm->reloc_buf_data[0] & 0xffff);
 
@@ -726,12 +714,11 @@ dri_ttm_emit_reloc(ddx_bo *reloc_buf, uint64_t flags, uint32_t delta,
 
     reloc_buf_ttm->relocs[num_relocs].validate_flags = flags;
     reloc_buf_ttm->relocs[num_relocs].target_buf = target_buf;
-    ddx_bo_reference(target_buf);
+    dri_bo_reference(target_buf);
 
     reloc_buf_ttm->reloc_buf_data[0]++; /* Increment relocation count */
     /* Check wraparound */
     assert((reloc_buf_ttm->reloc_buf_data[0] & 0xffff) != 0);
-   return;
 }
 
 /**
@@ -740,9 +727,9 @@ dri_ttm_emit_reloc(ddx_bo *reloc_buf, uint64_t flags, uint32_t delta,
  * index values into the validation list.
  */
 static void
-dri_ttm_bo_process_reloc(ddx_bo *bo)
+dri_ttm_bo_process_reloc(dri_bo *bo)
 {
-    ddx_bo_ttm *bo_ttm = (ddx_bo_ttm *)bo;
+    dri_bo_ttm *bo_ttm = (dri_bo_ttm *)bo;
     unsigned int nr_relocs;
     int i;
 
@@ -752,15 +739,15 @@ dri_ttm_bo_process_reloc(ddx_bo *bo)
     nr_relocs = bo_ttm->reloc_buf_data[0] & 0xffff;
 
     for (i = 0; i < nr_relocs; i++) {
-	struct _ddx_ttm_reloc *r = &bo_ttm->relocs[i];
-	ddx_bo_ttm *target_ttm = (ddx_bo_ttm *)r->target_buf;
+	struct dri_ttm_reloc *r = &bo_ttm->relocs[i];
+	dri_bo_ttm *target_ttm = (dri_bo_ttm *)r->target_buf;
 	uint32_t *reloc_entry;
 
 	/* Continue walking the tree depth-first. */
 	dri_ttm_bo_process_reloc(r->target_buf);
 
 	/* Add the target to the validate list */
-	intelddx_add_validate_buffer(r->target_buf, r->validate_flags);
+	intel_add_validate_buffer(r->target_buf, r->validate_flags);
 
 	/* Update the index of the target in the relocation entry */
 	reloc_entry = bo_ttm->reloc_buf_data + I915_RELOC_HEADER +
@@ -770,19 +757,19 @@ dri_ttm_bo_process_reloc(ddx_bo *bo)
 }
 
 static void *
-dri_ttm_process_reloc(ddx_bo *batch_buf, uint32_t *count)
+dri_ttm_process_reloc(dri_bo *batch_buf, GLuint *count)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)batch_buf->bufmgr;
-
-   /* Update indices and set up the validate list. */
-   dri_ttm_bo_process_reloc(batch_buf);
-   
-   /* Add the batch buffer to the validation list.  There are no relocations
-    * pointing to it.
-    */
-    intelddx_add_validate_buffer(batch_buf,
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)batch_buf->bufmgr;
+
+    /* Update indices and set up the validate list. */
+    dri_ttm_bo_process_reloc(batch_buf);
+
+    /* Add the batch buffer to the validation list.  There are no relocations
+     * pointing to it.
+     */
+    intel_add_validate_buffer(batch_buf,
 			      DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_EXE);
-    
+
     *count = bufmgr_ttm->validate_count;
     return &bufmgr_ttm->validate_array[0].bo_arg;
 }
@@ -816,13 +803,13 @@ intel_get_flags_caching_string(uint64_t flags)
 }
 
 static void
-intel_update_buffer_offsets (ddx_bufmgr_ttm *bufmgr_ttm)
+intel_update_buffer_offsets (dri_bufmgr_ttm *bufmgr_ttm)
 {
     int i;
 
     for (i = 0; i < bufmgr_ttm->validate_count; i++) {
-	ddx_bo *bo = bufmgr_ttm->validate_array[i].bo;
-	ddx_bo_ttm *bo_ttm = (ddx_bo_ttm *)bo;
+	dri_bo *bo = bufmgr_ttm->validate_array[i].bo;
+	dri_bo_ttm *bo_ttm = (dri_bo_ttm *)bo;
 	struct drm_i915_op_arg *arg = &bufmgr_ttm->validate_array[i].bo_arg;
 	struct drm_bo_arg_rep *rep = &arg->d.rep;
 
@@ -847,27 +834,26 @@ intel_update_buffer_offsets (ddx_bufmgr_ttm *bufmgr_ttm)
 }
 
 static void
-dri_ttm_post_submit(ddx_bo *batch_buf, dri_fence **last_fence)
+dri_ttm_post_submit(dri_bo *batch_buf, dri_fence **last_fence)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm = (ddx_bufmgr_ttm *)batch_buf->bufmgr;
-   int i;
-    
-   intel_update_buffer_offsets (bufmgr_ttm);
-
-   if (bufmgr_ttm->bufmgr.debug)
-     dri_ttm_dump_validation_list(bufmgr_ttm);
-   
-   for (i = 0; i < bufmgr_ttm->validate_count; i++) {
-     ddx_bo *bo = bufmgr_ttm->validate_array[i].bo;
-     ddx_bo_ttm *bo_ttm = (ddx_bo_ttm *)bo;
-     
-	/* Disconnect the buffer from the validate list */
-     bo_ttm->validate_index = -1;
-     ddx_bo_unreference(bo);
-     bufmgr_ttm->validate_array[i].bo = NULL;
-   }
-   bufmgr_ttm->validate_count = 0;
+    dri_bufmgr_ttm *bufmgr_ttm = (dri_bufmgr_ttm *)batch_buf->bufmgr;
+    int i;
+
+    intel_update_buffer_offsets (bufmgr_ttm);
+
+    if (bufmgr_ttm->bufmgr.debug)
+	dri_ttm_dump_validation_list(bufmgr_ttm);
+
+    for (i = 0; i < bufmgr_ttm->validate_count; i++) {
+	dri_bo *bo = bufmgr_ttm->validate_array[i].bo;
+	dri_bo_ttm *bo_ttm = (dri_bo_ttm *)bo;
 
+	/* Disconnect the buffer from the validate list */
+	bo_ttm->validate_index = -1;
+	dri_bo_unreference(bo);
+	bufmgr_ttm->validate_array[i].bo = NULL;
+    }
+    bufmgr_ttm->validate_count = 0;
 }
 
 /**
@@ -879,35 +865,42 @@ dri_ttm_post_submit(ddx_bo *batch_buf, dri_fence **last_fence)
  * \param fence_type_flush Driver-specific fence type used for fences with a
  *	  flush.
  */
-ddx_bufmgr *
-intelddx_bufmgr_ttm_init(int fd, unsigned int fence_type,
+dri_bufmgr *
+intel_bufmgr_ttm_init(int fd, unsigned int fence_type,
 		      unsigned int fence_type_flush, int batch_size)
 {
-   ddx_bufmgr_ttm *bufmgr_ttm;
-
-   bufmgr_ttm = calloc(1, sizeof(*bufmgr_ttm));
-   bufmgr_ttm->fd = fd;
-   bufmgr_ttm->fence_type = fence_type;
-   bufmgr_ttm->fence_type_flush = fence_type_flush;
-   bufmgr_ttm->cached_reloc_buf = NULL;
-   bufmgr_ttm->cached_reloc_buf_data = NULL;
-
-   /* lets go with one relocation per every four dwords - purely heuristic */
-   bufmgr_ttm->max_relocs = batch_size / sizeof(uint32_t) / 2 - 2;
-
-   bufmgr_ttm->bufmgr.bo_alloc = dri_ttm_alloc;
-   bufmgr_ttm->bufmgr.bo_alloc_static = dri_ttm_alloc_static;
-   bufmgr_ttm->bufmgr.bo_reference = dri_ttm_bo_reference;
-   bufmgr_ttm->bufmgr.bo_unreference = dri_ttm_bo_unreference;
-   bufmgr_ttm->bufmgr.bo_map = dri_ttm_bo_map;
-   bufmgr_ttm->bufmgr.bo_unmap = dri_ttm_bo_unmap;
-   bufmgr_ttm->bufmgr.fence_reference = dri_ttm_fence_reference;
-   bufmgr_ttm->bufmgr.fence_unreference = dri_ttm_fence_unreference;
-   bufmgr_ttm->bufmgr.fence_wait = dri_ttm_fence_wait;
-   bufmgr_ttm->bufmgr.destroy = ddx_bufmgr_ttm_destroy;
-   bufmgr_ttm->bufmgr.emit_reloc = dri_ttm_emit_reloc;
-   bufmgr_ttm->bufmgr.process_relocs = dri_ttm_process_reloc;
-   bufmgr_ttm->bufmgr.post_submit = dri_ttm_post_submit;
-   return &bufmgr_ttm->bufmgr;
+    dri_bufmgr_ttm *bufmgr_ttm;
+
+    bufmgr_ttm = calloc(1, sizeof(*bufmgr_ttm));
+    bufmgr_ttm->fd = fd;
+    bufmgr_ttm->fence_type = fence_type;
+    bufmgr_ttm->fence_type_flush = fence_type_flush;
+    bufmgr_ttm->cached_reloc_buf = NULL;
+    bufmgr_ttm->cached_reloc_buf_data = NULL;
+
+    /* Let's go with one relocation per every 2 dwords (but round down a bit
+     * since a power of two will mean an extra page allocation for the reloc
+     * buffer).
+     *
+     * Every 4 was too few for the blender benchmark.
+     */
+    bufmgr_ttm->max_relocs = batch_size / sizeof(uint32_t) / 2 - 2;
+
+    bufmgr_ttm->bufmgr.bo_alloc = dri_ttm_alloc;
+    bufmgr_ttm->bufmgr.bo_alloc_static = dri_ttm_alloc_static;
+    bufmgr_ttm->bufmgr.bo_reference = dri_ttm_bo_reference;
+    bufmgr_ttm->bufmgr.bo_unreference = dri_ttm_bo_unreference;
+    bufmgr_ttm->bufmgr.bo_map = dri_ttm_bo_map;
+    bufmgr_ttm->bufmgr.bo_unmap = dri_ttm_bo_unmap;
+    bufmgr_ttm->bufmgr.fence_reference = dri_ttm_fence_reference;
+    bufmgr_ttm->bufmgr.fence_unreference = dri_ttm_fence_unreference;
+    bufmgr_ttm->bufmgr.fence_wait = dri_ttm_fence_wait;
+    bufmgr_ttm->bufmgr.destroy = dri_bufmgr_ttm_destroy;
+    bufmgr_ttm->bufmgr.emit_reloc = dri_ttm_emit_reloc;
+    bufmgr_ttm->bufmgr.process_relocs = dri_ttm_process_reloc;
+    bufmgr_ttm->bufmgr.post_submit = dri_ttm_post_submit;
+    bufmgr_ttm->bufmgr.debug = GL_FALSE;
+
+    return &bufmgr_ttm->bufmgr;
 }
 
diff --git a/src/intel_bufmgr_ttm.h b/src/intel_bufmgr_ttm.h
index d67a151..0738839 100644
--- a/src/intel_bufmgr_ttm.h
+++ b/src/intel_bufmgr_ttm.h
@@ -4,14 +4,14 @@
 
 #include "dri_bufmgr.h"
 
-extern ddx_bo *intelddx_ttm_bo_create_from_handle(ddx_bufmgr *bufmgr, const char *name,
+extern dri_bo *intel_ttm_bo_create_from_handle(dri_bufmgr *bufmgr, const char *name,
 					       unsigned int handle);
 
-dri_fence *intelddx_ttm_fence_create_from_arg(ddx_bufmgr *bufmgr, const char *name,
+dri_fence *intel_ttm_fence_create_from_arg(dri_bufmgr *bufmgr, const char *name,
 					   drm_fence_arg_t *arg);
 
 
-ddx_bufmgr *intelddx_bufmgr_ttm_init(int fd, unsigned int fence_type,
+dri_bufmgr *intel_bufmgr_ttm_init(int fd, unsigned int fence_type,
 				  unsigned int fence_type_flush, int batch_size);
 
 #endif
commit d46c01d97b729bf616795980f6fcac71b2fb9a84
Author: Carl Worth <cworth at cworth.org>
Date:   Fri Dec 21 14:15:08 2007 -0800

    Move PIPE_CONTROL command from prepare_composite to batch_header

diff --git a/src/i965_render.c b/src/i965_render.c
index 42fad58..f8acdfa 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -918,7 +918,7 @@ gen4_emit_batch_header (ScrnInfoPtr pScrn)
 	ADVANCE_BATCH();
     }
     {
-	BEGIN_BATCH(16);
+	BEGIN_BATCH(20);
 
 /* Match Mesa driver setup */
 	OUT_BATCH(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
@@ -970,6 +970,15 @@ gen4_emit_batch_header (ScrnInfoPtr pScrn)
 	OUT_BATCH(((URB_CS_ENTRY_SIZE - 1) << 4) |
 		  (URB_CS_ENTRIES << 0));
 
+	/* Pipe control */
+	OUT_BATCH(BRW_PIPE_CONTROL |
+		  BRW_PIPE_CONTROL_NOWRITE |
+		  BRW_PIPE_CONTROL_IS_FLUSH |
+		  2);
+	OUT_BATCH(0);			       /* Destination address */
+	OUT_BATCH(0);			       /* Immediate data low DW */
+	OUT_BATCH(0);			       /* Immediate data high DW */
+
 	ADVANCE_BATCH();
     }
 }
@@ -1200,16 +1209,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	gen4_emit_batch_header (pScrn);
 
     {
-	BEGIN_BATCH(22);
-	/* Pipe control */
-   	OUT_BATCH(BRW_PIPE_CONTROL |
-		 BRW_PIPE_CONTROL_NOWRITE |
-		 BRW_PIPE_CONTROL_IS_FLUSH |
-		 2);
-   	OUT_BATCH(0);			       /* Destination address */
-   	OUT_BATCH(0);			       /* Immediate data low DW */
-   	OUT_BATCH(0);			       /* Immediate data high DW */
-
+	BEGIN_BATCH(18);
 	/* Binding table pointers */
    	OUT_BATCH(BRW_3DSTATE_BINDING_TABLE_POINTERS | 4);
    	OUT_BATCH(0); /* vs */
commit 23ea41531c0b2fe4fb23bea92a6fd4e1409e6025
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 19 17:08:24 2007 -0800

    Don't emit redundant commands for every composite operation.
    
    Instead add a new gen4_emit_batch_header function that emits these
    commands only once. So far this improves performance from about
    120,000 glyphs/sec. to about 160,000 glyphs/sec. (measured with
    "x11perf -aa10text" on my system).

diff --git a/src/i965_render.c b/src/i965_render.c
index 4867571..42fad58 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -880,6 +880,100 @@ sampler_state_extend_from_picture (int repeat)
 	return SAMPLER_STATE_EXTEND_NONE;
 }
 
+static void
+gen4_emit_batch_header (ScrnInfoPtr pScrn)
+{
+    I830Ptr pI830 = I830PTR(pScrn);
+    int sip_kernel_offset;
+    int urb_vs_start, urb_vs_size;
+    int urb_gs_start, urb_gs_size;
+    int urb_clip_start, urb_clip_size;
+    int urb_sf_start, urb_sf_size;
+    int urb_cs_start, urb_cs_size;
+
+    urb_vs_start = 0;
+    urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
+    urb_gs_start = urb_vs_start + urb_vs_size;
+    urb_gs_size = URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
+    urb_clip_start = urb_gs_start + urb_gs_size;
+    urb_clip_size = URB_CLIP_ENTRIES * URB_CLIP_ENTRY_SIZE;
+    urb_sf_start = urb_clip_start + urb_clip_size;
+    urb_sf_size = URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
+    urb_cs_start = urb_sf_start + urb_sf_size;
+    urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
+
+    IntelEmitInvarientState(pScrn);
+
+    sip_kernel_offset = offsetof (gen4_state_t, sip_kernel);
+
+    /* Begin the long sequence of commands needed to set up the 3D
+     * rendering pipe
+     */
+    {
+	BEGIN_BATCH(2);
+	OUT_BATCH(MI_FLUSH |
+		  MI_STATE_INSTRUCTION_CACHE_FLUSH |
+		  BRW_MI_GLOBAL_SNAPSHOT_RESET);
+	OUT_BATCH(MI_NOOP);
+	ADVANCE_BATCH();
+    }
+    {
+	BEGIN_BATCH(16);
+
+/* Match Mesa driver setup */
+	OUT_BATCH(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
+
+	OUT_BATCH(BRW_CS_URB_STATE | 0);
+	OUT_BATCH((0 << 4) |  /* URB Entry Allocation Size */
+		  (0 << 0));  /* Number of URB Entries */
+
+	/* Zero out the two base address registers so all offsets are
+	 * absolute.
+	 */
+	OUT_BATCH(BRW_STATE_BASE_ADDRESS | 4);
+
+	if (pI830->use_ttm_batch) {
+	    OUT_RELOC(pI830->exa965->buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
+
+	    OUT_RELOC(pI830->exa965->surface_buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
+	} else {
+	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
+	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
+	}
+
+	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
+	/* general state max addr, disabled */
+	OUT_BATCH(0x10000000 | BASE_ADDRESS_MODIFY);
+	/* media object state max addr, disabled */
+	OUT_BATCH(0x10000000 | BASE_ADDRESS_MODIFY);
+
+	/* Set system instruction pointer */
+	OUT_BATCH(BRW_STATE_SIP | 0);
+	OUT_BATCH(sip_kernel_offset);
+
+	/* URB fence */
+	OUT_BATCH(BRW_URB_FENCE |
+		  UF0_CS_REALLOC |
+		  UF0_SF_REALLOC |
+		  UF0_CLIP_REALLOC |
+		  UF0_GS_REALLOC |
+		  UF0_VS_REALLOC |
+		  1);
+	OUT_BATCH(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
+		  ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
+		  ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
+	OUT_BATCH(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
+		  ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));
+
+	/* Constant buffer state */
+	OUT_BATCH(BRW_CS_URB_STATE | 0);
+	OUT_BATCH(((URB_CS_ENTRY_SIZE - 1) << 4) |
+		  (URB_CS_ENTRIES << 0));
+
+	ADVANCE_BATCH();
+    }
+}
+
 Bool
 i965_prepare_composite(int op, PicturePtr pSrcPicture,
 		       PicturePtr pMaskPicture, PicturePtr pDstPicture,
@@ -891,7 +985,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     CARD32 mask_pitch = 0, mask_tile_format = 0, mask_tiled = 0;
     CARD32 dst_format, dst_pitch, dst_tile_format = 0, dst_tiled = 0;
     Bool rotation_program = FALSE;
-    int wm_state_offset, sip_kernel_offset;
+    int wm_state_offset;
     int sf_state_offset, cc_state_offset;
     char *surface_start_base;
     void *surface_map;
@@ -899,11 +993,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     sampler_state_extend_t src_extend, mask_extend;
     struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
     CARD32 *binding_table;
-    int urb_vs_start, urb_vs_size;
-    int urb_gs_start, urb_gs_size;
-    int urb_clip_start, urb_clip_size;
-    int urb_sf_start, urb_sf_size;
-    int urb_cs_start, urb_cs_size;
     CARD32 src_blend, dst_blend;
 
     if (pI830->use_ttm_batch) {
@@ -916,7 +1005,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
     surface_start_base = surface_map;
 
-    IntelEmitInvarientState(pScrn);
     *pI830->last_3d = LAST_3D_RENDER;
 
     src_pitch = intel_get_pixmap_pitch(pSrc);
@@ -1102,71 +1190,17 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 				    [mask_extend]);
     }
 
-    sip_kernel_offset = offsetof (gen4_state_t, sip_kernel);
-    
     cc_state_offset = offsetof (gen4_state_t,
 				cc_state[src_blend][dst_blend]);
 
-    urb_vs_start = 0;
-    urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
-    urb_gs_start = urb_vs_start + urb_vs_size;
-    urb_gs_size = URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
-    urb_clip_start = urb_gs_start + urb_gs_size;
-    urb_clip_size = URB_CLIP_ENTRIES * URB_CLIP_ENTRY_SIZE;
-    urb_sf_start = urb_clip_start + urb_clip_size;
-    urb_sf_size = URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
-    urb_cs_start = urb_sf_start + urb_sf_size;
-    urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
+    /* Any commands that don't change from one composite operation to
+     * the next we simply emit once at the beginning of the entire
+     * batch. */
+    if (pI830->exa965->num_ops == 0)
+	gen4_emit_batch_header (pScrn);
 
-    /* Begin the long sequence of commands needed to set up the 3D
-     * rendering pipe
-     */
     {
-	BEGIN_BATCH(2);
-   	OUT_BATCH(MI_FLUSH |
-		  MI_STATE_INSTRUCTION_CACHE_FLUSH |
-		  BRW_MI_GLOBAL_SNAPSHOT_RESET);
-	OUT_BATCH(MI_NOOP);
-	ADVANCE_BATCH();
-    }
-    {
-        BEGIN_BATCH(12);
-
-        /* Match Mesa driver setup */
-        OUT_BATCH(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
-
-   	OUT_BATCH(BRW_CS_URB_STATE | 0);
-   	OUT_BATCH((0 << 4) |  /* URB Entry Allocation Size */
-		 (0 << 0));  /* Number of URB Entries */
-
-	/* Zero out the two base address registers so all offsets are
-	 * absolute.
-	 */
-   	OUT_BATCH(BRW_STATE_BASE_ADDRESS | 4);
-
-	if (pI830->use_ttm_batch) {
-	    OUT_RELOC(pI830->exa965->buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
-
-	    OUT_RELOC(pI830->exa965->surface_buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
-	} else {
-	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
-	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
-	}
-
-   	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);  /* media base addr, don't care */
-	/* general state max addr, disabled */
-   	OUT_BATCH(0x10000000 | BASE_ADDRESS_MODIFY);
-	/* media object state max addr, disabled */
-   	OUT_BATCH(0x10000000 | BASE_ADDRESS_MODIFY);
-
-	/* Set system instruction pointer */
-   	OUT_BATCH(BRW_STATE_SIP | 0);
-   	OUT_BATCH(sip_kernel_offset);
-	OUT_BATCH(MI_NOOP);
-	ADVANCE_BATCH();
-    }
-    {
-	BEGIN_BATCH(26);
+	BEGIN_BATCH(22);
 	/* Pipe control */
    	OUT_BATCH(BRW_PIPE_CONTROL |
 		 BRW_PIPE_CONTROL_NOWRITE |
@@ -1207,25 +1241,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	OUT_BATCH(sf_state_offset); /* 32 byte aligned */
 	OUT_BATCH(wm_state_offset); /* 32 byte aligned */
 	OUT_BATCH(cc_state_offset); /* 64 byte aligned */
-
-	/* URB fence */
-   	OUT_BATCH(BRW_URB_FENCE |
-        	 UF0_CS_REALLOC |
-	    	 UF0_SF_REALLOC |
-	    	 UF0_CLIP_REALLOC |
-	         UF0_GS_REALLOC |
-	         UF0_VS_REALLOC |
-	    	 1);
-   	OUT_BATCH(((urb_clip_start + urb_clip_size) << UF1_CLIP_FENCE_SHIFT) |
-	    	 ((urb_gs_start + urb_gs_size) << UF1_GS_FENCE_SHIFT) |
-	    	 ((urb_vs_start + urb_vs_size) << UF1_VS_FENCE_SHIFT));
-   	OUT_BATCH(((urb_cs_start + urb_cs_size) << UF2_CS_FENCE_SHIFT) |
-	     	 ((urb_sf_start + urb_sf_size) << UF2_SF_FENCE_SHIFT));
-
-	/* Constant buffer state */
-   	OUT_BATCH(BRW_CS_URB_STATE | 0);
-   	OUT_BATCH(((URB_CS_ENTRY_SIZE - 1) << 4) |
-	    	 (URB_CS_ENTRIES << 0));
 	ADVANCE_BATCH();
     }
     {
commit 3b1a5ea69e2baa8a6053740246da2f62440afb7d
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 19 17:00:42 2007 -0800

    Move increment of num_ops from surface_state_init to i965_composite

diff --git a/src/i965_render.c b/src/i965_render.c
index daa2ae8..4867571 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -813,8 +813,6 @@ gen4_surface_state_init (unsigned char *start_base,
     mask_surf_state->ss0.render_cache_read_mode = 0;
     mask_surf_state->ss2.mip_count = 0;
     mask_surf_state->ss2.render_target_rotation = 0;
-
-    state->num_ops++;
 }
 
 void i965_exastate_flush(struct i965_exastate_buffer *state)
@@ -1387,6 +1385,8 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
 
     vb_index = i;
 
+    pI830->exa965->num_ops++;
+
 #ifdef I830DEBUG
     ErrorF("sync after 3dprimitive");
     I830Sync(pScrn);
commit 5e89225649abe39c64c27bd7f0684c84d1eccb8f
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 19 16:07:31 2007 -0800

    Allocate gen4_state_t buffer object at the right size.
    
    Instead of allocating at EXASTATE_SZ along with a silly macro for
    verifying that that was big enough.

diff --git a/src/i965_render.c b/src/i965_render.c
index 3916788..daa2ae8 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -435,9 +435,6 @@ typedef struct _gen4_state {
 					    [SAMPLER_STATE_EXTEND_COUNT];
 } gen4_state_t;
 
-char gen4_state_too_big[(EXASTATE_SZ >=
-			 sizeof(gen4_state_t)) ? 1 : -1];
-
 /* How many composite operations will we fit in one object. */
 #define GEN4_MAX_OPS			32
 #define GEN4_SURFACE_STATE_PER_OP	3
@@ -838,7 +835,7 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
     /* First the general state buffer. */
     if (state->buf == NULL) {
 	state->buf = ddx_bo_alloc(pI830->bufmgr, "exa state buffer",
-				  EXASTATE_SZ, 4096,
+				  sizeof (gen4_state_t), 4096,
 				  DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_CACHED | DRM_BO_FLAG_CACHED_MAPPED);
 	ddx_bo_map(state->buf, TRUE);
 	state->map = state->buf->virtual;
commit 07f79fef66bc6b13c50fe5ae4808b2fb8cae813a
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 19 16:05:36 2007 -0800

    Move several file-scope variables to function-scope where they belong.

diff --git a/src/i965_render.c b/src/i965_render.c
index c513010..3916788 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -58,6 +58,24 @@ do { 							\
 } while(0)
 #endif
 
+/* Set up a default static partitioning of the URB, which is supposed to
+ * allow anything we would want to do, at potentially lower performance.
+ */
+#define URB_CS_ENTRY_SIZE     0
+#define URB_CS_ENTRIES	      0
+
+#define URB_VS_ENTRY_SIZE     1	  // each 512-bit row
+#define URB_VS_ENTRIES	      8	  // we needs at least 8 entries
+
+#define URB_GS_ENTRY_SIZE     0
+#define URB_GS_ENTRIES	      0
+
+#define URB_CLIP_ENTRY_SIZE   0
+#define URB_CLIP_ENTRIES      0
+
+#define URB_SF_ENTRY_SIZE     2
+#define URB_SF_ENTRIES	      1
+
 struct blendinfo {
     Bool dst_alpha;
     Bool src_alpha;
@@ -250,27 +268,14 @@ i965_check_composite(int op, PicturePtr pSrcPicture, PicturePtr pMaskPicture,
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define BRW_GRF_BLOCKS(nreg)    ((nreg + 15) / 16 - 1)
 
-static int urb_vs_start, urb_vs_size;
-static int urb_gs_start, urb_gs_size;
-static int urb_clip_start, urb_clip_size;
-static int urb_sf_start, urb_sf_size;
-static int urb_cs_start, urb_cs_size;
-
-static struct brw_surface_state *dest_surf_state;
-static struct brw_surface_state *src_surf_state;
-static struct brw_surface_state *mask_surf_state;
-
-static CARD32 *binding_table;
-
 /* these offsets will remain the same for all buffers post allocation */
 static int dest_surf_offset, src_surf_offset, mask_surf_offset;
 static int vb_offset;
 static int binding_table_offset;
+
 static float *vb;
 static int vb_index;
 
-static CARD32 src_blend, dst_blend;
-
 static const CARD32 sip_kernel_static[][4] = {
 /*    wait (1) a0<1>UW a145<0,1,0>UW { align1 +  } */
     { 0x00000030, 0x20000108, 0x00001220, 0x00000000 },
@@ -314,12 +319,6 @@ static const CARD32 sf_kernel_rotation_static[][4] = {
 #include "exa_sf_rotation_prog.h"
 };
 
-struct i965_kernels {
-    void *kernel;
-    int size;
-
-};
-
 /* ps kernels */
 #define PS_KERNEL_NUM_GRF   32
 #define PS_MAX_THREADS	   32
@@ -491,49 +490,6 @@ i965_check_rotation_transform(PictTransformPtr t)
 	return FALSE;
 }
 
-/* initialise the state offsets these should not change at runtime */
-static void
-i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
-{
-    static int init = 0;
-
-    if (init)
-	return;
-
-    init = 1;
-
-    /* Set up a default static partitioning of the URB, which is supposed to
-     * allow anything we would want to do, at potentially lower performance.
-     */
-#define URB_CS_ENTRY_SIZE     0
-#define URB_CS_ENTRIES	      0
-
-#define URB_VS_ENTRY_SIZE     1	  // each 512-bit row
-#define URB_VS_ENTRIES	      8	  // we needs at least 8 entries
-
-#define URB_GS_ENTRY_SIZE     0
-#define URB_GS_ENTRIES	      0
-
-#define URB_CLIP_ENTRY_SIZE   0
-#define URB_CLIP_ENTRIES      0
-
-#define URB_SF_ENTRY_SIZE     2
-#define URB_SF_ENTRIES	      1
-
-    urb_vs_start = 0;
-    urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
-    urb_gs_start = urb_vs_start + urb_vs_size;
-    urb_gs_size = URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
-    urb_clip_start = urb_gs_start + urb_gs_size;
-    urb_clip_size = URB_CLIP_ENTRIES * URB_CLIP_ENTRY_SIZE;
-    urb_sf_start = urb_clip_start + urb_clip_size;
-    urb_sf_size = URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
-    urb_cs_start = urb_sf_start + urb_sf_size;
-    urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
-
-    //    assert(total_state_size < pI830->exa_965_state->size);
-}
-
 static void
 sf_state_init (struct brw_sf_unit_state *sf_state, int kernel_offset)
 {
@@ -942,13 +898,18 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     Bool rotation_program = FALSE;
     int wm_state_offset, sip_kernel_offset;
     int sf_state_offset, cc_state_offset;
-    char *start_base;
-    void *map;
-    gen4_state_t *gen4_state;
     char *surface_start_base;
     void *surface_map;
     sampler_state_filter_t src_filter, mask_filter;
     sampler_state_extend_t src_extend, mask_extend;
+    struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
+    CARD32 *binding_table;
+    int urb_vs_start, urb_vs_size;
+    int urb_gs_start, urb_gs_size;
+    int urb_clip_start, urb_clip_size;
+    int urb_sf_start, urb_sf_size;
+    int urb_cs_start, urb_cs_size;
+    CARD32 src_blend, dst_blend;
 
     if (pI830->use_ttm_batch) {
 	i965_exastate_reset(pI830->exa965);
@@ -1151,6 +1112,17 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     cc_state_offset = offsetof (gen4_state_t,
 				cc_state[src_blend][dst_blend]);
 
+    urb_vs_start = 0;
+    urb_vs_size = URB_VS_ENTRIES * URB_VS_ENTRY_SIZE;
+    urb_gs_start = urb_vs_start + urb_vs_size;
+    urb_gs_size = URB_GS_ENTRIES * URB_GS_ENTRY_SIZE;
+    urb_clip_start = urb_gs_start + urb_gs_size;
+    urb_clip_size = URB_CLIP_ENTRIES * URB_CLIP_ENTRY_SIZE;
+    urb_sf_start = urb_clip_start + urb_clip_size;
+    urb_sf_size = URB_SF_ENTRIES * URB_SF_ENTRY_SIZE;
+    urb_cs_start = urb_sf_start + urb_sf_size;
+    urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
+
     /* Begin the long sequence of commands needed to set up the 3D
      * rendering pipe
      */
@@ -1470,11 +1442,7 @@ i965_init_exa_state(ScrnInfoPtr pScrn)
 {
     I830Ptr pI830 = I830PTR(pScrn);
 
-    i965_init_state_offsets(pScrn, EXASTATE_SZ);
-
     if (pI830->use_ttm_batch) {
-
-	
 	pI830->exa965 = i965_exastate_alloc(pScrn);
     } else {
 	void *map = pI830->FbBase + pI830->exa_965_state->offset;
commit ae84972e94f3a644955f8d64592587043416efa2
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 19 07:33:29 2007 -0800

    Increase the number of composite operations performed in each batch.
    
    We increase from 1 to 32 with this commit with the following
    approximate performance improvements (measured with
    "x11perf -aa10text" on my system).
    
    Operations	Glyphs/sec.
    ----------	-----------
    1		 10,000
    2		 20,000
    4		 37,000
    8		 67,000
    16		110,000
    32		122,000

diff --git a/src/i965_render.c b/src/i965_render.c
index 1042616..c513010 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -440,7 +440,7 @@ char gen4_state_too_big[(EXASTATE_SZ >=
 			 sizeof(gen4_state_t)) ? 1 : -1];
 
 /* How many composite operations will we fit in one object. */
-#define GEN4_MAX_OPS			1
+#define GEN4_MAX_OPS			32
 #define GEN4_SURFACE_STATE_PER_OP	3
 #define GEN4_MAX_SURFACE_STATES		(GEN4_MAX_OPS * GEN4_SURFACE_STATE_PER_OP)
 /* We only need 3, but we use 8 to get the proper alignment. */
commit 44421005f29f52fd918597bbee0652432edb8390
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 19 07:29:43 2007 -0800

    Don't allocate more surface state than we actually use
    
    This improves performance from ~ 4000 to ~ 10000 glyphs/sec. for
    "x11perf -aa10text" on my system.

diff --git a/src/i965_render.c b/src/i965_render.c
index 08c88ec..1042616 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -440,7 +440,7 @@ char gen4_state_too_big[(EXASTATE_SZ >=
 			 sizeof(gen4_state_t)) ? 1 : -1];
 
 /* How many composite operations will we fit in one object. */
-#define GEN4_MAX_OPS			1024
+#define GEN4_MAX_OPS			1
 #define GEN4_SURFACE_STATE_PER_OP	3
 #define GEN4_MAX_SURFACE_STATES		(GEN4_MAX_OPS * GEN4_SURFACE_STATE_PER_OP)
 /* We only need 3, but we use 8 to get the proper alignment. */
@@ -891,7 +891,7 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
     }
 
     /* Then the surface state buffer */
-    if (state->surface_buf != NULL && state->num_ops) {
+    if (state->surface_buf != NULL && state->num_ops >= GEN4_MAX_OPS) {
 	ddx_bo_unreference(state->surface_buf);
 	state->surface_buf = NULL;
     }
@@ -1448,7 +1448,7 @@ void i965_done_composite(PixmapPtr pDst)
 	ADVANCE_BATCH();
     }
 
-    if (pI830->use_ttm_batch && pI830->exa965->num_ops) {
+    if (pI830->use_ttm_batch && pI830->exa965->num_ops >= GEN4_MAX_OPS) {
 	intelddx_batchbuffer_flush(pI830->batch);
     }
 }
commit 0965f59aa191405ef5d4b0a8ec701b075ba18f29
Author: Dave Airlie <airlied at linux.ie>
Date:   Thu Dec 13 18:58:57 2007 +1000

    dirty hack to send index into kernel - NEEDS NEW DRM

diff --git a/src/i830.h b/src/i830.h
index cbc1b93..8314afe 100644
--- a/src/i830.h
+++ b/src/i830.h
@@ -813,6 +813,7 @@ void i965_composite(PixmapPtr pDst, int srcX, int srcY,
 		    int maskX, int maskY, int dstX, int dstY, int w, int h);
 void i965_done_composite(PixmapPtr pDst);
 int i965_init_exa_state(ScrnInfoPtr pScrn);
+void i965_exastate_flush(struct i965_exastate_buffer *state);
 void
 i830_get_transformed_coordinates(int x, int y, PictTransformPtr transform,
 				 float *x_out, float *y_out);
diff --git a/src/i830_exa.c b/src/i830_exa.c
index c164bab..dc1c461 100644
--- a/src/i830_exa.c
+++ b/src/i830_exa.c
@@ -165,7 +165,7 @@ I830EXASync(ScreenPtr pScreen, int marker)
 {
     ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
 
-    I830Sync(pScrn);
+//    I830Sync(pScrn);
 }
 
 /**
diff --git a/src/i965_render.c b/src/i965_render.c
index fa9c88e..08c88ec 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -864,6 +864,16 @@ gen4_surface_state_init (unsigned char *start_base,
     state->num_ops++;
 }
 
+void i965_exastate_flush(struct i965_exastate_buffer *state)
+{
+    if (state->surface_buf) {
+	ddx_bo_unmap(state->surface_buf);
+	ddx_bo_unreference(state->surface_buf);
+	state->surface_buf = NULL;
+	state->surface_map = NULL;
+    }
+}
+
 static void
 i965_exastate_reset(struct i965_exastate_buffer *state)
 {
@@ -894,7 +904,6 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
 	state->num_ops = 0;
 
 	state->surface_map = state->surface_buf->virtual;
-	gen4_surface_state_init (state->surface_map, state);
     }
 }
 
@@ -944,6 +953,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     if (pI830->use_ttm_batch) {
 	i965_exastate_reset(pI830->exa965);
 	surface_map = pI830->exa965->surface_map;
+	gen4_surface_state_init (surface_map, pI830->exa965);
     }else{
 	surface_map = pI830->exa_965_state->offset + pI830->FbBase;
     }
@@ -1439,12 +1449,8 @@ void i965_done_composite(PixmapPtr pDst)
     }
 
     if (pI830->use_ttm_batch && pI830->exa965->num_ops) {
-	ddx_bo_unmap(pI830->exa965->surface_buf);
 	intelddx_batchbuffer_flush(pI830->batch);
-    } else {
-	I830Sync(pScrn);
     }
-
 }
 
 static struct i965_exastate_buffer *
diff --git a/src/intel_batchbuffer.c b/src/intel_batchbuffer.c
index f908497..c3b5d8e 100644
--- a/src/intel_batchbuffer.c
+++ b/src/intel_batchbuffer.c
@@ -206,6 +206,9 @@ intelddx_batchbuffer_flush(struct intelddx_batchbuffer *batch)
       return;
 
    if (IS_I965G(pI830))
+       i965_exastate_flush(pI830->exa965);
+
+   if (IS_I965G(pI830))
 	flags = 0;
    /* Add the MI_BATCH_BUFFER_END.  Always add an MI_FLUSH - this is a
     * performance drain that we would like to avoid.
diff --git a/src/intel_bufmgr_ttm.c b/src/intel_bufmgr_ttm.c
index ceb0b92..4884296 100644
--- a/src/intel_bufmgr_ttm.c
+++ b/src/intel_bufmgr_ttm.c
@@ -61,6 +61,7 @@ struct intel_bo_node
     drmMMListHead head;
     drmBO *buf;
     struct drm_i915_op_arg bo_arg;
+    int index;
     unsigned long arg0;
     unsigned long arg1;
     void (*destroy)(void *);
@@ -162,15 +163,31 @@ intel_setup_validate_list(int fd, struct intel_bo_list *list, struct intel_bo_li
     struct drm_bo_op_req *req;
     uint64_t *prevNext = NULL;
     uint32_t count = 0;
-
+    int pass_num = 0;
+    uint32_t reloc_handle;
     first = NULL;
-
+    
+ repass:
     for (l = list->list.next; l != &list->list; l = l->next) {
         node = DRMLISTENTRY(struct intel_bo_node, l, head);
 
         arg = &node->bo_arg;
         req = &arg->d.req;
 
+	reloc_handle = 0;
+	for (rl = reloc_list->list.next; rl != &reloc_list->list; rl = rl->next) {
+	    rl_node = DRMLISTENTRY(struct intel_bo_reloc_node, rl, head);
+
+	    if (rl_node->handle == node->buf->handle) {
+		reloc_handle = rl_node->type_list.buf.handle;
+	    }
+	}
+
+	if (reloc_handle && pass_num == 0)
+	    continue;
+	if (reloc_handle == 0 && pass_num == 1)
+	    continue;
+
         if (!first)
             first = arg;
 
@@ -178,25 +195,24 @@ intel_setup_validate_list(int fd, struct intel_bo_list *list, struct intel_bo_li
 	    *prevNext = (unsigned long) arg;
 
 	memset(arg, 0, sizeof(*arg));
+	arg->reloc_handle = reloc_handle;
 	prevNext = &arg->next;
 	req->bo_req.handle = node->buf->handle;
 	req->op = drm_bo_validate;
 	req->bo_req.flags = node->arg0;
 	req->bo_req.hint = 0;
+	req->bo_req.index = node->index;
 	req->bo_req.mask = node->arg1;
 	req->bo_req.fence_class = 0; /* Backwards compat. */
-	arg->reloc_handle = 0;
 
-	for (rl = reloc_list->list.next; rl != &reloc_list->list; rl = rl->next) {
-	    rl_node = DRMLISTENTRY(struct intel_bo_reloc_node, rl, head);
-
-	    if (rl_node->handle == node->buf->handle) {
-		arg->reloc_handle = rl_node->type_list.buf.handle;
-	    }
-	}
 	count++;
     }
 
+    if (pass_num == 0){
+	pass_num = 1;
+	goto repass;
+    }
+	
     if (!first)
 	return 0;
 
@@ -268,6 +284,7 @@ static int intel_add_validate_buffer(struct intel_bo_list *list, ddx_bo *buf, un
 	cur->arg0 = flags;
 	cur->arg1 = mask;
 	cur->destroy = destroy_cb;
+	cur->index = count;
 	ret = 1;
 
 	DRMLISTADDTAIL(&cur->head, &list->list);
commit 01cb0f25fa9ff2214ee866c8a0dcbf8b70f43181
Author: Dave Airlie <airlied at linux.ie>
Date:   Thu Dec 13 15:52:42 2007 +1000

    i965/exa: fixup buffer allocation flags to use cached drm
    
    With this change performance improves from ~ 700 glyphs/sec.
    to ~ 4000 glyphs/sec. for "x11perf -aa10text" on my system.

diff --git a/src/i965_render.c b/src/i965_render.c
index 1622efd..fa9c88e 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -873,7 +873,7 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
     if (state->buf == NULL) {
 	state->buf = ddx_bo_alloc(pI830->bufmgr, "exa state buffer",
 				  EXASTATE_SZ, 4096,
-				  DRM_BO_FLAG_MEM_TT);
+				  DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_CACHED | DRM_BO_FLAG_CACHED_MAPPED);
 	ddx_bo_map(state->buf, TRUE);
 	state->map = state->buf->virtual;
 	gen4_state_init ((void *) state->map);
@@ -889,7 +889,7 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
     if (state->surface_buf == NULL) {
 	state->surface_buf = ddx_bo_alloc(pI830->bufmgr, "exa surface state buffer",
 					  sizeof (gen4_surface_state_t), 4096,
-					  DRM_BO_FLAG_MEM_TT);
+					  DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_CACHED | DRM_BO_FLAG_CACHED_MAPPED);
 	ddx_bo_map(state->surface_buf, TRUE);
 	state->num_ops = 0;
 
@@ -1017,7 +1017,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     if (pI830->use_ttm_batch) {
     	intelddx_batchbuffer_emit_pixmap(pDst,
 				     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE,
-				     DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE | DRM_BO_FLAG_CACHED,
+				     DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE,
 				     pI830->exa965->surface_buf, dest_surf_offset + 4, 0);
     } else {
         dest_surf_state->ss1.base_addr = intel_get_pixmap_offset(pDst);
@@ -1036,7 +1036,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     if (pI830->use_ttm_batch) {
         intelddx_batchbuffer_emit_pixmap(pSrc,
 				 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-				 DRM_BO_MASK_MEM | DRM_BO_FLAG_READ | DRM_BO_FLAG_CACHED,
+				 DRM_BO_MASK_MEM | DRM_BO_FLAG_READ,
 				 pI830->exa965->surface_buf, src_surf_offset + 4, 0);
     } else {
         src_surf_state->ss1.base_addr = intel_get_pixmap_offset(pSrc);
@@ -1054,7 +1054,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
         if (pI830->use_ttm_batch) {
 	   intelddx_batchbuffer_emit_pixmap(pMask, 
 				     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
-				     DRM_BO_MASK_MEM | DRM_BO_FLAG_READ | DRM_BO_FLAG_CACHED,
+				     DRM_BO_MASK_MEM | DRM_BO_FLAG_READ,
 				     pI830->exa965->surface_buf, mask_surf_offset + 4, 0);
         } else {
 	    mask_surf_state->ss1.base_addr = intel_get_pixmap_offset(pMask);
commit e94bd85912dfce251ef892eb4ed90218feb47141
Merge: 1feca9a... 8b598e9...
Author: Carl Worth <cworth at cworth.org>
Date:   Tue Dec 18 17:09:37 2007 -0800

    Merge commit 'origin/intel-batchbuffer'

commit 1feca9a17dd4372ba02696544537216af13a8775
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 12 09:20:23 2007 -0800

    Introduce conditionals for batch flush and new allocation of surface state.
    
    The conditional doesn't change anything yet, since it still flushes
    and allocates anew on each operation. But it should be a tiny change
    from here to do better than that.

diff --git a/src/i965_render.c b/src/i965_render.c
index 5d23d1d..1622efd 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -881,19 +881,21 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
     }
 
     /* Then the surface state buffer */
-    if (state->surface_buf != NULL) {
+    if (state->surface_buf != NULL && state->num_ops) {
 	ddx_bo_unreference(state->surface_buf);
 	state->surface_buf = NULL;
     }
 
-    state->surface_buf = ddx_bo_alloc(pI830->bufmgr, "exa surface state buffer",
-				      sizeof (gen4_surface_state_t), 4096,
-				      DRM_BO_FLAG_MEM_TT);
-    ddx_bo_map(state->surface_buf, TRUE);
-    state->num_ops = 0;
+    if (state->surface_buf == NULL) {
+	state->surface_buf = ddx_bo_alloc(pI830->bufmgr, "exa surface state buffer",
+					  sizeof (gen4_surface_state_t), 4096,
+					  DRM_BO_FLAG_MEM_TT);
+	ddx_bo_map(state->surface_buf, TRUE);
+	state->num_ops = 0;
 
-    state->surface_map = state->surface_buf->virtual;
-    gen4_surface_state_init (state->surface_map, state);
+	state->surface_map = state->surface_buf->virtual;
+	gen4_surface_state_init (state->surface_map, state);
+    }
 }
 
 static sampler_state_filter_t
@@ -1436,7 +1438,7 @@ void i965_done_composite(PixmapPtr pDst)
 	ADVANCE_BATCH();
     }
 
-    if (pI830->use_ttm_batch) {
+    if (pI830->use_ttm_batch && pI830->exa965->num_ops) {
 	ddx_bo_unmap(pI830->exa965->surface_buf);
 	intelddx_batchbuffer_flush(pI830->batch);
     } else {
@@ -1451,6 +1453,7 @@ i965_exastate_alloc(ScrnInfoPtr pScrn)
     struct i965_exastate_buffer *state = calloc(sizeof(*state), 1);
 
     state->pScrn = pScrn;
+    state->num_ops = 0;
     i965_exastate_reset(state);
     return state;
 
commit 147c09f123c45300e5f94bf4310d25331b6f1e95
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 12 09:07:05 2007 -0800

    Fix alignment for binding table and vertex buffer
    
    The alignment here will really matter as soon as we batch
    multiple operations together.

diff --git a/src/i965_render.c b/src/i965_render.c
index 4b24984..5d23d1d 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -444,9 +444,9 @@ char gen4_state_too_big[(EXASTATE_SZ >=
 #define GEN4_SURFACE_STATE_PER_OP	3
 #define GEN4_MAX_SURFACE_STATES		(GEN4_MAX_OPS * GEN4_SURFACE_STATE_PER_OP)
 /* We only need 3, but we use 8 to get the proper alignment. */
-#define GEN4_BINDING_TABLE_PER_OP	3
+#define GEN4_BINDING_TABLE_PER_OP	8
 #define GEN4_MAX_BINDING_TABLE		(GEN4_MAX_OPS * GEN4_BINDING_TABLE_PER_OP)
-#define GEN4_VERTICES_PER_OP		18
+#define GEN4_VERTICES_PER_OP		24
 #define GEN4_MAX_VERTICES		(GEN4_MAX_OPS * GEN4_VERTICES_PER_OP)
 
 typedef struct _brw_surface_state_padded {
commit 053cee579ba9f0781f5a506f249d217afac00b82
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 12 09:01:20 2007 -0800

    Make gen4_surface_state_t really big
    
    Still just preparatory---for now we're still doing just one
    operation at a time.

diff --git a/src/i965_render.c b/src/i965_render.c
index 19f6a49..4b24984 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -440,7 +440,7 @@ char gen4_state_too_big[(EXASTATE_SZ >=
 			 sizeof(gen4_state_t)) ? 1 : -1];
 
 /* How many composite operations will we fit in one object. */
-#define GEN4_MAX_OPS			16
+#define GEN4_MAX_OPS			1024
 #define GEN4_SURFACE_STATE_PER_OP	3
 #define GEN4_MAX_SURFACE_STATES		(GEN4_MAX_OPS * GEN4_SURFACE_STATE_PER_OP)
 /* We only need 3, but we use 8 to get the proper alignment. */
@@ -462,9 +462,6 @@ typedef struct _gen4_surface_state {
     float vb[GEN4_MAX_VERTICES];
 } gen4_surface_state_t;
 
-char gen4_surface_state_too_big[(EXASTATE_SZ >=
-				 sizeof(gen4_surface_state_t)) ? 1 : -1];
-
 static CARD32 
 i965_get_card_format(PicturePtr pPict)
 {
@@ -890,7 +887,7 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
     }
 
     state->surface_buf = ddx_bo_alloc(pI830->bufmgr, "exa surface state buffer",
-				      EXASTATE_SZ, 4096,
+				      sizeof (gen4_surface_state_t), 4096,
 				      DRM_BO_FLAG_MEM_TT);
     ddx_bo_map(state->surface_buf, TRUE);
     state->num_ops = 0;
commit 39f43da7b7d1b2f6b3daace4e3b5abc2440914a6
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 12 08:08:55 2007 -0800

    Move vertex and binding table offsets out of i965_init_state_offsets

diff --git a/src/i965_render.c b/src/i965_render.c
index e6e4572..19f6a49 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -440,10 +440,14 @@ char gen4_state_too_big[(EXASTATE_SZ >=
 			 sizeof(gen4_state_t)) ? 1 : -1];
 
 /* How many composite operations will we fit in one object. */
-#define GEN4_COMPOSITE_BATCH	16
-#define GEN4_MAX_SURFACE_STATES	(GEN4_COMPOSITE_BATCH * 3)
-#define GEN4_MAX_BINDING_TABLE	(GEN4_COMPOSITE_BATCH * 3)
-#define GEN4_MAX_VERTICES	(GEN4_COMPOSITE_BATCH * 18)
+#define GEN4_MAX_OPS			16
+#define GEN4_SURFACE_STATE_PER_OP	3
+#define GEN4_MAX_SURFACE_STATES		(GEN4_MAX_OPS * GEN4_SURFACE_STATE_PER_OP)
+/* We only need 3, but we use 8 to get the proper alignment. */
+#define GEN4_BINDING_TABLE_PER_OP	3
+#define GEN4_MAX_BINDING_TABLE		(GEN4_MAX_OPS * GEN4_BINDING_TABLE_PER_OP)
+#define GEN4_VERTICES_PER_OP		18
+#define GEN4_MAX_VERTICES		(GEN4_MAX_OPS * GEN4_VERTICES_PER_OP)
 
 typedef struct _brw_surface_state_padded {
     struct brw_surface_state state;
@@ -501,10 +505,6 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
 
     init = 1;
 
-    binding_table_offset = offsetof (gen4_surface_state_t, binding_table);
-
-    vb_offset = offsetof (gen4_surface_state_t, vb);
-
     /* Set up a default static partitioning of the URB, which is supposed to
      * allow anything we would want to do, at potentially lower performance.
      */
@@ -802,10 +802,17 @@ gen4_surface_state_init (unsigned char *start_base,
     unsigned int surf_state_offset = offsetof (gen4_surface_state_t,
 					       surface_state);
 
+    vb_offset = (offsetof (gen4_surface_state_t, vb) +
+		 sizeof (float) * GEN4_VERTICES_PER_OP * state->num_ops);
+
+    binding_table_offset = (offsetof (gen4_surface_state_t, binding_table) +
+			    sizeof (CARD32) * GEN4_BINDING_TABLE_PER_OP *
+			    state->num_ops);
+
     /* destination surface state */
     dest_surf_offset = (surf_state_offset +
 			sizeof (brw_surface_state_padded) *
-			state->num_surface_states++);
+			(GEN4_SURFACE_STATE_PER_OP * state->num_ops + 0));
     dest_surf_state = (void *)(start_base + dest_surf_offset);
     dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
     dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
@@ -824,7 +831,7 @@ gen4_surface_state_init (unsigned char *start_base,
     /* source surface state */
     src_surf_offset = (surf_state_offset +
 		       sizeof (brw_surface_state_padded) *
-		       state->num_surface_states++);
+		       (GEN4_SURFACE_STATE_PER_OP * state->num_ops + 1));
     src_surf_state = (void *)(start_base + src_surf_offset);
     src_surf_state->ss0.surface_type = BRW_SURFACE_2D;
     src_surf_state->ss0.writedisable_alpha = 0;
@@ -842,7 +849,7 @@ gen4_surface_state_init (unsigned char *start_base,
     /* mask surface state */
     mask_surf_offset = (surf_state_offset +
 			sizeof (brw_surface_state_padded) *
-			state->num_surface_states++);
+			(GEN4_SURFACE_STATE_PER_OP * state->num_ops + 2));
     mask_surf_state = (void *)(start_base + mask_surf_offset);
     mask_surf_state->ss0.surface_type = BRW_SURFACE_2D;
     mask_surf_state->ss0.writedisable_alpha = 0;
@@ -856,6 +863,8 @@ gen4_surface_state_init (unsigned char *start_base,
     mask_surf_state->ss0.render_cache_read_mode = 0;
     mask_surf_state->ss2.mip_count = 0;
     mask_surf_state->ss2.render_target_rotation = 0;
+
+    state->num_ops++;
 }
 
 static void
@@ -884,7 +893,7 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
 				      EXASTATE_SZ, 4096,
 				      DRM_BO_FLAG_MEM_TT);
     ddx_bo_map(state->surface_buf, TRUE);
-    state->num_surface_states = 0;
+    state->num_ops = 0;
 
     state->surface_map = state->surface_buf->virtual;
     gen4_surface_state_init (state->surface_map, state);
@@ -1058,7 +1067,8 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	mask_surf_state->ss3.tiled_surface = mask_tiled;
     }
 
-    binding_table = (void *)(surface_start_base + binding_table_offset);
+    binding_table = (void *)(surface_start_base +
+			     binding_table_offset);
     /* Set up a binding table for our surfaces.  Only the PS will use it */
     binding_table[0] = dest_surf_offset;
     binding_table[1] = src_surf_offset;
diff --git a/src/intel_batchbuffer.h b/src/intel_batchbuffer.h
index 4320e64..2244414 100644
--- a/src/intel_batchbuffer.h
+++ b/src/intel_batchbuffer.h
@@ -29,7 +29,7 @@ struct i965_exastate_buffer {
 
    ddx_bo *surface_buf;
    unsigned char *surface_map;
-   int num_surface_states;
+   int num_ops;
 
    dri_fence *last_fence;
    ScrnInfoPtr pScrn;
commit c3e337e38ab729bad55b4d8a5798ef2633b35e8e
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 12 06:44:45 2007 -0800

    Expand out gen4_surface_state_t in preparation for batching

diff --git a/src/i965_render.c b/src/i965_render.c
index 9df9655..e6e4572 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -439,8 +439,11 @@ typedef struct _gen4_state {
 char gen4_state_too_big[(EXASTATE_SZ >=
 			 sizeof(gen4_state_t)) ? 1 : -1];
 
-#define GEN4_VB_NUM_VERTICES	32
-#define GEN4_NUM_SURFACE_STATES	3
+/* How many composite operations will we fit in one object. */
+#define GEN4_COMPOSITE_BATCH	16
+#define GEN4_MAX_SURFACE_STATES	(GEN4_COMPOSITE_BATCH * 3)
+#define GEN4_MAX_BINDING_TABLE	(GEN4_COMPOSITE_BATCH * 3)
+#define GEN4_MAX_VERTICES	(GEN4_COMPOSITE_BATCH * 18)
 
 typedef struct _brw_surface_state_padded {
     struct brw_surface_state state;
@@ -448,11 +451,11 @@ typedef struct _brw_surface_state_padded {
 } brw_surface_state_padded;
 
 typedef struct _gen4_surface_state {
-    brw_surface_state_padded surface_state[GEN4_NUM_SURFACE_STATES];
+    brw_surface_state_padded surface_state[GEN4_MAX_SURFACE_STATES];
 
-    CARD32 binding_table[16];
+    CARD32 binding_table[GEN4_MAX_BINDING_TABLE];
 
-    float vb[GEN4_VB_NUM_VERTICES];
+    float vb[GEN4_MAX_VERTICES];
 } gen4_surface_state_t;
 
 char gen4_surface_state_too_big[(EXASTATE_SZ >=
@@ -1256,7 +1259,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	    OUT_BATCH(pI830->exa_965_state->offset + vb_offset);
 	}
 
-        OUT_BATCH(GEN4_VB_NUM_VERTICES); // set max index
+        OUT_BATCH(GEN4_MAX_VERTICES); // set max index
    	OUT_BATCH(0); // ignore for VERTEXDATA, but still there
 
 	/* Set up our vertex elements, sourced from the single vertex buffer.
@@ -1343,7 +1346,7 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     /* Wait for any existing composite rectangles to land before we overwrite
      * the VB with the next one.
      */
-    if ((vb_index + 18) > GEN4_VB_NUM_VERTICES) {
+    if ((vb_index + 18) > GEN4_MAX_VERTICES) {
       ErrorF("vb index exceeded maximum bailing...");
       return;
     }
commit d3cea5d01db68c2a0d60c7444b736d9eec7fbcf3
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 12 06:17:30 2007 -0800

    Start tracking how many surface state objects we use.

diff --git a/src/i965_render.c b/src/i965_render.c
index 1f80f5e..9df9655 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -801,7 +801,8 @@ gen4_surface_state_init (unsigned char *start_base,
 
     /* destination surface state */
     dest_surf_offset = (surf_state_offset +
-			sizeof (brw_surface_state_padded) * 0);
+			sizeof (brw_surface_state_padded) *
+			state->num_surface_states++);
     dest_surf_state = (void *)(start_base + dest_surf_offset);
     dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
     dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
@@ -819,7 +820,8 @@ gen4_surface_state_init (unsigned char *start_base,
 
     /* source surface state */
     src_surf_offset = (surf_state_offset +
-		       sizeof (brw_surface_state_padded) * 1);
+		       sizeof (brw_surface_state_padded) *
+		       state->num_surface_states++);
     src_surf_state = (void *)(start_base + src_surf_offset);
     src_surf_state->ss0.surface_type = BRW_SURFACE_2D;
     src_surf_state->ss0.writedisable_alpha = 0;
@@ -836,7 +838,8 @@ gen4_surface_state_init (unsigned char *start_base,
 
     /* mask surface state */
     mask_surf_offset = (surf_state_offset +
-			sizeof (brw_surface_state_padded) * 2);
+			sizeof (brw_surface_state_padded) *
+			state->num_surface_states++);
     mask_surf_state = (void *)(start_base + mask_surf_offset);
     mask_surf_state->ss0.surface_type = BRW_SURFACE_2D;
     mask_surf_state->ss0.writedisable_alpha = 0;
@@ -878,6 +881,7 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
 				      EXASTATE_SZ, 4096,
 				      DRM_BO_FLAG_MEM_TT);
     ddx_bo_map(state->surface_buf, TRUE);
+    state->num_surface_states = 0;
 
     state->surface_map = state->surface_buf->virtual;
     gen4_surface_state_init (state->surface_map, state);
diff --git a/src/intel_batchbuffer.h b/src/intel_batchbuffer.h
index 7ccc73a..4320e64 100644
--- a/src/intel_batchbuffer.h
+++ b/src/intel_batchbuffer.h
@@ -29,6 +29,7 @@ struct i965_exastate_buffer {
 
    ddx_bo *surface_buf;
    unsigned char *surface_map;
+   int num_surface_states;
 
    dri_fence *last_fence;
    ScrnInfoPtr pScrn;
commit fae3e58f18d1f1e6e2bdb29546c1df2adea38a1f
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 12 05:50:24 2007 -0800

    Pass the i965_exastate_buffer to gen4_surface_state_init.
    
    We'll eventually want to use that to track how many surface
    state entries and vertices we've used, etc.

diff --git a/src/i965_render.c b/src/i965_render.c
index 7d49f8e..1f80f5e 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -792,7 +792,8 @@ gen4_state_init (gen4_state_t *state)
 }
 
 static void
-gen4_surface_state_init (ScrnInfoPtr pScrn, unsigned char *start_base)
+gen4_surface_state_init (unsigned char *start_base,
+			 struct i965_exastate_buffer *state)
 {
     struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
     unsigned int surf_state_offset = offsetof (gen4_surface_state_t,
@@ -879,7 +880,7 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
     ddx_bo_map(state->surface_buf, TRUE);
 
     state->surface_map = state->surface_buf->virtual;
-    gen4_surface_state_init (state->pScrn, state->surface_map);
+    gen4_surface_state_init (state->surface_map, state);
 }
 
 static sampler_state_filter_t
commit 463ca200958afb8b1f9b14faffa9108acdf01aa8
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 12 05:41:57 2007 -0800

    Remove gen4_surface_state_init from non-ttm_batch case
    
    This was certainly already broken before now. It's probably
    time to stop pretending that the non-ttm_batch cases are being
    maintained on this branch.

diff --git a/src/i965_render.c b/src/i965_render.c
index 287a477..7d49f8e 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -1455,7 +1455,6 @@ i965_init_exa_state(ScrnInfoPtr pScrn)
     } else {
 	void *map = pI830->FbBase + pI830->exa_965_state->offset;
 	gen4_state_init ((void *) map);
-	gen4_surface_state_init (pScrn, map);
     }
 
     return 0;
commit 06f1a9654cc3ad54105bb344a1503ba41e02c127
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Dec 12 05:39:34 2007 -0800

    Move surface-state offset calculation out of i965_init_state_offsets
    
    These aren't dynamic quite yet, but they will be soon.

diff --git a/src/i965_render.c b/src/i965_render.c
index 1987017..287a477 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -498,13 +498,6 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
 
     init = 1;
 
-    /* And then the general state: */
-    dest_surf_offset = offsetof (gen4_surface_state_t, surface_state[0]);
-
-    src_surf_offset = offsetof (gen4_surface_state_t, surface_state[1]);
-
-    mask_surf_offset = offsetof (gen4_surface_state_t, surface_state[2]);
-
     binding_table_offset = offsetof (gen4_surface_state_t, binding_table);
 
     vb_offset = offsetof (gen4_surface_state_t, vb);
@@ -802,8 +795,12 @@ static void
 gen4_surface_state_init (ScrnInfoPtr pScrn, unsigned char *start_base)
 {
     struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
+    unsigned int surf_state_offset = offsetof (gen4_surface_state_t,
+					       surface_state);
 
     /* destination surface state */
+    dest_surf_offset = (surf_state_offset +
+			sizeof (brw_surface_state_padded) * 0);
     dest_surf_state = (void *)(start_base + dest_surf_offset);
     dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
     dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
@@ -820,6 +817,8 @@ gen4_surface_state_init (ScrnInfoPtr pScrn, unsigned char *start_base)
     dest_surf_state->ss2.render_target_rotation = 0;
 
     /* source surface state */
+    src_surf_offset = (surf_state_offset +
+		       sizeof (brw_surface_state_padded) * 1);
     src_surf_state = (void *)(start_base + src_surf_offset);
     src_surf_state->ss0.surface_type = BRW_SURFACE_2D;
     src_surf_state->ss0.writedisable_alpha = 0;
@@ -835,6 +834,8 @@ gen4_surface_state_init (ScrnInfoPtr pScrn, unsigned char *start_base)
     src_surf_state->ss2.render_target_rotation = 0;
 
     /* mask surface state */
+    mask_surf_offset = (surf_state_offset +
+			sizeof (brw_surface_state_padded) * 2);
     mask_surf_state = (void *)(start_base + mask_surf_offset);
     mask_surf_state->ss0.surface_type = BRW_SURFACE_2D;
     mask_surf_state->ss0.writedisable_alpha = 0;
commit 90987e0c5237f311027949b9c82b375d5ed3fc7f
Author: Carl Worth <cworth at cworth.org>
Date:   Fri Dec 7 17:10:11 2007 -0800

    Fix checks that gen4_surface_state_t and gen4_state_t will fit in their respective allocations

diff --git a/src/i965_render.c b/src/i965_render.c
index 203662d..1987017 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -436,6 +436,9 @@ typedef struct _gen4_state {
 					    [SAMPLER_STATE_EXTEND_COUNT];
 } gen4_state_t;
 
+char gen4_state_too_big[(EXASTATE_SZ >=
+			 sizeof(gen4_state_t)) ? 1 : -1];
+
 #define GEN4_VB_NUM_VERTICES	32
 #define GEN4_NUM_SURFACE_STATES	3
 
@@ -452,9 +455,8 @@ typedef struct _gen4_surface_state {
     float vb[GEN4_VB_NUM_VERTICES];
 } gen4_surface_state_t;
 
-char gen4_state_big_enough[(EXA_LINEAR_EXTRA >=
-			    (sizeof(gen4_state_t) +
-			     sizeof(gen4_surface_state_t))) ? 1 : -1];
+char gen4_surface_state_too_big[(EXASTATE_SZ >=
+				 sizeof(gen4_surface_state_t)) ? 1 : -1];
 
 static CARD32 
 i965_get_card_format(PicturePtr pPict)
commit 4dd36b9d6ad7aec6b2dab02324ad247c484ce47e
Author: Carl Worth <cworth at cworth.org>
Date:   Fri Dec 7 17:08:39 2007 -0800

    Switch to an array of surface state objects
    
    This is in preparation for reusing the surface_state buffer object over
    multiple composite operations.

diff --git a/src/i965_render.c b/src/i965_render.c
index a4552f8..203662d 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -436,15 +436,16 @@ typedef struct _gen4_state {
 					    [SAMPLER_STATE_EXTEND_COUNT];
 } gen4_state_t;
 
-#define GEN4_VB_NUM_VERTICES   32
+#define GEN4_VB_NUM_VERTICES	32
+#define GEN4_NUM_SURFACE_STATES	3
+
+typedef struct _brw_surface_state_padded {
+    struct brw_surface_state state;
+    char pad[32 - sizeof (struct brw_surface_state)];
+} brw_surface_state_padded;
 
 typedef struct _gen4_surface_state {
-    struct brw_surface_state dest_surf_state;
-    PAD64 (brw_surface_state, 0);
-    struct brw_surface_state src_surf_state;
-    PAD64 (brw_surface_state, 1);
-    struct brw_surface_state mask_surf_state;
-    PAD64 (brw_surface_state, 2);
+    brw_surface_state_padded surface_state[GEN4_NUM_SURFACE_STATES];
 
     CARD32 binding_table[16];
 
@@ -496,11 +497,11 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
     init = 1;
 
     /* And then the general state: */
-    dest_surf_offset = offsetof (gen4_surface_state_t, dest_surf_state);
+    dest_surf_offset = offsetof (gen4_surface_state_t, surface_state[0]);
 
-    src_surf_offset = offsetof (gen4_surface_state_t, src_surf_state);
+    src_surf_offset = offsetof (gen4_surface_state_t, surface_state[1]);
 
-    mask_surf_offset = offsetof (gen4_surface_state_t, mask_surf_state);
+    mask_surf_offset = offsetof (gen4_surface_state_t, surface_state[2]);
 
     binding_table_offset = offsetof (gen4_surface_state_t, binding_table);
 
commit 4af634e1e2ff0cc0cfed9234f2d4d63e935986f1
Author: Carl Worth <cworth at cworth.org>
Date:   Fri Dec 7 17:00:40 2007 -0800

    Add copyright information for edits by Carl Worth for Red Hat

diff --git a/src/i965_render.c b/src/i965_render.c
index 4d8a245..a4552f8 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2006 Intel Corporation
+ * Copyright © 2007 Red Hat, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -23,6 +24,7 @@
  * Authors:
  *    Wang Zhenyu <zhenyu.z.wang at intel.com>
  *    Eric Anholt <eric at anholt.net>
+ *    Carl Worth  <cworth at redhat.com>
  *
  */
 
commit ab36b02b1b1016e4010ed147087fc405b2b3525d
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Nov 28 13:54:09 2007 -0800

    Initialize gen4_state_t structure only once.
    
    With previous changes, this state object is entirely invariant
    from one operation to the next, so we don't need to continually
    allocate and initialize it.

diff --git a/src/i965_render.c b/src/i965_render.c
index 09ce1ce..4d8a245 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -851,19 +851,16 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
     I830Ptr pI830 = I830PTR(state->pScrn);
 
     /* First the general state buffer. */
-    if (state->buf != NULL) {
-	ddx_bo_unreference(state->buf);
-	state->buf = NULL;
+    if (state->buf == NULL) {
+	state->buf = ddx_bo_alloc(pI830->bufmgr, "exa state buffer",
+				  EXASTATE_SZ, 4096,
+				  DRM_BO_FLAG_MEM_TT);
+	ddx_bo_map(state->buf, TRUE);
+	state->map = state->buf->virtual;
+	gen4_state_init ((void *) state->map);
+	ddx_bo_unmap(state->buf);
     }
 
-    state->buf = ddx_bo_alloc(pI830->bufmgr, "exa state buffer",
-			      EXASTATE_SZ, 4096,
-			      DRM_BO_FLAG_MEM_TT);
-    ddx_bo_map(state->buf, TRUE);
-
-    state->map = state->buf->virtual;
-    gen4_state_init ((void *) state->map);
-
     /* Then the surface state buffer */
     if (state->surface_buf != NULL) {
 	ddx_bo_unreference(state->surface_buf);
@@ -1419,7 +1416,6 @@ void i965_done_composite(PixmapPtr pDst)
     }
 
     if (pI830->use_ttm_batch) {
-	ddx_bo_unmap(pI830->exa965->buf);
 	ddx_bo_unmap(pI830->exa965->surface_buf);
 	intelddx_batchbuffer_flush(pI830->batch);
     } else {
commit 0b6a4e7866b09c9aad5c402ef6064677f33b90c5
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Nov 28 13:39:46 2007 -0800

    Reorganize i965_exastate_reset
    
    No functional change here---just separating the logically independent
    parts so that future changes will be a bit more clear.

diff --git a/src/i965_render.c b/src/i965_render.c
index 4135924..09ce1ce 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -850,28 +850,31 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
 {
     I830Ptr pI830 = I830PTR(state->pScrn);
 
+    /* First the general state buffer. */
     if (state->buf != NULL) {
 	ddx_bo_unreference(state->buf);
 	state->buf = NULL;
     }
-    if (state->surface_buf != NULL) {
-	ddx_bo_unreference(state->surface_buf);
-	state->surface_buf = NULL;
-    }
 
     state->buf = ddx_bo_alloc(pI830->bufmgr, "exa state buffer",
 			      EXASTATE_SZ, 4096,
 			      DRM_BO_FLAG_MEM_TT);
     ddx_bo_map(state->buf, TRUE);
 
+    state->map = state->buf->virtual;
+    gen4_state_init ((void *) state->map);
+
+    /* Then the surface state buffer */
+    if (state->surface_buf != NULL) {
+	ddx_bo_unreference(state->surface_buf);
+	state->surface_buf = NULL;
+    }
+
     state->surface_buf = ddx_bo_alloc(pI830->bufmgr, "exa surface state buffer",
 				      EXASTATE_SZ, 4096,
 				      DRM_BO_FLAG_MEM_TT);
     ddx_bo_map(state->surface_buf, TRUE);
 
-    state->map = state->buf->virtual;
-    gen4_state_init ((void *) state->map);
-
     state->surface_map = state->surface_buf->virtual;
     gen4_surface_state_init (state->pScrn, state->surface_map);
 }
commit b61a5cac570e7fe6823ef1d4cd419a9141284c41
Author: Carl Worth <cworth at cworth.org>
Date:   Wed Nov 28 12:55:47 2007 -0800

    Allocate separate buffer objects for general and 'surface' state
    
    Take advantage of the two separate base address registers for
    these two buffer objects. The state they reference has very
    different lifetime requirements.

diff --git a/src/i965_render.c b/src/i965_render.c
index 86bce79..4135924 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -262,7 +262,6 @@ static CARD32 *binding_table;
 
 /* these offsets will remain the same for all buffers post allocation */
 static int dest_surf_offset, src_surf_offset, mask_surf_offset;
-static int surface_state_offset;
 static int vb_offset;
 static int binding_table_offset;
 static float *vb;
@@ -494,23 +493,16 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
 
     init = 1;
 
-    surface_state_offset = ALIGN(sizeof(gen4_state_t), 32);
-
     /* And then the general state: */
-    dest_surf_offset = surface_state_offset +
-	offsetof (gen4_surface_state_t, dest_surf_state);
+    dest_surf_offset = offsetof (gen4_surface_state_t, dest_surf_state);
 
-    src_surf_offset = surface_state_offset +
-	offsetof (gen4_surface_state_t, src_surf_state);
+    src_surf_offset = offsetof (gen4_surface_state_t, src_surf_state);
 
-    mask_surf_offset = surface_state_offset +
-	offsetof (gen4_surface_state_t, mask_surf_state);
+    mask_surf_offset = offsetof (gen4_surface_state_t, mask_surf_state);
 
-    binding_table_offset = surface_state_offset +
-	offsetof (gen4_surface_state_t, binding_table);
+    binding_table_offset = offsetof (gen4_surface_state_t, binding_table);
 
-    vb_offset = surface_state_offset +
-	offsetof (gen4_surface_state_t, vb);
+    vb_offset = offsetof (gen4_surface_state_t, vb);
 
     /* Set up a default static partitioning of the URB, which is supposed to
      * allow anything we would want to do, at potentially lower performance.
@@ -862,15 +854,26 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
 	ddx_bo_unreference(state->buf);
 	state->buf = NULL;
     }
+    if (state->surface_buf != NULL) {
+	ddx_bo_unreference(state->surface_buf);
+	state->surface_buf = NULL;
+    }
 
     state->buf = ddx_bo_alloc(pI830->bufmgr, "exa state buffer",
 			      EXASTATE_SZ, 4096,
 			      DRM_BO_FLAG_MEM_TT);
     ddx_bo_map(state->buf, TRUE);
 
+    state->surface_buf = ddx_bo_alloc(pI830->bufmgr, "exa surface state buffer",
+				      EXASTATE_SZ, 4096,
+				      DRM_BO_FLAG_MEM_TT);
+    ddx_bo_map(state->surface_buf, TRUE);
+
     state->map = state->buf->virtual;
     gen4_state_init ((void *) state->map);
-    gen4_surface_state_init (state->pScrn, state->map);
+
+    state->surface_map = state->surface_buf->virtual;
+    gen4_surface_state_init (state->pScrn, state->surface_map);
 }
 
 static sampler_state_filter_t
@@ -911,17 +914,19 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     char *start_base;
     void *map;
     gen4_state_t *gen4_state;
+    char *surface_start_base;
+    void *surface_map;
     sampler_state_filter_t src_filter, mask_filter;
     sampler_state_extend_t src_extend, mask_extend;
 
     if (pI830->use_ttm_batch) {
 	i965_exastate_reset(pI830->exa965);
-	map = pI830->exa965->map;
+	surface_map = pI830->exa965->surface_map;
     }else{
-	map = pI830->exa_965_state->offset + pI830->FbBase;
+	surface_map = pI830->exa_965_state->offset + pI830->FbBase;
     }
 
-    start_base = map;
+    surface_start_base = surface_map;
 
     IntelEmitInvarientState(pScrn);
     *pI830->last_3d = LAST_3D_RENDER;
@@ -976,14 +981,14 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     /* Because we only have a single static buffer for our state currently,
      * we have to sync before updating it every time.
      */
-    vb = (void *)(start_base + vb_offset);
+    vb = (void *)(surface_start_base + vb_offset);
     vb_index = 0;
 
     i965_get_blend_cntl(op, pMaskPicture, pDstPicture->format,
 			&src_blend, &dst_blend);
 
     /* Set up the state buffer for the destination surface */
-    dest_surf_state = (void *)(start_base + dest_surf_offset);
+    dest_surf_state = (void *)(surface_start_base + dest_surf_offset);
     i965_get_dest_format(pDstPicture, &dst_format);
     dest_surf_state->ss0.surface_format = dst_format;
 
@@ -991,7 +996,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     	intelddx_batchbuffer_emit_pixmap(pDst,
 				     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE,
 				     DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE | DRM_BO_FLAG_CACHED,
-				     pI830->exa965->buf, dest_surf_offset + 4, 0);
+				     pI830->exa965->surface_buf, dest_surf_offset + 4, 0);
     } else {
         dest_surf_state->ss1.base_addr = intel_get_pixmap_offset(pDst);
     }
@@ -1003,14 +1008,14 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     dest_surf_state->ss3.tiled_surface = dst_tiled;
 
     /* Set up the source surface state buffer */
-    src_surf_state = (void *)(start_base + src_surf_offset);
+    src_surf_state = (void *)(surface_start_base + src_surf_offset);
     src_surf_state->ss0.surface_format = i965_get_card_format(pSrcPicture);
 
     if (pI830->use_ttm_batch) {
         intelddx_batchbuffer_emit_pixmap(pSrc,
 				 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
 				 DRM_BO_MASK_MEM | DRM_BO_FLAG_READ | DRM_BO_FLAG_CACHED,
-				 pI830->exa965->buf, src_surf_offset + 4, 0);
+				 pI830->exa965->surface_buf, src_surf_offset + 4, 0);
     } else {
         src_surf_state->ss1.base_addr = intel_get_pixmap_offset(pSrc);
     }
@@ -1022,13 +1027,13 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
     /* setup mask surface */
     if (pMask) {
-	mask_surf_state = (void *)(start_base + mask_surf_offset);
+	mask_surf_state = (void *)(surface_start_base + mask_surf_offset);
    	mask_surf_state->ss0.surface_format = i965_get_card_format(pMaskPicture);
         if (pI830->use_ttm_batch) {
 	   intelddx_batchbuffer_emit_pixmap(pMask, 
 				     DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ,
 				     DRM_BO_MASK_MEM | DRM_BO_FLAG_READ | DRM_BO_FLAG_CACHED,
-				     pI830->exa965->buf, mask_surf_offset + 4, 0);
+				     pI830->exa965->surface_buf, mask_surf_offset + 4, 0);
         } else {
 	    mask_surf_state->ss1.base_addr = intel_get_pixmap_offset(pMask);
 	}
@@ -1039,7 +1044,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	mask_surf_state->ss3.tiled_surface = mask_tiled;
     }
 
-    binding_table = (void *)(start_base + binding_table_offset);
+    binding_table = (void *)(surface_start_base + binding_table_offset);
     /* Set up a binding table for our surfaces.  Only the PS will use it */
     binding_table[0] = dest_surf_offset;
     binding_table[1] = src_surf_offset;
@@ -1142,7 +1147,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	if (pI830->use_ttm_batch) {
 	    OUT_RELOC(pI830->exa965->buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
 
-	    OUT_RELOC(pI830->exa965->buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
+	    OUT_RELOC(pI830->exa965->surface_buf, DRM_BO_FLAG_MEM_TT, BASE_ADDRESS_MODIFY);
 	} else {
 	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
 	    OUT_BATCH(pI830->exa_965_state->offset | BASE_ADDRESS_MODIFY);
@@ -1234,7 +1239,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	    	 ((4 * 2 * nelem) << VB0_BUFFER_PITCH_SHIFT));
 
 	if (pI830->use_ttm_batch) {
-	    OUT_RELOC(pI830->exa965->buf, DRM_BO_FLAG_MEM_TT, vb_offset);
+	    OUT_RELOC(pI830->exa965->surface_buf, DRM_BO_FLAG_MEM_TT, vb_offset);
 
 	} else {
 	    OUT_BATCH(pI830->exa_965_state->offset + vb_offset);
@@ -1412,6 +1417,7 @@ void i965_done_composite(PixmapPtr pDst)
 
     if (pI830->use_ttm_batch) {
 	ddx_bo_unmap(pI830->exa965->buf);
+	ddx_bo_unmap(pI830->exa965->surface_buf);
 	intelddx_batchbuffer_flush(pI830->batch);
     } else {
 	I830Sync(pScrn);
diff --git a/src/intel_batchbuffer.h b/src/intel_batchbuffer.h
index 9f1c1c5..7ccc73a 100644
--- a/src/intel_batchbuffer.h
+++ b/src/intel_batchbuffer.h
@@ -25,9 +25,13 @@ struct intelddx_batchbuffer
 
 struct i965_exastate_buffer {
    ddx_bo *buf;
+   unsigned char *map;
+
+   ddx_bo *surface_buf;
+   unsigned char *surface_map;
+
    dri_fence *last_fence;
    ScrnInfoPtr pScrn;
-   unsigned char *map;
 };
 
 struct intelddx_batchbuffer *intelddx_batchbuffer_alloc(ScrnInfoPtr pScrn);
commit 21d9ac50ef812dfbbc38af1890bb388de1e3acfa
Author: Carl Worth <cworth at cworth.org>
Date:   Thu Nov 15 10:35:02 2007 -0800

    Introduce a gen4_surface_state_t structure.
    
    This will help us remove a bunch of global offset variables, make
    the code more readable, and help in future optimization efforts.

diff --git a/src/i965_render.c b/src/i965_render.c
index e5d81be..86bce79 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -262,10 +262,11 @@ static CARD32 *binding_table;
 
 /* these offsets will remain the same for all buffers post allocation */
 static int dest_surf_offset, src_surf_offset, mask_surf_offset;
+static int surface_state_offset;
 static int vb_offset;
 static int binding_table_offset;
 static float *vb;
-static int vb_max_size, vb_index;
+static int vb_index;
 
 static CARD32 src_blend, dst_blend;
 
@@ -434,7 +435,24 @@ typedef struct _gen4_state {
 					    [SAMPLER_STATE_EXTEND_COUNT];
 } gen4_state_t;
 
-char gen4_state_big_enough[EXA_LINEAR_EXTRA >= sizeof(gen4_state_t) ? 1 : -1];
+#define GEN4_VB_NUM_VERTICES   32
+
+typedef struct _gen4_surface_state {
+    struct brw_surface_state dest_surf_state;
+    PAD64 (brw_surface_state, 0);
+    struct brw_surface_state src_surf_state;
+    PAD64 (brw_surface_state, 1);
+    struct brw_surface_state mask_surf_state;
+    PAD64 (brw_surface_state, 2);
+
+    CARD32 binding_table[16];
+
+    float vb[GEN4_VB_NUM_VERTICES];
+} gen4_surface_state_t;
+
+char gen4_state_big_enough[(EXA_LINEAR_EXTRA >=
+			    (sizeof(gen4_state_t) +
+			     sizeof(gen4_surface_state_t))) ? 1 : -1];
 
 static CARD32 
 i965_get_card_format(PicturePtr pPict)
@@ -469,36 +487,31 @@ i965_check_rotation_transform(PictTransformPtr t)
 static void
 i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
 {
-    unsigned int next_offset = 0, total_state_size;
-    static int init;
+    static int init = 0;
 
     if (init)
 	return;
 
     init = 1;
 
-    next_offset = sizeof(gen4_state_t);
+    surface_state_offset = ALIGN(sizeof(gen4_state_t), 32);
 
     /* And then the general state: */
-    dest_surf_offset = ALIGN(next_offset, 32);
-    next_offset = dest_surf_offset + sizeof(*dest_surf_state);
-
-    src_surf_offset = ALIGN(next_offset, 32);
-    next_offset = src_surf_offset + sizeof(*src_surf_state);
+    dest_surf_offset = surface_state_offset +
+	offsetof (gen4_surface_state_t, dest_surf_state);
 
-    mask_surf_offset = ALIGN(next_offset, 32);
-    next_offset = mask_surf_offset + sizeof(*mask_surf_state);
+    src_surf_offset = surface_state_offset +
+	offsetof (gen4_surface_state_t, src_surf_state);
 
-    binding_table_offset = ALIGN(next_offset, 32);
-    next_offset = binding_table_offset + (4 * 4);
+    mask_surf_offset = surface_state_offset +
+	offsetof (gen4_surface_state_t, mask_surf_state);
 
-    total_state_size = next_offset;
+    binding_table_offset = surface_state_offset +
+	offsetof (gen4_surface_state_t, binding_table);
 
-    /* Align VB to native size of elements, for safety */
-    vb_offset = ALIGN(next_offset, 32);
-    vb_max_size = total_size - vb_offset;
+    vb_offset = surface_state_offset +
+	offsetof (gen4_surface_state_t, vb);
 
-    ErrorF("%d available for vertex data\n", vb_max_size);
     /* Set up a default static partitioning of the URB, which is supposed to
      * allow anything we would want to do, at potentially lower performance.
      */
@@ -1227,7 +1240,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	    OUT_BATCH(pI830->exa_965_state->offset + vb_offset);
 	}
 
-        OUT_BATCH((vb_max_size / sizeof(float))); // set max index
+        OUT_BATCH(GEN4_VB_NUM_VERTICES); // set max index
    	OUT_BATCH(0); // ignore for VERTEXDATA, but still there
 
 	/* Set up our vertex elements, sourced from the single vertex buffer.
@@ -1314,7 +1327,7 @@ i965_composite(PixmapPtr pDst, int srcX, int srcY, int maskX, int maskY,
     /* Wait for any existing composite rectangles to land before we overwrite
      * the VB with the next one.
      */
-    if ((vb_index + 18) > (vb_max_size / sizeof(float))) {
+    if ((vb_index + 18) > GEN4_VB_NUM_VERTICES) {
       ErrorF("vb index exceeded maximum bailing...");
       return;
     }
commit 2d26847d5afd6d5888b0a8a930137bc734e5e0ed
Author: Carl Worth <cworth at cworth.org>
Date:   Thu Nov 15 09:29:00 2007 -0800

    Use separate functions to initialize general and surface state.

diff --git a/src/i965_render.c b/src/i965_render.c
index 4cdadb4..e5d81be 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -266,7 +266,6 @@ static int vb_offset;
 static int binding_table_offset;
 static float *vb;
 static int vb_max_size, vb_index;
-static int gen4_state_offset;
 
 static CARD32 src_blend, dst_blend;
 
@@ -478,8 +477,7 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
 
     init = 1;
 
-    gen4_state_offset = ALIGN(next_offset, 64);
-    next_offset = gen4_state_offset + sizeof(gen4_state_t);
+    next_offset = sizeof(gen4_state_t);
 
     /* And then the general state: */
     dest_surf_offset = ALIGN(next_offset, 32);
@@ -791,14 +789,9 @@ gen4_state_init (gen4_state_t *state)
 }
 
 static void
-i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
+gen4_surface_state_init (ScrnInfoPtr pScrn, unsigned char *start_base)
 {
     struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
-    gen4_state_t* gen4_state;
-
-    gen4_state = (void *)(start_base + gen4_state_offset);
-
-    gen4_state_init (gen4_state);
 
     /* destination surface state */
     dest_surf_state = (void *)(start_base + dest_surf_offset);
@@ -863,7 +856,8 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
     ddx_bo_map(state->buf, TRUE);
 
     state->map = state->buf->virtual;
-    i965_init_state_objects(state->pScrn, state->map);
+    gen4_state_init ((void *) state->map);
+    gen4_surface_state_init (state->pScrn, state->map);
 }
 
 static sampler_state_filter_t
@@ -916,8 +910,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
     start_base = map;
 
-    gen4_state = (void *)(start_base + gen4_state_offset);
-
     IntelEmitInvarientState(pScrn);
     *pI830->last_3d = LAST_3D_RENDER;
 
@@ -1438,7 +1430,8 @@ i965_init_exa_state(ScrnInfoPtr pScrn)
 	pI830->exa965 = i965_exastate_alloc(pScrn);
     } else {
 	void *map = pI830->FbBase + pI830->exa_965_state->offset;
-	i965_init_state_objects(pScrn, map);
+	gen4_state_init ((void *) map);
+	gen4_surface_state_init (pScrn, map);
     }
 
     return 0;
commit 0c0ab52c2d100c47f38c7ef826ef585c8b9815e9
Author: Carl Worth <cworth at cworth.org>
Date:   Tue Nov 13 12:56:13 2007 -0800

    Enumerate all possible cc_state objects
    
    We need one for each possible combination of src and dst
    blend_factors. Again, as with recent changes, this eliminates
    state updates from prepare_composite and allows that function
    to instead simply reference an existing object initialized
    within gen4_state_init.
    
    Thanks to Dave Airlie (and git-bisect) for pointing out that with
    gnome-terminal all text was appearing as solid black with an early
    version of this commit. As expected the bug was an alignment issue.

diff --git a/src/brw_defines.h b/src/brw_defines.h
index 93aed54..72a87f9 100644
--- a/src/brw_defines.h
+++ b/src/brw_defines.h
@@ -179,6 +179,7 @@
 #define BRW_BLENDFACTOR_INV_CONST_ALPHA     0x18
 #define BRW_BLENDFACTOR_INV_SRC1_COLOR      0x19
 #define BRW_BLENDFACTOR_INV_SRC1_ALPHA      0x1A
+#define BRW_BLENDFACTOR_COUNT		    0x1B
 
 #define BRW_BLENDFUNCTION_ADD               0
 #define BRW_BLENDFUNCTION_SUBTRACT          1
diff --git a/src/i965_render.c b/src/i965_render.c
index 7a22180..4cdadb4 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -355,14 +355,20 @@ typedef enum {
     SAMPLER_STATE_EXTEND_COUNT
 } sampler_state_extend_t;
 
+typedef struct _brw_cc_unit_state_padded {
+    struct brw_cc_unit_state state;
+    char pad[64 - sizeof (struct brw_cc_unit_state)];
+} brw_cc_unit_state_padded;
+
 /* Many of the fields in the state structure must be aligned to a
  * 64-byte boundary, (or a 32-byte boundary, but 64 is good enough for
  * those too). */
-#define PAD64(previous, idx) char previous ## _pad ## idx [(64 - (sizeof(struct previous) % 64)) % 64]
+#define PAD64_MULTI(previous, idx, factor) char previous ## _pad ## idx [(64 - (sizeof(struct previous) * (factor)) % 64) % 64]
+#define PAD64(previous, idx) PAD64_MULTI(previous, idx, 1)
 #define KERNEL_DECL(template) \
     CARD32 template [((sizeof (template ## _static) + 63) & ~63) / 16][4];
 typedef struct _gen4_state {
-    char wm_scratch[1024 * PS_MAX_THREADS];
+    char wm_scratch[128 * PS_MAX_THREADS];
 
     /* Index by [src_filter][src_extend][mask_filter][mask_extend] */
     struct brw_sampler_state sampler_state[SAMPLER_STATE_FILTER_COUNT]
@@ -375,8 +381,10 @@ typedef struct _gen4_state {
     struct brw_vs_unit_state vs_state;
     PAD64 (brw_vs_unit_state, 0);
 
-    struct brw_cc_unit_state cc_state;
-    PAD64 (brw_cc_unit_state, 0);
+    /* Index by [src_blend][dst_blend] */
+    brw_cc_unit_state_padded cc_state[BRW_BLENDFACTOR_COUNT]
+				     [BRW_BLENDFACTOR_COUNT];
+
     struct brw_cc_viewport cc_viewport;
     PAD64 (brw_cc_viewport, 0);
 
@@ -649,44 +657,57 @@ wm_state_init (struct brw_wm_unit_state *wm_state,
 }
 
 static void
-gen4_state_init (gen4_state_t *state)
+cc_state_init (struct brw_cc_unit_state *cc_state,
+	       int src_blend,
+	       int dst_blend,
+	       int cc_viewport_offset)
 {
-    struct brw_cc_viewport *cc_viewport;
-    struct brw_cc_unit_state *cc_state;
-    struct brw_sampler_default_color *default_color_state;
-    struct brw_vs_unit_state *vs_state;
-    int cc_viewport_offset;
-
-    int i,j, k, l;
-
-    cc_viewport = &state->cc_viewport;
-    cc_viewport->min_depth = -1.e35;
-    cc_viewport->max_depth = 1.e35;
-
-    cc_state = &state->cc_state;
+    memset(cc_state, 0, sizeof(*cc_state));
     cc_state->cc0.stencil_enable = 0;   /* disable stencil */
     cc_state->cc2.depth_test = 0;       /* disable depth test */
     cc_state->cc2.logicop_enable = 0;   /* disable logic op */
     cc_state->cc3.ia_blend_enable = 1;  /* blend alpha just like colors */
     cc_state->cc3.blend_enable = 1;     /* enable color blend */
     cc_state->cc3.alpha_test = 0;       /* disable alpha test */
-    cc_viewport_offset = offsetof (gen4_state_t, cc_viewport);
+
     cc_state->cc4.cc_viewport_state_offset = cc_viewport_offset >> 5;
+
     cc_state->cc5.dither_enable = 0;    /* disable dither */
     cc_state->cc5.logicop_func = 0xc;   /* COPY */
     cc_state->cc5.statistics_enable = 1;
     cc_state->cc5.ia_blend_function = BRW_BLENDFUNCTION_ADD;
+
+    /* XXX: alpha blend factor should be same as color, but check
+     * for CA case in future
+     */
+    cc_state->cc5.ia_src_blend_factor = src_blend;
+    cc_state->cc5.ia_dest_blend_factor = dst_blend;
+
     cc_state->cc6.blend_function = BRW_BLENDFUNCTION_ADD;
     cc_state->cc6.clamp_post_alpha_blend = 1;
     cc_state->cc6.clamp_pre_alpha_blend = 1;
     cc_state->cc6.clamp_range = 0;  /* clamp range [0,1] */
 
-    /* default color state */
-    default_color_state = &state->default_color_state;
-    default_color_state->color[0] = 0.0; /* R */
-    default_color_state->color[1] = 0.0; /* G */
-    default_color_state->color[2] = 0.0; /* B */
-    default_color_state->color[3] = 0.0; /* A */
+    cc_state->cc6.src_blend_factor = src_blend;
+    cc_state->cc6.dest_blend_factor = dst_blend;
+}
+
+static void
+gen4_state_init (gen4_state_t *state)
+{
+    struct brw_cc_viewport *cc_viewport;
+    struct brw_vs_unit_state *vs_state;
+    int i,j, k, l;
+
+    cc_viewport = &state->cc_viewport;
+    cc_viewport->min_depth = -1.e35;
+    cc_viewport->max_depth = 1.e35;
+
+    /* Color calculator state */
+    for (i = 0; i < BRW_BLENDFACTOR_COUNT; i++)
+	for (j = 0; j < BRW_BLENDFACTOR_COUNT; j++)
+	    cc_state_init (&state->cc_state[i][j].state, i, j,
+			   offsetof (gen4_state_t, cc_viewport));
 
     for (i = 0; i < SAMPLER_STATE_FILTER_COUNT; i++)
 	for (j = 0; j < SAMPLER_STATE_EXTEND_COUNT; j++)
@@ -878,9 +899,8 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     CARD32 mask_pitch = 0, mask_tile_format = 0, mask_tiled = 0;
     CARD32 dst_format, dst_pitch, dst_tile_format = 0, dst_tiled = 0;
     Bool rotation_program = FALSE;
-    struct brw_cc_unit_state *cc_state;
     int wm_state_offset, sip_kernel_offset;
-    int sf_state_offset;
+    int sf_state_offset, cc_state_offset;
     char *start_base;
     void *map;
     gen4_state_t *gen4_state;
@@ -953,18 +973,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
      */
     vb = (void *)(start_base + vb_offset);
     vb_index = 0;
-    /* Color calculator state */
-    cc_state = &gen4_state->cc_state;
+
     i965_get_blend_cntl(op, pMaskPicture, pDstPicture->format,
 			&src_blend, &dst_blend);
-    /* XXX: alpha blend factor should be same as color, but check
-     * for CA case in future
-     */
-    cc_state->cc5.ia_src_blend_factor = src_blend;
-    cc_state->cc5.ia_dest_blend_factor = dst_blend;
-    cc_state->cc6.src_blend_factor = src_blend;
-    cc_state->cc6.dest_blend_factor = dst_blend;
-
 
     /* Set up the state buffer for the destination surface */
     dest_surf_state = (void *)(start_base + dest_surf_offset);
@@ -1094,6 +1105,9 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
     sip_kernel_offset = offsetof (gen4_state_t, sip_kernel);
     
+    cc_state_offset = offsetof (gen4_state_t,
+				cc_state[src_blend][dst_blend]);
+
     /* Begin the long sequence of commands needed to set up the 3D
      * rendering pipe
      */
@@ -1182,7 +1196,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
    	OUT_BATCH(BRW_CLIP_DISABLE); /* disable CLIP, resulting in passthrough */
 	OUT_BATCH(sf_state_offset); /* 32 byte aligned */
 	OUT_BATCH(wm_state_offset); /* 32 byte aligned */
-	OUT_BATCH(offsetof (gen4_state_t, cc_state));  /* 64 byte aligned */
+	OUT_BATCH(cc_state_offset); /* 64 byte aligned */
 
 	/* URB fence */
    	OUT_BATCH(BRW_URB_FENCE |
commit 7763706a93d3021907273f9b330750ba110e2fc3
Author: Carl Worth <cworth at cworth.org>
Date:   Tue Nov 13 12:37:13 2007 -0800

    Enumerate all possible wm_state objects
    
    We have a collection of wm_state objects for each ps kernel,
    (one for each combination of src and mask extend and repeat
    values).
    
    Thanks to Dave Airlie for noticing an errant write through a
    wild wm_state pointer in an early version of this commit.

diff --git a/src/i965_render.c b/src/i965_render.c
index 91fb8b9..7a22180 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -375,8 +375,6 @@ typedef struct _gen4_state {
     struct brw_vs_unit_state vs_state;
     PAD64 (brw_vs_unit_state, 0);
 
-    struct brw_wm_unit_state wm_state;
-    PAD64 (brw_wm_unit_state, 0);
     struct brw_cc_unit_state cc_state;
     PAD64 (brw_cc_unit_state, 0);
     struct brw_cc_viewport cc_viewport;
@@ -397,11 +395,36 @@ typedef struct _gen4_state {
     struct brw_sf_unit_state sf_state_rotation;
     PAD64 (brw_sf_unit_state, 2);
 
+    /* PS kernels and corresponding WM states */
     KERNEL_DECL (ps_kernel_nomask);
+    struct brw_wm_unit_state wm_state_nomask[SAMPLER_STATE_FILTER_COUNT]
+					    [SAMPLER_STATE_EXTEND_COUNT]
+					    [SAMPLER_STATE_FILTER_COUNT]
+					    [SAMPLER_STATE_EXTEND_COUNT];
+
     KERNEL_DECL (ps_kernel_maskca);
+    struct brw_wm_unit_state wm_state_maskca[SAMPLER_STATE_FILTER_COUNT]
+					    [SAMPLER_STATE_EXTEND_COUNT]
+					    [SAMPLER_STATE_FILTER_COUNT]
+					    [SAMPLER_STATE_EXTEND_COUNT];
+
     KERNEL_DECL (ps_kernel_maskca_srcalpha);
+    struct brw_wm_unit_state wm_state_maskca_srcalpha[SAMPLER_STATE_FILTER_COUNT]
+					    [SAMPLER_STATE_EXTEND_COUNT]
+					    [SAMPLER_STATE_FILTER_COUNT]
+					    [SAMPLER_STATE_EXTEND_COUNT];
+
     KERNEL_DECL (ps_kernel_masknoca);
+    struct brw_wm_unit_state wm_state_masknoca[SAMPLER_STATE_FILTER_COUNT]
+					    [SAMPLER_STATE_EXTEND_COUNT]
+					    [SAMPLER_STATE_FILTER_COUNT]
+					    [SAMPLER_STATE_EXTEND_COUNT];
+
     KERNEL_DECL (ps_kernel_rotation);
+    struct brw_wm_unit_state wm_state_rotation[SAMPLER_STATE_FILTER_COUNT]
+					    [SAMPLER_STATE_EXTEND_COUNT]
+					    [SAMPLER_STATE_FILTER_COUNT]
+					    [SAMPLER_STATE_EXTEND_COUNT];
 } gen4_state_t;
 
 char gen4_state_big_enough[EXA_LINEAR_EXTRA >= sizeof(gen4_state_t) ? 1 : -1];
@@ -581,14 +604,58 @@ sampler_state_init (struct brw_sampler_state *sampler_state,
 }
 
 static void
+wm_state_init (struct brw_wm_unit_state *wm_state,
+	       Bool has_mask,
+	       int scratch_offset,
+	       int kernel_offset,
+	       int sampler_state_offset)
+{
+    memset(wm_state, 0, sizeof (*wm_state));
+    wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
+    wm_state->thread1.single_program_flow = 1;
+
+    wm_state->thread2.scratch_space_base_pointer = scratch_offset >> 10;
+
+    wm_state->thread2.per_thread_scratch_space = 0;
+    wm_state->thread3.const_urb_entry_read_length = 0;
+    wm_state->thread3.const_urb_entry_read_offset = 0;
+
+    wm_state->thread3.urb_entry_read_offset = 0;
+    /* wm kernel use urb from 3, see wm_program in compiler module */
+    wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
+
+    wm_state->wm4.stats_enable = 1;  /* statistic */
+    wm_state->wm4.sampler_state_pointer = sampler_state_offset >> 5;
+    wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
+    wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
+    wm_state->wm5.thread_dispatch_enable = 1;
+    /* just use 16-pixel dispatch (4 subspans), don't need to change kernel
+     * start point
+     */
+    wm_state->wm5.enable_16_pix = 1;
+    wm_state->wm5.enable_8_pix = 0;
+    wm_state->wm5.early_depth_test = 1;
+
+    wm_state->thread0.kernel_start_pointer = kernel_offset >> 6;
+
+    /* Each pair of attributes (src/mask coords) is one URB entry */
+    if (has_mask) {
+	wm_state->thread1.binding_table_entry_count = 3; /* 2 tex and fb */
+	wm_state->thread3.urb_entry_read_length = 2;
+    } else {
+	wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
+	wm_state->thread3.urb_entry_read_length = 1;
+    }
+}
+
+static void
 gen4_state_init (gen4_state_t *state)
 {
     struct brw_cc_viewport *cc_viewport;
     struct brw_cc_unit_state *cc_state;
     struct brw_sampler_default_color *default_color_state;
     struct brw_vs_unit_state *vs_state;
-    struct brw_wm_unit_state *wm_state;
-    int cc_viewport_offset, wm_scratch_offset;
+    int cc_viewport_offset;
 
     int i,j, k, l;
 
@@ -655,31 +722,36 @@ gen4_state_init (gen4_state_t *state)
     sf_state_init (&state->sf_state_rotation,
 		   offsetof (gen4_state_t, sf_kernel_rotation));
 
-    /* wm state */
-    wm_state = &state->wm_state;
-/*    wm_state->thread0.kernel_start_pointer = ps_kernel_offset >> 6; */
-    wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
-    wm_state->thread1.single_program_flow = 1;
-    wm_scratch_offset = offsetof (gen4_state_t, wm_scratch);
-    wm_state->thread2.scratch_space_base_pointer = wm_scratch_offset>>10;
-    wm_state->thread2.per_thread_scratch_space = 0;
-    wm_state->thread3.const_urb_entry_read_length = 0;
-    wm_state->thread3.const_urb_entry_read_offset = 0;
-    /* Each pair of attributes (src/mask coords) is one URB entry */
-    wm_state->thread3.urb_entry_read_offset = 0;
-    /* wm kernel use urb from 3, see wm_program in compiler module */
-    wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
-
-    wm_state->wm4.stats_enable = 1;  /* statistic */
-    wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
-    wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
-    wm_state->wm5.thread_dispatch_enable = 1;
-    /* just use 16-pixel dispatch (4 subspans), don't need to change kernel
-     * start point
-     */
-    wm_state->wm5.enable_16_pix = 1;
-    wm_state->wm5.enable_8_pix = 0;
-    wm_state->wm5.early_depth_test = 1;
+    for (i = 0; i < SAMPLER_STATE_FILTER_COUNT; i++)
+	for (j = 0; j < SAMPLER_STATE_EXTEND_COUNT; j++)
+	    for (k = 0; k < SAMPLER_STATE_FILTER_COUNT; k++)
+		for (l = 0; l < SAMPLER_STATE_EXTEND_COUNT; l++) {
+		    wm_state_init (&state->wm_state_nomask[i][j][k][l],
+				   FALSE,
+				   offsetof (gen4_state_t, wm_scratch),
+				   offsetof (gen4_state_t, ps_kernel_nomask),
+				   offsetof (gen4_state_t, sampler_state[i][j][k][l][0]));
+		    wm_state_init (&state->wm_state_maskca[i][j][k][l],
+				   TRUE,
+				   offsetof (gen4_state_t, wm_scratch),
+				   offsetof (gen4_state_t, ps_kernel_maskca),
+				   offsetof (gen4_state_t, sampler_state[i][j][k][l][0]));
+		    wm_state_init (&state->wm_state_maskca_srcalpha[i][j][k][l],
+				   TRUE,
+				   offsetof (gen4_state_t, wm_scratch),
+				   offsetof (gen4_state_t, ps_kernel_maskca_srcalpha),
+				   offsetof (gen4_state_t, sampler_state[i][j][k][l][0]));
+		    wm_state_init (&state->wm_state_masknoca[i][j][k][l],
+				   TRUE,
+				   offsetof (gen4_state_t, wm_scratch),
+				   offsetof (gen4_state_t, ps_kernel_masknoca),
+				   offsetof (gen4_state_t, sampler_state[i][j][k][l][0]));
+		    wm_state_init (&state->wm_state_rotation[i][j][k][l],
+				   FALSE,
+				   offsetof (gen4_state_t, wm_scratch),
+				   offsetof (gen4_state_t, ps_kernel_rotation),
+				   offsetof (gen4_state_t, sampler_state[i][j][k][l][0]));
+		}
 
     /* Upload kernels */
     memcpy (state->sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
@@ -807,14 +879,11 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     CARD32 dst_format, dst_pitch, dst_tile_format = 0, dst_tiled = 0;
     Bool rotation_program = FALSE;
     struct brw_cc_unit_state *cc_state;
-    CARD32 *ps_kernel;
-    int ps_kernel_offset, sip_kernel_offset;
+    int wm_state_offset, sip_kernel_offset;
     int sf_state_offset;
     char *start_base;
     void *map;
     gen4_state_t *gen4_state;
-    struct brw_wm_unit_state *wm_state;
-    int sampler_state_offset;
     sampler_state_filter_t src_filter, mask_filter;
     sampler_state_extend_t src_extend, mask_extend;
 
@@ -980,45 +1049,51 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	mask_extend = SAMPLER_STATE_EXTEND_NONE;
     }
 
-    sampler_state_offset = offsetof (gen4_state_t,
-				     sampler_state
-				     [src_filter]
-				     [src_extend]
-				     [mask_filter]
-				     [mask_extend][0]);
-
-    gen4_state->wm_state.wm4.sampler_state_pointer = sampler_state_offset >> 5;
-
     /* Set up the PS kernel (dispatched by WM) */
     if (pMask) {
 	if (pMaskPicture->componentAlpha && 
 	    PICT_FORMAT_RGB(pMaskPicture->format)) {
-            if (i965_blend_op[op].src_alpha) 
-		ps_kernel = (CARD32 *) gen4_state->ps_kernel_maskca_srcalpha;
-            else
-		ps_kernel = (CARD32 *) gen4_state->ps_kernel_maskca;
-        } else
-	    ps_kernel = (CARD32 *) gen4_state->ps_kernel_masknoca;
+	    if (i965_blend_op[op].src_alpha) {
+		wm_state_offset = offsetof (gen4_state_t,
+					    wm_state_maskca_srcalpha
+					    [src_filter]
+					    [src_extend]
+					    [mask_filter]
+					    [mask_extend]);
+	    } else {
+		wm_state_offset = offsetof (gen4_state_t,
+					    wm_state_maskca
+					    [src_filter]
+					    [src_extend]
+					    [mask_filter]
+					    [mask_extend]);
+	    }
+	} else {
+	    wm_state_offset = offsetof (gen4_state_t,
+					wm_state_masknoca
+					[src_filter]
+					[src_extend]
+					[mask_filter]
+					[mask_extend]);
+	}
     } else if (rotation_program) {
-	ps_kernel = (CARD32 *) gen4_state->ps_kernel_rotation;
+	wm_state_offset = offsetof (gen4_state_t,
+				    wm_state_rotation
+				    [src_filter]
+				    [src_extend]
+				    [mask_filter]
+				    [mask_extend]);
     } else {
-	ps_kernel = (CARD32 *) gen4_state->ps_kernel_nomask;
+	wm_state_offset = offsetof (gen4_state_t,
+				    wm_state_nomask
+				    [src_filter]
+				    [src_extend]
+				    [mask_filter]
+				    [mask_extend]);
     }
 
-    ps_kernel_offset = (char *) ps_kernel - (char *) gen4_state;
-    wm_state = &gen4_state->wm_state;
-    wm_state->thread0.kernel_start_pointer = ps_kernel_offset >> 6;
-
     sip_kernel_offset = offsetof (gen4_state_t, sip_kernel);
     
-    if (!pMask) {
-	wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
-	wm_state->thread3.urb_entry_read_length = 1;
-    } else {
-	wm_state->thread1.binding_table_entry_count = 3; /* 2 tex and fb */
-	wm_state->thread3.urb_entry_read_length = 2;
-    }
-
     /* Begin the long sequence of commands needed to set up the 3D
      * rendering pipe
      */
@@ -1106,7 +1181,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
    	OUT_BATCH(BRW_GS_DISABLE);   /* disable GS, resulting in passthrough */
    	OUT_BATCH(BRW_CLIP_DISABLE); /* disable CLIP, resulting in passthrough */
 	OUT_BATCH(sf_state_offset); /* 32 byte aligned */
-	OUT_BATCH(offsetof (gen4_state_t, wm_state));  /* 32 byte aligned */
+	OUT_BATCH(wm_state_offset); /* 32 byte aligned */
 	OUT_BATCH(offsetof (gen4_state_t, cc_state));  /* 64 byte aligned */
 
 	/* URB fence */
commit d0874697be8086cd64740c24698df8cd4d31c76f
Author: Carl Worth <cworth at cworth.org>
Date:   Mon Nov 12 16:09:00 2007 -0800

    Enumerate all possible src,mask sampler state pairs
    
    This will eventually allow us to eliminate sampler state updates
    while compositing, and instead to initialize everything once in
    the initialization function.

diff --git a/src/i965_render.c b/src/i965_render.c
index 0925769..91fb8b9 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -343,6 +343,18 @@ static const CARD32 ps_kernel_rotation_static [][4] = {
 #include "exa_wm_rotation_prog.h"
 };
 
+typedef enum {
+    SAMPLER_STATE_FILTER_NEAREST,
+    SAMPLER_STATE_FILTER_BILINEAR,
+    SAMPLER_STATE_FILTER_COUNT
+} sampler_state_filter_t;
+
+typedef enum {
+    SAMPLER_STATE_EXTEND_NONE,
+    SAMPLER_STATE_EXTEND_REPEAT,
+    SAMPLER_STATE_EXTEND_COUNT
+} sampler_state_extend_t;
+
 /* Many of the fields in the state structure must be aligned to a
  * 64-byte boundary, (or a 32-byte boundary, but 64 is good enough for
  * those too). */
@@ -351,10 +363,13 @@ static const CARD32 ps_kernel_rotation_static [][4] = {
     CARD32 template [((sizeof (template ## _static) + 63) & ~63) / 16][4];
 typedef struct _gen4_state {
     char wm_scratch[1024 * PS_MAX_THREADS];
-    struct brw_sampler_state src_sampler_state;
-    PAD64 (brw_sampler_state, 0);
-    struct brw_sampler_state mask_sampler_state;
-    PAD64 (brw_sampler_state, 1);
+
+    /* Index by [src_filter][src_extend][mask_filter][mask_extend] */
+    struct brw_sampler_state sampler_state[SAMPLER_STATE_FILTER_COUNT]
+					  [SAMPLER_STATE_EXTEND_COUNT]
+					  [SAMPLER_STATE_FILTER_COUNT]
+					  [SAMPLER_STATE_EXTEND_COUNT][2];
+
     struct brw_sampler_default_color default_color_state;
     PAD64 (brw_sampler_default_color, 0);
     struct brw_vs_unit_state vs_state;
@@ -523,16 +538,59 @@ sf_state_init (struct brw_sf_unit_state *sf_state, int kernel_offset)
 }
 
 static void
+sampler_state_init (struct brw_sampler_state *sampler_state,
+		    sampler_state_filter_t filter,
+		    sampler_state_extend_t extend,
+		    int default_color_offset)
+{
+    /* PS kernel use this sampler */
+    memset(sampler_state, 0, sizeof(*sampler_state));
+    sampler_state->ss0.lod_preclamp = 1; /* GL mode */
+
+    sampler_state->ss0.default_color_mode = 0; /* GL mode */
+
+    switch(filter) {
+    default:
+    case SAMPLER_STATE_FILTER_NEAREST:
+	sampler_state->ss0.min_filter = BRW_MAPFILTER_NEAREST;
+	sampler_state->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
+	break;
+    case SAMPLER_STATE_FILTER_BILINEAR:
+	sampler_state->ss0.min_filter = BRW_MAPFILTER_LINEAR;
+	sampler_state->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
+	break;
+    }
+
+    switch (extend) {
+    default:
+    case SAMPLER_STATE_EXTEND_NONE:
+	sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+	sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+	sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+	break;
+    case SAMPLER_STATE_EXTEND_REPEAT:
+	sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+	sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+	sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+	break;
+    }
+
+    sampler_state->ss2.default_color_pointer = default_color_offset >> 5;
+
+    sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
+}
+
+static void
 gen4_state_init (gen4_state_t *state)
 {
     struct brw_cc_viewport *cc_viewport;
     struct brw_cc_unit_state *cc_state;
     struct brw_sampler_default_color *default_color_state;
-    struct brw_sampler_state *src_sampler_state;
-    struct brw_sampler_state *mask_sampler_state;
     struct brw_vs_unit_state *vs_state;
     struct brw_wm_unit_state *wm_state;
-    int cc_viewport_offset, wm_scratch_offset, src_sampler_offset;
+    int cc_viewport_offset, wm_scratch_offset;
+
+    int i,j, k, l;
 
     cc_viewport = &state->cc_viewport;
     cc_viewport->min_depth = -1.e35;
@@ -563,18 +621,18 @@ gen4_state_init (gen4_state_t *state)
     default_color_state->color[2] = 0.0; /* B */
     default_color_state->color[3] = 0.0; /* A */
 
-    /* src sampler state */
-    src_sampler_state = &state->src_sampler_state;
-    src_sampler_state->ss0.lod_preclamp = 1; /* GL mode */
-    src_sampler_state->ss0.default_color_mode = 0; /* GL mode */
-    src_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
-
-    /* mask sampler state */
-    mask_sampler_state = &state->mask_sampler_state;
-    mask_sampler_state->ss0.lod_preclamp = 1; /* GL mode */
-    mask_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
+    for (i = 0; i < SAMPLER_STATE_FILTER_COUNT; i++)
+	for (j = 0; j < SAMPLER_STATE_EXTEND_COUNT; j++)
+	    for (k = 0; k < SAMPLER_STATE_FILTER_COUNT; k++)
+		for (l = 0; l < SAMPLER_STATE_EXTEND_COUNT; l++) {
+		    sampler_state_init (&state->sampler_state[i][j][k][l][0],
+					i, j,
+					offsetof (gen4_state_t, default_color_state));
+		    sampler_state_init (&state->sampler_state[i][j][k][l][1],
+					k, l,
+					offsetof (gen4_state_t, default_color_state));
+		}
 
-    /* vertex shader state */
     /* Set up the vertex shader to be disabled (passthrough) */
     vs_state = &state->vs_state;
     vs_state->thread4.nr_urb_entries = URB_VS_ENTRIES;
@@ -613,8 +671,6 @@ gen4_state_init (gen4_state_t *state)
     wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
 
     wm_state->wm4.stats_enable = 1;  /* statistic */
-    src_sampler_offset = offsetof (gen4_state_t, src_sampler_state);
-    wm_state->wm4.sampler_state_pointer = src_sampler_offset >> 5;
     wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
     wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
     wm_state->wm5.thread_dispatch_enable = 1;
@@ -717,6 +773,28 @@ i965_exastate_reset(struct i965_exastate_buffer *state)
     i965_init_state_objects(state->pScrn, state->map);
 }
 
+static sampler_state_filter_t
+sampler_state_filter_from_picture (int filter)
+{
+    switch (filter) {
+    case PictFilterNearest:
+	return SAMPLER_STATE_FILTER_NEAREST;
+    case PictFilterBilinear:
+	return SAMPLER_STATE_FILTER_BILINEAR;
+    default:
+	return -1;
+    }
+}
+
+static sampler_state_extend_t
+sampler_state_extend_from_picture (int repeat)
+{
+    if (repeat)
+	return SAMPLER_STATE_EXTEND_REPEAT;
+    else
+	return SAMPLER_STATE_EXTEND_NONE;
+}
+
 Bool
 i965_prepare_composite(int op, PicturePtr pSrcPicture,
 		       PicturePtr pMaskPicture, PicturePtr pDstPicture,
@@ -731,13 +809,14 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     struct brw_cc_unit_state *cc_state;
     CARD32 *ps_kernel;
     int ps_kernel_offset, sip_kernel_offset;
-    int sf_state_offset, default_color_offset;
+    int sf_state_offset;
     char *start_base;
     void *map;
     gen4_state_t *gen4_state;
     struct brw_wm_unit_state *wm_state;
-    struct brw_sampler_state *src_sampler_state;
-    struct brw_sampler_state *mask_sampler_state;
+    int sampler_state_offset;
+    sampler_state_filter_t src_filter, mask_filter;
+    sampler_state_extend_t src_extend, mask_extend;
 
     if (pI830->use_ttm_batch) {
 	i965_exastate_reset(pI830->exa965);
@@ -884,63 +963,31 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     else
 	binding_table[2] = 0;
 
-    /* PS kernel use this sampler */
-    src_sampler_state = &gen4_state->src_sampler_state;
-    switch(pSrcPicture->filter) {
-    case PictFilterNearest:
-   	src_sampler_state->ss0.min_filter = BRW_MAPFILTER_NEAREST;
-   	src_sampler_state->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
-	break;
-    case PictFilterBilinear:
-	src_sampler_state->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-   	src_sampler_state->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
-	break;
-    default:
-	I830FALLBACK("Bad filter 0x%x\n", pSrcPicture->filter);
-    }
+    src_filter = sampler_state_filter_from_picture (pSrcPicture->filter);
+    if (src_filter < 0)
+	I830FALLBACK ("Bad filter 0x%x\n", pSrcPicture->filter);
+
+    src_extend = sampler_state_extend_from_picture (pSrcPicture->repeat);
 
-    if (!pSrcPicture->repeat) {
-   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
-   	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
-   	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
-	default_color_offset = offsetof (gen4_state_t, default_color_state);
-	src_sampler_state->ss2.default_color_pointer = default_color_offset >> 5;
+    if (pMaskPicture) {
+	mask_filter = sampler_state_filter_from_picture (pMaskPicture->filter);
+	if (mask_filter < 0)
+	    I830FALLBACK ("Bad filter 0x%x\n", pMaskPicture->filter);
+
+	mask_extend = sampler_state_extend_from_picture (pMaskPicture->repeat);
     } else {
-   	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP;
-   	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_WRAP;
-   	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+	mask_filter = SAMPLER_STATE_FILTER_NEAREST;
+	mask_extend = SAMPLER_STATE_EXTEND_NONE;
     }
 
-    if (pMask) {
-	mask_sampler_state = &gen4_state->mask_sampler_state;
-   	switch(pMaskPicture->filter) {
-   	case PictFilterNearest:
-   	    mask_sampler_state->ss0.min_filter = BRW_MAPFILTER_NEAREST;
-   	    mask_sampler_state->ss0.mag_filter = BRW_MAPFILTER_NEAREST;
-	    break;
-   	case PictFilterBilinear:
-   	    mask_sampler_state->ss0.min_filter = BRW_MAPFILTER_LINEAR;
-   	    mask_sampler_state->ss0.mag_filter = BRW_MAPFILTER_LINEAR;
-	    break;
-   	default:
-	    I830FALLBACK("Bad filter 0x%x\n", pMaskPicture->filter);
-   	}
+    sampler_state_offset = offsetof (gen4_state_t,
+				     sampler_state
+				     [src_filter]
+				     [src_extend]
+				     [mask_filter]
+				     [mask_extend][0]);
 
-   	if (!pMaskPicture->repeat) {
-   	    mask_sampler_state->ss1.r_wrap_mode =
-		BRW_TEXCOORDMODE_CLAMP_BORDER;
-   	    mask_sampler_state->ss1.s_wrap_mode =
-		BRW_TEXCOORDMODE_CLAMP_BORDER;
-   	    mask_sampler_state->ss1.t_wrap_mode =
-		BRW_TEXCOORDMODE_CLAMP_BORDER;
-            mask_sampler_state->ss2.default_color_pointer =
-		default_color_offset >> 5;
-   	} else {
-   	    mask_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP;
-   	    mask_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_WRAP;
-   	    mask_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
-    	}
-    }
+    gen4_state->wm_state.wm4.sampler_state_pointer = sampler_state_offset >> 5;
 
     /* Set up the PS kernel (dispatched by WM) */
     if (pMask) {
commit a2b5c23184d19b386fdfd04f578a55566df60132
Author: Carl Worth <cworth at cworth.org>
Date:   Fri Nov 9 17:24:21 2007 -0800

    Associate one sf_state object with each sf_kernel

diff --git a/src/i965_render.c b/src/i965_render.c
index 7bb714d..0925769 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -359,8 +359,7 @@ typedef struct _gen4_state {
     PAD64 (brw_sampler_default_color, 0);
     struct brw_vs_unit_state vs_state;
     PAD64 (brw_vs_unit_state, 0);
-    struct brw_sf_unit_state sf_state;
-    PAD64 (brw_sf_unit_state, 0);
+
     struct brw_wm_unit_state wm_state;
     PAD64 (brw_wm_unit_state, 0);
     struct brw_cc_unit_state cc_state;
@@ -370,9 +369,18 @@ typedef struct _gen4_state {
 
     KERNEL_DECL (sip_kernel);
 
+    /* SF kernels and corresponding states */
     KERNEL_DECL (sf_kernel);
+    struct brw_sf_unit_state sf_state;
+    PAD64 (brw_sf_unit_state, 0);
+
     KERNEL_DECL (sf_kernel_mask);
+    struct brw_sf_unit_state sf_state_mask;
+    PAD64 (brw_sf_unit_state, 1);
+
     KERNEL_DECL (sf_kernel_rotation);
+    struct brw_sf_unit_state sf_state_rotation;
+    PAD64 (brw_sf_unit_state, 2);
 
     KERNEL_DECL (ps_kernel_nomask);
     KERNEL_DECL (ps_kernel_maskca);
@@ -480,16 +488,49 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
 }
 
 static void
+sf_state_init (struct brw_sf_unit_state *sf_state, int kernel_offset)
+{
+    memset(sf_state, 0, sizeof(*sf_state));
+    sf_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
+    sf_state->sf1.single_program_flow = 1;
+    sf_state->sf1.binding_table_entry_count = 0;
+    sf_state->sf1.thread_priority = 0;
+    sf_state->sf1.floating_point_mode = 0; /* Mesa does this */
+    sf_state->sf1.illegal_op_exception_enable = 1;
+    sf_state->sf1.mask_stack_exception_enable = 1;
+    sf_state->sf1.sw_exception_enable = 1;
+    sf_state->thread2.per_thread_scratch_space = 0;
+    /* scratch space is not used in our kernel */
+    sf_state->thread2.scratch_space_base_pointer = 0;
+    sf_state->thread3.const_urb_entry_read_length = 0; /* no const URBs */
+    sf_state->thread3.const_urb_entry_read_offset = 0; /* no const URBs */
+    sf_state->thread3.urb_entry_read_length = 1; /* 1 URB per vertex */
+    /* don't smash vertex header, read start from dw8 */
+    sf_state->thread3.urb_entry_read_offset = 1;
+    sf_state->thread3.dispatch_grf_start_reg = 3;
+    sf_state->thread4.max_threads = SF_MAX_THREADS - 1;
+    sf_state->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
+    sf_state->thread4.nr_urb_entries = URB_SF_ENTRIES;
+    sf_state->thread4.stats_enable = 1;
+    sf_state->sf5.viewport_transform = FALSE; /* skip viewport */
+    sf_state->sf6.cull_mode = BRW_CULLMODE_NONE;
+    sf_state->sf6.scissor = 0;
+    sf_state->sf7.trifan_pv = 2;
+    sf_state->sf6.dest_org_vbias = 0x8;
+    sf_state->sf6.dest_org_hbias = 0x8;
+
+    sf_state->thread0.kernel_start_pointer = kernel_offset >> 6;
+}
+
+static void
 gen4_state_init (gen4_state_t *state)
 {
-    /* cc viewport */
     struct brw_cc_viewport *cc_viewport;
     struct brw_cc_unit_state *cc_state;
     struct brw_sampler_default_color *default_color_state;
     struct brw_sampler_state *src_sampler_state;
     struct brw_sampler_state *mask_sampler_state;
     struct brw_vs_unit_state *vs_state;
-    struct brw_sf_unit_state *sf_state;
     struct brw_wm_unit_state *wm_state;
     int cc_viewport_offset, wm_scratch_offset, src_sampler_offset;
 
@@ -541,36 +582,20 @@ gen4_state_init (gen4_state_t *state)
     vs_state->vs6.vs_enable = 0;
     vs_state->vs6.vert_cache_disable = 1;
 
-    /* sf state */
-    sf_state = &state->sf_state;
-/*    sf_state->thread0.kernel_start_pointer = sf_kernel_offset >> 6; */
-    sf_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
-    sf_state->sf1.single_program_flow = 1;
-    sf_state->sf1.binding_table_entry_count = 0;
-    sf_state->sf1.thread_priority = 0;
-    sf_state->sf1.floating_point_mode = 0; /* Mesa does this */
-    sf_state->sf1.illegal_op_exception_enable = 1;
-    sf_state->sf1.mask_stack_exception_enable = 1;
-    sf_state->sf1.sw_exception_enable = 1;
-    sf_state->thread2.per_thread_scratch_space = 0;
-    /* scratch space is not used in our kernel */
-    sf_state->thread2.scratch_space_base_pointer = 0;
-    sf_state->thread3.const_urb_entry_read_length = 0; /* no const URBs */
-    sf_state->thread3.const_urb_entry_read_offset = 0; /* no const URBs */
-    sf_state->thread3.urb_entry_read_length = 1; /* 1 URB per vertex */
-    /* don't smash vertex header, read start from dw8 */
-    sf_state->thread3.urb_entry_read_offset = 1;
-    sf_state->thread3.dispatch_grf_start_reg = 3;
-    sf_state->thread4.max_threads = SF_MAX_THREADS - 1;
-    sf_state->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
-    sf_state->thread4.nr_urb_entries = URB_SF_ENTRIES;
-    sf_state->thread4.stats_enable = 1;
-    sf_state->sf5.viewport_transform = FALSE; /* skip viewport */
-    sf_state->sf6.cull_mode = BRW_CULLMODE_NONE;
-    sf_state->sf6.scissor = 0;
-    sf_state->sf7.trifan_pv = 2;
-    sf_state->sf6.dest_org_vbias = 0x8;
-    sf_state->sf6.dest_org_hbias = 0x8;
+    /* Copy all SF kernels into state structure. */
+    memcpy(state->sf_kernel, sf_kernel_static,
+	   sizeof (sf_kernel_static));
+    memcpy(state->sf_kernel_mask, sf_kernel_mask_static,
+	   sizeof (sf_kernel_mask_static));
+    memcpy(state->sf_kernel_rotation, sf_kernel_rotation_static,
+	   sizeof (sf_kernel_rotation_static));
+
+    sf_state_init (&state->sf_state,
+		   offsetof (gen4_state_t, sf_kernel));
+    sf_state_init (&state->sf_state_mask,
+		   offsetof (gen4_state_t, sf_kernel_mask));
+    sf_state_init (&state->sf_state_rotation,
+		   offsetof (gen4_state_t, sf_kernel_rotation));
 
     /* wm state */
     wm_state = &state->wm_state;
@@ -603,13 +628,6 @@ gen4_state_init (gen4_state_t *state)
     /* Upload kernels */
     memcpy (state->sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
 
-    memcpy (state->sf_kernel, sf_kernel_static,
-	    sizeof (sf_kernel_static));
-    memcpy (state->sf_kernel_mask, sf_kernel_mask_static,
-	    sizeof (sf_kernel_mask_static));
-    memcpy (state->sf_kernel_rotation, sf_kernel_rotation_static,
-	    sizeof (sf_kernel_rotation_static));
-
     memcpy (state->ps_kernel_nomask, ps_kernel_nomask_static,
 	    sizeof (ps_kernel_nomask_static));
     memcpy (state->ps_kernel_maskca, ps_kernel_maskca_static,
@@ -711,13 +729,12 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     CARD32 dst_format, dst_pitch, dst_tile_format = 0, dst_tiled = 0;
     Bool rotation_program = FALSE;
     struct brw_cc_unit_state *cc_state;
-    CARD32 *sf_kernel, *ps_kernel;
-    int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
-    int default_color_offset;
+    CARD32 *ps_kernel;
+    int ps_kernel_offset, sip_kernel_offset;
+    int sf_state_offset, default_color_offset;
     char *start_base;
     void *map;
     gen4_state_t *gen4_state;
-    struct brw_sf_unit_state *sf_state;
     struct brw_wm_unit_state *wm_state;
     struct brw_sampler_state *src_sampler_state;
     struct brw_sampler_state *mask_sampler_state;
@@ -776,6 +793,12 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     }
 
     /* setup 3d pipeline state */
+    if (pMask)
+	sf_state_offset = offsetof (gen4_state_t, sf_state_mask);
+    else if (rotation_program)
+	sf_state_offset = offsetof (gen4_state_t, sf_state_rotation);
+    else
+	sf_state_offset = offsetof (gen4_state_t, sf_state);
 
     /* Because we only have a single static buffer for our state currently,
      * we have to sync before updating it every time.
@@ -919,22 +942,6 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     	}
     }
 
-
-    /* Set up the SF kernel to do coord interp: for each attribute,
-     * calculate dA/dx and dA/dy.  Hand these interpolation coefficients
-     * back to SF which then hands pixels off to WM.
-     */
-    if (pMask)
-	sf_kernel = (CARD32 *) gen4_state->sf_kernel_mask;
-    else if (rotation_program)
-	sf_kernel = (CARD32 *) gen4_state->sf_kernel_rotation;
-    else
-	sf_kernel = (CARD32 *) gen4_state->sf_kernel;
-
-    sf_kernel_offset = (char *) sf_kernel - (char *) gen4_state;
-    sf_state = &gen4_state->sf_state;
-    sf_state->thread0.kernel_start_pointer = sf_kernel_offset >> 6;
-
     /* Set up the PS kernel (dispatched by WM) */
     if (pMask) {
 	if (pMaskPicture->componentAlpha && 
@@ -955,8 +962,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     wm_state = &gen4_state->wm_state;
     wm_state->thread0.kernel_start_pointer = ps_kernel_offset >> 6;
 
-    sip_kernel_offset = ((char *) gen4_state->sip_kernel -
-			 (char *) gen4_state);
+    sip_kernel_offset = offsetof (gen4_state_t, sip_kernel);
     
     if (!pMask) {
 	wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
@@ -1052,7 +1058,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	OUT_BATCH(offsetof (gen4_state_t, vs_state));  /* 32 byte aligned */
    	OUT_BATCH(BRW_GS_DISABLE);   /* disable GS, resulting in passthrough */
    	OUT_BATCH(BRW_CLIP_DISABLE); /* disable CLIP, resulting in passthrough */
-	OUT_BATCH(offsetof (gen4_state_t, sf_state));  /* 32 byte aligned */
+	OUT_BATCH(sf_state_offset); /* 32 byte aligned */
 	OUT_BATCH(offsetof (gen4_state_t, wm_state));  /* 32 byte aligned */
 	OUT_BATCH(offsetof (gen4_state_t, cc_state));  /* 64 byte aligned */
 
commit 26573b7542707bd25f6b19570c0f1cd86a538dbd
Author: Carl Worth <cworth at cworth.org>
Date:   Fri Nov 9 17:13:06 2007 -0800

    Avoid some global variables

diff --git a/src/i965_render.c b/src/i965_render.c
index a37729d..7bb714d 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -257,13 +257,6 @@ static int urb_cs_start, urb_cs_size;
 static struct brw_surface_state *dest_surf_state;
 static struct brw_surface_state *src_surf_state;
 static struct brw_surface_state *mask_surf_state;
-static struct brw_sampler_state *src_sampler_state;
-static struct brw_sampler_state *mask_sampler_state;
-static struct brw_sampler_default_color *default_color_state;
-
-static struct brw_vs_unit_state *vs_state;
-static struct brw_sf_unit_state *sf_state;
-static struct brw_wm_unit_state *wm_state;
 
 static CARD32 *binding_table;
 
@@ -492,6 +485,12 @@ gen4_state_init (gen4_state_t *state)
     /* cc viewport */
     struct brw_cc_viewport *cc_viewport;
     struct brw_cc_unit_state *cc_state;
+    struct brw_sampler_default_color *default_color_state;
+    struct brw_sampler_state *src_sampler_state;
+    struct brw_sampler_state *mask_sampler_state;
+    struct brw_vs_unit_state *vs_state;
+    struct brw_sf_unit_state *sf_state;
+    struct brw_wm_unit_state *wm_state;
     int cc_viewport_offset, wm_scratch_offset, src_sampler_offset;
 
     cc_viewport = &state->cc_viewport;
@@ -718,6 +717,10 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     char *start_base;
     void *map;
     gen4_state_t *gen4_state;
+    struct brw_sf_unit_state *sf_state;
+    struct brw_wm_unit_state *wm_state;
+    struct brw_sampler_state *src_sampler_state;
+    struct brw_sampler_state *mask_sampler_state;
 
     if (pI830->use_ttm_batch) {
 	i965_exastate_reset(pI830->exa965);
@@ -929,6 +932,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	sf_kernel = (CARD32 *) gen4_state->sf_kernel;
 
     sf_kernel_offset = (char *) sf_kernel - (char *) gen4_state;
+    sf_state = &gen4_state->sf_state;
     sf_state->thread0.kernel_start_pointer = sf_kernel_offset >> 6;
 
     /* Set up the PS kernel (dispatched by WM) */
@@ -948,12 +952,12 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     }
 
     ps_kernel_offset = (char *) ps_kernel - (char *) gen4_state;
+    wm_state = &gen4_state->wm_state;
     wm_state->thread0.kernel_start_pointer = ps_kernel_offset >> 6;
 
     sip_kernel_offset = ((char *) gen4_state->sip_kernel -
 			 (char *) gen4_state);
     
-    wm_state = &gen4_state->wm_state;
     if (!pMask) {
 	wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
 	wm_state->thread3.urb_entry_read_length = 1;
commit d55f7088788b3b0bb31100073916118b98b01561
Author: Carl Worth <cworth at cworth.org>
Date:   Fri Nov 9 17:04:59 2007 -0800

    Separate gen4_state_init from the rest of i965_init_state_objects

diff --git a/src/i965_render.c b/src/i965_render.c
index 9914236..a37729d 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -487,22 +487,18 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
 }
 
 static void
-i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
+gen4_state_init (gen4_state_t *state)
 {
     /* cc viewport */
     struct brw_cc_viewport *cc_viewport;
     struct brw_cc_unit_state *cc_state;
-    struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
-    gen4_state_t* gen4_state;
     int cc_viewport_offset, wm_scratch_offset, src_sampler_offset;
 
-    gen4_state = (void *)(start_base + gen4_state_offset);
-
-    cc_viewport = &gen4_state->cc_viewport;
+    cc_viewport = &state->cc_viewport;
     cc_viewport->min_depth = -1.e35;
     cc_viewport->max_depth = 1.e35;
 
-    cc_state = &gen4_state->cc_state;
+    cc_state = &state->cc_state;
     cc_state->cc0.stencil_enable = 0;   /* disable stencil */
     cc_state->cc2.depth_test = 0;       /* disable depth test */
     cc_state->cc2.logicop_enable = 0;   /* disable logic op */
@@ -520,80 +516,34 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     cc_state->cc6.clamp_pre_alpha_blend = 1;
     cc_state->cc6.clamp_range = 0;  /* clamp range [0,1] */
 
-    /* destination surface state */
-    dest_surf_state = (void *)(start_base + dest_surf_offset);
-    dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
-    dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
-    dest_surf_state->ss0.writedisable_alpha = 0;
-    dest_surf_state->ss0.writedisable_red = 0;
-    dest_surf_state->ss0.writedisable_green = 0;
-    dest_surf_state->ss0.writedisable_blue = 0;
-    dest_surf_state->ss0.color_blend = 1;
-    dest_surf_state->ss0.vert_line_stride = 0;
-    dest_surf_state->ss0.vert_line_stride_ofs = 0;
-    dest_surf_state->ss0.mipmap_layout_mode = 0;
-    dest_surf_state->ss0.render_cache_read_mode = 0;
-    dest_surf_state->ss2.mip_count = 0;
-    dest_surf_state->ss2.render_target_rotation = 0;
-
-    /* source surface state */
-    src_surf_state = (void *)(start_base + src_surf_offset);
-    src_surf_state->ss0.surface_type = BRW_SURFACE_2D;
-    src_surf_state->ss0.writedisable_alpha = 0;
-    src_surf_state->ss0.writedisable_red = 0;
-    src_surf_state->ss0.writedisable_green = 0;
-    src_surf_state->ss0.writedisable_blue = 0;
-    src_surf_state->ss0.color_blend = 1;
-    src_surf_state->ss0.vert_line_stride = 0;
-    src_surf_state->ss0.vert_line_stride_ofs = 0;
-    src_surf_state->ss0.mipmap_layout_mode = 0;
-    src_surf_state->ss0.render_cache_read_mode = 0;
-    src_surf_state->ss2.mip_count = 0;
-    src_surf_state->ss2.render_target_rotation = 0;
-
-    /* mask surface state */
-    mask_surf_state = (void *)(start_base + mask_surf_offset);
-    mask_surf_state->ss0.surface_type = BRW_SURFACE_2D;
-    mask_surf_state->ss0.writedisable_alpha = 0;
-    mask_surf_state->ss0.writedisable_red = 0;
-    mask_surf_state->ss0.writedisable_green = 0;
-    mask_surf_state->ss0.writedisable_blue = 0;
-    mask_surf_state->ss0.color_blend = 1;
-    mask_surf_state->ss0.vert_line_stride = 0;
-    mask_surf_state->ss0.vert_line_stride_ofs = 0;
-    mask_surf_state->ss0.mipmap_layout_mode = 0;
-    mask_surf_state->ss0.render_cache_read_mode = 0;
-    mask_surf_state->ss2.mip_count = 0;
-    mask_surf_state->ss2.render_target_rotation = 0;
-
     /* default color state */
-    default_color_state = &gen4_state->default_color_state;
+    default_color_state = &state->default_color_state;
     default_color_state->color[0] = 0.0; /* R */
     default_color_state->color[1] = 0.0; /* G */
     default_color_state->color[2] = 0.0; /* B */
     default_color_state->color[3] = 0.0; /* A */
 
     /* src sampler state */
-    src_sampler_state = &gen4_state->src_sampler_state;
+    src_sampler_state = &state->src_sampler_state;
     src_sampler_state->ss0.lod_preclamp = 1; /* GL mode */
     src_sampler_state->ss0.default_color_mode = 0; /* GL mode */
     src_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
 
     /* mask sampler state */
-    mask_sampler_state = &gen4_state->mask_sampler_state;
+    mask_sampler_state = &state->mask_sampler_state;
     mask_sampler_state->ss0.lod_preclamp = 1; /* GL mode */
     mask_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
 
     /* vertex shader state */
     /* Set up the vertex shader to be disabled (passthrough) */
-    vs_state = &gen4_state->vs_state;
+    vs_state = &state->vs_state;
     vs_state->thread4.nr_urb_entries = URB_VS_ENTRIES;
     vs_state->thread4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
     vs_state->vs6.vs_enable = 0;
     vs_state->vs6.vert_cache_disable = 1;
 
     /* sf state */
-    sf_state = &gen4_state->sf_state;
+    sf_state = &state->sf_state;
 /*    sf_state->thread0.kernel_start_pointer = sf_kernel_offset >> 6; */
     sf_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
     sf_state->sf1.single_program_flow = 1;
@@ -624,7 +574,7 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     sf_state->sf6.dest_org_hbias = 0x8;
 
     /* wm state */
-    wm_state = &gen4_state->wm_state;
+    wm_state = &state->wm_state;
 /*    wm_state->thread0.kernel_start_pointer = ps_kernel_offset >> 6; */
     wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
     wm_state->thread1.single_program_flow = 1;
@@ -652,29 +602,86 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     wm_state->wm5.early_depth_test = 1;
 
     /* Upload kernels */
-    memcpy (gen4_state->sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
+    memcpy (state->sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
 
-    memcpy (gen4_state->sf_kernel, sf_kernel_static,
+    memcpy (state->sf_kernel, sf_kernel_static,
 	    sizeof (sf_kernel_static));
-    memcpy (gen4_state->sf_kernel_mask, sf_kernel_mask_static,
+    memcpy (state->sf_kernel_mask, sf_kernel_mask_static,
 	    sizeof (sf_kernel_mask_static));
-    memcpy (gen4_state->sf_kernel_rotation, sf_kernel_rotation_static,
+    memcpy (state->sf_kernel_rotation, sf_kernel_rotation_static,
 	    sizeof (sf_kernel_rotation_static));
 
-    memcpy (gen4_state->ps_kernel_nomask, ps_kernel_nomask_static,
+    memcpy (state->ps_kernel_nomask, ps_kernel_nomask_static,
 	    sizeof (ps_kernel_nomask_static));
-    memcpy (gen4_state->ps_kernel_maskca, ps_kernel_maskca_static,
+    memcpy (state->ps_kernel_maskca, ps_kernel_maskca_static,
 	    sizeof (ps_kernel_maskca_static));
-    memcpy (gen4_state->ps_kernel_maskca_srcalpha,
+    memcpy (state->ps_kernel_maskca_srcalpha,
 	    ps_kernel_maskca_srcalpha_static,
 	    sizeof (ps_kernel_maskca_srcalpha_static));
-    memcpy (gen4_state->ps_kernel_masknoca, ps_kernel_masknoca_static,
+    memcpy (state->ps_kernel_masknoca, ps_kernel_masknoca_static,
 	    sizeof (ps_kernel_masknoca_static));
-    memcpy (gen4_state->ps_kernel_rotation, ps_kernel_rotation_static,
+    memcpy (state->ps_kernel_rotation, ps_kernel_rotation_static,
 	    sizeof (ps_kernel_rotation_static));
 }
 
 static void
+i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
+{
+    struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
+    gen4_state_t* gen4_state;
+
+    gen4_state = (void *)(start_base + gen4_state_offset);
+
+    gen4_state_init (gen4_state);
+
+    /* destination surface state */
+    dest_surf_state = (void *)(start_base + dest_surf_offset);
+    dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
+    dest_surf_state->ss0.data_return_format = BRW_SURFACERETURNFORMAT_FLOAT32;
+    dest_surf_state->ss0.writedisable_alpha = 0;
+    dest_surf_state->ss0.writedisable_red = 0;
+    dest_surf_state->ss0.writedisable_green = 0;
+    dest_surf_state->ss0.writedisable_blue = 0;
+    dest_surf_state->ss0.color_blend = 1;
+    dest_surf_state->ss0.vert_line_stride = 0;
+    dest_surf_state->ss0.vert_line_stride_ofs = 0;
+    dest_surf_state->ss0.mipmap_layout_mode = 0;
+    dest_surf_state->ss0.render_cache_read_mode = 0;
+    dest_surf_state->ss2.mip_count = 0;
+    dest_surf_state->ss2.render_target_rotation = 0;
+
+    /* source surface state */
+    src_surf_state = (void *)(start_base + src_surf_offset);
+    src_surf_state->ss0.surface_type = BRW_SURFACE_2D;
+    src_surf_state->ss0.writedisable_alpha = 0;
+    src_surf_state->ss0.writedisable_red = 0;
+    src_surf_state->ss0.writedisable_green = 0;
+    src_surf_state->ss0.writedisable_blue = 0;
+    src_surf_state->ss0.color_blend = 1;
+    src_surf_state->ss0.vert_line_stride = 0;
+    src_surf_state->ss0.vert_line_stride_ofs = 0;
+    src_surf_state->ss0.mipmap_layout_mode = 0;
+    src_surf_state->ss0.render_cache_read_mode = 0;
+    src_surf_state->ss2.mip_count = 0;
+    src_surf_state->ss2.render_target_rotation = 0;
+
+    /* mask surface state */
+    mask_surf_state = (void *)(start_base + mask_surf_offset);
+    mask_surf_state->ss0.surface_type = BRW_SURFACE_2D;
+    mask_surf_state->ss0.writedisable_alpha = 0;
+    mask_surf_state->ss0.writedisable_red = 0;
+    mask_surf_state->ss0.writedisable_green = 0;
+    mask_surf_state->ss0.writedisable_blue = 0;
+    mask_surf_state->ss0.color_blend = 1;
+    mask_surf_state->ss0.vert_line_stride = 0;
+    mask_surf_state->ss0.vert_line_stride_ofs = 0;
+    mask_surf_state->ss0.mipmap_layout_mode = 0;
+    mask_surf_state->ss0.render_cache_read_mode = 0;
+    mask_surf_state->ss2.mip_count = 0;
+    mask_surf_state->ss2.render_target_rotation = 0;
+}
+
+static void
 i965_exastate_reset(struct i965_exastate_buffer *state)
 {
     I830Ptr pI830 = I830PTR(state->pScrn);
commit e971a4dce5c58b8e09c30dfdd8d3b88d515cb243
Author: Carl Worth <cworth at cworth.org>
Date:   Fri Nov 9 16:44:53 2007 -0800

    Add sampler and functional unit states to gen4_state_t

diff --git a/src/i965_render.c b/src/i965_render.c
index ef8059e..9914236 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -269,11 +269,8 @@ static CARD32 *binding_table;
 
 /* these offsets will remain the same for all buffers post allocation */
 static int dest_surf_offset, src_surf_offset, mask_surf_offset;
-static int src_sampler_offset, mask_sampler_offset,vs_offset;
-static int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
-static int wm_scratch_offset;
+static int vb_offset;
 static int binding_table_offset;
-static int default_color_offset;
 static float *vb;
 static int vb_max_size, vb_index;
 static int gen4_state_offset;
@@ -360,6 +357,24 @@ static const CARD32 ps_kernel_rotation_static [][4] = {
 #define KERNEL_DECL(template) \
     CARD32 template [((sizeof (template ## _static) + 63) & ~63) / 16][4];
 typedef struct _gen4_state {
+    char wm_scratch[1024 * PS_MAX_THREADS];
+    struct brw_sampler_state src_sampler_state;
+    PAD64 (brw_sampler_state, 0);
+    struct brw_sampler_state mask_sampler_state;
+    PAD64 (brw_sampler_state, 1);
+    struct brw_sampler_default_color default_color_state;
+    PAD64 (brw_sampler_default_color, 0);
+    struct brw_vs_unit_state vs_state;
+    PAD64 (brw_vs_unit_state, 0);
+    struct brw_sf_unit_state sf_state;
+    PAD64 (brw_sf_unit_state, 0);
+    struct brw_wm_unit_state wm_state;
+    PAD64 (brw_wm_unit_state, 0);
+    struct brw_cc_unit_state cc_state;
+    PAD64 (brw_cc_unit_state, 0);
+    struct brw_cc_viewport cc_viewport;
+    PAD64 (brw_cc_viewport, 0);
+
     KERNEL_DECL (sip_kernel);
 
     KERNEL_DECL (sf_kernel);
@@ -419,32 +434,6 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
     gen4_state_offset = ALIGN(next_offset, 64);
     next_offset = gen4_state_offset + sizeof(gen4_state_t);
 
-    vs_offset = ALIGN(next_offset, 64);
-    next_offset = vs_offset + sizeof(*vs_state);
-
-    sf_offset = ALIGN(next_offset, 32);
-    next_offset = sf_offset + sizeof(*sf_state);
-
-    wm_offset = ALIGN(next_offset, 32);
-    next_offset = wm_offset + sizeof(*wm_state);
-
-    wm_scratch_offset = ALIGN(next_offset, 1024);
-    next_offset = wm_scratch_offset + 1024 * PS_MAX_THREADS;
-
-    cc_offset = ALIGN(next_offset, 32);
-    next_offset = cc_offset + sizeof(struct brw_cc_unit_state);
-
-    /* needed? */
-    cc_viewport_offset = ALIGN(next_offset, 32);
-    next_offset = cc_viewport_offset + sizeof(struct brw_cc_viewport);
-
-    /* for texture sampler */
-    src_sampler_offset = ALIGN(next_offset, 32);
-    next_offset = src_sampler_offset + sizeof(*src_sampler_state);
-    
-    mask_sampler_offset = ALIGN(next_offset, 32);
-    next_offset = mask_sampler_offset + sizeof(*mask_sampler_state);
-
     /* And then the general state: */
     dest_surf_offset = ALIGN(next_offset, 32);
     next_offset = dest_surf_offset + sizeof(*dest_surf_state);
@@ -458,9 +447,6 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
     binding_table_offset = ALIGN(next_offset, 32);
     next_offset = binding_table_offset + (4 * 4);
 
-    default_color_offset = ALIGN(next_offset, 32);
-    next_offset = default_color_offset + sizeof(*default_color_state);
-
     total_state_size = next_offset;
 
     /* Align VB to native size of elements, for safety */
@@ -508,18 +494,22 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     struct brw_cc_unit_state *cc_state;
     struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
     gen4_state_t* gen4_state;
+    int cc_viewport_offset, wm_scratch_offset, src_sampler_offset;
 
-    cc_viewport = (void *)(start_base + cc_viewport_offset);
+    gen4_state = (void *)(start_base + gen4_state_offset);
+
+    cc_viewport = &gen4_state->cc_viewport;
     cc_viewport->min_depth = -1.e35;
     cc_viewport->max_depth = 1.e35;
 
-    cc_state = (void *)(start_base + cc_offset);
+    cc_state = &gen4_state->cc_state;
     cc_state->cc0.stencil_enable = 0;   /* disable stencil */
     cc_state->cc2.depth_test = 0;       /* disable depth test */
     cc_state->cc2.logicop_enable = 0;   /* disable logic op */
     cc_state->cc3.ia_blend_enable = 1;  /* blend alpha just like colors */
     cc_state->cc3.blend_enable = 1;     /* enable color blend */
     cc_state->cc3.alpha_test = 0;       /* disable alpha test */
+    cc_viewport_offset = offsetof (gen4_state_t, cc_viewport);
     cc_state->cc4.cc_viewport_state_offset = cc_viewport_offset >> 5;
     cc_state->cc5.dither_enable = 0;    /* disable dither */
     cc_state->cc5.logicop_func = 0xc;   /* COPY */
@@ -577,33 +567,33 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     mask_surf_state->ss2.render_target_rotation = 0;
 
     /* default color state */
-    default_color_state = (void *)(start_base + default_color_offset);
+    default_color_state = &gen4_state->default_color_state;
     default_color_state->color[0] = 0.0; /* R */
     default_color_state->color[1] = 0.0; /* G */
     default_color_state->color[2] = 0.0; /* B */
     default_color_state->color[3] = 0.0; /* A */
 
     /* src sampler state */
-    src_sampler_state = (void *)(start_base + src_sampler_offset);
+    src_sampler_state = &gen4_state->src_sampler_state;
     src_sampler_state->ss0.lod_preclamp = 1; /* GL mode */
     src_sampler_state->ss0.default_color_mode = 0; /* GL mode */
     src_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
 
     /* mask sampler state */
-    mask_sampler_state = (void *)(start_base + mask_sampler_offset);
+    mask_sampler_state = &gen4_state->mask_sampler_state;
     mask_sampler_state->ss0.lod_preclamp = 1; /* GL mode */
     mask_sampler_state->ss3.chroma_key_enable = 0; /* disable chromakey */
 
     /* vertex shader state */
     /* Set up the vertex shader to be disabled (passthrough) */
-    vs_state = (void *)(start_base + vs_offset);
+    vs_state = &gen4_state->vs_state;
     vs_state->thread4.nr_urb_entries = URB_VS_ENTRIES;
     vs_state->thread4.urb_entry_allocation_size = URB_VS_ENTRY_SIZE - 1;
     vs_state->vs6.vs_enable = 0;
     vs_state->vs6.vert_cache_disable = 1;
 
     /* sf state */
-    sf_state = (void *)(start_base + sf_offset);
+    sf_state = &gen4_state->sf_state;
 /*    sf_state->thread0.kernel_start_pointer = sf_kernel_offset >> 6; */
     sf_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
     sf_state->sf1.single_program_flow = 1;
@@ -634,10 +624,11 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     sf_state->sf6.dest_org_hbias = 0x8;
 
     /* wm state */
-    wm_state = (void *)(start_base + wm_offset);
+    wm_state = &gen4_state->wm_state;
 /*    wm_state->thread0.kernel_start_pointer = ps_kernel_offset >> 6; */
     wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
     wm_state->thread1.single_program_flow = 1;
+    wm_scratch_offset = offsetof (gen4_state_t, wm_scratch);
     wm_state->thread2.scratch_space_base_pointer = wm_scratch_offset>>10;
     wm_state->thread2.per_thread_scratch_space = 0;
     wm_state->thread3.const_urb_entry_read_length = 0;
@@ -648,6 +639,7 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     wm_state->thread3.dispatch_grf_start_reg = 3; /* must match kernel */
 
     wm_state->wm4.stats_enable = 1;  /* statistic */
+    src_sampler_offset = offsetof (gen4_state_t, src_sampler_state);
     wm_state->wm4.sampler_state_pointer = src_sampler_offset >> 5;
     wm_state->wm4.sampler_count = 1; /* 1-4 samplers used */
     wm_state->wm5.max_threads = PS_MAX_THREADS - 1;
@@ -660,7 +652,6 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     wm_state->wm5.early_depth_test = 1;
 
     /* Upload kernels */
-    gen4_state = (void *)(start_base + gen4_state_offset);
     memcpy (gen4_state->sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
 
     memcpy (gen4_state->sf_kernel, sf_kernel_static,
@@ -716,6 +707,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     struct brw_cc_unit_state *cc_state;
     CARD32 *sf_kernel, *ps_kernel;
     int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
+    int default_color_offset;
     char *start_base;
     void *map;
     gen4_state_t *gen4_state;
@@ -781,7 +773,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     vb = (void *)(start_base + vb_offset);
     vb_index = 0;
     /* Color calculator state */
-    cc_state = (void *)(start_base + cc_offset);
+    cc_state = &gen4_state->cc_state;
     i965_get_blend_cntl(op, pMaskPicture, pDstPicture->format,
 			&src_blend, &dst_blend);
     /* XXX: alpha blend factor should be same as color, but check
@@ -860,7 +852,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 	binding_table[2] = 0;
 
     /* PS kernel use this sampler */
-    src_sampler_state = (void *)(start_base + src_sampler_offset);
+    src_sampler_state = &gen4_state->src_sampler_state;
     switch(pSrcPicture->filter) {
     case PictFilterNearest:
    	src_sampler_state->ss0.min_filter = BRW_MAPFILTER_NEAREST;
@@ -878,6 +870,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
    	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
    	src_sampler_state->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
    	src_sampler_state->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP_BORDER;
+	default_color_offset = offsetof (gen4_state_t, default_color_state);
 	src_sampler_state->ss2.default_color_pointer = default_color_offset >> 5;
     } else {
    	src_sampler_state->ss1.r_wrap_mode = BRW_TEXCOORDMODE_WRAP;
@@ -886,7 +879,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     }
 
     if (pMask) {
-	mask_sampler_state = (void *)(start_base + mask_sampler_offset);
+	mask_sampler_state = &gen4_state->mask_sampler_state;
    	switch(pMaskPicture->filter) {
    	case PictFilterNearest:
    	    mask_sampler_state->ss0.min_filter = BRW_MAPFILTER_NEAREST;
@@ -953,7 +946,7 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     sip_kernel_offset = ((char *) gen4_state->sip_kernel -
 			 (char *) gen4_state);
     
-    wm_state = (void *)(start_base + wm_offset);
+    wm_state = &gen4_state->wm_state;
     if (!pMask) {
 	wm_state->thread1.binding_table_entry_count = 2; /* 1 tex and fb */
 	wm_state->thread3.urb_entry_read_length = 1;
@@ -1045,12 +1038,12 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
 	/* Set the pointers to the 3d pipeline state */
    	OUT_BATCH(BRW_3DSTATE_PIPELINED_POINTERS | 5);
-   	OUT_BATCH(vs_offset);  /* 32 byte aligned */
+	OUT_BATCH(offsetof (gen4_state_t, vs_state));  /* 32 byte aligned */
    	OUT_BATCH(BRW_GS_DISABLE);   /* disable GS, resulting in passthrough */
    	OUT_BATCH(BRW_CLIP_DISABLE); /* disable CLIP, resulting in passthrough */
-   	OUT_BATCH(sf_offset);  /* 32 byte aligned */
-   	OUT_BATCH(wm_offset);  /* 32 byte aligned */
-   	OUT_BATCH(cc_offset);  /* 64 byte aligned */
+	OUT_BATCH(offsetof (gen4_state_t, sf_state));  /* 32 byte aligned */
+	OUT_BATCH(offsetof (gen4_state_t, wm_state));  /* 32 byte aligned */
+	OUT_BATCH(offsetof (gen4_state_t, cc_state));  /* 64 byte aligned */
 
 	/* URB fence */
    	OUT_BATCH(BRW_URB_FENCE |
commit 43481975840c95053a4af917078802a25e51fe77
Author: Carl Worth <cworth at cworth.org>
Date:   Fri Nov 9 15:57:10 2007 -0800

    Add gen4_state_t structure
    
    The idea here is to actually use a (padded) C structure instead
    of just a bunch of void* addition with global offset values and
    casts.
    
    So far, we only have the kernels in the structure (and we're even
    doing more copying of kernels than before). But soon enough we'll
    switch to separate "general" and "surface" state structures, which
    will let us avoid doing any kernel copying whatsoever while compositing.

diff --git a/src/i965_render.c b/src/i965_render.c
index 6f5321a..ef8059e 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -271,13 +271,12 @@ static CARD32 *binding_table;
 static int dest_surf_offset, src_surf_offset, mask_surf_offset;
 static int src_sampler_offset, mask_sampler_offset,vs_offset;
 static int sf_offset, wm_offset, cc_offset, vb_offset, cc_viewport_offset;
-static int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
 static int wm_scratch_offset;
 static int binding_table_offset;
 static int default_color_offset;
-//static int next_offset, total_state_size;
 static float *vb;
 static int vb_max_size, vb_index;
+static int gen4_state_offset;
 
 static CARD32 src_blend, dst_blend;
 
@@ -330,14 +329,6 @@ struct i965_kernels {
 
 };
 
-static struct i965_kernels sf_kernels[] = { { sf_kernel_static, sizeof(sf_kernel_static) },
-					    { sf_kernel_mask_static, sizeof(sf_kernel_mask_static) },
-					    { sf_kernel_rotation_static, sizeof(sf_kernel_rotation_static) } };
-
-#define SF_KERNEL 0
-#define SF_KERNEL_MASK 1
-#define SF_KERNEL_ROTATION 2
-
 /* ps kernels */
 #define PS_KERNEL_NUM_GRF   32
 #define PS_MAX_THREADS	   32
@@ -362,17 +353,27 @@ static const CARD32 ps_kernel_rotation_static [][4] = {
 #include "exa_wm_rotation_prog.h"
 };
 
-static struct i965_kernels ps_kernels[] = { { ps_kernel_nomask_static, sizeof(ps_kernel_nomask_static) },
-					    { ps_kernel_maskca_static, sizeof(ps_kernel_maskca_static) },
-					    { ps_kernel_maskca_srcalpha_static, sizeof(ps_kernel_maskca_srcalpha_static) },
-					    { ps_kernel_masknoca_static, sizeof(ps_kernel_masknoca_static) },
- 					    { ps_kernel_rotation_static, sizeof(ps_kernel_rotation_static) } };
+/* Many of the fields in the state structure must be aligned to a
+ * 64-byte boundary, (or a 32-byte boundary, but 64 is good enough for
+ * those too). */
+#define PAD64(previous, idx) char previous ## _pad ## idx [(64 - (sizeof(struct previous) % 64)) % 64]
+#define KERNEL_DECL(template) \
+    CARD32 template [((sizeof (template ## _static) + 63) & ~63) / 16][4];
+typedef struct _gen4_state {
+    KERNEL_DECL (sip_kernel);
+
+    KERNEL_DECL (sf_kernel);
+    KERNEL_DECL (sf_kernel_mask);
+    KERNEL_DECL (sf_kernel_rotation);
+
+    KERNEL_DECL (ps_kernel_nomask);
+    KERNEL_DECL (ps_kernel_maskca);
+    KERNEL_DECL (ps_kernel_maskca_srcalpha);
+    KERNEL_DECL (ps_kernel_masknoca);
+    KERNEL_DECL (ps_kernel_rotation);
+} gen4_state_t;
 
-#define PS_KERNEL_NOMASK 0
-#define PS_KERNEL_MASKCA 1
-#define PS_KERNEL_MASKCA_SRCALPHA 2
-#define PS_KERNEL_MASKNOCA 3
-#define PS_KERNEL_ROTATION 4
+char gen4_state_big_enough[EXA_LINEAR_EXTRA >= sizeof(gen4_state_t) ? 1 : -1];
 
 static CARD32 
 i965_get_card_format(PicturePtr pPict)
@@ -409,12 +410,15 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
 {
     unsigned int next_offset = 0, total_state_size;
     static int init;
-    int tmp;
 
     if (init)
 	return;
 
     init = 1;
+
+    gen4_state_offset = ALIGN(next_offset, 64);
+    next_offset = gen4_state_offset + sizeof(gen4_state_t);
+
     vs_offset = ALIGN(next_offset, 64);
     next_offset = vs_offset + sizeof(*vs_state);
 
@@ -430,31 +434,6 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
     cc_offset = ALIGN(next_offset, 32);
     next_offset = cc_offset + sizeof(struct brw_cc_unit_state);
 
-    sf_kernel_offset = ALIGN(next_offset, 64);
-    tmp = sizeof(sf_kernel_mask_static);
-    if (tmp < sizeof(sf_kernel_rotation_static))
-	tmp = sizeof(sf_kernel_rotation_static);
-    if (tmp < sizeof(sf_kernel_static))
-	tmp = sizeof(sf_kernel_static);
-
-    next_offset = sf_kernel_offset + tmp;
-
-    ps_kernel_offset = ALIGN(next_offset, 64);
-    tmp = sizeof(ps_kernel_maskca_srcalpha_static);
-    if (tmp < sizeof(ps_kernel_maskca_static))
-	tmp = sizeof(ps_kernel_maskca_static);
-    if (tmp < sizeof(ps_kernel_masknoca_static))
-	tmp = sizeof(ps_kernel_masknoca_static);
-    if (tmp < sizeof(ps_kernel_rotation_static))
-	tmp = sizeof(ps_kernel_rotation_static);
-    if (tmp < sizeof(ps_kernel_nomask_static))
-	tmp = sizeof(ps_kernel_nomask_static);
-    
-    next_offset = ps_kernel_offset + tmp;
-
-    sip_kernel_offset = ALIGN(next_offset, 64);
-    next_offset = sip_kernel_offset + sizeof (sip_kernel_static);
-
     /* needed? */
     cc_viewport_offset = ALIGN(next_offset, 32);
     next_offset = cc_viewport_offset + sizeof(struct brw_cc_viewport);
@@ -528,6 +507,7 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     struct brw_cc_viewport *cc_viewport;
     struct brw_cc_unit_state *cc_state;
     struct brw_surface_state *dest_surf_state, *src_surf_state, *mask_surf_state;
+    gen4_state_t* gen4_state;
 
     cc_viewport = (void *)(start_base + cc_viewport_offset);
     cc_viewport->min_depth = -1.e35;
@@ -550,9 +530,6 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     cc_state->cc6.clamp_pre_alpha_blend = 1;
     cc_state->cc6.clamp_range = 0;  /* clamp range [0,1] */
 
-    /* Upload system kernel */
-    memcpy (start_base + sip_kernel_offset, sip_kernel_static, sizeof (sip_kernel_static));
-
     /* destination surface state */
     dest_surf_state = (void *)(start_base + dest_surf_offset);
     dest_surf_state->ss0.surface_type = BRW_SURFACE_2D;
@@ -627,7 +604,7 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
 
     /* sf state */
     sf_state = (void *)(start_base + sf_offset);
-    sf_state->thread0.kernel_start_pointer = sf_kernel_offset >> 6;
+/*    sf_state->thread0.kernel_start_pointer = sf_kernel_offset >> 6; */
     sf_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(SF_KERNEL_NUM_GRF);
     sf_state->sf1.single_program_flow = 1;
     sf_state->sf1.binding_table_entry_count = 0;
@@ -658,7 +635,7 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
 
     /* wm state */
     wm_state = (void *)(start_base + wm_offset);
-    wm_state->thread0.kernel_start_pointer = ps_kernel_offset >> 6;
+/*    wm_state->thread0.kernel_start_pointer = ps_kernel_offset >> 6; */
     wm_state->thread0.grf_reg_count = BRW_GRF_BLOCKS(PS_KERNEL_NUM_GRF);
     wm_state->thread1.single_program_flow = 1;
     wm_state->thread2.scratch_space_base_pointer = wm_scratch_offset>>10;
@@ -681,20 +658,29 @@ i965_init_state_objects(ScrnInfoPtr pScrn, unsigned char *start_base)
     wm_state->wm5.enable_16_pix = 1;
     wm_state->wm5.enable_8_pix = 0;
     wm_state->wm5.early_depth_test = 1;
-}
 
-static void
-i965_update_sf_kernel(ScrnInfoPtr pScrn, char *start_base,
-		      int need_sf_kernel)
-{
-    memcpy(start_base + sf_kernel_offset, sf_kernels[need_sf_kernel].kernel, sf_kernels[need_sf_kernel].size);
-}
-
-static void
-i965_update_ps_kernel(ScrnInfoPtr pScrn, char *start_base,
-		      int need_ps_kernel)
-{
-    memcpy(start_base + ps_kernel_offset, ps_kernels[need_ps_kernel].kernel, ps_kernels[need_ps_kernel].size);
+    /* Upload kernels */
+    gen4_state = (void *)(start_base + gen4_state_offset);
+    memcpy (gen4_state->sip_kernel, sip_kernel_static, sizeof (sip_kernel_static));
+
+    memcpy (gen4_state->sf_kernel, sf_kernel_static,
+	    sizeof (sf_kernel_static));
+    memcpy (gen4_state->sf_kernel_mask, sf_kernel_mask_static,
+	    sizeof (sf_kernel_mask_static));
+    memcpy (gen4_state->sf_kernel_rotation, sf_kernel_rotation_static,
+	    sizeof (sf_kernel_rotation_static));
+
+    memcpy (gen4_state->ps_kernel_nomask, ps_kernel_nomask_static,
+	    sizeof (ps_kernel_nomask_static));
+    memcpy (gen4_state->ps_kernel_maskca, ps_kernel_maskca_static,
+	    sizeof (ps_kernel_maskca_static));
+    memcpy (gen4_state->ps_kernel_maskca_srcalpha,
+	    ps_kernel_maskca_srcalpha_static,
+	    sizeof (ps_kernel_maskca_srcalpha_static));
+    memcpy (gen4_state->ps_kernel_masknoca, ps_kernel_masknoca_static,
+	    sizeof (ps_kernel_masknoca_static));
+    memcpy (gen4_state->ps_kernel_rotation, ps_kernel_rotation_static,
+	    sizeof (ps_kernel_rotation_static));
 }
 
 static void
@@ -728,9 +714,11 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
     CARD32 dst_format, dst_pitch, dst_tile_format = 0, dst_tiled = 0;
     Bool rotation_program = FALSE;
     struct brw_cc_unit_state *cc_state;
-    int need_ps_kernel, need_sf_kernel;
+    CARD32 *sf_kernel, *ps_kernel;
+    int sf_kernel_offset, ps_kernel_offset, sip_kernel_offset;
     char *start_base;
     void *map;
+    gen4_state_t *gen4_state;
 
     if (pI830->use_ttm_batch) {
 	i965_exastate_reset(pI830->exa965);
@@ -741,6 +729,8 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
 
     start_base = map;
 
+    gen4_state = (void *)(start_base + gen4_state_offset);
+
     IntelEmitInvarientState(pScrn);
     *pI830->last_3d = LAST_3D_RENDER;
 
@@ -932,31 +922,36 @@ i965_prepare_composite(int op, PicturePtr pSrcPicture,
      * back to SF which then hands pixels off to WM.
      */
     if (pMask)
-	need_sf_kernel = SF_KERNEL_MASK;
+	sf_kernel = (CARD32 *) gen4_state->sf_kernel_mask;
     else if (rotation_program)
-	need_sf_kernel = SF_KERNEL_ROTATION;
+	sf_kernel = (CARD32 *) gen4_state->sf_kernel_rotation;
     else
-	need_sf_kernel = SF_KERNEL;
-    
-    i965_update_sf_kernel(pScrn, start_base, need_sf_kernel);
+	sf_kernel = (CARD32 *) gen4_state->sf_kernel;
+
+    sf_kernel_offset = (char *) sf_kernel - (char *) gen4_state;
+    sf_state->thread0.kernel_start_pointer = sf_kernel_offset >> 6;
 
     /* Set up the PS kernel (dispatched by WM) */
     if (pMask) {
 	if (pMaskPicture->componentAlpha && 
 	    PICT_FORMAT_RGB(pMaskPicture->format)) {
             if (i965_blend_op[op].src_alpha) 
-		need_ps_kernel = PS_KERNEL_MASKCA_SRCALPHA;
+		ps_kernel = (CARD32 *) gen4_state->ps_kernel_maskca_srcalpha;
             else
-		need_ps_kernel = PS_KERNEL_MASKCA;
+		ps_kernel = (CARD32 *) gen4_state->ps_kernel_maskca;
         } else
-	    need_ps_kernel = PS_KERNEL_MASKNOCA;
+	    ps_kernel = (CARD32 *) gen4_state->ps_kernel_masknoca;
     } else if (rotation_program) {
-	need_ps_kernel = PS_KERNEL_ROTATION;
+	ps_kernel = (CARD32 *) gen4_state->ps_kernel_rotation;
     } else {
-	need_ps_kernel = PS_KERNEL_NOMASK;
+	ps_kernel = (CARD32 *) gen4_state->ps_kernel_nomask;
     }
 
-    i965_update_ps_kernel(pScrn, start_base, need_ps_kernel);
+    ps_kernel_offset = (char *) ps_kernel - (char *) gen4_state;
+    wm_state->thread0.kernel_start_pointer = ps_kernel_offset >> 6;
+
+    sip_kernel_offset = ((char *) gen4_state->sip_kernel -
+			 (char *) gen4_state);
     
     wm_state = (void *)(start_base + wm_offset);
     if (!pMask) {
commit 291a735490d7ec2d2b3c841cda91044675282e67
Author: Carl Worth <cworth at cworth.org>
Date:   Fri Nov 9 11:54:24 2007 -0800

    Cosmetic: Consistently use _static suffix for kernel naming
    
    Previously, _static came in the middle of some kernel names.
    No functionality is changed with this commit.

diff --git a/src/i965_render.c b/src/i965_render.c
index 6ab7c9e..6f5321a 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -316,11 +316,11 @@ static const CARD32 sf_kernel_static[][4] = {
 #include "exa_sf_prog.h"
 };
 
-static const CARD32 sf_kernel_static_mask[][4] = {
+static const CARD32 sf_kernel_mask_static[][4] = {
 #include "exa_sf_mask_prog.h"
 };
 
-static const CARD32 sf_kernel_static_rotation[][4] = {
+static const CARD32 sf_kernel_rotation_static[][4] = {
 #include "exa_sf_rotation_prog.h"
 };
 
@@ -331,8 +331,8 @@ struct i965_kernels {
 };
 
 static struct i965_kernels sf_kernels[] = { { sf_kernel_static, sizeof(sf_kernel_static) },
-					    { sf_kernel_static_mask, sizeof(sf_kernel_static_mask) },
-					    { sf_kernel_static_rotation, sizeof(sf_kernel_static_rotation) } };
+					    { sf_kernel_mask_static, sizeof(sf_kernel_mask_static) },
+					    { sf_kernel_rotation_static, sizeof(sf_kernel_rotation_static) } };
 
 #define SF_KERNEL 0
 #define SF_KERNEL_MASK 1
@@ -342,31 +342,31 @@ static struct i965_kernels sf_kernels[] = { { sf_kernel_static, sizeof(sf_kernel
 #define PS_KERNEL_NUM_GRF   32
 #define PS_MAX_THREADS	   32
 
-static const CARD32 ps_kernel_static_nomask [][4] = {
+static const CARD32 ps_kernel_nomask_static [][4] = {
 #include "exa_wm_nomask_prog.h"
 };
 
-static const CARD32 ps_kernel_static_maskca [][4] = {
+static const CARD32 ps_kernel_maskca_static [][4] = {
 #include "exa_wm_maskca_prog.h"
 };
 
-static const CARD32 ps_kernel_static_maskca_srcalpha [][4] = {
+static const CARD32 ps_kernel_maskca_srcalpha_static [][4] = {
 #include "exa_wm_maskca_srcalpha_prog.h"
 };
 
-static const CARD32 ps_kernel_static_masknoca [][4] = {
+static const CARD32 ps_kernel_masknoca_static [][4] = {
 #include "exa_wm_masknoca_prog.h"
 };
 
-static const CARD32 ps_kernel_static_rotation [][4] = {
+static const CARD32 ps_kernel_rotation_static [][4] = {
 #include "exa_wm_rotation_prog.h"
 };
 
-static struct i965_kernels ps_kernels[] = { { ps_kernel_static_nomask, sizeof(ps_kernel_static_nomask) },
-					    { ps_kernel_static_maskca, sizeof(ps_kernel_static_maskca) },
-					    { ps_kernel_static_maskca_srcalpha, sizeof(ps_kernel_static_maskca_srcalpha) },
-					    { ps_kernel_static_masknoca, sizeof(ps_kernel_static_masknoca) },
- 					    { ps_kernel_static_rotation, sizeof(ps_kernel_static_rotation) } };
+static struct i965_kernels ps_kernels[] = { { ps_kernel_nomask_static, sizeof(ps_kernel_nomask_static) },
+					    { ps_kernel_maskca_static, sizeof(ps_kernel_maskca_static) },
+					    { ps_kernel_maskca_srcalpha_static, sizeof(ps_kernel_maskca_srcalpha_static) },
+					    { ps_kernel_masknoca_static, sizeof(ps_kernel_masknoca_static) },
+ 					    { ps_kernel_rotation_static, sizeof(ps_kernel_rotation_static) } };
 
 #define PS_KERNEL_NOMASK 0
 #define PS_KERNEL_MASKCA 1
@@ -431,24 +431,24 @@ i965_init_state_offsets(ScrnInfoPtr pScrn, int total_size)
     next_offset = cc_offset + sizeof(struct brw_cc_unit_state);
 
     sf_kernel_offset = ALIGN(next_offset, 64);
-    tmp = sizeof(sf_kernel_static_mask);
-    if (tmp < sizeof(sf_kernel_static_rotation))
-	tmp = sizeof(sf_kernel_static_rotation);
+    tmp = sizeof(sf_kernel_mask_static);
+    if (tmp < sizeof(sf_kernel_rotation_static))
+	tmp = sizeof(sf_kernel_rotation_static);
     if (tmp < sizeof(sf_kernel_static))
 	tmp = sizeof(sf_kernel_static);
 
     next_offset = sf_kernel_offset + tmp;
 
     ps_kernel_offset = ALIGN(next_offset, 64);
-    tmp = sizeof(ps_kernel_static_maskca_srcalpha);
-    if (tmp < sizeof(ps_kernel_static_maskca))
-	tmp = sizeof(ps_kernel_static_maskca);
-    if (tmp < sizeof(ps_kernel_static_masknoca))
-	tmp = sizeof(ps_kernel_static_masknoca);
-    if (tmp < sizeof(ps_kernel_static_rotation))
-	tmp = sizeof(ps_kernel_static_rotation);
-    if (tmp < sizeof(ps_kernel_static_nomask))
-	tmp = sizeof(ps_kernel_static_nomask);
+    tmp = sizeof(ps_kernel_maskca_srcalpha_static);
+    if (tmp < sizeof(ps_kernel_maskca_static))
+	tmp = sizeof(ps_kernel_maskca_static);
+    if (tmp < sizeof(ps_kernel_masknoca_static))
+	tmp = sizeof(ps_kernel_masknoca_static);
+    if (tmp < sizeof(ps_kernel_rotation_static))
+	tmp = sizeof(ps_kernel_rotation_static);
+    if (tmp < sizeof(ps_kernel_nomask_static))
+	tmp = sizeof(ps_kernel_nomask_static);
     
     next_offset = ps_kernel_offset + tmp;
 
commit 5ac25a637afe9515673abee4816a1dca9797a35d
Author: Carl Worth <cworth at cworth.org>
Date:   Thu Nov 8 17:04:25 2007 -0800

    Warnings cleanup: Replace [DE]ALLOCATE_LOCAL with xalloc/xfree
    
    The [DE]ALLOCATE_LOCAL wrappers don't even exist anymore.

diff --git a/src/i810_dri.c b/src/i810_dri.c
index e5e1565..58871bb 100644
--- a/src/i810_dri.c
+++ b/src/i810_dri.c
@@ -1231,12 +1231,12 @@ I810DRIMoveBuffers(WindowPtr pParent, DDXPointRec ptOldOrg,
 
       if (nbox > 1) {
 	 /* Keep ordering in each band, reverse order of bands */
-	 pboxNew1 = (BoxPtr) ALLOCATE_LOCAL(sizeof(BoxRec) * nbox);
+	 pboxNew1 = (BoxPtr) xalloc(sizeof(BoxRec) * nbox);
 	 if (!pboxNew1)
 	    return;
-	 pptNew1 = (DDXPointPtr) ALLOCATE_LOCAL(sizeof(DDXPointRec) * nbox);
+	 pptNew1 = (DDXPointPtr) xalloc(sizeof(DDXPointRec) * nbox);
 	 if (!pptNew1) {
-	    DEALLOCATE_LOCAL(pboxNew1);
+	    xfree(pboxNew1);
 	    return;
 	 }
 	 pboxBase = pboxNext = pbox + nbox - 1;
@@ -1267,16 +1267,16 @@ I810DRIMoveBuffers(WindowPtr pParent, DDXPointRec ptOldOrg,
 
       if (nbox > 1) {
 	 /*reverse orderof rects in each band */
-	 pboxNew2 = (BoxPtr) ALLOCATE_LOCAL(sizeof(BoxRec) * nbox);
-	 pptNew2 = (DDXPointPtr) ALLOCATE_LOCAL(sizeof(DDXPointRec) * nbox);
+	 pboxNew2 = (BoxPtr) xalloc(sizeof(BoxRec) * nbox);
+	 pptNew2 = (DDXPointPtr) xalloc(sizeof(DDXPointRec) * nbox);
 	 if (!pboxNew2 || !pptNew2) {
 	    if (pptNew2)
-	       DEALLOCATE_LOCAL(pptNew2);
+	       xfree(pptNew2);
 	    if (pboxNew2)
-	       DEALLOCATE_LOCAL(pboxNew2);
+	       xfree(pboxNew2);
 	    if (pboxNew1) {
-	       DEALLOCATE_LOCAL(pptNew1);
-	       DEALLOCATE_LOCAL(pboxNew1);
+	       xfree(pptNew1);
+	       xfree(pboxNew1);
 	    }
 	    return;
 	 }
@@ -1341,12 +1341,12 @@ I810DRIMoveBuffers(WindowPtr pParent, DDXPointRec ptOldOrg,
    I810EmitFlush(pScrn);
 
    if (pboxNew2) {
-      DEALLOCATE_LOCAL(pptNew2);
-      DEALLOCATE_LOCAL(pboxNew2);
+      xfree(pptNew2);
+      xfree(pboxNew2);
    }
    if (pboxNew1) {
-      DEALLOCATE_LOCAL(pptNew1);
-      DEALLOCATE_LOCAL(pboxNew1);
+      xfree(pptNew1);
+      xfree(pboxNew1);
    }
 
    if (pI810->AccelInfoRec)
diff --git a/src/i830_dri.c b/src/i830_dri.c
index 0102db9..6955ada 100644
--- a/src/i830_dri.c
+++ b/src/i830_dri.c
@@ -1308,12 +1308,12 @@ I830DRIMoveBuffers(WindowPtr pParent, DDXPointRec ptOldOrg,
 
       if (nbox > 1) {
 	 /* Keep ordering in each band, reverse order of bands */
-	 pboxNew1 = (BoxPtr) ALLOCATE_LOCAL(sizeof(BoxRec) * nbox);
+	 pboxNew1 = (BoxPtr) xalloc(sizeof(BoxRec) * nbox);
 	 if (!pboxNew1)
 	    return;
-	 pptNew1 = (DDXPointPtr) ALLOCATE_LOCAL(sizeof(DDXPointRec) * nbox);
+	 pptNew1 = (DDXPointPtr) xalloc(sizeof(DDXPointRec) * nbox);
 	 if (!pptNew1) {
-	    DEALLOCATE_LOCAL(pboxNew1);
+	    xfree(pboxNew1);
 	    return;
 	 }
 	 pboxBase = pboxNext = pbox + nbox - 1;
@@ -1344,16 +1344,16 @@ I830DRIMoveBuffers(WindowPtr pParent, DDXPointRec ptOldOrg,
 
       if (nbox > 1) {
 	 /*reverse orderof rects in each band */
-	 pboxNew2 = (BoxPtr) ALLOCATE_LOCAL(sizeof(BoxRec) * nbox);
-	 pptNew2 = (DDXPointPtr) ALLOCATE_LOCAL(sizeof(DDXPointRec) * nbox);
+	 pboxNew2 = (BoxPtr) xalloc(sizeof(BoxRec) * nbox);
+	 pptNew2 = (DDXPointPtr) xalloc(sizeof(DDXPointRec) * nbox);
 	 if (!pboxNew2 || !pptNew2) {
 	    if (pptNew2)
-	       DEALLOCATE_LOCAL(pptNew2);
+	       xfree(pptNew2);
 	    if (pboxNew2)
-	       DEALLOCATE_LOCAL(pboxNew2);
+	       xfree(pboxNew2);
 	    if (pboxNew1) {
-	       DEALLOCATE_LOCAL(pptNew1);
-	       DEALLOCATE_LOCAL(pboxNew1);
+	       xfree(pptNew1);
+	       xfree(pboxNew1);
 	    }
 	    return;
 	 }
@@ -1424,12 +1424,12 @@ I830DRIMoveBuffers(WindowPtr pParent, DDXPointRec ptOldOrg,
    I830EmitFlush(pScrn);
 
    if (pboxNew2) {
-      DEALLOCATE_LOCAL(pptNew2);
-      DEALLOCATE_LOCAL(pboxNew2);
+      xfree(pptNew2);
+      xfree(pboxNew2);
    }
    if (pboxNew1) {
-      DEALLOCATE_LOCAL(pptNew1);
-      DEALLOCATE_LOCAL(pboxNew1);
+      xfree(pptNew1);
+      xfree(pboxNew1);
    }
    i830MarkSync(pScrn);
 }
commit 9f39f3c9e0a9bbca2a2c8986e9fb2e90bbced5dd
Author: Carl Worth <cworth at cworth.org>
Date:   Thu Nov 8 16:59:17 2007 -0800

    Warning cleanup: missing static leads to warning about missing prototype

diff --git a/src/i965_render.c b/src/i965_render.c
index 142200e..6ab7c9e 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -697,7 +697,7 @@ i965_update_ps_kernel(ScrnInfoPtr pScrn, char *start_base,
     memcpy(start_base + ps_kernel_offset, ps_kernels[need_ps_kernel].kernel, ps_kernels[need_ps_kernel].size);
 }
 
-void
+static void
 i965_exastate_reset(struct i965_exastate_buffer *state)
 {
     I830Ptr pI830 = I830PTR(state->pScrn);
commit e22c6a1f3f87f8aac73715e51bc294d6810b5e78
Author: Carl Worth <cworth at cworth.org>
Date:   Thu Nov 8 16:46:06 2007 -0800

    Warning cleanup: unused variables

diff --git a/src/i830_debug.c b/src/i830_debug.c
index 54dff29..0237182 100644
--- a/src/i830_debug.c
+++ b/src/i830_debug.c
@@ -564,7 +564,7 @@ static void i830DumpAR(ScrnInfoPtr pScrn)
 {
     I830Ptr pI830 = I830PTR(pScrn);
     int i;
-    uint16_t st01, palette_enable = 0;
+    uint16_t st01;
     unsigned char orig_arx, msr;
 
     msr = INREG8(0x3cc);
diff --git a/src/i830_exa.c b/src/i830_exa.c
index 6e1c5bd..1dd7027 100644
--- a/src/i830_exa.c
+++ b/src/i830_exa.c
@@ -407,8 +407,6 @@ static void *I830EXACreatePixmap(ScreenPtr pScreen, int size, int align)
 
 static void I830EXADestroyPixmap(ScreenPtr pScreen, void *driverPriv)
 {
-    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
-    I830Ptr pI830 = I830PTR(pScrn);
     struct i830_exa_pixmap_priv *driver_priv = driverPriv;
 
     if (driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED)
diff --git a/src/intel_batchbuffer.c b/src/intel_batchbuffer.c
index f29f069..bce516c 100644
--- a/src/intel_batchbuffer.c
+++ b/src/intel_batchbuffer.c
@@ -266,9 +266,6 @@ intelddx_batchbuffer_emit_pixmap(PixmapPtr pPixmap, unsigned int flags,
 			      unsigned int mask, ddx_bo *reloc_buf,
 			      unsigned int offset, unsigned int delta)
 {
-    ScreenPtr pScreen = pPixmap->drawable.pScreen;
-    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
-    I830Ptr pI830 = I830PTR(pScrn);
     struct i830_exa_pixmap_priv *driver_priv = exaGetPixmapDriverPrivate(pPixmap);
 
     if (driver_priv->flags & I830_EXA_PIXMAP_IS_MAPPED) {
commit 4c4c370197348f7fc062969c414dd68f1b523eed
Author: Carl Worth <cworth at cworth.org>
Date:   Thu Nov 8 16:44:27 2007 -0800

    Correct confusion of EXASTATE_SZ and EXA_LINEAR_EXTRA
    
    If not using ttm_batch, the state buffer would be allocated
    at one size and initialized at another (fortunately smaller for
    the moment, so nothing bad should have actually happened yet).

diff --git a/src/intel_batchbuffer.h b/src/intel_batchbuffer.h
index 01e2e18..9f1c1c5 100644
--- a/src/intel_batchbuffer.h
+++ b/src/intel_batchbuffer.h
@@ -6,7 +6,7 @@
 struct intel_context;
 
 #define BATCH_SZ 16384
-#define EXASTATE_SZ 48000
+#define EXASTATE_SZ EXA_LINEAR_EXTRA
 #define BATCH_RESERVED 16
 
 struct intelddx_batchbuffer


More information about the xorg-commit mailing list