xf86-video-ati: Branch 'master' - 45 commits

Alex Deucher agd5f at kemper.freedesktop.org
Thu Feb 26 08:48:09 PST 2009


 src/Makefile.am                |    8 
 src/r600_exa.c                 | 4441 +++++++++++++++++++++++++++++++++++++++++
 src/r600_reg.h                 |  132 +
 src/r600_reg_auto_r6xx.h       | 3087 ++++++++++++++++++++++++++++
 src/r600_reg_r6xx.h            |  494 ++++
 src/r600_reg_r7xx.h            |  149 +
 src/r600_shader.h              |  346 +++
 src/r600_state.h               |  229 ++
 src/r600_textured_videofuncs.c |  521 ++++
 src/r6xx_accel.c               | 1160 ++++++++++
 src/radeon.h                   |  166 +
 src/radeon_accel.c             |  119 -
 src/radeon_commonfuncs.c       |   84 
 src/radeon_crtc.c              |    3 
 src/radeon_dri.c               |  201 -
 src/radeon_driver.c            |   49 
 src/radeon_exa.c               |    1 
 src/radeon_exa_render.c        |    2 
 src/radeon_modes.c             |   22 
 src/radeon_reg.h               |   33 
 src/radeon_textured_video.c    |  164 +
 21 files changed, 11177 insertions(+), 234 deletions(-)

New commits:
commit 000756e052a291230e5c95e48b69a5aa9c4fab0e
Merge: 22d7746... 8373f43...
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Feb 26 11:44:13 2009 -0500

    Merge branch 'r6xx-r7xx-support' of git+ssh://agd5f@git.freedesktop.org/git/xorg/driver/xf86-video-ati

diff --cc src/radeon_driver.c
index 8bf2a02,75feca4..1171de4
--- a/src/radeon_driver.c
+++ b/src/radeon_driver.c
@@@ -1879,12 -1876,11 +1879,15 @@@ static Bool RADEONPreInitChipType(ScrnI
  
      /* treat PCIE IGP cards as PCI */
      if (info->cardType == CARD_PCIE && info->IsIGP)
- 		info->cardType = CARD_PCI;
+ 	info->cardType = CARD_PCI;
+ 
+     if ((info->ChipFamily >= CHIP_FAMILY_R600) && info->IsIGP)
+ 	info->cardType = CARD_PCIE;
  
 +    /* not sure about gart table requirements */
 +    if ((info->ChipFamily == CHIP_FAMILY_RS600) && info->IsIGP)
 +	info->cardType = CARD_PCIE;
 +
      if ((s = xf86GetOptValString(info->Options, OPTION_BUS_TYPE))) {
  	if (strcmp(s, "AGP") == 0) {
  	    info->cardType = CARD_AGP;
commit 8373f4399b03961f2c928a9275d47e9f41bd92bb
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 25 09:16:38 2009 -0500

    R6xx/R7xx EXA: same surface and same coords equals nop
    
    should fix bug 20305

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 52b0042..17c5567 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -869,6 +869,9 @@ R600Copy(PixmapPtr pDst,
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
 
+    if (accel_state->same_surface && (srcX == dstX) && (srcY == dstY))
+	return;
+
     if (accel_state->same_surface && is_overlap(srcX, srcX + w, srcY, srcY + h, dstX, dstX + w, dstY, dstY + h)) {
 	if (accel_state->copy_area) {
 	    uint32_t pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
@@ -891,7 +894,7 @@ R600Copy(PixmapPtr pDst,
 	    R600DoCopy(pScrn);
 	} else
 	    R600OverlapCopy(pDst, srcX, srcY, dstX, dstY, w, h);
-    } else if(accel_state->same_surface) {
+    } else if (accel_state->same_surface) {
 	uint32_t pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
 	uint32_t offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
 
commit c74727015453ff3c3d6d06b812ebca9eb19a9767
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Feb 24 11:08:46 2009 -0500

    R6xx/R7xx EXA: init copy_area to NULL

diff --git a/src/r600_exa.c b/src/r600_exa.c
index b4db38d..52b0042 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -4425,6 +4425,7 @@ R600DrawInit(ScreenPtr pScreen)
 	return FALSE;
 
     info->accel_state->XInited3D = FALSE;
+    info->accel_state->copy_area = NULL;
 
     if (!R600LoadShaders(pScrn, pScreen))
 	return FALSE;
commit 95ce13572dc2d9f5dd6cf55c23411e275c0aadf1
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Feb 24 10:51:13 2009 -0500

    R6xx/R7xx EXA: Optimize temp surface for overlapping copies
    
    - allocate temp surface in PrepareCopy()
    - fall back to old OverlapCopy() path if we are not able
    to allocate a temp surface

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 8da0b4d..b4db38d 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -694,13 +694,15 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
     accel_state->planemask = planemask;
 
     if (exaGetPixmapOffset(pSrc) == exaGetPixmapOffset(pDst)) {
+	unsigned long size = pDst->drawable.height * accel_state->dst_pitch * pDst->drawable.bitsPerPixel/8;
 	accel_state->same_surface = TRUE;
 
-#ifdef SHOW_VERTEXES
-	ErrorF("same surface!\n");
-#endif
+	if (accel_state->copy_area) {
+	    exaOffscreenFree(pDst->drawable.pScreen, accel_state->copy_area);
+	    accel_state->copy_area = NULL;
+	}
+	accel_state->copy_area = exaOffscreenAlloc(pDst->drawable.pScreen, size, 256, TRUE, NULL, NULL);
     } else {
-
 	accel_state->same_surface = FALSE;
 
 	R600DoPrepareCopy(pScrn,
@@ -868,29 +870,27 @@ R600Copy(PixmapPtr pDst,
     struct radeon_accel_state *accel_state = info->accel_state;
 
     if (accel_state->same_surface && is_overlap(srcX, srcX + w, srcY, srcY + h, dstX, dstX + w, dstY, dstY + h)) {
-	uint32_t pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
-	uint32_t orig_offset, tmp_offset;
+	if (accel_state->copy_area) {
+	    uint32_t pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
+	    uint32_t orig_offset, tmp_offset;
 
-	if(!(accel_state->copy_area)) {
-	    unsigned long size=pDst->drawable.height*pitch*pDst->drawable.bitsPerPixel/8;
-	    accel_state->copy_area=exaOffscreenAlloc(pDst->drawable.pScreen, size, 256, TRUE, NULL, NULL);
-	}
-
-	tmp_offset = accel_state->copy_area->offset + info->fbLocation + pScrn->fbOffset;
-	orig_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+	    tmp_offset = accel_state->copy_area->offset + info->fbLocation + pScrn->fbOffset;
+	    orig_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
 
-	R600DoPrepareCopy(pScrn,
-			  pitch, pDst->drawable.width, pDst->drawable.height, orig_offset, pDst->drawable.bitsPerPixel,
-			  pitch,                       pDst->drawable.height, tmp_offset, pDst->drawable.bitsPerPixel,
-			  accel_state->rop, accel_state->planemask);
-	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
-	R600DoCopy(pScrn);
-	R600DoPrepareCopy(pScrn,
-			  pitch, pDst->drawable.width, pDst->drawable.height, tmp_offset, pDst->drawable.bitsPerPixel,
-			  pitch,                       pDst->drawable.height, orig_offset, pDst->drawable.bitsPerPixel,
-			  accel_state->rop, accel_state->planemask);
-	R600AppendCopyVertex(pScrn, dstX, dstY, dstX, dstY, w, h);
-	R600DoCopy(pScrn);
+	    R600DoPrepareCopy(pScrn,
+			      pitch, pDst->drawable.width, pDst->drawable.height, orig_offset, pDst->drawable.bitsPerPixel,
+			      pitch,                       pDst->drawable.height, tmp_offset, pDst->drawable.bitsPerPixel,
+			      accel_state->rop, accel_state->planemask);
+	    R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
+	    R600DoCopy(pScrn);
+	    R600DoPrepareCopy(pScrn,
+			      pitch, pDst->drawable.width, pDst->drawable.height, tmp_offset, pDst->drawable.bitsPerPixel,
+			      pitch,                       pDst->drawable.height, orig_offset, pDst->drawable.bitsPerPixel,
+			      accel_state->rop, accel_state->planemask);
+	    R600AppendCopyVertex(pScrn, dstX, dstY, dstX, dstY, w, h);
+	    R600DoCopy(pScrn);
+	} else
+	    R600OverlapCopy(pDst, srcX, srcY, dstX, dstY, w, h);
     } else if(accel_state->same_surface) {
 	uint32_t pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
 	uint32_t offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
@@ -914,12 +914,12 @@ R600DoneCopy(PixmapPtr pDst)
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
 
-    if(!(accel_state->same_surface))
+    if (!accel_state->same_surface)
 	R600DoCopy(pScrn);
 
     if (accel_state->copy_area) {
 	exaOffscreenFree(pDst->drawable.pScreen, accel_state->copy_area);
-	accel_state->copy_area=NULL;
+	accel_state->copy_area = NULL;
     }
 
 }
commit 1a7db3fc2a0277d724d60d028064d8ef75019c28
Author: Mark van Doesburg <mark.vandoesburg at hetnet.nl>
Date:   Tue Feb 24 10:44:19 2009 -0500

    R6xx/R7xx EXA: use a temp surface for overlapping copy

diff --git a/src/r600_exa.c b/src/r600_exa.c
index a252fb6..8da0b4d 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -867,11 +867,44 @@ R600Copy(PixmapPtr pDst,
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
 
-    //blit to/from same surfacce
-    if (accel_state->same_surface)
-	R600OverlapCopy(pDst, srcX, srcY, dstX, dstY, w, h);
-    else
+    if (accel_state->same_surface && is_overlap(srcX, srcX + w, srcY, srcY + h, dstX, dstX + w, dstY, dstY + h)) {
+	uint32_t pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
+	uint32_t orig_offset, tmp_offset;
+
+	if(!(accel_state->copy_area)) {
+	    unsigned long size=pDst->drawable.height*pitch*pDst->drawable.bitsPerPixel/8;
+	    accel_state->copy_area=exaOffscreenAlloc(pDst->drawable.pScreen, size, 256, TRUE, NULL, NULL);
+	}
+
+	tmp_offset = accel_state->copy_area->offset + info->fbLocation + pScrn->fbOffset;
+	orig_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+
+	R600DoPrepareCopy(pScrn,
+			  pitch, pDst->drawable.width, pDst->drawable.height, orig_offset, pDst->drawable.bitsPerPixel,
+			  pitch,                       pDst->drawable.height, tmp_offset, pDst->drawable.bitsPerPixel,
+			  accel_state->rop, accel_state->planemask);
 	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
+	R600DoCopy(pScrn);
+	R600DoPrepareCopy(pScrn,
+			  pitch, pDst->drawable.width, pDst->drawable.height, tmp_offset, pDst->drawable.bitsPerPixel,
+			  pitch,                       pDst->drawable.height, orig_offset, pDst->drawable.bitsPerPixel,
+			  accel_state->rop, accel_state->planemask);
+	R600AppendCopyVertex(pScrn, dstX, dstY, dstX, dstY, w, h);
+	R600DoCopy(pScrn);
+    } else if(accel_state->same_surface) {
+	uint32_t pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
+	uint32_t offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+
+	R600DoPrepareCopy(pScrn,
+			  pitch, pDst->drawable.width, pDst->drawable.height, offset, pDst->drawable.bitsPerPixel,
+			  pitch,                       pDst->drawable.height, offset, pDst->drawable.bitsPerPixel,
+			  accel_state->rop, accel_state->planemask);
+	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
+	R600DoCopy(pScrn);
+    } else {
+	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
+    }
+
 }
 
 static void
@@ -881,10 +914,14 @@ R600DoneCopy(PixmapPtr pDst)
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
 
-    if (accel_state->same_surface)
-	return;
-    else
+    if(!(accel_state->same_surface))
 	R600DoCopy(pScrn);
+
+    if (accel_state->copy_area) {
+	exaOffscreenFree(pDst->drawable.pScreen, accel_state->copy_area);
+	accel_state->copy_area=NULL;
+    }
+
 }
 
 #define RADEON_TRACE_FALL 0
diff --git a/src/radeon.h b/src/radeon.h
index aa9dc46..2edad51 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -684,6 +684,7 @@ struct radeon_accel_state {
     drmBufPtr         scratch;
 
     // copy
+    ExaOffscreenArea  *copy_area;
     Bool              same_surface;
     int               rop;
     uint32_t          planemask;
commit 27f8ca2cce65be2bcb3375231886d5444d251808
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Feb 19 21:18:07 2009 -0500

    R6xx/R7xx: add wait for idle MMIO path

diff --git a/src/radeon_accel.c b/src/radeon_accel.c
index 2b17cd1..dffbc57 100644
--- a/src/radeon_accel.c
+++ b/src/radeon_accel.c
@@ -93,6 +93,7 @@
 				/* X and server generic header files */
 #include "xf86.h"
 
+static void R600EngineReset(ScrnInfoPtr pScrn);
 
 #ifdef USE_XAA
 static struct {
@@ -150,6 +151,37 @@ void RADEONWaitForFifoFunction(ScrnInfoPtr pScrn, int entries)
     }
 }
 
+void R600WaitForFifoFunction(ScrnInfoPtr pScrn, int entries)
+{
+    RADEONInfoPtr  info       = RADEONPTR(pScrn);
+    unsigned char *RADEONMMIO = info->MMIO;
+    int            i;
+
+    for (;;) {
+	for (i = 0; i < RADEON_TIMEOUT; i++) {
+	    if (info->ChipFamily >= CHIP_FAMILY_RV770)
+		info->accel_state->fifo_slots =
+		    INREG(R600_GRBM_STATUS) & R700_CMDFIFO_AVAIL_MASK;
+	    else
+		info->accel_state->fifo_slots =
+		    INREG(R600_GRBM_STATUS) & R600_CMDFIFO_AVAIL_MASK;
+	    if (info->accel_state->fifo_slots >= entries) return;
+	}
+	xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, RADEON_LOGLEVEL_DEBUG,
+		       "FIFO timed out: stat=0x%08x\n",
+		       (unsigned int)INREG(R600_GRBM_STATUS));
+	xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+		   "FIFO timed out, resetting engine...\n");
+	R600EngineReset(pScrn);
+#ifdef XF86DRI
+	if (info->directRenderingEnabled) {
+	    RADEONCP_RESET(pScrn, info);
+	    RADEONCP_START(pScrn, info);
+	}
+#endif
+    }
+}
+
 /* Flush all dirty data in the Pixel Cache to memory */
 void RADEONEngineFlush(ScrnInfoPtr pScrn)
 {
@@ -307,7 +339,7 @@ void RADEONEngineReset(ScrnInfoPtr pScrn)
 }
 
 /* Reset graphics card to known state */
-void R600EngineReset(ScrnInfoPtr pScrn)
+static void R600EngineReset(ScrnInfoPtr pScrn)
 {
     RADEONInfoPtr  info       = RADEONPTR(pScrn);
     unsigned char *RADEONMMIO = info->MMIO;
diff --git a/src/radeon_commonfuncs.c b/src/radeon_commonfuncs.c
index d69a9d8..f7a1a60 100644
--- a/src/radeon_commonfuncs.c
+++ b/src/radeon_commonfuncs.c
@@ -746,39 +746,56 @@ void FUNC_NAME(RADEONWaitForIdle)(ScrnInfoPtr pScrn)
     }
 #endif
 
-#if 0
-    xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, RADEON_LOGLEVEL_DEBUG,
-		   "WaitForIdle (entering): %d entries, stat=0x%08x\n",
-		   INREG(RADEON_RBBM_STATUS) & RADEON_RBBM_FIFOCNT_MASK,
-		   INREG(RADEON_RBBM_STATUS));
-#endif
-
-    if (info->ChipFamily >= CHIP_FAMILY_R600)
-	return;
-
-    /* Wait for the engine to go idle */
-    RADEONWaitForFifoFunction(pScrn, 64);
+    if (info->ChipFamily >= CHIP_FAMILY_R600) {
+	/* Wait for the engine to go idle */
+	if (info->ChipFamily >= CHIP_FAMILY_RV770)
+	    R600WaitForFifoFunction(pScrn, 8);
+	else
+	    R600WaitForFifoFunction(pScrn, 16);
 
-    for (;;) {
-	for (i = 0; i < RADEON_TIMEOUT; i++) {
-	    if (!(INREG(RADEON_RBBM_STATUS) & RADEON_RBBM_ACTIVE)) {
-		RADEONEngineFlush(pScrn);
-		return;
+	for (;;) {
+	    for (i = 0; i < RADEON_TIMEOUT; i++) {
+		if (!(INREG(R600_GRBM_STATUS) & R600_GUI_ACTIVE))
+		    return;
 	    }
-	}
-	xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, RADEON_LOGLEVEL_DEBUG,
-		       "Idle timed out: %u entries, stat=0x%08x\n",
-		       (unsigned int)INREG(RADEON_RBBM_STATUS) & RADEON_RBBM_FIFOCNT_MASK,
-		       (unsigned int)INREG(RADEON_RBBM_STATUS));
-	xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
-		   "Idle timed out, resetting engine...\n");
-	RADEONEngineReset(pScrn);
-	RADEONEngineRestore(pScrn);
+	    xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, RADEON_LOGLEVEL_DEBUG,
+			   "Idle timed out: stat=0x%08x\n",
+			   (unsigned int)INREG(R600_GRBM_STATUS));
+	    xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+		       "Idle timed out, resetting engine...\n");
+	    R600EngineReset(pScrn);
 #ifdef XF86DRI
-	if (info->directRenderingEnabled) {
-	    RADEONCP_RESET(pScrn, info);
-	    RADEONCP_START(pScrn, info);
+	    if (info->directRenderingEnabled) {
+		RADEONCP_RESET(pScrn, info);
+		RADEONCP_START(pScrn, info);
+	    }
+#endif
 	}
+    } else {
+	/* Wait for the engine to go idle */
+	RADEONWaitForFifoFunction(pScrn, 64);
+
+	for (;;) {
+	    for (i = 0; i < RADEON_TIMEOUT; i++) {
+		if (!(INREG(RADEON_RBBM_STATUS) & RADEON_RBBM_ACTIVE)) {
+		    RADEONEngineFlush(pScrn);
+		    return;
+		}
+	    }
+	    xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, RADEON_LOGLEVEL_DEBUG,
+			   "Idle timed out: %u entries, stat=0x%08x\n",
+			   (unsigned int)INREG(RADEON_RBBM_STATUS) & RADEON_RBBM_FIFOCNT_MASK,
+			   (unsigned int)INREG(RADEON_RBBM_STATUS));
+	    xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
+		       "Idle timed out, resetting engine...\n");
+	    RADEONEngineReset(pScrn);
+	    RADEONEngineRestore(pScrn);
+#ifdef XF86DRI
+	    if (info->directRenderingEnabled) {
+		RADEONCP_RESET(pScrn, info);
+		RADEONCP_START(pScrn, info);
+	    }
 #endif
+	}
     }
 }
diff --git a/src/radeon_reg.h b/src/radeon_reg.h
index 7f0281a..4d743a4 100644
--- a/src/radeon_reg.h
+++ b/src/radeon_reg.h
@@ -5368,10 +5368,15 @@
 #define R500_DYN_SCLK_PWMEM_PIPE                        0x000d /* PLL */
 
 /* r6xx/r7xx stuff */
+#define R600_GRBM_STATUS                                   	   0x8010
+#       define R600_CMDFIFO_AVAIL_MASK                             0x1f
+#       define R700_CMDFIFO_AVAIL_MASK                             0xf
+#       define R600_GUI_ACTIVE                                     (1 << 31)
+
 #define R600_GRBM_SOFT_RESET                                    0x8020
 #       define R600_SOFT_RESET_CP                               (1 << 0)
 
-#define R600_WAIT_UNTIL                                  0x8040
+#define R600_WAIT_UNTIL                                         0x8040
 
 #define R600_CP_ME_CNTL                                         0x86d8
 #       define R600_CP_ME_HALT                                  (1 << 28)
commit e6475282486f4895bc68f6b093ecbb1aa6d25f72
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 18 20:21:13 2009 -0500

    R6xx/R7xx Xv: fix some missing bits from last commit

diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index c2b0e75..f03fb7d 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -44,7 +44,7 @@
 
 #include "damage.h"
 
-void
+static void
 R600DoneTexturedVideo(ScrnInfoPtr pScrn)
 {
     RADEONInfoPtr info = RADEONPTR(pScrn);
@@ -462,7 +462,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     while (nBox--) {
 	int srcX, srcY, srcw, srch;
 	int dstX, dstY, dstw, dsth;
-	struct r6xx_copy_vertex *xv_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+	struct r6xx_copy_vertex *xv_vb;
 	struct r6xx_copy_vertex vertex[3];
 
 	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
@@ -471,6 +471,8 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	    accel_state->ib = RADEONCPGetBuffer(pScrn);
 	}
 
+	xv_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+
 	dstX = pBox->x1 + dstxoff;
 	dstY = pBox->y1 + dstyoff;
 	dstw = pBox->x2 - pBox->x1;
commit adff8906c9899dde7711382577a63f4a726437ca
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 18 20:01:57 2009 -0500

    R6xx/R7xx EXA/Xv: properly deal with running out of vertex buffer space
    
    As noted by mhopf, if VGT_MAX/MIN_INDX, etc. regs change, you need to re-emit
    CB blocks to avoid a hang.  So, just set the VGT_MAX_INDX to a reasonably large value
    in the default state and don't touch them when drawing.  When we run out of VB space,
    just draw the current buffer, grab a new one, and continue.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 5b17dcb..a252fb6 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -79,6 +79,13 @@ uint32_t RADEON_ROP[16] = {
     RADEON_ROP3_ONE,  /* GXset          */
 };
 
+static void
+R600DoneSolid(PixmapPtr pPix);
+
+static void
+R600DoneComposite(PixmapPtr pDst);
+
+
 static Bool
 R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
 {
@@ -260,13 +267,16 @@ R600Solid(PixmapPtr pPix, int x1, int y1, int x2, int y2)
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
     struct r6xx_solid_vertex vertex[3];
-    struct r6xx_solid_vertex *solid_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+    struct r6xx_solid_vertex *solid_vb;
 
     if (((accel_state->vb_index + 3) * 8) > (accel_state->ib->total / 2)) {
-	ErrorF("Solid: Ran out of VB space!\n");
-	return;
+	R600DoneSolid(pPix);
+	accel_state->vb_index = 0;
+	accel_state->ib = RADEONCPGetBuffer(pScrn);
     }
 
+    solid_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+
     vertex[0].x = (float)x1;
     vertex[0].y = (float)y1;
 
@@ -335,13 +345,6 @@ R600DoneSolid(PixmapPtr pPix)
     draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
     draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
 
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
-
-    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
-    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
-    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
-
     draw_auto(pScrn, accel_state->ib, &draw_conf);
 
     wait_3d_idle_clean(pScrn, accel_state->ib);
@@ -581,13 +584,6 @@ R600DoCopy(ScrnInfoPtr pScrn)
     draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
     draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
 
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
-
-    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
-    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
-    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
-
     draw_auto(pScrn, accel_state->ib, &draw_conf);
 
     wait_3d_idle_clean(pScrn, accel_state->ib);
@@ -611,17 +607,9 @@ R600AppendCopyVertex(ScrnInfoPtr pScrn,
     struct r6xx_copy_vertex vertex[3];
 
     if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
-	//ErrorF("Copy: Ran out of VB space!\n");
-	// emit the old VB
 	R600DoCopy(pScrn);
-	// start a new one
-	R600DoPrepareCopy(pScrn,
-			  accel_state->src_pitch[0], accel_state->src_width[0], accel_state->src_height[0],
-			  accel_state->src_mc_addr[0], accel_state->src_bpp[0],
-			  accel_state->dst_pitch, accel_state->dst_height,
-			  accel_state->dst_mc_addr, accel_state->dst_bpp,
-			  accel_state->rop, accel_state->planemask);
-
+	accel_state->vb_index = 0;
+	accel_state->ib = RADEONCPGetBuffer(pScrn);
     }
 
     copy_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
@@ -1950,16 +1938,18 @@ static void R600Composite(PixmapPtr pDst,
     }
 
     if (accel_state->has_mask) {
-	struct r6xx_comp_mask_vertex *comp_vb =
-	    (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+	struct r6xx_comp_mask_vertex *comp_vb;
 	struct r6xx_comp_mask_vertex vertex[3];
 	xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
 
 	if (((accel_state->vb_index + 3) * 24) > (accel_state->ib->total / 2)) {
-	    ErrorF("Composite: Ran out of VB space!\n");
-	    return;
+	    R600DoneComposite(pDst);
+	    accel_state->vb_index = 0;
+	    accel_state->ib = RADEONCPGetBuffer(pScrn);
 	}
 
+	comp_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+
 	maskTopLeft.x     = IntToxFixed(maskX);
 	maskTopLeft.y     = IntToxFixed(maskY);
 	maskTopRight.x    = IntToxFixed(maskX + w);
@@ -2012,15 +2002,17 @@ static void R600Composite(PixmapPtr pDst,
 	comp_vb[accel_state->vb_index++] = vertex[2];
 
     } else {
-	struct r6xx_comp_vertex *comp_vb =
-	    (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+	struct r6xx_comp_vertex *comp_vb;
 	struct r6xx_comp_vertex vertex[3];
 
 	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
-	    ErrorF("Composite: Ran out of VB space!\n");
-	    return;
+	    R600DoneComposite(pDst);
+	    accel_state->vb_index = 0;
+	    accel_state->ib = RADEONCPGetBuffer(pScrn);
 	}
 
+	comp_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+
 	vertex[0].x = (float)dstX;
 	vertex[0].y = (float)dstY;
 	vertex[0].src_s = xFixedToFloat(srcTopLeft.x) / accel_state->texW[0];
@@ -2106,13 +2098,6 @@ static void R600DoneComposite(PixmapPtr pDst)
     draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
     draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
 
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
-
-    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
-    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
-    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
-
     draw_auto(pScrn, accel_state->ib, &draw_conf);
 
     wait_3d_idle_clean(pScrn, accel_state->ib);
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index aca6412..c2b0e75 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -44,6 +44,61 @@
 
 #include "damage.h"
 
+void
+R600DoneTexturedVideo(ScrnInfoPtr pScrn)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    draw_config_t   draw_conf;
+    vtx_resource_t  vtx_res;
+
+    CLEAR (draw_conf);
+    CLEAR (vtx_res);
+
+    if (accel_state->vb_index == 0) {
+	R600IBDiscard(pScrn, accel_state->ib);
+	return;
+    }
+
+    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
+	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2);
+    accel_state->vb_size = accel_state->vb_index * 16;
+
+    /* flush vertex cache */
+    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
+	(info->ChipFamily == CHIP_FAMILY_RV620) ||
+	(info->ChipFamily == CHIP_FAMILY_RS780) ||
+	(info->ChipFamily == CHIP_FAMILY_RV710))
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+    else
+	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+
+    /* Vertex buffer setup */
+    vtx_res.id              = SQ_VTX_RESOURCE_vs;
+    vtx_res.vtx_size_dw     = 16 / 4;
+    vtx_res.vtx_num_entries = accel_state->vb_size / 4;
+    vtx_res.mem_req_size    = 1;
+    vtx_res.vb_addr         = accel_state->vb_mc_addr;
+    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
+
+    draw_conf.prim_type          = DI_PT_RECTLIST;
+    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
+    draw_conf.num_instances      = 1;
+    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
+    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
+
+    draw_auto(pScrn, accel_state->ib, &draw_conf);
+
+    wait_3d_idle_clean(pScrn, accel_state->ib);
+
+    /* sync destination surface */
+    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
+			accel_state->dst_size, accel_state->dst_mc_addr);
+
+    R600CPFlushIndirect(pScrn, accel_state->ib);
+}
 
 void
 R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
@@ -58,8 +113,6 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     tex_resource_t  tex_res;
     tex_sampler_t   tex_samp;
     shader_config_t vs_conf, ps_conf;
-    draw_config_t   draw_conf;
-    vtx_resource_t  vtx_res;
     int uv_offset;
 
     static float ps_alu_consts[] = {
@@ -80,8 +133,6 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     CLEAR (tex_samp);
     CLEAR (vs_conf);
     CLEAR (ps_conf);
-    CLEAR (draw_conf);
-    CLEAR (vtx_res);
 
     accel_state->dst_pitch = exaGetPixmapPitch(pPixmap) / (pPixmap->drawable.bitsPerPixel / 8);
     accel_state->src_pitch[0] = pPriv->src_pitch;
@@ -415,8 +466,9 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	struct r6xx_copy_vertex vertex[3];
 
 	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
-	    ErrorF("Xv: Ran out of VB space!\n");
-	    break;
+	    R600DoneTexturedVideo(pScrn);
+	    accel_state->vb_index = 0;
+	    accel_state->ib = RADEONCPGetBuffer(pScrn);
 	}
 
 	dstX = pBox->x1 + dstxoff;
@@ -461,57 +513,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	pBox++;
     }
 
-    if (accel_state->vb_index == 0) {
-	R600IBDiscard(pScrn, accel_state->ib);
-	DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
-	return;
-    }
-
-    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
-	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2);
-    accel_state->vb_size = accel_state->vb_index * 16;
-
-    /* flush vertex cache */
-    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
-	(info->ChipFamily == CHIP_FAMILY_RV620) ||
-	(info->ChipFamily == CHIP_FAMILY_RS780) ||
-	(info->ChipFamily == CHIP_FAMILY_RV710))
-	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
-    else
-	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
-
-    /* Vertex buffer setup */
-    vtx_res.id              = SQ_VTX_RESOURCE_vs;
-    vtx_res.vtx_size_dw     = 16 / 4;
-    vtx_res.vtx_num_entries = accel_state->vb_size / 4;
-    vtx_res.mem_req_size    = 1;
-    vtx_res.vb_addr         = accel_state->vb_mc_addr;
-    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
-
-    draw_conf.prim_type          = DI_PT_RECTLIST;
-    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
-    draw_conf.num_instances      = 1;
-    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
-    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
-
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
-
-    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
-    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
-    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
-
-    draw_auto(pScrn, accel_state->ib, &draw_conf);
-
-    wait_3d_idle_clean(pScrn, accel_state->ib);
-
-    /* sync destination surface */
-    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
-			accel_state->dst_size, accel_state->dst_mc_addr);
-
-    R600CPFlushIndirect(pScrn, accel_state->ib);
+    R600DoneTexturedVideo(pScrn);
 
     DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
 }
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index bebab88..267a7b0 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -1058,7 +1058,7 @@ set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
     fs_setup(pScrn, ib, &fs_conf);
 
     // VGT
-    ereg  (ib, VGT_MAX_VTX_INDX,                    0);
+    ereg  (ib, VGT_MAX_VTX_INDX,                    2048); /* XXX set to a reasonably large number of indices */
     ereg  (ib, VGT_MIN_VTX_INDX,                    0);
     ereg  (ib, VGT_INDX_OFFSET,                     0);
     ereg  (ib, VGT_INSTANCE_STEP_RATE_0,            0);
commit cf85d4a1d43a3209c7ca9307aede2c2c243f7130
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 18 19:02:56 2009 -0500

    R6xx/R7xx: reset 3D state after VT switch

diff --git a/src/radeon_driver.c b/src/radeon_driver.c
index b4df090..75feca4 100644
--- a/src/radeon_driver.c
+++ b/src/radeon_driver.c
@@ -5536,6 +5536,9 @@ Bool RADEONEnterVT(int scrnIndex, int flags)
     if (info->accelOn && (info->ChipFamily < CHIP_FAMILY_R600))
 	RADEONEngineRestore(pScrn);
 
+    if (info->accelOn && info->accel_state)
+	info->accel_state->XInited3D = FALSE;
+
 #ifdef XF86DRI
     if (info->directRenderingEnabled) {
 	RADEONCP_START(pScrn, info);
commit 6545f0a2967414fa518a78440060b8b69c0146ee
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 18 18:15:18 2009 -0500

    R6xx/R7xx: fixup accel paths

diff --git a/src/radeon.h b/src/radeon.h
index bad55bf..aa9dc46 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -1312,9 +1312,8 @@ do {									\
     if (RADEON_VERBOSE)							\
 	xf86DrvMsg(pScrn->scrnIndex, X_INFO,				\
 		   "FLUSH_RING in %s\n", __FUNCTION__);			\
-    if (info->cp->indirectBuffer) {					\
+    if (info->cp->indirectBuffer)					\
 	RADEONCPFlushIndirect(pScrn, 0);				\
-    }									\
 } while (0)
 
 
@@ -1331,16 +1330,13 @@ do {									\
 
 #define RADEON_WAIT_UNTIL_3D_IDLE()					\
 do {									\
-    BEGIN_RING(2);							\
-    if (info->ChipFamily >= CHIP_FAMILY_R600) {                         \
-	OUT_RING(CP_PACKET0(R600_WAIT_UNTIL, 0));                       \
-	OUT_RING((RADEON_WAIT_3D_IDLECLEAN));                           \
-    } else {                                                            \
+    if (info->ChipFamily < CHIP_FAMILY_R600) {				\
+	BEGIN_RING(2);							\
 	OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));                     \
 	OUT_RING((RADEON_WAIT_3D_IDLECLEAN |                            \
 		  RADEON_WAIT_HOST_IDLECLEAN));                         \
+	ADVANCE_RING();							\
     }                                                                   \
-    ADVANCE_RING();							\
 } while (0)
 
 #define RADEON_WAIT_UNTIL_IDLE()					\
@@ -1349,33 +1345,29 @@ do {									\
 	xf86DrvMsg(pScrn->scrnIndex, X_INFO,				\
 		   "WAIT_UNTIL_IDLE() in %s\n", __FUNCTION__);		\
     }									\
-    BEGIN_RING(2);							\
-    if (info->ChipFamily >= CHIP_FAMILY_R600) {                         \
-	OUT_RING(CP_PACKET0(R600_WAIT_UNTIL, 0));                       \
-	OUT_RING((RADEON_WAIT_3D_IDLECLEAN));                           \
-    } else {                                                            \
+    if (info->ChipFamily < CHIP_FAMILY_R600) {                          \
+	BEGIN_RING(2);							\
 	OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));                     \
 	OUT_RING((RADEON_WAIT_2D_IDLECLEAN |                            \
                   RADEON_WAIT_3D_IDLECLEAN |                            \
 		  RADEON_WAIT_HOST_IDLECLEAN));                         \
+	ADVANCE_RING();							\
     }                                                                   \
-    ADVANCE_RING();							\
 } while (0)
 
 #define RADEON_PURGE_CACHE()						\
 do {									\
-    BEGIN_RING(2);							\
-    if (info->ChipFamily >= CHIP_FAMILY_R600) {                         \
-	OUT_RING(CP_PACKET3(IT_EVENT_WRITE, 0));                        \
-	OUT_RING(CACHE_FLUSH_AND_INV_EVENT);                            \
-    } else if (info->ChipFamily <= CHIP_FAMILY_RV280) {                 \
-        OUT_RING(CP_PACKET0(RADEON_RB3D_DSTCACHE_CTLSTAT, 0));		\
-        OUT_RING(RADEON_RB3D_DC_FLUSH_ALL);				\
-    } else {                                                            \
-        OUT_RING(CP_PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));		\
-        OUT_RING(R300_RB3D_DC_FLUSH_ALL);				\
-    }                                                                   \
-    ADVANCE_RING();							\
+    if (info->ChipFamily < CHIP_FAMILY_R600) {				\
+	BEGIN_RING(2);							\
+	if (info->ChipFamily <= CHIP_FAMILY_RV280) {			\
+	    OUT_RING(CP_PACKET0(RADEON_RB3D_DSTCACHE_CTLSTAT, 0));	\
+	    OUT_RING(RADEON_RB3D_DC_FLUSH_ALL);				\
+	} else {							\
+	    OUT_RING(CP_PACKET0(R300_RB3D_DSTCACHE_CTLSTAT, 0));	\
+	    OUT_RING(R300_RB3D_DC_FLUSH_ALL);				\
+	}								\
+	ADVANCE_RING();							\
+    }									\
 } while (0)
 
 #define RADEON_PURGE_ZCACHE()						\
diff --git a/src/radeon_dri.c b/src/radeon_dri.c
index 4520be5..ba5fbce 100644
--- a/src/radeon_dri.c
+++ b/src/radeon_dri.c
@@ -1846,7 +1846,8 @@ void RADEONDRIResume(ScreenPtr pScreen)
 	/* FIXME: return? */
     }
 
-    RADEONEngineRestore(pScrn);
+    if (info->ChipFamily < CHIP_FAMILY_R600)
+	RADEONEngineRestore(pScrn);
 
     RADEONDRICPInit(pScrn);
 }
diff --git a/src/radeon_driver.c b/src/radeon_driver.c
index 919a9fb..b4df090 100644
--- a/src/radeon_driver.c
+++ b/src/radeon_driver.c
@@ -3771,9 +3771,10 @@ void RADEONRestoreMemMapRegisters(ScrnInfoPtr pScrn,
 	    } else {
 		OUTREG(R600_HDP_NONSURFACE_BASE, (restore->mc_fb_location << 16) & 0xff0000);
 	    }
-	    
+
 	    /* Reset the engine and HDP */
-	    RADEONEngineReset(pScrn);
+	    if (info->ChipFamily < CHIP_FAMILY_R600)
+		RADEONEngineReset(pScrn);
 	}
     } else {
 
@@ -5220,7 +5221,8 @@ Bool RADEONSwitchMode(int scrnIndex, DisplayModePtr mode, int flags)
 
     if (info->accelOn) {
         RADEON_SYNC(info, pScrn);
-	RADEONEngineRestore(pScrn);
+	if (info->ChipFamily < CHIP_FAMILY_R600)
+	    RADEONEngineRestore(pScrn);
     }
 
 #ifdef XF86DRI
@@ -5424,6 +5426,10 @@ void RADEONAdjustFrame(int scrnIndex, int x, int y, int flags)
     xf86OutputPtr  output = config->output[config->compat_output];
     xf86CrtcPtr	crtc = output->crtc;
 
+    /* not handled */
+    if (IS_AVIVO_VARIANT)
+	return;
+
 #ifdef XF86DRI
     if (info->cp->CPStarted && pScrn->pScreen) DRILock(pScrn->pScreen, 0);
 #endif
@@ -5527,7 +5533,7 @@ Bool RADEONEnterVT(int scrnIndex, int flags)
     if (info->adaptor)
 	RADEONResetVideo(pScrn);
 
-    if (info->accelOn)
+    if (info->accelOn && (info->ChipFamily < CHIP_FAMILY_R600))
 	RADEONEngineRestore(pScrn);
 
 #ifdef XF86DRI
commit 3dff20e276615e8b77177689a4a5f8d91b3e8eac
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Feb 17 13:14:38 2009 -0500

    R6xx/R7xx EXA: switch to surface sync packet

diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index c0e3a2b..bebab88 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -365,16 +365,11 @@ cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type, uint32_
     else
 	cp_coher_size = ((size + 255) >> 8);
 
-    ereg  (ib, CP_COHER_CNTL,                       sync_type);
-    ereg  (ib, CP_COHER_SIZE,                       cp_coher_size);
-    ereg  (ib, CP_COHER_BASE,                       (mc_addr >> 8));
-    pack3 (ib, IT_WAIT_REG_MEM, 6);
-    e32   (ib, IT_WAIT_REG | IT_WAIT_EQ);
-    e32   (ib, IT_WAIT_ADDR(CP_COHER_STATUS));
-    e32   (ib, 0);
-    e32   (ib, 0);							// Ref value
-    e32   (ib, STATUS_bit);						// Ref mask
-    e32   (ib, 10);							// Wait interval
+    pack3 (ib, IT_SURFACE_SYNC, 4);
+    e32   (ib, sync_type);
+    e32   (ib, cp_coher_size);
+    e32   (ib, (mc_addr >> 8));
+    e32   (ib, 10); /* poll interval */
 }
 
 /* inserts a wait for vline in the command stream */
commit 7cde00f49649e25fd5816927c7a5e28b608fabcd
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Feb 12 17:39:44 2009 -0500

    R6xx/R7xx: switch to drm for wait for idle
    
    THIS REQUIRES AN UPDATED DRM

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 8a16b7a..5b17dcb 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -2123,23 +2123,6 @@ static void R600DoneComposite(PixmapPtr pDst)
     R600CPFlushIndirect(pScrn, accel_state->ib);
 }
 
-/* really would be better to wait on a timestamp shadowed in memory,
- * but this will do for now.
- */
-static Bool
-R600WaitforIdlePoll(ScrnInfoPtr pScrn)
-{
-    RADEONInfoPtr info = RADEONPTR(pScrn);
-    unsigned char *RADEONMMIO = info->MMIO;
-    uint32_t i;
-
-    for (i = 0; i < 1000000; i++) {
-	if ((INREG(GRBM_STATUS) & GUI_ACTIVE_bit) == 0)
-	    return TRUE;
-    }
-    return FALSE;
-}
-
 Bool
 R600CopyToVRAM(ScrnInfoPtr pScrn,
 	       char *src, int src_pitch,
@@ -2186,7 +2169,7 @@ R600CopyToVRAM(ScrnInfoPtr pScrn,
 	    scratch_offset = scratch->total/2 - scratch_offset;
 	    dst = (char *)scratch->address + scratch_offset;
 	    // wait for the engine to be idle
-	    R600WaitforIdlePoll(pScrn);
+	    RADEONWaitForIdleCP(pScrn);
 	    //memcopy from sys to scratch
 	    while (temph--) {
 		memcpy (dst, src, wpass);
@@ -2281,7 +2264,7 @@ R600DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
 	}
 
 	// wait for the engine to be idle
-	R600WaitforIdlePoll(pScrn);
+	RADEONWaitForIdleCP(pScrn);
 	//memcopy from scratch to sys
 	while (oldhpass--) {
 	    memcpy (dst, src, wpass);
@@ -2315,7 +2298,7 @@ R600Sync(ScreenPtr pScreen, int marker)
     struct radeon_accel_state *accel_state = info->accel_state;
 
     if (accel_state->exaMarkerSynced != marker) {
-	R600WaitforIdlePoll(pScrn);
+	RADEONWaitForIdleCP(pScrn);
 	accel_state->exaMarkerSynced = marker;
     }
 
commit e22cd4011b9be437ba89bff568e7fb82b4907d99
Author: Yang Zhao <yang at yangman.ca>
Date:   Thu Feb 12 14:46:53 2009 -0500

    R6xx/R7xx EXA: Further optimizations to overlapping copy
    
    Diagonally overlapping copies can be reduced to a horizontal-only or a
    vertical-only offset; whichever needs fewer copy operations is picked.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 2cff645..8a16b7a 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -750,87 +750,110 @@ R600OverlapCopy(PixmapPtr pDst,
     struct radeon_accel_state *accel_state = info->accel_state;
     uint32_t dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
     uint32_t dst_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
-    int i, chunk;
+    int i, hchunk, vchunk;
 
     if (is_overlap(srcX, srcX + w, srcY, srcY + h,
 		   dstX, dstX + w, dstY, dstY + h)) {
-        /* Diagonally offset overlap is reduced to a horizontal-only offset by first
-         * copying the vertically non-overlapping portion, then adjusting coordinates
+        /* Calculate height/width of non-overlapping area */
+        hchunk = (srcX < dstX) ? (dstX - srcX) : (srcX - dstX);
+        vchunk = (srcY < dstY) ? (dstY - srcY) : (srcY - dstY);
+
+        /* Diagonally offset overlap is reduced to either horizontal or vertical offset-only
+         * by copying a part of the  non-overlapping portion, then adjusting coordinates
+         * Choose horizontal vs vertical to minimize the total number of copy operations
          */
-	if (srcX != dstX) { // left/right or diagonal
-            if (srcY > dstY ) { // diagonal up
-                chunk = srcY - dstY;
-                R600DoPrepareCopy(pScrn,
-                                  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                  accel_state->rop, accel_state->planemask);
-                R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, chunk);
-                R600DoCopy(pScrn);
-
-                h = h - chunk;
-                srcY = srcY + chunk;
-                dstY = dstY + chunk;
-            } else if (srcY < dstY) { // diagonal down
-                chunk = dstY - srcY;
-                R600DoPrepareCopy(pScrn,
-                                  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                  accel_state->rop, accel_state->planemask);
-                R600AppendCopyVertex(pScrn, srcX, srcY + h - chunk, dstX, dstY + h - chunk, w, chunk);
-                R600DoCopy(pScrn);
-
-                h = h - chunk;
+        if (vchunk != 0 && hchunk != 0) { //diagonal
+            if ((w / hchunk) <= (h / vchunk)) { // reduce to horizontal
+                if (srcY > dstY ) { // diagonal up
+                    R600DoPrepareCopy(pScrn,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      accel_state->rop, accel_state->planemask);
+                    R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, vchunk);
+                    R600DoCopy(pScrn);
+
+                    srcY = srcY + vchunk;
+                    dstY = dstY + vchunk;
+                } else { // diagonal down
+                    R600DoPrepareCopy(pScrn,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      accel_state->rop, accel_state->planemask);
+                    R600AppendCopyVertex(pScrn, srcX, srcY + h - vchunk, dstX, dstY + h - vchunk, w, vchunk);
+                    R600DoCopy(pScrn);
+                }
+                h = h - vchunk;
+                vchunk = 0;
+            } else { //reduce to vertical
+                if (srcX > dstX ) { // diagonal left
+                    R600DoPrepareCopy(pScrn,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      accel_state->rop, accel_state->planemask);
+                    R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, hchunk, h);
+                    R600DoCopy(pScrn);
+
+                    srcX = srcX + hchunk;
+                    dstX = dstX + hchunk;
+                } else { // diagonal right
+                    R600DoPrepareCopy(pScrn,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      accel_state->rop, accel_state->planemask);
+                    R600AppendCopyVertex(pScrn, srcX + w - hchunk, srcY, dstX + w - hchunk, dstY, hchunk, h);
+                    R600DoCopy(pScrn);
+                }
+                w = w - hchunk;
+                hchunk = 0;
             }
+        }
 
+	if (vchunk == 0) { // left/right
 	    if (srcX < dstX) { // right
 		// copy right to left
-                chunk = dstX - srcX;
-		for (i = w; i > 0; i -= chunk) {
+		for (i = w; i > 0; i -= hchunk) {
 		    R600DoPrepareCopy(pScrn,
 				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      accel_state->rop, accel_state->planemask);
-		    R600AppendCopyVertex(pScrn, srcX + i - chunk, srcY, dstX + i - chunk, dstY, chunk, h);
+		    R600AppendCopyVertex(pScrn, srcX + i - hchunk, srcY, dstX + i - hchunk, dstY, hchunk, h);
 		    R600DoCopy(pScrn);
 		}
 	    } else { //left
 		// copy left to right
-                chunk = srcX - dstX;
-		for (i = 0; i < w; i += chunk) {
+		for (i = 0; i < w; i += hchunk) {
 		    R600DoPrepareCopy(pScrn,
 				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      accel_state->rop, accel_state->planemask);
 
-		    R600AppendCopyVertex(pScrn, srcX + i, srcY, dstX + i, dstY, chunk, h);
+		    R600AppendCopyVertex(pScrn, srcX + i, srcY, dstX + i, dstY, hchunk, h);
 		    R600DoCopy(pScrn);
 		}
 	    }
 	} else { //up/down
 	    if (srcY > dstY) { // up
 		// copy top to bottom
-                for (i = 0; i < h; i += chunk) {
-                chunk = srcY - dstY;
+                for (i = 0; i < h; i += vchunk) {
                     R600DoPrepareCopy(pScrn,
                                       dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
                                       dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
                                       accel_state->rop, accel_state->planemask);
 
-                    if (chunk > h - i) chunk = h - i;
-                    R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, chunk);
+                    if (vchunk > h - i) vchunk = h - i;
+                    R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, vchunk);
                     R600DoCopy(pScrn);
                 }
 	    } else { // down
 		// copy bottom to top
-		chunk = dstY - srcY;
-                for (i = h; i > 0; i -= chunk) {
+                for (i = h; i > 0; i -= vchunk) {
                     R600DoPrepareCopy(pScrn,
                                       dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
                                       dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
                                       accel_state->rop, accel_state->planemask);
 
-                    if (chunk > i) chunk = i;
-                    R600AppendCopyVertex(pScrn, srcX, srcY + i - chunk, dstX, dstY + i - chunk, w, chunk);
+                    if (vchunk > i) vchunk = i;
+                    R600AppendCopyVertex(pScrn, srcX, srcY + i - vchunk, dstX, dstY + i - vchunk, w, vchunk);
                     R600DoCopy(pScrn);
                 }
             }
commit da08b760bcf3d04d775c4440fafec10657bb1863
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Feb 12 13:53:11 2009 -0500

    R6xx/R7xx EXA: handle running out of vertex space in the copy path

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 1e3bd74..2cff645 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -419,6 +419,9 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     accel_state->src_size[0] = src_pitch * src_height * (src_bpp/8);
     accel_state->src_mc_addr[0] = src_offset;
     accel_state->src_pitch[0] = src_pitch;
+    accel_state->src_width[0] = src_width;
+    accel_state->src_height[0] = src_height;
+    accel_state->src_bpp[0] = src_bpp;
 
     /* flush texture cache */
     cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
@@ -486,6 +489,8 @@ R600DoPrepareCopy(ScrnInfoPtr pScrn,
     accel_state->dst_size = dst_pitch * dst_height * (dst_bpp/8);
     accel_state->dst_mc_addr = dst_offset;
     accel_state->dst_pitch = dst_pitch;
+    accel_state->dst_height = dst_height;
+    accel_state->dst_bpp = dst_bpp;
 
     cb_conf.id = 0;
     cb_conf.w = accel_state->dst_pitch;
@@ -602,14 +607,25 @@ R600AppendCopyVertex(ScrnInfoPtr pScrn,
 {
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
-    struct r6xx_copy_vertex *copy_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+    struct r6xx_copy_vertex *copy_vb;
     struct r6xx_copy_vertex vertex[3];
 
     if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
-	ErrorF("Copy: Ran out of VB space!\n");
-	return;
+	//ErrorF("Copy: Ran out of VB space!\n");
+	// emit the old VB
+	R600DoCopy(pScrn);
+	// start a new one
+	R600DoPrepareCopy(pScrn,
+			  accel_state->src_pitch[0], accel_state->src_width[0], accel_state->src_height[0],
+			  accel_state->src_mc_addr[0], accel_state->src_bpp[0],
+			  accel_state->dst_pitch, accel_state->dst_height,
+			  accel_state->dst_mc_addr, accel_state->dst_bpp,
+			  accel_state->rop, accel_state->planemask);
+
     }
 
+    copy_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+
     vertex[0].x = (float)dstX;
     vertex[0].y = (float)dstY;
     vertex[0].s = (float)srcX;
@@ -654,6 +670,12 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
     accel_state->src_mc_addr[0] = exaGetPixmapOffset(pSrc) + info->fbLocation + pScrn->fbOffset;
     accel_state->dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
 
+    accel_state->src_width[0] = pSrc->drawable.width;
+    accel_state->src_height[0] = pSrc->drawable.height;
+    accel_state->src_bpp[0] = pSrc->drawable.bitsPerPixel;
+    accel_state->dst_height = pDst->drawable.height;
+    accel_state->dst_bpp = pDst->drawable.bitsPerPixel;
+
     // bad pitch
     if (accel_state->src_pitch[0] & 7)
 	return FALSE;
@@ -680,10 +702,11 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
 	   pDst->drawable.bitsPerPixel, exaGetPixmapPitch(pDst));
 #endif
 
+    accel_state->rop = rop;
+    accel_state->planemask = planemask;
+
     if (exaGetPixmapOffset(pSrc) == exaGetPixmapOffset(pDst)) {
 	accel_state->same_surface = TRUE;
-	accel_state->rop = rop;
-	accel_state->planemask = planemask;
 
 #ifdef SHOW_VERTEXES
 	ErrorF("same surface!\n");
diff --git a/src/radeon.h b/src/radeon.h
index 9b42afd..bad55bf 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -665,9 +665,14 @@ struct radeon_accel_state {
     uint32_t          src_size[2];
     uint64_t          src_mc_addr[2];
     uint32_t          src_pitch[2];
+    uint32_t          src_width[2];
+    uint32_t          src_height[2];
+    uint32_t          src_bpp[2];
     uint32_t          dst_size;
     uint64_t          dst_mc_addr;
     uint32_t          dst_pitch;
+    uint32_t          dst_height;
+    uint32_t          dst_bpp;
     uint32_t          vs_size;
     uint64_t          vs_mc_addr;
     uint32_t          ps_size;
commit e3be312b0b73982c24f1f5d9cf76d7caafae0853
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Feb 12 13:48:36 2009 -0500

    R6xx/R7xx EXA: properly handle non-repeat cases in the texture setup

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 8ae5b53..1e3bd74 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -1077,25 +1077,21 @@ static Bool R600TextureSetup(PicturePtr pPict, PixmapPtr pPix,
     tex_res.request_size        = 1;
 
     /* component swizzles */
-    // XXX double check these
     switch (pPict->format) {
     case PICT_a1r5g5b5:
     case PICT_a8r8g8b8:
-	//ErrorF("%s: PICT_a8r8g8b8\n", unit ? "mask" : "src");
 	tex_res.dst_sel_x           = SQ_SEL_Z; //R
 	tex_res.dst_sel_y           = SQ_SEL_Y; //G
 	tex_res.dst_sel_z           = SQ_SEL_X; //B
 	tex_res.dst_sel_w           = SQ_SEL_W; //A
 	break;
     case PICT_a8b8g8r8:
-	//ErrorF("%s: PICT_a8b8g8r8\n", unit ? "mask" : "src");
 	tex_res.dst_sel_x           = SQ_SEL_X; //R
 	tex_res.dst_sel_y           = SQ_SEL_Y; //G
 	tex_res.dst_sel_z           = SQ_SEL_Z; //B
 	tex_res.dst_sel_w           = SQ_SEL_W; //A
 	break;
     case PICT_x8b8g8r8:
-	//ErrorF("%s: PICT_x8b8g8r8\n", unit ? "mask" : "src");
 	tex_res.dst_sel_x           = SQ_SEL_X; //R
 	tex_res.dst_sel_y           = SQ_SEL_Y; //G
 	tex_res.dst_sel_z           = SQ_SEL_Z; //B
@@ -1103,21 +1099,18 @@ static Bool R600TextureSetup(PicturePtr pPict, PixmapPtr pPix,
 	break;
     case PICT_x1r5g5b5:
     case PICT_x8r8g8b8:
-	//ErrorF("%s: PICT_x8r8g8b8\n", unit ? "mask" : "src");
 	tex_res.dst_sel_x           = SQ_SEL_Z; //R
 	tex_res.dst_sel_y           = SQ_SEL_Y; //G
 	tex_res.dst_sel_z           = SQ_SEL_X; //B
 	tex_res.dst_sel_w           = SQ_SEL_1; //A
 	break;
     case PICT_r5g6b5:
-	//ErrorF("%s: PICT_r5g6b5\n", unit ? "mask" : "src");
 	tex_res.dst_sel_x           = SQ_SEL_Z; //R
 	tex_res.dst_sel_y           = SQ_SEL_Y; //G
 	tex_res.dst_sel_z           = SQ_SEL_X; //B
 	tex_res.dst_sel_w           = SQ_SEL_1; //A
 	break;
     case PICT_a8:
-	//ErrorF("%s: PICT_a8\n", unit ? "mask" : "src");
 	tex_res.dst_sel_x           = SQ_SEL_0; //R
 	tex_res.dst_sel_y           = SQ_SEL_0; //G
 	tex_res.dst_sel_z           = SQ_SEL_0; //B
@@ -1135,25 +1128,30 @@ static Bool R600TextureSetup(PicturePtr pPict, PixmapPtr pPix,
     tex_samp.id                 = unit;
     tex_samp.border_color       = SQ_TEX_BORDER_COLOR_TRANS_BLACK;
 
-    switch (pPict->repeatType) {
-    case RepeatNormal:
-	tex_samp.clamp_x            = SQ_TEX_WRAP;
-	tex_samp.clamp_y            = SQ_TEX_WRAP;
-	break;
-    case RepeatPad:
-	tex_samp.clamp_x            = SQ_TEX_CLAMP_LAST_TEXEL;
-	tex_samp.clamp_y            = SQ_TEX_CLAMP_LAST_TEXEL;
-	break;
-    case RepeatReflect:
-	tex_samp.clamp_x            = SQ_TEX_MIRROR;
-	tex_samp.clamp_y            = SQ_TEX_MIRROR;
-	break;
-    case RepeatNone:
+    if (pPict->repeat) {
+	switch (pPict->repeatType) {
+	case RepeatNormal:
+	    tex_samp.clamp_x            = SQ_TEX_WRAP;
+	    tex_samp.clamp_y            = SQ_TEX_WRAP;
+	    break;
+	case RepeatPad:
+	    tex_samp.clamp_x            = SQ_TEX_CLAMP_LAST_TEXEL;
+	    tex_samp.clamp_y            = SQ_TEX_CLAMP_LAST_TEXEL;
+	    break;
+	case RepeatReflect:
+	    tex_samp.clamp_x            = SQ_TEX_MIRROR;
+	    tex_samp.clamp_y            = SQ_TEX_MIRROR;
+	    break;
+	case RepeatNone:
+	    tex_samp.clamp_x            = SQ_TEX_CLAMP_BORDER;
+	    tex_samp.clamp_y            = SQ_TEX_CLAMP_BORDER;
+	    break;
+	default:
+	    RADEON_FALLBACK(("Bad repeat 0x%x\n", pPict->repeatType));
+	}
+    } else {
 	tex_samp.clamp_x            = SQ_TEX_CLAMP_BORDER;
 	tex_samp.clamp_y            = SQ_TEX_CLAMP_BORDER;
-	break;
-    default:
-	RADEON_FALLBACK(("Bad repeat 0x%x\n", pPict->repeatType));
     }
 
     switch (pPict->filter) {
commit 3e5ac32f747bef70903e9cdf41652a827eece707
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 11 17:40:20 2009 -0500

    Revert "R6xx/R7xx: handle running out of vertex buffer space"
    
    This reverts commit 4fd7228de7ad88edf825dbc3039df877795a9479.
    
    Causes lockups.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 542d42d..8ae5b53 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -41,12 +41,6 @@
 extern PixmapPtr
 RADEONGetDrawablePixmap(DrawablePtr pDrawable);
 
-static void
-R600DoneSolid(PixmapPtr pPix);
-
-static void
-R600DoneComposite(PixmapPtr pDst);
-
 //#define SHOW_VERTEXES
 
 #       define RADEON_ROP3_ZERO             0x00000000
@@ -270,9 +264,7 @@ R600Solid(PixmapPtr pPix, int x1, int y1, int x2, int y2)
 
     if (((accel_state->vb_index + 3) * 8) > (accel_state->ib->total / 2)) {
 	ErrorF("Solid: Ran out of VB space!\n");
-	R600DoneSolid(pPix);
-	accel_state->ib = RADEONCPGetBuffer(pScrn);
-	accel_state->vb_index = 0;
+	return;
     }
 
     vertex[0].x = (float)x1;
@@ -615,9 +607,7 @@ R600AppendCopyVertex(ScrnInfoPtr pScrn,
 
     if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
 	ErrorF("Copy: Ran out of VB space!\n");
-	R600DoCopy(pScrn);
-	accel_state->ib = RADEONCPGetBuffer(pScrn);
-	accel_state->vb_index = 0;
+	return;
     }
 
     vertex[0].x = (float)dstX;
@@ -1923,9 +1913,7 @@ static void R600Composite(PixmapPtr pDst,
 
 	if (((accel_state->vb_index + 3) * 24) > (accel_state->ib->total / 2)) {
 	    ErrorF("Composite: Ran out of VB space!\n");
-	    R600DoneComposite(pDst);
-	    accel_state->ib = RADEONCPGetBuffer(pScrn);
-	    accel_state->vb_index = 0;
+	    return;
 	}
 
 	maskTopLeft.x     = IntToxFixed(maskX);
@@ -1986,9 +1974,7 @@ static void R600Composite(PixmapPtr pDst,
 
 	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
 	    ErrorF("Composite: Ran out of VB space!\n");
-	    R600DoneComposite(pDst);
-	    accel_state->ib = RADEONCPGetBuffer(pScrn);
-	    accel_state->vb_index = 0;
+	    return;
 	}
 
 	vertex[0].x = (float)dstX;
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index c06512a..aca6412 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -44,63 +44,6 @@
 
 #include "damage.h"
 
-static void
-R600DoneXv(ScrnInfoPtr pScrn)
-{
-    RADEONInfoPtr info = RADEONPTR(pScrn);
-    struct radeon_accel_state *accel_state = info->accel_state;
-    draw_config_t   draw_conf;
-    vtx_resource_t  vtx_res;
-
-    CLEAR (draw_conf);
-    CLEAR (vtx_res);
-
-    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
-	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2);
-    accel_state->vb_size = accel_state->vb_index * 16;
-
-    /* flush vertex cache */
-    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
-	(info->ChipFamily == CHIP_FAMILY_RV620) ||
-	(info->ChipFamily == CHIP_FAMILY_RS780) ||
-	(info->ChipFamily == CHIP_FAMILY_RV710))
-	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
-    else
-	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
-
-    /* Vertex buffer setup */
-    vtx_res.id              = SQ_VTX_RESOURCE_vs;
-    vtx_res.vtx_size_dw     = 16 / 4;
-    vtx_res.vtx_num_entries = accel_state->vb_size / 4;
-    vtx_res.mem_req_size    = 1;
-    vtx_res.vb_addr         = accel_state->vb_mc_addr;
-    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
-
-    draw_conf.prim_type          = DI_PT_RECTLIST;
-    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
-    draw_conf.num_instances      = 1;
-    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
-    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
-
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
-
-    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
-    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
-    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
-
-    draw_auto(pScrn, accel_state->ib, &draw_conf);
-
-    wait_3d_idle_clean(pScrn, accel_state->ib);
-
-    /* sync destination surface */
-    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
-			accel_state->dst_size, accel_state->dst_mc_addr);
-
-    R600CPFlushIndirect(pScrn, accel_state->ib);
-}
 
 void
 R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
@@ -115,6 +58,8 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     tex_resource_t  tex_res;
     tex_sampler_t   tex_samp;
     shader_config_t vs_conf, ps_conf;
+    draw_config_t   draw_conf;
+    vtx_resource_t  vtx_res;
     int uv_offset;
 
     static float ps_alu_consts[] = {
@@ -135,6 +80,8 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     CLEAR (tex_samp);
     CLEAR (vs_conf);
     CLEAR (ps_conf);
+    CLEAR (draw_conf);
+    CLEAR (vtx_res);
 
     accel_state->dst_pitch = exaGetPixmapPitch(pPixmap) / (pPixmap->drawable.bitsPerPixel / 8);
     accel_state->src_pitch[0] = pPriv->src_pitch;
@@ -469,9 +416,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
 	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
 	    ErrorF("Xv: Ran out of VB space!\n");
-	    R600DoneXv(pScrn);
-	    accel_state->ib = RADEONCPGetBuffer(pScrn);
-	    accel_state->vb_index = 0;
+	    break;
 	}
 
 	dstX = pBox->x1 + dstxoff;
@@ -516,7 +461,57 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	pBox++;
     }
 
-    R600DoneXv(pScrn);
+    if (accel_state->vb_index == 0) {
+	R600IBDiscard(pScrn, accel_state->ib);
+	DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
+	return;
+    }
+
+    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
+	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2);
+    accel_state->vb_size = accel_state->vb_index * 16;
+
+    /* flush vertex cache */
+    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
+	(info->ChipFamily == CHIP_FAMILY_RV620) ||
+	(info->ChipFamily == CHIP_FAMILY_RS780) ||
+	(info->ChipFamily == CHIP_FAMILY_RV710))
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+    else
+	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+
+    /* Vertex buffer setup */
+    vtx_res.id              = SQ_VTX_RESOURCE_vs;
+    vtx_res.vtx_size_dw     = 16 / 4;
+    vtx_res.vtx_num_entries = accel_state->vb_size / 4;
+    vtx_res.mem_req_size    = 1;
+    vtx_res.vb_addr         = accel_state->vb_mc_addr;
+    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
+
+    draw_conf.prim_type          = DI_PT_RECTLIST;
+    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
+    draw_conf.num_instances      = 1;
+    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
+    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
+
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
+
+    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
+    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
+    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
+
+    draw_auto(pScrn, accel_state->ib, &draw_conf);
+
+    wait_3d_idle_clean(pScrn, accel_state->ib);
+
+    /* sync destination surface */
+    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
+			accel_state->dst_size, accel_state->dst_mc_addr);
+
+    R600CPFlushIndirect(pScrn, accel_state->ib);
 
     DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
 }
commit bd141aa73a77f68301715fb3b5664e2082327a80
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 11 15:31:47 2009 -0500

    R6xx/R7xx Xv: switch packed over to Yang's new shader code

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 34e67d8..542d42d 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -2841,7 +2841,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
     // 2
     ps[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
 				      TYPE(SQ_EXPORT_PIXEL),
-				      RW_GPR(3),
+				      RW_GPR(2),
 				      RW_REL(ABSOLUTE),
 				      INDEX_GPR(0),
 				      ELEM_SIZE(3));
@@ -2856,96 +2856,88 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 					   CF_INST(SQ_CF_INST_EXPORT_DONE),
 					   WHOLE_QUAD_MODE(0),
 					   BARRIER(1));
+    /* Undo scaling of Y'CbCr values
+     *  Y' is scaled from 16:235
+     *  Cb/Cr are scaled from 16:240
+     */
     // 3 - alu 0
-    // DP4 gpr[2].x gpr[1].x c[0].x
+    // MULADD gpr[1].x gpr[1].x c[3].x c[3].y
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_X),
 			 SRC0_NEG(0),
-			 SRC1_SEL(256),
+			 SRC1_SEL(259),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_X),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
 			 LAST(0));
-    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
-			     SRC0_ABS(0),
-			     SRC1_ABS(0),
-			     UPDATE_EXECUTE_MASK(0),
-			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
-			     FOG_MERGE(0),
-			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_DOT4),
-			     BANK_SWIZZLE(SQ_ALU_VEC_102),
-			     DST_GPR(2),
+    ps[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
+			     SRC2_REL(ABSOLUTE),
+			     SRC2_ELEM(ELEM_Y),
+			     SRC2_NEG(0),
+			     ALU_INST(SQ_OP3_INST_MULADD),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(1),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_X),
 			     CLAMP(1));
     // 4 - alu 1
-    // DP4 gpr[2].y gpr[1].y c[0].y
+    // MULADD gpr[1].y gpr[1].y c[3].z c[3].w
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Y),
 			 SRC0_NEG(0),
-			 SRC1_SEL(256),
+			 SRC1_SEL(259),
 			 SRC1_REL(ABSOLUTE),
-			 SRC1_ELEM(ELEM_Y),
+			 SRC1_ELEM(ELEM_Z),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
 			 LAST(0));
-    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
-			     SRC0_ABS(0),
-			     SRC1_ABS(0),
-			     UPDATE_EXECUTE_MASK(0),
-			     UPDATE_PRED(0),
-			     WRITE_MASK(0),
-			     FOG_MERGE(0),
-			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_DOT4),
-			     BANK_SWIZZLE(SQ_ALU_VEC_102),
-			     DST_GPR(2),
+    ps[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
+			     SRC2_REL(ABSOLUTE),
+			     SRC2_ELEM(ELEM_W),
+			     SRC2_NEG(0),
+			     ALU_INST(SQ_OP3_INST_MULADD),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(1),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_Y),
-			     CLAMP(1));
+			     CLAMP(0));
     // 5 - alu 2
-    // DP4 gpr[2].z gpr[1].z c[0].z
+    // MULADD gpr[1].z gpr[1].z c[3].z c[3].w
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Z),
 			 SRC0_NEG(0),
-			 SRC1_SEL(256),
+			 SRC1_SEL(259),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_Z),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
 			 LAST(0));
-    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
-			     SRC0_ABS(0),
-			     SRC1_ABS(0),
-			     UPDATE_EXECUTE_MASK(0),
-			     UPDATE_PRED(0),
-			     WRITE_MASK(0),
-			     FOG_MERGE(0),
-			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_DOT4),
-			     BANK_SWIZZLE(SQ_ALU_VEC_102),
-			     DST_GPR(2),
+    ps[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
+			     SRC2_REL(ABSOLUTE),
+			     SRC2_ELEM(ELEM_W),
+			     SRC2_NEG(0),
+			     ALU_INST(SQ_OP3_INST_MULADD),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(1),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_Z),
-			     CLAMP(1));
+			     CLAMP(0));
     // 6 - alu 3
-    // DP4 gpr[2].w gpr[1].w c[0].w
-    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+    // MOV gpr[1].w 0.0
+    ps[i++] = ALU_DWORD0(SRC0_SEL(SQ_ALU_SRC_0),
 			 SRC0_REL(ABSOLUTE),
-			 SRC0_ELEM(ELEM_W),
+			 SRC0_ELEM(ELEM_X),
 			 SRC0_NEG(0),
-			 SRC1_SEL(256),
+			 SRC1_SEL(SQ_ALU_SRC_0),
 			 SRC1_REL(ABSOLUTE),
-			 SRC1_ELEM(ELEM_W),
+			 SRC1_ELEM(ELEM_X),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
@@ -2955,22 +2947,22 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(0),
+			     WRITE_MASK(1),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
 			     ALU_INST(SQ_OP2_INST_DOT4),
-			     BANK_SWIZZLE(SQ_ALU_VEC_021),
-			     DST_GPR(2),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(1),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_W),
-			     CLAMP(1));
+			     CLAMP(0));
     // 7 - alu 4
-    // DP4 gpr[2].x gpr[1].x c[1].x
+    // DP4 gpr[2].x gpr[1].x c[0].x
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_X),
 			 SRC0_NEG(0),
-			 SRC1_SEL(257),
+			 SRC1_SEL(256),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_X),
 			 SRC1_NEG(0),
@@ -2982,7 +2974,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(0),
+			     WRITE_MASK(1),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
 			     ALU_INST(SQ_OP2_INST_DOT4),
@@ -2992,12 +2984,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_X),
 			     CLAMP(1));
     // 8 - alu 5
-    // DP4 gpr[2].y gpr[1].y c[1].y
+    // DP4 gpr[2].y gpr[1].y c[0].y
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Y),
 			 SRC0_NEG(0),
-			 SRC1_SEL(257),
+			 SRC1_SEL(256),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_Y),
 			 SRC1_NEG(0),
@@ -3009,7 +3001,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
+			     WRITE_MASK(0),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
 			     ALU_INST(SQ_OP2_INST_DOT4),
@@ -3019,12 +3011,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_Y),
 			     CLAMP(1));
     // 9 - alu 6
-    // DP4 gpr[2].z gpr[1].z c[1].z
+    // DP4 gpr[2].z gpr[1].z c[0].z
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Z),
 			 SRC0_NEG(0),
-			 SRC1_SEL(257),
+			 SRC1_SEL(256),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_Z),
 			 SRC1_NEG(0),
@@ -3046,12 +3038,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_Z),
 			     CLAMP(1));
     // 10 - alu 7
-    // DP4 gpr[2].w gpr[1].w c[1].w
+    // DP4 gpr[2].w gpr[1].w c[0].w
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_W),
 			 SRC0_NEG(0),
-			 SRC1_SEL(257),
+			 SRC1_SEL(256),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_W),
 			 SRC1_NEG(0),
@@ -3073,12 +3065,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_W),
 			     CLAMP(1));
     // 11 - alu 8
-    // DP4 gpr[2].x gpr[1].x c[2].x
+    // DP4 gpr[2].x gpr[1].x c[1].x
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_X),
 			 SRC0_NEG(0),
-			 SRC1_SEL(258),
+			 SRC1_SEL(257),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_X),
 			 SRC1_NEG(0),
@@ -3100,12 +3092,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_X),
 			     CLAMP(1));
     // 12 - alu 9
-    // DP4 gpr[2].y gpr[1].y c[2].y
+    // DP4 gpr[2].y gpr[1].y c[1].y
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Y),
 			 SRC0_NEG(0),
-			 SRC1_SEL(258),
+			 SRC1_SEL(257),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_Y),
 			 SRC1_NEG(0),
@@ -3117,7 +3109,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(0),
+			     WRITE_MASK(1),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
 			     ALU_INST(SQ_OP2_INST_DOT4),
@@ -3127,12 +3119,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_Y),
 			     CLAMP(1));
     // 13 - alu 10
-    // DP4 gpr[2].z gpr[1].z c[2].z
+    // DP4 gpr[2].z gpr[1].z c[1].z
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Z),
 			 SRC0_NEG(0),
-			 SRC1_SEL(258),
+			 SRC1_SEL(257),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_Z),
 			 SRC1_NEG(0),
@@ -3144,7 +3136,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
+			     WRITE_MASK(0),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
 			     ALU_INST(SQ_OP2_INST_DOT4),
@@ -3154,12 +3146,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_Z),
 			     CLAMP(1));
     // 14 - alu 11
-    // DP4 gpr[2].w gpr[1].w c[2].w
+    // DP4 gpr[2].w gpr[1].w c[1].w
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_W),
 			 SRC0_NEG(0),
-			 SRC1_SEL(258),
+			 SRC1_SEL(257),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_W),
 			 SRC1_NEG(0),
@@ -3181,12 +3173,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_W),
 			     CLAMP(1));
     // 15 - alu 12
-    // MOV gpr[3].x gpr[2].x
-    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+    // DP4 gpr[2].x gpr[1].x c[2].x
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_X),
 			 SRC0_NEG(0),
-			 SRC1_SEL(0),
+			 SRC1_SEL(258),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_X),
 			 SRC1_NEG(0),
@@ -3198,24 +3190,24 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
+			     WRITE_MASK(0),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_MOV),
-			     BANK_SWIZZLE(SQ_ALU_VEC_210),
-			     DST_GPR(3),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_X),
-			     CLAMP(0));
+			     CLAMP(1));
     // 16 - alu 13
-    // MOV gpr[3].y gpr[2].y
-    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+    // DP4 gpr[2].y gpr[1].y c[2].y
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Y),
 			 SRC0_NEG(0),
-			 SRC1_SEL(0),
+			 SRC1_SEL(258),
 			 SRC1_REL(ABSOLUTE),
-			 SRC1_ELEM(ELEM_X),
+			 SRC1_ELEM(ELEM_Y),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
@@ -3225,24 +3217,24 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
+			     WRITE_MASK(0),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_MOV),
-			     BANK_SWIZZLE(SQ_ALU_VEC_210),
-			     DST_GPR(3),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_Y),
-			     CLAMP(0));
+			     CLAMP(1));
     // 17 - alu 14
-    // MOV gpr[3].z gpr[2].z
-    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+    // DP4 gpr[2].z gpr[1].z c[2].z
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Z),
 			 SRC0_NEG(0),
-			 SRC1_SEL(0),
+			 SRC1_SEL(258),
 			 SRC1_REL(ABSOLUTE),
-			 SRC1_ELEM(ELEM_X),
+			 SRC1_ELEM(ELEM_Z),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
@@ -3255,21 +3247,21 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     WRITE_MASK(1),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_MOV),
-			     BANK_SWIZZLE(SQ_ALU_VEC_210),
-			     DST_GPR(3),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_Z),
-			     CLAMP(0));
+			     CLAMP(1));
     // 18 - alu 15
-    // MOV gpr[3].w gpr[2].w
-    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+    // DP4 gpr[2].w gpr[1].w c[2].w
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_W),
 			 SRC0_NEG(0),
-			 SRC1_SEL(0),
+			 SRC1_SEL(258),
 			 SRC1_REL(ABSOLUTE),
-			 SRC1_ELEM(ELEM_X),
+			 SRC1_ELEM(ELEM_W),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
@@ -3279,15 +3271,15 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
+			     WRITE_MASK(0),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_MOV),
-			     BANK_SWIZZLE(SQ_ALU_VEC_012),
-			     DST_GPR(3),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_021),
+			     DST_GPR(2),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_W),
-			     CLAMP(0));
+			     CLAMP(1));
     // 19 - alignment
     ps[i++] = 0x00000000;
     ps[i++] = 0x00000000;
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 4a7391c..c06512a 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -202,7 +202,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 			accel_state->ps_size, accel_state->ps_mc_addr);
 
     ps_conf.shader_addr         = accel_state->ps_mc_addr;
-    ps_conf.num_gprs            = 4;
+    ps_conf.num_gprs            = 3;
     ps_conf.stack_size          = 0;
     ps_conf.uncached_first_inst = 1;
     ps_conf.clamp_consts        = 0;
commit cb4a1ceaa32d6847b146a31291772e1183972ee7
Author: Yang Zhao <yang at yangman.ca>
Date:   Wed Feb 11 15:18:00 2009 -0500

    R6xx/R7xx Xv: Planar - Properly scale Y'CbCr values before converting to RGB
    
    According to the MPEG-2 spec, Y' and Cb/Cr values are scaled to [16, 235]
    and [16, 240], respectively, when packed into bytes. Properly take care
    of the reverse scaling before translating to RGB.
    
    The conversion matrix has been simplified to remove the 3rd column, as the
    fitting to [-0.5, 0.5] can be done with scaling.
    
    Redundant MOV instructions were also removed, and now only 3 GPRs are required.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index a0f227b..34e67d8 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -3349,7 +3349,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
     ps[i++] = TEX_DWORD_PAD;
 
     // xv ps planar ----------------------------------
-    i = accel_state->xv_ps_offset_planar / 4;
+     i = accel_state->xv_ps_offset_planar / 4;
     // 0
     ps[i++] = CF_DWORD0(ADDR(20));
     ps[i++] = CF_DWORD1(POP_COUNT(0),
@@ -3378,7 +3378,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
     // 2
     ps[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
 				      TYPE(SQ_EXPORT_PIXEL),
-				      RW_GPR(3),
+				      RW_GPR(2),
 				      RW_REL(ABSOLUTE),
 				      INDEX_GPR(0),
 				      ELEM_SIZE(3));
@@ -3393,96 +3393,88 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 					   CF_INST(SQ_CF_INST_EXPORT_DONE),
 					   WHOLE_QUAD_MODE(0),
 					   BARRIER(1));
+    /* Undo scaling of Y'CbCr values
+     *  Y' is scaled from 16:235
+     *  Cb/Cr are scaled from 16:240
+     */
     // 3 - alu 0
-    // DP4 gpr[2].x gpr[1].x c[0].x
+    // MULADD gpr[1].x gpr[1].x c[3].x c[3].y
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_X),
 			 SRC0_NEG(0),
-			 SRC1_SEL(256),
+			 SRC1_SEL(259),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_X),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
 			 LAST(0));
-    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
-			     SRC0_ABS(0),
-			     SRC1_ABS(0),
-			     UPDATE_EXECUTE_MASK(0),
-			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
-			     FOG_MERGE(0),
-			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_DOT4),
-			     BANK_SWIZZLE(SQ_ALU_VEC_102),
-			     DST_GPR(2),
+    ps[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
+			     SRC2_REL(ABSOLUTE),
+			     SRC2_ELEM(ELEM_Y),
+			     SRC2_NEG(0),
+			     ALU_INST(SQ_OP3_INST_MULADD),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(1),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_X),
 			     CLAMP(1));
     // 4 - alu 1
-    // DP4 gpr[2].y gpr[1].y c[0].y
+    // MULADD gpr[1].y gpr[1].y c[3].z c[3].w
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Y),
 			 SRC0_NEG(0),
-			 SRC1_SEL(256),
+			 SRC1_SEL(259),
 			 SRC1_REL(ABSOLUTE),
-			 SRC1_ELEM(ELEM_Y),
+			 SRC1_ELEM(ELEM_Z),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
 			 LAST(0));
-    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
-			     SRC0_ABS(0),
-			     SRC1_ABS(0),
-			     UPDATE_EXECUTE_MASK(0),
-			     UPDATE_PRED(0),
-			     WRITE_MASK(0),
-			     FOG_MERGE(0),
-			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_DOT4),
-			     BANK_SWIZZLE(SQ_ALU_VEC_102),
-			     DST_GPR(2),
+    ps[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
+			     SRC2_REL(ABSOLUTE),
+			     SRC2_ELEM(ELEM_W),
+			     SRC2_NEG(0),
+			     ALU_INST(SQ_OP3_INST_MULADD),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(1),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_Y),
-			     CLAMP(1));
+			     CLAMP(0));
     // 5 - alu 2
-    // DP4 gpr[2].z gpr[1].z c[0].z
+    // MULADD gpr[1].z gpr[1].z c[3].z c[3].w
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Z),
 			 SRC0_NEG(0),
-			 SRC1_SEL(256),
+			 SRC1_SEL(259),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_Z),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
 			 LAST(0));
-    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
-			     SRC0_ABS(0),
-			     SRC1_ABS(0),
-			     UPDATE_EXECUTE_MASK(0),
-			     UPDATE_PRED(0),
-			     WRITE_MASK(0),
-			     FOG_MERGE(0),
-			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_DOT4),
-			     BANK_SWIZZLE(SQ_ALU_VEC_102),
-			     DST_GPR(2),
+    ps[i++] = ALU_DWORD1_OP3(SRC2_SEL(259),
+			     SRC2_REL(ABSOLUTE),
+			     SRC2_ELEM(ELEM_W),
+			     SRC2_NEG(0),
+			     ALU_INST(SQ_OP3_INST_MULADD),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(1),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_Z),
-			     CLAMP(1));
+			     CLAMP(0));
     // 6 - alu 3
-    // DP4 gpr[2].w gpr[1].w c[0].w
-    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+    // MOV gpr[1].w 0.0
+    ps[i++] = ALU_DWORD0(SRC0_SEL(SQ_ALU_SRC_0),
 			 SRC0_REL(ABSOLUTE),
-			 SRC0_ELEM(ELEM_W),
+			 SRC0_ELEM(ELEM_X),
 			 SRC0_NEG(0),
-			 SRC1_SEL(256),
+			 SRC1_SEL(SQ_ALU_SRC_0),
 			 SRC1_REL(ABSOLUTE),
-			 SRC1_ELEM(ELEM_W),
+			 SRC1_ELEM(ELEM_X),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
@@ -3492,22 +3484,22 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(0),
+			     WRITE_MASK(1),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
 			     ALU_INST(SQ_OP2_INST_DOT4),
-			     BANK_SWIZZLE(SQ_ALU_VEC_021),
-			     DST_GPR(2),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(1),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_W),
-			     CLAMP(1));
+			     CLAMP(0));
     // 7 - alu 4
-    // DP4 gpr[2].x gpr[1].x c[1].x
+    // DP4 gpr[2].x gpr[1].x c[0].x
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_X),
 			 SRC0_NEG(0),
-			 SRC1_SEL(257),
+			 SRC1_SEL(256),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_X),
 			 SRC1_NEG(0),
@@ -3519,7 +3511,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(0),
+			     WRITE_MASK(1),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
 			     ALU_INST(SQ_OP2_INST_DOT4),
@@ -3529,12 +3521,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_X),
 			     CLAMP(1));
     // 8 - alu 5
-    // DP4 gpr[2].y gpr[1].y c[1].y
+    // DP4 gpr[2].y gpr[1].y c[0].y
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Y),
 			 SRC0_NEG(0),
-			 SRC1_SEL(257),
+			 SRC1_SEL(256),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_Y),
 			 SRC1_NEG(0),
@@ -3546,7 +3538,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
+			     WRITE_MASK(0),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
 			     ALU_INST(SQ_OP2_INST_DOT4),
@@ -3556,12 +3548,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_Y),
 			     CLAMP(1));
     // 9 - alu 6
-    // DP4 gpr[2].z gpr[1].z c[1].z
+    // DP4 gpr[2].z gpr[1].z c[0].z
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Z),
 			 SRC0_NEG(0),
-			 SRC1_SEL(257),
+			 SRC1_SEL(256),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_Z),
 			 SRC1_NEG(0),
@@ -3583,12 +3575,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_Z),
 			     CLAMP(1));
     // 10 - alu 7
-    // DP4 gpr[2].w gpr[1].w c[1].w
+    // DP4 gpr[2].w gpr[1].w c[0].w
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_W),
 			 SRC0_NEG(0),
-			 SRC1_SEL(257),
+			 SRC1_SEL(256),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_W),
 			 SRC1_NEG(0),
@@ -3610,12 +3602,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_W),
 			     CLAMP(1));
     // 11 - alu 8
-    // DP4 gpr[2].x gpr[1].x c[2].x
+    // DP4 gpr[2].x gpr[1].x c[1].x
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_X),
 			 SRC0_NEG(0),
-			 SRC1_SEL(258),
+			 SRC1_SEL(257),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_X),
 			 SRC1_NEG(0),
@@ -3637,12 +3629,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_X),
 			     CLAMP(1));
     // 12 - alu 9
-    // DP4 gpr[2].y gpr[1].y c[2].y
+    // DP4 gpr[2].y gpr[1].y c[1].y
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Y),
 			 SRC0_NEG(0),
-			 SRC1_SEL(258),
+			 SRC1_SEL(257),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_Y),
 			 SRC1_NEG(0),
@@ -3654,7 +3646,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(0),
+			     WRITE_MASK(1),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
 			     ALU_INST(SQ_OP2_INST_DOT4),
@@ -3664,12 +3656,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_Y),
 			     CLAMP(1));
     // 13 - alu 10
-    // DP4 gpr[2].z gpr[1].z c[2].z
+    // DP4 gpr[2].z gpr[1].z c[1].z
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Z),
 			 SRC0_NEG(0),
-			 SRC1_SEL(258),
+			 SRC1_SEL(257),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_Z),
 			 SRC1_NEG(0),
@@ -3681,7 +3673,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
+			     WRITE_MASK(0),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
 			     ALU_INST(SQ_OP2_INST_DOT4),
@@ -3691,12 +3683,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_Z),
 			     CLAMP(1));
     // 14 - alu 11
-    // DP4 gpr[2].w gpr[1].w c[2].w
+    // DP4 gpr[2].w gpr[1].w c[1].w
     ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_W),
 			 SRC0_NEG(0),
-			 SRC1_SEL(258),
+			 SRC1_SEL(257),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_W),
 			 SRC1_NEG(0),
@@ -3718,12 +3710,12 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     DST_ELEM(ELEM_W),
 			     CLAMP(1));
     // 15 - alu 12
-    // MOV gpr[3].x gpr[2].x
-    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+    // DP4 gpr[2].x gpr[1].x c[2].x
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_X),
 			 SRC0_NEG(0),
-			 SRC1_SEL(0),
+			 SRC1_SEL(258),
 			 SRC1_REL(ABSOLUTE),
 			 SRC1_ELEM(ELEM_X),
 			 SRC1_NEG(0),
@@ -3735,24 +3727,24 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
+			     WRITE_MASK(0),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_MOV),
-			     BANK_SWIZZLE(SQ_ALU_VEC_210),
-			     DST_GPR(3),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_X),
-			     CLAMP(0));
+			     CLAMP(1));
     // 16 - alu 13
-    // MOV gpr[3].y gpr[2].y
-    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+    // DP4 gpr[2].y gpr[1].y c[2].y
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Y),
 			 SRC0_NEG(0),
-			 SRC1_SEL(0),
+			 SRC1_SEL(258),
 			 SRC1_REL(ABSOLUTE),
-			 SRC1_ELEM(ELEM_X),
+			 SRC1_ELEM(ELEM_Y),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
@@ -3762,24 +3754,24 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
+			     WRITE_MASK(0),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_MOV),
-			     BANK_SWIZZLE(SQ_ALU_VEC_210),
-			     DST_GPR(3),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_Y),
-			     CLAMP(0));
+			     CLAMP(1));
     // 17 - alu 14
-    // MOV gpr[3].z gpr[2].z
-    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+    // DP4 gpr[2].z gpr[1].z c[2].z
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_Z),
 			 SRC0_NEG(0),
-			 SRC1_SEL(0),
+			 SRC1_SEL(258),
 			 SRC1_REL(ABSOLUTE),
-			 SRC1_ELEM(ELEM_X),
+			 SRC1_ELEM(ELEM_Z),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
@@ -3792,21 +3784,21 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     WRITE_MASK(1),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_MOV),
-			     BANK_SWIZZLE(SQ_ALU_VEC_210),
-			     DST_GPR(3),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_Z),
-			     CLAMP(0));
+			     CLAMP(1));
     // 18 - alu 15
-    // MOV gpr[3].w gpr[2].w
-    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+    // DP4 gpr[2].w gpr[1].w c[2].w
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
 			 SRC0_REL(ABSOLUTE),
 			 SRC0_ELEM(ELEM_W),
 			 SRC0_NEG(0),
-			 SRC1_SEL(0),
+			 SRC1_SEL(258),
 			 SRC1_REL(ABSOLUTE),
-			 SRC1_ELEM(ELEM_X),
+			 SRC1_ELEM(ELEM_W),
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_LOOP),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
@@ -3816,15 +3808,15 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			     SRC1_ABS(0),
 			     UPDATE_EXECUTE_MASK(0),
 			     UPDATE_PRED(0),
-			     WRITE_MASK(1),
+			     WRITE_MASK(0),
 			     FOG_MERGE(0),
 			     OMOD(SQ_ALU_OMOD_OFF),
-			     ALU_INST(SQ_OP2_INST_MOV),
-			     BANK_SWIZZLE(SQ_ALU_VEC_012),
-			     DST_GPR(3),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_021),
+			     DST_GPR(2),
 			     DST_REL(ABSOLUTE),
 			     DST_ELEM(ELEM_W),
-			     CLAMP(0));
+			     CLAMP(1));
     // 19 - alignment
     ps[i++] = 0x00000000;
     ps[i++] = 0x00000000;
@@ -4057,6 +4049,150 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			 MEGA_FETCH(0));
     vs[i++] = VTX_DWORD_PAD;
 
+    // comp mask vs ---------------------------------------
+    i = accel_state->comp_mask_vs_offset / 4;
+    //0
+    vs[i++] = CF_DWORD0(ADDR(4));
+    vs[i++] = CF_DWORD1(POP_COUNT(0),
+			CF_CONST(0),
+			COND(SQ_CF_COND_ACTIVE),
+			I_COUNT(3),
+			CALL_COUNT(0),
+			END_OF_PROGRAM(0),
+			VALID_PIXEL_MODE(0),
+			CF_INST(SQ_CF_INST_VTX),
+			WHOLE_QUAD_MODE(0),
+			BARRIER(1));
+    //1 - dst
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
+				      TYPE(SQ_EXPORT_POS),
+				      RW_GPR(2),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(0),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(1));
+    //2 - src
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
+				      TYPE(SQ_EXPORT_PARAM),
+				      RW_GPR(1),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(0),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(0));
+    //3 - mask
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(1),
+				      TYPE(SQ_EXPORT_PARAM),
+				      RW_GPR(0),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(1),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(0));
+    //4/5 - dst
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(24));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(2),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(0),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(1));
+    vs[i++] = VTX_DWORD_PAD;
+    //6/7 - src
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(8));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(1),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(8),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(0));
+    vs[i++] = VTX_DWORD_PAD;
+    //8/9 - mask
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(8));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(0),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(16),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(0));
+    vs[i++] = VTX_DWORD_PAD;
+
     // comp mask ps ---------------------------------------
     // not yet
 
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index d5add19..4a7391c 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -118,9 +118,16 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     int uv_offset;
 
     static float ps_alu_consts[] = {
-	1.0,  0.0,      1.13983,  -1.13983/2,        // r - c[0]
-	1.0, -0.39465, -0.5806,  (0.39465+0.5806)/2, // g - c[1]
-	1.0,  2.03211,  0.0,     -2.03211/2,         // b - c[2]
+        1.0,  0.0,      1.4020,    0,  // r - c[0]
+        1.0, -0.34414, -0.71414,  0,  // g - c[1]
+        1.0,  1.7720,   0.0,        0,  // b - c[2]
+	/* Constants for undoing Y'CbCr scaling
+	 *  - Y' is scaled from 16:235
+	 *  - Cb/Cr are scaled from 16:240
+	 * Unscaled value N' = N * N_mul + N_shift (N' in range [-0.5, 0.5])
+	 * Vector is [Y_mul, Y_shift, C_mul, C_shift]
+	 */
+        256.0/219.0, -16.0/219.0, 256.0/224.0, -128.0/224.0,
     };
 
     CLEAR (cb_conf);
commit 4fd7228de7ad88edf825dbc3039df877795a9479
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 11 14:46:02 2009 -0500

    R6xx/R7xx: handle running out of vertex buffer space
    
    draw current VB and start a new one.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 8a2a009..a0f227b 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -41,6 +41,12 @@
 extern PixmapPtr
 RADEONGetDrawablePixmap(DrawablePtr pDrawable);
 
+static void
+R600DoneSolid(PixmapPtr pPix);
+
+static void
+R600DoneComposite(PixmapPtr pDst);
+
 //#define SHOW_VERTEXES
 
 #       define RADEON_ROP3_ZERO             0x00000000
@@ -264,7 +270,9 @@ R600Solid(PixmapPtr pPix, int x1, int y1, int x2, int y2)
 
     if (((accel_state->vb_index + 3) * 8) > (accel_state->ib->total / 2)) {
 	ErrorF("Solid: Ran out of VB space!\n");
-	return;
+	R600DoneSolid(pPix);
+	accel_state->ib = RADEONCPGetBuffer(pScrn);
+	accel_state->vb_index = 0;
     }
 
     vertex[0].x = (float)x1;
@@ -607,7 +615,9 @@ R600AppendCopyVertex(ScrnInfoPtr pScrn,
 
     if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
 	ErrorF("Copy: Ran out of VB space!\n");
-	return;
+	R600DoCopy(pScrn);
+	accel_state->ib = RADEONCPGetBuffer(pScrn);
+	accel_state->vb_index = 0;
     }
 
     vertex[0].x = (float)dstX;
@@ -1913,7 +1923,9 @@ static void R600Composite(PixmapPtr pDst,
 
 	if (((accel_state->vb_index + 3) * 24) > (accel_state->ib->total / 2)) {
 	    ErrorF("Composite: Ran out of VB space!\n");
-	    return;
+	    R600DoneComposite(pDst);
+	    accel_state->ib = RADEONCPGetBuffer(pScrn);
+	    accel_state->vb_index = 0;
 	}
 
 	maskTopLeft.x     = IntToxFixed(maskX);
@@ -1974,7 +1986,9 @@ static void R600Composite(PixmapPtr pDst,
 
 	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
 	    ErrorF("Composite: Ran out of VB space!\n");
-	    return;
+	    R600DoneComposite(pDst);
+	    accel_state->ib = RADEONCPGetBuffer(pScrn);
+	    accel_state->vb_index = 0;
 	}
 
 	vertex[0].x = (float)dstX;
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index e30e227..d5add19 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -44,6 +44,63 @@
 
 #include "damage.h"
 
+static void
+R600DoneXv(ScrnInfoPtr pScrn)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    draw_config_t   draw_conf;
+    vtx_resource_t  vtx_res;
+
+    CLEAR (draw_conf);
+    CLEAR (vtx_res);
+
+    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
+	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2);
+    accel_state->vb_size = accel_state->vb_index * 16;
+
+    /* flush vertex cache */
+    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
+	(info->ChipFamily == CHIP_FAMILY_RV620) ||
+	(info->ChipFamily == CHIP_FAMILY_RS780) ||
+	(info->ChipFamily == CHIP_FAMILY_RV710))
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+    else
+	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+
+    /* Vertex buffer setup */
+    vtx_res.id              = SQ_VTX_RESOURCE_vs;
+    vtx_res.vtx_size_dw     = 16 / 4;
+    vtx_res.vtx_num_entries = accel_state->vb_size / 4;
+    vtx_res.mem_req_size    = 1;
+    vtx_res.vb_addr         = accel_state->vb_mc_addr;
+    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
+
+    draw_conf.prim_type          = DI_PT_RECTLIST;
+    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
+    draw_conf.num_instances      = 1;
+    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
+    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
+
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
+
+    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
+    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
+    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
+
+    draw_auto(pScrn, accel_state->ib, &draw_conf);
+
+    wait_3d_idle_clean(pScrn, accel_state->ib);
+
+    /* sync destination surface */
+    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
+			accel_state->dst_size, accel_state->dst_mc_addr);
+
+    R600CPFlushIndirect(pScrn, accel_state->ib);
+}
 
 void
 R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
@@ -58,8 +115,6 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     tex_resource_t  tex_res;
     tex_sampler_t   tex_samp;
     shader_config_t vs_conf, ps_conf;
-    draw_config_t   draw_conf;
-    vtx_resource_t  vtx_res;
     int uv_offset;
 
     static float ps_alu_consts[] = {
@@ -73,8 +128,6 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     CLEAR (tex_samp);
     CLEAR (vs_conf);
     CLEAR (ps_conf);
-    CLEAR (draw_conf);
-    CLEAR (vtx_res);
 
     accel_state->dst_pitch = exaGetPixmapPitch(pPixmap) / (pPixmap->drawable.bitsPerPixel / 8);
     accel_state->src_pitch[0] = pPriv->src_pitch;
@@ -409,7 +462,9 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
 	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
 	    ErrorF("Xv: Ran out of VB space!\n");
-	    break;
+	    R600DoneXv(pScrn);
+	    accel_state->ib = RADEONCPGetBuffer(pScrn);
+	    accel_state->vb_index = 0;
 	}
 
 	dstX = pBox->x1 + dstxoff;
@@ -454,57 +509,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	pBox++;
     }
 
-    if (accel_state->vb_index == 0) {
-	R600IBDiscard(pScrn, accel_state->ib);
-	DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
-	return;
-    }
-
-    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
-	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2);
-    accel_state->vb_size = accel_state->vb_index * 16;
-
-    /* flush vertex cache */
-    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
-	(info->ChipFamily == CHIP_FAMILY_RV620) ||
-	(info->ChipFamily == CHIP_FAMILY_RS780) ||
-	(info->ChipFamily == CHIP_FAMILY_RV710))
-	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
-    else
-	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
-			    accel_state->vb_size, accel_state->vb_mc_addr);
-
-    /* Vertex buffer setup */
-    vtx_res.id              = SQ_VTX_RESOURCE_vs;
-    vtx_res.vtx_size_dw     = 16 / 4;
-    vtx_res.vtx_num_entries = accel_state->vb_size / 4;
-    vtx_res.mem_req_size    = 1;
-    vtx_res.vb_addr         = accel_state->vb_mc_addr;
-    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
-
-    draw_conf.prim_type          = DI_PT_RECTLIST;
-    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
-    draw_conf.num_instances      = 1;
-    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
-    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
-
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
-    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
-
-    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
-    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
-    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
-
-    draw_auto(pScrn, accel_state->ib, &draw_conf);
-
-    wait_3d_idle_clean(pScrn, accel_state->ib);
-
-    /* sync destination surface */
-    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
-			accel_state->dst_size, accel_state->dst_mc_addr);
-
-    R600CPFlushIndirect(pScrn, accel_state->ib);
+    R600DoneXv(pScrn);
 
     DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
 }
commit ec60ef094762901ede4df5cde55c3f162e8c667b
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 11 13:05:27 2009 -0500

    R6xx/R7xx: be more verbose about what function ran out of VB space

diff --git a/src/r600_exa.c b/src/r600_exa.c
index a7d058b..8a2a009 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -263,7 +263,7 @@ R600Solid(PixmapPtr pPix, int x1, int y1, int x2, int y2)
     struct r6xx_solid_vertex *solid_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
 
     if (((accel_state->vb_index + 3) * 8) > (accel_state->ib->total / 2)) {
-	ErrorF("Ran out of VB space!\n");
+	ErrorF("Solid: Ran out of VB space!\n");
 	return;
     }
 
@@ -606,7 +606,7 @@ R600AppendCopyVertex(ScrnInfoPtr pScrn,
     struct r6xx_copy_vertex vertex[3];
 
     if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
-	ErrorF("Ran out of VB space!\n");
+	ErrorF("Copy: Ran out of VB space!\n");
 	return;
     }
 
@@ -1912,7 +1912,7 @@ static void R600Composite(PixmapPtr pDst,
 	xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
 
 	if (((accel_state->vb_index + 3) * 24) > (accel_state->ib->total / 2)) {
-	    ErrorF("Ran out of VB space!\n");
+	    ErrorF("Composite: Ran out of VB space!\n");
 	    return;
 	}
 
@@ -1973,7 +1973,7 @@ static void R600Composite(PixmapPtr pDst,
 	struct r6xx_comp_vertex vertex[3];
 
 	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
-	    ErrorF("Ran out of VB space!\n");
+	    ErrorF("Composite: Ran out of VB space!\n");
 	    return;
 	}
 
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 993a8d4..e30e227 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -408,7 +408,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	struct r6xx_copy_vertex vertex[3];
 
 	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
-	    ErrorF("Ran out of VB space!\n");
+	    ErrorF("Xv: Ran out of VB space!\n");
 	    break;
 	}
 
commit e7dedbc355970407eddb38370b58a8c96d204c1d
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 11 12:45:14 2009 -0500

    R6xx/R7xx: Move engine idle to sync functions

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 6fd922c..a7d058b 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -2259,9 +2259,8 @@ R600MarkSync(ScreenPtr pScreen)
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
 
-    accel_state->exaSyncMarker++;
+    return ++accel_state->exaSyncMarker;
 
-    return accel_state->exaSyncMarker;
 }
 
 static void
@@ -2271,8 +2270,11 @@ R600Sync(ScreenPtr pScreen, int marker)
     RADEONInfoPtr info = RADEONPTR(pScrn);
     struct radeon_accel_state *accel_state = info->accel_state;
 
-    if (accel_state->exaMarkerSynced != marker)
+    if (accel_state->exaMarkerSynced != marker) {
+	R600WaitforIdlePoll(pScrn);
 	accel_state->exaMarkerSynced = marker;
+    }
+
 }
 
 static Bool
@@ -4162,8 +4164,6 @@ R600PrepareAccess(PixmapPtr pPix, int index)
     RADEONInfoPtr info = RADEONPTR(pScrn);
     unsigned char *RADEONMMIO = info->MMIO;
 
-    R600WaitforIdlePoll(pScrn);
-
     //flush HDP read/write caches
     OUTREG(HDP_MEM_COHERENCY_FLUSH_CNTL, 0x1);
 
commit 4a759a907ecd571460c7ff64cd6288380359b04b
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 11 12:29:50 2009 -0500

    adjust alignment

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index e70a012..cbedb7e 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -291,7 +291,7 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 	if (info->ChipFamily >= CHIP_FAMILY_R600)
 	    pPriv->video_offset = radeon_legacy_allocate_memory(pScrn,
 								&pPriv->video_memory,
-								size * 2, 512);
+								size * 2, 256);
 	else
 	    pPriv->video_offset = radeon_legacy_allocate_memory(pScrn,
 								&pPriv->video_memory,
commit d6c50b221cbab6e726948e2310c1def9fc38da64
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 11 11:05:02 2009 -0500

    R6xx/R7xx Xv: add support for packed uploads

diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index eeaf990..e70a012 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -195,22 +195,18 @@ R600CopyPlanar(ScrnInfoPtr pScrn,
 }
 
 static void
-CopyPackedtoNV12(unsigned char *src, unsigned char *dst,
-		 int srcPitch, int dstPitch,
-		 int w, int h, int id)
+R600CopyPacked(ScrnInfoPtr pScrn,
+	       unsigned char *src, uint32_t dst_mc_addr,
+	       int srcPitch, int dstPitch,
+	       int w, int h)
 {
-    int i;
 
-    if (srcPitch == dstPitch) {
-        memcpy(dst, src, srcPitch * h);
-	dst += (dstPitch * h);
-    } else {
-	for (i = 0; i < h; i++) {
-            memcpy(dst, src, srcPitch);
-            src += srcPitch;
-            dst += dstPitch;
-        }
-    }
+    /* YUV */
+    R600CopyToVRAM(pScrn,
+		   (char *)src, srcPitch,
+		   dstPitch >> 2, dst_mc_addr, h, 32,
+		   0, 0, w >> 1, h);
+
 }
 
 static int
@@ -410,9 +406,9 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     case FOURCC_YUY2:
     default:
 	if (info->ChipFamily >= CHIP_FAMILY_R600) {
-	    CopyPackedtoNV12(buf, pPriv->src_addr,
-			     2 * width, pPriv->src_pitch,
-			     width, height, id);
+	    R600CopyPacked(pScrn, buf, pPriv->src_offset,
+			   2 * width, pPriv->src_pitch,
+			   width, height);
 	} else {
 	    nlines = ((y2 + 0xffff) >> 16) - top;
 	    RADEONCopyData(pScrn, buf, pPriv->src_addr, srcPitch, dstPitch, nlines, npixels, 2);
commit 8e437e996cc3f2c424c342701f4aa6bcf72ad08e
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Wed Feb 11 10:53:50 2009 -0500

    R6xx/R7xx Xv: Add native support for packed formats

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 598a5ac..6fd922c 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -2305,7 +2305,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
     accel_state->comp_mask_vs_offset = 3072;
     accel_state->comp_mask_ps_offset = 3584;
     accel_state->xv_vs_offset = 4096;
-    accel_state->xv_ps_offset_nv12 = 4608;
+    accel_state->xv_ps_offset_packed = 4608;
     accel_state->xv_ps_offset_planar = 5120;
 
     // solid vs ---------------------------------------
@@ -2795,8 +2795,8 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			 MEGA_FETCH(0));
     vs[i++] = VTX_DWORD_PAD;
 
-    // xv ps nv12 ----------------------------------
-    i = accel_state->xv_ps_offset_nv12 / 4;
+    // xv ps packed ----------------------------------
+    i = accel_state->xv_ps_offset_packed / 4;
     // 0
     ps[i++] = CF_DWORD0(ADDR(20));
     ps[i++] = CF_DWORD1(POP_COUNT(0),
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 82de88a..993a8d4 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -119,7 +119,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     case FOURCC_YUY2:
     default:
 	accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	    accel_state->xv_ps_offset_nv12;
+	    accel_state->xv_ps_offset_packed;
 	break;
     }
 
@@ -268,14 +268,17 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	tex_res.id                  = 0;
 	tex_res.w                   = pPriv->w;
 	tex_res.h                   = pPriv->h;
-	tex_res.pitch               = accel_state->src_pitch[0];
+	tex_res.pitch               = accel_state->src_pitch[0] >> 1;
 	tex_res.depth               = 0;
 	tex_res.dim                 = SQ_TEX_DIM_2D;
 	tex_res.base                = accel_state->src_mc_addr[0];
 	tex_res.mip_base            = accel_state->src_mc_addr[0];
 
-	tex_res.format              = FMT_8;
-	tex_res.dst_sel_x           = SQ_SEL_X; //Y
+	tex_res.format              = FMT_8_8;
+	if (pPriv->id == FOURCC_UYVY)
+	    tex_res.dst_sel_x           = SQ_SEL_Y; //Y
+	else
+	    tex_res.dst_sel_x           = SQ_SEL_X; //Y
 	tex_res.dst_sel_y           = SQ_SEL_1;
 	tex_res.dst_sel_z           = SQ_SEL_1;
 	tex_res.dst_sel_w           = SQ_SEL_1;
@@ -302,26 +305,24 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
 
 	// UV texture
-	uv_offset = accel_state->src_pitch[0] * pPriv->h;
-	uv_offset = (uv_offset + 255) & ~255;
-
-	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			    accel_state->src_size[0] / 2,
-			    accel_state->src_mc_addr[0] + uv_offset);
-
 	tex_res.id                  = 1;
-	tex_res.format              = FMT_8_8;
+	tex_res.format              = FMT_8_8_8_8;
 	tex_res.w                   = pPriv->w >> 1;
-	tex_res.h                   = pPriv->h >> 1;
-	tex_res.pitch               = accel_state->src_pitch[0] >> 1;
-	tex_res.dst_sel_x           = SQ_SEL_Y; //V
-	tex_res.dst_sel_y           = SQ_SEL_X; //U
+	tex_res.h                   = pPriv->h;
+	tex_res.pitch               = accel_state->src_pitch[0] >> 2;
+	if (pPriv->id == FOURCC_UYVY) {
+	    tex_res.dst_sel_x           = SQ_SEL_X; //V
+	    tex_res.dst_sel_y           = SQ_SEL_Z; //U
+	} else {
+	    tex_res.dst_sel_x           = SQ_SEL_Y; //V
+	    tex_res.dst_sel_y           = SQ_SEL_W; //U
+	}
 	tex_res.dst_sel_z           = SQ_SEL_1;
 	tex_res.dst_sel_w           = SQ_SEL_1;
 	tex_res.interlaced          = 0;
 	// XXX tex bases need to be 256B aligned
-	tex_res.base                = accel_state->src_mc_addr[0] + uv_offset;
-	tex_res.mip_base            = accel_state->src_mc_addr[0] + uv_offset;
+	tex_res.base                = accel_state->src_mc_addr[0];
+	tex_res.mip_base            = accel_state->src_mc_addr[0];
 	set_tex_resource            (pScrn, accel_state->ib, &tex_res);
 
 	// UV sampler
diff --git a/src/radeon.h b/src/radeon.h
index 2974cdf..9b42afd 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -658,7 +658,7 @@ struct radeon_accel_state {
     uint32_t          comp_mask_vs_offset;
     uint32_t          comp_mask_ps_offset;
     uint32_t          xv_vs_offset;
-    uint32_t          xv_ps_offset_nv12;
+    uint32_t          xv_ps_offset_packed;
     uint32_t          xv_ps_offset_planar;
 
     //size/addr stuff
diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 6097ab5..eeaf990 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -199,33 +199,17 @@ CopyPackedtoNV12(unsigned char *src, unsigned char *dst,
 		 int srcPitch, int dstPitch,
 		 int w, int h, int id)
 {
-    int i, j;
-    int uv_offset = dstPitch * h;
-    uv_offset = (uv_offset + 255) & ~255;
-
-    // FOURCC_UYVY: U0 Y0 V0 Y1
-    // FOURCC_YUY2: Y0 U0 Y1 V0
-    for (i = 0; i < h; i++) {
-	unsigned char *y = dst;
-	unsigned char *uv = (unsigned char *)dst + uv_offset;
-
-	for (j = 0; j < (w / 2); j++) {
-	    if (id == FOURCC_UYVY) {
-		uv[1] = src[(j * 4) + 0];
-		y[0]  = src[(j * 4) + 1];
-		uv[0] = src[(j * 4) + 2];
-		y[1]  = src[(j * 4) + 3];
-	    } else {
-		y[0]  = src[(j * 4) + 0];
-		uv[1] = src[(j * 4) + 1];
-		y[1]  = src[(j * 4) + 2];
-		uv[0] = src[(j * 4) + 3];
-	    }
-	    y += 2;
-	    uv += 2;
-	}
-	dst += dstPitch;
-	src += srcPitch;
+    int i;
+
+    if (srcPitch == dstPitch) {
+        memcpy(dst, src, srcPitch * h);
+	dst += (dstPitch * h);
+    } else {
+	for (i = 0; i < h; i++) {
+            memcpy(dst, src, srcPitch);
+            src += srcPitch;
+            dst += dstPitch;
+        }
     }
 }
 
@@ -298,7 +282,7 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     }
 
     if (info->ChipFamily >= CHIP_FAMILY_R600)
-	dstPitch = (dstPitch + 511) & ~511;
+	dstPitch = (dstPitch + 255) & ~255;
     else
 	dstPitch = (dstPitch + 63) & ~63;
 
commit 1b4afc1c9f8458f9ab3434418f4ccf959f532ac1
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Feb 9 19:50:37 2009 -0500

    R6xx/R7xx: Add checks to make sure we don't overrun VB space

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 9b6d197..598a5ac 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -262,6 +262,11 @@ R600Solid(PixmapPtr pPix, int x1, int y1, int x2, int y2)
     struct r6xx_solid_vertex vertex[3];
     struct r6xx_solid_vertex *solid_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
 
+    if (((accel_state->vb_index + 3) * 8) > (accel_state->ib->total / 2)) {
+	ErrorF("Ran out of VB space!\n");
+	return;
+    }
+
     vertex[0].x = (float)x1;
     vertex[0].y = (float)y1;
 
@@ -600,6 +605,11 @@ R600AppendCopyVertex(ScrnInfoPtr pScrn,
     struct r6xx_copy_vertex *copy_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
     struct r6xx_copy_vertex vertex[3];
 
+    if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
+	ErrorF("Ran out of VB space!\n");
+	return;
+    }
+
     vertex[0].x = (float)dstX;
     vertex[0].y = (float)dstY;
     vertex[0].s = (float)srcX;
@@ -1901,6 +1911,11 @@ static void R600Composite(PixmapPtr pDst,
 	struct r6xx_comp_mask_vertex vertex[3];
 	xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
 
+	if (((accel_state->vb_index + 3) * 24) > (accel_state->ib->total / 2)) {
+	    ErrorF("Ran out of VB space!\n");
+	    return;
+	}
+
 	maskTopLeft.x     = IntToxFixed(maskX);
 	maskTopLeft.y     = IntToxFixed(maskY);
 	maskTopRight.x    = IntToxFixed(maskX + w);
@@ -1957,6 +1972,11 @@ static void R600Composite(PixmapPtr pDst,
 	    (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
 	struct r6xx_comp_vertex vertex[3];
 
+	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
+	    ErrorF("Ran out of VB space!\n");
+	    return;
+	}
+
 	vertex[0].x = (float)dstX;
 	vertex[0].y = (float)dstY;
 	vertex[0].src_s = xFixedToFloat(srcTopLeft.x) / accel_state->texW[0];
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 24bba07..82de88a 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -406,6 +406,11 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	struct r6xx_copy_vertex *xv_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
 	struct r6xx_copy_vertex vertex[3];
 
+	if (((accel_state->vb_index + 3) * 16) > (accel_state->ib->total / 2)) {
+	    ErrorF("Ran out of VB space!\n");
+	    break;
+	}
+
 	dstX = pBox->x1 + dstxoff;
 	dstY = pBox->y1 + dstyoff;
 	dstw = pBox->x2 - pBox->x1;
commit e85b7a1c5948b5123eddf9dfbb9e002d74fbe9fb
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Feb 9 14:13:16 2009 -0500

    R6xx/R7xx Xv: fix cache flush buffer size for planar

diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 42a5d68..24bba07 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -205,7 +205,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	uv_offset = (uv_offset + 255) & ~255;
 
 	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			    accel_state->src_size[0] / 2,
+			    accel_state->src_size[0] / 4,
 			    accel_state->src_mc_addr[0] + uv_offset);
 
 	tex_res.id                  = 1;
@@ -232,7 +232,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 	uv_offset = (uv_offset + 255) & ~255;
 
 	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			    accel_state->src_size[0] / 2,
+			    accel_state->src_size[0] / 4,
 			    accel_state->src_mc_addr[0] + uv_offset);
 
 	tex_res.id                  = 2;
commit 2a893bac1faffd28dce6d9a9693108196c71e6d3
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Feb 9 13:18:17 2009 -0500

    R6xx/R7xx Xv: add accelerated uploads for planar formats

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 950e6ac..9b6d197 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -2076,7 +2076,7 @@ R600WaitforIdlePoll(ScrnInfoPtr pScrn)
     return FALSE;
 }
 
-static Bool
+Bool
 R600CopyToVRAM(ScrnInfoPtr pScrn,
 	       char *src, int src_pitch,
 	       uint32_t dst_pitch, uint32_t dst_mc_addr, uint32_t dst_height, int bpp,
diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 22e7d17..6097ab5 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -47,6 +47,12 @@
 extern void
 R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv);
 
+extern Bool
+R600CopyToVRAM(ScrnInfoPtr pScrn,
+	       char *src, int src_pitch,
+	       uint32_t dst_pitch, uint32_t dst_mc_addr, uint32_t dst_height, int bpp,
+	       int x, int y, int w, int h);
+
 #define IMAGE_MAX_WIDTH		2048
 #define IMAGE_MAX_HEIGHT	2048
 
@@ -154,59 +160,38 @@ static __inline__ uint32_t F_TO_24(float val)
 #endif /* XF86DRI */
 
 static void
-R600CopyPlanar(unsigned char *y_src, unsigned char *u_src, unsigned char *v_src,
-	       unsigned char *dst,
+R600CopyPlanar(ScrnInfoPtr pScrn,
+	       unsigned char *y_src, unsigned char *u_src, unsigned char *v_src,
+	       uint32_t dst_mc_addr,
 	       int srcPitch, int srcPitch2, int dstPitch,
 	       int w, int h)
 {
-    int i;
     int dstPitch2 = dstPitch >> 1;
     int h2 = h >> 1;
+    int w2 = w >> 1;
+    int v_offset, u_offset;
+    v_offset = dstPitch * h;
+    v_offset = (v_offset + 255) & ~255;
+    u_offset = v_offset + (dstPitch2 * h2);
+    u_offset = (u_offset + 255) & ~255;
 
     /* Y */
-    if (srcPitch == dstPitch) {
-        memcpy(dst, y_src, srcPitch * h);
-	dst += (dstPitch * h);
-    } else {
-	for (i = 0; i < h; i++) {
-            memcpy(dst, y_src, srcPitch);
-            y_src += srcPitch;
-            dst += dstPitch;
-        }
-    }
-
-    /* tex base need 256B alignment */
-    if (h & 1)
-	dst += dstPitch;
+    R600CopyToVRAM(pScrn,
+		   (char *)y_src, srcPitch,
+		   dstPitch, dst_mc_addr, h, 8,
+		   0, 0, w, h);
 
     /* V */
-    if (srcPitch2 == dstPitch2) {
-        memcpy(dst, v_src, srcPitch2 * h2);
-	dst += (dstPitch2 * h2);
-    } else {
-	for (i = 0; i < h2; i++) {
-            memcpy(dst, v_src, srcPitch2);
-            v_src += srcPitch2;
-            dst += dstPitch2;
-        }
-    }
-
-    /* tex base need 256B alignment */
-    if (h2 & 1)
-	dst += dstPitch2;
+    R600CopyToVRAM(pScrn,
+		   (char *)v_src, srcPitch2,
+		   dstPitch2, dst_mc_addr + v_offset, h2, 8,
+		   0, 0, w2, h2);
 
     /* U */
-    if (srcPitch2 == dstPitch2) {
-        memcpy(dst, u_src, srcPitch2 * h2);
-	dst += (dstPitch2 * h2);
-    } else {
-	for (i = 0; i < h2; i++) {
-            memcpy(dst, u_src, srcPitch2);
-            u_src += srcPitch2;
-            dst += dstPitch2;
-        }
-    }
-
+    R600CopyToVRAM(pScrn,
+		   (char *)u_src, srcPitch2,
+		   dstPitch2, dst_mc_addr + u_offset, h2, 8,
+		   0, 0, w2, h2);
 }
 
 static void
@@ -407,13 +392,13 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 	    s2offset = srcPitch * height;
 	    s3offset = (srcPitch2 * (height >> 1)) + s2offset;
 	    if (id == FOURCC_YV12)
-		R600CopyPlanar(buf, buf + s3offset, buf + s2offset,
-			       pPriv->src_addr,
+		R600CopyPlanar(pScrn, buf, buf + s3offset, buf + s2offset,
+			       pPriv->src_offset,
 			       srcPitch, srcPitch2, pPriv->src_pitch,
 			       width, height);
 	    else
-		R600CopyPlanar(buf, buf + s2offset, buf + s3offset,
-			       pPriv->src_addr,
+		R600CopyPlanar(pScrn, buf, buf + s2offset, buf + s3offset,
+			       pPriv->src_offset,
 			       srcPitch, srcPitch2, pPriv->src_pitch,
 			       width, height);
 
commit 231aee18a73805be2f6c962e94a8345dd89fd0df
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Feb 9 13:02:27 2009 -0500

    R6xx/R7xx Xv: implement native shader for planar formats

diff --git a/src/r600_exa.c b/src/r600_exa.c
index a38469a..950e6ac 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -2263,7 +2263,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
     uint32_t *vs;
     uint32_t *ps;
     // 512 bytes per shader for now
-    int size = 512 * 10;
+    int size = 512 * 11;
     int i;
 
     accel_state->shaders = NULL;
@@ -2285,7 +2285,8 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
     accel_state->comp_mask_vs_offset = 3072;
     accel_state->comp_mask_ps_offset = 3584;
     accel_state->xv_vs_offset = 4096;
-    accel_state->xv_ps_offset = 4608;
+    accel_state->xv_ps_offset_nv12 = 4608;
+    accel_state->xv_ps_offset_planar = 5120;
 
     // solid vs ---------------------------------------
     i = accel_state->solid_vs_offset / 4;
@@ -2774,8 +2775,8 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			 MEGA_FETCH(0));
     vs[i++] = VTX_DWORD_PAD;
 
-    // xv ps ---------------------------------------
-    i = accel_state->xv_ps_offset / 4;
+    // xv ps nv12 ----------------------------------
+    i = accel_state->xv_ps_offset_nv12 / 4;
     // 0
     ps[i++] = CF_DWORD0(ADDR(20));
     ps[i++] = CF_DWORD1(POP_COUNT(0),
@@ -3311,6 +3312,571 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			 SRC_SEL_W(SQ_SEL_1));
     ps[i++] = TEX_DWORD_PAD;
 
+    // xv ps planar ----------------------------------
+    i = accel_state->xv_ps_offset_planar / 4;
+    // 0
+    ps[i++] = CF_DWORD0(ADDR(20));
+    ps[i++] = CF_DWORD1(POP_COUNT(0),
+			CF_CONST(0),
+			COND(SQ_CF_COND_ACTIVE),
+			I_COUNT(3),
+			CALL_COUNT(0),
+			END_OF_PROGRAM(0),
+			VALID_PIXEL_MODE(0),
+			CF_INST(SQ_CF_INST_TEX),
+			WHOLE_QUAD_MODE(0),
+			BARRIER(0));
+    // 1
+    ps[i++] = CF_ALU_DWORD0(ADDR(3),
+			    KCACHE_BANK0(0),
+			    KCACHE_BANK1(0),
+			    KCACHE_MODE0(SQ_CF_KCACHE_NOP));
+    ps[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
+			    KCACHE_ADDR0(0),
+			    KCACHE_ADDR1(0),
+			    I_COUNT(16),
+			    USES_WATERFALL(0),
+			    CF_INST(SQ_CF_INST_ALU),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(1));
+    // 2
+    ps[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
+				      TYPE(SQ_EXPORT_PIXEL),
+				      RW_GPR(3),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(3));
+    ps[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(1),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(1));
+    // 3 - alu 0
+    // DP4 gpr[2].x gpr[1].x c[0].x
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_X),
+			 SRC0_NEG(0),
+			 SRC1_SEL(256),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_X),
+			     CLAMP(1));
+    // 4 - alu 1
+    // DP4 gpr[2].y gpr[1].y c[0].y
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Y),
+			 SRC0_NEG(0),
+			 SRC1_SEL(256),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Y),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Y),
+			     CLAMP(1));
+    // 5 - alu 2
+    // DP4 gpr[2].z gpr[1].z c[0].z
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Z),
+			 SRC0_NEG(0),
+			 SRC1_SEL(256),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Z),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Z),
+			     CLAMP(1));
+    // 6 - alu 3
+    // DP4 gpr[2].w gpr[1].w c[0].w
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_W),
+			 SRC0_NEG(0),
+			 SRC1_SEL(256),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_W),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(1));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_021),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_W),
+			     CLAMP(1));
+    // 7 - alu 4
+    // DP4 gpr[2].x gpr[1].x c[1].x
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_X),
+			 SRC0_NEG(0),
+			 SRC1_SEL(257),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_X),
+			     CLAMP(1));
+    // 8 - alu 5
+    // DP4 gpr[2].y gpr[1].y c[1].y
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Y),
+			 SRC0_NEG(0),
+			 SRC1_SEL(257),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Y),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Y),
+			     CLAMP(1));
+    // 9 - alu 6
+    // DP4 gpr[2].z gpr[1].z c[1].z
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Z),
+			 SRC0_NEG(0),
+			 SRC1_SEL(257),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Z),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Z),
+			     CLAMP(1));
+    // 10 - alu 7
+    // DP4 gpr[2].w gpr[1].w c[1].w
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_W),
+			 SRC0_NEG(0),
+			 SRC1_SEL(257),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_W),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(1));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_021),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_W),
+			     CLAMP(1));
+    // 11 - alu 8
+    // DP4 gpr[2].x gpr[1].x c[2].x
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_X),
+			 SRC0_NEG(0),
+			 SRC1_SEL(258),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_X),
+			     CLAMP(1));
+    // 12 - alu 9
+    // DP4 gpr[2].y gpr[1].y c[2].y
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Y),
+			 SRC0_NEG(0),
+			 SRC1_SEL(258),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Y),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Y),
+			     CLAMP(1));
+    // 13 - alu 10
+    // DP4 gpr[2].z gpr[1].z c[2].z
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Z),
+			 SRC0_NEG(0),
+			 SRC1_SEL(258),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Z),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Z),
+			     CLAMP(1));
+    // 14 - alu 11
+    // DP4 gpr[2].w gpr[1].w c[2].w
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_W),
+			 SRC0_NEG(0),
+			 SRC1_SEL(258),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_W),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(1));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_021),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_W),
+			     CLAMP(1));
+    // 15 - alu 12
+    // MOV gpr[3].x gpr[2].x
+    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_X),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_210),
+			     DST_GPR(3),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_X),
+			     CLAMP(0));
+    // 16 - alu 13
+    // MOV gpr[3].y gpr[2].y
+    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Y),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_210),
+			     DST_GPR(3),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Y),
+			     CLAMP(0));
+    // 17 - alu 14
+    // MOV gpr[3].z gpr[2].z
+    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Z),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_210),
+			     DST_GPR(3),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Z),
+			     CLAMP(0));
+    // 18 - alu 15
+    // MOV gpr[3].w gpr[2].w
+    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_W),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(1));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(3),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_W),
+			     CLAMP(0));
+    // 19 - alignment
+    ps[i++] = 0x00000000;
+    ps[i++] = 0x00000000;
+    // 20/21 - tex 0
+    ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			 BC_FRAC_MODE(0),
+			 FETCH_WHOLE_QUAD(0),
+			 RESOURCE_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 R7xx_ALT_CONST(0));
+    ps[i++] = TEX_DWORD1(DST_GPR(1),
+			 DST_REL(ABSOLUTE),
+			 DST_SEL_X(SQ_SEL_X),    //R
+			 DST_SEL_Y(SQ_SEL_MASK), //G
+			 DST_SEL_Z(SQ_SEL_MASK), //B
+			 DST_SEL_W(SQ_SEL_1),    //A
+			 LOD_BIAS(0),
+			 COORD_TYPE_X(TEX_NORMALIZED),
+			 COORD_TYPE_Y(TEX_NORMALIZED),
+			 COORD_TYPE_Z(TEX_NORMALIZED),
+			 COORD_TYPE_W(TEX_NORMALIZED));
+    ps[i++] = TEX_DWORD2(OFFSET_X(0),
+			 OFFSET_Y(0),
+			 OFFSET_Z(0),
+			 SAMPLER_ID(0),
+			 SRC_SEL_X(SQ_SEL_X),
+			 SRC_SEL_Y(SQ_SEL_Y),
+			 SRC_SEL_Z(SQ_SEL_0),
+			 SRC_SEL_W(SQ_SEL_1));
+    ps[i++] = TEX_DWORD_PAD;
+    // 22/23 - tex 1
+    ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			 BC_FRAC_MODE(0),
+			 FETCH_WHOLE_QUAD(0),
+			 RESOURCE_ID(1),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 R7xx_ALT_CONST(0));
+    ps[i++] = TEX_DWORD1(DST_GPR(1),
+			 DST_REL(ABSOLUTE),
+			 DST_SEL_X(SQ_SEL_MASK), //R
+			 DST_SEL_Y(SQ_SEL_MASK), //G
+			 DST_SEL_Z(SQ_SEL_X),    //B
+			 DST_SEL_W(SQ_SEL_MASK), //A
+			 LOD_BIAS(0),
+			 COORD_TYPE_X(TEX_NORMALIZED),
+			 COORD_TYPE_Y(TEX_NORMALIZED),
+			 COORD_TYPE_Z(TEX_NORMALIZED),
+			 COORD_TYPE_W(TEX_NORMALIZED));
+    ps[i++] = TEX_DWORD2(OFFSET_X(0),
+			 OFFSET_Y(0),
+			 OFFSET_Z(0),
+			 SAMPLER_ID(1),
+			 SRC_SEL_X(SQ_SEL_X),
+			 SRC_SEL_Y(SQ_SEL_Y),
+			 SRC_SEL_Z(SQ_SEL_0),
+			 SRC_SEL_W(SQ_SEL_1));
+    ps[i++] = TEX_DWORD_PAD;
+    // 24/25 - tex 2
+    ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			 BC_FRAC_MODE(0),
+			 FETCH_WHOLE_QUAD(0),
+			 RESOURCE_ID(2),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 R7xx_ALT_CONST(0));
+    ps[i++] = TEX_DWORD1(DST_GPR(1),
+			 DST_REL(ABSOLUTE),
+			 DST_SEL_X(SQ_SEL_MASK), //R
+			 DST_SEL_Y(SQ_SEL_X),    //G
+			 DST_SEL_Z(SQ_SEL_MASK), //B
+			 DST_SEL_W(SQ_SEL_MASK), //A
+			 LOD_BIAS(0),
+			 COORD_TYPE_X(TEX_NORMALIZED),
+			 COORD_TYPE_Y(TEX_NORMALIZED),
+			 COORD_TYPE_Z(TEX_NORMALIZED),
+			 COORD_TYPE_W(TEX_NORMALIZED));
+    ps[i++] = TEX_DWORD2(OFFSET_X(0),
+			 OFFSET_Y(0),
+			 OFFSET_Z(0),
+			 SAMPLER_ID(2),
+			 SRC_SEL_X(SQ_SEL_X),
+			 SRC_SEL_Y(SQ_SEL_Y),
+			 SRC_SEL_Z(SQ_SEL_0),
+			 SRC_SEL_W(SQ_SEL_1));
+    ps[i++] = TEX_DWORD_PAD;
+
     // comp mask vs ---------------------------------------
     i = accel_state->comp_mask_vs_offset / 4;
     //0
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 222740e..42a5d68 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -108,8 +108,20 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
 
     accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
 	accel_state->xv_vs_offset;
-    accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
-	accel_state->xv_ps_offset;
+
+    switch(pPriv->id) {
+    case FOURCC_YV12:
+    case FOURCC_I420:
+	accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->xv_ps_offset_planar;
+	break;
+    case FOURCC_UYVY:
+    case FOURCC_YUY2:
+    default:
+	accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->xv_ps_offset_nv12;
+	break;
+    }
 
     accel_state->vs_size = 512;
     accel_state->ps_size = 512;
@@ -141,76 +153,182 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     set_alu_consts(pScrn, accel_state->ib, 0, sizeof(ps_alu_consts) / SQ_ALU_CONSTANT_offset, ps_alu_consts);
 
     /* Texture */
-    accel_state->src_mc_addr[0] = pPriv->src_offset;
-    accel_state->src_size[0] = exaGetPixmapPitch(pPixmap) * pPriv->w;
-
-    /* flush texture cache */
-    cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, accel_state->src_size[0],
-			accel_state->src_mc_addr[0]);
-
-    // Y texture
-    tex_res.id                  = 0;
-    tex_res.w                   = pPriv->w;
-    tex_res.h                   = pPriv->h;
-    tex_res.pitch               = accel_state->src_pitch[0];
-    tex_res.depth               = 0;
-    tex_res.dim                 = SQ_TEX_DIM_2D;
-    tex_res.base                = accel_state->src_mc_addr[0];
-    tex_res.mip_base            = accel_state->src_mc_addr[0];
-
-    tex_res.format              = FMT_8;
-    tex_res.dst_sel_x           = SQ_SEL_X; //Y
-    tex_res.dst_sel_y           = SQ_SEL_1;
-    tex_res.dst_sel_z           = SQ_SEL_1;
-    tex_res.dst_sel_w           = SQ_SEL_1;
-
-    tex_res.request_size        = 1;
-    tex_res.base_level          = 0;
-    tex_res.last_level          = 0;
-    tex_res.perf_modulation     = 0;
-    tex_res.interlaced          = 0;
-    set_tex_resource            (pScrn, accel_state->ib, &tex_res);
-
-    // UV texture
-    uv_offset = accel_state->src_pitch[0] * pPriv->h;
-    uv_offset = (uv_offset + 255) & ~255;
-
-    cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
-			accel_state->src_size[0] / 2,
-			accel_state->src_mc_addr[0] + uv_offset);
-
-    tex_res.id                  = 1;
-    tex_res.format              = FMT_8_8;
-    tex_res.w                   = pPriv->w >> 1;
-    tex_res.h                   = pPriv->h >> 1;
-    tex_res.pitch               = accel_state->src_pitch[0] >> 1;
-    tex_res.dst_sel_x           = SQ_SEL_Y; //V
-    tex_res.dst_sel_y           = SQ_SEL_X; //U
-    tex_res.dst_sel_z           = SQ_SEL_1;
-    tex_res.dst_sel_w           = SQ_SEL_1;
-    tex_res.interlaced          = 0;
-    // XXX tex bases need to be 256B aligned
-    tex_res.base                = accel_state->src_mc_addr[0] + uv_offset;
-    tex_res.mip_base            = accel_state->src_mc_addr[0] + uv_offset;
-    set_tex_resource            (pScrn, accel_state->ib, &tex_res);
-
-    // Y sampler
-    tex_samp.id                 = 0;
-    tex_samp.clamp_x            = SQ_TEX_CLAMP_LAST_TEXEL;
-    tex_samp.clamp_y            = SQ_TEX_CLAMP_LAST_TEXEL;
-    tex_samp.clamp_z            = SQ_TEX_WRAP;
-
-    // xxx: switch to bicubic
-    tex_samp.xy_mag_filter      = SQ_TEX_XY_FILTER_BILINEAR;
-    tex_samp.xy_min_filter      = SQ_TEX_XY_FILTER_BILINEAR;
-
-    tex_samp.z_filter           = SQ_TEX_Z_FILTER_NONE;
-    tex_samp.mip_filter         = 0;			/* no mipmap */
-    set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
-
-    // UV sampler
-    tex_samp.id                 = 1;
-    set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
+    switch(pPriv->id) {
+    case FOURCC_YV12:
+    case FOURCC_I420:
+	accel_state->src_mc_addr[0] = pPriv->src_offset;
+	accel_state->src_size[0] = accel_state->src_pitch[0] * pPriv->h;
+
+	/* flush texture cache */
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, accel_state->src_size[0],
+			    accel_state->src_mc_addr[0]);
+
+	// Y texture
+	tex_res.id                  = 0;
+	tex_res.w                   = pPriv->w;
+	tex_res.h                   = pPriv->h;
+	tex_res.pitch               = accel_state->src_pitch[0];
+	tex_res.depth               = 0;
+	tex_res.dim                 = SQ_TEX_DIM_2D;
+	tex_res.base                = accel_state->src_mc_addr[0];
+	tex_res.mip_base            = accel_state->src_mc_addr[0];
+
+	tex_res.format              = FMT_8;
+	tex_res.dst_sel_x           = SQ_SEL_X; //Y
+	tex_res.dst_sel_y           = SQ_SEL_1;
+	tex_res.dst_sel_z           = SQ_SEL_1;
+	tex_res.dst_sel_w           = SQ_SEL_1;
+
+	tex_res.request_size        = 1;
+	tex_res.base_level          = 0;
+	tex_res.last_level          = 0;
+	tex_res.perf_modulation     = 0;
+	tex_res.interlaced          = 0;
+	set_tex_resource            (pScrn, accel_state->ib, &tex_res);
+
+	// Y sampler
+	tex_samp.id                 = 0;
+	tex_samp.clamp_x            = SQ_TEX_CLAMP_LAST_TEXEL;
+	tex_samp.clamp_y            = SQ_TEX_CLAMP_LAST_TEXEL;
+	tex_samp.clamp_z            = SQ_TEX_WRAP;
+
+	// xxx: switch to bicubic
+	tex_samp.xy_mag_filter      = SQ_TEX_XY_FILTER_BILINEAR;
+	tex_samp.xy_min_filter      = SQ_TEX_XY_FILTER_BILINEAR;
+
+	tex_samp.z_filter           = SQ_TEX_Z_FILTER_NONE;
+	tex_samp.mip_filter         = 0;			/* no mipmap */
+	set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
+
+	// U or V texture
+	uv_offset = accel_state->src_pitch[0] * pPriv->h;
+	uv_offset = (uv_offset + 255) & ~255;
+
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			    accel_state->src_size[0] / 2,
+			    accel_state->src_mc_addr[0] + uv_offset);
+
+	tex_res.id                  = 1;
+	tex_res.format              = FMT_8;
+	tex_res.w                   = pPriv->w >> 1;
+	tex_res.h                   = pPriv->h >> 1;
+	tex_res.pitch               = accel_state->src_pitch[0] >> 1;
+	tex_res.dst_sel_x           = SQ_SEL_X; //V or U
+	tex_res.dst_sel_y           = SQ_SEL_1;
+	tex_res.dst_sel_z           = SQ_SEL_1;
+	tex_res.dst_sel_w           = SQ_SEL_1;
+	tex_res.interlaced          = 0;
+	// XXX tex bases need to be 256B aligned
+	tex_res.base                = accel_state->src_mc_addr[0] + uv_offset;
+	tex_res.mip_base            = accel_state->src_mc_addr[0] + uv_offset;
+	set_tex_resource            (pScrn, accel_state->ib, &tex_res);
+
+	// U or V sampler
+	tex_samp.id                 = 1;
+	set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
+
+	// U or V texture
+	uv_offset += ((accel_state->src_pitch[0] >> 1) * (pPriv->h >> 1));
+	uv_offset = (uv_offset + 255) & ~255;
+
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			    accel_state->src_size[0] / 2,
+			    accel_state->src_mc_addr[0] + uv_offset);
+
+	tex_res.id                  = 2;
+	tex_res.format              = FMT_8;
+	tex_res.w                   = pPriv->w >> 1;
+	tex_res.h                   = pPriv->h >> 1;
+	tex_res.pitch               = accel_state->src_pitch[0] >> 1;
+	tex_res.dst_sel_x           = SQ_SEL_X; //V or U
+	tex_res.dst_sel_y           = SQ_SEL_1;
+	tex_res.dst_sel_z           = SQ_SEL_1;
+	tex_res.dst_sel_w           = SQ_SEL_1;
+	tex_res.interlaced          = 0;
+	// XXX tex bases need to be 256B aligned
+	tex_res.base                = accel_state->src_mc_addr[0] + uv_offset;
+	tex_res.mip_base            = accel_state->src_mc_addr[0] + uv_offset;
+	set_tex_resource            (pScrn, accel_state->ib, &tex_res);
+
+	// U or V sampler
+	tex_samp.id                 = 2;
+	set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
+	break;
+    case FOURCC_UYVY:
+    case FOURCC_YUY2:
+    default:
+	accel_state->src_mc_addr[0] = pPriv->src_offset;
+	accel_state->src_size[0] = accel_state->src_pitch[0] * pPriv->h;
+
+	/* flush texture cache */
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, accel_state->src_size[0],
+			    accel_state->src_mc_addr[0]);
+
+	// Y texture
+	tex_res.id                  = 0;
+	tex_res.w                   = pPriv->w;
+	tex_res.h                   = pPriv->h;
+	tex_res.pitch               = accel_state->src_pitch[0];
+	tex_res.depth               = 0;
+	tex_res.dim                 = SQ_TEX_DIM_2D;
+	tex_res.base                = accel_state->src_mc_addr[0];
+	tex_res.mip_base            = accel_state->src_mc_addr[0];
+
+	tex_res.format              = FMT_8;
+	tex_res.dst_sel_x           = SQ_SEL_X; //Y
+	tex_res.dst_sel_y           = SQ_SEL_1;
+	tex_res.dst_sel_z           = SQ_SEL_1;
+	tex_res.dst_sel_w           = SQ_SEL_1;
+
+	tex_res.request_size        = 1;
+	tex_res.base_level          = 0;
+	tex_res.last_level          = 0;
+	tex_res.perf_modulation     = 0;
+	tex_res.interlaced          = 0;
+	set_tex_resource            (pScrn, accel_state->ib, &tex_res);
+
+	// Y sampler
+	tex_samp.id                 = 0;
+	tex_samp.clamp_x            = SQ_TEX_CLAMP_LAST_TEXEL;
+	tex_samp.clamp_y            = SQ_TEX_CLAMP_LAST_TEXEL;
+	tex_samp.clamp_z            = SQ_TEX_WRAP;
+
+	// xxx: switch to bicubic
+	tex_samp.xy_mag_filter      = SQ_TEX_XY_FILTER_BILINEAR;
+	tex_samp.xy_min_filter      = SQ_TEX_XY_FILTER_BILINEAR;
+
+	tex_samp.z_filter           = SQ_TEX_Z_FILTER_NONE;
+	tex_samp.mip_filter         = 0;			/* no mipmap */
+	set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
+
+	// UV texture
+	uv_offset = accel_state->src_pitch[0] * pPriv->h;
+	uv_offset = (uv_offset + 255) & ~255;
+
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			    accel_state->src_size[0] / 2,
+			    accel_state->src_mc_addr[0] + uv_offset);
+
+	tex_res.id                  = 1;
+	tex_res.format              = FMT_8_8;
+	tex_res.w                   = pPriv->w >> 1;
+	tex_res.h                   = pPriv->h >> 1;
+	tex_res.pitch               = accel_state->src_pitch[0] >> 1;
+	tex_res.dst_sel_x           = SQ_SEL_Y; //V
+	tex_res.dst_sel_y           = SQ_SEL_X; //U
+	tex_res.dst_sel_z           = SQ_SEL_1;
+	tex_res.dst_sel_w           = SQ_SEL_1;
+	tex_res.interlaced          = 0;
+	// XXX tex bases need to be 256B aligned
+	tex_res.base                = accel_state->src_mc_addr[0] + uv_offset;
+	tex_res.mip_base            = accel_state->src_mc_addr[0] + uv_offset;
+	set_tex_resource            (pScrn, accel_state->ib, &tex_res);
+
+	// UV sampler
+	tex_samp.id                 = 1;
+	set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
+	break;
+    }
 
     /* Render setup */
     ereg  (accel_state->ib, CB_SHADER_MASK,                      (0x0f << OUTPUT0_ENABLE_shift));
diff --git a/src/radeon.h b/src/radeon.h
index 629e1ff..2974cdf 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -658,7 +658,8 @@ struct radeon_accel_state {
     uint32_t          comp_mask_vs_offset;
     uint32_t          comp_mask_ps_offset;
     uint32_t          xv_vs_offset;
-    uint32_t          xv_ps_offset;
+    uint32_t          xv_ps_offset_nv12;
+    uint32_t          xv_ps_offset_planar;
 
     //size/addr stuff
     uint32_t          src_size[2];
diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 16b2c82..22e7d17 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -154,12 +154,14 @@ static __inline__ uint32_t F_TO_24(float val)
 #endif /* XF86DRI */
 
 static void
-CopyPlanartoNV12(unsigned char *y_src, unsigned char *u_src, unsigned char *v_src,
-		 unsigned char *dst,
-		 int srcPitch, int srcPitch2, int dstPitch,
-		 int w, int h)
+R600CopyPlanar(unsigned char *y_src, unsigned char *u_src, unsigned char *v_src,
+	       unsigned char *dst,
+	       int srcPitch, int srcPitch2, int dstPitch,
+	       int w, int h)
 {
-    int i, j;
+    int i;
+    int dstPitch2 = dstPitch >> 1;
+    int h2 = h >> 1;
 
     /* Y */
     if (srcPitch == dstPitch) {
@@ -177,21 +179,34 @@ CopyPlanartoNV12(unsigned char *y_src, unsigned char *u_src, unsigned char *v_sr
     if (h & 1)
 	dst += dstPitch;
 
-    /* UV */
-    for (i = 0; i < (h >> 1); i++) {
-	unsigned char *u = u_src;
-	unsigned char *v = v_src;
-	unsigned char *uv = dst;
+    /* V */
+    if (srcPitch2 == dstPitch2) {
+        memcpy(dst, v_src, srcPitch2 * h2);
+	dst += (dstPitch2 * h2);
+    } else {
+	for (i = 0; i < h2; i++) {
+            memcpy(dst, v_src, srcPitch2);
+            v_src += srcPitch2;
+            dst += dstPitch2;
+        }
+    }
 
-	for (j = 0; j < w; j++) {
-	    uv[0] = v[j];
-	    uv[1] = u[j];
-	    uv += 2;
-	}
-	dst += dstPitch;
-	u_src += srcPitch2;
-	v_src += srcPitch2;
+    /* tex base need 256B alignment */
+    if (h2 & 1)
+	dst += dstPitch2;
+
+    /* U */
+    if (srcPitch2 == dstPitch2) {
+        memcpy(dst, u_src, srcPitch2 * h2);
+	dst += (dstPitch2 * h2);
+    } else {
+	for (i = 0; i < h2; i++) {
+            memcpy(dst, u_src, srcPitch2);
+            u_src += srcPitch2;
+            dst += dstPitch2;
+        }
     }
+
 }
 
 static void
@@ -392,15 +407,15 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 	    s2offset = srcPitch * height;
 	    s3offset = (srcPitch2 * (height >> 1)) + s2offset;
 	    if (id == FOURCC_YV12)
-		CopyPlanartoNV12(buf, buf + s3offset, buf + s2offset,
-				 pPriv->src_addr,
-				 srcPitch, srcPitch2, pPriv->src_pitch,
-				 width, height);
+		R600CopyPlanar(buf, buf + s3offset, buf + s2offset,
+			       pPriv->src_addr,
+			       srcPitch, srcPitch2, pPriv->src_pitch,
+			       width, height);
 	    else
-		CopyPlanartoNV12(buf, buf + s2offset, buf + s3offset,
-				 pPriv->src_addr,
-				 srcPitch, srcPitch2, pPriv->src_pitch,
-				 width, height);
+		R600CopyPlanar(buf, buf + s2offset, buf + s3offset,
+			       pPriv->src_addr,
+			       srcPitch, srcPitch2, pPriv->src_pitch,
+			       width, height);
 
 	} else {
 	    top &= ~1;
commit 6c76bfe8105e3cf4e7e6ea1bfe1235be2079110f
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Mon Feb 9 09:47:22 2009 -0500

    R6xx/R7xx UTS: move actual upload to separate function
    
    So it can be shared with Xv

diff --git a/src/r600_exa.c b/src/r600_exa.c
index fa99a6e..a38469a 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -2077,15 +2077,12 @@ R600WaitforIdlePoll(ScrnInfoPtr pScrn)
 }
 
 static Bool
-R600UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
-		   char *src, int src_pitch)
+R600CopyToVRAM(ScrnInfoPtr pScrn,
+	       char *src, int src_pitch,
+	       uint32_t dst_pitch, uint32_t dst_mc_addr, uint32_t dst_height, int bpp,
+	       int x, int y, int w, int h)
 {
-    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
     RADEONInfoPtr info = RADEONPTR(pScrn);
-    uint32_t dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
-    uint32_t dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
-    uint32_t dst_height = pDst->drawable.height;
-    int bpp = pDst->drawable.bitsPerPixel;
     uint32_t scratch_mc_addr;
     int wpass = w * (bpp/8);
     int scratch_pitch_bytes = (wpass + 255) & ~255;
@@ -2149,6 +2146,23 @@ R600UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
 }
 
 static Bool
+R600UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
+		   char *src, int src_pitch)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    uint32_t dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
+    uint32_t dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+    uint32_t dst_height = pDst->drawable.height;
+    int bpp = pDst->drawable.bitsPerPixel;
+
+    return R600CopyToVRAM(pScrn,
+			  src, src_pitch,
+			  dst_pitch, dst_mc_addr, dst_height, bpp,
+			  x, y, w, h);
+}
+
+static Bool
 R600DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
 		       char *dst, int dst_pitch)
 {
commit 132e4c575dc4675f4995e45f08c53c26bffd999a
Author: Yang Zhao <yang at yangman.ca>
Date:   Sat Feb 7 13:54:51 2009 -0500

    R6xx/R7xx EXA: Optimize overlapping copy
    
    Overlapping copy is now done in chunks proportional to the
    non-overlapping area.
    
    Diagonal overlaps are also handled properly.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index a4e2a4d..fa99a6e 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -675,8 +675,6 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
 	accel_state->rop = rop;
 	accel_state->planemask = planemask;
 
-	return FALSE;
-
 #ifdef SHOW_VERTEXES
 	ErrorF("same surface!\n");
 #endif
@@ -719,58 +717,90 @@ R600OverlapCopy(PixmapPtr pDst,
     struct radeon_accel_state *accel_state = info->accel_state;
     uint32_t dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
     uint32_t dst_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
-    int i;
+    int i, chunk;
 
     if (is_overlap(srcX, srcX + w, srcY, srcY + h,
 		   dstX, dstX + w, dstY, dstY + h)) {
-	if (srcY == dstY) { // left/right
+        /* Diagonally offset overlap is reduced to a horizontal-only offset by first
+         * copying the vertically non-overlapping portion, then adjusting coordinates
+         */
+	if (srcX != dstX) { // left/right or diagonal
+            if (srcY > dstY ) { // diagonal up
+                chunk = srcY - dstY;
+                R600DoPrepareCopy(pScrn,
+                                  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                  accel_state->rop, accel_state->planemask);
+                R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, chunk);
+                R600DoCopy(pScrn);
+
+                h = h - chunk;
+                srcY = srcY + chunk;
+                dstY = dstY + chunk;
+            } else if (srcY < dstY) { // diagonal down
+                chunk = dstY - srcY;
+                R600DoPrepareCopy(pScrn,
+                                  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                  accel_state->rop, accel_state->planemask);
+                R600AppendCopyVertex(pScrn, srcX, srcY + h - chunk, dstX, dstY + h - chunk, w, chunk);
+                R600DoCopy(pScrn);
+
+                h = h - chunk;
+            }
+
 	    if (srcX < dstX) { // right
 		// copy right to left
-		for (i = w; i > 0; i--) {
+                chunk = dstX - srcX;
+		for (i = w; i > 0; i -= chunk) {
 		    R600DoPrepareCopy(pScrn,
 				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      accel_state->rop, accel_state->planemask);
-
-		    R600AppendCopyVertex(pScrn, srcX + i - 1, srcY, dstX + i - 1, dstY, 1, h);
+		    R600AppendCopyVertex(pScrn, srcX + i - chunk, srcY, dstX + i - chunk, dstY, chunk, h);
 		    R600DoCopy(pScrn);
 		}
 	    } else { //left
 		// copy left to right
-		for (i = 0; i < w; i++) {
+                chunk = srcX - dstX;
+		for (i = 0; i < w; i += chunk) {
 		    R600DoPrepareCopy(pScrn,
 				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      accel_state->rop, accel_state->planemask);
 
-		    R600AppendCopyVertex(pScrn, srcX + i, srcY, dstX + i, dstY, 1, h);
+		    R600AppendCopyVertex(pScrn, srcX + i, srcY, dstX + i, dstY, chunk, h);
 		    R600DoCopy(pScrn);
 		}
 	    }
 	} else { //up/down
 	    if (srcY > dstY) { // up
 		// copy top to bottom
-		for (i = 0; i < h; i++) {
-		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      accel_state->rop, accel_state->planemask);
-
-		    R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, 1);
-		    R600DoCopy(pScrn);
-		}
+                for (i = 0; i < h; i += chunk) {
+                chunk = srcY - dstY;
+                    R600DoPrepareCopy(pScrn,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      accel_state->rop, accel_state->planemask);
+
+                    if (chunk > h - i) chunk = h - i;
+                    R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, chunk);
+                    R600DoCopy(pScrn);
+                }
 	    } else { // down
 		// copy bottom to top
-		for (i = h; i > 0; i--) {
-		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      accel_state->rop, accel_state->planemask);
-
-		    R600AppendCopyVertex(pScrn, srcX, srcY + i - 1, dstX, dstY + i - 1, w, 1);
-		    R600DoCopy(pScrn);
-		}
-	    }
+		chunk = dstY - srcY;
+                for (i = h; i > 0; i -= chunk) {
+                    R600DoPrepareCopy(pScrn,
+                                      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                      accel_state->rop, accel_state->planemask);
+
+                    if (chunk > i) chunk = i;
+                    R600AppendCopyVertex(pScrn, srcX, srcY + i - chunk, dstX, dstY + i - chunk, w, chunk);
+                    R600DoCopy(pScrn);
+                }
+            }
 	}
     } else {
 	R600DoPrepareCopy(pScrn,
commit 8e9ef8ff581892cbe1b7ea56d48b9a1abd70179d
Author: Pierre Ossman <pierre at ossman.eu>
Date:   Sat Feb 7 18:57:47 2009 +0100

    Xv vsync support on r6xx/r7xx cards.

diff --git a/src/r600_reg.h b/src/r600_reg.h
index dfe4703..9036e2a 100644
--- a/src/r600_reg.h
+++ b/src/r600_reg.h
@@ -115,4 +115,18 @@ enum {
     IT_SURFACE_BASE_UPDATE               = 0x73,
 } ;
 
+/* IT_WAIT_REG_MEM operation encoding */
+
+#define IT_WAIT_ALWAYS          (0<<0)
+#define IT_WAIT_LT              (1<<0)
+#define IT_WAIT_LE              (2<<0)
+#define IT_WAIT_EQ              (3<<0)
+#define IT_WAIT_NE              (4<<0)
+#define IT_WAIT_GE              (5<<0)
+#define IT_WAIT_GT              (6<<0)
+#define IT_WAIT_REG             (0<<4)
+#define IT_WAIT_MEM             (1<<4)
+
+#define IT_WAIT_ADDR(x)         ((x) >> 2)
+
 #endif
diff --git a/src/r600_state.h b/src/r600_state.h
index bf9cdb5..9efd557 100644
--- a/src/r600_state.h
+++ b/src/r600_state.h
@@ -194,6 +194,8 @@ set_render_target(ScrnInfoPtr pScrn, drmBufPtr ib, cb_config_t *cb_conf);
 void
 cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type, uint32_t size, uint64_t mc_addr);
 void
+cp_wait_vline_sync(ScrnInfoPtr pScrn, drmBufPtr ib, PixmapPtr pPix, int crtc, int start, int stop, Bool enable);
+void
 fs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *fs_conf);
 void
 vs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *vs_conf);
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index 5941899..222740e 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -268,6 +268,18 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     ereg  (accel_state->ib, SPI_INTERP_CONTROL_0,                0);
 
 
+    cp_wait_vline_sync(pScrn, accel_state->ib, pPixmap, 
+                       radeon_covering_crtc_num(pScrn,
+                                                pPriv->drw_x,
+                                                pPriv->drw_x + pPriv->dst_w,
+                                                pPriv->drw_y,
+                                                pPriv->drw_y + pPriv->dst_h,
+                                                pPriv->desired_crtc),
+                       pPriv->drw_y,
+                       pPriv->drw_y + pPriv->dst_h,
+                       pPriv->vsync);
+
+
     accel_state->vb_index = 0;
 
     while (nBox--) {
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
index 659d13d..c0e3a2b 100644
--- a/src/r6xx_accel.c
+++ b/src/r6xx_accel.c
@@ -369,14 +369,69 @@ cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type, uint32_
     ereg  (ib, CP_COHER_SIZE,                       cp_coher_size);
     ereg  (ib, CP_COHER_BASE,                       (mc_addr >> 8));
     pack3 (ib, IT_WAIT_REG_MEM, 6);
-    e32   (ib, 0x00000003);						// ME, Register, EqualTo
-    e32   (ib, CP_COHER_STATUS >> 2);
+    e32   (ib, IT_WAIT_REG | IT_WAIT_EQ);
+    e32   (ib, IT_WAIT_ADDR(CP_COHER_STATUS));
     e32   (ib, 0);
     e32   (ib, 0);							// Ref value
     e32   (ib, STATUS_bit);						// Ref mask
     e32   (ib, 10);							// Wait interval
 }
 
+/* inserts a wait for vline in the command stream */
+void cp_wait_vline_sync(ScrnInfoPtr pScrn, drmBufPtr ib, PixmapPtr pPix,
+	int crtc, int start, int stop, Bool enable)
+{
+    RADEONInfoPtr  info = RADEONPTR(pScrn);
+    xf86CrtcConfigPtr  xf86_config = XF86_CRTC_CONFIG_PTR(pScrn);
+    uint32_t offset;
+    RADEONCrtcPrivatePtr radeon_crtc;
+
+    if (!enable)
+        return;
+
+    if ((crtc < 0) || (crtc > 1))
+        return;
+
+    if (stop < start)
+        return;
+
+    if (!xf86_config->crtc[crtc]->enabled)
+        return;
+
+#ifdef USE_EXA
+    if (info->useEXA)
+        offset = exaGetPixmapOffset(pPix);
+    else
+#endif
+        offset = pPix->devPrivate.ptr - info->FB;
+
+    /* skip the wait unless drawing to the front buffer (offset 0) */
+    if (offset != 0)
+        return;
+
+    start = max(start, 0);
+    stop = min(stop, xf86_config->crtc[crtc]->mode.VDisplay);
+
+    if (start > xf86_config->crtc[crtc]->mode.VDisplay)
+        return;
+
+    radeon_crtc = xf86_config->crtc[crtc]->driver_private;
+
+    /* set the VLINE range */
+    ereg(ib, AVIVO_D1MODE_VLINE_START_END + radeon_crtc->crtc_offset,
+         (start << AVIVO_D1MODE_VLINE_START_SHIFT) |
+         (stop << AVIVO_D1MODE_VLINE_END_SHIFT));
+
+    /* tell the CP to poll the VLINE state register */
+    pack3 (ib, IT_WAIT_REG_MEM, 6);
+    e32   (ib, IT_WAIT_REG | IT_WAIT_EQ);
+    e32   (ib, IT_WAIT_ADDR(AVIVO_D1MODE_VLINE_STATUS + radeon_crtc->crtc_offset));
+    e32   (ib, 0);
+    e32   (ib, 0);                          // Ref value
+    e32   (ib, AVIVO_D1MODE_VLINE_STAT);    // Mask
+    e32   (ib, 10);                         // Wait interval
+}
+
 void
 fs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *fs_conf)
 {
diff --git a/src/radeon_reg.h b/src/radeon_reg.h
index 17f8575..7f0281a 100644
--- a/src/radeon_reg.h
+++ b/src/radeon_reg.h
@@ -3662,6 +3662,8 @@
 #       define AVIVO_D1MODE_VLINE_START_SHIFT   0
 #       define AVIVO_D1MODE_VLINE_END_SHIFT     16
 #       define AVIVO_D1MODE_VLINE_INV           (1 << 31)
+#define AVIVO_D1MODE_VLINE_STATUS               0x653c
+#       define AVIVO_D1MODE_VLINE_STAT          (1 << 12)
 #define AVIVO_D1MODE_VIEWPORT_START             0x6580
 #define AVIVO_D1MODE_VIEWPORT_SIZE              0x6584
 #define AVIVO_D1MODE_EXT_OVERSCAN_LEFT_RIGHT    0x6588
commit 2222f0fd700f100b2e91fac2babe7d1b53f56c3e
Author: Pierre Ossman <pierre at ossman.eu>
Date:   Sat Feb 7 18:56:42 2009 +0100

    Fix bad range adjustment in VLINE code.

diff --git a/src/radeon_commonfuncs.c b/src/radeon_commonfuncs.c
index 7e00384..d69a9d8 100644
--- a/src/radeon_commonfuncs.c
+++ b/src/radeon_commonfuncs.c
@@ -659,7 +659,7 @@ void FUNC_NAME(RADEONWaitForVLine)(ScrnInfoPtr pScrn, PixmapPtr pPix,
 	return;
 
     start = max(start, 0);
-    stop = max(stop, xf86_config->crtc[crtc]->mode.VDisplay);
+    stop = min(stop, xf86_config->crtc[crtc]->mode.VDisplay);
 
     if (start > xf86_config->crtc[crtc]->mode.VDisplay)
 	return;
commit 8ce6c024e31f6a3f5ae6c882738b1e64ae2944a1
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Sat Feb 7 10:32:01 2009 -0500

    R6xx/R7xx Xv: fix typos in cache flushing commands

diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
index b1cd4f1..5941899 100644
--- a/src/r600_textured_videofuncs.c
+++ b/src/r600_textured_videofuncs.c
@@ -145,7 +145,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     accel_state->src_size[0] = exaGetPixmapPitch(pPixmap) * pPriv->w;
 
     /* flush texture cache */
-    cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, 512,
+    cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit, accel_state->src_size[0],
 			accel_state->src_mc_addr[0]);
 
     // Y texture
@@ -365,7 +365,7 @@ R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
     wait_3d_idle_clean(pScrn, accel_state->ib);
 
     /* sync destination surface */
-    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit, CB0_DEST_BASE_ENA_bit),
+    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
 			accel_state->dst_size, accel_state->dst_mc_addr);
 
     R600CPFlushIndirect(pScrn, accel_state->ib);
commit 1b45936ae614244aa49b1a5d3c7fc39773c4f9b6
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Fri Feb 6 18:29:55 2009 -0500

    R6xx/R7xx EXA: Fix typo in DFS
    
    noticed by pzad in IRC

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 7d5d7dc..a4e2a4d 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -2144,7 +2144,7 @@ R600DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
 	return FALSE;
 
     scratch_mc_addr = info->gartLocation + info->dri->bufStart + (scratch->idx * scratch->total);
-    hpass = min(h, scratch->total/2 / scratch_pitch);
+    hpass = min(h, scratch->total/2 / scratch_pitch_bytes);
 
     //blit from vram to scratch
     R600DoPrepareCopy(pScrn,
@@ -2159,7 +2159,7 @@ R600DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
 	int oldhpass = hpass;
 	h -= oldhpass;
 	y += oldhpass;
-	hpass = min(h, scratch->total/2 / scratch_pitch);
+	hpass = min(h, scratch->total/2 / scratch_pitch_bytes);
 
 	if (hpass) {
 	    scratch_offset = scratch->total/2 - scratch_offset;
commit c06d89e16d5b2553142e8641e66080e1770c1563
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Fri Feb 6 18:07:22 2009 -0500

    R6xx/R7xx EXA: fallback on overlapping blits for now
    
    Leave this disabled until we get a proper solution.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 155f913..7d5d7dc 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -675,6 +675,8 @@ R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
 	accel_state->rop = rop;
 	accel_state->planemask = planemask;
 
+	return FALSE;
+
 #ifdef SHOW_VERTEXES
 	ErrorF("same surface!\n");
 #endif
commit 1d5fc3febf3470b94c423a1eda5e0683856909df
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Fri Feb 6 14:43:30 2009 -0500

    Revert "r6xx/r7xx EXA: Optimize overlapping copy"
    
    This reverts commit 0dfadc1843e0d14b9cc1ee19a72f4fd60a2c495b.
    
    This doesn't always work properly.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 88a9d91..155f913 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -717,37 +717,59 @@ R600OverlapCopy(PixmapPtr pDst,
     struct radeon_accel_state *accel_state = info->accel_state;
     uint32_t dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
     uint32_t dst_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
-    int i, chunk;
+    int i;
 
     if (is_overlap(srcX, srcX + w, srcY, srcY + h,
-		   dstX, dstX + w, dstY, dstY + h) && (srcY != dstY)) {
-        if (srcY > dstY) { // up
-            // copy top to bottom
-            chunk = srcY - dstY;
-            for (i = 0; i < h; i += chunk) {
-                if (chunk > h - i) chunk = h - i;
-                R600DoPrepareCopy(pScrn,
-                                  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                  accel_state->rop, accel_state->planemask);
-
-                R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, chunk);
-                R600DoCopy(pScrn);
-            }
-        } else { // down
-            // copy bottom to top
-            chunk = dstY - srcY;
-            for (i = h; i > 0; i -= chunk) {
-                if (chunk > i) chunk = i;
-                R600DoPrepareCopy(pScrn,
-                                  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-                                  accel_state->rop, accel_state->planemask);
-
-                R600AppendCopyVertex(pScrn, srcX, srcY + i - chunk, dstX, dstY + i - chunk, w, chunk);
-                R600DoCopy(pScrn);
-            }
-        }
+		   dstX, dstX + w, dstY, dstY + h)) {
+	if (srcY == dstY) { // left/right
+	    if (srcX < dstX) { // right
+		// copy right to left
+		for (i = w; i > 0; i--) {
+		    R600DoPrepareCopy(pScrn,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      accel_state->rop, accel_state->planemask);
+
+		    R600AppendCopyVertex(pScrn, srcX + i - 1, srcY, dstX + i - 1, dstY, 1, h);
+		    R600DoCopy(pScrn);
+		}
+	    } else { //left
+		// copy left to right
+		for (i = 0; i < w; i++) {
+		    R600DoPrepareCopy(pScrn,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      accel_state->rop, accel_state->planemask);
+
+		    R600AppendCopyVertex(pScrn, srcX + i, srcY, dstX + i, dstY, 1, h);
+		    R600DoCopy(pScrn);
+		}
+	    }
+	} else { //up/down
+	    if (srcY > dstY) { // up
+		// copy top to bottom
+		for (i = 0; i < h; i++) {
+		    R600DoPrepareCopy(pScrn,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      accel_state->rop, accel_state->planemask);
+
+		    R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, 1);
+		    R600DoCopy(pScrn);
+		}
+	    } else { // down
+		// copy bottom to top
+		for (i = h; i > 0; i--) {
+		    R600DoPrepareCopy(pScrn,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      accel_state->rop, accel_state->planemask);
+
+		    R600AppendCopyVertex(pScrn, srcX, srcY + i - 1, dstX, dstY + i - 1, w, 1);
+		    R600DoCopy(pScrn);
+		}
+	    }
+	}
     } else {
 	R600DoPrepareCopy(pScrn,
 			  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
commit def317e22d072405cd95ddb19d17eacd784ffd9e
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Fri Feb 6 11:04:59 2009 -0500

    R6xx/R7xx EXA: add accelerated UTS/DFS hooks
    
    I'm not sure how much of a win these are.  I need to
    do some benchmarking.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 6f158e9..88a9d91 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -2028,23 +2028,69 @@ R600UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
 {
     ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
     RADEONInfoPtr info = RADEONPTR(pScrn);
-//    struct radeon_accel_state *accel_state = info->accel_state;
-    uint8_t *dst = (pointer)((char *)info->FB + exaGetPixmapOffset(pDst));
-    int dst_pitch = exaGetPixmapPitch(pDst);
+    uint32_t dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
+    uint32_t dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+    uint32_t dst_height = pDst->drawable.height;
     int bpp = pDst->drawable.bitsPerPixel;
+    uint32_t scratch_mc_addr;
+    int wpass = w * (bpp/8);
+    int scratch_pitch_bytes = (wpass + 255) & ~255;
+    uint32_t scratch_pitch = scratch_pitch_bytes / (bpp / 8);
+    int scratch_offset = 0, hpass, temph;
+    char *dst;
+    drmBufPtr scratch;
+
+    if (dst_pitch & 7)
+	return FALSE;
 
+    if (dst_mc_addr & 0xff)
+	return FALSE;
 
-    //return FALSE;
+    scratch = RADEONCPGetBuffer(pScrn);
+    if (scratch == NULL)
+	return FALSE;
 
-    dst += (x * bpp / 8) + (y * dst_pitch);
-    w *= bpp / 8;
+    scratch_mc_addr = info->gartLocation + info->dri->bufStart + (scratch->idx * scratch->total);
+    temph = hpass = min(h, scratch->total/2 / scratch_pitch_bytes);
+    dst = (char *)scratch->address;
 
-    while (h--) {
-	memcpy(dst, src, w);
+    //memcopy from sys to scratch
+    while (temph--) {
+	memcpy (dst, src, wpass);
 	src += src_pitch;
-	dst += dst_pitch;
+	dst += scratch_pitch_bytes;
+    }
+
+    while (h) {
+	uint32_t offset = scratch_mc_addr + scratch_offset;
+	int oldhpass = hpass;
+	h -= oldhpass;
+	temph = hpass = min(h, scratch->total/2 / scratch_pitch_bytes);
+
+	if (hpass) {
+	    scratch_offset = scratch->total/2 - scratch_offset;
+	    dst = (char *)scratch->address + scratch_offset;
+	    // wait for the engine to be idle
+	    R600WaitforIdlePoll(pScrn);
+	    //memcopy from sys to scratch
+	    while (temph--) {
+		memcpy (dst, src, wpass);
+		src += src_pitch;
+		dst += scratch_pitch_bytes;
+	    }
+	}
+	//blit from scratch to vram
+	R600DoPrepareCopy(pScrn,
+			  scratch_pitch, w, oldhpass, offset, bpp,
+			  dst_pitch, dst_height, dst_mc_addr, bpp,
+			  3, 0xffffffff);
+	R600AppendCopyVertex(pScrn, 0, 0, x, y, w, oldhpass);
+	R600DoCopy(pScrn);
+	y += oldhpass;
     }
 
+    R600IBDiscard(pScrn, scratch);
+
     return TRUE;
 }
 
@@ -2054,23 +2100,68 @@ R600DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
 {
     ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
     RADEONInfoPtr info = RADEONPTR(pScrn);
-//    struct radeon_accel_state *accel_state = info->accel_state;
-    uint8_t *src = (pointer)((char *)info->FB + exaGetPixmapOffset(pSrc));
-    int	src_pitch = exaGetPixmapPitch(pSrc);
-    int	bpp = pSrc->drawable.bitsPerPixel;
+    uint32_t src_pitch = exaGetPixmapPitch(pSrc) / (pSrc->drawable.bitsPerPixel / 8);
+    uint32_t src_mc_addr = exaGetPixmapOffset(pSrc) + info->fbLocation + pScrn->fbOffset;
+    uint32_t src_width = pSrc->drawable.width;
+    uint32_t src_height = pSrc->drawable.height;
+    int bpp = pSrc->drawable.bitsPerPixel;
+    uint32_t scratch_mc_addr;
+    int scratch_pitch_bytes = (dst_pitch + 255) & ~255;
+    int scratch_offset = 0, hpass;
+    uint32_t scratch_pitch = scratch_pitch_bytes / (bpp / 8);
+    int wpass = w * (bpp/8);
+    drmBufPtr scratch;
+
+    if (src_pitch & 7)
+	return FALSE;
 
-    //return FALSE;
+    scratch = RADEONCPGetBuffer(pScrn);
+    if (scratch == NULL)
+	return FALSE;
 
-    src += (x * bpp / 8) + (y * src_pitch);
-    w *= bpp / 8;
+    scratch_mc_addr = info->gartLocation + info->dri->bufStart + (scratch->idx * scratch->total);
+    hpass = min(h, scratch->total/2 / scratch_pitch);
+
+    //blit from vram to scratch
+    R600DoPrepareCopy(pScrn,
+		      src_pitch, src_width, src_height, src_mc_addr, bpp,
+		      scratch_pitch, hpass, scratch_mc_addr, bpp,
+		      3, 0xffffffff);
+    R600AppendCopyVertex(pScrn, x, y, 0, 0, w, hpass);
+    R600DoCopy(pScrn);
+
+    while (h) {
+	char *src = (char *)scratch->address + scratch_offset;
+	int oldhpass = hpass;
+	h -= oldhpass;
+	y += oldhpass;
+	hpass = min(h, scratch->total/2 / scratch_pitch);
+
+	if (hpass) {
+	    scratch_offset = scratch->total/2 - scratch_offset;
+	    //blit from vram to scratch
+	    R600DoPrepareCopy(pScrn,
+			      src_pitch, src_width, src_height, src_mc_addr, bpp,
+			      scratch_pitch, hpass, scratch_mc_addr + scratch_offset, bpp,
+			      3, 0xffffffff);
+	    R600AppendCopyVertex(pScrn, x, y, 0, 0, w, hpass);
+	    R600DoCopy(pScrn);
+	}
 
-    while (h--) {
-	memcpy(dst, src, w);
-	src += src_pitch;
-	dst += dst_pitch;
+	// wait for the engine to be idle
+	R600WaitforIdlePoll(pScrn);
+	//memcopy from scratch to sys
+	while (oldhpass--) {
+	    memcpy (dst, src, wpass);
+	    dst += dst_pitch;
+	    src += scratch_pitch_bytes;
+	}
     }
 
+    R600IBDiscard(pScrn, scratch);
+
     return TRUE;
+
 }
 
 static int
@@ -3466,6 +3557,9 @@ R600DrawInit(ScreenPtr pScreen)
     info->accel_state->exa->PrepareAccess = R600PrepareAccess;
     info->accel_state->exa->FinishAccess = R600FinishAccess;
 
+    info->accel_state->exa->UploadToScreen = R600UploadToScreen;
+    info->accel_state->exa->DownloadFromScreen = R600DownloadFromScreen;
+
     info->accel_state->exa->flags = EXA_OFFSCREEN_PIXMAPS;
     info->accel_state->exa->pixmapOffsetAlign = 256;
     info->accel_state->exa->pixmapPitchAlign = 256;
commit 0dfadc1843e0d14b9cc1ee19a72f4fd60a2c495b
Author: Yang Zhao <yang at yangman.ca>
Date:   Fri Feb 6 10:29:39 2009 -0500

    r6xx/r7xx EXA: Optimize overlapping copy
    
    When source and destination blocks are only offset horizontally, it
    appears to be unnecessary to perform careful, segment-by-segment copy.
    The code path that does this is taken out completely.
    
    For the case where offset is only vertical, copying is now done by
    height of the non-overlapping area each time, instead of always
    line-by-line.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 80f8dd2..6f158e9 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -717,59 +717,37 @@ R600OverlapCopy(PixmapPtr pDst,
     struct radeon_accel_state *accel_state = info->accel_state;
     uint32_t dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
     uint32_t dst_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
-    int i;
+    int i, chunk;
 
     if (is_overlap(srcX, srcX + w, srcY, srcY + h,
-		   dstX, dstX + w, dstY, dstY + h)) {
-	if (srcY == dstY) { // left/right
-	    if (srcX < dstX) { // right
-		// copy right to left
-		for (i = w; i > 0; i--) {
-		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      accel_state->rop, accel_state->planemask);
-
-		    R600AppendCopyVertex(pScrn, srcX + i - 1, srcY, dstX + i - 1, dstY, 1, h);
-		    R600DoCopy(pScrn);
-		}
-	    } else { //left
-		// copy left to right
-		for (i = 0; i < w; i++) {
-		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      accel_state->rop, accel_state->planemask);
-
-		    R600AppendCopyVertex(pScrn, srcX + i, srcY, dstX + i, dstY, 1, h);
-		    R600DoCopy(pScrn);
-		}
-	    }
-	} else { //up/down
-	    if (srcY > dstY) { // up
-		// copy top to bottom
-		for (i = 0; i < h; i++) {
-		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      accel_state->rop, accel_state->planemask);
-
-		    R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, 1);
-		    R600DoCopy(pScrn);
-		}
-	    } else { // down
-		// copy bottom to top
-		for (i = h; i > 0; i--) {
-		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      accel_state->rop, accel_state->planemask);
-
-		    R600AppendCopyVertex(pScrn, srcX, srcY + i - 1, dstX, dstY + i - 1, w, 1);
-		    R600DoCopy(pScrn);
-		}
-	    }
-	}
+		   dstX, dstX + w, dstY, dstY + h) && (srcY != dstY)) {
+        if (srcY > dstY) { // up
+            // copy top to bottom
+            chunk = srcY - dstY;
+            for (i = 0; i < h; i += chunk) {
+                if (chunk > h - i) chunk = h - i;
+                R600DoPrepareCopy(pScrn,
+                                  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                  accel_state->rop, accel_state->planemask);
+
+                R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, chunk);
+                R600DoCopy(pScrn);
+            }
+        } else { // down
+            // copy bottom to top
+            chunk = dstY - srcY;
+            for (i = h; i > 0; i -= chunk) {
+                if (chunk > i) chunk = i;
+                R600DoPrepareCopy(pScrn,
+                                  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+                                  accel_state->rop, accel_state->planemask);
+
+                R600AppendCopyVertex(pScrn, srcX, srcY + i - chunk, dstX, dstY + i - chunk, w, chunk);
+                R600DoCopy(pScrn);
+            }
+        }
     } else {
 	R600DoPrepareCopy(pScrn,
 			  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
commit 3d17bd199423e92d201c20f047d5e699942af976
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Feb 5 19:25:03 2009 -0500

    Revert "R6xx/R7xx EXA: improve overlapping copy performance"
    
    This reverts commit b24827c9d211e8a35da53b665385a7733d19910d.
    
    This seems to cause corruption in some cases.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index cca82cb..80f8dd2 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -719,42 +719,66 @@ R600OverlapCopy(PixmapPtr pDst,
     uint32_t dst_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
     int i;
 
-    R600DoPrepareCopy(pScrn,
-		      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-		      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-		      accel_state->rop, accel_state->planemask);
-
     if (is_overlap(srcX, srcX + w, srcY, srcY + h,
 		   dstX, dstX + w, dstY, dstY + h)) {
 	if (srcY == dstY) { // left/right
 	    if (srcX < dstX) { // right
 		// copy right to left
 		for (i = w; i > 0; i--) {
+		    R600DoPrepareCopy(pScrn,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      accel_state->rop, accel_state->planemask);
+
 		    R600AppendCopyVertex(pScrn, srcX + i - 1, srcY, dstX + i - 1, dstY, 1, h);
+		    R600DoCopy(pScrn);
 		}
 	    } else { //left
 		// copy left to right
 		for (i = 0; i < w; i++) {
+		    R600DoPrepareCopy(pScrn,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      accel_state->rop, accel_state->planemask);
+
 		    R600AppendCopyVertex(pScrn, srcX + i, srcY, dstX + i, dstY, 1, h);
+		    R600DoCopy(pScrn);
 		}
 	    }
 	} else { //up/down
 	    if (srcY > dstY) { // up
 		// copy top to bottom
 		for (i = 0; i < h; i++) {
+		    R600DoPrepareCopy(pScrn,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      accel_state->rop, accel_state->planemask);
+
 		    R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, 1);
+		    R600DoCopy(pScrn);
 		}
 	    } else { // down
 		// copy bottom to top
 		for (i = h; i > 0; i--) {
+		    R600DoPrepareCopy(pScrn,
+				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+				      accel_state->rop, accel_state->planemask);
+
 		    R600AppendCopyVertex(pScrn, srcX, srcY + i - 1, dstX, dstY + i - 1, w, 1);
+		    R600DoCopy(pScrn);
 		}
 	    }
 	}
     } else {
+	R600DoPrepareCopy(pScrn,
+			  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+			  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+			  accel_state->rop, accel_state->planemask);
+
 	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
+	R600DoCopy(pScrn);
     }
-    R600DoCopy(pScrn);
 }
 
 static void
commit bf74055f543e7f0664741620fb1fe827ebc12711
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Feb 5 17:31:43 2009 -0500

    r6xx/r7xx EXA: fix corruption when doing sw access
    
    need to wait until the engine is idle.  Ideally we wait
    on a timestamp shadowed in memory, but polling the
    GRBM_STATUS reg will do for now.

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 4d37804..cca82cb 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -2003,6 +2003,23 @@ static void R600DoneComposite(PixmapPtr pDst)
     R600CPFlushIndirect(pScrn, accel_state->ib);
 }
 
+/* Polls GRBM_STATUS until GUI_ACTIVE clears; TRUE if idle, FALSE after 1000000 tries.
+ * Waiting on a timestamp shadowed in memory would be better, but this will do for now.
+ */
+static Bool
+R600WaitforIdlePoll(ScrnInfoPtr pScrn)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    unsigned char *RADEONMMIO = info->MMIO;
+    uint32_t i;
+
+    for (i = 0; i < 1000000; i++) {
+	if ((INREG(GRBM_STATUS) & GUI_ACTIVE_bit) == 0)
+	    return TRUE;
+    }
+    return FALSE;
+}
+
 static Bool
 R600UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
 		   char *src, int src_pitch)
@@ -3398,6 +3415,8 @@ R600PrepareAccess(PixmapPtr pPix, int index)
     RADEONInfoPtr info = RADEONPTR(pScrn);
     unsigned char *RADEONMMIO = info->MMIO;
 
+    R600WaitforIdlePoll(pScrn);
+
     //flush HDP read/write caches
     OUTREG(HDP_MEM_COHERENCY_FLUSH_CNTL, 0x1);
 
commit b24827c9d211e8a35da53b665385a7733d19910d
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Feb 5 16:21:37 2009 -0500

    R6xx/R7xx EXA: improve overlapping copy performance
    
    send vertices for each line of the copy, but only draw once

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 1501cd3..4d37804 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -719,66 +719,42 @@ R600OverlapCopy(PixmapPtr pDst,
     uint32_t dst_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
     int i;
 
+    R600DoPrepareCopy(pScrn,
+		      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+		      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
+		      accel_state->rop, accel_state->planemask);
+
     if (is_overlap(srcX, srcX + w, srcY, srcY + h,
 		   dstX, dstX + w, dstY, dstY + h)) {
 	if (srcY == dstY) { // left/right
 	    if (srcX < dstX) { // right
 		// copy right to left
 		for (i = w; i > 0; i--) {
-		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      accel_state->rop, accel_state->planemask);
-
 		    R600AppendCopyVertex(pScrn, srcX + i - 1, srcY, dstX + i - 1, dstY, 1, h);
-		    R600DoCopy(pScrn);
 		}
 	    } else { //left
 		// copy left to right
 		for (i = 0; i < w; i++) {
-		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      accel_state->rop, accel_state->planemask);
-
 		    R600AppendCopyVertex(pScrn, srcX + i, srcY, dstX + i, dstY, 1, h);
-		    R600DoCopy(pScrn);
 		}
 	    }
 	} else { //up/down
 	    if (srcY > dstY) { // up
 		// copy top to bottom
 		for (i = 0; i < h; i++) {
-		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      accel_state->rop, accel_state->planemask);
-
 		    R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, 1);
-		    R600DoCopy(pScrn);
 		}
 	    } else { // down
 		// copy bottom to top
 		for (i = h; i > 0; i--) {
-		    R600DoPrepareCopy(pScrn,
-				      dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-				      accel_state->rop, accel_state->planemask);
-
 		    R600AppendCopyVertex(pScrn, srcX, srcY + i - 1, dstX, dstY + i - 1, w, 1);
-		    R600DoCopy(pScrn);
 		}
 	    }
 	}
     } else {
-	R600DoPrepareCopy(pScrn,
-			  dst_pitch, pDst->drawable.width, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-			  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
-			  accel_state->rop, accel_state->planemask);
-
 	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
-	R600DoCopy(pScrn);
     }
+    R600DoCopy(pScrn);
 }
 
 static void
commit 729fe756f809a41494dc161252d82313956e4a4b
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Thu Feb 5 16:08:35 2009 -0500

    r6xx/r7xx EXA: cleanup overlapping copy

diff --git a/src/r600_exa.c b/src/r600_exa.c
index 2fc5c52..1501cd3 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -717,8 +717,6 @@ R600OverlapCopy(PixmapPtr pDst,
     struct radeon_accel_state *accel_state = info->accel_state;
     uint32_t dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
     uint32_t dst_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
-    struct r6xx_copy_vertex *copy_vb;
-    struct r6xx_copy_vertex vertex[3];
     int i;
 
     if (is_overlap(srcX, srcX + w, srcY, srcY + h,
@@ -732,35 +730,7 @@ R600OverlapCopy(PixmapPtr pDst,
 				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      accel_state->rop, accel_state->planemask);
 
-		    copy_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
-
-		    vertex[0].x = (float)(dstX + i - 1);
-		    vertex[0].y = (float)dstY;
-		    vertex[0].s = (float)(srcX + i - 1);
-		    vertex[0].t = (float)srcY;
-
-		    vertex[1].x = (float)(dstX + i - 1);
-		    vertex[1].y = (float)(dstY + h);
-		    vertex[1].s = (float)(srcX + i - 1);
-		    vertex[1].t = (float)(srcY + h);
-
-		    vertex[2].x = (float)(dstX + i);
-		    vertex[2].y = (float)(dstY + h);
-		    vertex[2].s = (float)(srcX + i);
-		    vertex[2].t = (float)(srcY + h);
-
-#ifdef SHOW_VERTEXES
-		    ErrorF("vertex 0: %f, %f, %f, %f\n", vertex[0].x, vertex[0].y, vertex[0].s, vertex[0].t);
-		    ErrorF("vertex 1: %f, %f, %f, %f\n", vertex[1].x, vertex[1].y, vertex[1].s, vertex[1].t);
-		    ErrorF("vertex 2: %f, %f, %f, %f\n", vertex[2].x, vertex[2].y, vertex[2].s, vertex[2].t);
-#endif
-
-		    // append to vertex buffer
-		    copy_vb[accel_state->vb_index++] = vertex[0];
-		    copy_vb[accel_state->vb_index++] = vertex[1];
-		    copy_vb[accel_state->vb_index++] = vertex[2];
-
-		    // do the blit
+		    R600AppendCopyVertex(pScrn, srcX + i - 1, srcY, dstX + i - 1, dstY, 1, h);
 		    R600DoCopy(pScrn);
 		}
 	    } else { //left
@@ -771,35 +741,7 @@ R600OverlapCopy(PixmapPtr pDst,
 				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      accel_state->rop, accel_state->planemask);
 
-		    copy_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
-
-		    vertex[0].x = (float)(dstX + i);
-		    vertex[0].y = (float)(dstY);
-		    vertex[0].s = (float)(srcX + i);
-		    vertex[0].t = (float)srcY;
-
-		    vertex[1].x = (float)(dstX + i);
-		    vertex[1].y = (float)(dstY + h);
-		    vertex[1].s = (float)(srcX + i);
-		    vertex[1].t = (float)(srcY + h);
-
-		    vertex[2].x = (float)(dstX + i + 1);
-		    vertex[2].y = (float)(dstY + h);
-		    vertex[2].s = (float)(srcX + i + 1);
-		    vertex[2].t = (float)(srcY + h);
-
-#ifdef SHOW_VERTEXES
-		    ErrorF("vertex 0: %f, %f, %f, %f\n", vertex[0].x, vertex[0].y, vertex[0].s, vertex[0].t);
-		    ErrorF("vertex 1: %f, %f, %f, %f\n", vertex[1].x, vertex[1].y, vertex[1].s, vertex[1].t);
-		    ErrorF("vertex 2: %f, %f, %f, %f\n", vertex[2].x, vertex[2].y, vertex[2].s, vertex[2].t);
-#endif
-
-		    // append to vertex buffer
-		    copy_vb[accel_state->vb_index++] = vertex[0];
-		    copy_vb[accel_state->vb_index++] = vertex[1];
-		    copy_vb[accel_state->vb_index++] = vertex[2];
-
-		    // do the blit
+		    R600AppendCopyVertex(pScrn, srcX + i, srcY, dstX + i, dstY, 1, h);
 		    R600DoCopy(pScrn);
 		}
 	    }
@@ -812,35 +754,7 @@ R600OverlapCopy(PixmapPtr pDst,
 				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      accel_state->rop, accel_state->planemask);
 
-		    copy_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
-
-		    vertex[0].x = (float)dstX;
-		    vertex[0].y = (float)(dstY + i);
-		    vertex[0].s = (float)srcX;
-		    vertex[0].t = (float)(srcY + i);
-
-		    vertex[1].x = (float)dstX;
-		    vertex[1].y = (float)(dstY + i + 1);
-		    vertex[1].s = (float)srcX;
-		    vertex[1].t = (float)(srcY + i + 1);
-
-		    vertex[2].x = (float)(dstX + w);
-		    vertex[2].y = (float)(dstY + i + 1);
-		    vertex[2].s = (float)(srcX + w);
-		    vertex[2].t = (float)(srcY + i + 1);
-
-#ifdef SHOW_VERTEXES
-		    ErrorF("vertex 0: %f, %f, %f, %f\n", vertex[0].x, vertex[0].y, vertex[0].s, vertex[0].t);
-		    ErrorF("vertex 1: %f, %f, %f, %f\n", vertex[1].x, vertex[1].y, vertex[1].s, vertex[1].t);
-		    ErrorF("vertex 2: %f, %f, %f, %f\n", vertex[2].x, vertex[2].y, vertex[2].s, vertex[2].t);
-#endif
-
-		    // append to vertex buffer
-		    copy_vb[accel_state->vb_index++] = vertex[0];
-		    copy_vb[accel_state->vb_index++] = vertex[1];
-		    copy_vb[accel_state->vb_index++] = vertex[2];
-
-		    // do the blit
+		    R600AppendCopyVertex(pScrn, srcX, srcY + i, dstX, dstY + i, w, 1);
 		    R600DoCopy(pScrn);
 		}
 	    } else { // down
@@ -851,35 +765,7 @@ R600OverlapCopy(PixmapPtr pDst,
 				      dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 				      accel_state->rop, accel_state->planemask);
 
-		    copy_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
-
-		    vertex[0].x = (float)dstX;
-		    vertex[0].y = (float)(dstY + i - 1);
-		    vertex[0].s = (float)(srcX);
-		    vertex[0].t = (float)(srcY + i - 1);
-
-		    vertex[1].x = (float)dstX;
-		    vertex[1].y = (float)(dstY + i);
-		    vertex[1].s = (float)srcX;
-		    vertex[1].t = (float)srcY + i;
-
-		    vertex[2].x = (float)(dstX + w);
-		    vertex[2].y = (float)(dstY + i);
-		    vertex[2].s = (float)(srcX + w);
-		    vertex[2].t = (float)(srcY + i);
-
-#ifdef SHOW_VERTEXES
-		    ErrorF("vertex 0: %f, %f, %f, %f\n", vertex[0].x, vertex[0].y, vertex[0].s, vertex[0].t);
-		    ErrorF("vertex 1: %f, %f, %f, %f\n", vertex[1].x, vertex[1].y, vertex[1].s, vertex[1].t);
-		    ErrorF("vertex 2: %f, %f, %f, %f\n", vertex[2].x, vertex[2].y, vertex[2].s, vertex[2].t);
-#endif
-
-		    // append to vertex buffer
-		    copy_vb[accel_state->vb_index++] = vertex[0];
-		    copy_vb[accel_state->vb_index++] = vertex[1];
-		    copy_vb[accel_state->vb_index++] = vertex[2];
-
-		    // do the blit
+		    R600AppendCopyVertex(pScrn, srcX, srcY + i - 1, dstX, dstY + i - 1, w, 1);
 		    R600DoCopy(pScrn);
 		}
 	    }
@@ -890,35 +776,7 @@ R600OverlapCopy(PixmapPtr pDst,
 			  dst_pitch, pDst->drawable.height, dst_offset, pDst->drawable.bitsPerPixel,
 			  accel_state->rop, accel_state->planemask);
 
-	copy_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
-
-	vertex[0].x = (float)dstX;
-	vertex[0].y = (float)dstY;
-	vertex[0].s = (float)srcX;
-	vertex[0].t = (float)srcY;
-
-	vertex[1].x = (float)dstX;
-	vertex[1].y = (float)(dstY + h);
-	vertex[1].s = (float)srcX;
-	vertex[1].t = (float)(srcY + h);
-
-	vertex[2].x = (float)(dstX + w);
-	vertex[2].y = (float)(dstY + h);
-	vertex[2].s = (float)(srcX + w);
-	vertex[2].t = (float)(srcY + h);
-
-#ifdef SHOW_VERTEXES
-	ErrorF("vertex 0: %f, %f, %f, %f\n", vertex[0].x, vertex[0].y, vertex[0].s, vertex[0].t);
-	ErrorF("vertex 1: %f, %f, %f, %f\n", vertex[1].x, vertex[1].y, vertex[1].s, vertex[1].t);
-	ErrorF("vertex 2: %f, %f, %f, %f\n", vertex[2].x, vertex[2].y, vertex[2].s, vertex[2].t);
-#endif
-
-	// append to vertex buffer
-	copy_vb[accel_state->vb_index++] = vertex[0];
-	copy_vb[accel_state->vb_index++] = vertex[1];
-	copy_vb[accel_state->vb_index++] = vertex[2];
-
-	// do the blit
+	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
 	R600DoCopy(pScrn);
     }
 }
commit d7bf7b9c17add31dc9a115b56a563c7f9bf2cdb6
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Feb 3 18:28:38 2009 -0500

    EXA: fix and re-enable Solid() on R7xx
    
    missing last bit in alu clause in solid PS

diff --git a/src/r600_exa.c b/src/r600_exa.c
index b9c228f..2fc5c52 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -91,12 +91,6 @@ R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
     uint32_t a, r, g, b;
     float ps_alu_consts[4];
 
-    // FIXME
-    // R7xx seems to hang when using PS constants for fg color
-    // sending the color as a vertex attribute works
-    if (info->ChipFamily >= CHIP_FAMILY_RV770)
-	return FALSE;
-
     accel_state->dst_mc_addr = exaGetPixmapOffset(pPix) + info->fbLocation + pScrn->fbOffset;
     accel_state->dst_size = exaGetPixmapPitch(pPix) * pPix->drawable.height;
     accel_state->dst_pitch = exaGetPixmapPitch(pPix) / (pPix->drawable.bitsPerPixel / 8);
@@ -2366,8 +2360,8 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
     ps[i++] = CF_ALU_DWORD0(ADDR(2),
 			    KCACHE_BANK0(0),
 			    KCACHE_BANK1(0),
-			    KCACHE_MODE0(0));
-    ps[i++] = CF_ALU_DWORD1(KCACHE_MODE1(0),
+			    KCACHE_MODE0(SQ_CF_KCACHE_NOP));
+    ps[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
 			    KCACHE_ADDR0(0),
 			    KCACHE_ADDR1(0),
 			    I_COUNT(4),
@@ -2483,7 +2477,7 @@ R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
 			 SRC1_NEG(0),
 			 INDEX_MODE(SQ_INDEX_AR_X),
 			 PRED_SEL(SQ_PRED_SEL_OFF),
-			 LAST(0));
+			 LAST(1));
     ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
 			     SRC0_ABS(0),
 			     SRC1_ABS(0),
commit e5b916770946a9eebcb4bd1e6f698220db8c718a
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Feb 3 10:48:41 2009 -0500

    Allow rotation on r6xx/r7xx

diff --git a/src/radeon_crtc.c b/src/radeon_crtc.c
index e0875a4..ec6a662 100644
--- a/src/radeon_crtc.c
+++ b/src/radeon_crtc.c
@@ -587,8 +587,7 @@ Bool RADEONAllocateControllers(ScrnInfoPtr pScrn, int mask)
     RADEONEntPtr pRADEONEnt = RADEONEntPriv(pScrn);
     RADEONInfoPtr  info = RADEONPTR(pScrn);
 
-    if ((info->ChipFamily < CHIP_FAMILY_R600) &&
-	(!xf86ReturnOptValBool(info->Options, OPTION_NOACCEL, FALSE))) {
+    if (!xf86ReturnOptValBool(info->Options, OPTION_NOACCEL, FALSE)) {
 	radeon_crtc_funcs.shadow_create = radeon_crtc_shadow_create;
 	radeon_crtc_funcs.shadow_allocate = radeon_crtc_shadow_allocate;
 	radeon_crtc_funcs.shadow_destroy = radeon_crtc_shadow_destroy;
commit d1f071c7f1dad6babfbcfcc2cb2b722a4987f372
Author: Alex Deucher <alexdeucher at gmail.com>
Date:   Tue Feb 3 10:44:10 2009 -0500

    Initial R6xx/R7xx EXA and textured video support

diff --git a/src/Makefile.am b/src/Makefile.am
index c15cc30..7ff7d31 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -66,7 +66,7 @@ XMODE_SRCS=\
         modes/xf86DiDGA.c
 
 if USE_EXA
-RADEON_EXA_SOURCES = radeon_exa.c
+RADEON_EXA_SOURCES = radeon_exa.c r600_exa.c r6xx_accel.c r600_textured_videofuncs.c
 endif
 
 AM_CFLAGS = @XORG_CFLAGS@ @DRI_CFLAGS@ @XMODES_CFLAGS@ -DDISABLE_EASF -DENABLE_ALL_SERVICE_FUNCTIONS -DATOM_BIOS -DATOM_BIOS_PARSER -DDRIVER_PARSER
@@ -128,6 +128,12 @@ EXTRA_DIST = \
 	radeon_render.c \
 	radeon_accelfuncs.c \
 	radeon_textured_videofuncs.c \
+	r600_reg.h \
+	r600_reg_auto_r6xx.h \
+	r600_reg_r6xx.h \
+	r600_reg_r7xx.h \
+	r600_shader.h \
+	r600_state.h \
 	ati.h \
 	ativersion.h \
 	bicubic_table.h \
diff --git a/src/r600_exa.c b/src/r600_exa.c
new file mode 100644
index 0000000..b9c228f
--- /dev/null
+++ b/src/r600_exa.c
@@ -0,0 +1,3663 @@
+/*
+ * Copyright 2008 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Author: Alex Deucher <alexander.deucher at amd.com>
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "xf86.h"
+
+#include "exa.h"
+
+#include "radeon.h"
+#include "radeon_macros.h"
+#include "r600_shader.h"
+#include "r600_reg.h"
+#include "r600_state.h"
+
+extern PixmapPtr
+RADEONGetDrawablePixmap(DrawablePtr pDrawable);
+
+//#define SHOW_VERTEXES
+
+#       define RADEON_ROP3_ZERO             0x00000000
+#       define RADEON_ROP3_DSa              0x00880000
+#       define RADEON_ROP3_SDna             0x00440000
+#       define RADEON_ROP3_S                0x00cc0000
+#       define RADEON_ROP3_DSna             0x00220000
+#       define RADEON_ROP3_D                0x00aa0000
+#       define RADEON_ROP3_DSx              0x00660000
+#       define RADEON_ROP3_DSo              0x00ee0000
+#       define RADEON_ROP3_DSon             0x00110000
+#       define RADEON_ROP3_DSxn             0x00990000
+#       define RADEON_ROP3_Dn               0x00550000
+#       define RADEON_ROP3_SDno             0x00dd0000
+#       define RADEON_ROP3_Sn               0x00330000
+#       define RADEON_ROP3_DSno             0x00bb0000
+#       define RADEON_ROP3_DSan             0x00770000
+#       define RADEON_ROP3_ONE              0x00ff0000
+
+uint32_t RADEON_ROP[16] = { /* indexed by X raster op (GXclear .. GXset) */
+    RADEON_ROP3_ZERO, /* GXclear        */
+    RADEON_ROP3_DSa,  /* GXand          */
+    RADEON_ROP3_SDna, /* GXandReverse   */
+    RADEON_ROP3_S,    /* GXcopy         */
+    RADEON_ROP3_DSna, /* GXandInverted  */
+    RADEON_ROP3_D,    /* GXnoop         */
+    RADEON_ROP3_DSx,  /* GXxor          */
+    RADEON_ROP3_DSo,  /* GXor           */
+    RADEON_ROP3_DSon, /* GXnor          */
+    RADEON_ROP3_DSxn, /* GXequiv        */
+    RADEON_ROP3_Dn,   /* GXinvert       */
+    RADEON_ROP3_SDno, /* GXorReverse    */
+    RADEON_ROP3_Sn,   /* GXcopyInverted */
+    RADEON_ROP3_DSno, /* GXorInverted   */
+    RADEON_ROP3_DSan, /* GXnand         */
+    RADEON_ROP3_ONE,  /* GXset          */
+};
+/* Build the GPU state for solid-filling pPix; returns FALSE if this pixmap or chip is rejected. */
+static Bool
+R600PrepareSolid(PixmapPtr pPix, int alu, Pixel pm, Pixel fg)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pPix->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    cb_config_t     cb_conf;
+    shader_config_t vs_conf, ps_conf;
+    int pmask = 0;
+    uint32_t a, r, g, b;
+    float ps_alu_consts[4];
+
+    // FIXME
+    // R7xx seems to hang when using PS constants for fg color
+    // sending the color as a vertex attribute works
+    if (info->ChipFamily >= CHIP_FAMILY_RV770)
+	return FALSE;
+
+    accel_state->dst_mc_addr = exaGetPixmapOffset(pPix) + info->fbLocation + pScrn->fbOffset;
+    accel_state->dst_size = exaGetPixmapPitch(pPix) * pPix->drawable.height;
+    accel_state->dst_pitch = exaGetPixmapPitch(pPix) / (pPix->drawable.bitsPerPixel / 8);
+
+    // bad pitch: must be a multiple of 8 (in pixels)
+    if (accel_state->dst_pitch & 7)
+	return FALSE;
+
+    // bad offset: must be 256-byte aligned
+    if (accel_state->dst_mc_addr & 0xff)
+	return FALSE;
+
+    if (pPix->drawable.bitsPerPixel == 24)
+	return FALSE;
+
+    CLEAR (cb_conf);
+    CLEAR (vs_conf);
+    CLEAR (ps_conf);
+
+    //return FALSE;
+
+#ifdef SHOW_VERTEXES
+    ErrorF("%dx%d @ %dbpp, 0x%08x\n", pPix->drawable.width, pPix->drawable.height,
+	   pPix->drawable.bitsPerPixel, exaGetPixmapPitch(pPix));
+#endif
+
+    accel_state->ib = RADEONCPGetBuffer(pScrn);
+
+    /* Init */
+    start_3d(pScrn, accel_state->ib);
+
+    //cp_set_surface_sync(pScrn, accel_state->ib);
+
+    set_default_state(pScrn, accel_state->ib);
+
+    /* Scissor / viewport */
+    ereg  (accel_state->ib, PA_CL_VTE_CNTL,                      VTX_XY_FMT_bit);
+    ereg  (accel_state->ib, PA_CL_CLIP_CNTL,                     CLIP_DISABLE_bit);
+
+    accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	accel_state->solid_vs_offset;
+    accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	accel_state->solid_ps_offset;
+    accel_state->vs_size = 512;
+    accel_state->ps_size = 512;
+
+    /* Shader */
+
+    /* flush SQ cache */
+    cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
+			accel_state->vs_size, accel_state->vs_mc_addr);
+
+    vs_conf.shader_addr         = accel_state->vs_mc_addr;
+    vs_conf.num_gprs            = 2;
+    vs_conf.stack_size          = 0;
+    vs_setup                    (pScrn, accel_state->ib, &vs_conf);
+
+    /* flush SQ cache */
+    cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
+			accel_state->ps_size, accel_state->ps_mc_addr);
+
+    ps_conf.shader_addr         = accel_state->ps_mc_addr;
+    ps_conf.num_gprs            = 1;
+    ps_conf.stack_size          = 0;
+    ps_conf.uncached_first_inst = 1;
+    ps_conf.clamp_consts        = 0;
+    ps_conf.export_mode         = 2;
+    ps_setup                    (pScrn, accel_state->ib, &ps_conf);
+
+    /* Render setup: planemask bytes -> per-channel write-enable bits */
+    if (pm & 0x000000ff)
+	pmask |= 4; //B
+    if (pm & 0x0000ff00)
+	pmask |= 2; //G
+    if (pm & 0x00ff0000)
+	pmask |= 1; //R
+    if (pm & 0xff000000)
+	pmask |= 8; //A
+    ereg  (accel_state->ib, CB_SHADER_MASK,                      (pmask << OUTPUT0_ENABLE_shift));
+    ereg  (accel_state->ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
+    ereg  (accel_state->ib, CB_COLOR_CONTROL,                    RADEON_ROP[alu]);
+
+
+    cb_conf.id = 0;
+    cb_conf.w = accel_state->dst_pitch;
+    cb_conf.h = pPix->drawable.height;
+    cb_conf.base = accel_state->dst_mc_addr;
+
+    if (pPix->drawable.bitsPerPixel == 8) {
+	cb_conf.format = COLOR_8;
+	cb_conf.comp_swap = 3; //A
+    } else if (pPix->drawable.bitsPerPixel == 16) {
+	cb_conf.format = COLOR_5_6_5;
+	cb_conf.comp_swap = 2; //RGB
+    } else {
+	cb_conf.format = COLOR_8_8_8_8;
+	cb_conf.comp_swap = 1; //ARGB
+    }
+    cb_conf.source_format = 1;
+    cb_conf.blend_clamp = 1;
+    set_render_target(pScrn, accel_state->ib, &cb_conf);
+
+    ereg  (accel_state->ib, PA_SU_SC_MODE_CNTL,                  (FACE_bit			|
+						 (POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift)	|
+						 (POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift)));
+    ereg  (accel_state->ib, DB_SHADER_CONTROL,                   ((1 << Z_ORDER_shift)		| /* EARLY_Z_THEN_LATE_Z */
+						 DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
+
+    /* Interpolator setup */
+    // one unused export from VS (VS_EXPORT_COUNT is zero based, count minus one)
+    ereg  (accel_state->ib, SPI_VS_OUT_CONFIG, (0 << VS_EXPORT_COUNT_shift));
+    ereg  (accel_state->ib, SPI_VS_OUT_ID_0, (0 << SEMANTIC_0_shift));
+
+    /* Enabling flat shading needs both FLAT_SHADE_bit in SPI_PS_INPUT_CNTL_x
+     * *and* FLAT_SHADE_ENA_bit in SPI_INTERP_CONTROL_0 */
+    // no VS exports as PS input (NUM_INTERP is not zero based, no minus one)
+    ereg  (accel_state->ib, SPI_PS_IN_CONTROL_0,                 (0 << NUM_INTERP_shift));
+    ereg  (accel_state->ib, SPI_PS_IN_CONTROL_1,                 0);
+    // color semantic id 0 -> GPR[0]
+    ereg  (accel_state->ib, SPI_PS_INPUT_CNTL_0 + (0 <<2),       ((0    << SEMANTIC_shift)	|
+								  (0x03 << DEFAULT_VAL_shift)	|
+								  FLAT_SHADE_bit		|
+								  SEL_CENTROID_bit));
+    ereg  (accel_state->ib, SPI_INTERP_CONTROL_0,                FLAT_SHADE_ENA_bit | 0);
+
+    // PS alu constants: fg unpacked per pixel depth into normalized R, G, B, A floats
+    if (pPix->drawable.bitsPerPixel == 16) {
+	r = (fg >> 11) & 0x1f;
+	g = (fg >> 5) & 0x3f;
+	b = (fg >> 0) & 0x1f;
+	ps_alu_consts[0] = (float)r / 31; //R
+	ps_alu_consts[1] = (float)g / 63; //G
+	ps_alu_consts[2] = (float)b / 31; //B
+	ps_alu_consts[3] = 1.0; //A
+    } else if (pPix->drawable.bitsPerPixel == 8) {
+	a = (fg >> 0) & 0xff;
+	ps_alu_consts[0] = 0.0; //R
+	ps_alu_consts[1] = 0.0; //G
+	ps_alu_consts[2] = 0.0; //B
+	ps_alu_consts[3] = (float)a / 255; //A
+    } else {
+	a = (fg >> 24) & 0xff;
+	r = (fg >> 16) & 0xff;
+	g = (fg >> 8) & 0xff;
+	b = (fg >> 0) & 0xff;
+	ps_alu_consts[0] = (float)r / 255; //R
+	ps_alu_consts[1] = (float)g / 255; //G
+	ps_alu_consts[2] = (float)b / 255; //B
+	ps_alu_consts[3] = (float)a / 255; //A
+    }
+    set_alu_consts(pScrn, accel_state->ib, 0, sizeof(ps_alu_consts) / SQ_ALU_CONSTANT_offset, ps_alu_consts);
+
+    accel_state->vb_index = 0; /* vertex buffer starts empty */
+
+#ifdef SHOW_VERTEXES
+    ErrorF("PM: 0x%08x\n", pm);
+#endif
+
+    return TRUE;
+}
+
+/* Queue one fill rectangle (x1,y1)-(x2,y2) as three corner vertices; nothing is drawn here. */
+static void
+R600Solid(PixmapPtr pPix, int x1, int y1, int x2, int y2)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pPix->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    struct r6xx_solid_vertex vertex[3];
+    struct r6xx_solid_vertex *solid_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+
+    vertex[0].x = (float)x1;
+    vertex[0].y = (float)y1;
+
+    vertex[1].x = (float)x1;
+    vertex[1].y = (float)y2;
+
+    vertex[2].x = (float)x2;
+    vertex[2].y = (float)y2;
+
+#ifdef SHOW_VERTEXES
+    ErrorF("vertex 0: %f, %f\n", vertex[0].x, vertex[0].y);
+    ErrorF("vertex 1: %f, %f\n", vertex[1].x, vertex[1].y);
+    ErrorF("vertex 2: %f, %f\n", vertex[2].x, vertex[2].y);
+#endif
+
+    // append to vertex buffer (second half of the indirect buffer); NOTE(review): vb_index is not bounds-checked here
+    solid_vb[accel_state->vb_index++] = vertex[0];
+    solid_vb[accel_state->vb_index++] = vertex[1];
+    solid_vb[accel_state->vb_index++] = vertex[2];
+}
+/* Emit one RECTLIST draw for the vertices queued by R600Solid, then call R600CPFlushIndirect. */
+static void
+R600DoneSolid(PixmapPtr pPix)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pPix->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    draw_config_t   draw_conf;
+    vtx_resource_t  vtx_res;
+
+    CLEAR (draw_conf);
+    CLEAR (vtx_res);
+
+    if (accel_state->vb_index == 0) { /* nothing was queued */
+	R600IBDiscard(pScrn, accel_state->ib);
+	return;
+    }
+
+    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
+	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2);
+    accel_state->vb_size = accel_state->vb_index * 8; /* 8 bytes per queued vertex */
+
+    /* flush vertex cache (TC action on RV610/RV620/RS780/RV710, VC action elsewhere) */
+    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
+	(info->ChipFamily == CHIP_FAMILY_RV620) ||
+	(info->ChipFamily == CHIP_FAMILY_RS780) ||
+	(info->ChipFamily == CHIP_FAMILY_RV710))
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+    else
+	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+
+    /* Vertex buffer setup: 2 dwords per vertex, entry count in dwords */
+    vtx_res.id              = SQ_VTX_RESOURCE_vs;
+    vtx_res.vtx_size_dw     = 8 / 4;
+    vtx_res.vtx_num_entries = accel_state->vb_size / 4;
+    vtx_res.mem_req_size    = 1;
+    vtx_res.vb_addr         = accel_state->vb_mc_addr;
+    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
+
+    /* Draw: num_indices works out to the number of queued vertices */
+    draw_conf.prim_type          = DI_PT_RECTLIST;
+    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
+    draw_conf.num_instances      = 1;
+    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
+    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
+
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
+
+    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
+    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
+    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
+
+    draw_auto(pScrn, accel_state->ib, &draw_conf);
+
+    wait_3d_idle_clean(pScrn, accel_state->ib);
+
+    /* sync dst surface */
+    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
+			accel_state->dst_size, accel_state->dst_mc_addr);
+
+    R600CPFlushIndirect(pScrn, accel_state->ib);
+}
+
+/*
+ * Emit all per-operation state for a textured-blit copy into a freshly
+ * acquired indirect buffer, which is stored in accel_state->ib.
+ *
+ * src_pitch / dst_pitch are used directly as the texture pitch and the
+ * render-target width, and the sizes handed to cp_set_surface_sync() are
+ * computed as pitch * height * (bpp / 8).  The callers in this file pass
+ * exaGetPixmapPitch() divided by the bytes per pixel.
+ * src_offset / dst_offset are used unchanged as the texture and
+ * render-target base addresses.
+ * rop indexes RADEON_ROP[] (no range check is done here); planemask is
+ * reduced to a per-channel colour write mask.
+ *
+ * No vertices are emitted: vb_index is reset to 0 on exit, and the callers
+ * append vertices afterwards and finish with R600DoCopy().
+ */
+static void
+R600DoPrepareCopy(ScrnInfoPtr pScrn,
+		  int src_pitch, int src_width, int src_height, uint32_t src_offset, int src_bpp,
+		  int dst_pitch, int dst_height, uint32_t dst_offset, int dst_bpp,
+		  int rop, Pixel planemask)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    int pmask = 0;
+    cb_config_t     cb_conf;
+    tex_resource_t  tex_res;
+    tex_sampler_t   tex_samp;
+    shader_config_t vs_conf, ps_conf;
+
+    CLEAR (cb_conf);
+    CLEAR (tex_res);
+    CLEAR (tex_samp);
+    CLEAR (vs_conf);
+    CLEAR (ps_conf);
+
+    /* every prepare starts a new indirect buffer */
+    accel_state->ib = RADEONCPGetBuffer(pScrn);
+
+    /* Init */
+    start_3d(pScrn, accel_state->ib);
+
+    //cp_set_surface_sync(pScrn, accel_state->ib);
+
+    set_default_state(pScrn, accel_state->ib);
+
+    /* Scissor / viewport */
+    ereg  (accel_state->ib, PA_CL_VTE_CNTL,                      VTX_XY_FMT_bit);
+    ereg  (accel_state->ib, PA_CL_CLIP_CNTL,                     CLIP_DISABLE_bit);
+
+    /* the copy shaders are addressed relative to the shader area in the framebuffer */
+    accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	accel_state->copy_vs_offset;
+    accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	accel_state->copy_ps_offset;
+    /* sizes passed to the shader cache flushes below */
+    accel_state->vs_size = 512;
+    accel_state->ps_size = 512;
+
+    /* Shader */
+
+    /* flush SQ cache */
+    cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
+			accel_state->vs_size, accel_state->vs_mc_addr);
+
+    vs_conf.shader_addr         = accel_state->vs_mc_addr;
+    vs_conf.num_gprs            = 2;
+    vs_conf.stack_size          = 0;
+    vs_setup                    (pScrn, accel_state->ib, &vs_conf);
+
+    /* flush SQ cache */
+    cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
+			accel_state->ps_size, accel_state->ps_mc_addr);
+
+    ps_conf.shader_addr         = accel_state->ps_mc_addr;
+    ps_conf.num_gprs            = 1;
+    ps_conf.stack_size          = 0;
+    ps_conf.uncached_first_inst = 1;
+    ps_conf.clamp_consts        = 0;
+    ps_conf.export_mode         = 2;
+    ps_setup                    (pScrn, accel_state->ib, &ps_conf);
+
+    /* the source surface is bound as texture unit 0 */
+    accel_state->src_size[0] = src_pitch * src_height * (src_bpp/8);
+    accel_state->src_mc_addr[0] = src_offset;
+    accel_state->src_pitch[0] = src_pitch;
+
+    /* flush texture cache */
+    cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			accel_state->src_size[0], accel_state->src_mc_addr[0]);
+
+    /* Texture */
+    tex_res.id                  = 0;
+    tex_res.w                   = src_width;
+    tex_res.h                   = src_height;
+    tex_res.pitch               = accel_state->src_pitch[0];
+    tex_res.depth               = 0;
+    tex_res.dim                 = SQ_TEX_DIM_2D;
+    tex_res.base                = accel_state->src_mc_addr[0];
+    tex_res.mip_base            = accel_state->src_mc_addr[0];
+    /* format and channel swizzle depend only on the source bpp;
+     * anything that is not 8 or 16 bpp is treated as 32 bpp */
+    if (src_bpp == 8) {
+	tex_res.format              = FMT_8;
+	tex_res.dst_sel_x           = SQ_SEL_1; //R
+	tex_res.dst_sel_y           = SQ_SEL_1; //G
+	tex_res.dst_sel_z           = SQ_SEL_1; //B
+	tex_res.dst_sel_w           = SQ_SEL_X; //A
+    } else if (src_bpp == 16) {
+	tex_res.format              = FMT_5_6_5;
+	tex_res.dst_sel_x           = SQ_SEL_Z; //R
+	tex_res.dst_sel_y           = SQ_SEL_Y; //G
+	tex_res.dst_sel_z           = SQ_SEL_X; //B
+	tex_res.dst_sel_w           = SQ_SEL_1; //A
+    } else {
+	tex_res.format              = FMT_8_8_8_8;
+	tex_res.dst_sel_x           = SQ_SEL_Z; //R
+	tex_res.dst_sel_y           = SQ_SEL_Y; //G
+	tex_res.dst_sel_z           = SQ_SEL_X; //B
+	tex_res.dst_sel_w           = SQ_SEL_W; //A
+    }
+
+    tex_res.request_size        = 1;
+    tex_res.base_level          = 0;
+    tex_res.last_level          = 0;
+    tex_res.perf_modulation     = 0;
+    set_tex_resource            (pScrn, accel_state->ib, &tex_res);
+
+    /* point sampling, clamped to the last texel in x and y */
+    tex_samp.id                 = 0;
+    tex_samp.clamp_x            = SQ_TEX_CLAMP_LAST_TEXEL;
+    tex_samp.clamp_y            = SQ_TEX_CLAMP_LAST_TEXEL;
+    tex_samp.clamp_z            = SQ_TEX_WRAP;
+    tex_samp.xy_mag_filter      = SQ_TEX_XY_FILTER_POINT;
+    tex_samp.xy_min_filter      = SQ_TEX_XY_FILTER_POINT;
+    tex_samp.z_filter           = SQ_TEX_Z_FILTER_NONE;
+    tex_samp.mip_filter         = 0;			/* no mipmap */
+    set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
+
+
+    /* Render setup */
+    /* The planemask is tested one byte per channel.
+     * NOTE(review): this matches an 8:8:8:8 pixel layout; confirm the
+     * intended behaviour for 8 and 16 bpp destinations. */
+    if (planemask & 0x000000ff)
+	pmask |= 4; //B
+    if (planemask & 0x0000ff00)
+	pmask |= 2; //G
+    if (planemask & 0x00ff0000)
+	pmask |= 1; //R
+    if (planemask & 0xff000000)
+	pmask |= 8; //A
+    ereg  (accel_state->ib, CB_SHADER_MASK,                      (pmask << OUTPUT0_ENABLE_shift));
+    /* written for every chip family, despite the R7xx prefix */
+    ereg  (accel_state->ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
+    ereg  (accel_state->ib, CB_COLOR_CONTROL,                    RADEON_ROP[rop]);
+
+    /* the destination surface is bound as colour buffer 0 */
+    accel_state->dst_size = dst_pitch * dst_height * (dst_bpp/8);
+    accel_state->dst_mc_addr = dst_offset;
+    accel_state->dst_pitch = dst_pitch;
+
+    cb_conf.id = 0;
+    cb_conf.w = accel_state->dst_pitch;
+    cb_conf.h = dst_height;
+    cb_conf.base = accel_state->dst_mc_addr;
+    if (dst_bpp == 8) {
+	cb_conf.format = COLOR_8;
+	cb_conf.comp_swap = 3; // A
+    } else if (dst_bpp == 16) {
+	cb_conf.format = COLOR_5_6_5;
+	cb_conf.comp_swap = 2; // RGB
+    } else {
+	cb_conf.format = COLOR_8_8_8_8;
+	cb_conf.comp_swap = 1; // ARGB
+    }
+    cb_conf.source_format = 1;
+    cb_conf.blend_clamp = 1;
+    set_render_target(pScrn, accel_state->ib, &cb_conf);
+
+    ereg  (accel_state->ib, PA_SU_SC_MODE_CNTL,                  (FACE_bit			|
+						 (POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift)	|
+						 (POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift)));
+    ereg  (accel_state->ib, DB_SHADER_CONTROL,                   ((1 << Z_ORDER_shift)		| /* EARLY_Z_THEN_LATE_Z */
+						 DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
+
+    /* Interpolator setup */
+    // export tex coord from VS
+    ereg  (accel_state->ib, SPI_VS_OUT_CONFIG, ((1 - 1) << VS_EXPORT_COUNT_shift));
+    ereg  (accel_state->ib, SPI_VS_OUT_ID_0, (0 << SEMANTIC_0_shift));
+
+    /* Enabling flat shading needs both FLAT_SHADE_bit in SPI_PS_INPUT_CNTL_x
+     * *and* FLAT_SHADE_ENA_bit in SPI_INTERP_CONTROL_0 */
+    // input tex coord from VS
+    ereg  (accel_state->ib, SPI_PS_IN_CONTROL_0,                 ((1 << NUM_INTERP_shift)));
+    ereg  (accel_state->ib, SPI_PS_IN_CONTROL_1,                 0);
+    // color semantic id 0 -> GPR[0]
+    ereg  (accel_state->ib, SPI_PS_INPUT_CNTL_0 + (0 <<2),       ((0    << SEMANTIC_shift)	|
+								  (0x01 << DEFAULT_VAL_shift)	|
+								  SEL_CENTROID_bit));
+    ereg  (accel_state->ib, SPI_INTERP_CONTROL_0,                0);
+
+    /* start with an empty vertex buffer */
+    accel_state->vb_index = 0;
+
+}
+
+/*
+ * Submit the copy that was set up by R600DoPrepareCopy().
+ *
+ * If no vertices were appended (vb_index == 0) the indirect buffer is
+ * discarded and nothing is drawn.  Otherwise the vertices stored in the
+ * second half of the indirect buffer are bound as the vertex resource, a
+ * single auto-indexed RECTLIST draw is emitted, the destination surface is
+ * synced and the indirect buffer is flushed.
+ */
+static void
+R600DoCopy(ScrnInfoPtr pScrn)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    draw_config_t   draw_conf;
+    vtx_resource_t  vtx_res;
+
+    CLEAR (draw_conf);
+    CLEAR (vtx_res);
+
+    /* nothing queued: drop the buffer instead of submitting it */
+    if (accel_state->vb_index == 0) {
+	R600IBDiscard(pScrn, accel_state->ib);
+	return;
+    }
+
+    /* address of the second half of this indirect buffer, computed from the
+     * GART base, the buffer area start and the buffer index */
+    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
+	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2);
+    /* 16 bytes per vertex (see vtx_size_dw below) */
+    accel_state->vb_size = accel_state->vb_index * 16;
+
+    /* flush vertex cache */
+    /* these four families get a TC flush here instead of a VC flush */
+    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
+	(info->ChipFamily == CHIP_FAMILY_RV620) ||
+	(info->ChipFamily == CHIP_FAMILY_RS780) ||
+	(info->ChipFamily == CHIP_FAMILY_RV710))
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+    else
+	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+
+    /* Vertex buffer setup */
+    vtx_res.id              = SQ_VTX_RESOURCE_vs;
+    vtx_res.vtx_size_dw     = 16 / 4;
+    vtx_res.vtx_num_entries = accel_state->vb_size / 4;
+    vtx_res.mem_req_size    = 1;
+    vtx_res.vb_addr         = accel_state->vb_mc_addr;
+    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
+
+    draw_conf.prim_type          = DI_PT_RECTLIST;
+    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
+    draw_conf.num_instances      = 1;
+    /* dwords in the buffer / dwords per vertex = number of vertices */
+    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
+    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
+
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
+
+    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
+    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
+    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
+
+    draw_auto(pScrn, accel_state->ib, &draw_conf);
+
+    wait_3d_idle_clean(pScrn, accel_state->ib);
+
+    /* sync dst surface */
+    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
+			accel_state->dst_size, accel_state->dst_mc_addr);
+
+    R600CPFlushIndirect(pScrn, accel_state->ib);
+}
+
+/*
+ * Queue one w x h rectangle copy from (srcX, srcY) to (dstX, dstY).
+ *
+ * Three vertices (top-left, bottom-left, bottom-right) are appended to the
+ * vertex area in the second half of the current indirect buffer; each
+ * vertex carries the destination position (x, y) and the source texture
+ * coordinate (s, t).  accel_state->vb_index advances by three.
+ *
+ * NOTE(review): there is no check here that the vertex area (ib->total / 2
+ * bytes) still has room for three more vertices -- confirm that callers
+ * cannot queue enough rectangles between prepare and done to overrun it.
+ */
+static void
+R600AppendCopyVertex(ScrnInfoPtr pScrn,
+		     int srcX, int srcY,
+		     int dstX, int dstY,
+		     int w, int h)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    struct r6xx_copy_vertex *copy_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+    struct r6xx_copy_vertex vertex[3];
+
+    /* top-left */
+    vertex[0].x = (float)dstX;
+    vertex[0].y = (float)dstY;
+    vertex[0].s = (float)srcX;
+    vertex[0].t = (float)srcY;
+
+    /* bottom-left */
+    vertex[1].x = (float)dstX;
+    vertex[1].y = (float)(dstY + h);
+    vertex[1].s = (float)srcX;
+    vertex[1].t = (float)(srcY + h);
+
+    /* bottom-right */
+    vertex[2].x = (float)(dstX + w);
+    vertex[2].y = (float)(dstY + h);
+    vertex[2].s = (float)(srcX + w);
+    vertex[2].t = (float)(srcY + h);
+
+#ifdef SHOW_VERTEXES
+    /* all four components are floats, so every conversion must be %f */
+    ErrorF("vertex 0: %f, %f, %f, %f\n", vertex[0].x, vertex[0].y, vertex[0].s, vertex[0].t);
+    ErrorF("vertex 1: %f, %f, %f, %f\n", vertex[1].x, vertex[1].y, vertex[1].s, vertex[1].t);
+    ErrorF("vertex 2: %f, %f, %f, %f\n", vertex[2].x, vertex[2].y, vertex[2].s, vertex[2].t);
+#endif
+
+    // append to vertex buffer
+    copy_vb[accel_state->vb_index++] = vertex[0];
+    copy_vb[accel_state->vb_index++] = vertex[1];
+    copy_vb[accel_state->vb_index++] = vertex[2];
+
+}
+
+/*
+ * EXA PrepareCopy hook.
+ *
+ * Records pitch and address of both pixmaps in the accel state, rejects
+ * surfaces the copy path cannot handle (pitch not a multiple of 8,
+ * address not 256-byte aligned, 24 bpp) and then either emits the copy
+ * state right away or, when source and destination share the same pixmap
+ * offset, only remembers rop/planemask so that each Copy call can set up
+ * its own blits.
+ *
+ * Returns TRUE when the copy is accepted, FALSE to request a fallback.
+ */
+static Bool
+R600PrepareCopy(PixmapPtr pSrc,   PixmapPtr pDst,
+		int xdir, int ydir,
+		int rop,
+		Pixel planemask)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+
+    /* pitches are stored in pixels, addresses as absolute MC addresses */
+    accel_state->dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
+    accel_state->src_pitch[0] = exaGetPixmapPitch(pSrc) / (pSrc->drawable.bitsPerPixel / 8);
+
+    accel_state->src_mc_addr[0] = exaGetPixmapOffset(pSrc) + info->fbLocation + pScrn->fbOffset;
+    accel_state->dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+
+    /* pitch must be a multiple of 8 */
+    if ((accel_state->src_pitch[0] & 7) || (accel_state->dst_pitch & 7))
+	return FALSE;
+
+    /* base address must be 256-byte aligned */
+    if ((accel_state->src_mc_addr[0] & 0xff) || (accel_state->dst_mc_addr & 0xff))
+	return FALSE;
+
+    /* packed 24 bpp is not handled */
+    if ((pSrc->drawable.bitsPerPixel == 24) || (pDst->drawable.bitsPerPixel == 24))
+	return FALSE;
+
+#ifdef SHOW_VERTEXES
+    ErrorF("src: %dx%d @ %dbpp, 0x%08x\n", pSrc->drawable.width, pSrc->drawable.height,
+	   pSrc->drawable.bitsPerPixel, exaGetPixmapPitch(pSrc));
+    ErrorF("dst: %dx%d @ %dbpp, 0x%08x\n", pDst->drawable.width, pDst->drawable.height,
+	   pDst->drawable.bitsPerPixel, exaGetPixmapPitch(pDst));
+#endif
+
+    if (exaGetPixmapOffset(pSrc) != exaGetPixmapOffset(pDst)) {
+	/* distinct surfaces: emit the state now, vertices follow in Copy */
+	accel_state->same_surface = FALSE;
+
+	R600DoPrepareCopy(pScrn,
+			  accel_state->src_pitch[0], pSrc->drawable.width, pSrc->drawable.height,
+			  accel_state->src_mc_addr[0], pSrc->drawable.bitsPerPixel,
+			  accel_state->dst_pitch, pDst->drawable.height,
+			  accel_state->dst_mc_addr, pDst->drawable.bitsPerPixel,
+			  rop, planemask);
+
+	return TRUE;
+    }
+
+    /* same surface: state setup is deferred to the Copy hook */
+    accel_state->same_surface = TRUE;
+    accel_state->rop = rop;
+    accel_state->planemask = planemask;
+
+#ifdef SHOW_VERTEXES
+    ErrorF("same surface!\n");
+#endif
+
+    return TRUE;
+}
+
+/*
+ * Returns TRUE when at least one corner of the source rectangle
+ * (sx1..sx2, sy1..sy2) lies inside the destination rectangle
+ * (dx1..dx2, dy1..dy2); all bounds are compared inclusively.
+ */
+static Bool
+is_overlap(int sx1, int sx2, int sy1, int sy2, int dx1, int dx2, int dy1, int dy2)
+{
+    /* is each source edge coordinate within the destination span? */
+    int left_in   = (sx1 >= dx1) && (sx1 <= dx2);
+    int right_in  = (sx2 >= dx1) && (sx2 <= dx2);
+    int top_in    = (sy1 >= dy1) && (sy1 <= dy2);
+    int bottom_in = (sy2 >= dy1) && (sy2 <= dy2);
+
+    /* a corner is inside when one x edge and one y edge are both inside */
+    if ((left_in || right_in) && (top_in || bottom_in))
+	return TRUE;
+
+    return FALSE;
+}
+
+/*
+ * Perform one complete same-surface blit of a single w x h rectangle:
+ * set up the copy state with pPix as both source and destination, queue
+ * the rectangle and submit it.  rop and planemask come from the values
+ * saved in the accel state by R600PrepareCopy().
+ */
+static void
+R600SameSurfaceBlit(ScrnInfoPtr pScrn, PixmapPtr pPix,
+		    uint32_t pitch, uint32_t offset,
+		    int srcX, int srcY,
+		    int dstX, int dstY,
+		    int w, int h)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+
+    R600DoPrepareCopy(pScrn,
+		      pitch, pPix->drawable.width, pPix->drawable.height, offset, pPix->drawable.bitsPerPixel,
+		      pitch, pPix->drawable.height, offset, pPix->drawable.bitsPerPixel,
+		      accel_state->rop, accel_state->planemask);
+
+    // append to vertex buffer
+    R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
+
+    // do the blit
+    R600DoCopy(pScrn);
+}
+
+/*
+ * Copy a w x h rectangle within a single pixmap.
+ *
+ * When source and destination do not overlap, the rectangle is copied
+ * with one blit.  When they overlap, it is copied one pixel-wide column
+ * (same row: srcY == dstY) or one pixel-high row (otherwise) at a time,
+ * each as its own blit, walking in the direction that reads every strip
+ * before it is overwritten:
+ *   - moving right: columns right to left;  moving left: left to right
+ *   - moving up:    rows top to bottom;     moving down: bottom to top
+ */
+static void
+R600OverlapCopy(PixmapPtr pDst,
+		int srcX, int srcY,
+		int dstX, int dstY,
+		int w, int h)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    uint32_t dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
+    uint32_t dst_offset = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+    int i;
+
+    if (!is_overlap(srcX, srcX + w, srcY, srcY + h,
+		    dstX, dstX + w, dstY, dstY + h)) {
+	/* disjoint rectangles: a single blit is enough */
+	R600SameSurfaceBlit(pScrn, pDst, dst_pitch, dst_offset,
+			    srcX, srcY, dstX, dstY, w, h);
+	return;
+    }
+
+    if (srcY == dstY) { // left/right
+	if (srcX < dstX) { // right
+	    // copy right to left
+	    for (i = w; i > 0; i--)
+		R600SameSurfaceBlit(pScrn, pDst, dst_pitch, dst_offset,
+				    srcX + i - 1, srcY, dstX + i - 1, dstY, 1, h);
+	} else { //left
+	    // copy left to right
+	    for (i = 0; i < w; i++)
+		R600SameSurfaceBlit(pScrn, pDst, dst_pitch, dst_offset,
+				    srcX + i, srcY, dstX + i, dstY, 1, h);
+	}
+    } else { //up/down
+	if (srcY > dstY) { // up
+	    // copy top to bottom
+	    for (i = 0; i < h; i++)
+		R600SameSurfaceBlit(pScrn, pDst, dst_pitch, dst_offset,
+				    srcX, srcY + i, dstX, dstY + i, w, 1);
+	} else { // down
+	    // copy bottom to top
+	    for (i = h; i > 0; i--)
+		R600SameSurfaceBlit(pScrn, pDst, dst_pitch, dst_offset,
+				    srcX, srcY + i - 1, dstX, dstY + i - 1, w, 1);
+	}
+    }
+}
+
+/*
+ * EXA Copy hook: copy one w x h rectangle from (srcX, srcY) to
+ * (dstX, dstY).  Copies between distinct surfaces are only queued here;
+ * copies within one surface are carried out immediately.
+ */
+static void
+R600Copy(PixmapPtr pDst,
+	 int srcX, int srcY,
+	 int dstX, int dstY,
+	 int w, int h)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+
+    if (!accel_state->same_surface) {
+	/* separate surfaces: just add the rectangle to the vertex buffer */
+	R600AppendCopyVertex(pScrn, srcX, srcY, dstX, dstY, w, h);
+	return;
+    }
+
+    /* blit to/from the same surface */
+    R600OverlapCopy(pDst, srcX, srcY, dstX, dstY, w, h);
+}
+
+/*
+ * EXA DoneCopy hook: submit the rectangles queued by R600Copy().
+ * Same-surface copies were already submitted one by one, so there is
+ * nothing left to do for them.
+ */
+static void
+R600DoneCopy(PixmapPtr pDst)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+
+    if (!accel_state->same_surface)
+	R600DoCopy(pScrn);
+}
+
+/* Compile-time debug switches.  RADEON_TRACE_FALL selects which of the two
+ * RADEON_FALLBACK definitions below is used. */
+#define RADEON_TRACE_FALL 0
+#define RADEON_TRACE_DRAW 0
+
+/*
+ * RADEON_FALLBACK(x): make the enclosing function return FALSE.
+ * x is a parenthesised printf-style argument list, e.g.
+ *     RADEON_FALLBACK(("Bad format 0x%x\n", fmt));
+ * With tracing enabled the function name and the message are printed with
+ * ErrorF first; with tracing disabled x is discarded entirely, so its
+ * arguments are never evaluated.
+ */
+#if RADEON_TRACE_FALL
+#define RADEON_FALLBACK(x)     		\
+do {					\
+	ErrorF("%s: ", __FUNCTION__);	\
+	ErrorF x;			\
+	return FALSE;			\
+} while (0)
+#else
+#define RADEON_FALLBACK(x) return FALSE
+#endif
+
+/* Convert a fixed-point value with 16 fractional bits (scale 65536) to float. */
+#define xFixedToFloat(f) (((float) (f)) / 65536)
+
+/*
+ * Apply a picture transform to a fixed-point point, in place.
+ * The point is extended to (x, y, xFixed1) before being handed to
+ * PictureTransformPoint(); only the first two components are stored back.
+ */
+static inline void transformPoint(PictTransform *transform, xPointFixed *point)
+{
+    PictVector vec;
+
+    vec.vector[2] = xFixed1;
+    vec.vector[1] = point->y;
+    vec.vector[0] = point->x;
+
+    PictureTransformPoint(transform, &vec);
+
+    point->y = vec.vector[1];
+    point->x = vec.vector[0];
+}
+
+/* Description of one blend operation, as stored in R600BlendOp[]. */
+struct blendinfo {
+    Bool dst_alpha;		/* the blend factors reference destination alpha */
+    Bool src_alpha;		/* the blend factors reference source alpha */
+    uint32_t blend_cntl;	/* source and destination blend factors, placed at
+				 * COLOR_SRCBLEND_shift and COLOR_DESTBLEND_shift */
+};
+
+/*
+ * Blend factors for each supported compositing operator.  The table is
+ * indexed directly by the operator code (see R600GetBlendCntl()), so the
+ * entry order below must match the operator numbering; operators at or
+ * beyond the end of the table are not supported.
+ */
+static struct blendinfo R600BlendOp[] = {
+    /* Clear */
+    {0, 0, (BLEND_ZERO << COLOR_SRCBLEND_shift) | (BLEND_ZERO << COLOR_DESTBLEND_shift)},
+    /* Src */
+    {0, 0, (BLEND_ONE << COLOR_SRCBLEND_shift) | (BLEND_ZERO << COLOR_DESTBLEND_shift)},
+    /* Dst */
+    {0, 0, (BLEND_ZERO << COLOR_SRCBLEND_shift) | (BLEND_ONE << COLOR_DESTBLEND_shift)},
+    /* Over */
+    {0, 1, (BLEND_ONE << COLOR_SRCBLEND_shift) | (BLEND_ONE_MINUS_SRC_ALPHA << COLOR_DESTBLEND_shift)},
+    /* OverReverse */
+    {1, 0, (BLEND_ONE_MINUS_DST_ALPHA << COLOR_SRCBLEND_shift) | (BLEND_ONE << COLOR_DESTBLEND_shift)},
+    /* In */
+    {1, 0, (BLEND_DST_ALPHA << COLOR_SRCBLEND_shift) | (BLEND_ZERO << COLOR_DESTBLEND_shift)},
+    /* InReverse */
+    {0, 1, (BLEND_ZERO << COLOR_SRCBLEND_shift) | (BLEND_SRC_ALPHA << COLOR_DESTBLEND_shift)},
+    /* Out */
+    {1, 0, (BLEND_ONE_MINUS_DST_ALPHA << COLOR_SRCBLEND_shift) | (BLEND_ZERO << COLOR_DESTBLEND_shift)},
+    /* OutReverse */
+    {0, 1, (BLEND_ZERO << COLOR_SRCBLEND_shift) | (BLEND_ONE_MINUS_SRC_ALPHA << COLOR_DESTBLEND_shift)},
+    /* Atop */
+    {1, 1, (BLEND_DST_ALPHA << COLOR_SRCBLEND_shift) | (BLEND_ONE_MINUS_SRC_ALPHA << COLOR_DESTBLEND_shift)},
+    /* AtopReverse */
+    {1, 1, (BLEND_ONE_MINUS_DST_ALPHA << COLOR_SRCBLEND_shift) | (BLEND_SRC_ALPHA << COLOR_DESTBLEND_shift)},
+    /* Xor */
+    {1, 1, (BLEND_ONE_MINUS_DST_ALPHA << COLOR_SRCBLEND_shift) | (BLEND_ONE_MINUS_SRC_ALPHA << COLOR_DESTBLEND_shift)},
+    /* Add */
+    {0, 0, (BLEND_ONE << COLOR_SRCBLEND_shift) | (BLEND_ONE << COLOR_DESTBLEND_shift)},
+};
+
+/* Maps a Render picture format to the hardware texture format code. */
+struct formatinfo {
+    unsigned int fmt;		/* PICT_* picture format */
+    uint32_t card_fmt;		/* FMT_* hardware texture format */
+};
+
+/*
+ * Picture formats accepted as composite textures.  Formats that differ
+ * only in channel order or in the presence of alpha share one hardware
+ * format; the difference is expressed through the component swizzle set
+ * up in R600TextureSetup().
+ */
+static struct formatinfo R600TexFormats[] = {
+    {PICT_a8r8g8b8,	FMT_8_8_8_8},
+    {PICT_x8r8g8b8,	FMT_8_8_8_8},
+    {PICT_a8b8g8r8,	FMT_8_8_8_8},
+    {PICT_x8b8g8r8,	FMT_8_8_8_8},
+    {PICT_r5g6b5,	FMT_5_6_5},
+    {PICT_a1r5g5b5,	FMT_1_5_5_5},
+    {PICT_x1r5g5b5,     FMT_1_5_5_5},
+    {PICT_a8,		FMT_8},
+};
+
+/*
+ * Return the source | destination blend factor bits for operator op,
+ * adjusted for the destination format and for component-alpha masks.
+ * op must be a valid index into R600BlendOp[]; it is not checked here.
+ */
+static uint32_t R600GetBlendCntl(int op, PicturePtr pMask, uint32_t dst_format)
+{
+    const struct blendinfo *bop = &R600BlendOp[op];
+    uint32_t src_factor = bop->blend_cntl & COLOR_SRCBLEND_mask;
+    uint32_t dst_factor = bop->blend_cntl & COLOR_DESTBLEND_mask;
+
+    /* A destination without an alpha channel behaves as if alpha were
+     * always 1, so DST_ALPHA becomes ONE and 1 - DST_ALPHA becomes ZERO.
+     */
+    if (bop->dst_alpha && PICT_FORMAT_A(dst_format) == 0) {
+	if (src_factor == (BLEND_DST_ALPHA << COLOR_SRCBLEND_shift)) {
+	    src_factor = BLEND_ONE << COLOR_SRCBLEND_shift;
+	} else if (src_factor == (BLEND_ONE_MINUS_DST_ALPHA << COLOR_SRCBLEND_shift)) {
+	    src_factor = BLEND_ZERO << COLOR_SRCBLEND_shift;
+	}
+    }
+
+    /* With a component-alpha mask the destination factor has to use the
+     * per-channel source colour instead of the single source alpha.
+     */
+    if (bop->src_alpha && pMask && pMask->componentAlpha) {
+	if (dst_factor == (BLEND_SRC_ALPHA << COLOR_DESTBLEND_shift)) {
+	    dst_factor = BLEND_SRC_COLOR << COLOR_DESTBLEND_shift;
+	} else if (dst_factor == (BLEND_ONE_MINUS_SRC_ALPHA << COLOR_DESTBLEND_shift)) {
+	    dst_factor = BLEND_ONE_MINUS_SRC_COLOR << COLOR_DESTBLEND_shift;
+	}
+    }
+
+    return src_factor | dst_factor;
+}
+
+/*
+ * Translate the destination picture format into the hardware colour
+ * buffer format.  Returns TRUE and stores the result in *dst_format, or
+ * returns FALSE (leaving *dst_format untouched) for unsupported formats.
+ */
+static Bool R600GetDestFormat(PicturePtr pDstPicture, uint32_t *dst_format)
+{
+    uint32_t hw_format;
+
+    switch (pDstPicture->format) {
+    case PICT_a8r8g8b8:
+    case PICT_x8r8g8b8:
+	hw_format = COLOR_8_8_8_8;
+	break;
+    case PICT_r5g6b5:
+	hw_format = COLOR_5_6_5;
+	break;
+    case PICT_a1r5g5b5:
+    case PICT_x1r5g5b5:
+	hw_format = COLOR_1_5_5_5;
+	break;
+    case PICT_a8:
+	hw_format = COLOR_8;
+	break;
+    default:
+	RADEON_FALLBACK(("Unsupported dest format 0x%x\n",
+	       (int)pDstPicture->format));
+    }
+
+    *dst_format = hw_format;
+    return TRUE;
+}
+
+/*
+ * Check whether pPict can be used as a composite texture.
+ *
+ * Returns FALSE (fallback) when the picture has no backing drawable, is
+ * larger than 8192x8192, has a format missing from R600TexFormats[], uses
+ * a filter other than nearest/bilinear, or is a transformed, non-repeating
+ * picture without alpha in a situation where the missing border alpha
+ * would be visible.  Returns TRUE otherwise.
+ *
+ * unit is accepted for symmetry with R600TextureSetup() but not used.
+ */
+static Bool R600CheckCompositeTexture(PicturePtr pPict,
+				      PicturePtr pDstPict,
+				      int op,
+				      int unit)
+{
+    int w, h;
+    unsigned int i;
+    int max_tex_w, max_tex_h;
+
+    /* source-only pictures (solid fills, gradients) have no drawable to
+     * texture from; fall back instead of dereferencing a NULL pointer */
+    if (!pPict->pDrawable)
+	RADEON_FALLBACK(("Picture has no drawable\n"));
+
+    w = pPict->pDrawable->width;
+    h = pPict->pDrawable->height;
+
+    max_tex_w = 8192;
+    max_tex_h = 8192;
+
+    if ((w > max_tex_w) || (h > max_tex_h))
+	RADEON_FALLBACK(("Picture w/h too large (%dx%d)\n", w, h));
+
+    for (i = 0; i < sizeof(R600TexFormats) / sizeof(R600TexFormats[0]); i++) {
+	if (R600TexFormats[i].fmt == pPict->format)
+	    break;
+    }
+    if (i == sizeof(R600TexFormats) / sizeof(R600TexFormats[0]))
+	RADEON_FALLBACK(("Unsupported picture format 0x%x\n",
+			 (int)pPict->format));
+
+    if (pPict->filter != PictFilterNearest &&
+	pPict->filter != PictFilterBilinear)
+	RADEON_FALLBACK(("Unsupported filter 0x%x\n", pPict->filter));
+
+    /* for REPEAT_NONE, Render semantics are that sampling outside the source
+     * picture results in alpha=0 pixels. We can implement this with a border color
+     * *if* our source texture has an alpha channel, otherwise we need to fall
+     * back. If we're not transformed then we hope that upper layers have clipped
+     * rendering to the bounds of the source drawable, in which case it doesn't
+     * matter. I have not, however, verified that the X server always does such
+     * clipping.
+     */
+    //FIXME R6xx
+    if (pPict->transform != 0 && !pPict->repeat && PICT_FORMAT_A(pPict->format) == 0) {
+	/* the only case that is still fine: Src/Clear onto a destination
+	 * that has no alpha channel either */
+	if (!(((op == PictOpSrc) || (op == PictOpClear)) && (PICT_FORMAT_A(pDstPict->format) == 0)))
+	    RADEON_FALLBACK(("REPEAT_NONE unsupported for transformed xRGB source\n"));
+    }
+
+    return TRUE;
+}
+
+/*
+ * Bind pPix as texture `unit` for a composite operation.
+ *
+ * Records size, pitch and address of the texture in the accel state,
+ * flushes the texture cache for it, and emits the texture resource
+ * (format and channel swizzle chosen from the picture format) and the
+ * sampler (clamp mode from the repeat type, filter from the picture
+ * filter).  The picture transform, if any, is remembered in
+ * accel_state->transform[unit].
+ *
+ * Returns TRUE on success, FALSE when the picture format, repeat type or
+ * filter is not supported.  Commands are written to accel_state->ib,
+ * which must already be set up by the caller.
+ */
+static Bool R600TextureSetup(PicturePtr pPict, PixmapPtr pPix,
+					int unit)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pPix->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    int w = pPict->pDrawable->width;
+    int h = pPict->pDrawable->height;
+    unsigned int i;
+    tex_resource_t  tex_res;
+    tex_sampler_t   tex_samp;
+
+    CLEAR (tex_res);
+    CLEAR (tex_samp);
+
+    for (i = 0; i < sizeof(R600TexFormats) / sizeof(R600TexFormats[0]); i++) {
+	if (R600TexFormats[i].fmt == pPict->format)
+	    break;
+    }
+    /* no match: bail out before R600TexFormats[i] is read past the end
+     * of the table and before any state is touched */
+    if (i == sizeof(R600TexFormats) / sizeof(R600TexFormats[0]))
+	RADEON_FALLBACK(("Unsupported picture format 0x%x\n",
+			 (int)pPict->format));
+
+    accel_state->texW[unit] = w;
+    accel_state->texH[unit] = h;
+
+    //ErrorF("Tex %d setup %dx%d\n", unit, w, h);
+
+    /* pitch is kept in pixels, size in bytes */
+    accel_state->src_pitch[unit] = exaGetPixmapPitch(pPix) / (pPix->drawable.bitsPerPixel / 8);
+    accel_state->src_size[unit] = exaGetPixmapPitch(pPix) * h;
+    accel_state->src_mc_addr[unit] = exaGetPixmapOffset(pPix) + info->fbLocation + pScrn->fbOffset;
+    /* flush texture cache */
+    cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			accel_state->src_size[unit], accel_state->src_mc_addr[unit]);
+
+    /* Texture */
+    tex_res.id                  = unit;
+    tex_res.w                   = w;
+    tex_res.h                   = h;
+    tex_res.pitch               = accel_state->src_pitch[unit];
+    tex_res.depth               = 0;
+    tex_res.dim                 = SQ_TEX_DIM_2D;
+    tex_res.base                = accel_state->src_mc_addr[unit];
+    tex_res.mip_base            = accel_state->src_mc_addr[unit];
+    tex_res.format              = R600TexFormats[i].card_fmt;
+    tex_res.request_size        = 1;
+
+    /* component swizzles */
+    // XXX double check these
+    switch (pPict->format) {
+    case PICT_a1r5g5b5:
+    case PICT_a8r8g8b8:
+	//ErrorF("%s: PICT_a8r8g8b8\n", unit ? "mask" : "src");
+	tex_res.dst_sel_x           = SQ_SEL_Z; //R
+	tex_res.dst_sel_y           = SQ_SEL_Y; //G
+	tex_res.dst_sel_z           = SQ_SEL_X; //B
+	tex_res.dst_sel_w           = SQ_SEL_W; //A
+	break;
+    case PICT_a8b8g8r8:
+	//ErrorF("%s: PICT_a8b8g8r8\n", unit ? "mask" : "src");
+	tex_res.dst_sel_x           = SQ_SEL_X; //R
+	tex_res.dst_sel_y           = SQ_SEL_Y; //G
+	tex_res.dst_sel_z           = SQ_SEL_Z; //B
+	tex_res.dst_sel_w           = SQ_SEL_W; //A
+	break;
+    case PICT_x8b8g8r8:
+	//ErrorF("%s: PICT_x8b8g8r8\n", unit ? "mask" : "src");
+	tex_res.dst_sel_x           = SQ_SEL_X; //R
+	tex_res.dst_sel_y           = SQ_SEL_Y; //G
+	tex_res.dst_sel_z           = SQ_SEL_Z; //B
+	tex_res.dst_sel_w           = SQ_SEL_1; //A
+	break;
+    case PICT_x1r5g5b5:
+    case PICT_x8r8g8b8:
+	//ErrorF("%s: PICT_x8r8g8b8\n", unit ? "mask" : "src");
+	tex_res.dst_sel_x           = SQ_SEL_Z; //R
+	tex_res.dst_sel_y           = SQ_SEL_Y; //G
+	tex_res.dst_sel_z           = SQ_SEL_X; //B
+	tex_res.dst_sel_w           = SQ_SEL_1; //A
+	break;
+    case PICT_r5g6b5:
+	//ErrorF("%s: PICT_r5g6b5\n", unit ? "mask" : "src");
+	tex_res.dst_sel_x           = SQ_SEL_Z; //R
+	tex_res.dst_sel_y           = SQ_SEL_Y; //G
+	tex_res.dst_sel_z           = SQ_SEL_X; //B
+	tex_res.dst_sel_w           = SQ_SEL_1; //A
+	break;
+    case PICT_a8:
+	//ErrorF("%s: PICT_a8\n", unit ? "mask" : "src");
+	tex_res.dst_sel_x           = SQ_SEL_0; //R
+	tex_res.dst_sel_y           = SQ_SEL_0; //G
+	tex_res.dst_sel_z           = SQ_SEL_0; //B
+	tex_res.dst_sel_w           = SQ_SEL_X; //A
+	break;
+    default:
+	RADEON_FALLBACK(("Bad format 0x%x\n", pPict->format));
+    }
+
+    tex_res.base_level          = 0;
+    tex_res.last_level          = 0;
+    tex_res.perf_modulation     = 0;
+    set_tex_resource            (pScrn, accel_state->ib, &tex_res);
+
+    tex_samp.id                 = unit;
+    /* transparent black border, used by the RepeatNone clamp mode below */
+    tex_samp.border_color       = SQ_TEX_BORDER_COLOR_TRANS_BLACK;
+
+    switch (pPict->repeatType) {
+    case RepeatNormal:
+	tex_samp.clamp_x            = SQ_TEX_WRAP;
+	tex_samp.clamp_y            = SQ_TEX_WRAP;
+	break;
+    case RepeatPad:
+	tex_samp.clamp_x            = SQ_TEX_CLAMP_LAST_TEXEL;
+	tex_samp.clamp_y            = SQ_TEX_CLAMP_LAST_TEXEL;
+	break;
+    case RepeatReflect:
+	tex_samp.clamp_x            = SQ_TEX_MIRROR;
+	tex_samp.clamp_y            = SQ_TEX_MIRROR;
+	break;
+    case RepeatNone:
+	tex_samp.clamp_x            = SQ_TEX_CLAMP_BORDER;
+	tex_samp.clamp_y            = SQ_TEX_CLAMP_BORDER;
+	break;
+    default:
+	RADEON_FALLBACK(("Bad repeat 0x%x\n", pPict->repeatType));
+    }
+
+    switch (pPict->filter) {
+    case PictFilterNearest:
+	tex_samp.xy_mag_filter      = SQ_TEX_XY_FILTER_POINT;
+	tex_samp.xy_min_filter      = SQ_TEX_XY_FILTER_POINT;
+	break;
+    case PictFilterBilinear:
+	tex_samp.xy_mag_filter      = SQ_TEX_XY_FILTER_BILINEAR;
+	tex_samp.xy_min_filter      = SQ_TEX_XY_FILTER_BILINEAR;
+	break;
+    default:
+	RADEON_FALLBACK(("Bad filter 0x%x\n", pPict->filter));
+    }
+
+    tex_samp.clamp_z            = SQ_TEX_WRAP;
+    tex_samp.z_filter           = SQ_TEX_Z_FILTER_NONE;
+    tex_samp.mip_filter         = 0;			/* no mipmap */
+    set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
+
+    /* remember the transform so texture coordinates can be transformed later */
+    if (pPict->transform != 0) {
+	accel_state->is_transform[unit] = TRUE;
+	accel_state->transform[unit] = pPict->transform;
+    } else
+	accel_state->is_transform[unit] = FALSE;
+
+    return TRUE;
+}
+
+/*
+ * EXA CheckComposite hook: decide from the pictures alone whether a
+ * Composite request can be handled by the R6xx/R7xx acceleration path.
+ *
+ * op            Render operator; must index into R600BlendOp[].
+ * pSrcPicture   source picture.
+ * pMaskPicture  optional mask picture (may be NULL).
+ * pDstPicture   destination picture.
+ *
+ * Returns TRUE when the operation can be accelerated.  Every rejection
+ * goes through RADEON_FALLBACK() or a plain "return FALSE".
+ */
+static Bool R600CheckComposite(int op, PicturePtr pSrcPicture, PicturePtr pMaskPicture,
+			       PicturePtr pDstPicture)
+{
+    uint32_t tmp1;
+    PixmapPtr pSrcPixmap, pDstPixmap;
+    int max_tex_w, max_tex_h, max_dst_w, max_dst_h;
+
+    /* Check for unsupported compositing operations.  op is used as an
+     * index into R600BlendOp[] below, so reject negative values too.
+     */
+    if (op < 0 || op >= (int) (sizeof(R600BlendOp) / sizeof(R600BlendOp[0])))
+	RADEON_FALLBACK(("Unsupported Composite op 0x%x\n", op));
+
+    /* A picture without a backing drawable cannot be looked up as a
+     * pixmap; bail out before RADEONGetDrawablePixmap() dereferences it.
+     */
+    if (!pSrcPicture->pDrawable)
+	RADEON_FALLBACK(("Source picture has no drawable\n"));
+
+    pSrcPixmap = RADEONGetDrawablePixmap(pSrcPicture->pDrawable);
+
+    /* Size limits applied to textures and to the render target. */
+    max_tex_w = 8192;
+    max_tex_h = 8192;
+    max_dst_w = 8192;
+    max_dst_h = 8192;
+
+    if (pSrcPixmap->drawable.width >= max_tex_w ||
+	pSrcPixmap->drawable.height >= max_tex_h) {
+	RADEON_FALLBACK(("Source w/h too large (%d,%d).\n",
+			 pSrcPixmap->drawable.width,
+			 pSrcPixmap->drawable.height));
+    }
+
+    pDstPixmap = RADEONGetDrawablePixmap(pDstPicture->pDrawable);
+
+    if (pDstPixmap->drawable.width >= max_dst_w ||
+	pDstPixmap->drawable.height >= max_dst_h) {
+	RADEON_FALLBACK(("Dest w/h too large (%d,%d).\n",
+			 pDstPixmap->drawable.width,
+			 pDstPixmap->drawable.height));
+    }
+
+    if (pMaskPicture) {
+	PixmapPtr pMaskPixmap;
+
+	/* Same NULL-drawable guard as for the source picture. */
+	if (!pMaskPicture->pDrawable)
+	    RADEON_FALLBACK(("Mask picture has no drawable\n"));
+
+	pMaskPixmap = RADEONGetDrawablePixmap(pMaskPicture->pDrawable);
+
+	if (pMaskPixmap->drawable.width >= max_tex_w ||
+	    pMaskPixmap->drawable.height >= max_tex_h) {
+	    RADEON_FALLBACK(("Mask w/h too large (%d,%d).\n",
+			     pMaskPixmap->drawable.width,
+			     pMaskPixmap->drawable.height));
+	}
+
+	if (pMaskPicture->componentAlpha) {
+	    /* Check if it's component alpha that relies on a source alpha and
+	     * on the source value.  We can only get one of those into the
+	     * single source value that we get to blend with.
+	     */
+	    if (R600BlendOp[op].src_alpha &&
+		(R600BlendOp[op].blend_cntl & COLOR_SRCBLEND_mask) !=
+		(BLEND_ZERO << COLOR_SRCBLEND_shift)) {
+		RADEON_FALLBACK(("Component alpha not supported with source "
+				 "alpha and source value blending.\n"));
+	    }
+	}
+
+	/* Texture unit 1 carries the mask. */
+	if (!R600CheckCompositeTexture(pMaskPicture, pDstPicture, op, 1))
+	    return FALSE;
+    }
+
+    /* Texture unit 0 carries the source. */
+    if (!R600CheckCompositeTexture(pSrcPicture, pDstPicture, op, 0))
+	return FALSE;
+
+    /* Only the success/failure of the lookup matters here; the format
+     * value written to tmp1 is not used.
+     */
+    if (!R600GetDestFormat(pDstPicture, &tmp1))
+	return FALSE;
+
+    return TRUE;
+
+}
+
+/*
+ * EXA PrepareComposite hook for the R6xx/R7xx acceleration path.
+ *
+ * Records the source, mask and destination addresses/pitches/sizes in
+ * accel_state, assembles a small pixel shader in ps[] (source * mask when
+ * a mask is present, plain source otherwise), obtains an indirect buffer
+ * and emits the texture, shader, blend, render-target and interpolator
+ * state into it.  Vertices are appended afterwards by R600Composite().
+ *
+ * op            Render operator, used to index R600BlendOp[].
+ * pSrcPicture   source picture; pSrc is its pixmap.
+ * pMaskPicture  mask picture; pMask is its pixmap (NULL when unmasked).
+ * pDstPicture   destination picture; pDst is its pixmap.
+ *
+ * Returns TRUE when the state was emitted, FALSE to request a fallback.
+ * When texture setup fails after the indirect buffer has been obtained,
+ * the buffer is discarded before returning.
+ */
+static Bool R600PrepareComposite(int op, PicturePtr pSrcPicture,
+				 PicturePtr pMaskPicture, PicturePtr pDstPicture,
+				 PixmapPtr pSrc, PixmapPtr pMask, PixmapPtr pDst)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    uint32_t blendcntl, dst_format;
+    cb_config_t cb_conf;
+    shader_config_t vs_conf, ps_conf;
+    int i = 0;
+    /* The masked shader fills all 24 dwords, the unmasked one only 8, yet
+     * the whole array is copied into the indirect buffer below; start from
+     * zero so no uninitialised stack contents are copied.
+     */
+    uint32_t ps[24] = { 0 };
+
+    accel_state->has_mask = pMask ? TRUE : FALSE;
+
+    accel_state->dst_mc_addr = exaGetPixmapOffset(pDst) + info->fbLocation + pScrn->fbOffset;
+    accel_state->dst_pitch = exaGetPixmapPitch(pDst) / (pDst->drawable.bitsPerPixel / 8);
+    accel_state->dst_size = exaGetPixmapPitch(pDst) * pDst->drawable.height;
+
+    accel_state->src_mc_addr[0] = exaGetPixmapOffset(pSrc) + info->fbLocation + pScrn->fbOffset;
+    accel_state->src_pitch[0] = exaGetPixmapPitch(pSrc) / (pSrc->drawable.bitsPerPixel / 8);
+    accel_state->src_size[0] = exaGetPixmapPitch(pSrc) * pSrc->drawable.height;
+
+    /* Pitches (in pixels) must be multiples of 8 and base addresses must be
+     * 256-byte aligned; these checks run before any buffer is acquired.
+     */
+    if (accel_state->dst_pitch & 7)
+	RADEON_FALLBACK(("Bad dst pitch 0x%x\n", (int)accel_state->dst_pitch));
+
+    if (accel_state->dst_mc_addr & 0xff)
+	RADEON_FALLBACK(("Bad destination offset 0x%x\n", (int)accel_state->dst_mc_addr));
+
+    if (accel_state->src_pitch[0] & 7)
+	RADEON_FALLBACK(("Bad src pitch 0x%x\n", (int)accel_state->src_pitch[0]));
+
+    if (accel_state->src_mc_addr[0] & 0xff)
+	RADEON_FALLBACK(("Bad src offset 0x%x\n", (int)accel_state->src_mc_addr[0]));
+
+    if (!R600GetDestFormat(pDstPicture, &dst_format))
+	return FALSE;
+
+    if (pMask) {
+	int src_a, src_r, src_g, src_b;
+	int mask_a, mask_r, mask_g, mask_b;
+
+	accel_state->src_mc_addr[1] = exaGetPixmapOffset(pMask) + info->fbLocation + pScrn->fbOffset;
+	accel_state->src_pitch[1] = exaGetPixmapPitch(pMask) / (pMask->drawable.bitsPerPixel / 8);
+	accel_state->src_size[1] = exaGetPixmapPitch(pMask) * pMask->drawable.height;
+
+	if (accel_state->src_pitch[1] & 7)
+	    RADEON_FALLBACK(("Bad mask pitch 0x%x\n", (int)accel_state->src_pitch[1]));
+
+	if (accel_state->src_mc_addr[1] & 0xff)
+	    RADEON_FALLBACK(("Bad mask offset 0x%x\n", (int)accel_state->src_mc_addr[1]));
+
+	/* setup pixel shader: pick the destination swizzles used by the two
+	 * texture fetches so the per-channel MULs below yield src * mask.
+	 */
+	if (PICT_FORMAT_RGB(pSrcPicture->format) == 0) {
+	    //src_color = R300_ALU_RGB_0_0;
+	    src_r = SQ_SEL_0;
+	    src_g = SQ_SEL_0;
+	    src_b = SQ_SEL_0;
+	} else {
+	    //src_color = R300_ALU_RGB_SRC0_RGB;
+	    src_r = SQ_SEL_X;
+	    src_g = SQ_SEL_Y;
+	    src_b = SQ_SEL_Z;
+	}
+
+	if (PICT_FORMAT_A(pSrcPicture->format) == 0) {
+	    //src_alpha = R300_ALU_ALPHA_1_0;
+	    src_a = SQ_SEL_1;
+	} else {
+	    //src_alpha = R300_ALU_ALPHA_SRC0_A;
+	    src_a = SQ_SEL_W;
+	}
+
+	if (pMaskPicture->componentAlpha) {
+	    if (R600BlendOp[op].src_alpha) {
+		/* The blend op consumes source alpha: replicate it (or 1.0)
+		 * into every source channel.
+		 */
+		if (PICT_FORMAT_A(pSrcPicture->format) == 0) {
+		    //src_color = R300_ALU_RGB_1_0;
+		    //src_alpha = R300_ALU_ALPHA_1_0;
+		    src_r = SQ_SEL_1;
+		    src_g = SQ_SEL_1;
+		    src_b = SQ_SEL_1;
+		    src_a = SQ_SEL_1;
+		} else {
+		    //src_color = R300_ALU_RGB_SRC0_AAA;
+		    //src_alpha = R300_ALU_ALPHA_SRC0_A;
+		    src_r = SQ_SEL_W;
+		    src_g = SQ_SEL_W;
+		    src_b = SQ_SEL_W;
+		    src_a = SQ_SEL_W;
+		}
+
+		//mask_color = R300_ALU_RGB_SRC1_RGB;
+		mask_r = SQ_SEL_X;
+		mask_g = SQ_SEL_Y;
+		mask_b = SQ_SEL_Z;
+
+		if (PICT_FORMAT_A(pMaskPicture->format) == 0) {
+		    //mask_alpha = R300_ALU_ALPHA_1_0;
+		    mask_a = SQ_SEL_1;
+		} else {
+		    //mask_alpha = R300_ALU_ALPHA_SRC1_A;
+		    mask_a = SQ_SEL_W;
+		}
+	    } else {
+		//src_color = R300_ALU_RGB_SRC0_RGB;
+		src_r = SQ_SEL_X;
+		src_g = SQ_SEL_Y;
+		src_b = SQ_SEL_Z;
+
+		if (PICT_FORMAT_A(pSrcPicture->format) == 0) {
+		    //src_alpha = R300_ALU_ALPHA_1_0;
+		    src_a = SQ_SEL_1;
+		} else {
+		    //src_alpha = R300_ALU_ALPHA_SRC0_A;
+		    src_a = SQ_SEL_W;
+		}
+
+		//mask_color = R300_ALU_RGB_SRC1_RGB;
+		mask_r = SQ_SEL_X;
+		mask_g = SQ_SEL_Y;
+		mask_b = SQ_SEL_Z;
+
+		if (PICT_FORMAT_A(pMaskPicture->format) == 0) {
+		    //mask_alpha = R300_ALU_ALPHA_1_0;
+		    mask_a = SQ_SEL_1;
+		} else {
+		    //mask_alpha = R300_ALU_ALPHA_SRC1_A;
+		    mask_a = SQ_SEL_W;
+		}
+	    }
+	} else {
+	    /* Not component alpha: the mask contributes only its alpha,
+	     * replicated into all four channels (or 1.0 without alpha).
+	     */
+	    if (PICT_FORMAT_A(pMaskPicture->format) == 0) {
+		//mask_color = R300_ALU_RGB_1_0;
+		mask_r = SQ_SEL_1;
+		mask_g = SQ_SEL_1;
+		mask_b = SQ_SEL_1;
+	    } else {
+		//mask_color = R300_ALU_RGB_SRC1_AAA;
+		mask_r = SQ_SEL_W;
+		mask_g = SQ_SEL_W;
+		mask_b = SQ_SEL_W;
+	    }
+	    if (PICT_FORMAT_A(pMaskPicture->format) == 0) {
+		//mask_alpha = R300_ALU_ALPHA_1_0;
+		mask_a = SQ_SEL_1;
+	    } else {
+		//mask_alpha = R300_ALU_ALPHA_SRC1_A;
+		mask_a = SQ_SEL_W;
+	    }
+	}
+
+	//0 - CF: run the two texture fetches stored at slot 8
+	ps[i++] = CF_DWORD0(ADDR(8));
+	ps[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(2),
+			    CALL_COUNT(0),
+			    END_OF_PROGRAM(0),
+			    VALID_PIXEL_MODE(0),
+			    CF_INST(SQ_CF_INST_TEX),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(1));
+
+	// 1 - CF: run the four ALU instructions stored at slot 3
+	ps[i++] = CF_ALU_DWORD0(ADDR(3),
+				KCACHE_BANK0(0),
+				KCACHE_BANK1(0),
+				KCACHE_MODE0(SQ_CF_KCACHE_NOP));
+	ps[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
+				KCACHE_ADDR0(0),
+				KCACHE_ADDR1(0),
+				I_COUNT(4),
+				USES_WATERFALL(0),
+				CF_INST(SQ_CF_INST_ALU),
+				WHOLE_QUAD_MODE(0),
+				BARRIER(1));
+
+	//2 - CF: export gpr[2] as the pixel colour and end the program
+	ps[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
+					  TYPE(SQ_EXPORT_PIXEL),
+					  RW_GPR(2),
+					  RW_REL(ABSOLUTE),
+					  INDEX_GPR(0),
+					  ELEM_SIZE(1));
+
+	ps[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					       SRC_SEL_Y(SQ_SEL_Y),
+					       SRC_SEL_Z(SQ_SEL_Z),
+					       SRC_SEL_W(SQ_SEL_W),
+					       R6xx_ELEM_LOOP(0),
+					       BURST_COUNT(1),
+					       END_OF_PROGRAM(1),
+					       VALID_PIXEL_MODE(0),
+					       CF_INST(SQ_CF_INST_EXPORT_DONE),
+					       WHOLE_QUAD_MODE(0),
+					       BARRIER(1));
+
+	// 3 - alu 0
+	// MUL gpr[2].x gpr[1].x gpr[0].x
+	ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_X),
+			     SRC0_NEG(0),
+			     SRC1_SEL(0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_X),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_LOOP),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(0));
+	ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+				 SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 FOG_MERGE(0),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(2),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_X),
+				 CLAMP(1));
+	// 4 - alu 1
+	// MUL gpr[2].y gpr[1].y gpr[0].y
+	ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_Y),
+			     SRC0_NEG(0),
+			     SRC1_SEL(0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_Y),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_LOOP),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(0));
+	ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+				 SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 FOG_MERGE(0),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(2),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_Y),
+				 CLAMP(1));
+	// 5 - alu 2
+	// MUL gpr[2].z gpr[1].z gpr[0].z
+	ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_Z),
+			     SRC0_NEG(0),
+			     SRC1_SEL(0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_Z),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_LOOP),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(0));
+	ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+				 SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 FOG_MERGE(0),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(2),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_Z),
+				 CLAMP(1));
+	// 6 - alu 3
+	// MUL gpr[2].w gpr[1].w gpr[0].w
+	ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			     SRC0_REL(ABSOLUTE),
+			     SRC0_ELEM(ELEM_W),
+			     SRC0_NEG(0),
+			     SRC1_SEL(0),
+			     SRC1_REL(ABSOLUTE),
+			     SRC1_ELEM(ELEM_W),
+			     SRC1_NEG(0),
+			     INDEX_MODE(SQ_INDEX_LOOP),
+			     PRED_SEL(SQ_PRED_SEL_OFF),
+			     LAST(1));
+	ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+				 SRC0_ABS(0),
+				 SRC1_ABS(0),
+				 UPDATE_EXECUTE_MASK(0),
+				 UPDATE_PRED(0),
+				 WRITE_MASK(1),
+				 FOG_MERGE(0),
+				 OMOD(SQ_ALU_OMOD_OFF),
+				 ALU_INST(SQ_OP2_INST_MUL),
+				 BANK_SWIZZLE(SQ_ALU_VEC_012),
+				 DST_GPR(2),
+				 DST_REL(ABSOLUTE),
+				 DST_ELEM(ELEM_W),
+				 CLAMP(1));
+	// 7 - padding so the texture clause starts at slot 8
+	ps[i++] = 0x00000000;
+	ps[i++] = 0x00000000;
+
+	//8/9 - src
+	ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			     BC_FRAC_MODE(0),
+			     FETCH_WHOLE_QUAD(0),
+			     RESOURCE_ID(0),
+			     SRC_GPR(0),
+			     SRC_REL(ABSOLUTE),
+			     R7xx_ALT_CONST(0));
+	ps[i++] = TEX_DWORD1(DST_GPR(0),
+			     DST_REL(ABSOLUTE),
+			     DST_SEL_X(src_r),
+			     DST_SEL_Y(src_g),
+			     DST_SEL_Z(src_b),
+			     DST_SEL_W(src_a),
+			     LOD_BIAS(0),
+			     COORD_TYPE_X(TEX_NORMALIZED),
+			     COORD_TYPE_Y(TEX_NORMALIZED),
+			     COORD_TYPE_Z(TEX_NORMALIZED),
+			     COORD_TYPE_W(TEX_NORMALIZED));
+	ps[i++] = TEX_DWORD2(OFFSET_X(0),
+			     OFFSET_Y(0),
+			     OFFSET_Z(0),
+			     SAMPLER_ID(0),
+			     SRC_SEL_X(SQ_SEL_X),
+			     SRC_SEL_Y(SQ_SEL_Y),
+			     SRC_SEL_Z(SQ_SEL_0),
+			     SRC_SEL_W(SQ_SEL_1));
+	ps[i++] = TEX_DWORD_PAD;
+	//10/11 - mask
+	ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			     BC_FRAC_MODE(0),
+			     FETCH_WHOLE_QUAD(0),
+			     RESOURCE_ID(1),
+			     SRC_GPR(1),
+			     SRC_REL(ABSOLUTE),
+			     R7xx_ALT_CONST(0));
+	ps[i++] = TEX_DWORD1(DST_GPR(1),
+			     DST_REL(ABSOLUTE),
+			     DST_SEL_X(mask_r),
+			     DST_SEL_Y(mask_g),
+			     DST_SEL_Z(mask_b),
+			     DST_SEL_W(mask_a),
+			     LOD_BIAS(0),
+			     COORD_TYPE_X(TEX_NORMALIZED),
+			     COORD_TYPE_Y(TEX_NORMALIZED),
+			     COORD_TYPE_Z(TEX_NORMALIZED),
+			     COORD_TYPE_W(TEX_NORMALIZED));
+	ps[i++] = TEX_DWORD2(OFFSET_X(0),
+			     OFFSET_Y(0),
+			     OFFSET_Z(0),
+			     SAMPLER_ID(1),
+			     SRC_SEL_X(SQ_SEL_X),
+			     SRC_SEL_Y(SQ_SEL_Y),
+			     SRC_SEL_Z(SQ_SEL_0),
+			     SRC_SEL_W(SQ_SEL_1));
+	ps[i++] = TEX_DWORD_PAD;
+    } else {
+	int src_a, src_r, src_g, src_b;
+	/* setup pixel shader: a single fetch of the source, exported as is */
+	if (PICT_FORMAT_RGB(pSrcPicture->format) == 0) {
+	    //src_color = R300_ALU_RGB_0_0;
+	    src_r = SQ_SEL_0;
+	    src_g = SQ_SEL_0;
+	    src_b = SQ_SEL_0;
+	} else {
+	    //src_color = R300_ALU_RGB_SRC0_RGB;
+	    src_r = SQ_SEL_X;
+	    src_g = SQ_SEL_Y;
+	    src_b = SQ_SEL_Z;
+	}
+
+	if (PICT_FORMAT_A(pSrcPicture->format) == 0) {
+	    //src_alpha = R300_ALU_ALPHA_1_0;
+	    src_a = SQ_SEL_1;
+	} else {
+	    //src_alpha = R300_ALU_ALPHA_SRC0_A;
+	    src_a = SQ_SEL_W;
+	}
+
+	//0 - CF: run the texture fetch stored at slot 2
+	ps[i++] = CF_DWORD0(ADDR(2));
+	ps[i++] = CF_DWORD1(POP_COUNT(0),
+			    CF_CONST(0),
+			    COND(SQ_CF_COND_ACTIVE),
+			    I_COUNT(1),
+			    CALL_COUNT(0),
+			    END_OF_PROGRAM(0),
+			    VALID_PIXEL_MODE(0),
+			    CF_INST(SQ_CF_INST_TEX),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(1));
+	//1 - CF: export gpr[0] as the pixel colour and end the program
+	ps[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
+					  TYPE(SQ_EXPORT_PIXEL),
+					  RW_GPR(0),
+					  RW_REL(ABSOLUTE),
+					  INDEX_GPR(0),
+					  ELEM_SIZE(1));
+
+	ps[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					       SRC_SEL_Y(SQ_SEL_Y),
+					       SRC_SEL_Z(SQ_SEL_Z),
+					       SRC_SEL_W(SQ_SEL_W),
+					       R6xx_ELEM_LOOP(0),
+					       BURST_COUNT(1),
+					       END_OF_PROGRAM(1),
+					       VALID_PIXEL_MODE(0),
+					       CF_INST(SQ_CF_INST_EXPORT_DONE),
+					       WHOLE_QUAD_MODE(0),
+					       BARRIER(1));
+
+
+	//2/3 - src
+	ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			     BC_FRAC_MODE(0),
+			     FETCH_WHOLE_QUAD(0),
+			     RESOURCE_ID(0),
+			     SRC_GPR(0),
+			     SRC_REL(ABSOLUTE),
+			     R7xx_ALT_CONST(0));
+	ps[i++] = TEX_DWORD1(DST_GPR(0),
+			     DST_REL(ABSOLUTE),
+			     DST_SEL_X(src_r),
+			     DST_SEL_Y(src_g),
+			     DST_SEL_Z(src_b),
+			     DST_SEL_W(src_a),
+			     LOD_BIAS(0),
+			     COORD_TYPE_X(TEX_NORMALIZED),
+			     COORD_TYPE_Y(TEX_NORMALIZED),
+			     COORD_TYPE_Z(TEX_NORMALIZED),
+			     COORD_TYPE_W(TEX_NORMALIZED));
+	ps[i++] = TEX_DWORD2(OFFSET_X(0),
+			     OFFSET_Y(0),
+			     OFFSET_Z(0),
+			     SAMPLER_ID(0),
+			     SRC_SEL_X(SQ_SEL_X),
+			     SRC_SEL_Y(SQ_SEL_Y),
+			     SRC_SEL_Z(SQ_SEL_0),
+			     SRC_SEL_W(SQ_SEL_1));
+	ps[i++] = TEX_DWORD_PAD;
+    }
+
+    CLEAR (cb_conf);
+    CLEAR (vs_conf);
+    CLEAR (ps_conf);
+
+    /* From here on an indirect buffer is held; every failure path below
+     * must hand it back.
+     */
+    accel_state->ib = RADEONCPGetBuffer(pScrn);
+
+    /* Init */
+    start_3d(pScrn, accel_state->ib);
+
+    set_default_state(pScrn, accel_state->ib);
+
+    /* Scissor / viewport */
+    ereg  (accel_state->ib, PA_CL_VTE_CNTL,                      VTX_XY_FMT_bit);
+    ereg  (accel_state->ib, PA_CL_CLIP_CNTL,                     CLIP_DISABLE_bit);
+
+    /* Texture setup can still reject the picture; discard the buffer so it
+     * is not leaked when falling back.
+     */
+    if (!R600TextureSetup(pSrcPicture, pSrc, 0)) {
+	R600IBDiscard(pScrn, accel_state->ib);
+	return FALSE;
+    }
+
+    if (pMask != NULL) {
+	if (!R600TextureSetup(pMaskPicture, pMask, 1)) {
+	    R600IBDiscard(pScrn, accel_state->ib);
+	    return FALSE;
+	}
+    } else {
+	accel_state->is_transform[1] = FALSE;
+    }
+
+    /* The vertex shader is one of the preloaded shaders (addressed via
+     * accel_state->shaders->offset); the masked variant is selected when a
+     * mask is present.
+     */
+    if (pMask != NULL)
+	accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->comp_mask_vs_offset;
+    else
+	accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	    accel_state->comp_vs_offset;
+
+    /* The pixel shader built above is placed inside the indirect buffer
+     * itself, 256 bytes before its midpoint; ps_mc_addr is the matching
+     * GART address of that location.
+     */
+    memcpy ((char *)accel_state->ib->address + (accel_state->ib->total / 2) - 256, ps, sizeof(ps));
+    accel_state->ps_mc_addr = info->gartLocation + info->dri->bufStart +
+	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2) - 256;
+
+    accel_state->vs_size = 512;
+    accel_state->ps_size = 512;
+
+    /* Shader */
+
+    /* flush SQ cache */
+    cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
+			accel_state->vs_size, accel_state->vs_mc_addr);
+
+    vs_conf.shader_addr         = accel_state->vs_mc_addr;
+    vs_conf.num_gprs            = 3;
+    vs_conf.stack_size          = 0;
+    vs_setup                    (pScrn, accel_state->ib, &vs_conf);
+
+    /* flush SQ cache */
+    cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
+			accel_state->ps_size, accel_state->ps_mc_addr);
+
+    ps_conf.shader_addr         = accel_state->ps_mc_addr;
+    ps_conf.num_gprs            = 3;
+    ps_conf.stack_size          = 0;
+    ps_conf.uncached_first_inst = 1;
+    ps_conf.clamp_consts        = 0;
+    ps_conf.export_mode         = 2;
+    ps_setup                    (pScrn, accel_state->ib, &ps_conf);
+
+    ereg  (accel_state->ib, CB_SHADER_MASK,                      (0xf << OUTPUT0_ENABLE_shift));
+    ereg  (accel_state->ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
+
+    blendcntl = R600GetBlendCntl(op, pMaskPicture, pDstPicture->format);
+
+    if (info->ChipFamily == CHIP_FAMILY_R600) {
+	// no per-MRT blend on R600
+	ereg  (accel_state->ib, CB_COLOR_CONTROL,                    RADEON_ROP[3] | (1 << TARGET_BLEND_ENABLE_shift));
+	ereg  (accel_state->ib, CB_BLEND_CONTROL,                    blendcntl);
+    } else {
+	ereg  (accel_state->ib, CB_COLOR_CONTROL,                    (RADEON_ROP[3] |
+								      (1 << TARGET_BLEND_ENABLE_shift) |
+								      PER_MRT_BLEND_bit));
+	ereg  (accel_state->ib, CB_BLEND0_CONTROL,                   blendcntl);
+    }
+
+    /* Render target: colour buffer 0 is the destination pixmap. */
+    cb_conf.id = 0;
+    cb_conf.w = accel_state->dst_pitch;
+    cb_conf.h = pDst->drawable.height;
+    cb_conf.base = accel_state->dst_mc_addr;
+    cb_conf.format = dst_format;
+
+    switch (pDstPicture->format) {
+    case PICT_a8r8g8b8:
+	//ErrorF("dst: PICT_a8r8g8b8\n");
+	cb_conf.comp_swap = 1; //ARGB
+	break;
+    case PICT_x8r8g8b8:
+	//ErrorF("dst: PICT_x8r8g8b8\n");
+	cb_conf.comp_swap = 1; //ARGB
+	break;
+    case PICT_r5g6b5:
+	//ErrorF("dst: PICT_r5g6b5\n");
+	cb_conf.comp_swap = 2; //RGB
+	break;
+    case PICT_a1r5g5b5:
+	//ErrorF("dst: PICT_a1r5g5b5\n");
+	cb_conf.comp_swap = 1; //ARGB
+	break;
+    case PICT_x1r5g5b5:
+	//ErrorF("dst: PICT_x1r5g5b5\n");
+	cb_conf.comp_swap = 1; //ARGB
+	break;
+    case PICT_a8:
+	//ErrorF("dst: PICT_a8\n");
+	cb_conf.comp_swap = 3; //A
+	break;
+    default:
+	cb_conf.comp_swap = 1;
+	break;
+    }
+    cb_conf.source_format = 1;
+    cb_conf.blend_clamp = 1;
+    set_render_target(pScrn, accel_state->ib, &cb_conf);
+
+    ereg  (accel_state->ib, PA_SU_SC_MODE_CNTL,                  (FACE_bit			|
+						 (POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift)	|
+						 (POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift)));
+    ereg  (accel_state->ib, DB_SHADER_CONTROL,                   ((1 << Z_ORDER_shift)		| /* EARLY_Z_THEN_LATE_Z */
+						 DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
+
+    /* Interpolator setup */
+    if (pMask) {
+	// export 2 tex coords from VS
+	ereg  (accel_state->ib, SPI_VS_OUT_CONFIG, ((2 - 1) << VS_EXPORT_COUNT_shift));
+	// src = semantic id 0; mask = semantic id 1
+	ereg  (accel_state->ib, SPI_VS_OUT_ID_0, ((0 << SEMANTIC_0_shift) |
+						  (1 << SEMANTIC_1_shift)));
+	// input 2 tex coords from VS
+	ereg  (accel_state->ib, SPI_PS_IN_CONTROL_0, (2 << NUM_INTERP_shift));
+    } else {
+	// export 1 tex coords from VS
+	ereg  (accel_state->ib, SPI_VS_OUT_CONFIG, ((1 - 1) << VS_EXPORT_COUNT_shift));
+	// src = semantic id 0
+	ereg  (accel_state->ib, SPI_VS_OUT_ID_0,   (0 << SEMANTIC_0_shift));
+	// input 1 tex coords from VS
+	ereg  (accel_state->ib, SPI_PS_IN_CONTROL_0, (1 << NUM_INTERP_shift));
+    }
+    ereg  (accel_state->ib, SPI_PS_IN_CONTROL_1,                 0);
+    // SPI_PS_INPUT_CNTL_0 maps to GPR[0] - load with semantic id 0
+    ereg  (accel_state->ib, SPI_PS_INPUT_CNTL_0 + (0 <<2),       ((0    << SEMANTIC_shift)	|
+								  (0x01 << DEFAULT_VAL_shift)	|
+								  SEL_CENTROID_bit));
+    // SPI_PS_INPUT_CNTL_1 maps to GPR[1] - load with semantic id 1
+    ereg  (accel_state->ib, SPI_PS_INPUT_CNTL_0 + (1 <<2),       ((1    << SEMANTIC_shift)	|
+								  (0x01 << DEFAULT_VAL_shift)	|
+								  SEL_CENTROID_bit));
+    ereg  (accel_state->ib, SPI_INTERP_CONTROL_0,                0);
+
+    /* No vertices queued yet; R600Composite() appends them. */
+    accel_state->vb_index = 0;
+
+    return TRUE;
+}
+
+/*
+ * EXA Composite hook: queue one rectangle for the operation set up by
+ * R600PrepareComposite().
+ *
+ * srcX/srcY    top-left corner of the rectangle in source coordinates.
+ * maskX/maskY  top-left corner in mask coordinates (read only when
+ *              accel_state->has_mask is set).
+ * dstX/dstY    top-left corner in destination coordinates.
+ * w, h         rectangle size in pixels.
+ *
+ * Three vertices (top-left, bottom-left, bottom-right) are appended to the
+ * vertex buffer that starts at the midpoint of the indirect buffer.  Each
+ * carries the destination position plus source (and mask) coordinates
+ * divided by the texture width/height stored in accel_state.  Nothing is
+ * drawn here; R600DoneComposite() submits the queued vertices.
+ *
+ * NOTE(review): vb_index grows without any bounds check against the space
+ * available in the indirect buffer - confirm the caller cannot overflow it.
+ */
+static void R600Composite(PixmapPtr pDst,
+			  int srcX, int srcY,
+			  int maskX, int maskY,
+			  int dstX, int dstY,
+			  int w, int h)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight;
+
+    /* ErrorF("R600Composite (%d,%d) (%d,%d) (%d,%d) (%d,%d)\n",
+       srcX, srcY, maskX, maskY,dstX, dstY, w, h); */
+
+    /* Corners of the source rectangle in fixed point.  The top-right
+     * corner is computed and transformed but not emitted below.
+     */
+    srcTopLeft.x     = IntToxFixed(srcX);
+    srcTopLeft.y     = IntToxFixed(srcY);
+    srcTopRight.x    = IntToxFixed(srcX + w);
+    srcTopRight.y    = IntToxFixed(srcY);
+    srcBottomLeft.x  = IntToxFixed(srcX);
+    srcBottomLeft.y  = IntToxFixed(srcY + h);
+    srcBottomRight.x = IntToxFixed(srcX + w);
+    srcBottomRight.y = IntToxFixed(srcY + h);
+
+    //XXX do transform in vertex shader
+    if (accel_state->is_transform[0]) {
+	transformPoint(accel_state->transform[0], &srcTopLeft);
+	transformPoint(accel_state->transform[0], &srcTopRight);
+	transformPoint(accel_state->transform[0], &srcBottomLeft);
+	transformPoint(accel_state->transform[0], &srcBottomRight);
+    }
+
+    if (accel_state->has_mask) {
+	/* Vertex layout with mask: x, y, src_s, src_t, mask_s, mask_t */
+	struct r6xx_comp_mask_vertex *comp_vb =
+	    (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+	struct r6xx_comp_mask_vertex vertex[3];
+	xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight;
+
+	maskTopLeft.x     = IntToxFixed(maskX);
+	maskTopLeft.y     = IntToxFixed(maskY);
+	maskTopRight.x    = IntToxFixed(maskX + w);
+	maskTopRight.y    = IntToxFixed(maskY);
+	maskBottomLeft.x  = IntToxFixed(maskX);
+	maskBottomLeft.y  = IntToxFixed(maskY + h);
+	maskBottomRight.x = IntToxFixed(maskX + w);
+	maskBottomRight.y = IntToxFixed(maskY + h);
+
+	if (accel_state->is_transform[1]) {
+	    transformPoint(accel_state->transform[1], &maskTopLeft);
+	    transformPoint(accel_state->transform[1], &maskTopRight);
+	    transformPoint(accel_state->transform[1], &maskBottomLeft);
+	    transformPoint(accel_state->transform[1], &maskBottomRight);
+	}
+
+	/* top-left */
+	vertex[0].x = (float)dstX;
+	vertex[0].y = (float)dstY;
+	vertex[0].src_s = xFixedToFloat(srcTopLeft.x) / accel_state->texW[0];
+	vertex[0].src_t = xFixedToFloat(srcTopLeft.y) / accel_state->texH[0];
+	vertex[0].mask_s = xFixedToFloat(maskTopLeft.x) / accel_state->texW[1];
+	vertex[0].mask_t = xFixedToFloat(maskTopLeft.y) / accel_state->texH[1];
+
+	/* bottom-left */
+	vertex[1].x = (float)dstX;
+	vertex[1].y = (float)(dstY + h);
+	vertex[1].src_s = xFixedToFloat(srcBottomLeft.x) / accel_state->texW[0];
+	vertex[1].src_t = xFixedToFloat(srcBottomLeft.y) / accel_state->texH[0];
+	vertex[1].mask_s = xFixedToFloat(maskBottomLeft.x) / accel_state->texW[1];
+	vertex[1].mask_t = xFixedToFloat(maskBottomLeft.y) / accel_state->texH[1];
+
+	/* bottom-right */
+	vertex[2].x = (float)(dstX + w);
+	vertex[2].y = (float)(dstY + h);
+	vertex[2].src_s = xFixedToFloat(srcBottomRight.x) / accel_state->texW[0];
+	vertex[2].src_t = xFixedToFloat(srcBottomRight.y) / accel_state->texH[0];
+	vertex[2].mask_s = xFixedToFloat(maskBottomRight.x) / accel_state->texW[1];
+	vertex[2].mask_t = xFixedToFloat(maskBottomRight.y) / accel_state->texH[1];
+
+#ifdef SHOW_VERTEXES
+	/* x and y are stored as floats, so they are printed with %f */
+	ErrorF("vertex 0: %f, %f, %f, %f, %f, %f\n", vertex[0].x, vertex[0].y,
+	       vertex[0].src_s, vertex[0].src_t, vertex[0].mask_s, vertex[0].mask_t);
+	ErrorF("vertex 1: %f, %f, %f, %f, %f, %f\n", vertex[1].x, vertex[1].y,
+	       vertex[1].src_s, vertex[1].src_t, vertex[1].mask_s, vertex[1].mask_t);
+	ErrorF("vertex 2: %f, %f, %f, %f, %f, %f\n", vertex[2].x, vertex[2].y,
+	       vertex[2].src_s, vertex[2].src_t,  vertex[2].mask_s, vertex[2].mask_t);
+#endif
+
+	// append to vertex buffer
+	comp_vb[accel_state->vb_index++] = vertex[0];
+	comp_vb[accel_state->vb_index++] = vertex[1];
+	comp_vb[accel_state->vb_index++] = vertex[2];
+
+    } else {
+	/* Vertex layout without mask: x, y, src_s, src_t */
+	struct r6xx_comp_vertex *comp_vb =
+	    (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+	struct r6xx_comp_vertex vertex[3];
+
+	/* top-left */
+	vertex[0].x = (float)dstX;
+	vertex[0].y = (float)dstY;
+	vertex[0].src_s = xFixedToFloat(srcTopLeft.x) / accel_state->texW[0];
+	vertex[0].src_t = xFixedToFloat(srcTopLeft.y) / accel_state->texH[0];
+
+	/* bottom-left */
+	vertex[1].x = (float)dstX;
+	vertex[1].y = (float)(dstY + h);
+	vertex[1].src_s = xFixedToFloat(srcBottomLeft.x) / accel_state->texW[0];
+	vertex[1].src_t = xFixedToFloat(srcBottomLeft.y) / accel_state->texH[0];
+
+	/* bottom-right */
+	vertex[2].x = (float)(dstX + w);
+	vertex[2].y = (float)(dstY + h);
+	vertex[2].src_s = xFixedToFloat(srcBottomRight.x) / accel_state->texW[0];
+	vertex[2].src_t = xFixedToFloat(srcBottomRight.y) / accel_state->texH[0];
+
+	// append to vertex buffer
+	comp_vb[accel_state->vb_index++] = vertex[0];
+	comp_vb[accel_state->vb_index++] = vertex[1];
+	comp_vb[accel_state->vb_index++] = vertex[2];
+
+#ifdef SHOW_VERTEXES
+	/* x and y are stored as floats, so they are printed with %f */
+	ErrorF("vertex 0: %f, %f, %f, %f\n", vertex[0].x, vertex[0].y, vertex[0].src_s, vertex[0].src_t);
+	ErrorF("vertex 1: %f, %f, %f, %f\n", vertex[1].x, vertex[1].y, vertex[1].src_s, vertex[1].src_t);
+	ErrorF("vertex 2: %f, %f, %f, %f\n", vertex[2].x, vertex[2].y, vertex[2].src_s, vertex[2].src_t);
+#endif
+    }
+
+
+}
+
+/*
+ * EXA DoneComposite hook: submit the vertices queued by R600Composite().
+ *
+ * With no queued vertices the indirect buffer is discarded.  Otherwise the
+ * vertex resource is described (24 bytes per vertex with a mask, 16
+ * without), the relevant cache is synced, one auto-indexed RECTLIST draw
+ * is emitted, and after an idle wait and a colour-buffer sync the
+ * indirect buffer is flushed.
+ */
+static void R600DoneComposite(PixmapPtr pDst)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    draw_config_t   draw_conf;
+    vtx_resource_t  vtx_res;
+    int             vtx_bytes;
+
+    CLEAR (draw_conf);
+    CLEAR (vtx_res);
+
+    /* Nothing was queued: hand the buffer back unused. */
+    if (accel_state->vb_index == 0) {
+	R600IBDiscard(pScrn, accel_state->ib);
+	return;
+    }
+
+    /* The vertex data sits in the second half of the indirect buffer. */
+    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
+	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2);
+
+    /* Vertex buffer setup: only the per-vertex size depends on the mask */
+    vtx_bytes = accel_state->has_mask ? 24 : 16;
+
+    accel_state->vb_size    = accel_state->vb_index * vtx_bytes;
+    vtx_res.id              = SQ_VTX_RESOURCE_vs;
+    vtx_res.vtx_size_dw     = vtx_bytes / 4;
+    vtx_res.vtx_num_entries = accel_state->vb_size / 4;
+    vtx_res.mem_req_size    = 1;
+    vtx_res.vb_addr         = accel_state->vb_mc_addr;
+
+    /* flush vertex cache; the four families listed here sync with
+     * TC_ACTION_ENA_bit, every other family with VC_ACTION_ENA_bit
+     */
+    switch (info->ChipFamily) {
+    case CHIP_FAMILY_RV610:
+    case CHIP_FAMILY_RV620:
+    case CHIP_FAMILY_RS780:
+    case CHIP_FAMILY_RV710:
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+	break;
+    default:
+	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+	break;
+    }
+
+    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
+
+    /* One auto-indexed rectangle-list draw over all queued vertices. */
+    draw_conf.prim_type          = DI_PT_RECTLIST;
+    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
+    draw_conf.num_instances      = 1;
+    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
+    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
+
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
+
+    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
+    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
+    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
+
+    draw_auto(pScrn, accel_state->ib, &draw_conf);
+
+    wait_3d_idle_clean(pScrn, accel_state->ib);
+
+    /* Sync the colour buffer over the destination range before flushing. */
+    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
+			accel_state->dst_size, accel_state->dst_mc_addr);
+
+    R600CPFlushIndirect(pScrn, accel_state->ib);
+}
+
+/*
+ * EXA UploadToScreen hook: copy a w x h pixel rectangle from system
+ * memory (src, src_pitch bytes per row) into pixmap pDst at (x, y), one
+ * row per memcpy through the mapping at info->FB.
+ *
+ * Always returns TRUE.
+ */
+static Bool
+R600UploadToScreen(PixmapPtr pDst, int x, int y, int w, int h,
+		   char *src, int src_pitch)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pDst->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    uint8_t *fb_row = (pointer)((char *)info->FB + exaGetPixmapOffset(pDst));
+    int fb_pitch = exaGetPixmapPitch(pDst);
+    int bits = pDst->drawable.bitsPerPixel;
+    int row_bytes = w * (bits / 8);
+
+    /* Move to the first destination byte of the rectangle. */
+    fb_row += (x * bits / 8) + (y * fb_pitch);
+
+    for (; h != 0; h--) {
+	memcpy(fb_row, src, row_bytes);
+	fb_row += fb_pitch;
+	src += src_pitch;
+    }
+
+    return TRUE;
+}
+
+/*
+ * EXA DownloadFromScreen hook: copy a w x h pixel rectangle at (x, y) of
+ * pixmap pSrc into system memory (dst, dst_pitch bytes per row), one row
+ * per memcpy through the mapping at info->FB.
+ *
+ * Always returns TRUE.
+ */
+static Bool
+R600DownloadFromScreen(PixmapPtr pSrc, int x, int y, int w, int h,
+		       char *dst, int dst_pitch)
+{
+    ScrnInfoPtr pScrn = xf86Screens[pSrc->drawable.pScreen->myNum];
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    uint8_t *fb_row = (pointer)((char *)info->FB + exaGetPixmapOffset(pSrc));
+    int fb_pitch = exaGetPixmapPitch(pSrc);
+    int bits = pSrc->drawable.bitsPerPixel;
+    int row_bytes = w * (bits / 8);
+
+    /* Move to the first source byte of the rectangle. */
+    fb_row += (x * bits / 8) + (y * fb_pitch);
+
+    for (; h != 0; h--) {
+	memcpy(dst, fb_row, row_bytes);
+	fb_row += fb_pitch;
+	dst += dst_pitch;
+    }
+
+    return TRUE;
+}
+
+/*
+ * Advance the per-screen EXA sync marker counter and hand the new value
+ * back to the caller.
+ */
+static int
+R600MarkSync(ScreenPtr pScreen)
+{
+    ScrnInfoPtr scrn = xf86Screens[pScreen->myNum];
+    struct radeon_accel_state *state = RADEONPTR(scrn)->accel_state;
+
+    /* Pre-increment: the value returned is the freshly bumped marker. */
+    return ++state->exaSyncMarker;
+}
+
+/*
+ * Record `marker` as the most recently synced EXA marker for this screen.
+ *
+ * NOTE(review): only the bookkeeping field is updated here; no wait on
+ * the hardware is visible in this function -- confirm that is intended.
+ */
+static void
+R600Sync(ScreenPtr pScreen, int marker)
+{
+    ScrnInfoPtr scrn = xf86Screens[pScreen->myNum];
+    struct radeon_accel_state *state = RADEONPTR(scrn)->accel_state;
+
+    /* Storing unconditionally leaves the field equal to marker either way. */
+    state->exaMarkerSynced = marker;
+}
+
+static Bool
+R600LoadShaders(ScrnInfoPtr pScrn, ScreenPtr pScreen)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    uint32_t *vs;
+    uint32_t *ps;
+    // 512 bytes per shader for now
+    int size = 512 * 10;
+    int i;
+
+    accel_state->shaders = NULL;
+
+    accel_state->shaders = exaOffscreenAlloc(pScreen, size, 256,
+					     TRUE, NULL, NULL);
+
+    if (accel_state->shaders == NULL)
+	return FALSE;
+
+    vs = (pointer)((char *)info->FB + accel_state->shaders->offset);
+    ps = (pointer)((char *)info->FB + accel_state->shaders->offset);
+    accel_state->solid_vs_offset = 0;
+    accel_state->solid_ps_offset = 512;
+    accel_state->copy_vs_offset = 1024;
+    accel_state->copy_ps_offset = 1536;
+    accel_state->comp_vs_offset = 2048;
+    accel_state->comp_ps_offset = 2560;
+    accel_state->comp_mask_vs_offset = 3072;
+    accel_state->comp_mask_ps_offset = 3584;
+    accel_state->xv_vs_offset = 4096;
+    accel_state->xv_ps_offset = 4608;
+
+    // solid vs ---------------------------------------
+    i = accel_state->solid_vs_offset / 4;
+    //0
+    vs[i++] = CF_DWORD0(ADDR(4));
+    vs[i++] = CF_DWORD1(POP_COUNT(0),
+			CF_CONST(0),
+			COND(SQ_CF_COND_ACTIVE),
+			I_COUNT(1),
+			CALL_COUNT(0),
+			END_OF_PROGRAM(0),
+			VALID_PIXEL_MODE(0),
+			CF_INST(SQ_CF_INST_VTX),
+			WHOLE_QUAD_MODE(0),
+			BARRIER(1));
+    //1
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
+				      TYPE(SQ_EXPORT_POS),
+				      RW_GPR(1),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(0),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(1));
+    //2 - always export a param whether it's used or not
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
+				      TYPE(SQ_EXPORT_PARAM),
+				      RW_GPR(0),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(0),
+					   END_OF_PROGRAM(1),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(0));
+    //3 - padding
+    vs[i++] = 0x00000000;
+    vs[i++] = 0x00000000;
+    //4/5
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(8));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(1),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(0),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(1));
+    vs[i++] = VTX_DWORD_PAD;
+
+    // solid ps ---------------------------------------
+    i = accel_state->solid_ps_offset / 4;
+    // 0
+    ps[i++] = CF_ALU_DWORD0(ADDR(2),
+			    KCACHE_BANK0(0),
+			    KCACHE_BANK1(0),
+			    KCACHE_MODE0(0));
+    ps[i++] = CF_ALU_DWORD1(KCACHE_MODE1(0),
+			    KCACHE_ADDR0(0),
+			    KCACHE_ADDR1(0),
+			    I_COUNT(4),
+			    USES_WATERFALL(0),
+			    CF_INST(SQ_CF_INST_ALU),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(1));
+    // 1
+    ps[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
+				      TYPE(SQ_EXPORT_PIXEL),
+				      RW_GPR(0),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(1));
+    ps[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(1),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(1));
+
+    // 2
+    ps[i++] = ALU_DWORD0(SRC0_SEL(256),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_X),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_AR_X),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(0),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_X),
+			     CLAMP(1));
+    // 3
+    ps[i++] = ALU_DWORD0(SRC0_SEL(256),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Y),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Y),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_AR_X),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(0),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Y),
+			     CLAMP(1));
+    // 4
+    ps[i++] = ALU_DWORD0(SRC0_SEL(256),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Z),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Z),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_AR_X),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(0),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Z),
+			     CLAMP(1));
+    // 5
+    ps[i++] = ALU_DWORD0(SRC0_SEL(256),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_W),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_W),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_AR_X),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(0),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_W),
+			     CLAMP(1));
+
+    // copy vs ---------------------------------------
+    i = accel_state->copy_vs_offset / 4;
+    //0
+    vs[i++] = CF_DWORD0(ADDR(4));
+    vs[i++] = CF_DWORD1(POP_COUNT(0),
+			CF_CONST(0),
+			COND(SQ_CF_COND_ACTIVE),
+			I_COUNT(2),
+			CALL_COUNT(0),
+			END_OF_PROGRAM(0),
+			VALID_PIXEL_MODE(0),
+			CF_INST(SQ_CF_INST_VTX),
+			WHOLE_QUAD_MODE(0),
+			BARRIER(1));
+    //1
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
+				      TYPE(SQ_EXPORT_POS),
+				      RW_GPR(1),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(0),
+					   END_OF_PROGRAM(0),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(1));
+    //2
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
+				      TYPE(SQ_EXPORT_PARAM),
+				      RW_GPR(0),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(0),
+					   END_OF_PROGRAM(1),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(0));
+    //3
+    vs[i++] = 0x00000000;
+    vs[i++] = 0x00000000;
+    //4/5
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(16));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(1),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(0),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(1));
+    vs[i++] = VTX_DWORD_PAD;
+    //6/7
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(8));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(0),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(8),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(0));
+    vs[i++] = VTX_DWORD_PAD;
+
+    // copy ps ---------------------------------------
+    i = accel_state->copy_ps_offset / 4;
+    // CF INST 0
+    ps[i++] = CF_DWORD0(ADDR(2));
+    ps[i++] = CF_DWORD1(POP_COUNT(0),
+			CF_CONST(0),
+			COND(SQ_CF_COND_ACTIVE),
+			I_COUNT(1),
+			CALL_COUNT(0),
+			END_OF_PROGRAM(0),
+			VALID_PIXEL_MODE(0),
+			CF_INST(SQ_CF_INST_TEX),
+			WHOLE_QUAD_MODE(0),
+			BARRIER(1));
+    // CF INST 1
+    ps[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
+				      TYPE(SQ_EXPORT_PIXEL),
+				      RW_GPR(0),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(1));
+    ps[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(1),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(1));
+    // TEX INST 0
+    ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			 BC_FRAC_MODE(0),
+			 FETCH_WHOLE_QUAD(0),
+			 RESOURCE_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 R7xx_ALT_CONST(0));
+    ps[i++] = TEX_DWORD1(DST_GPR(0),
+			 DST_REL(ABSOLUTE),
+			 DST_SEL_X(SQ_SEL_X), //R
+			 DST_SEL_Y(SQ_SEL_Y), //G
+			 DST_SEL_Z(SQ_SEL_Z), //B
+			 DST_SEL_W(SQ_SEL_W), //A
+			 LOD_BIAS(0),
+			 COORD_TYPE_X(TEX_UNNORMALIZED),
+			 COORD_TYPE_Y(TEX_UNNORMALIZED),
+			 COORD_TYPE_Z(TEX_UNNORMALIZED),
+			 COORD_TYPE_W(TEX_UNNORMALIZED));
+    ps[i++] = TEX_DWORD2(OFFSET_X(0),
+			 OFFSET_Y(0),
+			 OFFSET_Z(0),
+			 SAMPLER_ID(0),
+			 SRC_SEL_X(SQ_SEL_X),
+			 SRC_SEL_Y(SQ_SEL_Y),
+			 SRC_SEL_Z(SQ_SEL_0),
+			 SRC_SEL_W(SQ_SEL_1));
+    ps[i++] = TEX_DWORD_PAD;
+
+    // xv vs ---------------------------------------
+    i = accel_state->xv_vs_offset / 4;
+    //0
+    vs[i++] = CF_DWORD0(ADDR(4));
+    vs[i++] = CF_DWORD1(POP_COUNT(0),
+			CF_CONST(0),
+			COND(SQ_CF_COND_ACTIVE),
+			I_COUNT(2),
+			CALL_COUNT(0),
+			END_OF_PROGRAM(0),
+			VALID_PIXEL_MODE(0),
+			CF_INST(SQ_CF_INST_VTX),
+			WHOLE_QUAD_MODE(0),
+			BARRIER(1));
+    //1
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
+				      TYPE(SQ_EXPORT_POS),
+				      RW_GPR(1),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(0),
+					   END_OF_PROGRAM(0),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(1));
+    //2
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
+				      TYPE(SQ_EXPORT_PARAM),
+				      RW_GPR(0),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(0),
+					   END_OF_PROGRAM(1),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(0));
+    //3
+    vs[i++] = 0x00000000;
+    vs[i++] = 0x00000000;
+    //4/5
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(16));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(1),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(0),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(1));
+    vs[i++] = VTX_DWORD_PAD;
+    //6/7
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(8));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(0),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(8),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(0));
+    vs[i++] = VTX_DWORD_PAD;
+
+    // xv ps ---------------------------------------
+    i = accel_state->xv_ps_offset / 4;
+    // 0
+    ps[i++] = CF_DWORD0(ADDR(20));
+    ps[i++] = CF_DWORD1(POP_COUNT(0),
+			CF_CONST(0),
+			COND(SQ_CF_COND_ACTIVE),
+			I_COUNT(2),
+			CALL_COUNT(0),
+			END_OF_PROGRAM(0),
+			VALID_PIXEL_MODE(0),
+			CF_INST(SQ_CF_INST_TEX),
+			WHOLE_QUAD_MODE(0),
+			BARRIER(0));
+    // 1
+    ps[i++] = CF_ALU_DWORD0(ADDR(3),
+			    KCACHE_BANK0(0),
+			    KCACHE_BANK1(0),
+			    KCACHE_MODE0(SQ_CF_KCACHE_NOP));
+    ps[i++] = CF_ALU_DWORD1(KCACHE_MODE1(SQ_CF_KCACHE_NOP),
+			    KCACHE_ADDR0(0),
+			    KCACHE_ADDR1(0),
+			    I_COUNT(16),
+			    USES_WATERFALL(0),
+			    CF_INST(SQ_CF_INST_ALU),
+			    WHOLE_QUAD_MODE(0),
+			    BARRIER(1));
+    // 2
+    ps[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
+				      TYPE(SQ_EXPORT_PIXEL),
+				      RW_GPR(3),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(3));
+    ps[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(1),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(1));
+    // 3 - alu 0
+    // DP4 gpr[2].x gpr[1].x c[0].x
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_X),
+			 SRC0_NEG(0),
+			 SRC1_SEL(256),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_X),
+			     CLAMP(1));
+    // 4 - alu 1
+    // DP4 gpr[2].y gpr[1].y c[0].y
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Y),
+			 SRC0_NEG(0),
+			 SRC1_SEL(256),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Y),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Y),
+			     CLAMP(1));
+    // 5 - alu 2
+    // DP4 gpr[2].z gpr[1].z c[0].z
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Z),
+			 SRC0_NEG(0),
+			 SRC1_SEL(256),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Z),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Z),
+			     CLAMP(1));
+    // 6 - alu 3
+    // DP4 gpr[2].w gpr[1].w c[0].w
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_W),
+			 SRC0_NEG(0),
+			 SRC1_SEL(256),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_W),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(1));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_021),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_W),
+			     CLAMP(1));
+    // 7 - alu 4
+    // DP4 gpr[2].x gpr[1].x c[1].x
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_X),
+			 SRC0_NEG(0),
+			 SRC1_SEL(257),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_X),
+			     CLAMP(1));
+    // 8 - alu 5
+    // DP4 gpr[2].y gpr[1].y c[1].y
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Y),
+			 SRC0_NEG(0),
+			 SRC1_SEL(257),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Y),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Y),
+			     CLAMP(1));
+    // 9 - alu 6
+    // DP4 gpr[2].z gpr[1].z c[1].z
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Z),
+			 SRC0_NEG(0),
+			 SRC1_SEL(257),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Z),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Z),
+			     CLAMP(1));
+    // 10 - alu 7
+    // DP4 gpr[2].w gpr[1].w c[1].w
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_W),
+			 SRC0_NEG(0),
+			 SRC1_SEL(257),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_W),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(1));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_021),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_W),
+			     CLAMP(1));
+    // 11 - alu 8
+    // DP4 gpr[2].x gpr[1].x c[2].x
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_X),
+			 SRC0_NEG(0),
+			 SRC1_SEL(258),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_X),
+			     CLAMP(1));
+    // 12 - alu 9
+    // DP4 gpr[2].y gpr[1].y c[2].y
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Y),
+			 SRC0_NEG(0),
+			 SRC1_SEL(258),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Y),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Y),
+			     CLAMP(1));
+    // 13 - alu 10
+    // DP4 gpr[2].z gpr[1].z c[2].z
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Z),
+			 SRC0_NEG(0),
+			 SRC1_SEL(258),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_Z),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_102),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Z),
+			     CLAMP(1));
+    // 14 - alu 11
+    // DP4 gpr[2].w gpr[1].w c[2].w
+    ps[i++] = ALU_DWORD0(SRC0_SEL(1),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_W),
+			 SRC0_NEG(0),
+			 SRC1_SEL(258),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_W),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(1));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(0),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_DOT4),
+			     BANK_SWIZZLE(SQ_ALU_VEC_021),
+			     DST_GPR(2),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_W),
+			     CLAMP(1));
+    // 15 - alu 12
+    // MOV gpr[3].x gpr[2].x
+    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_X),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_210),
+			     DST_GPR(3),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_X),
+			     CLAMP(0));
+    // 16 - alu 13
+    // MOV gpr[3].y gpr[2].y
+    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Y),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_210),
+			     DST_GPR(3),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Y),
+			     CLAMP(0));
+    // 17 - alu 14
+    // MOV gpr[3].z gpr[2].z
+    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_Z),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(0));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_210),
+			     DST_GPR(3),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_Z),
+			     CLAMP(0));
+    // 18 - alu 15
+    // MOV gpr[3].w gpr[2].w
+    ps[i++] = ALU_DWORD0(SRC0_SEL(2),
+			 SRC0_REL(ABSOLUTE),
+			 SRC0_ELEM(ELEM_W),
+			 SRC0_NEG(0),
+			 SRC1_SEL(0),
+			 SRC1_REL(ABSOLUTE),
+			 SRC1_ELEM(ELEM_X),
+			 SRC1_NEG(0),
+			 INDEX_MODE(SQ_INDEX_LOOP),
+			 PRED_SEL(SQ_PRED_SEL_OFF),
+			 LAST(1));
+    ps[i++] = ALU_DWORD1_OP2(info->ChipFamily,
+			     SRC0_ABS(0),
+			     SRC1_ABS(0),
+			     UPDATE_EXECUTE_MASK(0),
+			     UPDATE_PRED(0),
+			     WRITE_MASK(1),
+			     FOG_MERGE(0),
+			     OMOD(SQ_ALU_OMOD_OFF),
+			     ALU_INST(SQ_OP2_INST_MOV),
+			     BANK_SWIZZLE(SQ_ALU_VEC_012),
+			     DST_GPR(3),
+			     DST_REL(ABSOLUTE),
+			     DST_ELEM(ELEM_W),
+			     CLAMP(0));
+    // 19 - alignment
+    ps[i++] = 0x00000000;
+    ps[i++] = 0x00000000;
+    // 20/21 - tex 0
+    ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			 BC_FRAC_MODE(0),
+			 FETCH_WHOLE_QUAD(0),
+			 RESOURCE_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 R7xx_ALT_CONST(0));
+    ps[i++] = TEX_DWORD1(DST_GPR(1),
+			 DST_REL(ABSOLUTE),
+			 DST_SEL_X(SQ_SEL_X),    //R
+			 DST_SEL_Y(SQ_SEL_MASK), //G
+			 DST_SEL_Z(SQ_SEL_MASK), //B
+			 DST_SEL_W(SQ_SEL_1),    //A
+			 LOD_BIAS(0),
+			 COORD_TYPE_X(TEX_NORMALIZED),
+			 COORD_TYPE_Y(TEX_NORMALIZED),
+			 COORD_TYPE_Z(TEX_NORMALIZED),
+			 COORD_TYPE_W(TEX_NORMALIZED));
+    ps[i++] = TEX_DWORD2(OFFSET_X(0),
+			 OFFSET_Y(0),
+			 OFFSET_Z(0),
+			 SAMPLER_ID(0),
+			 SRC_SEL_X(SQ_SEL_X),
+			 SRC_SEL_Y(SQ_SEL_Y),
+			 SRC_SEL_Z(SQ_SEL_0),
+			 SRC_SEL_W(SQ_SEL_1));
+    ps[i++] = TEX_DWORD_PAD;
+    // 22/23 - tex 1
+    ps[i++] = TEX_DWORD0(TEX_INST(SQ_TEX_INST_SAMPLE),
+			 BC_FRAC_MODE(0),
+			 FETCH_WHOLE_QUAD(0),
+			 RESOURCE_ID(1),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 R7xx_ALT_CONST(0));
+    ps[i++] = TEX_DWORD1(DST_GPR(1),
+			 DST_REL(ABSOLUTE),
+			 DST_SEL_X(SQ_SEL_MASK), //R
+			 DST_SEL_Y(SQ_SEL_X),    //G
+			 DST_SEL_Z(SQ_SEL_Y),    //B
+			 DST_SEL_W(SQ_SEL_MASK), //A
+			 LOD_BIAS(0),
+			 COORD_TYPE_X(TEX_NORMALIZED),
+			 COORD_TYPE_Y(TEX_NORMALIZED),
+			 COORD_TYPE_Z(TEX_NORMALIZED),
+			 COORD_TYPE_W(TEX_NORMALIZED));
+    ps[i++] = TEX_DWORD2(OFFSET_X(0),
+			 OFFSET_Y(0),
+			 OFFSET_Z(0),
+			 SAMPLER_ID(1),
+			 SRC_SEL_X(SQ_SEL_X),
+			 SRC_SEL_Y(SQ_SEL_Y),
+			 SRC_SEL_Z(SQ_SEL_0),
+			 SRC_SEL_W(SQ_SEL_1));
+    ps[i++] = TEX_DWORD_PAD;
+
+    // comp mask vs ---------------------------------------
+    i = accel_state->comp_mask_vs_offset / 4;
+    //0
+    vs[i++] = CF_DWORD0(ADDR(4));
+    vs[i++] = CF_DWORD1(POP_COUNT(0),
+			CF_CONST(0),
+			COND(SQ_CF_COND_ACTIVE),
+			I_COUNT(3),
+			CALL_COUNT(0),
+			END_OF_PROGRAM(0),
+			VALID_PIXEL_MODE(0),
+			CF_INST(SQ_CF_INST_VTX),
+			WHOLE_QUAD_MODE(0),
+			BARRIER(1));
+    //1 - dst
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
+				      TYPE(SQ_EXPORT_POS),
+				      RW_GPR(2),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(0),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(1));
+    //2 - src
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
+				      TYPE(SQ_EXPORT_PARAM),
+				      RW_GPR(1),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(0),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(0));
+    //3 - mask
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(1),
+				      TYPE(SQ_EXPORT_PARAM),
+				      RW_GPR(0),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(1),
+					   END_OF_PROGRAM(1),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(0));
+    //4/5 - dst
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(24));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(2),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(0),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(1));
+    vs[i++] = VTX_DWORD_PAD;
+    //6/7 - src
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(8));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(1),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(8),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(0));
+    vs[i++] = VTX_DWORD_PAD;
+    //8/9 - mask
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(8));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(0),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(16),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(0));
+    vs[i++] = VTX_DWORD_PAD;
+
+    // comp mask ps ---------------------------------------
+    // not yet
+
+    // comp vs ---------------------------------------
+    i = accel_state->comp_vs_offset / 4;
+    //0
+    vs[i++] = CF_DWORD0(ADDR(4));
+    vs[i++] = CF_DWORD1(POP_COUNT(0),
+			CF_CONST(0),
+			COND(SQ_CF_COND_ACTIVE),
+			I_COUNT(2),
+			CALL_COUNT(0),
+			END_OF_PROGRAM(0),
+			VALID_PIXEL_MODE(0),
+			CF_INST(SQ_CF_INST_VTX),
+			WHOLE_QUAD_MODE(0),
+			BARRIER(1));
+    //1 - dst
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
+				      TYPE(SQ_EXPORT_POS),
+				      RW_GPR(1),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(0),
+					   END_OF_PROGRAM(0),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(1));
+    //2 - src
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
+				      TYPE(SQ_EXPORT_PARAM),
+				      RW_GPR(0),
+				      RW_REL(ABSOLUTE),
+				      INDEX_GPR(0),
+				      ELEM_SIZE(0));
+    vs[i++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+					   SRC_SEL_Y(SQ_SEL_Y),
+					   SRC_SEL_Z(SQ_SEL_Z),
+					   SRC_SEL_W(SQ_SEL_W),
+					   R6xx_ELEM_LOOP(0),
+					   BURST_COUNT(0),
+					   END_OF_PROGRAM(1),
+					   VALID_PIXEL_MODE(0),
+					   CF_INST(SQ_CF_INST_EXPORT_DONE),
+					   WHOLE_QUAD_MODE(0),
+					   BARRIER(0));
+    //3
+    vs[i++] = 0x00000000;
+    vs[i++] = 0x00000000;
+    //4/5 - dst
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(16));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(1),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(0),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(1));
+    vs[i++] = VTX_DWORD_PAD;
+    //6/7 - src
+    vs[i++] = VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+			 FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+			 FETCH_WHOLE_QUAD(0),
+			 BUFFER_ID(0),
+			 SRC_GPR(0),
+			 SRC_REL(ABSOLUTE),
+			 SRC_SEL_X(SQ_SEL_X),
+			 MEGA_FETCH_COUNT(8));
+    vs[i++] = VTX_DWORD1_GPR(DST_GPR(0),
+			     DST_REL(0),
+			     DST_SEL_X(SQ_SEL_X),
+			     DST_SEL_Y(SQ_SEL_Y),
+			     DST_SEL_Z(SQ_SEL_0),
+			     DST_SEL_W(SQ_SEL_1),
+			     USE_CONST_FIELDS(0),
+			     DATA_FORMAT(FMT_32_32_FLOAT), //xxx
+			     NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), //xxx
+			     FORMAT_COMP_ALL(SQ_FORMAT_COMP_SIGNED), //xxx
+			     SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE));
+    vs[i++] = VTX_DWORD2(OFFSET(8),
+			 ENDIAN_SWAP(ENDIAN_NONE),
+			 CONST_BUF_NO_STRIDE(0),
+			 MEGA_FETCH(0));
+    vs[i++] = VTX_DWORD_PAD;
+
+    // comp ps ---------------------------------------
+    // not yet
+
+
+    return TRUE;
+}
+
+/* EXA PrepareAccess hook: flushes the HDP caches and always returns TRUE. */
+static Bool
+R600PrepareAccess(PixmapPtr pPix, int index)
+{
+    RADEONInfoPtr info = RADEONPTR(xf86Screens[pPix->drawable.pScreen->myNum]);
+    /* Looks unused, but presumably OUTREG() expands to use this exact name. */
+    unsigned char *RADEONMMIO = info->MMIO;
+
+    /* Flush the HDP read/write caches; index is not consulted. */
+    OUTREG(HDP_MEM_COHERENCY_FLUSH_CNTL, 0x1);
+    return TRUE;
+}
+
+/* EXA FinishAccess hook: flushes the HDP caches, same write as R600PrepareAccess(). */
+static void
+R600FinishAccess(PixmapPtr pPix, int index)
+{
+    RADEONInfoPtr info = RADEONPTR(xf86Screens[pPix->drawable.pScreen->myNum]);
+    /* Looks unused, but presumably OUTREG() expands to use this exact name. */
+    unsigned char *RADEONMMIO = info->MMIO;
+
+    /* Flush the HDP read/write caches; index is not consulted. */
+    OUTREG(HDP_MEM_COHERENCY_FLUSH_CNTL, 0x1);
+}
+
+
+/* Populate the EXA driver record with the R600 hooks and limits, register it
+ * with EXA, then load the shaders.  Returns TRUE on success, FALSE otherwise. */
+Bool
+R600DrawInit(ScreenPtr pScreen)
+{
+    ScrnInfoPtr pScrn =  xf86Screens[pScreen->myNum];
+    RADEONInfoPtr info   = RADEONPTR(pScrn);
+
+    if (info->accel_state->exa == NULL) {
+	xf86DrvMsg(pScreen->myNum, X_ERROR, "Memory map not set up\n");
+	return FALSE;
+    }
+
+    info->accel_state->exa->exa_major = EXA_VERSION_MAJOR;
+    info->accel_state->exa->exa_minor = EXA_VERSION_MINOR;
+
+    info->accel_state->exa->PrepareSolid = R600PrepareSolid;
+    info->accel_state->exa->Solid = R600Solid;
+    info->accel_state->exa->DoneSolid = R600DoneSolid;
+
+    info->accel_state->exa->PrepareCopy = R600PrepareCopy;
+    info->accel_state->exa->Copy = R600Copy;
+    info->accel_state->exa->DoneCopy = R600DoneCopy;
+
+    info->accel_state->exa->MarkSync = R600MarkSync;
+    info->accel_state->exa->WaitMarker = R600Sync;
+
+    info->accel_state->exa->PrepareAccess = R600PrepareAccess;
+    info->accel_state->exa->FinishAccess = R600FinishAccess;
+
+    info->accel_state->exa->flags = EXA_OFFSCREEN_PIXMAPS;
+    info->accel_state->exa->pixmapOffsetAlign = 256;
+    info->accel_state->exa->pixmapPitchAlign = 256;
+
+    info->accel_state->exa->CheckComposite = R600CheckComposite;
+    info->accel_state->exa->PrepareComposite = R600PrepareComposite;
+    info->accel_state->exa->Composite = R600Composite;
+    info->accel_state->exa->DoneComposite = R600DoneComposite;
+
+#if EXA_VERSION_MAJOR > 2 || (EXA_VERSION_MAJOR == 2 && EXA_VERSION_MINOR >= 3)
+    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Setting EXA maxPitchBytes\n");
+    info->accel_state->exa->maxPitchBytes = 16320;
+    info->accel_state->exa->maxX = 8192;
+#else
+    info->accel_state->exa->maxX = 16320 / 4;
+#endif
+    info->accel_state->exa->maxY = 8192;
+
+    if (xf86ReturnOptValBool(info->Options, OPTION_EXA_VSYNC, FALSE)) {
+	xf86DrvMsg(pScrn->scrnIndex, X_INFO, "EXA VSync enabled\n");
+	info->accel_state->vsync = TRUE;
+    } else
+	info->accel_state->vsync = FALSE;
+
+    if (!exaDriverInit(pScreen, info->accel_state->exa)) {
+	/* Clear the pointer so no later path reuses or re-frees the released record. */
+	xfree(info->accel_state->exa);
+	info->accel_state->exa = NULL;
+	return FALSE;
+    }
+
+    /* NOTE(review): the two failure returns below do not undo exaDriverInit(). */
+    if (!info->gartLocation)
+	return FALSE;
+    info->accel_state->XInited3D = FALSE;
+    if (!R600LoadShaders(pScrn, pScreen))
+	return FALSE;
+
+    exaMarkSync(pScreen);
+    return TRUE;
+}
+
diff --git a/src/r600_reg.h b/src/r600_reg.h
new file mode 100644
index 0000000..dfe4703
--- /dev/null
+++ b/src/r600_reg.h
@@ -0,0 +1,118 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _R600_REG_H_
+#define _R600_REG_H_
+
+/*
+ * Register definitions
+ */
+
+#include "r600_reg_auto_r6xx.h"
+#include "r600_reg_r6xx.h"
+#include "r600_reg_r7xx.h"
+
+
+/* SET_*_REG offsets + ends: one <name>_offset/<name>_end pair per IT_SET_* packet3 opcode. */
+enum { /* NOTE(review): presumably each pair is a [offset, end) register address range -- confirm */
+    SET_CONFIG_REG_offset          = 0x00008000,
+    SET_CONFIG_REG_end             = 0x0000ac00,
+    SET_CONTEXT_REG_offset         = 0x00028000,
+    SET_CONTEXT_REG_end            = 0x00029000,
+    SET_ALU_CONST_offset           = 0x00030000,
+    SET_ALU_CONST_end              = 0x00032000,
+    SET_RESOURCE_offset            = 0x00038000,
+    SET_RESOURCE_end               = 0x0003c000,
+    SET_SAMPLER_offset             = 0x0003c000, /* == SET_RESOURCE_end */
+    SET_SAMPLER_end                = 0x0003cff0,
+    SET_CTL_CONST_offset           = 0x0003cff0, /* == SET_SAMPLER_end */
+    SET_CTL_CONST_end              = 0x0003e200,
+    SET_LOOP_CONST_offset          = 0x0003e200, /* == SET_CTL_CONST_end */
+    SET_LOOP_CONST_end             = 0x0003e380,
+    SET_BOOL_CONST_offset          = 0x0003e380, /* == SET_LOOP_CONST_end */
+    SET_BOOL_CONST_end             = 0x00040000,
+} ;
+
+/* packet3 IT_SURFACE_BASE_UPDATE bits: distinct single-bit flags occupying bits 0-14. */
+enum {
+	DEPTH_BASE    = (1 << 0),
+	COLOR0_BASE   = (1 << 1),	/* COLORn_BASE is bit n+1 */
+	COLOR1_BASE   = (1 << 2),
+	COLOR2_BASE   = (1 << 3),
+	COLOR3_BASE   = (1 << 4),
+	COLOR4_BASE   = (1 << 5),
+	COLOR5_BASE   = (1 << 6),
+	COLOR6_BASE   = (1 << 7),
+	COLOR7_BASE   = (1 << 8),
+	STRMOUT_BASE0 = (1 << 9),	/* STRMOUT_BASEn is bit n+9 */
+	STRMOUT_BASE1 = (1 << 10),
+	STRMOUT_BASE2 = (1 << 11),
+	STRMOUT_BASE3 = (1 << 12),
+	COHER_BASE0   = (1 << 13),	/* COHER_BASEn is bit n+13 */
+	COHER_BASE1   = (1 << 14),
+};
+
+/* Packet3 commands: IT_* opcode values, listed in ascending order except where noted. */
+enum {
+    IT_NOP                               = 0x10,
+    IT_INDIRECT_BUFFER_END               = 0x17,
+    IT_SET_PREDICATION                   = 0x20,
+    IT_REG_RMW                           = 0x21,
+    IT_COND_EXEC                         = 0x22,
+    IT_PRED_EXEC                         = 0x23,
+    IT_START_3D_CMDBUF                   = 0x24,
+    IT_DRAW_INDEX_2                      = 0x27,
+    IT_CONTEXT_CONTROL                   = 0x28,
+    IT_DRAW_INDEX_IMMD_BE                = 0x29,
+    IT_INDEX_TYPE                        = 0x2A,
+    IT_DRAW_INDEX                        = 0x2B,
+    IT_DRAW_INDEX_AUTO                   = 0x2D,
+    IT_DRAW_INDEX_IMMD                   = 0x2E,
+    IT_NUM_INSTANCES                     = 0x2F,
+    IT_STRMOUT_BUFFER_UPDATE             = 0x34,
+    IT_INDIRECT_BUFFER_MP                = 0x38,
+    IT_MEM_SEMAPHORE                     = 0x39,
+    IT_MPEG_INDEX                        = 0x3A,
+    IT_WAIT_REG_MEM                      = 0x3C,
+    IT_MEM_WRITE                         = 0x3D,
+    IT_INDIRECT_BUFFER                   = 0x32, /* out of numeric order: sorts before 0x34 */
+    IT_CP_INTERRUPT                      = 0x40,
+    IT_SURFACE_SYNC                      = 0x43,
+    IT_ME_INITIALIZE                     = 0x44,
+    IT_COND_WRITE                        = 0x45,
+    IT_EVENT_WRITE                       = 0x46,
+    IT_EVENT_WRITE_EOP                   = 0x47,
+    IT_ONE_REG_WRITE                     = 0x57,
+    IT_SET_CONFIG_REG                    = 0x68, /* IT_SET_* names match the SET_*_offset/_end pairs above */
+    IT_SET_CONTEXT_REG                   = 0x69,
+    IT_SET_ALU_CONST                     = 0x6A,
+    IT_SET_BOOL_CONST                    = 0x6B,
+    IT_SET_LOOP_CONST                    = 0x6C,
+    IT_SET_RESOURCE                      = 0x6D,
+    IT_SET_SAMPLER                       = 0x6E,
+    IT_SET_CTL_CONST                     = 0x6F,
+    IT_SURFACE_BASE_UPDATE               = 0x73, /* flag bits for this packet are the *_BASE enum above */
+} ;
+
+#endif
diff --git a/src/r600_reg_auto_r6xx.h b/src/r600_reg_auto_r6xx.h
new file mode 100644
index 0000000..9d5aa3c
--- /dev/null
+++ b/src/r600_reg_auto_r6xx.h
@@ -0,0 +1,3087 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _AUTOREGS
+#define _AUTOREGS
+
+enum {
+
+    VGT_VTX_VECT_EJECT_REG                                = 0x000088b0,
+	PRIM_COUNT_mask                                   = 0x3ff << 0,
+	PRIM_COUNT_shift                                  = 0,
+    VGT_LAST_COPY_STATE                                   = 0x000088c0,
+	SRC_STATE_ID_mask                                 = 0x07 << 0,
+	SRC_STATE_ID_shift                                = 0,
+	DST_STATE_ID_mask                                 = 0x07 << 16,
+	DST_STATE_ID_shift                                = 16,
+    VGT_CACHE_INVALIDATION                                = 0x000088c4,
+	CACHE_INVALIDATION_mask                           = 0x03 << 0,
+	CACHE_INVALIDATION_shift                          = 0,
+	    VC_ONLY                                       = 0x00,
+	    TC_ONLY                                       = 0x01,
+	    VC_AND_TC                                     = 0x02,
+	VS_NO_EXTRA_BUFFER_bit                            = 1 << 5,
+    VGT_GS_PER_ES                                         = 0x000088c8,
+    VGT_ES_PER_GS                                         = 0x000088cc,
+    VGT_GS_VERTEX_REUSE                                   = 0x000088d4,
+	VERT_REUSE_mask                                   = 0x1f << 0,
+	VERT_REUSE_shift                                  = 0,
+    VGT_MC_LAT_CNTL                                       = 0x000088d8,
+	MC_TIME_STAMP_RES_mask                            = 0x03 << 0,
+	MC_TIME_STAMP_RES_shift                           = 0,
+	    X_0_992_MAX_LATENCY                           = 0x00,
+	    X_0_496_MAX_LATENCY                           = 0x01,
+	    X_0_248_MAX_LATENCY                           = 0x02,
+	    X_0_124_MAX_LATENCY                           = 0x03,
+    VGT_GS_PER_VS                                         = 0x000088e8,
+	GS_PER_VS_mask                                    = 0x0f << 0,
+	GS_PER_VS_shift                                   = 0,
+    VGT_CNTL_STATUS                                       = 0x000088f0,
+	VGT_OUT_INDX_BUSY_bit                             = 1 << 0,
+	VGT_OUT_BUSY_bit                                  = 1 << 1,
+	VGT_PT_BUSY_bit                                   = 1 << 2,
+	VGT_TE_BUSY_bit                                   = 1 << 3,
+	VGT_VR_BUSY_bit                                   = 1 << 4,
+	VGT_GRP_BUSY_bit                                  = 1 << 5,
+	VGT_DMA_REQ_BUSY_bit                              = 1 << 6,
+	VGT_DMA_BUSY_bit                                  = 1 << 7,
+	VGT_GS_BUSY_bit                                   = 1 << 8,
+	VGT_BUSY_bit                                      = 1 << 9,
+    VGT_PRIMITIVE_TYPE                                    = 0x00008958,
+	VGT_PRIMITIVE_TYPE__PRIM_TYPE_mask                = 0x3f << 0,
+	VGT_PRIMITIVE_TYPE__PRIM_TYPE_shift               = 0,
+	    DI_PT_NONE                                    = 0x00,
+	    DI_PT_POINTLIST                               = 0x01,
+	    DI_PT_LINELIST                                = 0x02,
+	    DI_PT_LINESTRIP                               = 0x03,
+	    DI_PT_TRILIST                                 = 0x04,
+	    DI_PT_TRIFAN                                  = 0x05,
+	    DI_PT_TRISTRIP                                = 0x06,
+	    DI_PT_UNUSED_0                                = 0x07,
+	    DI_PT_UNUSED_1                                = 0x08,
+	    DI_PT_UNUSED_2                                = 0x09,
+	    DI_PT_LINELIST_ADJ                            = 0x0a,
+	    DI_PT_LINESTRIP_ADJ                           = 0x0b,
+	    DI_PT_TRILIST_ADJ                             = 0x0c,
+	    DI_PT_TRISTRIP_ADJ                            = 0x0d,
+	    DI_PT_UNUSED_3                                = 0x0e,
+	    DI_PT_UNUSED_4                                = 0x0f,
+	    DI_PT_TRI_WITH_WFLAGS                         = 0x10,
+	    DI_PT_RECTLIST                                = 0x11,
+	    DI_PT_LINELOOP                                = 0x12,
+	    DI_PT_QUADLIST                                = 0x13,
+	    DI_PT_QUADSTRIP                               = 0x14,
+	    DI_PT_POLYGON                                 = 0x15,
+	    DI_PT_2D_COPY_RECT_LIST_V0                    = 0x16,
+	    DI_PT_2D_COPY_RECT_LIST_V1                    = 0x17,
+	    DI_PT_2D_COPY_RECT_LIST_V2                    = 0x18,
+	    DI_PT_2D_COPY_RECT_LIST_V3                    = 0x19,
+	    DI_PT_2D_FILL_RECT_LIST                       = 0x1a,
+	    DI_PT_2D_LINE_STRIP                           = 0x1b,
+	    DI_PT_2D_TRI_STRIP                            = 0x1c,
+    VGT_INDEX_TYPE                                        = 0x0000895c,
+	INDEX_TYPE_mask                                   = 0x03 << 0,
+	INDEX_TYPE_shift                                  = 0,
+	    DI_INDEX_SIZE_16_BIT                          = 0x00,
+	    DI_INDEX_SIZE_32_BIT                          = 0x01,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_0                      = 0x00008960,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_1                      = 0x00008964,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_2                      = 0x00008968,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_3                      = 0x0000896c,
+    VGT_NUM_INDICES                                       = 0x00008970,
+    VGT_NUM_INSTANCES                                     = 0x00008974,
+    PA_CL_CNTL_STATUS                                     = 0x00008a10,
+	CL_BUSY_bit                                       = 1 << 31,
+    PA_CL_ENHANCE                                         = 0x00008a14,
+	CLIP_VTX_REORDER_ENA_bit                          = 1 << 0,
+	NUM_CLIP_SEQ_mask                                 = 0x03 << 1,
+	NUM_CLIP_SEQ_shift                                = 1,
+	CLIPPED_PRIM_SEQ_STALL_bit                        = 1 << 3,
+	VE_NAN_PROC_DISABLE_bit                           = 1 << 4,
+    PA_SU_CNTL_STATUS                                     = 0x00008a50,
+	SU_BUSY_bit                                       = 1 << 31,
+    PA_SC_LINE_STIPPLE_STATE                              = 0x00008b10,
+	CURRENT_PTR_mask                                  = 0x0f << 0,
+	CURRENT_PTR_shift                                 = 0,
+	CURRENT_COUNT_mask                                = 0xff << 8,
+	CURRENT_COUNT_shift                               = 8,
+    PA_SC_MULTI_CHIP_CNTL                                 = 0x00008b20,
+	LOG2_NUM_CHIPS_mask                               = 0x07 << 0,
+	LOG2_NUM_CHIPS_shift                              = 0,
+	MULTI_CHIP_TILE_SIZE_mask                         = 0x03 << 3,
+	MULTI_CHIP_TILE_SIZE_shift                        = 3,
+	    X_16_X_16_PIXEL_TILE_PER_CHIP                 = 0x00,
+	    X_32_X_32_PIXEL_TILE_PER_CHIP                 = 0x01,
+	    X_64_X_64_PIXEL_TILE_PER_CHIP                 = 0x02,
+	    X_128X128_PIXEL_TILE_PER_CHIP                 = 0x03,
+	CHIP_TILE_X_LOC_mask                              = 0x07 << 5,
+	CHIP_TILE_X_LOC_shift                             = 5,
+	CHIP_TILE_Y_LOC_mask                              = 0x07 << 8,
+	CHIP_TILE_Y_LOC_shift                             = 8,
+	CHIP_SUPER_TILE_B_bit                             = 1 << 11,
+    PA_SC_AA_SAMPLE_LOCS_2S                               = 0x00008b40,
+	S0_X_mask                                         = 0x0f << 0,
+	S0_X_shift                                        = 0,
+	S0_Y_mask                                         = 0x0f << 4,
+	S0_Y_shift                                        = 4,
+	S1_X_mask                                         = 0x0f << 8,
+	S1_X_shift                                        = 8,
+	S1_Y_mask                                         = 0x0f << 12,
+	S1_Y_shift                                        = 12,
+    PA_SC_AA_SAMPLE_LOCS_4S                               = 0x00008b44,
+/* 	S0_X_mask                                         = 0x0f << 0, */
+/* 	S0_X_shift                                        = 0, */
+/* 	S0_Y_mask                                         = 0x0f << 4, */
+/* 	S0_Y_shift                                        = 4, */
+/* 	S1_X_mask                                         = 0x0f << 8, */
+/* 	S1_X_shift                                        = 8, */
+/* 	S1_Y_mask                                         = 0x0f << 12, */
+/* 	S1_Y_shift                                        = 12, */
+	S2_X_mask                                         = 0x0f << 16,
+	S2_X_shift                                        = 16,
+	S2_Y_mask                                         = 0x0f << 20,
+	S2_Y_shift                                        = 20,
+	S3_X_mask                                         = 0x0f << 24,
+	S3_X_shift                                        = 24,
+	S3_Y_mask                                         = 0x0f << 28,
+	S3_Y_shift                                        = 28,
+    PA_SC_AA_SAMPLE_LOCS_8S_WD0                           = 0x00008b48,
+/* 	S0_X_mask                                         = 0x0f << 0, */
+/* 	S0_X_shift                                        = 0, */
+/* 	S0_Y_mask                                         = 0x0f << 4, */
+/* 	S0_Y_shift                                        = 4, */
+/* 	S1_X_mask                                         = 0x0f << 8, */
+/* 	S1_X_shift                                        = 8, */
+/* 	S1_Y_mask                                         = 0x0f << 12, */
+/* 	S1_Y_shift                                        = 12, */
+/* 	S2_X_mask                                         = 0x0f << 16, */
+/* 	S2_X_shift                                        = 16, */
+/* 	S2_Y_mask                                         = 0x0f << 20, */
+/* 	S2_Y_shift                                        = 20, */
+/* 	S3_X_mask                                         = 0x0f << 24, */
+/* 	S3_X_shift                                        = 24, */
+/* 	S3_Y_mask                                         = 0x0f << 28, */
+/* 	S3_Y_shift                                        = 28, */
+    PA_SC_AA_SAMPLE_LOCS_8S_WD1                           = 0x00008b4c,
+	S4_X_mask                                         = 0x0f << 0,
+	S4_X_shift                                        = 0,
+	S4_Y_mask                                         = 0x0f << 4,
+	S4_Y_shift                                        = 4,
+	S5_X_mask                                         = 0x0f << 8,
+	S5_X_shift                                        = 8,
+	S5_Y_mask                                         = 0x0f << 12,
+	S5_Y_shift                                        = 12,
+	S6_X_mask                                         = 0x0f << 16,
+	S6_X_shift                                        = 16,
+	S6_Y_mask                                         = 0x0f << 20,
+	S6_Y_shift                                        = 20,
+	S7_X_mask                                         = 0x0f << 24,
+	S7_X_shift                                        = 24,
+	S7_Y_mask                                         = 0x0f << 28,
+	S7_Y_shift                                        = 28,
+    PA_SC_CNTL_STATUS                                     = 0x00008be0,
+	MPASS_OVERFLOW_bit                                = 1 << 30,
+    PA_SC_ENHANCE                                         = 0x00008bf0,
+	FORCE_EOV_MAX_CLK_CNT_mask                        = 0xfff << 0,
+	FORCE_EOV_MAX_CLK_CNT_shift                       = 0,
+	FORCE_EOV_MAX_TILE_CNT_mask                       = 0xfff << 12,
+	FORCE_EOV_MAX_TILE_CNT_shift                      = 12,
+    SQ_CONFIG                                             = 0x00008c00,
+	VC_ENABLE_bit                                     = 1 << 0,
+	EXPORT_SRC_C_bit                                  = 1 << 1,
+	DX9_CONSTS_bit                                    = 1 << 2,
+	ALU_INST_PREFER_VECTOR_bit                        = 1 << 3,
+	SQ_CONFIG__DX10_CLAMP_bit                         = 1 << 4,
+	ALU_PREFER_ONE_WATERFALL_bit                      = 1 << 5,
+	ALU_MAX_ONE_WATERFALL_bit                         = 1 << 6,
+	CLAUSE_SEQ_PRIO_mask                              = 0x03 << 8,
+	CLAUSE_SEQ_PRIO_shift                             = 8,
+	    SQ_CL_PRIO_RND_ROBIN                          = 0x00,
+	    SQ_CL_PRIO_MACRO_SEQ                          = 0x01,
+	    SQ_CL_PRIO_NONE                               = 0x02,
+	PS_PRIO_mask                                      = 0x03 << 24,
+	PS_PRIO_shift                                     = 24,
+	VS_PRIO_mask                                      = 0x03 << 26,
+	VS_PRIO_shift                                     = 26,
+	GS_PRIO_mask                                      = 0x03 << 28,
+	GS_PRIO_shift                                     = 28,
+	ES_PRIO_mask                                      = 0x03 << 30,
+	ES_PRIO_shift                                     = 30,
+    SQ_GPR_RESOURCE_MGMT_1                                = 0x00008c04,
+	NUM_PS_GPRS_mask                                  = 0xff << 0,
+	NUM_PS_GPRS_shift                                 = 0,
+	NUM_VS_GPRS_mask                                  = 0xff << 16,
+	NUM_VS_GPRS_shift                                 = 16,
+	NUM_CLAUSE_TEMP_GPRS_mask                         = 0x0f << 28,
+	NUM_CLAUSE_TEMP_GPRS_shift                        = 28,
+    SQ_GPR_RESOURCE_MGMT_2                                = 0x00008c08,
+	NUM_GS_GPRS_mask                                  = 0xff << 0,
+	NUM_GS_GPRS_shift                                 = 0,
+	NUM_ES_GPRS_mask                                  = 0xff << 16,
+	NUM_ES_GPRS_shift                                 = 16,
+    SQ_THREAD_RESOURCE_MGMT                               = 0x00008c0c,
+	NUM_PS_THREADS_mask                               = 0xff << 0,
+	NUM_PS_THREADS_shift                              = 0,
+	NUM_VS_THREADS_mask                               = 0xff << 8,
+	NUM_VS_THREADS_shift                              = 8,
+	NUM_GS_THREADS_mask                               = 0xff << 16,
+	NUM_GS_THREADS_shift                              = 16,
+	NUM_ES_THREADS_mask                               = 0xff << 24,
+	NUM_ES_THREADS_shift                              = 24,
+    SQ_STACK_RESOURCE_MGMT_1                              = 0x00008c10,
+	NUM_PS_STACK_ENTRIES_mask                         = 0xfff << 0,
+	NUM_PS_STACK_ENTRIES_shift                        = 0,
+	NUM_VS_STACK_ENTRIES_mask                         = 0xfff << 16,
+	NUM_VS_STACK_ENTRIES_shift                        = 16,
+    SQ_STACK_RESOURCE_MGMT_2                              = 0x00008c14,
+	NUM_GS_STACK_ENTRIES_mask                         = 0xfff << 0,
+	NUM_GS_STACK_ENTRIES_shift                        = 0,
+	NUM_ES_STACK_ENTRIES_mask                         = 0xfff << 16,
+	NUM_ES_STACK_ENTRIES_shift                        = 16,
+    SQ_ESGS_RING_BASE                                     = 0x00008c40,
+    SQ_ESGS_RING_SIZE                                     = 0x00008c44,
+    SQ_GSVS_RING_BASE                                     = 0x00008c48,
+    SQ_GSVS_RING_SIZE                                     = 0x00008c4c,
+    SQ_ESTMP_RING_BASE                                    = 0x00008c50,
+    SQ_ESTMP_RING_SIZE                                    = 0x00008c54,
+    SQ_GSTMP_RING_BASE                                    = 0x00008c58,
+    SQ_GSTMP_RING_SIZE                                    = 0x00008c5c,
+    SQ_VSTMP_RING_BASE                                    = 0x00008c60,
+    SQ_VSTMP_RING_SIZE                                    = 0x00008c64,
+    SQ_PSTMP_RING_BASE                                    = 0x00008c68,
+    SQ_PSTMP_RING_SIZE                                    = 0x00008c6c,
+    SQ_FBUF_RING_BASE                                     = 0x00008c70,
+    SQ_FBUF_RING_SIZE                                     = 0x00008c74,
+    SQ_REDUC_RING_BASE                                    = 0x00008c78,
+    SQ_REDUC_RING_SIZE                                    = 0x00008c7c,
+    SQ_ALU_WORD1_OP3                                      = 0x00008dfc,
+	SRC2_SEL_mask                                     = 0x1ff << 0,
+	SRC2_SEL_shift                                    = 0,
+	    SQ_ALU_SRC_0                                  = 0xf8,
+	    SQ_ALU_SRC_1                                  = 0xf9,
+	    SQ_ALU_SRC_1_INT                              = 0xfa,
+	    SQ_ALU_SRC_M_1_INT                            = 0xfb,
+	    SQ_ALU_SRC_0_5                                = 0xfc,
+	    SQ_ALU_SRC_LITERAL                            = 0xfd,
+	    SQ_ALU_SRC_PV                                 = 0xfe,
+	    SQ_ALU_SRC_PS                                 = 0xff,
+	SRC2_REL_bit                                      = 1 << 9,
+	SRC2_CHAN_mask                                    = 0x03 << 10,
+	SRC2_CHAN_shift                                   = 10,
+	    SQ_CHAN_X                                     = 0x00,
+	    SQ_CHAN_Y                                     = 0x01,
+	    SQ_CHAN_Z                                     = 0x02,
+	    SQ_CHAN_W                                     = 0x03,
+	SRC2_NEG_bit                                      = 1 << 12,
+	SQ_ALU_WORD1_OP3__ALU_INST_mask                   = 0x1f << 13,
+	SQ_ALU_WORD1_OP3__ALU_INST_shift                  = 13,
+	    SQ_OP3_INST_MUL_LIT                           = 0x0c,
+	    SQ_OP3_INST_MUL_LIT_M2                        = 0x0d,
+	    SQ_OP3_INST_MUL_LIT_M4                        = 0x0e,
+	    SQ_OP3_INST_MUL_LIT_D2                        = 0x0f,
+	    SQ_OP3_INST_MULADD                            = 0x10,
+	    SQ_OP3_INST_MULADD_M2                         = 0x11,
+	    SQ_OP3_INST_MULADD_M4                         = 0x12,
+	    SQ_OP3_INST_MULADD_D2                         = 0x13,
+	    SQ_OP3_INST_MULADD_IEEE                       = 0x14,
+	    SQ_OP3_INST_MULADD_IEEE_M2                    = 0x15,
+	    SQ_OP3_INST_MULADD_IEEE_M4                    = 0x16,
+	    SQ_OP3_INST_MULADD_IEEE_D2                    = 0x17,
+	    SQ_OP3_INST_CNDE                              = 0x18,
+	    SQ_OP3_INST_CNDGT                             = 0x19,
+	    SQ_OP3_INST_CNDGE                             = 0x1a,
+	    SQ_OP3_INST_CNDE_INT                          = 0x1c,
+	    SQ_OP3_INST_CNDGT_INT                         = 0x1d,
+	    SQ_OP3_INST_CNDGE_INT                         = 0x1e,
+    SQ_TEX_WORD2                                          = 0x00008dfc,
+	OFFSET_X_mask                                     = 0x1f << 0,
+	OFFSET_X_shift                                    = 0,
+	OFFSET_Y_mask                                     = 0x1f << 5,
+	OFFSET_Y_shift                                    = 5,
+	OFFSET_Z_mask                                     = 0x1f << 10,
+	OFFSET_Z_shift                                    = 10,
+	SAMPLER_ID_mask                                   = 0x1f << 15,
+	SAMPLER_ID_shift                                  = 15,
+	SQ_TEX_WORD2__SRC_SEL_X_mask                      = 0x07 << 20,
+	SQ_TEX_WORD2__SRC_SEL_X_shift                     = 20,
+	    SQ_SEL_X                                      = 0x00,
+	    SQ_SEL_Y                                      = 0x01,
+	    SQ_SEL_Z                                      = 0x02,
+	    SQ_SEL_W                                      = 0x03,
+	    SQ_SEL_0                                      = 0x04,
+	    SQ_SEL_1                                      = 0x05,
+	SRC_SEL_Y_mask                                    = 0x07 << 23,
+	SRC_SEL_Y_shift                                   = 23,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SRC_SEL_Z_mask                                    = 0x07 << 26,
+	SRC_SEL_Z_shift                                   = 26,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SRC_SEL_W_mask                                    = 0x07 << 29,
+	SRC_SEL_W_shift                                   = 29,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+    SQ_CF_ALLOC_EXPORT_WORD1                              = 0x00008dfc,
+	BURST_COUNT_mask                                  = 0x0f << 17,
+	BURST_COUNT_shift                                 = 17,
+	END_OF_PROGRAM_bit                                = 1 << 21,
+	VALID_PIXEL_MODE_bit                              = 1 << 22,
+	SQ_CF_ALLOC_EXPORT_WORD1__CF_INST_mask            = 0x7f << 23,
+	SQ_CF_ALLOC_EXPORT_WORD1__CF_INST_shift           = 23,
+	    SQ_CF_INST_MEM_STREAM0                        = 0x20,
+	    SQ_CF_INST_MEM_STREAM1                        = 0x21,
+	    SQ_CF_INST_MEM_STREAM2                        = 0x22,
+	    SQ_CF_INST_MEM_STREAM3                        = 0x23,
+	    SQ_CF_INST_MEM_SCRATCH                        = 0x24,
+	    SQ_CF_INST_MEM_REDUCTION                      = 0x25,
+	    SQ_CF_INST_MEM_RING                           = 0x26,
+	    SQ_CF_INST_EXPORT                             = 0x27,
+	    SQ_CF_INST_EXPORT_DONE                        = 0x28,
+	WHOLE_QUAD_MODE_bit                               = 1 << 30,
+	BARRIER_bit                                       = 1 << 31,
+    SQ_CF_ALU_WORD1                                       = 0x00008dfc,
+	KCACHE_MODE1_mask                                 = 0x03 << 0,
+	KCACHE_MODE1_shift                                = 0,
+	    SQ_CF_KCACHE_NOP                              = 0x00,
+	    SQ_CF_KCACHE_LOCK_1                           = 0x01,
+	    SQ_CF_KCACHE_LOCK_2                           = 0x02,
+	    SQ_CF_KCACHE_LOCK_LOOP_INDEX                  = 0x03,
+	KCACHE_ADDR0_mask                                 = 0xff << 2,
+	KCACHE_ADDR0_shift                                = 2,
+	KCACHE_ADDR1_mask                                 = 0xff << 10,
+	KCACHE_ADDR1_shift                                = 10,
+	SQ_CF_ALU_WORD1__COUNT_mask                       = 0x7f << 18,
+	SQ_CF_ALU_WORD1__COUNT_shift                      = 18,
+	SQ_CF_ALU_WORD1__ALT_CONST_bit                    = 1 << 25,
+	SQ_CF_ALU_WORD1__CF_INST_mask                     = 0x0f << 26,
+	SQ_CF_ALU_WORD1__CF_INST_shift                    = 26,
+	    SQ_CF_INST_ALU                                = 0x08,
+	    SQ_CF_INST_ALU_PUSH_BEFORE                    = 0x09,
+	    SQ_CF_INST_ALU_POP_AFTER                      = 0x0a,
+	    SQ_CF_INST_ALU_POP2_AFTER                     = 0x0b,
+	    SQ_CF_INST_ALU_CONTINUE                       = 0x0d,
+	    SQ_CF_INST_ALU_BREAK                          = 0x0e,
+	    SQ_CF_INST_ALU_ELSE_AFTER                     = 0x0f,
+/* 	WHOLE_QUAD_MODE_bit                               = 1 << 30, */
+/* 	BARRIER_bit                                       = 1 << 31, */
+    SQ_TEX_WORD1                                          = 0x00008dfc,
+	SQ_TEX_WORD1__DST_GPR_mask                        = 0x7f << 0,
+	SQ_TEX_WORD1__DST_GPR_shift                       = 0,
+	SQ_TEX_WORD1__DST_REL_bit                         = 1 << 7,
+	SQ_TEX_WORD1__DST_SEL_X_mask                      = 0x07 << 9,
+	SQ_TEX_WORD1__DST_SEL_X_shift                     = 9,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	    SQ_SEL_MASK                                   = 0x07,
+	SQ_TEX_WORD1__DST_SEL_Y_mask                      = 0x07 << 12,
+	SQ_TEX_WORD1__DST_SEL_Y_shift                     = 12,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_TEX_WORD1__DST_SEL_Z_mask                      = 0x07 << 15,
+	SQ_TEX_WORD1__DST_SEL_Z_shift                     = 15,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_TEX_WORD1__DST_SEL_W_mask                      = 0x07 << 18,
+	SQ_TEX_WORD1__DST_SEL_W_shift                     = 18,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_TEX_WORD1__LOD_BIAS_mask                       = 0x7f << 21,
+	SQ_TEX_WORD1__LOD_BIAS_shift                      = 21,
+	COORD_TYPE_X_bit                                  = 1 << 28,
+	COORD_TYPE_Y_bit                                  = 1 << 29,
+	COORD_TYPE_Z_bit                                  = 1 << 30,
+	COORD_TYPE_W_bit                                  = 1 << 31,
+    SQ_VTX_WORD0                                          = 0x00008dfc,
+	VTX_INST_mask                                     = 0x1f << 0,
+	VTX_INST_shift                                    = 0,
+	    SQ_VTX_INST_FETCH                             = 0x00,
+	    SQ_VTX_INST_SEMANTIC                          = 0x01,
+	FETCH_TYPE_mask                                   = 0x03 << 5,
+	FETCH_TYPE_shift                                  = 5,
+	    SQ_VTX_FETCH_VERTEX_DATA                      = 0x00,
+	    SQ_VTX_FETCH_INSTANCE_DATA                    = 0x01,
+	    SQ_VTX_FETCH_NO_INDEX_OFFSET                  = 0x02,
+	FETCH_WHOLE_QUAD_bit                              = 1 << 7,
+	BUFFER_ID_mask                                    = 0xff << 8,
+	BUFFER_ID_shift                                   = 8,
+	SRC_GPR_mask                                      = 0x7f << 16,
+	SRC_GPR_shift                                     = 16,
+	SRC_REL_bit                                       = 1 << 23,
+	SQ_VTX_WORD0__SRC_SEL_X_mask                      = 0x03 << 24,
+	SQ_VTX_WORD0__SRC_SEL_X_shift                     = 24,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+	MEGA_FETCH_COUNT_mask                             = 0x3f << 26,
+	MEGA_FETCH_COUNT_shift                            = 26,
+    SQ_CF_ALLOC_EXPORT_WORD1_SWIZ                         = 0x00008dfc,
+	SEL_X_mask                                        = 0x07 << 0,
+	SEL_X_shift                                       = 0,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SEL_Y_mask                                        = 0x07 << 3,
+	SEL_Y_shift                                       = 3,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SEL_Z_mask                                        = 0x07 << 6,
+	SEL_Z_shift                                       = 6,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SEL_W_mask                                        = 0x07 << 9,
+	SEL_W_shift                                       = 9,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+    SQ_ALU_WORD1                                          = 0x00008dfc,
+	ENCODING_mask                                     = 0x07 << 15,
+	ENCODING_shift                                    = 15,
+	BANK_SWIZZLE_mask                                 = 0x07 << 18,
+	BANK_SWIZZLE_shift                                = 18,
+	    SQ_ALU_VEC_012                                = 0x00,
+	    SQ_ALU_VEC_021                                = 0x01,
+	    SQ_ALU_VEC_120                                = 0x02,
+	    SQ_ALU_VEC_102                                = 0x03,
+	    SQ_ALU_VEC_201                                = 0x04,
+	    SQ_ALU_VEC_210                                = 0x05,
+	SQ_ALU_WORD1__DST_GPR_mask                        = 0x7f << 21,
+	SQ_ALU_WORD1__DST_GPR_shift                       = 21,
+	SQ_ALU_WORD1__DST_REL_bit                         = 1 << 28,
+	DST_CHAN_mask                                     = 0x03 << 29,
+	DST_CHAN_shift                                    = 29,
+	    CHAN_X                                        = 0x00,
+	    CHAN_Y                                        = 0x01,
+	    CHAN_Z                                        = 0x02,
+	    CHAN_W                                        = 0x03,
+	SQ_ALU_WORD1__CLAMP_bit                           = 1 << 31,
+    SQ_CF_ALU_WORD0                                       = 0x00008dfc,
+	SQ_CF_ALU_WORD0__ADDR_mask                        = 0x3fffff << 0,
+	SQ_CF_ALU_WORD0__ADDR_shift                       = 0,
+	KCACHE_BANK0_mask                                 = 0x0f << 22,
+	KCACHE_BANK0_shift                                = 22,
+	KCACHE_BANK1_mask                                 = 0x0f << 26,
+	KCACHE_BANK1_shift                                = 26,
+	KCACHE_MODE0_mask                                 = 0x03 << 30,
+	KCACHE_MODE0_shift                                = 30,
+/* 	    SQ_CF_KCACHE_NOP                              = 0x00, */
+/* 	    SQ_CF_KCACHE_LOCK_1                           = 0x01, */
+/* 	    SQ_CF_KCACHE_LOCK_2                           = 0x02, */
+/* 	    SQ_CF_KCACHE_LOCK_LOOP_INDEX                  = 0x03, */
+    SQ_VTX_WORD2                                          = 0x00008dfc,
+	SQ_VTX_WORD2__OFFSET_mask                         = 0xffff << 0,
+	SQ_VTX_WORD2__OFFSET_shift                        = 0,
+	SQ_VTX_WORD2__ENDIAN_SWAP_mask                    = 0x03 << 16,
+	SQ_VTX_WORD2__ENDIAN_SWAP_shift                   = 16,
+	    SQ_ENDIAN_NONE                                = 0x00,
+	    SQ_ENDIAN_8IN16                               = 0x01,
+	    SQ_ENDIAN_8IN32                               = 0x02,
+	CONST_BUF_NO_STRIDE_bit                           = 1 << 18,
+	MEGA_FETCH_bit                                    = 1 << 19,
+	SQ_VTX_WORD2__ALT_CONST_bit                       = 1 << 20,
+    SQ_ALU_WORD1_OP2_V2                                   = 0x00008dfc,
+	SRC0_ABS_bit                                      = 1 << 0,
+	SRC1_ABS_bit                                      = 1 << 1,
+	UPDATE_EXECUTE_MASK_bit                           = 1 << 2,
+	UPDATE_PRED_bit                                   = 1 << 3,
+	WRITE_MASK_bit                                    = 1 << 4,
+	SQ_ALU_WORD1_OP2_V2__OMOD_mask                    = 0x03 << 5,
+	SQ_ALU_WORD1_OP2_V2__OMOD_shift                   = 5,
+	    SQ_ALU_OMOD_OFF                               = 0x00,
+	    SQ_ALU_OMOD_M2                                = 0x01,
+	    SQ_ALU_OMOD_M4                                = 0x02,
+	    SQ_ALU_OMOD_D2                                = 0x03,
+	SQ_ALU_WORD1_OP2_V2__ALU_INST_mask                = 0x7ff << 7,
+	SQ_ALU_WORD1_OP2_V2__ALU_INST_shift               = 7,
+	    SQ_OP2_INST_ADD                               = 0x00,
+	    SQ_OP2_INST_MUL                               = 0x01,
+	    SQ_OP2_INST_MUL_IEEE                          = 0x02,
+	    SQ_OP2_INST_MAX                               = 0x03,
+	    SQ_OP2_INST_MIN                               = 0x04,
+	    SQ_OP2_INST_MAX_DX10                          = 0x05,
+	    SQ_OP2_INST_MIN_DX10                          = 0x06,
+	    SQ_OP2_INST_SETE                              = 0x08,
+	    SQ_OP2_INST_SETGT                             = 0x09,
+	    SQ_OP2_INST_SETGE                             = 0x0a,
+	    SQ_OP2_INST_SETNE                             = 0x0b,
+	    SQ_OP2_INST_SETE_DX10                         = 0x0c,
+	    SQ_OP2_INST_SETGT_DX10                        = 0x0d,
+	    SQ_OP2_INST_SETGE_DX10                        = 0x0e,
+	    SQ_OP2_INST_SETNE_DX10                        = 0x0f,
+	    SQ_OP2_INST_FRACT                             = 0x10,
+	    SQ_OP2_INST_TRUNC                             = 0x11,
+	    SQ_OP2_INST_CEIL                              = 0x12,
+	    SQ_OP2_INST_RNDNE                             = 0x13,
+	    SQ_OP2_INST_FLOOR                             = 0x14,
+	    SQ_OP2_INST_MOVA                              = 0x15,
+	    SQ_OP2_INST_MOVA_FLOOR                        = 0x16,
+	    SQ_OP2_INST_MOVA_INT                          = 0x18,
+	    SQ_OP2_INST_MOV                               = 0x19,
+	    SQ_OP2_INST_NOP                               = 0x1a,
+	    SQ_OP2_INST_PRED_SETGT_UINT                   = 0x1e,
+	    SQ_OP2_INST_PRED_SETGE_UINT                   = 0x1f,
+	    SQ_OP2_INST_PRED_SETE                         = 0x20,
+	    SQ_OP2_INST_PRED_SETGT                        = 0x21,
+	    SQ_OP2_INST_PRED_SETGE                        = 0x22,
+	    SQ_OP2_INST_PRED_SETNE                        = 0x23,
+	    SQ_OP2_INST_PRED_SET_INV                      = 0x24,
+	    SQ_OP2_INST_PRED_SET_POP                      = 0x25,
+	    SQ_OP2_INST_PRED_SET_CLR                      = 0x26,
+	    SQ_OP2_INST_PRED_SET_RESTORE                  = 0x27,
+	    SQ_OP2_INST_PRED_SETE_PUSH                    = 0x28,
+	    SQ_OP2_INST_PRED_SETGT_PUSH                   = 0x29,
+	    SQ_OP2_INST_PRED_SETGE_PUSH                   = 0x2a,
+	    SQ_OP2_INST_PRED_SETNE_PUSH                   = 0x2b,
+	    SQ_OP2_INST_KILLE                             = 0x2c,
+	    SQ_OP2_INST_KILLGT                            = 0x2d,
+	    SQ_OP2_INST_KILLGE                            = 0x2e,
+	    SQ_OP2_INST_KILLNE                            = 0x2f,
+	    SQ_OP2_INST_AND_INT                           = 0x30,
+	    SQ_OP2_INST_OR_INT                            = 0x31,
+	    SQ_OP2_INST_XOR_INT                           = 0x32,
+	    SQ_OP2_INST_NOT_INT                           = 0x33,
+	    SQ_OP2_INST_ADD_INT                           = 0x34,
+	    SQ_OP2_INST_SUB_INT                           = 0x35,
+	    SQ_OP2_INST_MAX_INT                           = 0x36,
+	    SQ_OP2_INST_MIN_INT                           = 0x37,
+	    SQ_OP2_INST_MAX_UINT                          = 0x38,
+	    SQ_OP2_INST_MIN_UINT                          = 0x39,
+	    SQ_OP2_INST_SETE_INT                          = 0x3a,
+	    SQ_OP2_INST_SETGT_INT                         = 0x3b,
+	    SQ_OP2_INST_SETGE_INT                         = 0x3c,
+	    SQ_OP2_INST_SETNE_INT                         = 0x3d,
+	    SQ_OP2_INST_SETGT_UINT                        = 0x3e,
+	    SQ_OP2_INST_SETGE_UINT                        = 0x3f,
+	    SQ_OP2_INST_KILLGT_UINT                       = 0x40,
+	    SQ_OP2_INST_KILLGE_UINT                       = 0x41,
+	    SQ_OP2_INST_PRED_SETE_INT                     = 0x42,
+	    SQ_OP2_INST_PRED_SETGT_INT                    = 0x43,
+	    SQ_OP2_INST_PRED_SETGE_INT                    = 0x44,
+	    SQ_OP2_INST_PRED_SETNE_INT                    = 0x45,
+	    SQ_OP2_INST_KILLE_INT                         = 0x46,
+	    SQ_OP2_INST_KILLGT_INT                        = 0x47,
+	    SQ_OP2_INST_KILLGE_INT                        = 0x48,
+	    SQ_OP2_INST_KILLNE_INT                        = 0x49,
+	    SQ_OP2_INST_PRED_SETE_PUSH_INT                = 0x4a,
+	    SQ_OP2_INST_PRED_SETGT_PUSH_INT               = 0x4b,
+	    SQ_OP2_INST_PRED_SETGE_PUSH_INT               = 0x4c,
+	    SQ_OP2_INST_PRED_SETNE_PUSH_INT               = 0x4d,
+	    SQ_OP2_INST_PRED_SETLT_PUSH_INT               = 0x4e,
+	    SQ_OP2_INST_PRED_SETLE_PUSH_INT               = 0x4f,
+	    SQ_OP2_INST_DOT4                              = 0x50,
+	    SQ_OP2_INST_DOT4_IEEE                         = 0x51,
+	    SQ_OP2_INST_CUBE                              = 0x52,
+	    SQ_OP2_INST_MAX4                              = 0x53,
+	    SQ_OP2_INST_MOVA_GPR_INT                      = 0x60,
+	    SQ_OP2_INST_EXP_IEEE                          = 0x61,
+	    SQ_OP2_INST_LOG_CLAMPED                       = 0x62,
+	    SQ_OP2_INST_LOG_IEEE                          = 0x63,
+	    SQ_OP2_INST_RECIP_CLAMPED                     = 0x64,
+	    SQ_OP2_INST_RECIP_FF                          = 0x65,
+	    SQ_OP2_INST_RECIP_IEEE                        = 0x66,
+	    SQ_OP2_INST_RECIPSQRT_CLAMPED                 = 0x67,
+	    SQ_OP2_INST_RECIPSQRT_FF                      = 0x68,
+	    SQ_OP2_INST_RECIPSQRT_IEEE                    = 0x69,
+	    SQ_OP2_INST_SQRT_IEEE                         = 0x6a,
+	    SQ_OP2_INST_FLT_TO_INT                        = 0x6b,
+	    SQ_OP2_INST_INT_TO_FLT                        = 0x6c,
+	    SQ_OP2_INST_UINT_TO_FLT                       = 0x6d,
+	    SQ_OP2_INST_SIN                               = 0x6e,
+	    SQ_OP2_INST_COS                               = 0x6f,
+	    SQ_OP2_INST_ASHR_INT                          = 0x70,
+	    SQ_OP2_INST_LSHR_INT                          = 0x71,
+	    SQ_OP2_INST_LSHL_INT                          = 0x72,
+	    SQ_OP2_INST_MULLO_INT                         = 0x73,
+	    SQ_OP2_INST_MULHI_INT                         = 0x74,
+	    SQ_OP2_INST_MULLO_UINT                        = 0x75,
+	    SQ_OP2_INST_MULHI_UINT                        = 0x76,
+	    SQ_OP2_INST_RECIP_INT                         = 0x77,
+	    SQ_OP2_INST_RECIP_UINT                        = 0x78,
+	    SQ_OP2_INST_FLT_TO_UINT                       = 0x79,
+    SQ_CF_ALLOC_EXPORT_WORD1_BUF                          = 0x00008dfc,
+	ARRAY_SIZE_mask                                   = 0xfff << 0,
+	ARRAY_SIZE_shift                                  = 0,
+	COMP_MASK_mask                                    = 0x0f << 12,
+	COMP_MASK_shift                                   = 12,
+    SQ_CF_WORD0                                           = 0x00008dfc,
+    SQ_CF_ALLOC_EXPORT_WORD0                              = 0x00008dfc,
+	ARRAY_BASE_mask                                   = 0x1fff << 0,
+	ARRAY_BASE_shift                                  = 0,
+	SQ_CF_ALLOC_EXPORT_WORD0__TYPE_mask               = 0x03 << 13,
+	SQ_CF_ALLOC_EXPORT_WORD0__TYPE_shift              = 13,
+	    SQ_EXPORT_PIXEL                               = 0x00,
+	    SQ_EXPORT_POS                                 = 0x01,
+	    SQ_EXPORT_PARAM                               = 0x02,
+	    X_UNUSED_FOR_SX_EXPORTS                       = 0x03,
+	RW_GPR_mask                                       = 0x7f << 15,
+	RW_GPR_shift                                      = 15,
+	RW_REL_bit                                        = 1 << 22,
+	INDEX_GPR_mask                                    = 0x7f << 23,
+	INDEX_GPR_shift                                   = 23,
+	ELEM_SIZE_mask                                    = 0x03 << 30,
+	ELEM_SIZE_shift                                   = 30,
+    SQ_VTX_WORD1                                          = 0x00008dfc,
+	SQ_VTX_WORD1__DST_SEL_X_mask                      = 0x07 << 9,
+	SQ_VTX_WORD1__DST_SEL_X_shift                     = 9,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_VTX_WORD1__DST_SEL_Y_mask                      = 0x07 << 12,
+	SQ_VTX_WORD1__DST_SEL_Y_shift                     = 12,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_VTX_WORD1__DST_SEL_Z_mask                      = 0x07 << 15,
+	SQ_VTX_WORD1__DST_SEL_Z_shift                     = 15,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_VTX_WORD1__DST_SEL_W_mask                      = 0x07 << 18,
+	SQ_VTX_WORD1__DST_SEL_W_shift                     = 18,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	USE_CONST_FIELDS_bit                              = 1 << 21,
+	SQ_VTX_WORD1__DATA_FORMAT_mask                    = 0x3f << 22,
+	SQ_VTX_WORD1__DATA_FORMAT_shift                   = 22,
+	SQ_VTX_WORD1__NUM_FORMAT_ALL_mask                 = 0x03 << 28,
+	SQ_VTX_WORD1__NUM_FORMAT_ALL_shift                = 28,
+	    SQ_NUM_FORMAT_NORM                            = 0x00,
+	    SQ_NUM_FORMAT_INT                             = 0x01,
+	    SQ_NUM_FORMAT_SCALED                          = 0x02,
+	SQ_VTX_WORD1__FORMAT_COMP_ALL_bit                 = 1 << 30,
+	SQ_VTX_WORD1__SRF_MODE_ALL_bit                    = 1 << 31,
+    SQ_ALU_WORD1_OP2                                      = 0x00008dfc,
+/* 	SRC0_ABS_bit                                      = 1 << 0, */
+/* 	SRC1_ABS_bit                                      = 1 << 1, */
+/* 	UPDATE_EXECUTE_MASK_bit                           = 1 << 2, */
+/* 	UPDATE_PRED_bit                                   = 1 << 3, */
+/* 	WRITE_MASK_bit                                    = 1 << 4, */
+	FOG_MERGE_bit                                     = 1 << 5,
+	SQ_ALU_WORD1_OP2__OMOD_mask                       = 0x03 << 6,
+	SQ_ALU_WORD1_OP2__OMOD_shift                      = 6,
+/* 	    SQ_ALU_OMOD_OFF                               = 0x00, */
+/* 	    SQ_ALU_OMOD_M2                                = 0x01, */
+/* 	    SQ_ALU_OMOD_M4                                = 0x02, */
+/* 	    SQ_ALU_OMOD_D2                                = 0x03, */
+	SQ_ALU_WORD1_OP2__ALU_INST_mask                   = 0x3ff << 8,
+	SQ_ALU_WORD1_OP2__ALU_INST_shift                  = 8,
+/* 	    SQ_OP2_INST_ADD                               = 0x00, */
+/* 	    SQ_OP2_INST_MUL                               = 0x01, */
+/* 	    SQ_OP2_INST_MUL_IEEE                          = 0x02, */
+/* 	    SQ_OP2_INST_MAX                               = 0x03, */
+/* 	    SQ_OP2_INST_MIN                               = 0x04, */
+/* 	    SQ_OP2_INST_MAX_DX10                          = 0x05, */
+/* 	    SQ_OP2_INST_MIN_DX10                          = 0x06, */
+/* 	    SQ_OP2_INST_SETE                              = 0x08, */
+/* 	    SQ_OP2_INST_SETGT                             = 0x09, */
+/* 	    SQ_OP2_INST_SETGE                             = 0x0a, */
+/* 	    SQ_OP2_INST_SETNE                             = 0x0b, */
+/* 	    SQ_OP2_INST_SETE_DX10                         = 0x0c, */
+/* 	    SQ_OP2_INST_SETGT_DX10                        = 0x0d, */
+/* 	    SQ_OP2_INST_SETGE_DX10                        = 0x0e, */
+/* 	    SQ_OP2_INST_SETNE_DX10                        = 0x0f, */
+/* 	    SQ_OP2_INST_FRACT                             = 0x10, */
+/* 	    SQ_OP2_INST_TRUNC                             = 0x11, */
+/* 	    SQ_OP2_INST_CEIL                              = 0x12, */
+/* 	    SQ_OP2_INST_RNDNE                             = 0x13, */
+/* 	    SQ_OP2_INST_FLOOR                             = 0x14, */
+/* 	    SQ_OP2_INST_MOVA                              = 0x15, */
+/* 	    SQ_OP2_INST_MOVA_FLOOR                        = 0x16, */
+/* 	    SQ_OP2_INST_MOVA_INT                          = 0x18, */
+/* 	    SQ_OP2_INST_MOV                               = 0x19, */
+/* 	    SQ_OP2_INST_NOP                               = 0x1a, */
+/* 	    SQ_OP2_INST_PRED_SETGT_UINT                   = 0x1e, */
+/* 	    SQ_OP2_INST_PRED_SETGE_UINT                   = 0x1f, */
+/* 	    SQ_OP2_INST_PRED_SETE                         = 0x20, */
+/* 	    SQ_OP2_INST_PRED_SETGT                        = 0x21, */
+/* 	    SQ_OP2_INST_PRED_SETGE                        = 0x22, */
+/* 	    SQ_OP2_INST_PRED_SETNE                        = 0x23, */
+/* 	    SQ_OP2_INST_PRED_SET_INV                      = 0x24, */
+/* 	    SQ_OP2_INST_PRED_SET_POP                      = 0x25, */
+/* 	    SQ_OP2_INST_PRED_SET_CLR                      = 0x26, */
+/* 	    SQ_OP2_INST_PRED_SET_RESTORE                  = 0x27, */
+/* 	    SQ_OP2_INST_PRED_SETE_PUSH                    = 0x28, */
+/* 	    SQ_OP2_INST_PRED_SETGT_PUSH                   = 0x29, */
+/* 	    SQ_OP2_INST_PRED_SETGE_PUSH                   = 0x2a, */
+/* 	    SQ_OP2_INST_PRED_SETNE_PUSH                   = 0x2b, */
+/* 	    SQ_OP2_INST_KILLE                             = 0x2c, */
+/* 	    SQ_OP2_INST_KILLGT                            = 0x2d, */
+/* 	    SQ_OP2_INST_KILLGE                            = 0x2e, */
+/* 	    SQ_OP2_INST_KILLNE                            = 0x2f, */
+/* 	    SQ_OP2_INST_AND_INT                           = 0x30, */
+/* 	    SQ_OP2_INST_OR_INT                            = 0x31, */
+/* 	    SQ_OP2_INST_XOR_INT                           = 0x32, */
+/* 	    SQ_OP2_INST_NOT_INT                           = 0x33, */
+/* 	    SQ_OP2_INST_ADD_INT                           = 0x34, */
+/* 	    SQ_OP2_INST_SUB_INT                           = 0x35, */
+/* 	    SQ_OP2_INST_MAX_INT                           = 0x36, */
+/* 	    SQ_OP2_INST_MIN_INT                           = 0x37, */
+/* 	    SQ_OP2_INST_MAX_UINT                          = 0x38, */
+/* 	    SQ_OP2_INST_MIN_UINT                          = 0x39, */
+/* 	    SQ_OP2_INST_SETE_INT                          = 0x3a, */
+/* 	    SQ_OP2_INST_SETGT_INT                         = 0x3b, */
+/* 	    SQ_OP2_INST_SETGE_INT                         = 0x3c, */
+/* 	    SQ_OP2_INST_SETNE_INT                         = 0x3d, */
+/* 	    SQ_OP2_INST_SETGT_UINT                        = 0x3e, */
+/* 	    SQ_OP2_INST_SETGE_UINT                        = 0x3f, */
+/* 	    SQ_OP2_INST_KILLGT_UINT                       = 0x40, */
+/* 	    SQ_OP2_INST_KILLGE_UINT                       = 0x41, */
+/* 	    SQ_OP2_INST_PRED_SETE_INT                     = 0x42, */
+/* 	    SQ_OP2_INST_PRED_SETGT_INT                    = 0x43, */
+/* 	    SQ_OP2_INST_PRED_SETGE_INT                    = 0x44, */
+/* 	    SQ_OP2_INST_PRED_SETNE_INT                    = 0x45, */
+/* 	    SQ_OP2_INST_KILLE_INT                         = 0x46, */
+/* 	    SQ_OP2_INST_KILLGT_INT                        = 0x47, */
+/* 	    SQ_OP2_INST_KILLGE_INT                        = 0x48, */
+/* 	    SQ_OP2_INST_KILLNE_INT                        = 0x49, */
+/* 	    SQ_OP2_INST_PRED_SETE_PUSH_INT                = 0x4a, */
+/* 	    SQ_OP2_INST_PRED_SETGT_PUSH_INT               = 0x4b, */
+/* 	    SQ_OP2_INST_PRED_SETGE_PUSH_INT               = 0x4c, */
+/* 	    SQ_OP2_INST_PRED_SETNE_PUSH_INT               = 0x4d, */
+/* 	    SQ_OP2_INST_PRED_SETLT_PUSH_INT               = 0x4e, */
+/* 	    SQ_OP2_INST_PRED_SETLE_PUSH_INT               = 0x4f, */
+/* 	    SQ_OP2_INST_DOT4                              = 0x50, */
+/* 	    SQ_OP2_INST_DOT4_IEEE                         = 0x51, */
+/* 	    SQ_OP2_INST_CUBE                              = 0x52, */
+/* 	    SQ_OP2_INST_MAX4                              = 0x53, */
+/* 	    SQ_OP2_INST_MOVA_GPR_INT                      = 0x60, */
+/* 	    SQ_OP2_INST_EXP_IEEE                          = 0x61, */
+/* 	    SQ_OP2_INST_LOG_CLAMPED                       = 0x62, */
+/* 	    SQ_OP2_INST_LOG_IEEE                          = 0x63, */
+/* 	    SQ_OP2_INST_RECIP_CLAMPED                     = 0x64, */
+/* 	    SQ_OP2_INST_RECIP_FF                          = 0x65, */
+/* 	    SQ_OP2_INST_RECIP_IEEE                        = 0x66, */
+/* 	    SQ_OP2_INST_RECIPSQRT_CLAMPED                 = 0x67, */
+/* 	    SQ_OP2_INST_RECIPSQRT_FF                      = 0x68, */
+/* 	    SQ_OP2_INST_RECIPSQRT_IEEE                    = 0x69, */
+/* 	    SQ_OP2_INST_SQRT_IEEE                         = 0x6a, */
+/* 	    SQ_OP2_INST_FLT_TO_INT                        = 0x6b, */
+/* 	    SQ_OP2_INST_INT_TO_FLT                        = 0x6c, */
+/* 	    SQ_OP2_INST_UINT_TO_FLT                       = 0x6d, */
+/* 	    SQ_OP2_INST_SIN                               = 0x6e, */
+/* 	    SQ_OP2_INST_COS                               = 0x6f, */
+/* 	    SQ_OP2_INST_ASHR_INT                          = 0x70, */
+/* 	    SQ_OP2_INST_LSHR_INT                          = 0x71, */
+/* 	    SQ_OP2_INST_LSHL_INT                          = 0x72, */
+/* 	    SQ_OP2_INST_MULLO_INT                         = 0x73, */
+/* 	    SQ_OP2_INST_MULHI_INT                         = 0x74, */
+/* 	    SQ_OP2_INST_MULLO_UINT                        = 0x75, */
+/* 	    SQ_OP2_INST_MULHI_UINT                        = 0x76, */
+/* 	    SQ_OP2_INST_RECIP_INT                         = 0x77, */
+/* 	    SQ_OP2_INST_RECIP_UINT                        = 0x78, */
+/* 	    SQ_OP2_INST_FLT_TO_UINT                       = 0x79, */
+    SQ_CF_WORD1                                           = 0x00008dfc,
+	POP_COUNT_mask                                    = 0x07 << 0,
+	POP_COUNT_shift                                   = 0,
+	CF_CONST_mask                                     = 0x1f << 3,
+	CF_CONST_shift                                    = 3,
+	COND_mask                                         = 0x03 << 8,
+	COND_shift                                        = 8,
+	    SQ_CF_COND_ACTIVE                             = 0x00,
+	    SQ_CF_COND_FALSE                              = 0x01,
+	    SQ_CF_COND_BOOL                               = 0x02,
+	    SQ_CF_COND_NOT_BOOL                           = 0x03,
+	SQ_CF_WORD1__COUNT_mask                           = 0x07 << 10,
+	SQ_CF_WORD1__COUNT_shift                          = 10,
+	CALL_COUNT_mask                                   = 0x3f << 13,
+	CALL_COUNT_shift                                  = 13,
+	COUNT_3_bit                                       = 1 << 19,
+/* 	END_OF_PROGRAM_bit                                = 1 << 21, */
+/* 	VALID_PIXEL_MODE_bit                              = 1 << 22, */
+	SQ_CF_WORD1__CF_INST_mask                         = 0x7f << 23,
+	SQ_CF_WORD1__CF_INST_shift                        = 23,
+	    SQ_CF_INST_NOP                                = 0x00,
+	    SQ_CF_INST_TEX                                = 0x01,
+	    SQ_CF_INST_VTX                                = 0x02,
+	    SQ_CF_INST_VTX_TC                             = 0x03,
+	    SQ_CF_INST_LOOP_START                         = 0x04,
+	    SQ_CF_INST_LOOP_END                           = 0x05,
+	    SQ_CF_INST_LOOP_START_DX10                    = 0x06,
+	    SQ_CF_INST_LOOP_START_NO_AL                   = 0x07,
+	    SQ_CF_INST_LOOP_CONTINUE                      = 0x08,
+	    SQ_CF_INST_LOOP_BREAK                         = 0x09,
+	    SQ_CF_INST_JUMP                               = 0x0a,
+	    SQ_CF_INST_PUSH                               = 0x0b,
+	    SQ_CF_INST_PUSH_ELSE                          = 0x0c,
+	    SQ_CF_INST_ELSE                               = 0x0d,
+	    SQ_CF_INST_POP                                = 0x0e,
+	    SQ_CF_INST_POP_JUMP                           = 0x0f,
+	    SQ_CF_INST_POP_PUSH                           = 0x10,
+	    SQ_CF_INST_POP_PUSH_ELSE                      = 0x11,
+	    SQ_CF_INST_CALL                               = 0x12,
+	    SQ_CF_INST_CALL_FS                            = 0x13,
+	    SQ_CF_INST_RETURN                             = 0x14,
+	    SQ_CF_INST_EMIT_VERTEX                        = 0x15,
+	    SQ_CF_INST_EMIT_CUT_VERTEX                    = 0x16,
+	    SQ_CF_INST_CUT_VERTEX                         = 0x17,
+	    SQ_CF_INST_KILL                               = 0x18,
+/* 	WHOLE_QUAD_MODE_bit                               = 1 << 30, */
+/* 	BARRIER_bit                                       = 1 << 31, */
+    SQ_VTX_WORD1_SEM                                      = 0x00008dfc,
+	SEMANTIC_ID_mask                                  = 0xff << 0,
+	SEMANTIC_ID_shift                                 = 0,
+    SQ_TEX_WORD0                                          = 0x00008dfc,
+	TEX_INST_mask                                     = 0x1f << 0,
+	TEX_INST_shift                                    = 0,
+	    SQ_TEX_INST_VTX_FETCH                         = 0x00,
+	    SQ_TEX_INST_VTX_SEMANTIC                      = 0x01,
+	    SQ_TEX_INST_LD                                = 0x03,
+	    SQ_TEX_INST_GET_TEXTURE_RESINFO               = 0x04,
+	    SQ_TEX_INST_GET_NUMBER_OF_SAMPLES             = 0x05,
+	    SQ_TEX_INST_GET_LOD                           = 0x06,
+	    SQ_TEX_INST_GET_GRADIENTS_H                   = 0x07,
+	    SQ_TEX_INST_GET_GRADIENTS_V                   = 0x08,
+	    SQ_TEX_INST_GET_LERP                          = 0x09,
+	    SQ_TEX_INST_RESERVED_10                       = 0x0a,
+	    SQ_TEX_INST_SET_GRADIENTS_H                   = 0x0b,
+	    SQ_TEX_INST_SET_GRADIENTS_V                   = 0x0c,
+	    SQ_TEX_INST_PASS                              = 0x0d,
+	    X_Z_SET_INDEX_FOR_ARRAY_OF_CUBEMAPS           = 0x0e,
+	    SQ_TEX_INST_SAMPLE                            = 0x10,
+	    SQ_TEX_INST_SAMPLE_L                          = 0x11,
+	    SQ_TEX_INST_SAMPLE_LB                         = 0x12,
+	    SQ_TEX_INST_SAMPLE_LZ                         = 0x13,
+	    SQ_TEX_INST_SAMPLE_G                          = 0x14,
+	    SQ_TEX_INST_SAMPLE_G_L                        = 0x15,
+	    SQ_TEX_INST_SAMPLE_G_LB                       = 0x16,
+	    SQ_TEX_INST_SAMPLE_G_LZ                       = 0x17,
+	    SQ_TEX_INST_SAMPLE_C                          = 0x18,
+	    SQ_TEX_INST_SAMPLE_C_L                        = 0x19,
+	    SQ_TEX_INST_SAMPLE_C_LB                       = 0x1a,
+	    SQ_TEX_INST_SAMPLE_C_LZ                       = 0x1b,
+	    SQ_TEX_INST_SAMPLE_C_G                        = 0x1c,
+	    SQ_TEX_INST_SAMPLE_C_G_L                      = 0x1d,
+	    SQ_TEX_INST_SAMPLE_C_G_LB                     = 0x1e,
+	    SQ_TEX_INST_SAMPLE_C_G_LZ                     = 0x1f,
+	BC_FRAC_MODE_bit                                  = 1 << 5,
+/* 	FETCH_WHOLE_QUAD_bit                              = 1 << 7, */
+	RESOURCE_ID_mask                                  = 0xff << 8,
+	RESOURCE_ID_shift                                 = 8,
+/* 	SRC_GPR_mask                                      = 0x7f << 16, */
+/* 	SRC_GPR_shift                                     = 16, */
+/* 	SRC_REL_bit                                       = 1 << 23, */
+	SQ_TEX_WORD0__ALT_CONST_bit                       = 1 << 24,
+    SQ_VTX_WORD1_GPR                                      = 0x00008dfc,
+	SQ_VTX_WORD1_GPR__DST_GPR_mask                    = 0x7f << 0,
+	SQ_VTX_WORD1_GPR__DST_GPR_shift                   = 0,
+	SQ_VTX_WORD1_GPR__DST_REL_bit                     = 1 << 7,
+    SQ_ALU_WORD0                                          = 0x00008dfc,
+	SRC0_SEL_mask                                     = 0x1ff << 0,
+	SRC0_SEL_shift                                    = 0,
+/* 	    SQ_ALU_SRC_0                                  = 0xf8, */
+/* 	    SQ_ALU_SRC_1                                  = 0xf9, */
+/* 	    SQ_ALU_SRC_1_INT                              = 0xfa, */
+/* 	    SQ_ALU_SRC_M_1_INT                            = 0xfb, */
+/* 	    SQ_ALU_SRC_0_5                                = 0xfc, */
+/* 	    SQ_ALU_SRC_LITERAL                            = 0xfd, */
+/* 	    SQ_ALU_SRC_PV                                 = 0xfe, */
+/* 	    SQ_ALU_SRC_PS                                 = 0xff, */
+	SRC0_REL_bit                                      = 1 << 9,
+	SRC0_CHAN_mask                                    = 0x03 << 10,
+	SRC0_CHAN_shift                                   = 10,
+/* 	    SQ_CHAN_X                                     = 0x00, */
+/* 	    SQ_CHAN_Y                                     = 0x01, */
+/* 	    SQ_CHAN_Z                                     = 0x02, */
+/* 	    SQ_CHAN_W                                     = 0x03, */
+	SRC0_NEG_bit                                      = 1 << 12,
+	SRC1_SEL_mask                                     = 0x1ff << 13,
+	SRC1_SEL_shift                                    = 13,
+/* 	    SQ_ALU_SRC_0                                  = 0xf8, */
+/* 	    SQ_ALU_SRC_1                                  = 0xf9, */
+/* 	    SQ_ALU_SRC_1_INT                              = 0xfa, */
+/* 	    SQ_ALU_SRC_M_1_INT                            = 0xfb, */
+/* 	    SQ_ALU_SRC_0_5                                = 0xfc, */
+/* 	    SQ_ALU_SRC_LITERAL                            = 0xfd, */
+/* 	    SQ_ALU_SRC_PV                                 = 0xfe, */
+/* 	    SQ_ALU_SRC_PS                                 = 0xff, */
+	SRC1_REL_bit                                      = 1 << 22,
+	SRC1_CHAN_mask                                    = 0x03 << 23,
+	SRC1_CHAN_shift                                   = 23,
+/* 	    SQ_CHAN_X                                     = 0x00, */
+/* 	    SQ_CHAN_Y                                     = 0x01, */
+/* 	    SQ_CHAN_Z                                     = 0x02, */
+/* 	    SQ_CHAN_W                                     = 0x03, */
+	SRC1_NEG_bit                                      = 1 << 25,
+	INDEX_MODE_mask                                   = 0x07 << 26,
+	INDEX_MODE_shift                                  = 26,
+	    SQ_INDEX_AR_X                                 = 0x00,
+	    SQ_INDEX_AR_Y                                 = 0x01,
+	    SQ_INDEX_AR_Z                                 = 0x02,
+	    SQ_INDEX_AR_W                                 = 0x03,
+	    SQ_INDEX_LOOP                                 = 0x04,
+	PRED_SEL_mask                                     = 0x03 << 29,
+	PRED_SEL_shift                                    = 29,
+	    SQ_PRED_SEL_OFF                               = 0x00,
+	    SQ_PRED_SEL_ZERO                              = 0x02,
+	    SQ_PRED_SEL_ONE                               = 0x03,
+	LAST_bit                                          = 1 << 31,
+    SX_EXPORT_BUFFER_SIZES                                = 0x0000900c,
+	COLOR_BUFFER_SIZE_mask                            = 0xff << 0,
+	COLOR_BUFFER_SIZE_shift                           = 0,
+	POSITION_BUFFER_SIZE_mask                         = 0xff << 8,
+	POSITION_BUFFER_SIZE_shift                        = 8,
+	SMX_BUFFER_SIZE_mask                              = 0xff << 16,
+	SMX_BUFFER_SIZE_shift                             = 16,
+    SX_MEMORY_EXPORT_BASE                                 = 0x00009010,
+    SX_MEMORY_EXPORT_SIZE                                 = 0x00009014,
+    SPI_CONFIG_CNTL                                       = 0x00009100,
+	GPR_WRITE_PRIORITY_mask                           = 0x1f << 0,
+	GPR_WRITE_PRIORITY_shift                          = 0,
+	    X_PRIORITY_ORDER                              = 0x00,
+	    X_PRIORITY_ORDER_VS                           = 0x01,
+	DISABLE_INTERP_1_bit                              = 1 << 5,
+	DEBUG_THREAD_TYPE_SEL_mask                        = 0x03 << 6,
+	DEBUG_THREAD_TYPE_SEL_shift                       = 6,
+	DEBUG_GROUP_SEL_mask                              = 0x1f << 8,
+	DEBUG_GROUP_SEL_shift                             = 8,
+	DEBUG_GRBM_OVERRIDE_bit                           = 1 << 13,
+    SPI_CONFIG_CNTL_1                                     = 0x0000913c,
+	VTX_DONE_DELAY_mask                               = 0x0f << 0,
+	VTX_DONE_DELAY_shift                              = 0,
+	    X_DELAY_10_CLKS                               = 0x00,
+	    X_DELAY_11_CLKS                               = 0x01,
+	    X_DELAY_12_CLKS                               = 0x02,
+	    X_DELAY_13_CLKS                               = 0x03,
+	    X_DELAY_14_CLKS                               = 0x04,
+	    X_DELAY_15_CLKS                               = 0x05,
+	    X_DELAY_16_CLKS                               = 0x06,
+	    X_DELAY_17_CLKS                               = 0x07,
+	    X_DELAY_2_CLKS                                = 0x08,
+	    X_DELAY_3_CLKS                                = 0x09,
+	    X_DELAY_4_CLKS                                = 0x0a,
+	    X_DELAY_5_CLKS                                = 0x0b,
+	    X_DELAY_6_CLKS                                = 0x0c,
+	    X_DELAY_7_CLKS                                = 0x0d,
+	    X_DELAY_8_CLKS                                = 0x0e,
+	    X_DELAY_9_CLKS                                = 0x0f,
+	INTERP_ONE_PRIM_PER_ROW_bit                       = 1 << 4,
+    TD_FILTER4                                            = 0x00009400,
+	WEIGHT_1_mask                                     = 0x7ff << 0,
+	WEIGHT_1_shift                                    = 0,
+	WEIGHT_0_mask                                     = 0x7ff << 11,
+	WEIGHT_0_shift                                    = 11,
+	WEIGHT_PAIR_bit                                   = 1 << 22,
+	PHASE_mask                                        = 0x0f << 23,
+	PHASE_shift                                       = 23,
+	DIRECTION_bit                                     = 1 << 27,
+    TD_FILTER4_1                                          = 0x00009404,
+	TD_FILTER4_1_num                                  = 35,
+/* 	WEIGHT_1_mask                                     = 0x7ff << 0, */
+/* 	WEIGHT_1_shift                                    = 0, */
+/* 	WEIGHT_0_mask                                     = 0x7ff << 11, */
+/* 	WEIGHT_0_shift                                    = 11, */
+    TD_CNTL                                               = 0x00009490,
+	SYNC_PHASE_SH_mask                                = 0x03 << 0,
+	SYNC_PHASE_SH_shift                               = 0,
+	SYNC_PHASE_VC_SMX_mask                            = 0x03 << 4,
+	SYNC_PHASE_VC_SMX_shift                           = 4,
+    TD0_CNTL                                              = 0x00009494,
+	TD0_CNTL_num                                      = 4,
+	ID_OVERRIDE_mask                                  = 0x03 << 28,
+	ID_OVERRIDE_shift                                 = 28,
+    TD0_STATUS                                            = 0x000094a4,
+	TD0_STATUS_num                                    = 4,
+	BUSY_bit                                          = 1 << 31,
+    TA_CNTL                                               = 0x00009504,
+	GRADIENT_CREDIT_mask                              = 0x1f << 0,
+	GRADIENT_CREDIT_shift                             = 0,
+	WALKER_CREDIT_mask                                = 0x1f << 8,
+	WALKER_CREDIT_shift                               = 8,
+	ALIGNER_CREDIT_mask                               = 0x1f << 16,
+	ALIGNER_CREDIT_shift                              = 16,
+	TD_FIFO_CREDIT_mask                               = 0x3ff << 22,
+	TD_FIFO_CREDIT_shift                              = 22,
+    TA_CNTL_AUX                                           = 0x00009508,
+	DISABLE_CUBE_WRAP_bit                             = 1 << 0,
+	SYNC_GRADIENT_bit                                 = 1 << 24,
+	SYNC_WALKER_bit                                   = 1 << 25,
+	SYNC_ALIGNER_bit                                  = 1 << 26,
+	BILINEAR_PRECISION_bit                            = 1 << 31,
+    TA0_CNTL                                              = 0x00009510,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA1_CNTL                                              = 0x00009514,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA2_CNTL                                              = 0x00009518,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA3_CNTL                                              = 0x0000951c,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA0_STATUS                                            = 0x00009520,
+	FG_PFIFO_EMPTYB_bit                               = 1 << 12,
+	FG_LFIFO_EMPTYB_bit                               = 1 << 13,
+	FG_SFIFO_EMPTYB_bit                               = 1 << 14,
+	FL_PFIFO_EMPTYB_bit                               = 1 << 16,
+	FL_LFIFO_EMPTYB_bit                               = 1 << 17,
+	FL_SFIFO_EMPTYB_bit                               = 1 << 18,
+	FA_PFIFO_EMPTYB_bit                               = 1 << 20,
+	FA_LFIFO_EMPTYB_bit                               = 1 << 21,
+	FA_SFIFO_EMPTYB_bit                               = 1 << 22,
+	IN_BUSY_bit                                       = 1 << 24,
+	FG_BUSY_bit                                       = 1 << 25,
+	FL_BUSY_bit                                       = 1 << 27,
+	TA_BUSY_bit                                       = 1 << 28,
+	FA_BUSY_bit                                       = 1 << 29,
+	AL_BUSY_bit                                       = 1 << 30,
+/* 	BUSY_bit                                          = 1 << 31, */
+    TA1_STATUS                                            = 0x00009524,
+/* 	FG_PFIFO_EMPTYB_bit                               = 1 << 12, */
+/* 	FG_LFIFO_EMPTYB_bit                               = 1 << 13, */
+/* 	FG_SFIFO_EMPTYB_bit                               = 1 << 14, */
+/* 	FL_PFIFO_EMPTYB_bit                               = 1 << 16, */
+/* 	FL_LFIFO_EMPTYB_bit                               = 1 << 17, */
+/* 	FL_SFIFO_EMPTYB_bit                               = 1 << 18, */
+/* 	FA_PFIFO_EMPTYB_bit                               = 1 << 20, */
+/* 	FA_LFIFO_EMPTYB_bit                               = 1 << 21, */
+/* 	FA_SFIFO_EMPTYB_bit                               = 1 << 22, */
+/* 	IN_BUSY_bit                                       = 1 << 24, */
+/* 	FG_BUSY_bit                                       = 1 << 25, */
+/* 	FL_BUSY_bit                                       = 1 << 27, */
+/* 	TA_BUSY_bit                                       = 1 << 28, */
+/* 	FA_BUSY_bit                                       = 1 << 29, */
+/* 	AL_BUSY_bit                                       = 1 << 30, */
+/* 	BUSY_bit                                          = 1 << 31, */
+    TA2_STATUS                                            = 0x00009528,
+/* 	FG_PFIFO_EMPTYB_bit                               = 1 << 12, */
+/* 	FG_LFIFO_EMPTYB_bit                               = 1 << 13, */
+/* 	FG_SFIFO_EMPTYB_bit                               = 1 << 14, */
+/* 	FL_PFIFO_EMPTYB_bit                               = 1 << 16, */
+/* 	FL_LFIFO_EMPTYB_bit                               = 1 << 17, */
+/* 	FL_SFIFO_EMPTYB_bit                               = 1 << 18, */
+/* 	FA_PFIFO_EMPTYB_bit                               = 1 << 20, */
+/* 	FA_LFIFO_EMPTYB_bit                               = 1 << 21, */
+/* 	FA_SFIFO_EMPTYB_bit                               = 1 << 22, */
+/* 	IN_BUSY_bit                                       = 1 << 24, */
+/* 	FG_BUSY_bit                                       = 1 << 25, */
+/* 	FL_BUSY_bit                                       = 1 << 27, */
+/* 	TA_BUSY_bit                                       = 1 << 28, */
+/* 	FA_BUSY_bit                                       = 1 << 29, */
+/* 	AL_BUSY_bit                                       = 1 << 30, */
+/* 	BUSY_bit                                          = 1 << 31, */
+    TA3_STATUS                                            = 0x0000952c,
+/* 	FG_PFIFO_EMPTYB_bit                               = 1 << 12, */
+/* 	FG_LFIFO_EMPTYB_bit                               = 1 << 13, */
+/* 	FG_SFIFO_EMPTYB_bit                               = 1 << 14, */
+/* 	FL_PFIFO_EMPTYB_bit                               = 1 << 16, */
+/* 	FL_LFIFO_EMPTYB_bit                               = 1 << 17, */
+/* 	FL_SFIFO_EMPTYB_bit                               = 1 << 18, */
+/* 	FA_PFIFO_EMPTYB_bit                               = 1 << 20, */
+/* 	FA_LFIFO_EMPTYB_bit                               = 1 << 21, */
+/* 	FA_SFIFO_EMPTYB_bit                               = 1 << 22, */
+/* 	IN_BUSY_bit                                       = 1 << 24, */
+/* 	FG_BUSY_bit                                       = 1 << 25, */
+/* 	FL_BUSY_bit                                       = 1 << 27, */
+/* 	TA_BUSY_bit                                       = 1 << 28, */
+/* 	FA_BUSY_bit                                       = 1 << 29, */
+/* 	AL_BUSY_bit                                       = 1 << 30, */
+/* 	BUSY_bit                                          = 1 << 31, */
+    TC_STATUS                                             = 0x00009600,
+	TC_BUSY_bit                                       = 1 << 0,
+    TC_INVALIDATE                                         = 0x00009604,
+	START_bit                                         = 1 << 0,
+    TC_CNTL                                               = 0x00009608,
+	FORCE_HIT_bit                                     = 1 << 0,
+	FORCE_MISS_bit                                    = 1 << 1,
+	L2_SIZE_mask                                      = 0x0f << 5,
+	L2_SIZE_shift                                     = 5,
+	    _256K                                         = 0x00,
+	    _224K                                         = 0x01,
+	    _192K                                         = 0x02,
+	    _160K                                         = 0x03,
+	    _128K                                         = 0x04,
+	    _96K                                          = 0x05,
+	    _64K                                          = 0x06,
+	    _32K                                          = 0x07,
+	L2_DISABLE_LATE_HIT_bit                           = 1 << 9,
+	DISABLE_VERT_PERF_bit                             = 1 << 10,
+	DISABLE_INVAL_BUSY_bit                            = 1 << 11,
+	DISABLE_INVAL_SAME_SURFACE_bit                    = 1 << 12,
+	PARTITION_MODE_mask                               = 0x03 << 13,
+	PARTITION_MODE_shift                              = 13,
+	    X_VERTEX                                      = 0x00,
+	MISS_ARB_MODE_bit                                 = 1 << 15,
+	HIT_ARB_MODE_bit                                  = 1 << 16,
+	DISABLE_WRITE_DELAY_bit                           = 1 << 17,
+	HIT_FIFO_DEPTH_bit                                = 1 << 18,
+    VC_CNTL                                               = 0x00009700,
+	L2_INVALIDATE_bit                                 = 1 << 0,
+	RESERVED_bit                                      = 1 << 1,
+	CC_FORCE_MISS_bit                                 = 1 << 2,
+	MI_CHAN_SEL_mask                                  = 0x03 << 3,
+	MI_CHAN_SEL_shift                                 = 3,
+	    X_MC0_USES_CH_0_1                             = 0x00,
+	    X_MC0_USES_CH_0_3                             = 0x01,
+	    X_VC_MC0_IS_ACTIVE                            = 0x02,
+	    X_VC_MC1_IS_DISABLED                          = 0x03,
+	MI_STEER_DISABLE_bit                              = 1 << 5,
+	MI_CREDIT_CTR_mask                                = 0x0f << 6,
+	MI_CREDIT_CTR_shift                               = 6,
+	MI_CREDIT_WE_bit                                  = 1 << 10,
+	MI_REQ_STALL_THLD_mask                            = 0x07 << 11,
+	MI_REQ_STALL_THLD_shift                           = 11,
+	    X_LATENCY_EXCEEDS_399_CLOCKS                  = 0x00,
+	    X_LATENCY_EXCEEDS_415_CLOCKS                  = 0x01,
+	    X_LATENCY_EXCEEDS_431_CLOCKS                  = 0x02,
+	    X_LATENCY_EXCEEDS_447_CLOCKS                  = 0x03,
+	    X_LATENCY_EXCEEDS_463_CLOCKS                  = 0x04,
+	    X_LATENCY_EXCEEDS_479_CLOCKS                  = 0x05,
+	    X_LATENCY_EXCEEDS_495_CLOCKS                  = 0x06,
+	    X_LATENCY_EXCEEDS_511_CLOCKS                  = 0x07,
+	VC_CNTL__MI_TIMESTAMP_RES_mask                    = 0x1f << 14,
+	VC_CNTL__MI_TIMESTAMP_RES_shift                   = 14,
+	    X_1X_SYSTEM_CLOCK                             = 0x00,
+	    X_2X_SYSTEM_CLOCK                             = 0x01,
+	    X_4X_SYSTEM_CLOCK                             = 0x02,
+	    X_8X_SYSTEM_CLOCK                             = 0x03,
+	    X_16X_SYSTEM_CLOCK                            = 0x04,
+	    X_32X_SYSTEM_CLOCK                            = 0x05,
+	    X_64X_SYSTEM_CLOCK                            = 0x06,
+	    X_128X_SYSTEM_CLOCK                           = 0x07,
+	    X_256X_SYSTEM_CLOCK                           = 0x08,
+	    X_512X_SYSTEM_CLOCK                           = 0x09,
+	    X_1024X_SYSTEM_CLOCK                          = 0x0a,
+	    X_2048X_SYSTEM_CLOCK                          = 0x0b,
+	    X_4092X_SYSTEM_CLOCK                          = 0x0c,
+	    X_8192X_SYSTEM_CLOCK                          = 0x0d,
+	    X_16384X_SYSTEM_CLOCK                         = 0x0e,
+	    X_32768X_SYSTEM_CLOCK                         = 0x0f,
+    VC_CNTL_STATUS                                        = 0x00009704,
+	RP_BUSY_bit                                       = 1 << 0,
+	RG_BUSY_bit                                       = 1 << 1,
+	VC_BUSY_bit                                       = 1 << 2,
+	CLAMP_DETECT_bit                                  = 1 << 3,
+    VC_CONFIG                                             = 0x00009718,
+	WRITE_DIS_bit                                     = 1 << 0,
+	GPR_DATA_PHASE_ADJ_mask                           = 0x07 << 1,
+	GPR_DATA_PHASE_ADJ_shift                          = 1,
+	    X_LATENCY_BASE_0_CYCLES                       = 0x00,
+	    X_LATENCY_BASE_1_CYCLES                       = 0x01,
+	    X_LATENCY_BASE_2_CYCLES                       = 0x02,
+	    X_LATENCY_BASE_3_CYCLES                       = 0x03,
+	TD_SIMD_SYNC_ADJ_mask                             = 0x07 << 4,
+	TD_SIMD_SYNC_ADJ_shift                            = 4,
+	    X_0_CYCLES_DELAY                              = 0x00,
+	    X_1_CYCLES_DELAY                              = 0x01,
+	    X_2_CYCLES_DELAY                              = 0x02,
+	    X_3_CYCLES_DELAY                              = 0x03,
+	    X_4_CYCLES_DELAY                              = 0x04,
+	    X_5_CYCLES_DELAY                              = 0x05,
+	    X_6_CYCLES_DELAY                              = 0x06,
+	    X_7_CYCLES_DELAY                              = 0x07,
+    SMX_DC_CTL0                                           = 0x0000a020,
+	WR_GATHER_STREAM0_bit                             = 1 << 0,
+	WR_GATHER_STREAM1_bit                             = 1 << 1,
+	WR_GATHER_STREAM2_bit                             = 1 << 2,
+	WR_GATHER_STREAM3_bit                             = 1 << 3,
+	WR_GATHER_SCRATCH_bit                             = 1 << 4,
+	WR_GATHER_REDUC_BUF_bit                           = 1 << 5,
+	WR_GATHER_RING_BUF_bit                            = 1 << 6,
+	WR_GATHER_F_BUF_bit                               = 1 << 7,
+	DISABLE_CACHES_bit                                = 1 << 8,
+	AUTO_FLUSH_INVAL_EN_bit                           = 1 << 10,
+	AUTO_FLUSH_EN_bit                                 = 1 << 11,
+	AUTO_FLUSH_CNT_mask                               = 0xffff << 12,
+	AUTO_FLUSH_CNT_shift                              = 12,
+	MC_RD_STALL_FACTOR_mask                           = 0x03 << 28,
+	MC_RD_STALL_FACTOR_shift                          = 28,
+	MC_WR_STALL_FACTOR_mask                           = 0x03 << 30,
+	MC_WR_STALL_FACTOR_shift                          = 30,
+    SMX_DC_CTL1                                           = 0x0000a024,
+	OP_FIFO_SKID_mask                                 = 0x7f << 0,
+	OP_FIFO_SKID_shift                                = 0,
+	CACHE_LINE_SIZE_bit                               = 1 << 8,
+	MULTI_FLUSH_MODE_bit                              = 1 << 9,
+	MULTI_FLUSH_REQ_ABORT_IDX_FIFO_SKID_mask          = 0x0f << 10,
+	MULTI_FLUSH_REQ_ABORT_IDX_FIFO_SKID_shift         = 10,
+	DISABLE_WR_GATHER_RD_HIT_FORCE_EVICT_bit          = 1 << 16,
+	DISABLE_WR_GATHER_RD_HIT_COMP_VLDS_CHECK_bit      = 1 << 17,
+	DISABLE_FLUSH_ES_ALSO_INVALS_bit                  = 1 << 18,
+	DISABLE_FLUSH_GS_ALSO_INVALS_bit                  = 1 << 19,
+    SMX_DC_CTL2                                           = 0x0000a028,
+	INVALIDATE_CACHES_bit                             = 1 << 0,
+	CACHES_INVALID_bit                                = 1 << 1,
+	CACHES_DIRTY_bit                                  = 1 << 2,
+	FLUSH_ALL_bit                                     = 1 << 4,
+	FLUSH_GS_THREADS_bit                              = 1 << 8,
+	FLUSH_ES_THREADS_bit                              = 1 << 9,
+    SMX_DC_MC_INTF_CTL                                    = 0x0000a02c,
+	MC_RD_REQ_CRED_mask                               = 0xff << 0,
+	MC_RD_REQ_CRED_shift                              = 0,
+	MC_WR_REQ_CRED_mask                               = 0xff << 16,
+	MC_WR_REQ_CRED_shift                              = 16,
+    TD_PS_SAMPLER0_BORDER_RED                             = 0x0000a400,
+	TD_PS_SAMPLER0_BORDER_RED_num                     = 18,
+	TD_PS_SAMPLER0_BORDER_RED_offset                  = 16,
+    TD_PS_SAMPLER0_BORDER_GREEN                           = 0x0000a404,
+	TD_PS_SAMPLER0_BORDER_GREEN_num                   = 18,
+	TD_PS_SAMPLER0_BORDER_GREEN_offset                = 16,
+    TD_PS_SAMPLER0_BORDER_BLUE                            = 0x0000a408,
+	TD_PS_SAMPLER0_BORDER_BLUE_num                    = 18,
+	TD_PS_SAMPLER0_BORDER_BLUE_offset                 = 16,
+    TD_PS_SAMPLER0_BORDER_ALPHA                           = 0x0000a40c,
+	TD_PS_SAMPLER0_BORDER_ALPHA_num                   = 18,
+	TD_PS_SAMPLER0_BORDER_ALPHA_offset                = 16,
+    TD_VS_SAMPLER0_BORDER_RED                             = 0x0000a600,
+	TD_VS_SAMPLER0_BORDER_RED_num                     = 18,
+	TD_VS_SAMPLER0_BORDER_RED_offset                  = 16,
+    TD_VS_SAMPLER0_BORDER_GREEN                           = 0x0000a604,
+	TD_VS_SAMPLER0_BORDER_GREEN_num                   = 18,
+	TD_VS_SAMPLER0_BORDER_GREEN_offset                = 16,
+    TD_VS_SAMPLER0_BORDER_BLUE                            = 0x0000a608,
+	TD_VS_SAMPLER0_BORDER_BLUE_num                    = 18,
+	TD_VS_SAMPLER0_BORDER_BLUE_offset                 = 16,
+    TD_VS_SAMPLER0_BORDER_ALPHA                           = 0x0000a60c,
+	TD_VS_SAMPLER0_BORDER_ALPHA_num                   = 18,
+	TD_VS_SAMPLER0_BORDER_ALPHA_offset                = 16,
+    TD_GS_SAMPLER0_BORDER_RED                             = 0x0000a800,
+	TD_GS_SAMPLER0_BORDER_RED_num                     = 18,
+	TD_GS_SAMPLER0_BORDER_RED_offset                  = 16,
+    TD_GS_SAMPLER0_BORDER_GREEN                           = 0x0000a804,
+	TD_GS_SAMPLER0_BORDER_GREEN_num                   = 18,
+	TD_GS_SAMPLER0_BORDER_GREEN_offset                = 16,
+    TD_GS_SAMPLER0_BORDER_BLUE                            = 0x0000a808,
+	TD_GS_SAMPLER0_BORDER_BLUE_num                    = 18,
+	TD_GS_SAMPLER0_BORDER_BLUE_offset                 = 16,
+    TD_GS_SAMPLER0_BORDER_ALPHA                           = 0x0000a80c,
+	TD_GS_SAMPLER0_BORDER_ALPHA_num                   = 18,
+	TD_GS_SAMPLER0_BORDER_ALPHA_offset                = 16,
+    TD_PS_SAMPLER0_CLEARTYPE_KERNEL                       = 0x0000aa00,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL_num               = 18,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__WIDTH_mask       = 0x07 << 0,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__WIDTH_shift      = 0,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__HEIGHT_mask      = 0x07 << 3,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__HEIGHT_shift     = 3,
+    DB_DEPTH_SIZE                                         = 0x00028000,
+	PITCH_TILE_MAX_mask                               = 0x3ff << 0,
+	PITCH_TILE_MAX_shift                              = 0,
+	SLICE_TILE_MAX_mask                               = 0xfffff << 10,
+	SLICE_TILE_MAX_shift                              = 10,
+    DB_DEPTH_VIEW                                         = 0x00028004,
+	SLICE_START_mask                                  = 0x7ff << 0,
+	SLICE_START_shift                                 = 0,
+	SLICE_MAX_mask                                    = 0x7ff << 13,
+	SLICE_MAX_shift                                   = 13,
+    DB_DEPTH_BASE                                         = 0x0002800c,
+    DB_DEPTH_INFO                                         = 0x00028010,
+	DB_DEPTH_INFO__FORMAT_mask                        = 0x07 << 0,
+	DB_DEPTH_INFO__FORMAT_shift                       = 0,
+	    DEPTH_INVALID                                 = 0x00,
+	    DEPTH_16                                      = 0x01,
+	    DEPTH_X8_24                                   = 0x02,
+	    DEPTH_8_24                                    = 0x03,
+	    DEPTH_X8_24_FLOAT                             = 0x04,
+	    DEPTH_8_24_FLOAT                              = 0x05,
+	    DEPTH_32_FLOAT                                = 0x06,
+	    DEPTH_X24_8_32_FLOAT                          = 0x07,
+	DB_DEPTH_INFO__READ_SIZE_bit                      = 1 << 3,
+	DB_DEPTH_INFO__ARRAY_MODE_mask                    = 0x0f << 15,
+	DB_DEPTH_INFO__ARRAY_MODE_shift                   = 15,
+	    ARRAY_2D_TILED_THIN1                          = 0x04,
+	TILE_SURFACE_ENABLE_bit                           = 1 << 25,
+	TILE_COMPACT_bit                                  = 1 << 26,
+	ZRANGE_PRECISION_bit                              = 1 << 31,
+    DB_HTILE_DATA_BASE                                    = 0x00028014,
+    DB_STENCIL_CLEAR                                      = 0x00028028,
+	DB_STENCIL_CLEAR__CLEAR_mask                      = 0xff << 0,
+	DB_STENCIL_CLEAR__CLEAR_shift                     = 0,
+	MIN_mask                                          = 0xff << 16,
+	MIN_shift                                         = 16,
+    DB_DEPTH_CLEAR                                        = 0x0002802c,
+    PA_SC_SCREEN_SCISSOR_TL                               = 0x00028030,
+	PA_SC_SCREEN_SCISSOR_TL__TL_X_mask                = 0x7fff << 0,
+	PA_SC_SCREEN_SCISSOR_TL__TL_X_shift               = 0,
+	PA_SC_SCREEN_SCISSOR_TL__TL_Y_mask                = 0x7fff << 16,
+	PA_SC_SCREEN_SCISSOR_TL__TL_Y_shift               = 16,
+    PA_SC_SCREEN_SCISSOR_BR                               = 0x00028034,
+	PA_SC_SCREEN_SCISSOR_BR__BR_X_mask                = 0x7fff << 0,
+	PA_SC_SCREEN_SCISSOR_BR__BR_X_shift               = 0,
+	PA_SC_SCREEN_SCISSOR_BR__BR_Y_mask                = 0x7fff << 16,
+	PA_SC_SCREEN_SCISSOR_BR__BR_Y_shift               = 16,
+    CB_COLOR0_BASE                                        = 0x00028040,
+	CB_COLOR0_BASE_num                                = 8,
+    CB_COLOR0_SIZE                                        = 0x00028060,
+	CB_COLOR0_SIZE_num                                = 8,
+/* 	PITCH_TILE_MAX_mask                               = 0x3ff << 0, */
+/* 	PITCH_TILE_MAX_shift                              = 0, */
+/* 	SLICE_TILE_MAX_mask                               = 0xfffff << 10, */
+/* 	SLICE_TILE_MAX_shift                              = 10, */
+    CB_COLOR0_VIEW                                        = 0x00028080,
+	CB_COLOR0_VIEW_num                                = 8,
+/* 	SLICE_START_mask                                  = 0x7ff << 0, */
+/* 	SLICE_START_shift                                 = 0, */
+/* 	SLICE_MAX_mask                                    = 0x7ff << 13, */
+/* 	SLICE_MAX_shift                                   = 13, */
+    CB_COLOR0_INFO                                        = 0x000280a0,
+	CB_COLOR0_INFO_num                                = 8,
+	ENDIAN_mask                                       = 0x03 << 0,
+	ENDIAN_shift                                      = 0,
+	    ENDIAN_NONE                                   = 0x00,
+	    ENDIAN_8IN16                                  = 0x01,
+	    ENDIAN_8IN32                                  = 0x02,
+	    ENDIAN_8IN64                                  = 0x03,
+	CB_COLOR0_INFO__FORMAT_mask                       = 0x3f << 2,
+	CB_COLOR0_INFO__FORMAT_shift                      = 2,
+	    COLOR_INVALID                                 = 0x00,
+	    COLOR_8                                       = 0x01,
+	    COLOR_4_4                                     = 0x02,
+	    COLOR_3_3_2                                   = 0x03,
+	    COLOR_16                                      = 0x05,
+	    COLOR_16_FLOAT                                = 0x06,
+	    COLOR_8_8                                     = 0x07,
+	    COLOR_5_6_5                                   = 0x08,
+	    COLOR_6_5_5                                   = 0x09,
+	    COLOR_1_5_5_5                                 = 0x0a,
+	    COLOR_4_4_4_4                                 = 0x0b,
+	    COLOR_5_5_5_1                                 = 0x0c,
+	    COLOR_32                                      = 0x0d,
+	    COLOR_32_FLOAT                                = 0x0e,
+	    COLOR_16_16                                   = 0x0f,
+	    COLOR_16_16_FLOAT                             = 0x10,
+	    COLOR_8_24                                    = 0x11,
+	    COLOR_8_24_FLOAT                              = 0x12,
+	    COLOR_24_8                                    = 0x13,
+	    COLOR_24_8_FLOAT                              = 0x14,
+	    COLOR_10_11_11                                = 0x15,
+	    COLOR_10_11_11_FLOAT                          = 0x16,
+	    COLOR_11_11_10                                = 0x17,
+	    COLOR_11_11_10_FLOAT                          = 0x18,
+	    COLOR_2_10_10_10                              = 0x19,
+	    COLOR_8_8_8_8                                 = 0x1a,
+	    COLOR_10_10_10_2                              = 0x1b,
+	    COLOR_X24_8_32_FLOAT                          = 0x1c,
+	    COLOR_32_32                                   = 0x1d,
+	    COLOR_32_32_FLOAT                             = 0x1e,
+	    COLOR_16_16_16_16                             = 0x1f,
+	    COLOR_16_16_16_16_FLOAT                       = 0x20,
+	    COLOR_32_32_32_32                             = 0x22,
+	    COLOR_32_32_32_32_FLOAT                       = 0x23,
+	CB_COLOR0_INFO__ARRAY_MODE_mask                   = 0x0f << 8,
+	CB_COLOR0_INFO__ARRAY_MODE_shift                  = 8,
+	    ARRAY_LINEAR_GENERAL                          = 0x00,
+	    ARRAY_LINEAR_ALIGNED                          = 0x01,
+/* 	    ARRAY_2D_TILED_THIN1                          = 0x04, */
+	NUMBER_TYPE_mask                                  = 0x07 << 12,
+	NUMBER_TYPE_shift                                 = 12,
+	    NUMBER_UNORM                                  = 0x00,
+	    NUMBER_SNORM                                  = 0x01,
+	    NUMBER_USCALED                                = 0x02,
+	    NUMBER_SSCALED                                = 0x03,
+	    NUMBER_UINT                                   = 0x04,
+	    NUMBER_SINT                                   = 0x05,
+	    NUMBER_SRGB                                   = 0x06,
+	    NUMBER_FLOAT                                  = 0x07,
+	CB_COLOR0_INFO__READ_SIZE_bit                     = 1 << 15,
+	COMP_SWAP_mask                                    = 0x03 << 16,
+	COMP_SWAP_shift                                   = 16,
+	    SWAP_STD                                      = 0x00,
+	    SWAP_ALT                                      = 0x01,
+	    SWAP_STD_REV                                  = 0x02,
+	    SWAP_ALT_REV                                  = 0x03,
+	CB_COLOR0_INFO__TILE_MODE_mask                    = 0x03 << 18,
+	CB_COLOR0_INFO__TILE_MODE_shift                   = 18,
+	    TILE_DISABLE                                  = 0x00,
+	    TILE_CLEAR_ENABLE                             = 0x01,
+	    TILE_FRAG_ENABLE                              = 0x02,
+	BLEND_CLAMP_bit                                   = 1 << 20,
+	CLEAR_COLOR_bit                                   = 1 << 21,
+	BLEND_BYPASS_bit                                  = 1 << 22,
+	BLEND_FLOAT32_bit                                 = 1 << 23,
+	SIMPLE_FLOAT_bit                                  = 1 << 24,
+	CB_COLOR0_INFO__ROUND_MODE_bit                    = 1 << 25,
+/* 	TILE_COMPACT_bit                                  = 1 << 26, */
+	SOURCE_FORMAT_bit                                 = 1 << 27,
+    CB_COLOR0_TILE                                        = 0x000280c0,
+	CB_COLOR0_TILE_num                                = 8,
+    CB_COLOR0_FRAG                                        = 0x000280e0,
+	CB_COLOR0_FRAG_num                                = 8,
+    CB_COLOR0_MASK                                        = 0x00028100,
+	CB_COLOR0_MASK_num                                = 8,
+	CMASK_BLOCK_MAX_mask                              = 0xfff << 0,
+	CMASK_BLOCK_MAX_shift                             = 0,
+	FMASK_TILE_MAX_mask                               = 0xfffff << 12,
+	FMASK_TILE_MAX_shift                              = 12,
+    CB_CLEAR_RED                                          = 0x00028120,
+    CB_CLEAR_GREEN                                        = 0x00028124,
+    CB_CLEAR_BLUE                                         = 0x00028128,
+    CB_CLEAR_ALPHA                                        = 0x0002812c,
+    SQ_ALU_CONST_BUFFER_SIZE_PS_0                         = 0x00028140,
+	SQ_ALU_CONST_BUFFER_SIZE_PS_0_num                 = 16,
+	SQ_ALU_CONST_BUFFER_SIZE_PS_0__DATA_mask          = 0x1ff << 0,
+	SQ_ALU_CONST_BUFFER_SIZE_PS_0__DATA_shift         = 0,
+    SQ_ALU_CONST_BUFFER_SIZE_VS_0                         = 0x00028180,
+	SQ_ALU_CONST_BUFFER_SIZE_VS_0_num                 = 16,
+	SQ_ALU_CONST_BUFFER_SIZE_VS_0__DATA_mask          = 0x1ff << 0,
+	SQ_ALU_CONST_BUFFER_SIZE_VS_0__DATA_shift         = 0,
+    SQ_ALU_CONST_BUFFER_SIZE_GS_0                         = 0x000281c0,
+	SQ_ALU_CONST_BUFFER_SIZE_GS_0_num                 = 16,
+	SQ_ALU_CONST_BUFFER_SIZE_GS_0__DATA_mask          = 0x1ff << 0,
+	SQ_ALU_CONST_BUFFER_SIZE_GS_0__DATA_shift         = 0,
+    PA_SC_WINDOW_OFFSET                                   = 0x00028200,
+	WINDOW_X_OFFSET_mask                              = 0x7fff << 0,
+	WINDOW_X_OFFSET_shift                             = 0,
+	WINDOW_Y_OFFSET_mask                              = 0x7fff << 16,
+	WINDOW_Y_OFFSET_shift                             = 16,
+    PA_SC_WINDOW_SCISSOR_TL                               = 0x00028204,
+	PA_SC_WINDOW_SCISSOR_TL__TL_X_mask                = 0x3fff << 0,
+	PA_SC_WINDOW_SCISSOR_TL__TL_X_shift               = 0,
+	PA_SC_WINDOW_SCISSOR_TL__TL_Y_mask                = 0x3fff << 16,
+	PA_SC_WINDOW_SCISSOR_TL__TL_Y_shift               = 16,
+	WINDOW_OFFSET_DISABLE_bit                         = 1 << 31,
+    PA_SC_WINDOW_SCISSOR_BR                               = 0x00028208,
+	PA_SC_WINDOW_SCISSOR_BR__BR_X_mask                = 0x3fff << 0,
+	PA_SC_WINDOW_SCISSOR_BR__BR_X_shift               = 0,
+	PA_SC_WINDOW_SCISSOR_BR__BR_Y_mask                = 0x3fff << 16,
+	PA_SC_WINDOW_SCISSOR_BR__BR_Y_shift               = 16,
+    PA_SC_CLIPRECT_RULE                                   = 0x0002820c,
+	CLIP_RULE_mask                                    = 0xffff << 0,
+	CLIP_RULE_shift                                   = 0,
+    PA_SC_CLIPRECT_0_TL                                   = 0x00028210,
+	PA_SC_CLIPRECT_0_TL_num                           = 4,
+	PA_SC_CLIPRECT_0_TL_offset                        = 8,
+	PA_SC_CLIPRECT_0_TL__TL_X_mask                    = 0x3fff << 0,
+	PA_SC_CLIPRECT_0_TL__TL_X_shift                   = 0,
+	PA_SC_CLIPRECT_0_TL__TL_Y_mask                    = 0x3fff << 16,
+	PA_SC_CLIPRECT_0_TL__TL_Y_shift                   = 16,
+    PA_SC_CLIPRECT_0_BR                                   = 0x00028214,
+	PA_SC_CLIPRECT_0_BR_num                           = 4,
+	PA_SC_CLIPRECT_0_BR_offset                        = 8,
+	PA_SC_CLIPRECT_0_BR__BR_X_mask                    = 0x3fff << 0,
+	PA_SC_CLIPRECT_0_BR__BR_X_shift                   = 0,
+	PA_SC_CLIPRECT_0_BR__BR_Y_mask                    = 0x3fff << 16,
+	PA_SC_CLIPRECT_0_BR__BR_Y_shift                   = 16,
+    CB_TARGET_MASK                                        = 0x00028238,
+	TARGET0_ENABLE_mask                               = 0x0f << 0,
+	TARGET0_ENABLE_shift                              = 0,
+	TARGET1_ENABLE_mask                               = 0x0f << 4,
+	TARGET1_ENABLE_shift                              = 4,
+	TARGET2_ENABLE_mask                               = 0x0f << 8,
+	TARGET2_ENABLE_shift                              = 8,
+	TARGET3_ENABLE_mask                               = 0x0f << 12,
+	TARGET3_ENABLE_shift                              = 12,
+	TARGET4_ENABLE_mask                               = 0x0f << 16,
+	TARGET4_ENABLE_shift                              = 16,
+	TARGET5_ENABLE_mask                               = 0x0f << 20,
+	TARGET5_ENABLE_shift                              = 20,
+	TARGET6_ENABLE_mask                               = 0x0f << 24,
+	TARGET6_ENABLE_shift                              = 24,
+	TARGET7_ENABLE_mask                               = 0x0f << 28,
+	TARGET7_ENABLE_shift                              = 28,
+    CB_SHADER_MASK                                        = 0x0002823c,
+	OUTPUT0_ENABLE_mask                               = 0x0f << 0,
+	OUTPUT0_ENABLE_shift                              = 0,
+	OUTPUT1_ENABLE_mask                               = 0x0f << 4,
+	OUTPUT1_ENABLE_shift                              = 4,
+	OUTPUT2_ENABLE_mask                               = 0x0f << 8,
+	OUTPUT2_ENABLE_shift                              = 8,
+	OUTPUT3_ENABLE_mask                               = 0x0f << 12,
+	OUTPUT3_ENABLE_shift                              = 12,
+	OUTPUT4_ENABLE_mask                               = 0x0f << 16,
+	OUTPUT4_ENABLE_shift                              = 16,
+	OUTPUT5_ENABLE_mask                               = 0x0f << 20,
+	OUTPUT5_ENABLE_shift                              = 20,
+	OUTPUT6_ENABLE_mask                               = 0x0f << 24,
+	OUTPUT6_ENABLE_shift                              = 24,
+	OUTPUT7_ENABLE_mask                               = 0x0f << 28,
+	OUTPUT7_ENABLE_shift                              = 28,
+    PA_SC_GENERIC_SCISSOR_TL                              = 0x00028240,
+	PA_SC_GENERIC_SCISSOR_TL__TL_X_mask               = 0x3fff << 0,
+	PA_SC_GENERIC_SCISSOR_TL__TL_X_shift              = 0,
+	PA_SC_GENERIC_SCISSOR_TL__TL_Y_mask               = 0x3fff << 16,
+	PA_SC_GENERIC_SCISSOR_TL__TL_Y_shift              = 16,
+/* 	WINDOW_OFFSET_DISABLE_bit                         = 1 << 31, */
+    PA_SC_GENERIC_SCISSOR_BR                              = 0x00028244,
+	PA_SC_GENERIC_SCISSOR_BR__BR_X_mask               = 0x3fff << 0,
+	PA_SC_GENERIC_SCISSOR_BR__BR_X_shift              = 0,
+	PA_SC_GENERIC_SCISSOR_BR__BR_Y_mask               = 0x3fff << 16,
+	PA_SC_GENERIC_SCISSOR_BR__BR_Y_shift              = 16,
+    PA_SC_VPORT_SCISSOR_0_TL                              = 0x00028250,
+	PA_SC_VPORT_SCISSOR_0_TL_num                      = 16,
+	PA_SC_VPORT_SCISSOR_0_TL_offset                   = 8,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_X_mask               = 0x3fff << 0,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_X_shift              = 0,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_Y_mask               = 0x3fff << 16,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_Y_shift              = 16,
+/* 	WINDOW_OFFSET_DISABLE_bit                         = 1 << 31, */
+    PA_SC_VPORT_SCISSOR_0_BR                              = 0x00028254,
+	PA_SC_VPORT_SCISSOR_0_BR_num                      = 16,
+	PA_SC_VPORT_SCISSOR_0_BR_offset                   = 8,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_X_mask               = 0x3fff << 0,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_X_shift              = 0,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_Y_mask               = 0x3fff << 16,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_Y_shift              = 16,
+    PA_SC_VPORT_ZMIN_0                                    = 0x000282d0,
+	PA_SC_VPORT_ZMIN_0_num                            = 16,
+	PA_SC_VPORT_ZMIN_0_offset                         = 8,
+    PA_SC_VPORT_ZMAX_0                                    = 0x000282d4,
+	PA_SC_VPORT_ZMAX_0_num                            = 16,
+	PA_SC_VPORT_ZMAX_0_offset                         = 8,
+    SX_MISC                                               = 0x00028350,
+	MULTIPASS_bit                                     = 1 << 0,
+    SQ_VTX_SEMANTIC_0                                     = 0x00028380,
+	SQ_VTX_SEMANTIC_0_num                             = 32,
+/* 	SEMANTIC_ID_mask                                  = 0xff << 0, */
+/* 	SEMANTIC_ID_shift                                 = 0, */
+    VGT_MAX_VTX_INDX                                      = 0x00028400,
+    VGT_MIN_VTX_INDX                                      = 0x00028404,
+    VGT_INDX_OFFSET                                       = 0x00028408,
+    VGT_MULTI_PRIM_IB_RESET_INDX                          = 0x0002840c,
+    SX_ALPHA_TEST_CONTROL                                 = 0x00028410,
+	ALPHA_FUNC_mask                                   = 0x07 << 0,
+	ALPHA_FUNC_shift                                  = 0,
+	    REF_NEVER                                     = 0x00,
+	    REF_LESS                                      = 0x01,
+	    REF_EQUAL                                     = 0x02,
+	    REF_LEQUAL                                    = 0x03,
+	    REF_GREATER                                   = 0x04,
+	    REF_NOTEQUAL                                  = 0x05,
+	    REF_GEQUAL                                    = 0x06,
+	    REF_ALWAYS                                    = 0x07,
+	ALPHA_TEST_ENABLE_bit                             = 1 << 3,
+	ALPHA_TEST_BYPASS_bit                             = 1 << 8,
+    CB_BLEND_RED                                          = 0x00028414,
+    CB_BLEND_GREEN                                        = 0x00028418,
+    CB_BLEND_BLUE                                         = 0x0002841c,
+    CB_BLEND_ALPHA                                        = 0x00028420,
+    CB_FOG_RED                                            = 0x00028424,
+    CB_FOG_GREEN                                          = 0x00028428,
+    CB_FOG_BLUE                                           = 0x0002842c,
+    DB_STENCILREFMASK                                     = 0x00028430,
+	STENCILREF_mask                                   = 0xff << 0,
+	STENCILREF_shift                                  = 0,
+	STENCILMASK_mask                                  = 0xff << 8,
+	STENCILMASK_shift                                 = 8,
+	STENCILWRITEMASK_mask                             = 0xff << 16,
+	STENCILWRITEMASK_shift                            = 16,
+    DB_STENCILREFMASK_BF                                  = 0x00028434,
+	STENCILREF_BF_mask                                = 0xff << 0,
+	STENCILREF_BF_shift                               = 0,
+	STENCILMASK_BF_mask                               = 0xff << 8,
+	STENCILMASK_BF_shift                              = 8,
+	STENCILWRITEMASK_BF_mask                          = 0xff << 16,
+	STENCILWRITEMASK_BF_shift                         = 16,
+    SX_ALPHA_REF                                          = 0x00028438,
+    PA_CL_VPORT_XSCALE_0                                  = 0x0002843c,
+	PA_CL_VPORT_XSCALE_0_num                          = 16,
+	PA_CL_VPORT_XSCALE_0_offset                       = 24,
+    PA_CL_VPORT_XOFFSET_0                                 = 0x00028440,
+	PA_CL_VPORT_XOFFSET_0_num                         = 16,
+	PA_CL_VPORT_XOFFSET_0_offset                      = 24,
+    PA_CL_VPORT_YSCALE_0                                  = 0x00028444,
+	PA_CL_VPORT_YSCALE_0_num                          = 16,
+	PA_CL_VPORT_YSCALE_0_offset                       = 24,
+    PA_CL_VPORT_YOFFSET_0                                 = 0x00028448,
+	PA_CL_VPORT_YOFFSET_0_num                         = 16,
+	PA_CL_VPORT_YOFFSET_0_offset                      = 24,
+    PA_CL_VPORT_ZSCALE_0                                  = 0x0002844c,
+	PA_CL_VPORT_ZSCALE_0_num                          = 16,
+	PA_CL_VPORT_ZSCALE_0_offset                       = 24,
+    PA_CL_VPORT_ZOFFSET_0                                 = 0x00028450,
+	PA_CL_VPORT_ZOFFSET_0_num                         = 16,
+	PA_CL_VPORT_ZOFFSET_0_offset                      = 24,
+    SPI_VS_OUT_ID_0                                       = 0x00028614,
+	SPI_VS_OUT_ID_0_num                               = 10,
+	SEMANTIC_0_mask                                   = 0xff << 0,
+	SEMANTIC_0_shift                                  = 0,
+	SEMANTIC_1_mask                                   = 0xff << 8,
+	SEMANTIC_1_shift                                  = 8,
+	SEMANTIC_2_mask                                   = 0xff << 16,
+	SEMANTIC_2_shift                                  = 16,
+	SEMANTIC_3_mask                                   = 0xff << 24,
+	SEMANTIC_3_shift                                  = 24,
+    SPI_PS_INPUT_CNTL_0                                   = 0x00028644,
+	SPI_PS_INPUT_CNTL_0_num                           = 32,
+	SEMANTIC_mask                                     = 0xff << 0,
+	SEMANTIC_shift                                    = 0,
+	DEFAULT_VAL_mask                                  = 0x03 << 8,
+	DEFAULT_VAL_shift                                 = 8,
+	    X_0_0F                                        = 0x00,
+	FLAT_SHADE_bit                                    = 1 << 10,
+	SEL_CENTROID_bit                                  = 1 << 11,
+	SEL_LINEAR_bit                                    = 1 << 12,
+	CYL_WRAP_mask                                     = 0x0f << 13,
+	CYL_WRAP_shift                                    = 13,
+	PT_SPRITE_TEX_bit                                 = 1 << 17,
+	SEL_SAMPLE_bit                                    = 1 << 18,
+    SPI_VS_OUT_CONFIG                                     = 0x000286c4,
+	VS_PER_COMPONENT_bit                              = 1 << 0,
+	VS_EXPORT_COUNT_mask                              = 0x1f << 1,
+	VS_EXPORT_COUNT_shift                             = 1,
+	VS_EXPORTS_FOG_bit                                = 1 << 8,
+	VS_OUT_FOG_VEC_ADDR_mask                          = 0x1f << 9,
+	VS_OUT_FOG_VEC_ADDR_shift                         = 9,
+    SPI_PS_IN_CONTROL_0                                   = 0x000286cc,
+	NUM_INTERP_mask                                   = 0x3f << 0,
+	NUM_INTERP_shift                                  = 0,
+	POSITION_ENA_bit                                  = 1 << 8,
+	POSITION_CENTROID_bit                             = 1 << 9,
+	POSITION_ADDR_mask                                = 0x1f << 10,
+	POSITION_ADDR_shift                               = 10,
+	PARAM_GEN_mask                                    = 0x0f << 15,
+	PARAM_GEN_shift                                   = 15,
+	PARAM_GEN_ADDR_mask                               = 0x7f << 19,
+	PARAM_GEN_ADDR_shift                              = 19,
+	BARYC_SAMPLE_CNTL_mask                            = 0x03 << 26,
+	BARYC_SAMPLE_CNTL_shift                           = 26,
+	    CENTROIDS_ONLY                                = 0x00,
+	    CENTERS_ONLY                                  = 0x01,
+	    CENTROIDS_AND_CENTERS                         = 0x02,
+	    UNDEF                                         = 0x03,
+	PERSP_GRADIENT_ENA_bit                            = 1 << 28,
+	LINEAR_GRADIENT_ENA_bit                           = 1 << 29,
+	POSITION_SAMPLE_bit                               = 1 << 30,
+	BARYC_AT_SAMPLE_ENA_bit                           = 1 << 31,
+    SPI_PS_IN_CONTROL_1                                   = 0x000286d0,
+	GEN_INDEX_PIX_bit                                 = 1 << 0,
+	GEN_INDEX_PIX_ADDR_mask                           = 0x7f << 1,
+	GEN_INDEX_PIX_ADDR_shift                          = 1,
+	FRONT_FACE_ENA_bit                                = 1 << 8,
+	FRONT_FACE_CHAN_mask                              = 0x03 << 9,
+	FRONT_FACE_CHAN_shift                             = 9,
+	FRONT_FACE_ALL_BITS_bit                           = 1 << 11,
+	FRONT_FACE_ADDR_mask                              = 0x1f << 12,
+	FRONT_FACE_ADDR_shift                             = 12,
+	FOG_ADDR_mask                                     = 0x7f << 17,
+	FOG_ADDR_shift                                    = 17,
+	FIXED_PT_POSITION_ENA_bit                         = 1 << 24,
+	FIXED_PT_POSITION_ADDR_mask                       = 0x1f << 25,
+	FIXED_PT_POSITION_ADDR_shift                      = 25,
+    SPI_INTERP_CONTROL_0                                  = 0x000286d4,
+	FLAT_SHADE_ENA_bit                                = 1 << 0,
+	PNT_SPRITE_ENA_bit                                = 1 << 1,
+	PNT_SPRITE_OVRD_X_mask                            = 0x07 << 2,
+	PNT_SPRITE_OVRD_X_shift                           = 2,
+	    SPI_PNT_SPRITE_SEL_0                          = 0x00,
+	    SPI_PNT_SPRITE_SEL_1                          = 0x01,
+	    SPI_PNT_SPRITE_SEL_S                          = 0x02,
+	    SPI_PNT_SPRITE_SEL_T                          = 0x03,
+	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04,
+	PNT_SPRITE_OVRD_Y_mask                            = 0x07 << 5,
+	PNT_SPRITE_OVRD_Y_shift                           = 5,
+/* 	    SPI_PNT_SPRITE_SEL_0                          = 0x00, */
+/* 	    SPI_PNT_SPRITE_SEL_1                          = 0x01, */
+/* 	    SPI_PNT_SPRITE_SEL_S                          = 0x02, */
+/* 	    SPI_PNT_SPRITE_SEL_T                          = 0x03, */
+/* 	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04, */
+	PNT_SPRITE_OVRD_Z_mask                            = 0x07 << 8,
+	PNT_SPRITE_OVRD_Z_shift                           = 8,
+/* 	    SPI_PNT_SPRITE_SEL_0                          = 0x00, */
+/* 	    SPI_PNT_SPRITE_SEL_1                          = 0x01, */
+/* 	    SPI_PNT_SPRITE_SEL_S                          = 0x02, */
+/* 	    SPI_PNT_SPRITE_SEL_T                          = 0x03, */
+/* 	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04, */
+	PNT_SPRITE_OVRD_W_mask                            = 0x07 << 11,
+	PNT_SPRITE_OVRD_W_shift                           = 11,
+/* 	    SPI_PNT_SPRITE_SEL_0                          = 0x00, */
+/* 	    SPI_PNT_SPRITE_SEL_1                          = 0x01, */
+/* 	    SPI_PNT_SPRITE_SEL_S                          = 0x02, */
+/* 	    SPI_PNT_SPRITE_SEL_T                          = 0x03, */
+/* 	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04, */
+	PNT_SPRITE_TOP_1_bit                              = 1 << 14,
+    SPI_INPUT_Z                                           = 0x000286d8,
+	PROVIDE_Z_TO_SPI_bit                              = 1 << 0,
+    SPI_FOG_CNTL                                          = 0x000286dc,
+	PASS_FOG_THROUGH_PS_bit                           = 1 << 0,
+	PIXEL_FOG_FUNC_mask                               = 0x03 << 1,
+	PIXEL_FOG_FUNC_shift                              = 1,
+	    SPI_FOG_NONE                                  = 0x00,
+	    SPI_FOG_EXP                                   = 0x01,
+	    SPI_FOG_EXP2                                  = 0x02,
+	    SPI_FOG_LINEAR                                = 0x03,
+	PIXEL_FOG_SRC_SEL_bit                             = 1 << 3,
+	VS_FOG_CLAMP_DISABLE_bit                          = 1 << 4,
+    SPI_FOG_FUNC_SCALE                                    = 0x000286e0,
+    SPI_FOG_FUNC_BIAS                                     = 0x000286e4,
+    CB_BLEND0_CONTROL                                     = 0x00028780,
+	CB_BLEND0_CONTROL_num                             = 8,
+	COLOR_SRCBLEND_mask                               = 0x1f << 0,
+	COLOR_SRCBLEND_shift                              = 0,
+	COLOR_COMB_FCN_mask                               = 0x07 << 5,
+	COLOR_COMB_FCN_shift                              = 5,
+	COLOR_DESTBLEND_mask                              = 0x1f << 8,
+	COLOR_DESTBLEND_shift                             = 8,
+	OPACITY_WEIGHT_bit                                = 1 << 13,
+	ALPHA_SRCBLEND_mask                               = 0x1f << 16,
+	ALPHA_SRCBLEND_shift                              = 16,
+	ALPHA_COMB_FCN_mask                               = 0x07 << 21,
+	ALPHA_COMB_FCN_shift                              = 21,
+	ALPHA_DESTBLEND_mask                              = 0x1f << 24,
+	ALPHA_DESTBLEND_shift                             = 24,
+	SEPARATE_ALPHA_BLEND_bit                          = 1 << 29,
+    VGT_DMA_BASE_HI                                       = 0x000287e4,
+	VGT_DMA_BASE_HI__BASE_ADDR_mask                   = 0xff << 0,
+	VGT_DMA_BASE_HI__BASE_ADDR_shift                  = 0,
+    VGT_DMA_BASE                                          = 0x000287e8,
+    VGT_DRAW_INITIATOR                                    = 0x000287f0,
+	SOURCE_SELECT_mask                                = 0x03 << 0,
+	SOURCE_SELECT_shift                               = 0,
+	    DI_SRC_SEL_DMA                                = 0x00,
+	    DI_SRC_SEL_IMMEDIATE                          = 0x01,
+	    DI_SRC_SEL_AUTO_INDEX                         = 0x02,
+	    DI_SRC_SEL_RESERVED                           = 0x03,
+	MAJOR_MODE_mask                                   = 0x03 << 2,
+	MAJOR_MODE_shift                                  = 2,
+	    DI_MAJOR_MODE_0                               = 0x00,
+	    DI_MAJOR_MODE_1                               = 0x01,
+	SPRITE_EN_bit                                     = 1 << 4,
+	NOT_EOP_bit                                       = 1 << 5,
+	USE_OPAQUE_bit                                    = 1 << 6,
+    VGT_IMMED_DATA                                        = 0x000287f4,
+    VGT_EVENT_ADDRESS_REG                                 = 0x000287f8,
+	ADDRESS_LOW_mask                                  = 0xfffffff << 0,
+	ADDRESS_LOW_shift                                 = 0,
+    DB_DEPTH_CONTROL                                      = 0x00028800,
+	STENCIL_ENABLE_bit                                = 1 << 0,
+	Z_ENABLE_bit                                      = 1 << 1,
+	Z_WRITE_ENABLE_bit                                = 1 << 2,
+	ZFUNC_mask                                        = 0x07 << 4,
+	ZFUNC_shift                                       = 4,
+	    FRAG_NEVER                                    = 0x00,
+	    FRAG_LESS                                     = 0x01,
+	    FRAG_EQUAL                                    = 0x02,
+	    FRAG_LEQUAL                                   = 0x03,
+	    FRAG_GREATER                                  = 0x04,
+	    FRAG_NOTEQUAL                                 = 0x05,
+	    FRAG_GEQUAL                                   = 0x06,
+	    FRAG_ALWAYS                                   = 0x07,
+	BACKFACE_ENABLE_bit                               = 1 << 7,
+	STENCILFUNC_mask                                  = 0x07 << 8,
+	STENCILFUNC_shift                                 = 8,
+/* 	    REF_NEVER                                     = 0x00, */
+/* 	    REF_LESS                                      = 0x01, */
+/* 	    REF_EQUAL                                     = 0x02, */
+/* 	    REF_LEQUAL                                    = 0x03, */
+/* 	    REF_GREATER                                   = 0x04, */
+/* 	    REF_NOTEQUAL                                  = 0x05, */
+/* 	    REF_GEQUAL                                    = 0x06, */
+/* 	    REF_ALWAYS                                    = 0x07, */
+	STENCILFAIL_mask                                  = 0x07 << 11,
+	STENCILFAIL_shift                                 = 11,
+	    STENCIL_KEEP                                  = 0x00,
+	    STENCIL_ZERO                                  = 0x01,
+	    STENCIL_REPLACE                               = 0x02,
+	    STENCIL_INCR_CLAMP                            = 0x03,
+	    STENCIL_DECR_CLAMP                            = 0x04,
+	    STENCIL_INVERT                                = 0x05,
+	    STENCIL_INCR_WRAP                             = 0x06,
+	    STENCIL_DECR_WRAP                             = 0x07,
+	STENCILZPASS_mask                                 = 0x07 << 14,
+	STENCILZPASS_shift                                = 14,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILZFAIL_mask                                 = 0x07 << 17,
+	STENCILZFAIL_shift                                = 17,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILFUNC_BF_mask                               = 0x07 << 20,
+	STENCILFUNC_BF_shift                              = 20,
+/* 	    REF_NEVER                                     = 0x00, */
+/* 	    REF_LESS                                      = 0x01, */
+/* 	    REF_EQUAL                                     = 0x02, */
+/* 	    REF_LEQUAL                                    = 0x03, */
+/* 	    REF_GREATER                                   = 0x04, */
+/* 	    REF_NOTEQUAL                                  = 0x05, */
+/* 	    REF_GEQUAL                                    = 0x06, */
+/* 	    REF_ALWAYS                                    = 0x07, */
+	STENCILFAIL_BF_mask                               = 0x07 << 23,
+	STENCILFAIL_BF_shift                              = 23,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILZPASS_BF_mask                              = 0x07 << 26,
+	STENCILZPASS_BF_shift                             = 26,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILZFAIL_BF_mask                              = 0x07 << 29, /* 3-bit field at bits 31:29. NOTE(review): with a 32-bit int, 0x07 << 29 (0xe0000000) is not representable; signed left-shift overflow is undefined in ISO C -- confirm the compiler yields the intended bit pattern and that users treat it as unsigned */
+	STENCILZFAIL_BF_shift                             = 29, /* the STENCIL_* values commented out below repeat enumerators already defined under STENCILFAIL_mask */
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+    CB_BLEND_CONTROL                                      = 0x00028804,
+/* 	COLOR_SRCBLEND_mask                               = 0x1f << 0, */
+/* 	COLOR_SRCBLEND_shift                              = 0, */
+	    BLEND_ZERO                                    = 0x00,
+	    BLEND_ONE                                     = 0x01,
+	    BLEND_SRC_COLOR                               = 0x02,
+	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03,
+	    BLEND_SRC_ALPHA                               = 0x04,
+	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05,
+	    BLEND_DST_ALPHA                               = 0x06,
+	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07,
+	    BLEND_DST_COLOR                               = 0x08,
+	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09,
+	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a,
+	    BLEND_BOTH_SRC_ALPHA                          = 0x0b,
+	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c,
+	    BLEND_CONSTANT_COLOR                          = 0x0d,
+	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e,
+	    BLEND_SRC1_COLOR                              = 0x0f,
+	    BLEND_INV_SRC1_COLOR                          = 0x10,
+	    BLEND_SRC1_ALPHA                              = 0x11,
+	    BLEND_INV_SRC1_ALPHA                          = 0x12,
+	    BLEND_CONSTANT_ALPHA                          = 0x13,
+	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14,
+/* 	COLOR_COMB_FCN_mask                               = 0x07 << 5, */
+/* 	COLOR_COMB_FCN_shift                              = 5, */
+	    COMB_DST_PLUS_SRC                             = 0x00,
+	    COMB_SRC_MINUS_DST                            = 0x01,
+	    COMB_MIN_DST_SRC                              = 0x02,
+	    COMB_MAX_DST_SRC                              = 0x03,
+	    COMB_DST_MINUS_SRC                            = 0x04,
+/* 	COLOR_DESTBLEND_mask                              = 0x1f << 8, */
+/* 	COLOR_DESTBLEND_shift                             = 8, */
+/* 	    BLEND_ZERO                                    = 0x00, */
+/* 	    BLEND_ONE                                     = 0x01, */
+/* 	    BLEND_SRC_COLOR                               = 0x02, */
+/* 	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03, */
+/* 	    BLEND_SRC_ALPHA                               = 0x04, */
+/* 	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05, */
+/* 	    BLEND_DST_ALPHA                               = 0x06, */
+/* 	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07, */
+/* 	    BLEND_DST_COLOR                               = 0x08, */
+/* 	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09, */
+/* 	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a, */
+/* 	    BLEND_BOTH_SRC_ALPHA                          = 0x0b, */
+/* 	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c, */
+/* 	    BLEND_CONSTANT_COLOR                          = 0x0d, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e, */
+/* 	    BLEND_SRC1_COLOR                              = 0x0f, */
+/* 	    BLEND_INV_SRC1_COLOR                          = 0x10, */
+/* 	    BLEND_SRC1_ALPHA                              = 0x11, */
+/* 	    BLEND_INV_SRC1_ALPHA                          = 0x12, */
+/* 	    BLEND_CONSTANT_ALPHA                          = 0x13, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14, */
+/* 	OPACITY_WEIGHT_bit                                = 1 << 13, */
+/* 	ALPHA_SRCBLEND_mask                               = 0x1f << 16, */
+/* 	ALPHA_SRCBLEND_shift                              = 16, */
+/* 	    BLEND_ZERO                                    = 0x00, */
+/* 	    BLEND_ONE                                     = 0x01, */
+/* 	    BLEND_SRC_COLOR                               = 0x02, */
+/* 	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03, */
+/* 	    BLEND_SRC_ALPHA                               = 0x04, */
+/* 	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05, */
+/* 	    BLEND_DST_ALPHA                               = 0x06, */
+/* 	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07, */
+/* 	    BLEND_DST_COLOR                               = 0x08, */
+/* 	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09, */
+/* 	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a, */
+/* 	    BLEND_BOTH_SRC_ALPHA                          = 0x0b, */
+/* 	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c, */
+/* 	    BLEND_CONSTANT_COLOR                          = 0x0d, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e, */
+/* 	    BLEND_SRC1_COLOR                              = 0x0f, */
+/* 	    BLEND_INV_SRC1_COLOR                          = 0x10, */
+/* 	    BLEND_SRC1_ALPHA                              = 0x11, */
+/* 	    BLEND_INV_SRC1_ALPHA                          = 0x12, */
+/* 	    BLEND_CONSTANT_ALPHA                          = 0x13, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14, */
+/* 	ALPHA_COMB_FCN_mask                               = 0x07 << 21, */
+/* 	ALPHA_COMB_FCN_shift                              = 21, */
+/* 	    COMB_DST_PLUS_SRC                             = 0x00, */
+/* 	    COMB_SRC_MINUS_DST                            = 0x01, */
+/* 	    COMB_MIN_DST_SRC                              = 0x02, */
+/* 	    COMB_MAX_DST_SRC                              = 0x03, */
+/* 	    COMB_DST_MINUS_SRC                            = 0x04, */
+/* 	ALPHA_DESTBLEND_mask                              = 0x1f << 24, */
+/* 	ALPHA_DESTBLEND_shift                             = 24, */
+/* 	    BLEND_ZERO                                    = 0x00, */
+/* 	    BLEND_ONE                                     = 0x01, */
+/* 	    BLEND_SRC_COLOR                               = 0x02, */
+/* 	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03, */
+/* 	    BLEND_SRC_ALPHA                               = 0x04, */
+/* 	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05, */
+/* 	    BLEND_DST_ALPHA                               = 0x06, */
+/* 	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07, */
+/* 	    BLEND_DST_COLOR                               = 0x08, */
+/* 	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09, */
+/* 	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a, */
+/* 	    BLEND_BOTH_SRC_ALPHA                          = 0x0b, */
+/* 	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c, */
+/* 	    BLEND_CONSTANT_COLOR                          = 0x0d, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e, */
+/* 	    BLEND_SRC1_COLOR                              = 0x0f, */
+/* 	    BLEND_INV_SRC1_COLOR                          = 0x10, */
+/* 	    BLEND_SRC1_ALPHA                              = 0x11, */
+/* 	    BLEND_INV_SRC1_ALPHA                          = 0x12, */
+/* 	    BLEND_CONSTANT_ALPHA                          = 0x13, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14, */
+/* 	SEPARATE_ALPHA_BLEND_bit                          = 1 << 29, */
+    CB_COLOR_CONTROL                                      = 0x00028808,
+	FOG_ENABLE_bit                                    = 1 << 0,
+	MULTIWRITE_ENABLE_bit                             = 1 << 1,
+	DITHER_ENABLE_bit                                 = 1 << 2,
+	DEGAMMA_ENABLE_bit                                = 1 << 3,
+	SPECIAL_OP_mask                                   = 0x07 << 4,
+	SPECIAL_OP_shift                                  = 4,
+	    SPECIAL_NORMAL                                = 0x00,
+	    SPECIAL_DISABLE                               = 0x01,
+	    SPECIAL_FAST_CLEAR                            = 0x02,
+	    SPECIAL_FORCE_CLEAR                           = 0x03,
+	    SPECIAL_EXPAND_COLOR                          = 0x04,
+	    SPECIAL_EXPAND_TEXTURE                        = 0x05,
+	    SPECIAL_EXPAND_SAMPLES                        = 0x06,
+	    SPECIAL_RESOLVE_BOX                           = 0x07,
+	PER_MRT_BLEND_bit                                 = 1 << 7,
+	TARGET_BLEND_ENABLE_mask                          = 0xff << 8,
+	TARGET_BLEND_ENABLE_shift                         = 8,
+	ROP3_mask                                         = 0xff << 16,
+	ROP3_shift                                        = 16,
+    DB_SHADER_CONTROL                                     = 0x0002880c,
+	Z_EXPORT_ENABLE_bit                               = 1 << 0,
+	STENCIL_REF_EXPORT_ENABLE_bit                     = 1 << 1,
+	Z_ORDER_mask                                      = 0x03 << 4,
+	Z_ORDER_shift                                     = 4,
+	    LATE_Z                                        = 0x00,
+	    EARLY_Z_THEN_LATE_Z                           = 0x01,
+	    RE_Z                                          = 0x02,
+	    EARLY_Z_THEN_RE_Z                             = 0x03,
+	KILL_ENABLE_bit                                   = 1 << 6,
+	COVERAGE_TO_MASK_ENABLE_bit                       = 1 << 7,
+	MASK_EXPORT_ENABLE_bit                            = 1 << 8,
+	DUAL_EXPORT_ENABLE_bit                            = 1 << 9,
+	EXEC_ON_HIER_FAIL_bit                             = 1 << 10,
+	EXEC_ON_NOOP_bit                                  = 1 << 11,
+    PA_CL_CLIP_CNTL                                       = 0x00028810,
+	UCP_ENA_0_bit                                     = 1 << 0,
+	UCP_ENA_1_bit                                     = 1 << 1,
+	UCP_ENA_2_bit                                     = 1 << 2,
+	UCP_ENA_3_bit                                     = 1 << 3,
+	UCP_ENA_4_bit                                     = 1 << 4,
+	UCP_ENA_5_bit                                     = 1 << 5,
+	PS_UCP_Y_SCALE_NEG_bit                            = 1 << 13,
+	PS_UCP_MODE_mask                                  = 0x03 << 14,
+	PS_UCP_MODE_shift                                 = 14,
+	CLIP_DISABLE_bit                                  = 1 << 16,
+	UCP_CULL_ONLY_ENA_bit                             = 1 << 17,
+	BOUNDARY_EDGE_FLAG_ENA_bit                        = 1 << 18,
+	DX_CLIP_SPACE_DEF_bit                             = 1 << 19,
+	DIS_CLIP_ERR_DETECT_bit                           = 1 << 20,
+	VTX_KILL_OR_bit                                   = 1 << 21,
+	DX_LINEAR_ATTR_CLIP_ENA_bit                       = 1 << 24,
+	VTE_VPORT_PROVOKE_DISABLE_bit                     = 1 << 25,
+	ZCLIP_NEAR_DISABLE_bit                            = 1 << 26,
+	ZCLIP_FAR_DISABLE_bit                             = 1 << 27,
+    PA_SU_SC_MODE_CNTL                                    = 0x00028814,
+	CULL_FRONT_bit                                    = 1 << 0,
+	CULL_BACK_bit                                     = 1 << 1,
+	FACE_bit                                          = 1 << 2,
+	POLY_MODE_mask                                    = 0x03 << 3,
+	POLY_MODE_shift                                   = 3,
+	    X_DISABLE_POLY_MODE                           = 0x00,
+	    X_DUAL_MODE                                   = 0x01,
+	POLYMODE_FRONT_PTYPE_mask                         = 0x07 << 5,
+	POLYMODE_FRONT_PTYPE_shift                        = 5,
+	    X_DRAW_POINTS                                 = 0x00,
+	    X_DRAW_LINES                                  = 0x01,
+	    X_DRAW_TRIANGLES                              = 0x02,
+	POLYMODE_BACK_PTYPE_mask                          = 0x07 << 8,
+	POLYMODE_BACK_PTYPE_shift                         = 8,
+/* 	    X_DRAW_POINTS                                 = 0x00, */
+/* 	    X_DRAW_LINES                                  = 0x01, */
+/* 	    X_DRAW_TRIANGLES                              = 0x02, */
+	POLY_OFFSET_FRONT_ENABLE_bit                      = 1 << 11,
+	POLY_OFFSET_BACK_ENABLE_bit                       = 1 << 12,
+	POLY_OFFSET_PARA_ENABLE_bit                       = 1 << 13,
+	VTX_WINDOW_OFFSET_ENABLE_bit                      = 1 << 16,
+	PROVOKING_VTX_LAST_bit                            = 1 << 19,
+	PERSP_CORR_DIS_bit                                = 1 << 20,
+	MULTI_PRIM_IB_ENA_bit                             = 1 << 21,
+    PA_CL_VTE_CNTL                                        = 0x00028818,
+	VPORT_X_SCALE_ENA_bit                             = 1 << 0,
+	VPORT_X_OFFSET_ENA_bit                            = 1 << 1,
+	VPORT_Y_SCALE_ENA_bit                             = 1 << 2,
+	VPORT_Y_OFFSET_ENA_bit                            = 1 << 3,
+	VPORT_Z_SCALE_ENA_bit                             = 1 << 4,
+	VPORT_Z_OFFSET_ENA_bit                            = 1 << 5,
+	VTX_XY_FMT_bit                                    = 1 << 8,
+	VTX_Z_FMT_bit                                     = 1 << 9,
+	VTX_W0_FMT_bit                                    = 1 << 10,
+	PERFCOUNTER_REF_bit                               = 1 << 11,
+    PA_CL_VS_OUT_CNTL                                     = 0x0002881c,
+	CLIP_DIST_ENA_0_bit                               = 1 << 0,
+	CLIP_DIST_ENA_1_bit                               = 1 << 1,
+	CLIP_DIST_ENA_2_bit                               = 1 << 2,
+	CLIP_DIST_ENA_3_bit                               = 1 << 3,
+	CLIP_DIST_ENA_4_bit                               = 1 << 4,
+	CLIP_DIST_ENA_5_bit                               = 1 << 5,
+	CLIP_DIST_ENA_6_bit                               = 1 << 6,
+	CLIP_DIST_ENA_7_bit                               = 1 << 7,
+	CULL_DIST_ENA_0_bit                               = 1 << 8,
+	CULL_DIST_ENA_1_bit                               = 1 << 9,
+	CULL_DIST_ENA_2_bit                               = 1 << 10,
+	CULL_DIST_ENA_3_bit                               = 1 << 11,
+	CULL_DIST_ENA_4_bit                               = 1 << 12,
+	CULL_DIST_ENA_5_bit                               = 1 << 13,
+	CULL_DIST_ENA_6_bit                               = 1 << 14,
+	CULL_DIST_ENA_7_bit                               = 1 << 15,
+	USE_VTX_POINT_SIZE_bit                            = 1 << 16,
+	USE_VTX_EDGE_FLAG_bit                             = 1 << 17,
+	USE_VTX_RENDER_TARGET_INDX_bit                    = 1 << 18,
+	USE_VTX_VIEWPORT_INDX_bit                         = 1 << 19,
+	USE_VTX_KILL_FLAG_bit                             = 1 << 20,
+	VS_OUT_MISC_VEC_ENA_bit                           = 1 << 21,
+	VS_OUT_CCDIST0_VEC_ENA_bit                        = 1 << 22,
+	VS_OUT_CCDIST1_VEC_ENA_bit                        = 1 << 23,
+    PA_CL_NANINF_CNTL                                     = 0x00028820,
+	VTE_XY_INF_DISCARD_bit                            = 1 << 0,
+	VTE_Z_INF_DISCARD_bit                             = 1 << 1,
+	VTE_W_INF_DISCARD_bit                             = 1 << 2,
+	VTE_0XNANINF_IS_0_bit                             = 1 << 3,
+	VTE_XY_NAN_RETAIN_bit                             = 1 << 4,
+	VTE_Z_NAN_RETAIN_bit                              = 1 << 5,
+	VTE_W_NAN_RETAIN_bit                              = 1 << 6,
+	VTE_W_RECIP_NAN_IS_0_bit                          = 1 << 7,
+	VS_XY_NAN_TO_INF_bit                              = 1 << 8,
+	VS_XY_INF_RETAIN_bit                              = 1 << 9,
+	VS_Z_NAN_TO_INF_bit                               = 1 << 10,
+	VS_Z_INF_RETAIN_bit                               = 1 << 11,
+	VS_W_NAN_TO_INF_bit                               = 1 << 12,
+	VS_W_INF_RETAIN_bit                               = 1 << 13,
+	VS_CLIP_DIST_INF_DISCARD_bit                      = 1 << 14,
+	VTE_NO_OUTPUT_NEG_0_bit                           = 1 << 20,
+    SQ_PGM_START_PS                                       = 0x00028840,
+    SQ_PGM_RESOURCES_PS                                   = 0x00028850, /* presumably the register offset; the indented enumerators that follow are its field masks, shifts and single-bit flags */
+	NUM_GPRS_mask                                     = 0xff << 0, /* unprefixed name: defined only here; it appears commented out under the later SQ_PGM_RESOURCES_* entries */
+	NUM_GPRS_shift                                    = 0,
+	STACK_SIZE_mask                                   = 0xff << 8,
+	STACK_SIZE_shift                                  = 8,
+	SQ_PGM_RESOURCES_PS__DX10_CLAMP_bit               = 1 << 21, /* register-prefixed name: the VS/GS/ES/FS entries each define their own prefixed DX10_CLAMP_bit with the same value */
+	FETCH_CACHE_LINES_mask                            = 0x07 << 24,
+	FETCH_CACHE_LINES_shift                           = 24,
+	UNCACHED_FIRST_INST_bit                           = 1 << 28,
+	CLAMP_CONSTS_bit                                  = 1 << 31, /* NOTE(review): with a 32-bit int, 1 << 31 shifts into the sign bit, which is undefined in ISO C; compilers commonly produce INT_MIN here -- confirm users cast to an unsigned 32-bit type before combining it with register values */
+    SQ_PGM_EXPORTS_PS                                     = 0x00028854,
+	EXPORT_MODE_mask                                  = 0x1f << 0,
+	EXPORT_MODE_shift                                 = 0,
+    SQ_PGM_START_VS                                       = 0x00028858,
+    SQ_PGM_RESOURCES_VS                                   = 0x00028868,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_VS__DX10_CLAMP_bit               = 1 << 21,
+/* 	FETCH_CACHE_LINES_mask                            = 0x07 << 24, */
+/* 	FETCH_CACHE_LINES_shift                           = 24, */
+/* 	UNCACHED_FIRST_INST_bit                           = 1 << 28, */
+    SQ_PGM_START_GS                                       = 0x0002886c,
+    SQ_PGM_RESOURCES_GS                                   = 0x0002887c,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_GS__DX10_CLAMP_bit               = 1 << 21,
+/* 	FETCH_CACHE_LINES_mask                            = 0x07 << 24, */
+/* 	FETCH_CACHE_LINES_shift                           = 24, */
+/* 	UNCACHED_FIRST_INST_bit                           = 1 << 28, */
+    SQ_PGM_START_ES                                       = 0x00028880,
+    SQ_PGM_RESOURCES_ES                                   = 0x00028890,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_ES__DX10_CLAMP_bit               = 1 << 21,
+/* 	FETCH_CACHE_LINES_mask                            = 0x07 << 24, */
+/* 	FETCH_CACHE_LINES_shift                           = 24, */
+/* 	UNCACHED_FIRST_INST_bit                           = 1 << 28, */
+    SQ_PGM_START_FS                                       = 0x00028894,
+    SQ_PGM_RESOURCES_FS                                   = 0x000288a4,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_FS__DX10_CLAMP_bit               = 1 << 21,
+    SQ_ESGS_RING_ITEMSIZE                                 = 0x000288a8,
+	ITEMSIZE_mask                                     = 0x7fff << 0,
+	ITEMSIZE_shift                                    = 0,
+    SQ_GSVS_RING_ITEMSIZE                                 = 0x000288ac,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_ESTMP_RING_ITEMSIZE                                = 0x000288b0,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_GSTMP_RING_ITEMSIZE                                = 0x000288b4,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_VSTMP_RING_ITEMSIZE                                = 0x000288b8,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_PSTMP_RING_ITEMSIZE                                = 0x000288bc,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_FBUF_RING_ITEMSIZE                                 = 0x000288c0,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_REDUC_RING_ITEMSIZE                                = 0x000288c4,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_GS_VERT_ITEMSIZE                                   = 0x000288c8,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_PGM_CF_OFFSET_PS                                   = 0x000288cc,
+	PGM_CF_OFFSET_mask                                = 0xfffff << 0,
+	PGM_CF_OFFSET_shift                               = 0,
+    SQ_PGM_CF_OFFSET_VS                                   = 0x000288d0,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_PGM_CF_OFFSET_GS                                   = 0x000288d4,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_PGM_CF_OFFSET_ES                                   = 0x000288d8,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_PGM_CF_OFFSET_FS                                   = 0x000288dc,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_VTX_SEMANTIC_CLEAR                                 = 0x000288e0,
+    SQ_ALU_CONST_CACHE_PS_0                               = 0x00028940,
+	SQ_ALU_CONST_CACHE_PS_0_num                       = 16,
+    SQ_ALU_CONST_CACHE_VS_0                               = 0x00028980,
+	SQ_ALU_CONST_CACHE_VS_0_num                       = 16,
+    SQ_ALU_CONST_CACHE_GS_0                               = 0x000289c0,
+	SQ_ALU_CONST_CACHE_GS_0_num                       = 16,
+    PA_SU_POINT_SIZE                                      = 0x00028a00, /* presumably the register offset; two 16-bit fields follow, HEIGHT in bits 15:0 and WIDTH in bits 31:16 */
+	PA_SU_POINT_SIZE__HEIGHT_mask                     = 0xffff << 0,
+	PA_SU_POINT_SIZE__HEIGHT_shift                    = 0,
+	PA_SU_POINT_SIZE__WIDTH_mask                      = 0xffff << 16, /* NOTE(review): with a 32-bit int, 0xffff << 16 (0xffff0000) is not representable; signed left-shift overflow is undefined in ISO C -- confirm the resulting (negative) enumerator is used as an unsigned mask */
+	PA_SU_POINT_SIZE__WIDTH_shift                     = 16, /* register-prefixed because PA_SU_LINE_CNTL defines its own WIDTH field at a different position */
+    PA_SU_POINT_MINMAX                                    = 0x00028a04, /* presumably the register offset; two 16-bit fields follow, MIN_SIZE in bits 15:0 and MAX_SIZE in bits 31:16 */
+	MIN_SIZE_mask                                     = 0xffff << 0,
+	MIN_SIZE_shift                                    = 0,
+	MAX_SIZE_mask                                     = 0xffff << 16, /* NOTE(review): with a 32-bit int, 0xffff << 16 (0xffff0000) is not representable; signed left-shift overflow is undefined in ISO C -- confirm the resulting (negative) enumerator is used as an unsigned mask */
+	MAX_SIZE_shift                                    = 16,
+    PA_SU_LINE_CNTL                                       = 0x00028a08,
+	PA_SU_LINE_CNTL__WIDTH_mask                       = 0xffff << 0,
+	PA_SU_LINE_CNTL__WIDTH_shift                      = 0,
+    PA_SC_LINE_STIPPLE                                    = 0x00028a0c,
+	LINE_PATTERN_mask                                 = 0xffff << 0,
+	LINE_PATTERN_shift                                = 0,
+	REPEAT_COUNT_mask                                 = 0xff << 16,
+	REPEAT_COUNT_shift                                = 16,
+	PATTERN_BIT_ORDER_bit                             = 1 << 28,
+	AUTO_RESET_CNTL_mask                              = 0x03 << 29,
+	AUTO_RESET_CNTL_shift                             = 29,
+    VGT_OUTPUT_PATH_CNTL                                  = 0x00028a10,
+	PATH_SELECT_mask                                  = 0x03 << 0,
+	PATH_SELECT_shift                                 = 0,
+	    VGT_OUTPATH_VTX_REUSE                         = 0x00,
+	    VGT_OUTPATH_TESS_EN                           = 0x01,
+	    VGT_OUTPATH_PASSTHRU                          = 0x02,
+	    VGT_OUTPATH_GS_BLOCK                          = 0x03,
+    VGT_HOS_CNTL                                          = 0x00028a14,
+	TESS_MODE_mask                                    = 0x03 << 0,
+	TESS_MODE_shift                                   = 0,
+    VGT_HOS_MAX_TESS_LEVEL                                = 0x00028a18,
+    VGT_HOS_MIN_TESS_LEVEL                                = 0x00028a1c,
+    VGT_HOS_REUSE_DEPTH                                   = 0x00028a20,
+	REUSE_DEPTH_mask                                  = 0xff << 0,
+	REUSE_DEPTH_shift                                 = 0,
+    VGT_GROUP_PRIM_TYPE                                   = 0x00028a24,
+	VGT_GROUP_PRIM_TYPE__PRIM_TYPE_mask               = 0x1f << 0,
+	VGT_GROUP_PRIM_TYPE__PRIM_TYPE_shift              = 0,
+	    VGT_GRP_3D_POINT                              = 0x00,
+	    VGT_GRP_3D_LINE                               = 0x01,
+	    VGT_GRP_3D_TRI                                = 0x02,
+	    VGT_GRP_3D_RECT                               = 0x03,
+	    VGT_GRP_3D_QUAD                               = 0x04,
+	    VGT_GRP_2D_COPY_RECT_V0                       = 0x05,
+	    VGT_GRP_2D_COPY_RECT_V1                       = 0x06,
+	    VGT_GRP_2D_COPY_RECT_V2                       = 0x07,
+	    VGT_GRP_2D_COPY_RECT_V3                       = 0x08,
+	    VGT_GRP_2D_FILL_RECT                          = 0x09,
+	    VGT_GRP_2D_LINE                               = 0x0a,
+	    VGT_GRP_2D_TRI                                = 0x0b,
+	    VGT_GRP_PRIM_INDEX_LINE                       = 0x0c,
+	    VGT_GRP_PRIM_INDEX_TRI                        = 0x0d,
+	    VGT_GRP_PRIM_INDEX_QUAD                       = 0x0e,
+	    VGT_GRP_3D_LINE_ADJ                           = 0x0f,
+	    VGT_GRP_3D_TRI_ADJ                            = 0x10,
+	RETAIN_ORDER_bit                                  = 1 << 14,
+	RETAIN_QUADS_bit                                  = 1 << 15,
+	PRIM_ORDER_mask                                   = 0x07 << 16,
+	PRIM_ORDER_shift                                  = 16,
+	    VGT_GRP_LIST                                  = 0x00,
+	    VGT_GRP_STRIP                                 = 0x01,
+	    VGT_GRP_FAN                                   = 0x02,
+	    VGT_GRP_LOOP                                  = 0x03,
+	    VGT_GRP_POLYGON                               = 0x04,
+    VGT_GROUP_FIRST_DECR                                  = 0x00028a28,
+	FIRST_DECR_mask                                   = 0x0f << 0,
+	FIRST_DECR_shift                                  = 0,
+    VGT_GROUP_DECR                                        = 0x00028a2c,
+	DECR_mask                                         = 0x0f << 0,
+	DECR_shift                                        = 0,
+    VGT_GROUP_VECT_0_CNTL                                 = 0x00028a30,
+	COMP_X_EN_bit                                     = 1 << 0,
+	COMP_Y_EN_bit                                     = 1 << 1,
+	COMP_Z_EN_bit                                     = 1 << 2,
+	COMP_W_EN_bit                                     = 1 << 3,
+	VGT_GROUP_VECT_0_CNTL__STRIDE_mask                = 0xff << 8,
+	VGT_GROUP_VECT_0_CNTL__STRIDE_shift               = 8,
+	SHIFT_mask                                        = 0xff << 16,
+	SHIFT_shift                                       = 16,
+    VGT_GROUP_VECT_1_CNTL                                 = 0x00028a34,
+/* 	COMP_X_EN_bit                                     = 1 << 0, */
+/* 	COMP_Y_EN_bit                                     = 1 << 1, */
+/* 	COMP_Z_EN_bit                                     = 1 << 2, */
+/* 	COMP_W_EN_bit                                     = 1 << 3, */
+	VGT_GROUP_VECT_1_CNTL__STRIDE_mask                = 0xff << 8,
+	VGT_GROUP_VECT_1_CNTL__STRIDE_shift               = 8,
+/* 	SHIFT_mask                                        = 0xff << 16, */
+/* 	SHIFT_shift                                       = 16, */
+    VGT_GROUP_VECT_0_FMT_CNTL                             = 0x00028a38,
+	X_CONV_mask                                       = 0x0f << 0,
+	X_CONV_shift                                      = 0,
+	    VGT_GRP_INDEX_16                              = 0x00,
+	    VGT_GRP_INDEX_32                              = 0x01,
+	    VGT_GRP_UINT_16                               = 0x02,
+	    VGT_GRP_UINT_32                               = 0x03,
+	    VGT_GRP_SINT_16                               = 0x04,
+	    VGT_GRP_SINT_32                               = 0x05,
+	    VGT_GRP_FLOAT_32                              = 0x06,
+	    VGT_GRP_AUTO_PRIM                             = 0x07,
+	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08,
+	X_OFFSET_mask                                     = 0x0f << 4,
+	X_OFFSET_shift                                    = 4,
+	Y_CONV_mask                                       = 0x0f << 8,
+	Y_CONV_shift                                      = 8,
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+	Y_OFFSET_mask                                     = 0x0f << 12,
+	Y_OFFSET_shift                                    = 12,
+	Z_CONV_mask                                       = 0x0f << 16,
+	Z_CONV_shift                                      = 16,
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+	Z_OFFSET_mask                                     = 0x0f << 20,
+	Z_OFFSET_shift                                    = 20,
+	W_CONV_mask                                       = 0x0f << 24,
+	W_CONV_shift                                      = 24,
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+	W_OFFSET_mask                                     = 0x0f << 28,
+	W_OFFSET_shift                                    = 28,
+    VGT_GROUP_VECT_1_FMT_CNTL                             = 0x00028a3c,
+/* 	X_CONV_mask                                       = 0x0f << 0, */
+/* 	X_CONV_shift                                      = 0, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	X_OFFSET_mask                                     = 0x0f << 4, */
+/* 	X_OFFSET_shift                                    = 4, */
+/* 	Y_CONV_mask                                       = 0x0f << 8, */
+/* 	Y_CONV_shift                                      = 8, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	Y_OFFSET_mask                                     = 0x0f << 12, */
+/* 	Y_OFFSET_shift                                    = 12, */
+/* 	Z_CONV_mask                                       = 0x0f << 16, */
+/* 	Z_CONV_shift                                      = 16, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	Z_OFFSET_mask                                     = 0x0f << 20, */
+/* 	Z_OFFSET_shift                                    = 20, */
+/* 	W_CONV_mask                                       = 0x0f << 24, */
+/* 	W_CONV_shift                                      = 24, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	W_OFFSET_mask                                     = 0x0f << 28, */
+/* 	W_OFFSET_shift                                    = 28, */
+    VGT_GS_MODE                                           = 0x00028a40,
+	MODE_mask                                         = 0x03 << 0,
+	MODE_shift                                        = 0,
+	    GS_OFF                                        = 0x00,
+	    GS_SCENARIO_A                                 = 0x01,
+	    GS_SCENARIO_B                                 = 0x02,
+	    GS_SCENARIO_G                                 = 0x03,
+	ES_PASSTHRU_bit                                   = 1 << 2,
+	CUT_MODE_mask                                     = 0x03 << 3,
+	CUT_MODE_shift                                    = 3,
+	    GS_CUT_1024                                   = 0x00,
+	    GS_CUT_512                                    = 0x01,
+	    GS_CUT_256                                    = 0x02,
+	    GS_CUT_128                                    = 0x03,
+    PA_SC_MPASS_PS_CNTL                                   = 0x00028a48,
+	MPASS_PIX_VEC_PER_PASS_mask                       = 0xfffff << 0,
+	MPASS_PIX_VEC_PER_PASS_shift                      = 0,
+	MPASS_PS_ENA_bit                                  = 1 << 31,
+    PA_SC_MODE_CNTL                                       = 0x00028a4c,
+	MSAA_ENABLE_bit                                   = 1 << 0,
+	CLIPRECT_ENABLE_bit                               = 1 << 1,
+	LINE_STIPPLE_ENABLE_bit                           = 1 << 2,
+	MULTI_CHIP_PRIM_DISCARD_ENAB_bit                  = 1 << 3,
+	WALK_ORDER_ENABLE_bit                             = 1 << 4,
+	HALVE_DETAIL_SAMPLE_PERF_bit                      = 1 << 5,
+	WALK_SIZE_bit                                     = 1 << 6,
+	WALK_ALIGNMENT_bit                                = 1 << 7,
+	WALK_ALIGN8_PRIM_FITS_ST_bit                      = 1 << 8,
+	TILE_COVER_NO_SCISSOR_bit                         = 1 << 9,
+	KILL_PIX_POST_HI_Z_bit                            = 1 << 10,
+	KILL_PIX_POST_DETAIL_MASK_bit                     = 1 << 11,
+	MULTI_CHIP_SUPERTILE_ENABLE_bit                   = 1 << 12,
+	TILE_COVER_DISABLE_bit                            = 1 << 13,
+	FORCE_EOV_CNTDWN_ENABLE_bit                       = 1 << 14,
+	FORCE_EOV_TILE_ENABLE_bit                         = 1 << 15,
+	FORCE_EOV_REZ_ENABLE_bit                          = 1 << 16,
+	PS_ITER_SAMPLE_bit                                = 1 << 17,
+    VGT_ENHANCE                                           = 0x00028a50,
+	VGT_ENHANCE__MI_TIMESTAMP_RES_mask                = 0x03 << 0,
+	VGT_ENHANCE__MI_TIMESTAMP_RES_shift               = 0,
+	    X_0_992_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_32   = 0x00,
+	    X_0_496_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_16   = 0x01,
+	    X_0_248_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_8    = 0x02,
+	    X_0_124_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_4    = 0x03,
+	MISC_mask                                         = 0x3fffffff << 2,
+	MISC_shift                                        = 2,
+    VGT_GS_OUT_PRIM_TYPE                                  = 0x00028a6c,
+	OUTPRIM_TYPE_mask                                 = 0x3f << 0,
+	OUTPRIM_TYPE_shift                                = 0,
+	    POINTLIST                                     = 0x00,
+	    LINESTRIP                                     = 0x01,
+	    TRISTRIP                                      = 0x02,
+    VGT_DMA_SIZE                                          = 0x00028a74,
+    VGT_DMA_INDEX_TYPE                                    = 0x00028a7c,
+/* 	INDEX_TYPE_mask                                   = 0x03 << 0, */
+/* 	INDEX_TYPE_shift                                  = 0, */
+	    VGT_INDEX_16                                  = 0x00,
+	    VGT_INDEX_32                                  = 0x01,
+	SWAP_MODE_mask                                    = 0x03 << 2,
+	SWAP_MODE_shift                                   = 2,
+	    VGT_DMA_SWAP_NONE                             = 0x00,
+	    VGT_DMA_SWAP_16_BIT                           = 0x01,
+	    VGT_DMA_SWAP_32_BIT                           = 0x02,
+	    VGT_DMA_SWAP_WORD                             = 0x03,
+    VGT_PRIMITIVEID_EN                                    = 0x00028a84,
+	PRIMITIVEID_EN_bit                                = 1 << 0,
+    VGT_DMA_NUM_INSTANCES                                 = 0x00028a88,
+    VGT_EVENT_INITIATOR                                   = 0x00028a90,
+	EVENT_TYPE_mask                                   = 0x3f << 0,
+	EVENT_TYPE_shift                                  = 0,
+	    CACHE_FLUSH_TS                                = 0x04,
+	    CONTEXT_DONE                                  = 0x05,
+	    CACHE_FLUSH                                   = 0x06,
+	    VIZQUERY_START                                = 0x07,
+	    VIZQUERY_END                                  = 0x08,
+	    SC_WAIT_WC                                    = 0x09,
+	    MPASS_PS_CP_REFETCH                           = 0x0a,
+	    MPASS_PS_RST_START                            = 0x0b,
+	    MPASS_PS_INCR_START                           = 0x0c,
+	    RST_PIX_CNT                                   = 0x0d,
+	    RST_VTX_CNT                                   = 0x0e,
+	    VS_PARTIAL_FLUSH                              = 0x0f,
+	    PS_PARTIAL_FLUSH                              = 0x10,
+	    CACHE_FLUSH_AND_INV_TS_EVENT                  = 0x14,
+	    ZPASS_DONE                                    = 0x15,
+	    CACHE_FLUSH_AND_INV_EVENT                     = 0x16,
+	    PERFCOUNTER_START                             = 0x17,
+	    PERFCOUNTER_STOP                              = 0x18,
+	    PIPELINESTAT_START                            = 0x19,
+	    PIPELINESTAT_STOP                             = 0x1a,
+	    PERFCOUNTER_SAMPLE                            = 0x1b,
+	    FLUSH_ES_OUTPUT                               = 0x1c,
+	    FLUSH_GS_OUTPUT                               = 0x1d,
+	    SAMPLE_PIPELINESTAT                           = 0x1e,
+	    SO_VGTSTREAMOUT_FLUSH                         = 0x1f,
+	    SAMPLE_STREAMOUTSTATS                         = 0x20,
+	    RESET_VTX_CNT                                 = 0x21,
+	    BLOCK_CONTEXT_DONE                            = 0x22,
+	    CR_CONTEXT_DONE                               = 0x23,
+	    VGT_FLUSH                                     = 0x24,
+	    CR_DONE_TS                                    = 0x25,
+	    SQ_NON_EVENT                                  = 0x26,
+	    SC_SEND_DB_VPZ                                = 0x27,
+	    BOTTOM_OF_PIPE_TS                             = 0x28,
+	    DB_CACHE_FLUSH_AND_INV                        = 0x2a,
+	ADDRESS_HI_mask                                   = 0xff << 19,
+	ADDRESS_HI_shift                                  = 19,
+	EXTENDED_EVENT_bit                                = 1 << 27,
+    VGT_MULTI_PRIM_IB_RESET_EN                            = 0x00028a94,
+	RESET_EN_bit                                      = 1 << 0,
+    VGT_INSTANCE_STEP_RATE_0                              = 0x00028aa0,
+    VGT_INSTANCE_STEP_RATE_1                              = 0x00028aa4,
+    VGT_STRMOUT_EN                                        = 0x00028ab0,
+	STREAMOUT_bit                                     = 1 << 0,
+    VGT_REUSE_OFF                                         = 0x00028ab4,
+	REUSE_OFF_bit                                     = 1 << 0,
+    VGT_VTX_CNT_EN                                        = 0x00028ab8,
+	VTX_CNT_EN_bit                                    = 1 << 0,
+    VGT_STRMOUT_BUFFER_SIZE_0                             = 0x00028ad0,
+    VGT_STRMOUT_VTX_STRIDE_0                              = 0x00028ad4,
+	VGT_STRMOUT_VTX_STRIDE_0__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_0__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_0                             = 0x00028ad8,
+    VGT_STRMOUT_BUFFER_OFFSET_0                           = 0x00028adc,
+    VGT_STRMOUT_BUFFER_SIZE_1                             = 0x00028ae0,
+    VGT_STRMOUT_VTX_STRIDE_1                              = 0x00028ae4,
+	VGT_STRMOUT_VTX_STRIDE_1__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_1__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_1                             = 0x00028ae8,
+    VGT_STRMOUT_BUFFER_OFFSET_1                           = 0x00028aec,
+    VGT_STRMOUT_BUFFER_SIZE_2                             = 0x00028af0,
+    VGT_STRMOUT_VTX_STRIDE_2                              = 0x00028af4,
+	VGT_STRMOUT_VTX_STRIDE_2__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_2__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_2                             = 0x00028af8,
+    VGT_STRMOUT_BUFFER_OFFSET_2                           = 0x00028afc,
+    VGT_STRMOUT_BUFFER_SIZE_3                             = 0x00028b00,
+    VGT_STRMOUT_VTX_STRIDE_3                              = 0x00028b04,
+	VGT_STRMOUT_VTX_STRIDE_3__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_3__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_3                             = 0x00028b08,
+    VGT_STRMOUT_BUFFER_OFFSET_3                           = 0x00028b0c,
+    VGT_STRMOUT_BASE_OFFSET_0                             = 0x00028b10,
+    VGT_STRMOUT_BASE_OFFSET_1                             = 0x00028b14,
+    VGT_STRMOUT_BASE_OFFSET_2                             = 0x00028b18,
+    VGT_STRMOUT_BASE_OFFSET_3                             = 0x00028b1c,
+    VGT_STRMOUT_BUFFER_EN                                 = 0x00028b20,
+	BUFFER_0_EN_bit                                   = 1 << 0,
+	BUFFER_1_EN_bit                                   = 1 << 1,
+	BUFFER_2_EN_bit                                   = 1 << 2,
+	BUFFER_3_EN_bit                                   = 1 << 3,
+    VGT_STRMOUT_DRAW_OPAQUE_OFFSET                        = 0x00028b28,
+    VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE            = 0x00028b2c,
+    VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE                 = 0x00028b30,
+    VGT_STRMOUT_BASE_OFFSET_HI_0                          = 0x00028b44,
+	VGT_STRMOUT_BASE_OFFSET_HI_0__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_0__BASE_OFFSET_shift   = 0,
+    VGT_STRMOUT_BASE_OFFSET_HI_1                          = 0x00028b48,
+	VGT_STRMOUT_BASE_OFFSET_HI_1__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_1__BASE_OFFSET_shift   = 0,
+    VGT_STRMOUT_BASE_OFFSET_HI_2                          = 0x00028b4c,
+	VGT_STRMOUT_BASE_OFFSET_HI_2__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_2__BASE_OFFSET_shift   = 0,
+    VGT_STRMOUT_BASE_OFFSET_HI_3                          = 0x00028b50,
+	VGT_STRMOUT_BASE_OFFSET_HI_3__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_3__BASE_OFFSET_shift   = 0,
+    PA_SC_LINE_CNTL                                       = 0x00028c00,
+	BRES_CNTL_mask                                    = 0xff << 0,
+	BRES_CNTL_shift                                   = 0,
+	USE_BRES_CNTL_bit                                 = 1 << 8,
+	EXPAND_LINE_WIDTH_bit                             = 1 << 9,
+	LAST_PIXEL_bit                                    = 1 << 10,
+    PA_SC_AA_CONFIG                                       = 0x00028c04,
+	MSAA_NUM_SAMPLES_mask                             = 0x03 << 0,
+	MSAA_NUM_SAMPLES_shift                            = 0,
+	AA_MASK_CENTROID_DTMN_bit                         = 1 << 4,
+	MAX_SAMPLE_DIST_mask                              = 0x0f << 13,
+	MAX_SAMPLE_DIST_shift                             = 13,
+    PA_SU_VTX_CNTL                                        = 0x00028c08,
+	PIX_CENTER_bit                                    = 1 << 0,
+	PA_SU_VTX_CNTL__ROUND_MODE_mask                   = 0x03 << 1,
+	PA_SU_VTX_CNTL__ROUND_MODE_shift                  = 1,
+	    X_TRUNCATE                                    = 0x00,
+	    X_ROUND                                       = 0x01,
+	    X_ROUND_TO_EVEN                               = 0x02,
+	    X_ROUND_TO_ODD                                = 0x03,
+	QUANT_MODE_mask                                   = 0x07 << 3,
+	QUANT_MODE_shift                                  = 3,
+	    X_1_16TH                                      = 0x00,
+	    X_1_8TH                                       = 0x01,
+	    X_1_4TH                                       = 0x02,
+	    X_1_2                                         = 0x03,
+	    X_1                                           = 0x04,
+	    X_1_256TH                                     = 0x05,
+    PA_CL_GB_VERT_CLIP_ADJ                                = 0x00028c0c,
+    PA_CL_GB_VERT_DISC_ADJ                                = 0x00028c10,
+    PA_CL_GB_HORZ_CLIP_ADJ                                = 0x00028c14,
+    PA_CL_GB_HORZ_DISC_ADJ                                = 0x00028c18,
+    PA_SC_AA_SAMPLE_LOCS_MCTX                             = 0x00028c1c,
+/* 	S0_X_mask                                         = 0x0f << 0, */
+/* 	S0_X_shift                                        = 0, */
+/* 	S0_Y_mask                                         = 0x0f << 4, */
+/* 	S0_Y_shift                                        = 4, */
+/* 	S1_X_mask                                         = 0x0f << 8, */
+/* 	S1_X_shift                                        = 8, */
+/* 	S1_Y_mask                                         = 0x0f << 12, */
+/* 	S1_Y_shift                                        = 12, */
+/* 	S2_X_mask                                         = 0x0f << 16, */
+/* 	S2_X_shift                                        = 16, */
+/* 	S2_Y_mask                                         = 0x0f << 20, */
+/* 	S2_Y_shift                                        = 20, */
+/* 	S3_X_mask                                         = 0x0f << 24, */
+/* 	S3_X_shift                                        = 24, */
+/* 	S3_Y_mask                                         = 0x0f << 28, */
+/* 	S3_Y_shift                                        = 28, */
+    PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX                      = 0x00028c20,
+/* 	S4_X_mask                                         = 0x0f << 0, */
+/* 	S4_X_shift                                        = 0, */
+/* 	S4_Y_mask                                         = 0x0f << 4, */
+/* 	S4_Y_shift                                        = 4, */
+/* 	S5_X_mask                                         = 0x0f << 8, */
+/* 	S5_X_shift                                        = 8, */
+/* 	S5_Y_mask                                         = 0x0f << 12, */
+/* 	S5_Y_shift                                        = 12, */
+/* 	S6_X_mask                                         = 0x0f << 16, */
+/* 	S6_X_shift                                        = 16, */
+/* 	S6_Y_mask                                         = 0x0f << 20, */
+/* 	S6_Y_shift                                        = 20, */
+/* 	S7_X_mask                                         = 0x0f << 24, */
+/* 	S7_X_shift                                        = 24, */
+/* 	S7_Y_mask                                         = 0x0f << 28, */
+/* 	S7_Y_shift                                        = 28, */
+    CB_CLRCMP_CONTROL                                     = 0x00028c30,
+	CLRCMP_FCN_SRC_mask                               = 0x07 << 0,
+	CLRCMP_FCN_SRC_shift                              = 0,
+	    CLRCMP_DRAW_ALWAYS                            = 0x00,
+	    CLRCMP_DRAW_NEVER                             = 0x01,
+	    CLRCMP_DRAW_ON_NEQ                            = 0x04,
+	    CLRCMP_DRAW_ON_EQ                             = 0x05,
+	CLRCMP_FCN_DST_mask                               = 0x07 << 8,
+	CLRCMP_FCN_DST_shift                              = 8,
+/* 	    CLRCMP_DRAW_ALWAYS                            = 0x00, */
+/* 	    CLRCMP_DRAW_NEVER                             = 0x01, */
+/* 	    CLRCMP_DRAW_ON_NEQ                            = 0x04, */
+/* 	    CLRCMP_DRAW_ON_EQ                             = 0x05, */
+	CLRCMP_FCN_SEL_mask                               = 0x03 << 24,
+	CLRCMP_FCN_SEL_shift                              = 24,
+	    CLRCMP_SEL_DST                                = 0x00,
+	    CLRCMP_SEL_SRC                                = 0x01,
+	    CLRCMP_SEL_AND                                = 0x02,
+    CB_CLRCMP_SRC                                         = 0x00028c34,
+    CB_CLRCMP_DST                                         = 0x00028c38,
+    CB_CLRCMP_MSK                                         = 0x00028c3c,
+    PA_SC_AA_MASK                                         = 0x00028c48,
+    VGT_VERTEX_REUSE_BLOCK_CNTL                           = 0x00028c58,
+	VTX_REUSE_DEPTH_mask                              = 0xff << 0,
+	VTX_REUSE_DEPTH_shift                             = 0,
+    VGT_OUT_DEALLOC_CNTL                                  = 0x00028c5c,
+	DEALLOC_DIST_mask                                 = 0x7f << 0,
+	DEALLOC_DIST_shift                                = 0,
+    DB_RENDER_CONTROL                                     = 0x00028d0c,
+	DEPTH_CLEAR_ENABLE_bit                            = 1 << 0,
+	STENCIL_CLEAR_ENABLE_bit                          = 1 << 1,
+	DEPTH_COPY_bit                                    = 1 << 2,
+	STENCIL_COPY_bit                                  = 1 << 3,
+	RESUMMARIZE_ENABLE_bit                            = 1 << 4,
+	STENCIL_COMPRESS_DISABLE_bit                      = 1 << 5,
+	DEPTH_COMPRESS_DISABLE_bit                        = 1 << 6,
+	COPY_CENTROID_bit                                 = 1 << 7,
+	COPY_SAMPLE_mask                                  = 0x07 << 8,
+	COPY_SAMPLE_shift                                 = 8,
+	ZPASS_INCREMENT_DISABLE_bit                       = 1 << 11,
+    DB_RENDER_OVERRIDE                                    = 0x00028d10,
+	FORCE_HIZ_ENABLE_mask                             = 0x03 << 0,
+	FORCE_HIZ_ENABLE_shift                            = 0,
+	    FORCE_OFF                                     = 0x00,
+	    FORCE_ENABLE                                  = 0x01,
+	    FORCE_DISABLE                                 = 0x02,
+	    FORCE_RESERVED                                = 0x03,
+	FORCE_HIS_ENABLE0_mask                            = 0x03 << 2,
+	FORCE_HIS_ENABLE0_shift                           = 2,
+/* 	    FORCE_OFF                                     = 0x00, */
+/* 	    FORCE_ENABLE                                  = 0x01, */
+/* 	    FORCE_DISABLE                                 = 0x02, */
+/* 	    FORCE_RESERVED                                = 0x03, */
+	FORCE_HIS_ENABLE1_mask                            = 0x03 << 4,
+	FORCE_HIS_ENABLE1_shift                           = 4,
+/* 	    FORCE_OFF                                     = 0x00, */
+/* 	    FORCE_ENABLE                                  = 0x01, */
+/* 	    FORCE_DISABLE                                 = 0x02, */
+/* 	    FORCE_RESERVED                                = 0x03, */
+	FORCE_SHADER_Z_ORDER_bit                          = 1 << 6,
+	FAST_Z_DISABLE_bit                                = 1 << 7,
+	FAST_STENCIL_DISABLE_bit                          = 1 << 8,
+	NOOP_CULL_DISABLE_bit                             = 1 << 9,
+	FORCE_COLOR_KILL_bit                              = 1 << 10,
+	FORCE_Z_READ_bit                                  = 1 << 11,
+	FORCE_STENCIL_READ_bit                            = 1 << 12,
+	FORCE_FULL_Z_RANGE_mask                           = 0x03 << 13,
+	FORCE_FULL_Z_RANGE_shift                          = 13,
+/* 	    FORCE_OFF                                     = 0x00, */
+/* 	    FORCE_ENABLE                                  = 0x01, */
+/* 	    FORCE_DISABLE                                 = 0x02, */
+/* 	    FORCE_RESERVED                                = 0x03, */
+	FORCE_QC_SMASK_CONFLICT_bit                       = 1 << 15,
+	DISABLE_VIEWPORT_CLAMP_bit                        = 1 << 16,
+	IGNORE_SC_ZRANGE_bit                              = 1 << 17,
+    DB_HTILE_SURFACE                                      = 0x00028d24,
+	HTILE_WIDTH_bit                                   = 1 << 0,
+	HTILE_HEIGHT_bit                                  = 1 << 1,
+	LINEAR_bit                                        = 1 << 2,
+	FULL_CACHE_bit                                    = 1 << 3,
+	HTILE_USES_PRELOAD_WIN_bit                        = 1 << 4,
+	PRELOAD_bit                                       = 1 << 5,
+	PREFETCH_WIDTH_mask                               = 0x3f << 6,
+	PREFETCH_WIDTH_shift                              = 6,
+	PREFETCH_HEIGHT_mask                              = 0x3f << 12,
+	PREFETCH_HEIGHT_shift                             = 12,
+    DB_SRESULTS_COMPARE_STATE1                            = 0x00028d2c,
+	COMPAREFUNC1_mask                                 = 0x07 << 0,
+	COMPAREFUNC1_shift                                = 0,
+/* 	    REF_NEVER                                     = 0x00, */
+/* 	    REF_LESS                                      = 0x01, */
+/* 	    REF_EQUAL                                     = 0x02, */
+/* 	    REF_LEQUAL                                    = 0x03, */
+/* 	    REF_GREATER                                   = 0x04, */
+/* 	    REF_NOTEQUAL                                  = 0x05, */
+/* 	    REF_GEQUAL                                    = 0x06, */
+/* 	    REF_ALWAYS                                    = 0x07, */
+	COMPAREVALUE1_mask                                = 0xff << 4,
+	COMPAREVALUE1_shift                               = 4,
+	COMPAREMASK1_mask                                 = 0xff << 12,
+	COMPAREMASK1_shift                                = 12,
+	ENABLE1_bit                                       = 1 << 24,
+    DB_PRELOAD_CONTROL                                    = 0x00028d30,
+	START_X_mask                                      = 0xff << 0,
+	START_X_shift                                     = 0,
+	START_Y_mask                                      = 0xff << 8,
+	START_Y_shift                                     = 8,
+	MAX_X_mask                                        = 0xff << 16,
+	MAX_X_shift                                       = 16,
+	MAX_Y_mask                                        = 0xff << 24,
+	MAX_Y_shift                                       = 24,
+    DB_PREFETCH_LIMIT                                     = 0x00028d34,
+	DEPTH_HEIGHT_TILE_MAX_mask                        = 0x3ff << 0,
+	DEPTH_HEIGHT_TILE_MAX_shift                       = 0,
+    PA_SU_POLY_OFFSET_DB_FMT_CNTL                         = 0x00028df8,
+	POLY_OFFSET_NEG_NUM_DB_BITS_mask                  = 0xff << 0,
+	POLY_OFFSET_NEG_NUM_DB_BITS_shift                 = 0,
+	POLY_OFFSET_DB_IS_FLOAT_FMT_bit                   = 1 << 8,
+    PA_SU_POLY_OFFSET_CLAMP                               = 0x00028dfc,
+    PA_SU_POLY_OFFSET_FRONT_SCALE                         = 0x00028e00,
+    PA_SU_POLY_OFFSET_FRONT_OFFSET                        = 0x00028e04,
+    PA_SU_POLY_OFFSET_BACK_SCALE                          = 0x00028e08,
+    PA_SU_POLY_OFFSET_BACK_OFFSET                         = 0x00028e0c,
+    PA_CL_POINT_X_RAD                                     = 0x00028e10,
+    PA_CL_POINT_Y_RAD                                     = 0x00028e14,
+    PA_CL_POINT_SIZE                                      = 0x00028e18,
+    PA_CL_POINT_CULL_RAD                                  = 0x00028e1c,
+    PA_CL_UCP_0_X                                         = 0x00028e20,
+	PA_CL_UCP_0_X_num                                 = 6,
+	PA_CL_UCP_0_X_offset                              = 16,
+    PA_CL_UCP_0_Y                                         = 0x00028e24,
+	PA_CL_UCP_0_Y_num                                 = 6,
+	PA_CL_UCP_0_Y_offset                              = 16,
+    PA_CL_UCP_0_Z                                         = 0x00028e28,
+	PA_CL_UCP_0_Z_num                                 = 6,
+	PA_CL_UCP_0_Z_offset                              = 16,
+    SQ_ALU_CONSTANT0_0                                    = 0x00030000,
+    SQ_ALU_CONSTANT1_0                                    = 0x00030004,
+    SQ_ALU_CONSTANT2_0                                    = 0x00030008,
+    SQ_ALU_CONSTANT3_0                                    = 0x0003000c,
+    SQ_VTX_CONSTANT_WORD0_0                               = 0x00038000,
+    SQ_TEX_RESOURCE_WORD0_0                               = 0x00038000,
+	DIM_mask                                          = 0x07 << 0,
+	DIM_shift                                         = 0,
+	    SQ_TEX_DIM_1D                                 = 0x00,
+	    SQ_TEX_DIM_2D                                 = 0x01,
+	    SQ_TEX_DIM_3D                                 = 0x02,
+	    SQ_TEX_DIM_CUBEMAP                            = 0x03,
+	    SQ_TEX_DIM_1D_ARRAY                           = 0x04,
+	    SQ_TEX_DIM_2D_ARRAY                           = 0x05,
+	    SQ_TEX_DIM_2D_MSAA                            = 0x06,
+	    SQ_TEX_DIM_2D_ARRAY_MSAA                      = 0x07,
+	SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask           = 0x0f << 3,
+	SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift          = 3,
+	TILE_TYPE_bit                                     = 1 << 7,
+	PITCH_mask                                        = 0x7ff << 8,
+	PITCH_shift                                       = 8,
+	TEX_WIDTH_mask                                    = 0x1fff << 19,
+	TEX_WIDTH_shift                                   = 19,
+    SQ_VTX_CONSTANT_WORD1_0                               = 0x00038004,
+    SQ_TEX_RESOURCE_WORD1_0                               = 0x00038004,
+	TEX_HEIGHT_mask                                   = 0x1fff << 0,
+	TEX_HEIGHT_shift                                  = 0,
+	TEX_DEPTH_mask                                    = 0x1fff << 13,
+	TEX_DEPTH_shift                                   = 13,
+	SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask         = 0x3f << 26,
+	SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift        = 26,
+    SQ_VTX_CONSTANT_WORD2_0                               = 0x00038008,
+	BASE_ADDRESS_HI_mask                              = 0xff << 0,
+	BASE_ADDRESS_HI_shift                             = 0,
+	SQ_VTX_CONSTANT_WORD2_0__STRIDE_mask              = 0x7ff << 8,
+	SQ_VTX_CONSTANT_WORD2_0__STRIDE_shift             = 8,
+	SQ_VTX_CONSTANT_WORD2_0__CLAMP_X_bit              = 1 << 19,
+	SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_mask         = 0x3f << 20,
+	SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_shift        = 20,
+	SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_mask      = 0x03 << 26,
+	SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_shift     = 26,
+/* 	    SQ_NUM_FORMAT_NORM                            = 0x00, */
+/* 	    SQ_NUM_FORMAT_INT                             = 0x01, */
+/* 	    SQ_NUM_FORMAT_SCALED                          = 0x02, */
+	SQ_VTX_CONSTANT_WORD2_0__FORMAT_COMP_ALL_bit      = 1 << 28,
+	SQ_VTX_CONSTANT_WORD2_0__SRF_MODE_ALL_bit         = 1 << 29,
+	SQ_VTX_CONSTANT_WORD2_0__ENDIAN_SWAP_mask         = 0x03 << 30,
+	SQ_VTX_CONSTANT_WORD2_0__ENDIAN_SWAP_shift        = 30,
+/* 	    SQ_ENDIAN_NONE                                = 0x00, */
+/* 	    SQ_ENDIAN_8IN16                               = 0x01, */
+/* 	    SQ_ENDIAN_8IN32                               = 0x02, */
+    SQ_TEX_RESOURCE_WORD2_0                               = 0x00038008,
+    SQ_VTX_CONSTANT_WORD3_0                               = 0x0003800c,
+	MEM_REQUEST_SIZE_mask                             = 0x03 << 0,
+	MEM_REQUEST_SIZE_shift                            = 0,
+    SQ_TEX_RESOURCE_WORD3_0                               = 0x0003800c,
+    SQ_TEX_RESOURCE_WORD4_0                               = 0x00038010,
+	FORMAT_COMP_X_mask                                = 0x03 << 0,
+	FORMAT_COMP_X_shift                               = 0,
+	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00,
+	    SQ_FORMAT_COMP_SIGNED                         = 0x01,
+	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02,
+	FORMAT_COMP_Y_mask                                = 0x03 << 2,
+	FORMAT_COMP_Y_shift                               = 2,
+/* 	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00, */
+/* 	    SQ_FORMAT_COMP_SIGNED                         = 0x01, */
+/* 	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02, */
+	FORMAT_COMP_Z_mask                                = 0x03 << 4,
+	FORMAT_COMP_Z_shift                               = 4,
+/* 	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00, */
+/* 	    SQ_FORMAT_COMP_SIGNED                         = 0x01, */
+/* 	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02, */
+	FORMAT_COMP_W_mask                                = 0x03 << 6,
+	FORMAT_COMP_W_shift                               = 6,
+/* 	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00, */
+/* 	    SQ_FORMAT_COMP_SIGNED                         = 0x01, */
+/* 	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02, */
+	SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_mask      = 0x03 << 8,
+	SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_shift     = 8,
+/* 	    SQ_NUM_FORMAT_NORM                            = 0x00, */
+/* 	    SQ_NUM_FORMAT_INT                             = 0x01, */
+/* 	    SQ_NUM_FORMAT_SCALED                          = 0x02, */
+	SQ_TEX_RESOURCE_WORD4_0__SRF_MODE_ALL_bit         = 1 << 10,
+	SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit        = 1 << 11,
+	SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_mask         = 0x03 << 12,
+	SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_shift        = 12,
+/* 	    SQ_ENDIAN_NONE                                = 0x00, */
+/* 	    SQ_ENDIAN_8IN16                               = 0x01, */
+/* 	    SQ_ENDIAN_8IN32                               = 0x02, */
+	REQUEST_SIZE_mask                                 = 0x03 << 14,
+	REQUEST_SIZE_shift                                = 14,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask           = 0x07 << 16,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift          = 16,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask           = 0x07 << 19,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift          = 19,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask           = 0x07 << 22,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift          = 22,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask           = 0x07 << 25,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift          = 25,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	BASE_LEVEL_mask                                   = 0x0f << 28,
+	BASE_LEVEL_shift                                  = 28,
+    SQ_TEX_RESOURCE_WORD5_0                               = 0x00038014,
+	LAST_LEVEL_mask                                   = 0x0f << 0,
+	LAST_LEVEL_shift                                  = 0,
+	BASE_ARRAY_mask                                   = 0x1fff << 4,
+	BASE_ARRAY_shift                                  = 4,
+	LAST_ARRAY_mask                                   = 0x1fff << 17,
+	LAST_ARRAY_shift                                  = 17,
+    SQ_TEX_RESOURCE_WORD6_0                               = 0x00038018,
+	MPEG_CLAMP_mask                                   = 0x03 << 0,
+	MPEG_CLAMP_shift                                  = 0,
+	    SQ_TEX_MPEG_CLAMP_OFF                         = 0x00,
+	    SQ_TEX_MPEG_9                                 = 0x01,
+	    SQ_TEX_MPEG_10                                = 0x02,
+	PERF_MODULATION_mask                              = 0x07 << 5,
+	PERF_MODULATION_shift                             = 5,
+	INTERLACED_bit                                    = 1 << 8,
+	SQ_TEX_RESOURCE_WORD6_0__TYPE_mask                = 0x03 << 30,
+	SQ_TEX_RESOURCE_WORD6_0__TYPE_shift               = 30,
+	    SQ_TEX_VTX_INVALID_TEXTURE                    = 0x00,
+	    SQ_TEX_VTX_INVALID_BUFFER                     = 0x01,
+	    SQ_TEX_VTX_VALID_TEXTURE                      = 0x02,
+	    SQ_TEX_VTX_VALID_BUFFER                       = 0x03,
+    SQ_VTX_CONSTANT_WORD6_0                               = 0x00038018,
+	SQ_VTX_CONSTANT_WORD6_0__TYPE_mask                = 0x03 << 30,
+	SQ_VTX_CONSTANT_WORD6_0__TYPE_shift               = 30,
+/* 	    SQ_TEX_VTX_INVALID_TEXTURE                    = 0x00, */
+/* 	    SQ_TEX_VTX_INVALID_BUFFER                     = 0x01, */
+/* 	    SQ_TEX_VTX_VALID_TEXTURE                      = 0x02, */
+/* 	    SQ_TEX_VTX_VALID_BUFFER                       = 0x03, */
+    SQ_TEX_SAMPLER_WORD0_0                                = 0x0003c000,
+	SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_mask              = 0x07 << 0,
+	SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_shift             = 0,
+	    SQ_TEX_WRAP                                   = 0x00,
+	    SQ_TEX_MIRROR                                 = 0x01,
+	    SQ_TEX_CLAMP_LAST_TEXEL                       = 0x02,
+	    SQ_TEX_MIRROR_ONCE_LAST_TEXEL                 = 0x03,
+	    SQ_TEX_CLAMP_HALF_BORDER                      = 0x04,
+	    SQ_TEX_MIRROR_ONCE_HALF_BORDER                = 0x05,
+	    SQ_TEX_CLAMP_BORDER                           = 0x06,
+	    SQ_TEX_MIRROR_ONCE_BORDER                     = 0x07,
+	CLAMP_Y_mask                                      = 0x07 << 3,
+	CLAMP_Y_shift                                     = 3,
+/* 	    SQ_TEX_WRAP                                   = 0x00, */
+/* 	    SQ_TEX_MIRROR                                 = 0x01, */
+/* 	    SQ_TEX_CLAMP_LAST_TEXEL                       = 0x02, */
+/* 	    SQ_TEX_MIRROR_ONCE_LAST_TEXEL                 = 0x03, */
+/* 	    SQ_TEX_CLAMP_HALF_BORDER                      = 0x04, */
+/* 	    SQ_TEX_MIRROR_ONCE_HALF_BORDER                = 0x05, */
+/* 	    SQ_TEX_CLAMP_BORDER                           = 0x06, */
+/* 	    SQ_TEX_MIRROR_ONCE_BORDER                     = 0x07, */
+	CLAMP_Z_mask                                      = 0x07 << 6,
+	CLAMP_Z_shift                                     = 6,
+/* 	    SQ_TEX_WRAP                                   = 0x00, */
+/* 	    SQ_TEX_MIRROR                                 = 0x01, */
+/* 	    SQ_TEX_CLAMP_LAST_TEXEL                       = 0x02, */
+/* 	    SQ_TEX_MIRROR_ONCE_LAST_TEXEL                 = 0x03, */
+/* 	    SQ_TEX_CLAMP_HALF_BORDER                      = 0x04, */
+/* 	    SQ_TEX_MIRROR_ONCE_HALF_BORDER                = 0x05, */
+/* 	    SQ_TEX_CLAMP_BORDER                           = 0x06, */
+/* 	    SQ_TEX_MIRROR_ONCE_BORDER                     = 0x07, */
+	XY_MAG_FILTER_mask                                = 0x07 << 9,
+	XY_MAG_FILTER_shift                               = 9,
+	    SQ_TEX_XY_FILTER_POINT                        = 0x00,
+	    SQ_TEX_XY_FILTER_BILINEAR                     = 0x01,
+	    SQ_TEX_XY_FILTER_BICUBIC                      = 0x02,
+	XY_MIN_FILTER_mask                                = 0x07 << 12,
+	XY_MIN_FILTER_shift                               = 12,
+/* 	    SQ_TEX_XY_FILTER_POINT                        = 0x00, */
+/* 	    SQ_TEX_XY_FILTER_BILINEAR                     = 0x01, */
+/* 	    SQ_TEX_XY_FILTER_BICUBIC                      = 0x02, */
+	Z_FILTER_mask                                     = 0x03 << 15,
+	Z_FILTER_shift                                    = 15,
+	    SQ_TEX_Z_FILTER_NONE                          = 0x00,
+	    SQ_TEX_Z_FILTER_POINT                         = 0x01,
+	    SQ_TEX_Z_FILTER_LINEAR                        = 0x02,
+	MIP_FILTER_mask                                   = 0x03 << 17,
+	MIP_FILTER_shift                                  = 17,
+/* 	    SQ_TEX_Z_FILTER_NONE                          = 0x00, */
+/* 	    SQ_TEX_Z_FILTER_POINT                         = 0x01, */
+/* 	    SQ_TEX_Z_FILTER_LINEAR                        = 0x02, */
+	BORDER_COLOR_TYPE_mask                            = 0x03 << 22,
+	BORDER_COLOR_TYPE_shift                           = 22,
+	    SQ_TEX_BORDER_COLOR_TRANS_BLACK               = 0x00,
+	    SQ_TEX_BORDER_COLOR_OPAQUE_BLACK              = 0x01,
+	    SQ_TEX_BORDER_COLOR_OPAQUE_WHITE              = 0x02,
+	    SQ_TEX_BORDER_COLOR_REGISTER                  = 0x03,
+	POINT_SAMPLING_CLAMP_bit                          = 1 << 24,
+	TEX_ARRAY_OVERRIDE_bit                            = 1 << 25,
+	DEPTH_COMPARE_FUNCTION_mask                       = 0x07 << 26,
+	DEPTH_COMPARE_FUNCTION_shift                      = 26,
+	    SQ_TEX_DEPTH_COMPARE_NEVER                    = 0x00,
+	    SQ_TEX_DEPTH_COMPARE_LESS                     = 0x01,
+	    SQ_TEX_DEPTH_COMPARE_EQUAL                    = 0x02,
+	    SQ_TEX_DEPTH_COMPARE_LESSEQUAL                = 0x03,
+	    SQ_TEX_DEPTH_COMPARE_GREATER                  = 0x04,
+	    SQ_TEX_DEPTH_COMPARE_NOTEQUAL                 = 0x05,
+	    SQ_TEX_DEPTH_COMPARE_GREATEREQUAL             = 0x06,
+	    SQ_TEX_DEPTH_COMPARE_ALWAYS                   = 0x07,
+	CHROMA_KEY_mask                                   = 0x03 << 29,
+	CHROMA_KEY_shift                                  = 29,
+	    SQ_TEX_CHROMA_KEY_DISABLED                    = 0x00,
+	    SQ_TEX_CHROMA_KEY_KILL                        = 0x01,
+	    SQ_TEX_CHROMA_KEY_BLEND                       = 0x02,
+	LOD_USES_MINOR_AXIS_bit                           = 1 << 31,
+    SQ_TEX_SAMPLER_WORD1_0                                = 0x0003c004,
+	MIN_LOD_mask                                      = 0x3ff << 0,
+	MIN_LOD_shift                                     = 0,
+	MAX_LOD_mask                                      = 0x3ff << 10,
+	MAX_LOD_shift                                     = 10,
+	SQ_TEX_SAMPLER_WORD1_0__LOD_BIAS_mask             = 0xfff << 20,
+	SQ_TEX_SAMPLER_WORD1_0__LOD_BIAS_shift            = 20,
+    SQ_TEX_SAMPLER_WORD2_0                                = 0x0003c008,
+	LOD_BIAS_SEC_mask                                 = 0xfff << 0,
+	LOD_BIAS_SEC_shift                                = 0,
+	MC_COORD_TRUNCATE_bit                             = 1 << 12,
+	SQ_TEX_SAMPLER_WORD2_0__FORCE_DEGAMMA_bit         = 1 << 13,
+	HIGH_PRECISION_FILTER_bit                         = 1 << 14,
+	PERF_MIP_mask                                     = 0x07 << 15,
+	PERF_MIP_shift                                    = 15,
+	PERF_Z_mask                                       = 0x03 << 18,
+	PERF_Z_shift                                      = 18,
+	FETCH_4_bit                                       = 1 << 26,
+	SAMPLE_IS_PCF_bit                                 = 1 << 27,
+	SQ_TEX_SAMPLER_WORD2_0__TYPE_bit                  = 1 << 31,
+    SQ_VTX_BASE_VTX_LOC                                   = 0x0003cff0,
+    SQ_VTX_START_INST_LOC                                 = 0x0003cff4,
+    SQ_LOOP_CONST_DX10_0                                  = 0x0003e200,
+    SQ_LOOP_CONST_0                                       = 0x0003e200,
+	SQ_LOOP_CONST_0__COUNT_mask                       = 0xfff << 0,
+	SQ_LOOP_CONST_0__COUNT_shift                      = 0,
+	INIT_mask                                         = 0xfff << 12,
+	INIT_shift                                        = 12,
+	INC_mask                                          = 0xff << 24,
+	INC_shift                                         = 24,
+    SQ_BOOL_CONST_0                                       = 0x0003e380,
+	SQ_BOOL_CONST_0_num                               = 3,
+
+} ;
+
+#endif /* _AUTOREGS */
+
diff --git a/src/r600_reg_r6xx.h b/src/r600_reg_r6xx.h
new file mode 100644
index 0000000..2e7dfa9
--- /dev/null
+++ b/src/r600_reg_r6xx.h
@@ -0,0 +1,494 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _R600_REG_R6xx_H_
+#define _R600_REG_R6xx_H_
+
+/*
+ * R6xx chip registers that are not documented yet
+ */
+
+enum {
+
+    MM_INDEX                                              = 0x0000,
+    MM_DATA                                               = 0x0004,
+
+    SRBM_STATUS                                           = 0x0e50,
+	RLC_RQ_PENDING_bit                                = 1 << 3,
+	RCU_RQ_PENDING_bit                                = 1 << 4,
+	GRBM_RQ_PENDING_bit                               = 1 << 5,
+	HI_RQ_PENDING_bit                                 = 1 << 6,
+	IO_EXTERN_SIGNAL_bit                              = 1 << 7,
+	VMC_BUSY_bit                                      = 1 << 8,
+	MCB_BUSY_bit                                      = 1 << 9,
+	MCDZ_BUSY_bit                                     = 1 << 10,
+	MCDY_BUSY_bit                                     = 1 << 11,
+	MCDX_BUSY_bit                                     = 1 << 12,
+	MCDW_BUSY_bit                                     = 1 << 13,
+	SEM_BUSY_bit                                      = 1 << 14,
+	SRBM_STATUS__RLC_BUSY_bit                         = 1 << 15,
+	PDMA_BUSY_bit                                     = 1 << 16,
+	IH_BUSY_bit                                       = 1 << 17,
+	CSC_BUSY_bit                                      = 1 << 20,
+	CMC7_BUSY_bit                                     = 1 << 21,
+	CMC6_BUSY_bit                                     = 1 << 22,
+	CMC5_BUSY_bit                                     = 1 << 23,
+	CMC4_BUSY_bit                                     = 1 << 24,
+	CMC3_BUSY_bit                                     = 1 << 25,
+	CMC2_BUSY_bit                                     = 1 << 26,
+	CMC1_BUSY_bit                                     = 1 << 27,
+	CMC0_BUSY_bit                                     = 1 << 28,
+	BIF_BUSY_bit                                      = 1 << 29,
+	IDCT_BUSY_bit                                     = 1 << 30,
+
+    SRBM_READ_ERROR                                       = 0x0e98,
+	READ_ADDRESS_mask                                 = 0xffff << 2,
+	READ_ADDRESS_shift                                = 2,
+	READ_REQUESTER_HI_bit                             = 1 << 24,
+	READ_REQUESTER_GRBM_bit                           = 1 << 25,
+	READ_REQUESTER_RCU_bit                            = 1 << 26,
+	READ_REQUESTER_RLC_bit                            = 1 << 27,
+	READ_ERROR_bit                                    = 1 << 31,
+
+    SRBM_INT_STATUS                                       = 0x0ea4,
+	RDERR_INT_STAT_bit                                = 1 << 0,
+	GFX_CNTX_SWITCH_INT_STAT_bit                      = 1 << 1,
+    SRBM_INT_ACK                                          = 0x0ea8,
+	RDERR_INT_ACK_bit                                 = 1 << 0,
+	GFX_CNTX_SWITCH_INT_ACK_bit                       = 1 << 1,
+
+    R6XX_MC_VM_FB_LOCATION                                = 0x2180,
+
+    VENDOR_DEVICE_ID                                      = 0x4000,
+
+    HDP_MEM_COHERENCY_FLUSH_CNTL                          = 0x5480,
+
+    D1GRPH_PRIMARY_SURFACE_ADDRESS                        = 0x6110,
+    D1GRPH_PITCH                                          = 0x6120,
+    D1GRPH_Y_END                                          = 0x6138,
+
+    GRBM_STATUS                                           = 0x8010,
+	CMDFIFO_AVAIL_mask                                = 0x1f << 0,
+	CMDFIFO_AVAIL_shift                               = 0,
+	SRBM_RQ_PENDING_bit                               = 1 << 5,
+	CP_RQ_PENDING_bit                                 = 1 << 6,
+	CF_RQ_PENDING_bit                                 = 1 << 7,
+	PF_RQ_PENDING_bit                                 = 1 << 8,
+	GRBM_EE_BUSY_bit                                  = 1 << 10,
+	GRBM_STATUS__VC_BUSY_bit                          = 1 << 11,
+	DB03_CLEAN_bit                                    = 1 << 12,
+	CB03_CLEAN_bit                                    = 1 << 13,
+	VGT_BUSY_NO_DMA_bit                               = 1 << 16,
+	GRBM_STATUS__VGT_BUSY_bit                         = 1 << 17,
+	TA03_BUSY_bit                                     = 1 << 18,
+	GRBM_STATUS__TC_BUSY_bit                          = 1 << 19,
+	SX_BUSY_bit                                       = 1 << 20,
+	SH_BUSY_bit                                       = 1 << 21,
+	SPI03_BUSY_bit                                    = 1 << 22,
+	SMX_BUSY_bit                                      = 1 << 23,
+	SC_BUSY_bit                                       = 1 << 24,
+	PA_BUSY_bit                                       = 1 << 25,
+	DB03_BUSY_bit                                     = 1 << 26,
+	CR_BUSY_bit                                       = 1 << 27,
+	CP_COHERENCY_BUSY_bit                             = 1 << 28,
+	GRBM_STATUS__CP_BUSY_bit                          = 1 << 29,
+	CB03_BUSY_bit                                     = 1 << 30,
+	GUI_ACTIVE_bit                                    = 1 << 31,
+    GRBM_STATUS2                                          = 0x8014,
+	CR_CLEAN_bit                                      = 1 << 0,
+	SMX_CLEAN_bit                                     = 1 << 1,
+	SPI0_BUSY_bit                                     = 1 << 8,
+	SPI1_BUSY_bit                                     = 1 << 9,
+	SPI2_BUSY_bit                                     = 1 << 10,
+	SPI3_BUSY_bit                                     = 1 << 11,
+	TA0_BUSY_bit                                      = 1 << 12,
+	TA1_BUSY_bit                                      = 1 << 13,
+	TA2_BUSY_bit                                      = 1 << 14,
+	TA3_BUSY_bit                                      = 1 << 15,
+	DB0_BUSY_bit                                      = 1 << 16,
+	DB1_BUSY_bit                                      = 1 << 17,
+	DB2_BUSY_bit                                      = 1 << 18,
+	DB3_BUSY_bit                                      = 1 << 19,
+	CB0_BUSY_bit                                      = 1 << 20,
+	CB1_BUSY_bit                                      = 1 << 21,
+	CB2_BUSY_bit                                      = 1 << 22,
+	CB3_BUSY_bit                                      = 1 << 23,
+    GRBM_SOFT_RESET                                       = 0x8020,
+	SOFT_RESET_CP_bit                                 = 1 << 0,
+	SOFT_RESET_CB_bit                                 = 1 << 1,
+	SOFT_RESET_CR_bit                                 = 1 << 2,
+	SOFT_RESET_DB_bit                                 = 1 << 3,
+	SOFT_RESET_PA_bit                                 = 1 << 5,
+	SOFT_RESET_SC_bit                                 = 1 << 6,
+	SOFT_RESET_SMX_bit                                = 1 << 7,
+	SOFT_RESET_SPI_bit                                = 1 << 8,
+	SOFT_RESET_SH_bit                                 = 1 << 9,
+	SOFT_RESET_SX_bit                                 = 1 << 10,
+	SOFT_RESET_TC_bit                                 = 1 << 11,
+	SOFT_RESET_TA_bit                                 = 1 << 12,
+	SOFT_RESET_VC_bit                                 = 1 << 13,
+	SOFT_RESET_VGT_bit                                = 1 << 14,
+	SOFT_RESET_GRBM_GCA_bit                           = 1 << 15,
+
+    WAIT_UNTIL                                            = 0x8040,
+	WAIT_CP_DMA_IDLE_bit                              = 1 << 8,
+	WAIT_CMDFIFO_bit                                  = 1 << 10,
+	WAIT_2D_IDLE_bit                                  = 1 << 14,
+	WAIT_3D_IDLE_bit                                  = 1 << 15,
+	WAIT_2D_IDLECLEAN_bit                             = 1 << 16,
+	WAIT_3D_IDLECLEAN_bit                             = 1 << 17,
+	WAIT_EXTERN_SIG_bit                               = 1 << 19,
+	CMDFIFO_ENTRIES_mask                              = 0x1f << 20,
+	CMDFIFO_ENTRIES_shift                             = 20,
+
+    GRBM_READ_ERROR                                       = 0x8058,
+/* 	READ_ADDRESS_mask                                 = 0xffff << 2, */
+/* 	READ_ADDRESS_shift                                = 2, */
+	READ_REQUESTER_SRBM_bit                           = 1 << 28,
+	READ_REQUESTER_CP_bit                             = 1 << 29,
+	READ_REQUESTER_WU_POLL_bit                        = 1 << 30,
+/* 	READ_ERROR_bit                                    = 1 << 31, */
+
+    SCRATCH_REG0		                          = 0x8500,
+    SCRATCH_REG1		                          = 0x8504,
+    SCRATCH_REG2		                          = 0x8508,
+    SCRATCH_REG3		                          = 0x850c,
+    SCRATCH_REG4		                          = 0x8510,
+    SCRATCH_REG5		                          = 0x8514,
+    SCRATCH_REG6		                          = 0x8518,
+    SCRATCH_REG7		                          = 0x851c,
+    SCRATCH_UMSK		                          = 0x8540,
+    SCRATCH_ADDR		                          = 0x8544,
+
+    CP_COHER_CNTL                                         = 0x85f0,
+	DEST_BASE_0_ENA_bit                               = 1 << 0,
+	DEST_BASE_1_ENA_bit                               = 1 << 1,
+	SO0_DEST_BASE_ENA_bit                             = 1 << 2,
+	SO1_DEST_BASE_ENA_bit                             = 1 << 3,
+	SO2_DEST_BASE_ENA_bit                             = 1 << 4,
+	SO3_DEST_BASE_ENA_bit                             = 1 << 5,
+	CB0_DEST_BASE_ENA_bit                             = 1 << 6,
+	CB1_DEST_BASE_ENA_bit                             = 1 << 7,
+	CB2_DEST_BASE_ENA_bit                             = 1 << 8,
+	CB3_DEST_BASE_ENA_bit                             = 1 << 9,
+	CB4_DEST_BASE_ENA_bit                             = 1 << 10,
+	CB5_DEST_BASE_ENA_bit                             = 1 << 11,
+	CB6_DEST_BASE_ENA_bit                             = 1 << 12,
+	CB7_DEST_BASE_ENA_bit                             = 1 << 13,
+	DB_DEST_BASE_ENA_bit                              = 1 << 14,
+	CR_DEST_BASE_ENA_bit                              = 1 << 15,
+	TC_ACTION_ENA_bit                                 = 1 << 23,
+	VC_ACTION_ENA_bit                                 = 1 << 24,
+	CB_ACTION_ENA_bit                                 = 1 << 25,
+	DB_ACTION_ENA_bit                                 = 1 << 26,
+	SH_ACTION_ENA_bit                                 = 1 << 27,
+	SMX_ACTION_ENA_bit                                = 1 << 28,
+	CR0_ACTION_ENA_bit                                = 1 << 29,
+	CR1_ACTION_ENA_bit                                = 1 << 30,
+	CR2_ACTION_ENA_bit                                = 1 << 31,
+    CP_COHER_SIZE                                         = 0x85f4,
+    CP_COHER_BASE                                         = 0x85f8,
+    CP_COHER_STATUS                                       = 0x85fc,
+	MATCHING_GFX_CNTX_mask                            = 0xff << 0,
+	MATCHING_GFX_CNTX_shift                           = 0,
+	MATCHING_CR_CNTX_mask                             = 0xffff << 8,
+	MATCHING_CR_CNTX_shift                            = 8,
+	STATUS_bit                                        = 1 << 31,
+
+    CP_STALLED_STAT1                                      = 0x8674,
+	RBIU_TO_DMA_NOT_RDY_TO_RCV_bit                    = 1 << 0,
+	RBIU_TO_IBS_NOT_RDY_TO_RCV_bit                    = 1 << 1,
+	RBIU_TO_SEM_NOT_RDY_TO_RCV_bit                    = 1 << 2,
+	RBIU_TO_2DREGS_NOT_RDY_TO_RCV_bit                 = 1 << 3,
+	RBIU_TO_MEMWR_NOT_RDY_TO_RCV_bit                  = 1 << 4,
+	RBIU_TO_MEMRD_NOT_RDY_TO_RCV_bit                  = 1 << 5,
+	RBIU_TO_EOPD_NOT_RDY_TO_RCV_bit                   = 1 << 6,
+	RBIU_TO_RECT_NOT_RDY_TO_RCV_bit                   = 1 << 7,
+	RBIU_TO_STRMO_NOT_RDY_TO_RCV_bit                  = 1 << 8,
+	RBIU_TO_PSTAT_NOT_RDY_TO_RCV_bit                  = 1 << 9,
+	MIU_WAITING_ON_RDREQ_FREE_bit                     = 1 << 16,
+	MIU_WAITING_ON_WRREQ_FREE_bit                     = 1 << 17,
+	MIU_NEEDS_AVAIL_WRREQ_PHASE_bit                   = 1 << 18,
+	RCIU_WAITING_ON_GRBM_FREE_bit                     = 1 << 24,
+	RCIU_WAITING_ON_VGT_FREE_bit                      = 1 << 25,
+	RCIU_STALLED_ON_ME_READ_bit                       = 1 << 26,
+	RCIU_STALLED_ON_DMA_READ_bit                      = 1 << 27,
+	RCIU_HALTED_BY_REG_VIOLATION_bit                  = 1 << 28,
+    CP_STALLED_STAT2                                      = 0x8678,
+	PFP_TO_CSF_NOT_RDY_TO_RCV_bit                     = 1 << 0,
+	PFP_TO_MEQ_NOT_RDY_TO_RCV_bit                     = 1 << 1,
+	PFP_TO_VGT_NOT_RDY_TO_RCV_bit                     = 1 << 2,
+	PFP_HALTED_BY_INSTR_VIOLATION_bit                 = 1 << 3,
+	MULTIPASS_IB_PENDING_IN_PFP_bit                   = 1 << 4,
+	ME_BRUSH_WC_NOT_RDY_TO_RCV_bit                    = 1 << 8,
+	ME_STALLED_ON_BRUSH_LOGIC_bit                     = 1 << 9,
+	CR_CNTX_NOT_AVAIL_TO_ME_bit                       = 1 << 10,
+	GFX_CNTX_NOT_AVAIL_TO_ME_bit                      = 1 << 11,
+	ME_RCIU_NOT_RDY_TO_RCV_bit                        = 1 << 12,
+	ME_TO_CONST_NOT_RDY_TO_RCV_bit                    = 1 << 13,
+	ME_WAITING_DATA_FROM_PFP_bit                      = 1 << 14,
+	ME_WAITING_ON_PARTIAL_FLUSH_bit                   = 1 << 15,
+	RECT_FIFO_NEEDS_CR_RECT_DONE_bit                  = 1 << 16,
+	RECT_FIFO_NEEDS_WR_CONFIRM_bit                    = 1 << 17,
+	EOPD_FIFO_NEEDS_SC_EOP_DONE_bit                   = 1 << 18,
+	EOPD_FIFO_NEEDS_SMX_EOP_DONE_bit                  = 1 << 19,
+	EOPD_FIFO_NEEDS_WR_CONFIRM_bit                    = 1 << 20,
+	EOPD_FIFO_NEEDS_SIGNAL_SEM_bit                    = 1 << 21,
+	SO_NUMPRIM_FIFO_NEEDS_SOADDR_bit                  = 1 << 22,
+	SO_NUMPRIM_FIFO_NEEDS_NUMPRIM_bit                 = 1 << 23,
+	PIPE_STATS_FIFO_NEEDS_SAMPLE_bit                  = 1 << 24,
+	SURF_SYNC_NEEDS_IDLE_CNTXS_bit                    = 1 << 30,
+	SURF_SYNC_NEEDS_ALL_CLEAN_bit                     = 1 << 31,
+    CP_BUSY_STAT                                          = 0x867c,
+	REG_BUS_FIFO_BUSY_bit                             = 1 << 0,
+	RING_FETCHING_DATA_bit                            = 1 << 1,
+	INDR1_FETCHING_DATA_bit                           = 1 << 2,
+	INDR2_FETCHING_DATA_bit                           = 1 << 3,
+	STATE_FETCHING_DATA_bit                           = 1 << 4,
+	PRED_FETCHING_DATA_bit                            = 1 << 5,
+	COHER_CNTR_NEQ_ZERO_bit                           = 1 << 6,
+	PFP_PARSING_PACKETS_bit                           = 1 << 7,
+	ME_PARSING_PACKETS_bit                            = 1 << 8,
+	RCIU_PFP_BUSY_bit                                 = 1 << 9,
+	RCIU_ME_BUSY_bit                                  = 1 << 10,
+	OUTSTANDING_READ_TAGS_bit                         = 1 << 11,
+	SEM_CMDFIFO_NOT_EMPTY_bit                         = 1 << 12,
+	SEM_FAILED_AND_HOLDING_bit                        = 1 << 13,
+	SEM_POLLING_FOR_PASS_bit                          = 1 << 14,
+	_3D_BUSY_bit                                      = 1 << 15,
+	_2D_BUSY_bit                                      = 1 << 16,
+    CP_STAT                                               = 0x8680,
+	CSF_RING_BUSY_bit                                 = 1 << 0,
+	CSF_WPTR_POLL_BUSY_bit                            = 1 << 1,
+	CSF_INDIRECT1_BUSY_bit                            = 1 << 2,
+	CSF_INDIRECT2_BUSY_bit                            = 1 << 3,
+	CSF_STATE_BUSY_bit                                = 1 << 4,
+	CSF_PREDICATE_BUSY_bit                            = 1 << 5,
+	CSF_BUSY_bit                                      = 1 << 6,
+	MIU_RDREQ_BUSY_bit                                = 1 << 7,
+	MIU_WRREQ_BUSY_bit                                = 1 << 8,
+	ROQ_RING_BUSY_bit                                 = 1 << 9,
+	ROQ_INDIRECT1_BUSY_bit                            = 1 << 10,
+	ROQ_INDIRECT2_BUSY_bit                            = 1 << 11,
+	ROQ_STATE_BUSY_bit                                = 1 << 12,
+	ROQ_PREDICATE_BUSY_bit                            = 1 << 13,
+	ROQ_ALIGN_BUSY_bit                                = 1 << 14,
+	PFP_BUSY_bit                                      = 1 << 15,
+	MEQ_BUSY_bit                                      = 1 << 16,
+	ME_BUSY_bit                                       = 1 << 17,
+	QUERY_BUSY_bit                                    = 1 << 18,
+	SEMAPHORE_BUSY_bit                                = 1 << 19,
+	INTERRUPT_BUSY_bit                                = 1 << 20,
+	SURFACE_SYNC_BUSY_bit                             = 1 << 21,
+	DMA_BUSY_bit                                      = 1 << 22,
+	RCIU_BUSY_bit                                     = 1 << 23,
+	CP_STAT__CP_BUSY_bit                              = 1 << 31,
+
+    CP_ME_CNTL                                            = 0x86d8,
+	ME_STATMUX_mask                                   = 0xff << 0,
+	ME_STATMUX_shift                                  = 0,
+	ME_HALT_bit                                       = 1 << 28,
+    CP_ME_STATUS                                          = 0x86dc,
+
+    CP_RB_RPTR                                            = 0x8700,
+	RB_RPTR_mask                                      = 0xfffff << 0,
+	RB_RPTR_shift                                     = 0,
+    CP_RB_WPTR_DELAY                                      = 0x8704,
+	PRE_WRITE_TIMER_mask                              = 0xfffffff << 0,
+	PRE_WRITE_TIMER_shift                             = 0,
+	PRE_WRITE_LIMIT_mask                              = 0x0f << 28,
+	PRE_WRITE_LIMIT_shift                             = 28,
+
+    CP_ROQ_RB_STAT                                        = 0x8780,
+	ROQ_RPTR_PRIMARY_mask                             = 0x3ff << 0,
+	ROQ_RPTR_PRIMARY_shift                            = 0,
+	ROQ_WPTR_PRIMARY_mask                             = 0x3ff << 16,
+	ROQ_WPTR_PRIMARY_shift                            = 16,
+    CP_ROQ_IB1_STAT                                       = 0x8784,
+	ROQ_RPTR_INDIRECT1_mask                           = 0x3ff << 0,
+	ROQ_RPTR_INDIRECT1_shift                          = 0,
+	ROQ_WPTR_INDIRECT1_mask                           = 0x3ff << 16,
+	ROQ_WPTR_INDIRECT1_shift                          = 16,
+    CP_ROQ_IB2_STAT                                       = 0x8788,
+	ROQ_RPTR_INDIRECT2_mask                           = 0x3ff << 0,
+	ROQ_RPTR_INDIRECT2_shift                          = 0,
+	ROQ_WPTR_INDIRECT2_mask                           = 0x3ff << 16,
+	ROQ_WPTR_INDIRECT2_shift                          = 16,
+
+    CP_MEQ_STAT                                           = 0x8794,
+	MEQ_RPTR_mask                                     = 0x3ff << 0,
+	MEQ_RPTR_shift                                    = 0,
+	MEQ_WPTR_mask                                     = 0x3ff << 16,
+	MEQ_WPTR_shift                                    = 16,
+
+    CC_GC_SHADER_PIPE_CONFIG                              = 0x8950,
+	INACTIVE_QD_PIPES_mask                            = 0xff << 8,
+	INACTIVE_QD_PIPES_shift                           = 8,
+	    R6XX_MAX_QD_PIPES                             = 8,
+	INACTIVE_SIMDS_mask                               = 0xff << 16,
+	INACTIVE_SIMDS_shift                              = 16,
+	    R6XX_MAX_SIMDS                                = 8,
+    GC_USER_SHADER_PIPE_CONFIG                            = 0x8954,
+
+    VC_ENHANCE                                            = 0x9714,
+    DB_DEBUG                                              = 0x9830,
+        PREZ_MUST_WAIT_FOR_POSTZ_DONE                     = 1 << 31,
+
+    DB_WATERMARKS                                         = 0x00009838,
+	DEPTH_FREE_mask                                   = 0x1f << 0,
+	DEPTH_FREE_shift                                  = 0,
+	DEPTH_FLUSH_mask                                  = 0x3f << 5,
+	DEPTH_FLUSH_shift                                 = 5,
+	FORCE_SUMMARIZE_mask                              = 0x0f << 11,
+	FORCE_SUMMARIZE_shift                             = 11,
+	DEPTH_PENDING_FREE_mask                           = 0x1f << 15,
+	DEPTH_PENDING_FREE_shift                          = 15,
+	DEPTH_CACHELINE_FREE_mask                         = 0x1f << 20,
+	DEPTH_CACHELINE_FREE_shift                        = 20,
+	EARLY_Z_PANIC_DISABLE_bit                         = 1 << 25,
+	LATE_Z_PANIC_DISABLE_bit                          = 1 << 26,
+	RE_Z_PANIC_DISABLE_bit                            = 1 << 27,
+	DB_EXTRA_DEBUG_mask                               = 0x0f << 28,
+	DB_EXTRA_DEBUG_shift                              = 28,
+
+    CP_RB_BASE                                            = 0xc100,
+    CP_RB_CNTL                                            = 0xc104,
+        RB_BUFSZ_mask                                     = 0x3f << 0,
+    CP_RB_WPTR                                            = 0xc114,
+	RB_WPTR_mask                                      = 0xfffff << 0,
+	RB_WPTR_shift                                     = 0,
+    CP_RB_RPTR_WR                                         = 0xc108,
+	RB_RPTR_WR_mask                                   = 0xfffff << 0,
+	RB_RPTR_WR_shift                                  = 0,
+
+    CP_INT_STATUS                                         = 0xc128,
+	DISABLE_CNTX_SWITCH_INT_STAT_bit                  = 1 << 0,
+	ENABLE_CNTX_SWITCH_INT_STAT_bit                   = 1 << 1,
+	SEM_SIGNAL_INT_STAT_bit                           = 1 << 18,
+	CNTX_BUSY_INT_STAT_bit                            = 1 << 19,
+	CNTX_EMPTY_INT_STAT_bit                           = 1 << 20,
+	WAITMEM_SEM_INT_STAT_bit                          = 1 << 21,
+	PRIV_INSTR_INT_STAT_bit                           = 1 << 22,
+	PRIV_REG_INT_STAT_bit                             = 1 << 23,
+	OPCODE_ERROR_INT_STAT_bit                         = 1 << 24,
+	SCRATCH_INT_STAT_bit                              = 1 << 25,
+	TIME_STAMP_INT_STAT_bit                           = 1 << 26,
+	RESERVED_BIT_ERROR_INT_STAT_bit                   = 1 << 27,
+	DMA_INT_STAT_bit                                  = 1 << 28,
+	IB2_INT_STAT_bit                                  = 1 << 29,
+	IB1_INT_STAT_bit                                  = 1 << 30,
+	RB_INT_STAT_bit                                   = 1 << 31,
+
+//  SX_ALPHA_TEST_CONTROL                                 = 0x00028410,
+	ALPHA_FUNC__REF_NEVER                             = 0,
+	ALPHA_FUNC__REF_ALWAYS                            = 7,
+//  DB_SHADER_CONTROL                                     = 0x0002880c,
+	Z_ORDER__EARLY_Z_THEN_LATE_Z                      = 2,
+//  PA_SU_SC_MODE_CNTL                                    = 0x00028814,
+//	POLY_MODE_mask                                    = 0x03 << 3,
+	POLY_MODE__TRIANGLES = 0, POLY_MODE__DUAL_MODE,
+//	POLYMODE_FRONT_PTYPE_mask                         = 0x07 << 5,
+	POLYMODE_PTYPE__POINTS = 0, POLYMODE_PTYPE__LINES, POLYMODE_PTYPE__TRIANGLES,
+    PA_SC_AA_SAMPLE_LOCS_8S_WD1_M                         = 0x00028c20,
+    DB_SRESULTS_COMPARE_STATE0                            = 0x00028d28,	/* See autoregs: DB_SRESULTS_COMPARE_STATE1 */
+//  DB_SRESULTS_COMPARE_STATE1                            = 0x00028d2c,
+    DB_ALPHA_TO_MASK                                      = 0x00028d44,
+	ALPHA_TO_MASK_ENABLE                              = 1 << 0,
+	ALPHA_TO_MASK_OFFSET0_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET0_shift                       = 8,
+	ALPHA_TO_MASK_OFFSET1_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET1_shift                       = 10,
+	ALPHA_TO_MASK_OFFSET2_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET2_shift                       = 12,
+	ALPHA_TO_MASK_OFFSET3_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET3_shift                       = 14,
+
+//  SQ_VTX_CONSTANT_WORD2_0                               = 0x00038008,
+//    	SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_mask         = 0x3f << 20,
+	FMT_INVALID=0,      FMT_8,          FMT_4_4,            FMT_3_3_2,
+	                    FMT_16=5,       FMT_16_FLOAT,       FMT_8_8,
+	FMT_5_6_5,          FMT_6_5_5,      FMT_1_5_5_5,        FMT_4_4_4_4,
+	FMT_5_5_5_1,        FMT_32,         FMT_32_FLOAT,       FMT_16_16,
+	FMT_16_16_FLOAT=16, FMT_8_24,       FMT_8_24_FLOAT,     FMT_24_8,
+	FMT_24_8_FLOAT,     FMT_10_11_11,   FMT_10_11_11_FLOAT, FMT_11_11_10,
+	FMT_11_11_10_FLOAT, FMT_2_10_10_10, FMT_8_8_8_8,        FMT_10_10_10_2,
+	FMT_X24_8_32_FLOAT, FMT_32_32,      FMT_32_32_FLOAT,    FMT_16_16_16_16,
+	FMT_16_16_16_16_FLOAT=32,           FMT_32_32_32_32=34, FMT_32_32_32_32_FLOAT,
+	                    FMT_1 = 37,                         FMT_GB_GR=39,
+	FMT_BG_RG,          FMT_32_AS_8,    FMT_32_AS_8_8,      FMT_5_9_9_9_SHAREDEXP,
+	FMT_8_8_8,          FMT_16_16_16,   FMT_16_16_16_FLOAT, FMT_32_32_32,
+	FMT_32_32_32_FLOAT=48,
+
+//  High level register file lengths
+    SQ_ALU_CONSTANT                                       = SQ_ALU_CONSTANT0_0,	/* 256 PS, 256 VS */
+    SQ_ALU_CONSTANT_ps_num                                = 256,
+    SQ_ALU_CONSTANT_vs_num                                = 256,
+    SQ_ALU_CONSTANT_all_num                               = 512,
+    SQ_ALU_CONSTANT_offset                                = 16,
+    SQ_ALU_CONSTANT_ps                                    = 0,
+    SQ_ALU_CONSTANT_vs                                    = SQ_ALU_CONSTANT_ps + SQ_ALU_CONSTANT_ps_num,
+    SQ_TEX_RESOURCE                                       = SQ_TEX_RESOURCE_WORD0_0,	/* 160 PS, 160 VS, 16 FS, 160 GS */
+    SQ_TEX_RESOURCE_ps_num                                = 160,
+    SQ_TEX_RESOURCE_vs_num                                = 160,
+    SQ_TEX_RESOURCE_fs_num                                = 16,
+    SQ_TEX_RESOURCE_gs_num                                = 160,
+    SQ_TEX_RESOURCE_all_num                               = 496,
+    SQ_TEX_RESOURCE_offset                                = 28,
+    SQ_TEX_RESOURCE_ps                                    = 0,
+    SQ_TEX_RESOURCE_vs                                    = SQ_TEX_RESOURCE_ps + SQ_TEX_RESOURCE_ps_num,
+    SQ_TEX_RESOURCE_fs                                    = SQ_TEX_RESOURCE_vs + SQ_TEX_RESOURCE_vs_num,
+    SQ_TEX_RESOURCE_gs                                    = SQ_TEX_RESOURCE_fs + SQ_TEX_RESOURCE_fs_num,
+    SQ_VTX_RESOURCE                                       = SQ_VTX_CONSTANT_WORD0_0,	/* 160 PS, 160 VS, 16 FS, 160 GS */
+    SQ_VTX_RESOURCE_ps_num                                = 160,
+    SQ_VTX_RESOURCE_vs_num                                = 160,
+    SQ_VTX_RESOURCE_fs_num                                = 16,
+    SQ_VTX_RESOURCE_gs_num                                = 160,
+    SQ_VTX_RESOURCE_all_num                               = 496,
+    SQ_VTX_RESOURCE_offset                                = 28,
+    SQ_VTX_RESOURCE_ps                                    = 0,
+    SQ_VTX_RESOURCE_vs                                    = SQ_VTX_RESOURCE_ps + SQ_VTX_RESOURCE_ps_num,
+    SQ_VTX_RESOURCE_fs                                    = SQ_VTX_RESOURCE_vs + SQ_VTX_RESOURCE_vs_num,
+    SQ_VTX_RESOURCE_gs                                    = SQ_VTX_RESOURCE_fs + SQ_VTX_RESOURCE_fs_num,
+    SQ_TEX_SAMPLER_WORD                                   = SQ_TEX_SAMPLER_WORD0_0,	/* 18 per PS, VS, GS */
+    SQ_TEX_SAMPLER_WORD_ps_num                            = 18,
+    SQ_TEX_SAMPLER_WORD_vs_num                            = 18,
+    SQ_TEX_SAMPLER_WORD_gs_num                            = 18,
+    SQ_TEX_SAMPLER_WORD_all_num                           = 54,
+    SQ_TEX_SAMPLER_WORD_offset                            = 12,
+    SQ_TEX_SAMPLER_WORD_ps                                = 0,
+    SQ_TEX_SAMPLER_WORD_vs                                = SQ_TEX_SAMPLER_WORD_ps + SQ_TEX_SAMPLER_WORD_ps_num,
+    SQ_TEX_SAMPLER_WORD_gs                                = SQ_TEX_SAMPLER_WORD_vs + SQ_TEX_SAMPLER_WORD_vs_num,
+    SQ_LOOP_CONST                                         = SQ_LOOP_CONST_0,		/* 32 per PS, VS, GS */
+    SQ_LOOP_CONST_ps_num                                  = 32,
+    SQ_LOOP_CONST_vs_num                                  = 32,
+    SQ_LOOP_CONST_gs_num                                  = 32,
+    SQ_LOOP_CONST_all_num                                 = 96,
+    SQ_LOOP_CONST_offset                                  = 4,
+    SQ_LOOP_CONST_ps                                      = 0,
+    SQ_LOOP_CONST_vs                                      = SQ_LOOP_CONST_ps + SQ_LOOP_CONST_ps_num,
+    SQ_LOOP_CONST_gs                                      = SQ_LOOP_CONST_vs + SQ_LOOP_CONST_vs_num,
+} ;
+
+
+#endif
diff --git a/src/r600_reg_r7xx.h b/src/r600_reg_r7xx.h
new file mode 100644
index 0000000..e5c01c8
--- /dev/null
+++ b/src/r600_reg_r7xx.h
@@ -0,0 +1,149 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _R600_REG_R7xx_H_
+#define _R600_REG_R7xx_H_
+
+/*
+ * Register update for R7xx chips
+ */
+
+enum {
+    /* Layout: names at 4-space indent are register offsets; the deeper-indented names below each are its _mask/_shift/_bit fields and field values. */
+    R7XX_MC_VM_FB_LOCATION                                = 0x00002024,
+    /* Commented-out names only label the register/field that the values after them belong to. NOTE(review): presumably declared in the R6xx register headers - confirm. */
+//  GRBM_STATUS                                           = 0x00008010,
+	R7XX_TA_BUSY_bit                                  = 1 << 14,
+
+    R7xx_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ                     = 0x00008d8c,
+	RING0_OFFSET_mask                                 = 0xff << 0,
+	RING0_OFFSET_shift                                = 0,
+	ISOLATE_ES_ENABLE_bit                             = 1 << 12,
+	ISOLATE_GS_ENABLE_bit                             = 1 << 13,
+	VS_PC_LIMIT_ENABLE_bit                            = 1 << 14,
+
+//  SQ_ALU_WORD0                                          = 0x00008dfc,
+//	SRC0_SEL_mask                                     = 0x1ff << 0,
+// 	SRC1_SEL_mask                                     = 0x1ff << 13,
+	    R7xx_SQ_ALU_SRC_1_DBL_L                       = 0xf4,
+	    R7xx_SQ_ALU_SRC_1_DBL_M                       = 0xf5,
+	    R7xx_SQ_ALU_SRC_0_5_DBL_L                     = 0xf6,
+	    R7xx_SQ_ALU_SRC_0_5_DBL_M                     = 0xf7,
+// 	INDEX_MODE_mask                                   = 0x07 << 26,
+	    R7xx_SQ_INDEX_GLOBAL                          = 0x05,
+	    R7xx_SQ_INDEX_GLOBAL_AR_X                     = 0x06,
+    R6xx_SQ_ALU_WORD1_OP2                                 = 0x00008dfc,	/* fields prefixed R6xx_ below */
+    R7xx_SQ_ALU_WORD1_OP2_V2                              = 0x00008dfc,	/* same value; fields prefixed R7xx_ below */
+	R6xx_FOG_MERGE_bit                                = 1 << 5,
+	R6xx_OMOD_mask                                    = 0x03 << 6,
+	R7xx_OMOD_mask                                    = 0x03 << 5,	/* starts at the bit R6xx uses for FOG_MERGE */
+	R6xx_OMOD_shift                                   = 6,
+	R7xx_OMOD_shift                                   = 5,
+	R6xx_SQ_ALU_WORD1_OP2__ALU_INST_mask              = 0x3ff << 8,
+	R7xx_SQ_ALU_WORD1_OP2_V2__ALU_INST_mask           = 0x7ff << 7,	/* one bit wider and one bit lower than the R6xx field */
+	R6xx_SQ_ALU_WORD1_OP2__ALU_INST_shift             = 8,
+	R7xx_SQ_ALU_WORD1_OP2_V2__ALU_INST_shift          = 7,
+	    R7xx_SQ_OP2_INST_FREXP_64                     = 0x07,
+	    R7xx_SQ_OP2_INST_ADD_64                       = 0x17,
+	    R7xx_SQ_OP2_INST_MUL_64                       = 0x1b,
+	    R7xx_SQ_OP2_INST_FLT64_TO_FLT32               = 0x1c,
+	    R7xx_SQ_OP2_INST_FLT32_TO_FLT64               = 0x1d,
+	    R7xx_SQ_OP2_INST_LDEXP_64                     = 0x7a,
+	    R7xx_SQ_OP2_INST_FRACT_64                     = 0x7b,
+	    R7xx_SQ_OP2_INST_PRED_SETGT_64                = 0x7c,
+	    R7xx_SQ_OP2_INST_PRED_SETE_64                 = 0x7d,
+	    R7xx_SQ_OP2_INST_PRED_SETGE_64                = 0x7e,
+//  SQ_ALU_WORD1_OP3                                      = 0x00008dfc,
+//	SRC2_SEL_mask                                     = 0x1ff << 0,
+//	    R7xx_SQ_ALU_SRC_1_DBL_L                       = 0xf4,
+//	    R7xx_SQ_ALU_SRC_1_DBL_M                       = 0xf5,
+//	    R7xx_SQ_ALU_SRC_0_5_DBL_L                     = 0xf6,
+//	    R7xx_SQ_ALU_SRC_0_5_DBL_M                     = 0xf7,
+// 	SQ_ALU_WORD1_OP3__ALU_INST_mask                   = 0x1f << 13,
+	    R7xx_SQ_OP3_INST_MULADD_64                    = 0x08,
+	    R7xx_SQ_OP3_INST_MULADD_64_M2                 = 0x09,
+	    R7xx_SQ_OP3_INST_MULADD_64_M4                 = 0x0a,
+	    R7xx_SQ_OP3_INST_MULADD_64_D2                 = 0x0b,
+//  SQ_CF_ALU_WORD1                                       = 0x00008dfc,
+	R6xx_USES_WATERFALL_bit                           = 1 << 25,
+	R7xx_SQ_CF_ALU_WORD1__ALT_CONST_bit               = 1 << 25,	/* same bit position as R6xx_USES_WATERFALL_bit */
+//  SQ_CF_ALLOC_EXPORT_WORD0                              = 0x00008dfc,
+//	ARRAY_BASE_mask                                   = 0x1fff << 0,
+//	TYPE_mask                                         = 0x03 << 13,
+//	    SQ_EXPORT_PARAM                               = 0x02,
+//	    X_UNUSED_FOR_SX_EXPORTS                       = 0x03,
+//	ELEM_SIZE_mask                                    = 0x03 << 30,
+//  SQ_CF_ALLOC_EXPORT_WORD1                              = 0x00008dfc,
+//	SQ_CF_ALLOC_EXPORT_WORD1__CF_INST_mask            = 0x7f << 23,
+	    R7xx_SQ_CF_INST_MEM_EXPORT                    = 0x3a,
+//  SQ_CF_WORD1                                           = 0x00008dfc,
+//	SQ_CF_WORD1__COUNT_mask                           = 0x07 << 10,
+	R7xx_COUNT_3_bit                                  = 1 << 19,
+//	SQ_CF_WORD1__CF_INST_mask                         = 0x7f << 23,
+	    R7xx_SQ_CF_INST_END_PROGRAM                   = 0x19,
+	    R7xx_SQ_CF_INST_WAIT_ACK                      = 0x1a,
+	    R7xx_SQ_CF_INST_TEX_ACK                       = 0x1b,
+	    R7xx_SQ_CF_INST_VTX_ACK                       = 0x1c,
+	    R7xx_SQ_CF_INST_VTX_TC_ACK                    = 0x1d,
+//  SQ_VTX_WORD0                                          = 0x00008dfc,
+//	VTX_INST_mask                                     = 0x1f << 0,
+	    R7xx_SQ_VTX_INST_MEM                          = 0x02,
+//  SQ_VTX_WORD2                                          = 0x00008dfc,
+	R7xx_SQ_VTX_WORD2__ALT_CONST_bit                  = 1 << 20,
+
+//  SQ_TEX_WORD0                                          = 0x00008dfc,
+//	TEX_INST_mask                                     = 0x1f << 0,
+	    R7xx_X_MEMORY_READ                            = 0x02,
+	    R7xx_SQ_TEX_INST_KEEP_GRADIENTS               = 0x0a,
+	    R7xx_X_FETCH4_LOAD4_INSTRUCTION_FOR_DX10_1    = 0x0f,
+	R7xx_SQ_TEX_WORD0__ALT_CONST_bit                  = 1 << 24,
+
+    R7xx_PA_SC_EDGERULE                                   = 0x00028230,
+    R7xx_SPI_THREAD_GROUPING                              = 0x000286c8,
+	PS_GROUPING_mask                                  = 0x1f << 0,
+	PS_GROUPING_shift                                 = 0,
+	VS_GROUPING_mask                                  = 0x1f << 8,
+	VS_GROUPING_shift                                 = 8,
+	GS_GROUPING_mask                                  = 0x1f << 16,
+	GS_GROUPING_shift                                 = 16,
+	ES_GROUPING_mask                                  = 0x1f << 24,
+	ES_GROUPING_shift                                 = 24,
+    R7xx_CB_SHADER_CONTROL                                = 0x000287a0,
+	RT0_ENABLE_bit                                    = 1 << 0,
+	RT1_ENABLE_bit                                    = 1 << 1,
+	RT2_ENABLE_bit                                    = 1 << 2,
+	RT3_ENABLE_bit                                    = 1 << 3,
+	RT4_ENABLE_bit                                    = 1 << 4,
+	RT5_ENABLE_bit                                    = 1 << 5,
+	RT6_ENABLE_bit                                    = 1 << 6,
+	RT7_ENABLE_bit                                    = 1 << 7,
+//  DB_ALPHA_TO_MASK                                      = 0x00028d44,
+	R7xx_OFFSET_ROUND_bit                             = 1 << 16,
+//  SQ_TEX_SAMPLER_MISC_0                                 = 0x0003d03c,
+	R7xx_TRUNCATE_COORD_bit                           = 1 << 9,
+	R7xx_DISABLE_CUBE_WRAP_bit                        = 1 << 10,
+
+} ;
+
+#endif /* _R600_REG_R7xx_H_ */
diff --git a/src/r600_shader.h b/src/r600_shader.h
new file mode 100644
index 0000000..58f5a52
--- /dev/null
+++ b/src/r600_shader.h
@@ -0,0 +1,346 @@
+/*
+ * RadeonHD R6xx, R7xx DRI driver
+ *
+ * Copyright (C) 2008-2009  Alexander Deucher
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Shader macros
+ */
+
+#ifndef __SHADER_H__
+#define __SHADER_H__
+
+
+/* Restrictions of ALU instructions
+ * order of scalar ops is always x,y,z,w,t(rans), last to be indicated by last==1.
+ * max of 3 different src GPRs per instr.
+ * max of 4 different cfile constant components per instr.
+ * max of 2 (different) constants (any type) for t.
+ * bank swizzle (see below).
+ * GPR write stalls read of same register. Auto-replaced by PV/PS, NOP needed if registers are relative to
+ * different indices (gpr,loop,nothing).
+ * may use constant registers or constant cache, but not both.
+ */
+
+/* Bank_swizzle: (pp. 297ff)
+ * Only one of each x,y,z,w GPR component can be loaded per cycle (3 cycles per instr, called 0-2).
+ * per scalar instruction bank_swizzle can select which cycle each operand comes from. e.g.:
+ *   SRC0 SRC1 SRC2  SWIZZLE  cycle0 cycle1 cycle2
+ *   1.x  2.x          012     1.x    2.x     -
+ *   3.x  1.y          201     1.y     -     3.x
+ *   2.x  1.y          102    (1.y)  (2.x)    -
+ * If data is read in a cycle, multiple scalar instructions can reference it.
+ * Special case: square() - i.e. same component in src0+src1 doesn't need read port -> ignores swizzle for src1.
+ * No restrictions for constants or PV/PS.
+ * t can load multiple components in a single cycle slot, but has to share cycles with xyzw.
+ * t with single constant may not load GPRs or PV/PS in cycle 0 (careful with ALU_TRANS_210).
+ * t with two constants may only load GPRs or PV/PS in cycle 2.
+ */
+
+
+/* Order of instructions: All CF, All ALU, All Tex/Vtx fetches */
+
+
+// CF instruction fields; the short tags match argument names of CF_DWORD0/CF_DWORD1
+// addr (passed through unchanged)
+#define ADDR(v)             (v)
+// pc (passed through unchanged)
+#define POP_COUNT(v)        (v)
+// cf_const
+#define CF_CONST(v)         (v)
+// cond: takes an SQ_COND_* value
+#define COND(v)             (v)
+// count: encoded as n - 1, except that 0 stays 0
+#define I_COUNT(n)          (((n) == 0) ? 0 : ((n) - 1))
+// r7xx
+#define COUNT_3(v)          (v)
+// call_count
+#define CALL_COUNT(v)       (v)
+// eop
+#define END_OF_PROGRAM(v)   (v)
+// vpm
+#define VALID_PIXEL_MODE(v) (v)
+// cf_inst: takes an SQ_CF_INST_* value
+#define CF_INST(v)          (v)
+
+// wqm (argument name used by the *_DWORD1 macros)
+#define WHOLE_QUAD_MODE(v)       (v)
+// b: barrier
+#define BARRIER(v)               (v)
+// kb0: kcache bank 0
+#define KCACHE_BANK0(v)          (v)
+// kb1: kcache bank 1
+#define KCACHE_BANK1(v)          (v)
+// km0 / km1: take an SQ_CF_KCACHE_* value
+#define KCACHE_MODE0(v)          (v)
+#define KCACHE_MODE1(v)          (v)
+// kcache_addr0 / kcache_addr1
+#define KCACHE_ADDR0(v)          (v)
+#define KCACHE_ADDR1(v)          (v)
+// uw
+#define USES_WATERFALL(v)        (v)
+
+#define ARRAY_BASE(v)            (v)
+// export pixel: target numbers MRT0..MRT7 = 0..7
+#define CF_PIXEL_MRT0         0
+#define CF_PIXEL_MRT1         1
+#define CF_PIXEL_MRT2         2
+#define CF_PIXEL_MRT3         3
+#define CF_PIXEL_MRT4         4
+#define CF_PIXEL_MRT5         5
+#define CF_PIXEL_MRT6         6
+#define CF_PIXEL_MRT7         7
+// *_FOG: r6xx only (each is the plain MRTn number + 16)
+#define CF_PIXEL_MRT0_FOG     16
+#define CF_PIXEL_MRT1_FOG     17
+#define CF_PIXEL_MRT2_FOG     18
+#define CF_PIXEL_MRT3_FOG     19
+#define CF_PIXEL_MRT4_FOG     20
+#define CF_PIXEL_MRT5_FOG     21
+#define CF_PIXEL_MRT6_FOG     22
+#define CF_PIXEL_MRT7_FOG     23
+#define CF_PIXEL_Z            61	// note: same number as CF_POS1 below
+// export pos: target numbers 60..63
+#define CF_POS0               60
+#define CF_POS1               61
+#define CF_POS2               62
+#define CF_POS3               63
+// export param: no named constants are defined here,
+// the numbers 0...31 are used directly
+#define TYPE(x)              (x)	// SQ_EXPORT_*
+#if 0	// compiled out: SQ_EXPORT_* value table kept only as a reference for TYPE()
+// type export
+#define SQ_EXPORT_PIXEL              0
+#define SQ_EXPORT_POS                1
+#define SQ_EXPORT_PARAM              2
+// reserved 3
+// type mem (same numbers 0..3, different meaning)
+#define SQ_EXPORT_WRITE              0
+#define SQ_EXPORT_WRITE_IND          1
+#define SQ_EXPORT_WRITE_ACK          2
+#define SQ_EXPORT_WRITE_IND_ACK      3
+#endif
+
+#define RW_GPR(x)            (x)
+#define RW_REL(x)            (x)
+#define ABSOLUTE                  0
+#define RELATIVE                  1
+#define INDEX_GPR(x)            (x)
+#define ELEM_SIZE(x)            ((x) ? ((x) - 1) : 0)	// x - 1, 0 stays 0; x is parenthesized (as in I_COUNT) so that e.g. ELEM_SIZE(a | b) subtracts from the whole argument
+#define COMP_MASK(x)            (x)
+#define R6xx_ELEM_LOOP(x)            (x)
+#define BURST_COUNT(x)          ((x) ? ((x) - 1) : 0)	// x - 1, 0 stays 0; argument parenthesized for the same reason
+
+// swiz
+#define SRC_SEL_X(x)    (x)		// SQ_SEL_* each
+#define SRC_SEL_Y(x)    (x)
+#define SRC_SEL_Z(x)    (x)
+#define SRC_SEL_W(x)    (x)
+
+#define CF_DWORD0(addr) (addr)
+// R7xx has another entry (COUNT3), but that is only used for adding a bit to count.
+// We allow one more bit for count in the argument of the macro on R7xx instead: the low 3 bits go to bit 10, anything above to bit 19.
+// R6xx: [0,7]  R7xx: [1,16].  b is shifted as unsigned: a signed 1 << 31 does not fit in int, which is undefined behaviour.
+#define CF_DWORD1(pc, cf_const, cond, count, call_count, eop, vpm, cf_inst, wqm, b) \
+        (((pc) << 0) | ((cf_const) << 3) | ((cond) << 8) | (((count) & 7) << 10) | (((count) >> 3) << 19) | \
+         ((call_count) << 13) | ((eop) << 21) | ((vpm) << 22) | ((cf_inst) << 23) | ((wqm) << 30) | ((unsigned int)(b) << 31))
+
+#define CF_ALU_DWORD0(addr, kb0, kb1, km0) (((addr) << 0) | ((kb0) << 22) | ((kb1) << 26) | ((unsigned int)(km0) << 30))	// km0 >= 2 sets bit 31, so it is shifted as unsigned (a signed shift into the sign bit is undefined)
+#define CF_ALU_DWORD1(km1, kcache_addr0, kcache_addr1, count, uw, cf_inst, wqm, b) \
+        (((km1) << 0) | ((kcache_addr0) << 2) | ((kcache_addr1) << 10) | \
+	 ((count) << 18) | ((uw) << 25) | ((cf_inst) << 26) | ((wqm) << 30) | ((unsigned int)(b) << 31))	// b is bit 31: shifted as unsigned for the same reason
+
+#define CF_ALLOC_IMP_EXP_DWORD0(array_base, type, rw_gpr, rr, index_gpr, es) \
+	 (((array_base) << 0) | ((type) << 13) | ((rw_gpr) << 15) | ((rr) << 22) | ((index_gpr) << 23) | \
+          ((unsigned int)(es) << 30))	// es >= 2 sets bit 31, so it is shifted as unsigned (a signed shift into the sign bit is undefined)
+// R7xx apparently doesn't have the ELEM_LOOP entry any more
+// We still expose it (argument el, bit 16), but ELEM_LOOP is explicitly R6xx now.
+// TODO: is this just forgotten in the docs, or really not available any more?
+#define CF_ALLOC_IMP_EXP_DWORD1_BUF(array_size, comp_mask, el, bc, eop, vpm, cf_inst, wqm, b) \
+        (((array_size) << 0) | ((comp_mask) << 12) | ((el) << 16) | ((bc) << 17) | \
+	 ((eop) << 21) | ((vpm) << 22) | ((cf_inst) << 23) | ((wqm) << 30) | ((unsigned int)(b) << 31))	// b is bit 31: shifted as unsigned
+#define CF_ALLOC_IMP_EXP_DWORD1_SWIZ(sel_x, sel_y, sel_z, sel_w, el, bc, eop, vpm, cf_inst, wqm, b) \
+        (((sel_x) << 0) | ((sel_y) << 3) | ((sel_z) << 6) | ((sel_w) << 9) | ((el) << 16) | \
+	 ((bc) << 17) | ((eop) << 21) | ((vpm) << 22) | ((cf_inst) << 23) | \
+	 ((wqm) << 30) | ((unsigned int)(b) << 31))	// b is bit 31: shifted as unsigned
+
+// ALU clause insts
+// Every wrapper below expands to its argument unchanged; the wrappers exist
+// only to name the field at the place where an ALU_DWORD* macro is invoked.
+#define SRC0_SEL(x)        (x)
+#define SRC1_SEL(x)        (x)
+#define SRC2_SEL(x)        (x)
+// src[0-2]_sel
+//   0-127 GPR
+// 128-159 kcache constants bank 0
+// 160-191 kcache constants bank 1
+// 248-255 special SQ_ALU_SRC_* (0, 1, etc.)
+
+// rel (the s0r/s1r/s2r arguments of the ALU_DWORD* macros)
+#define SRC0_REL(x)        (x)
+#define SRC1_REL(x)        (x)
+#define SRC2_REL(x)        (x)
+// elem (the s0e/s1e/s2e arguments); use one of the ELEM_* values
+#define SRC0_ELEM(x)        (x)
+#define SRC1_ELEM(x)        (x)
+#define SRC2_ELEM(x)        (x)
+#define ELEM_X        0
+#define ELEM_Y        1
+#define ELEM_Z        2
+#define ELEM_W        3
+// neg (the s0n/s1n/s2n arguments)
+#define SRC0_NEG(x)        (x)
+#define SRC1_NEG(x)        (x)
+#define SRC2_NEG(x)        (x)
+// im
+#define INDEX_MODE(x)    (x)		// SQ_INDEX_*
+// ps
+#define PRED_SEL(x)      (x)		// SQ_PRED_SEL_*
+// last
+#define LAST(x)          (x)
+// abs (s0a/s1a; only two wrappers are defined, there is no SRC2_ABS)
+#define SRC0_ABS(x)       (x)
+#define SRC1_ABS(x)       (x)
+// uem
+#define UPDATE_EXECUTE_MASK(x) (x)
+// up
+#define UPDATE_PRED(x)      (x)
+// wm
+#define WRITE_MASK(x)   (x)
+// fm (consumed by the R6xx OP2 encoding only)
+#define FOG_MERGE(x)    (x)
+// omod
+#define OMOD(x)        (x)		// SQ_ALU_OMOD_*
+// alu inst
+#define ALU_INST(x)        (x)		// SQ_ALU_INST_*
+//bs
+#define BANK_SWIZZLE(x)        (x)	// SQ_ALU_VEC_*
+// destination fields (dst_gpr / dr / de / clamp arguments)
+#define DST_GPR(x)        (x)
+#define DST_REL(x)        (x)
+#define DST_ELEM(x)       (x)
+#define CLAMP(x)          (x)
+
+// ALU instruction, first dword: both source operands plus index mode,
+// predicate select and the 'last' flag.
+#define ALU_DWORD0(src0_sel, s0r, s0e, s0n, src1_sel, s1r, s1e, s1n, im, ps, last) \
+        ( ((src0_sel) <<  0) \
+        | ((s0r)      <<  9) \
+        | ((s0e)      << 10) \
+        | ((s0n)      << 12) \
+        | ((src1_sel) << 13) \
+        | ((s1r)      << 22) \
+        | ((s1e)      << 23) \
+        | ((s1n)      << 25) \
+        | ((im)       << 26) \
+        | ((ps)       << 29) \
+        | ((last)     << 31) )
+// ALU instruction, second dword, two-operand form.
+// R7xx has alu_inst at a different slot, and no fog merge any more
+// (no fixed-function fog any more), hence the two encodings below.
+#define R6xx_ALU_DWORD1_OP2(s0a, s1a, uem, up, wm, fm, omod, alu_inst, bs, dst_gpr, dr, de, clamp) \
+        ( ((s0a)      <<  0) \
+        | ((s1a)      <<  1) \
+        | ((uem)      <<  2) \
+        | ((up)       <<  3) \
+        | ((wm)       <<  4) \
+        | ((fm)       <<  5) \
+        | ((omod)     <<  6) \
+        | ((alu_inst) <<  8) \
+        | ((bs)       << 18) \
+        | ((dst_gpr)  << 21) \
+        | ((dr)       << 28) \
+        | ((de)       << 29) \
+        | ((clamp)    << 31) )
+#define R7xx_ALU_DWORD1_OP2(s0a, s1a, uem, up, wm, omod, alu_inst, bs, dst_gpr, dr, de, clamp) \
+        ( ((s0a)      <<  0) \
+        | ((s1a)      <<  1) \
+        | ((uem)      <<  2) \
+        | ((up)       <<  3) \
+        | ((wm)       <<  4) \
+        | ((omod)     <<  5) \
+        | ((alu_inst) <<  7) \
+        | ((bs)       << 18) \
+        | ((dst_gpr)  << 21) \
+        | ((dr)       << 28) \
+        | ((de)       << 29) \
+        | ((clamp)    << 31) )
+// This is a general chipset macro, but due to selection by chipid typically not usable in static arrays
+// Fog is NOT USED on R7xx, even if specified: 'fm' is simply not forwarded
+// to the R7xx encoding.
+#define ALU_DWORD1_OP2(chipfamily, s0a, s1a, uem, up, wm, fm, omod, alu_inst, bs, dst_gpr, dr, de, clamp) \
+    ((chipfamily) < CHIP_FAMILY_RV770 \
+     ? R6xx_ALU_DWORD1_OP2(s0a, s1a, uem, up, wm, fm, omod, alu_inst, bs, dst_gpr, dr, de, clamp) \
+     : R7xx_ALU_DWORD1_OP2(s0a, s1a, uem, up, wm, omod, alu_inst, bs, dst_gpr, dr, de, clamp))
+// ALU instruction, second dword, three-operand form (carries the third source).
+#define ALU_DWORD1_OP3(src2_sel, s2r, s2e, s2n, alu_inst, bs, dst_gpr, dr, de, clamp) \
+        ( ((src2_sel) <<  0) \
+        | ((s2r)      <<  9) \
+        | ((s2e)      << 10) \
+        | ((s2n)      << 12) \
+        | ((alu_inst) << 13) \
+        | ((bs)       << 18) \
+        | ((dst_gpr)  << 21) \
+        | ((dr)       << 28) \
+        | ((de)       << 29) \
+        | ((clamp)    << 31) )
+
+// VTX clause insts
+// Apart from MEGA_FETCH_COUNT, every wrapper below expands to its argument
+// unchanged and only names the field where a VTX_DWORD* macro is invoked.
+// vtx insts
+#define VTX_INST(x)        (x)		// SQ_VTX_INST_*
+
+// fetch type
+#define FETCH_TYPE(x)        (x)	// SQ_VTX_FETCH_*
+
+#define FETCH_WHOLE_QUAD(x)        (x)
+#define BUFFER_ID(x)        (x)
+#define SRC_GPR(x)          (x)
+#define SRC_REL(x)          (x)
+// encodes (x - 1), with 0 kept as 0
+#define MEGA_FETCH_COUNT(x)        ((x) ? ((x) - 1) : 0)
+
+#define SEMANTIC_ID(x)        (x)
+#define DST_SEL_X(x)          (x)
+#define DST_SEL_Y(x)          (x)
+#define DST_SEL_Z(x)          (x)
+#define DST_SEL_W(x)          (x)
+#define USE_CONST_FIELDS(x)   (x)
+#define DATA_FORMAT(x)        (x)
+// num format
+#define NUM_FORMAT_ALL(x)     (x)	// SQ_NUM_FORMAT_*
+// format comp
+#define FORMAT_COMP_ALL(x)     (x)	// SQ_FORMAT_COMP_*
+// sma; use one of the SRF_MODE_* values below
+#define SRF_MODE_ALL(x)     (x)
+#define SRF_MODE_ZERO_CLAMP_MINUS_ONE      0
+#define SRF_MODE_NO_ZERO                   1
+#define OFFSET(x)     (x)
+// endian swap
+#define ENDIAN_SWAP(x)     (x)		// SQ_ENDIAN_*
+#define CONST_BUF_NO_STRIDE(x)     (x)
+// mf
+#define MEGA_FETCH(x)     (x)
+
+// Vertex fetch instruction, first dword.
+#define VTX_DWORD0(vtx_inst, ft, fwq, buffer_id, src_gpr, sr, ssx, mfc) \
+        ( ((vtx_inst)  <<  0) \
+        | ((ft)        <<  5) \
+        | ((fwq)       <<  7) \
+        | ((buffer_id) <<  8) \
+        | ((src_gpr)   << 16) \
+        | ((sr)        << 23) \
+        | ((ssx)       << 24) \
+        | ((mfc)       << 26) )
+// Vertex fetch instruction, second dword, semantic-id destination variant.
+#define VTX_DWORD1_SEM(semantic_id, dsx, dsy, dsz, dsw, ucf, data_format, nfa, fca, sma) \
+        ( ((semantic_id) <<  0) \
+        | ((dsx)         <<  9) \
+        | ((dsy)         << 12) \
+        | ((dsz)         << 15) \
+        | ((dsw)         << 18) \
+        | ((ucf)         << 21) \
+        | ((data_format) << 22) \
+        | ((nfa)         << 28) \
+        | ((fca)         << 30) \
+        | ((sma)         << 31) )
+// Vertex fetch instruction, second dword, GPR destination variant.
+#define VTX_DWORD1_GPR(dst_gpr, dr, dsx, dsy, dsz, dsw, ucf, data_format, nfa, fca, sma) \
+        ( ((dst_gpr)     <<  0) \
+        | ((dr)          <<  7) \
+        | ((dsx)         <<  9) \
+        | ((dsy)         << 12) \
+        | ((dsz)         << 15) \
+        | ((dsw)         << 18) \
+        | ((ucf)         << 21) \
+        | ((data_format) << 22) \
+        | ((nfa)         << 28) \
+        | ((fca)         << 30) \
+        | ((sma)         << 31) )
+// Vertex fetch instruction, third dword.
+#define VTX_DWORD2(offset, es, cbns, mf) \
+        ( ((offset) <<  0) \
+        | ((es)     << 16) \
+        | ((cbns)   << 18) \
+        | ((mf)     << 19) )
+// All-zero padding dword.
+#define VTX_DWORD_PAD 0x00000000
+
+// TEX clause insts
+// Every wrapper below expands to its argument unchanged and only names the
+// field where a TEX_DWORD* macro is invoked.
+// tex insts
+#define TEX_INST(x)     (x)		// SQ_TEX_INST_*
+
+#define BC_FRAC_MODE(x)         (x)
+// NOTE(review): FETCH_WHOLE_QUAD is also defined, identically, among the VTX
+// field wrappers earlier in this header; an identical redefinition is legal C.
+#define FETCH_WHOLE_QUAD(x)     (x)
+#define RESOURCE_ID(x)          (x)
+// the 'ac' argument of TEX_DWORD0 (R7xx only)
+#define R7xx_ALT_CONST(x)            (x)
+
+#define LOD_BIAS(x)     (x)
+//ct: coordinate type per component; use TEX_UNNORMALIZED or TEX_NORMALIZED
+#define COORD_TYPE_X(x)     (x)
+#define COORD_TYPE_Y(x)     (x)
+#define COORD_TYPE_Z(x)     (x)
+#define COORD_TYPE_W(x)     (x)
+#define TEX_UNNORMALIZED                0
+#define TEX_NORMALIZED                  1
+#define OFFSET_X(x)     (x)
+#define OFFSET_Y(x)     (x)
+#define OFFSET_Z(x)     (x)
+#define SAMPLER_ID(x)     (x)
+
+// Texture fetch instruction, first dword.
+// R7xx has an additional parameter ALT_CONST ('ac').  It is always part of
+// the argument list, but ALT_CONST is R7xx only.
+#define TEX_DWORD0(tex_inst, bfm, fwq, resource_id, src_gpr, sr, ac) \
+        ( ((tex_inst)    <<  0) \
+        | ((bfm)         <<  5) \
+        | ((fwq)         <<  7) \
+        | ((resource_id) <<  8) \
+        | ((src_gpr)     << 16) \
+        | ((sr)          << 23) \
+        | ((ac)          << 24) )
+// Texture fetch instruction, second dword: destination, LOD bias and
+// per-component coordinate types.
+#define TEX_DWORD1(dst_gpr, dr, dsx, dsy, dsz, dsw, lod_bias, ctx, cty, ctz, ctw) \
+        ( ((dst_gpr)  <<  0) \
+        | ((dr)       <<  7) \
+        | ((dsx)      <<  9) \
+        | ((dsy)      << 12) \
+        | ((dsz)      << 15) \
+        | ((dsw)      << 18) \
+        | ((lod_bias) << 21) \
+        | ((ctx)      << 28) \
+        | ((cty)      << 29) \
+        | ((ctz)      << 30) \
+        | ((ctw)      << 31) )
+// Texture fetch instruction, third dword: texel offsets, sampler id and
+// source swizzle.
+#define TEX_DWORD2(offset_x, offset_y, offset_z, sampler_id, ssx, ssy, ssz, ssw) \
+        ( ((offset_x)   <<  0) \
+        | ((offset_y)   <<  5) \
+        | ((offset_z)   << 10) \
+        | ((sampler_id) << 15) \
+        | ((ssx)        << 20) \
+        | ((ssy)        << 23) \
+        | ((ssz)        << 26) \
+        | ((ssw)        << 29) )
+// All-zero padding dword.
+#define TEX_DWORD_PAD 0x00000000
+
+
+#endif
diff --git a/src/r600_state.h b/src/r600_state.h
new file mode 100644
index 0000000..bf9cdb5
--- /dev/null
+++ b/src/r600_state.h
@@ -0,0 +1,227 @@
+#ifndef __R600_STATE_H__
+#define __R600_STATE_H__
+
+#include "xf86drm.h"
+
+/* Boolean flag type used by the state structs in this header. */
+typedef int bool_t;
+
+/* seriously ?! @#$%% */
+/*
+ * Map the C99 fixed-width names onto CARD32/CARD64.
+ * NOTE(review): these are preprocessor macros, not typedefs, so they rewrite
+ * every later use of the names, including any declaration of uint32_t or
+ * uint64_t in a header included after this one -- confirm the include order
+ * in each user of this file.
+ */
+# define uint32_t CARD32
+# define uint64_t CARD64
+
+/*
+ * Zero an object in place.  'x' must be an lvalue naming the object itself
+ * (not a pointer to it): sizeof(x) bytes starting at &(x) are cleared.
+ * The argument is parenthesized so the address-of always applies to the
+ * whole argument expression.
+ */
+#define CLEAR(x) memset (&(x), 0, sizeof(x))
+
+/*
+ * Sequencer / thread handling.
+ * Every per-stage quantity is listed in PS, VS, GS, ES order.
+ */
+typedef struct {
+    /* priorities */
+    int ps_prio, vs_prio, gs_prio, es_prio;
+    /* GPR counts, followed by the temporary GPR count */
+    int num_ps_gprs, num_vs_gprs, num_gs_gprs, num_es_gprs;
+    int num_temp_gprs;
+    /* thread counts */
+    int num_ps_threads, num_vs_threads, num_gs_threads, num_es_threads;
+    /* stack entry counts */
+    int num_ps_stack_entries, num_vs_stack_entries;
+    int num_gs_stack_entries, num_es_stack_entries;
+} sq_config_t;
+
+/* Color buffer / render target description. */
+typedef struct {
+    int id;
+    int w, h;
+    uint64_t base;
+    int format, endian;
+    int array_mode;						// tiling
+    int number_type, read_size, comp_swap, tile_mode;
+    int blend_clamp, clear_color, blend_bypass, blend_float32;
+    int simple_float, round_mode, tile_compact, source_format;
+} cb_config_t;
+
+/* Depth buffer description. */
+typedef struct {
+    int w, h;
+    uint64_t base;
+    int format, read_size;
+    int array_mode;						// tiling
+    int tile_surface_en, tile_compact;
+    int zrange_precision;
+} db_config_t;
+
+/* Shader program setup, shared by the fs/vs/ps setup entry points. */
+typedef struct {
+    uint64_t shader_addr;
+    int num_gprs, stack_size;
+    int dx10_clamp;
+    int prime_cache_pgm_en, prime_cache_on_draw;
+    int fetch_cache_lines;
+    int prime_cache_en, prime_cache_on_const;
+    int clamp_consts, export_mode;
+    int uncached_first_inst;
+} shader_config_t;
+
+/* Vertex buffer / vtx resource description. */
+typedef struct {
+    int id;
+    uint64_t vb_addr;
+    uint32_t vtx_num_entries, vtx_size_dw;
+    int clamp_x, format;
+    int num_format_all, format_comp_all, srf_mode_all;
+    int endian, mem_req_size;
+} vtx_resource_t;
+
+/* Texture resource description. */
+typedef struct {
+    int id;
+    /* geometry and layout */
+    int w, h, pitch, depth, dim;
+    int tile_mode, tile_type;
+    int format;
+    /* addresses of the base level and of the mip chain */
+    uint64_t base, mip_base;
+    /* per-component and whole-texel format controls */
+    int format_comp_x, format_comp_y, format_comp_z, format_comp_w;
+    int num_format_all, srf_mode_all, force_degamma;
+    int endian, request_size;
+    /* destination swizzle */
+    int dst_sel_x, dst_sel_y, dst_sel_z, dst_sel_w;
+    /* mip level and array slice ranges */
+    int base_level, last_level;
+    int base_array, last_array;
+    int mpeg_clamp, perf_modulation, interlaced;
+} tex_resource_t;
+
+/* Texture sampler description. */
+typedef struct {
+    int id;
+
+    /* Clamping */
+    int clamp_x;
+    int clamp_y;
+    int clamp_z;
+    int border_color;
+
+    /* Filtering */
+    int xy_mag_filter;
+    int xy_min_filter;
+    int z_filter;
+    int mip_filter;
+    bool_t high_precision_filter;	/* ? */
+    int perf_mip;			/* ? 0-7 */
+    int perf_z;				/* ? 3 */
+
+    /* LoD selection */
+    int min_lod;			/* 0-0x3ff */
+    int max_lod;			/* 0-0x3ff */
+    int lod_bias;			/* 0-0xfff (signed?) */
+    int lod_bias2;			/* ? 0-0xfff (signed?) */
+    bool_t lod_uses_minor_axis;		/* ? */
+
+    /* Other stuff */
+    bool_t point_sampling_clamp;	/* ? */
+    bool_t tex_array_override;		/* ? */
+    bool_t mc_coord_truncate;		/* ? */
+    bool_t force_degamma;		/* ? */
+    bool_t fetch_4;			/* ? */
+    bool_t sample_is_pcf;		/* ? */
+    bool_t type;			/* ? */
+    int depth_compare;			/* only depth textures? */
+    int chroma_key;
+} tex_sampler_t;
+
+/* Parameters of one draw command. */
+typedef struct {
+    uint32_t prim_type, vgt_draw_initiator, index_type;
+    uint32_t num_instances, num_indices;
+} draw_config_t;
+
+/*
+ * Indirect-buffer emit helpers.
+ *
+ * These are declared WITHOUT "inline" on purpose.  They are called from
+ * several source files, so an external definition must exist.  With C99
+ * inline semantics a function whose file-scope declarations all say plain
+ * "inline" only gets an inline definition and no external symbol; having a
+ * non-inline declaration here guarantees the external definition regardless
+ * of the inline model the compiler uses.
+ */
+void e32(drmBufPtr ib, uint32_t dword);
+void efloat(drmBufPtr ib, float f);
+void pack3(drmBufPtr ib, int cmd, unsigned num);
+void pack0 (drmBufPtr ib, uint32_t reg, int num);
+void ereg (drmBufPtr ib, uint32_t reg, uint32_t val);
+
+/* Indirect-buffer submission / release. */
+void R600CPFlushIndirect(ScrnInfoPtr pScrn, drmBufPtr ib);
+void R600IBDiscard(ScrnInfoPtr pScrn, drmBufPtr ib);
+
+/*
+ * State setup and draw entry points.  Each one takes the screen and the
+ * indirect buffer it operates on, plus the listed parameters.
+ */
+uint64_t
+upload (ScrnInfoPtr pScrn, void *shader, int size, int offset);
+void
+wait_3d_idle_clean(ScrnInfoPtr pScrn, drmBufPtr ib);
+void
+wait_3d_idle(ScrnInfoPtr pScrn, drmBufPtr ib);
+void
+start_3d(ScrnInfoPtr pScrn, drmBufPtr ib);
+void
+set_render_target(ScrnInfoPtr pScrn, drmBufPtr ib, cb_config_t *cb_conf);
+void
+cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type, uint32_t size, uint64_t mc_addr);
+void
+fs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *fs_conf);
+void
+vs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *vs_conf);
+void
+ps_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *ps_conf);
+void
+set_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, float *const_buf);
+void
+set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res);
+void
+set_tex_resource(ScrnInfoPtr pScrn, drmBufPtr ib, tex_resource_t *tex_res);
+void
+set_tex_sampler (ScrnInfoPtr pScrn, drmBufPtr ib, tex_sampler_t *s);
+void
+set_screen_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2);
+void
+set_vport_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x2, int y2);
+void
+set_generic_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2);
+void
+set_window_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2);
+void
+set_clip_rect(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x2, int y2);
+void
+set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib);
+void
+draw_immd(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf, uint32_t *indices);
+void
+draw_auto(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf);
+
+#endif
diff --git a/src/r600_textured_videofuncs.c b/src/r600_textured_videofuncs.c
new file mode 100644
index 0000000..b1cd4f1
--- /dev/null
+++ b/src/r600_textured_videofuncs.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright 2008 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Author: Alex Deucher <alexander.deucher at amd.com>
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "xf86.h"
+
+#include "exa.h"
+
+#include "radeon.h"
+#include "r600_shader.h"
+#include "r600_reg.h"
+#include "r600_state.h"
+
+#include "radeon_video.h"
+
+#include <X11/extensions/Xv.h>
+#include "fourcc.h"
+
+#include "damage.h"
+
+
+/*
+ * Draw the Xv frame described by pPriv into pPriv->pPixmap with the 3D engine.
+ *
+ * The source is bound as two textures: an 8-bit Y plane (resource 0) and an
+ * 8_8 chroma plane (resource 1) that starts at the 256-byte aligned offset
+ * following the Y plane.  The pixel shader constants hold the YUV -> RGB
+ * conversion rows.  One three-vertex rectangle is emitted per box of
+ * pPriv->clip; the indirect buffer is then flushed and the clip region is
+ * reported as damaged.
+ *
+ * Nothing is drawn when the source or destination pitch is not a multiple
+ * of 8, or when the destination pixmap is neither 16 nor 32 bits per pixel.
+ */
+void
+R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+    PixmapPtr pPixmap = pPriv->pPixmap;
+    BoxPtr pBox = REGION_RECTS(&pPriv->clip);
+    int nBox = REGION_NUM_RECTS(&pPriv->clip);
+    int dstxoff, dstyoff;
+    cb_config_t     cb_conf;
+    tex_resource_t  tex_res;
+    tex_sampler_t   tex_samp;
+    shader_config_t vs_conf, ps_conf;
+    draw_config_t   draw_conf;
+    vtx_resource_t  vtx_res;
+    int uv_offset;
+
+    static float ps_alu_consts[] = {
+	1.0,  0.0,      1.13983,  -1.13983/2,        // r - c[0]
+	1.0, -0.39465, -0.5806,  (0.39465+0.5806)/2, // g - c[1]
+	1.0,  2.03211,  0.0,     -2.03211/2,         // b - c[2]
+    };
+
+    CLEAR (cb_conf);
+    CLEAR (tex_res);
+    CLEAR (tex_samp);
+    CLEAR (vs_conf);
+    CLEAR (ps_conf);
+    CLEAR (draw_conf);
+    CLEAR (vtx_res);
+
+    /* destination pitch in pixels, source pitch as handed in by the port */
+    accel_state->dst_pitch = exaGetPixmapPitch(pPixmap) / (pPixmap->drawable.bitsPerPixel / 8);
+    accel_state->src_pitch[0] = pPriv->src_pitch;
+
+    // bad pitch
+    if (accel_state->src_pitch[0] & 7)
+	return;
+    if (accel_state->dst_pitch & 7)
+	return;
+
+#ifdef COMPOSITE
+    dstxoff = -pPixmap->screen_x + pPixmap->drawable.x;
+    dstyoff = -pPixmap->screen_y + pPixmap->drawable.y;
+#else
+    dstxoff = 0;
+    dstyoff = 0;
+#endif
+
+    /* From here on every exit path must either flush or discard the IB. */
+    accel_state->ib = RADEONCPGetBuffer(pScrn);
+
+    /* Init */
+    start_3d(pScrn, accel_state->ib);
+
+    //cp_set_surface_sync(pScrn, accel_state->ib);
+
+    set_default_state(pScrn, accel_state->ib);
+
+    /* Scissor / viewport */
+    ereg  (accel_state->ib, PA_CL_VTE_CNTL,                      VTX_XY_FMT_bit);
+    ereg  (accel_state->ib, PA_CL_CLIP_CNTL,                     CLIP_DISABLE_bit);
+
+    accel_state->vs_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	accel_state->xv_vs_offset;
+    accel_state->ps_mc_addr = info->fbLocation + pScrn->fbOffset + accel_state->shaders->offset +
+	accel_state->xv_ps_offset;
+
+    accel_state->vs_size = 512;
+    accel_state->ps_size = 512;
+
+    /* Shader */
+
+    /* flush SQ cache */
+    cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
+			accel_state->vs_size, accel_state->vs_mc_addr);
+
+    vs_conf.shader_addr         = accel_state->vs_mc_addr;
+    vs_conf.num_gprs            = 2;
+    vs_conf.stack_size          = 0;
+    vs_setup                    (pScrn, accel_state->ib, &vs_conf);
+
+    /* flush SQ cache */
+    cp_set_surface_sync(pScrn, accel_state->ib, SH_ACTION_ENA_bit,
+			accel_state->ps_size, accel_state->ps_mc_addr);
+
+    ps_conf.shader_addr         = accel_state->ps_mc_addr;
+    ps_conf.num_gprs            = 4;
+    ps_conf.stack_size          = 0;
+    ps_conf.uncached_first_inst = 1;
+    ps_conf.clamp_consts        = 0;
+    ps_conf.export_mode         = 2;
+    ps_setup                    (pScrn, accel_state->ib, &ps_conf);
+
+    // PS alu constants
+    set_alu_consts(pScrn, accel_state->ib, 0, sizeof(ps_alu_consts) / SQ_ALU_CONSTANT_offset, ps_alu_consts);
+
+    /* Texture */
+    accel_state->src_mc_addr[0] = pPriv->src_offset;
+    /* size of the Y plane: source pitch times source height (the chroma
+     * plane synced below is half of that) */
+    accel_state->src_size[0] = accel_state->src_pitch[0] * pPriv->h;
+
+    /* flush texture cache for the whole Y plane */
+    cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			accel_state->src_size[0],
+			accel_state->src_mc_addr[0]);
+
+    // Y texture
+    tex_res.id                  = 0;
+    tex_res.w                   = pPriv->w;
+    tex_res.h                   = pPriv->h;
+    tex_res.pitch               = accel_state->src_pitch[0];
+    tex_res.depth               = 0;
+    tex_res.dim                 = SQ_TEX_DIM_2D;
+    tex_res.base                = accel_state->src_mc_addr[0];
+    tex_res.mip_base            = accel_state->src_mc_addr[0];
+
+    tex_res.format              = FMT_8;
+    tex_res.dst_sel_x           = SQ_SEL_X; //Y
+    tex_res.dst_sel_y           = SQ_SEL_1;
+    tex_res.dst_sel_z           = SQ_SEL_1;
+    tex_res.dst_sel_w           = SQ_SEL_1;
+
+    tex_res.request_size        = 1;
+    tex_res.base_level          = 0;
+    tex_res.last_level          = 0;
+    tex_res.perf_modulation     = 0;
+    tex_res.interlaced          = 0;
+    set_tex_resource            (pScrn, accel_state->ib, &tex_res);
+
+    // UV texture: starts right after the Y plane, rounded up to 256 bytes
+    uv_offset = accel_state->src_pitch[0] * pPriv->h;
+    uv_offset = (uv_offset + 255) & ~255;
+
+    cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			accel_state->src_size[0] / 2,
+			accel_state->src_mc_addr[0] + uv_offset);
+
+    tex_res.id                  = 1;
+    tex_res.format              = FMT_8_8;
+    tex_res.w                   = pPriv->w >> 1;
+    tex_res.h                   = pPriv->h >> 1;
+    tex_res.pitch               = accel_state->src_pitch[0] >> 1;
+    tex_res.dst_sel_x           = SQ_SEL_Y; //V
+    tex_res.dst_sel_y           = SQ_SEL_X; //U
+    tex_res.dst_sel_z           = SQ_SEL_1;
+    tex_res.dst_sel_w           = SQ_SEL_1;
+    tex_res.interlaced          = 0;
+    // XXX tex bases need to be 256B aligned
+    tex_res.base                = accel_state->src_mc_addr[0] + uv_offset;
+    tex_res.mip_base            = accel_state->src_mc_addr[0] + uv_offset;
+    set_tex_resource            (pScrn, accel_state->ib, &tex_res);
+
+    // Y sampler
+    tex_samp.id                 = 0;
+    tex_samp.clamp_x            = SQ_TEX_CLAMP_LAST_TEXEL;
+    tex_samp.clamp_y            = SQ_TEX_CLAMP_LAST_TEXEL;
+    tex_samp.clamp_z            = SQ_TEX_WRAP;
+
+    // xxx: switch to bicubic
+    tex_samp.xy_mag_filter      = SQ_TEX_XY_FILTER_BILINEAR;
+    tex_samp.xy_min_filter      = SQ_TEX_XY_FILTER_BILINEAR;
+
+    tex_samp.z_filter           = SQ_TEX_Z_FILTER_NONE;
+    tex_samp.mip_filter         = 0;			/* no mipmap */
+    set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
+
+    // UV sampler: same settings as the Y sampler, only the id differs
+    tex_samp.id                 = 1;
+    set_tex_sampler             (pScrn, accel_state->ib, &tex_samp);
+
+    /* Render setup */
+    ereg  (accel_state->ib, CB_SHADER_MASK,                      (0x0f << OUTPUT0_ENABLE_shift));
+    ereg  (accel_state->ib, R7xx_CB_SHADER_CONTROL,              (RT0_ENABLE_bit));
+    ereg  (accel_state->ib, CB_COLOR_CONTROL,                    (0xcc << ROP3_shift)); /* copy */
+
+    cb_conf.id = 0;
+
+    accel_state->dst_mc_addr = exaGetPixmapOffset(pPixmap) + info->fbLocation + pScrn->fbOffset;
+    /* needed by the destination surface sync after the draw */
+    accel_state->dst_size = exaGetPixmapPitch(pPixmap) * pPixmap->drawable.height;
+
+    cb_conf.w = accel_state->dst_pitch;
+    cb_conf.h = pPixmap->drawable.height;
+    cb_conf.base = accel_state->dst_mc_addr;
+
+    switch (pPixmap->drawable.bitsPerPixel) {
+    case 16:
+	if (pPixmap->drawable.depth == 15) {
+	    cb_conf.format = COLOR_1_5_5_5;
+	    cb_conf.comp_swap = 1; //ARGB
+	} else {
+	    cb_conf.format = COLOR_5_6_5;
+	    cb_conf.comp_swap = 2; //RGB
+	}
+	break;
+    case 32:
+	cb_conf.format = COLOR_8_8_8_8;
+	cb_conf.comp_swap = 1; //ARGB
+	break;
+    default:
+	/* unsupported destination format: hand the IB back instead of
+	 * leaking it */
+	R600IBDiscard(pScrn, accel_state->ib);
+	return;
+    }
+
+    cb_conf.source_format = 1;
+    cb_conf.blend_clamp = 1;
+    set_render_target(pScrn, accel_state->ib, &cb_conf);
+
+    ereg  (accel_state->ib, PA_SU_SC_MODE_CNTL,                  (FACE_bit			|
+						 (POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift)	|
+						 (POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift)));
+    ereg  (accel_state->ib, DB_SHADER_CONTROL,                   ((1 << Z_ORDER_shift)		| /* EARLY_Z_THEN_LATE_Z */
+						 DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
+
+    /* Interpolator setup */
+    // export tex coords from VS
+    ereg  (accel_state->ib, SPI_VS_OUT_CONFIG, ((1 - 1) << VS_EXPORT_COUNT_shift));
+    ereg  (accel_state->ib, SPI_VS_OUT_ID_0, (0 << SEMANTIC_0_shift));
+
+    /* Enabling flat shading needs both FLAT_SHADE_bit in SPI_PS_INPUT_CNTL_x
+     * *and* FLAT_SHADE_ENA_bit in SPI_INTERP_CONTROL_0 */
+    ereg  (accel_state->ib, SPI_PS_IN_CONTROL_0,                 ((1 << NUM_INTERP_shift)));
+    ereg  (accel_state->ib, SPI_PS_IN_CONTROL_1,                 0);
+    ereg  (accel_state->ib, SPI_PS_INPUT_CNTL_0 + (0 <<2),       ((0    << SEMANTIC_shift)	|
+									     (0x03 << DEFAULT_VAL_shift)	|
+									     SEL_CENTROID_bit));
+    ereg  (accel_state->ib, SPI_INTERP_CONTROL_0,                0);
+
+
+    accel_state->vb_index = 0;
+
+    /*
+     * Build the vertex data in the second half of the indirect buffer:
+     * three vertices per clip box.
+     * NOTE(review): nothing here bounds vb_index against the space that is
+     * left in that half of the buffer -- confirm that the number of clip
+     * boxes can never exceed it.
+     */
+    while (nBox--) {
+	int srcX, srcY, srcw, srch;
+	int dstX, dstY, dstw, dsth;
+	struct r6xx_copy_vertex *xv_vb = (pointer)((char*)accel_state->ib->address + (accel_state->ib->total / 2));
+	struct r6xx_copy_vertex vertex[3];
+
+	dstX = pBox->x1 + dstxoff;
+	dstY = pBox->y1 + dstyoff;
+	dstw = pBox->x2 - pBox->x1;
+	dsth = pBox->y2 - pBox->y1;
+
+	/* map the box back into source coordinates */
+	srcX = ((pBox->x1 - pPriv->drw_x) *
+		pPriv->src_w) / pPriv->dst_w;
+	srcY = ((pBox->y1 - pPriv->drw_y) *
+		pPriv->src_h) / pPriv->dst_h;
+
+	srcw = (pPriv->src_w * dstw) / pPriv->dst_w;
+	srch = (pPriv->src_h * dsth) / pPriv->dst_h;
+
+	/* top-left, bottom-left, bottom-right; s/t are source coordinates
+	 * divided by the texture width/height */
+	vertex[0].x = (float)dstX;
+	vertex[0].y = (float)dstY;
+	vertex[0].s = (float)srcX / pPriv->w;
+	vertex[0].t = (float)srcY / pPriv->h;
+
+	vertex[1].x = (float)dstX;
+	vertex[1].y = (float)(dstY + dsth);
+	vertex[1].s = (float)srcX / pPriv->w;
+	vertex[1].t = (float)(srcY + srch) / pPriv->h;
+
+	vertex[2].x = (float)(dstX + dstw);
+	vertex[2].y = (float)(dstY + dsth);
+	vertex[2].s = (float)(srcX + srcw) / pPriv->w;
+	vertex[2].t = (float)(srcY + srch) / pPriv->h;
+
+#if 0
+	ErrorF("vertex 0: %f, %f, %f, %f\n", vertex[0].x, vertex[0].y, vertex[0].s, vertex[0].t);
+	ErrorF("vertex 1: %f, %f, %f, %f\n", vertex[1].x, vertex[1].y, vertex[1].s, vertex[1].t);
+	ErrorF("vertex 2: %f, %f, %f, %f\n", vertex[2].x, vertex[2].y, vertex[2].s, vertex[2].t);
+#endif
+
+	// append to vertex buffer
+	xv_vb[accel_state->vb_index++] = vertex[0];
+	xv_vb[accel_state->vb_index++] = vertex[1];
+	xv_vb[accel_state->vb_index++] = vertex[2];
+
+	pBox++;
+    }
+
+    /* empty clip region: nothing to draw */
+    if (accel_state->vb_index == 0) {
+	R600IBDiscard(pScrn, accel_state->ib);
+	DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
+	return;
+    }
+
+    accel_state->vb_mc_addr = info->gartLocation + info->dri->bufStart +
+	(accel_state->ib->idx * accel_state->ib->total) + (accel_state->ib->total / 2);
+    accel_state->vb_size = accel_state->vb_index * 16;
+
+    /* flush vertex cache */
+    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
+	(info->ChipFamily == CHIP_FAMILY_RV620) ||
+	(info->ChipFamily == CHIP_FAMILY_RS780) ||
+	(info->ChipFamily == CHIP_FAMILY_RV710))
+	cp_set_surface_sync(pScrn, accel_state->ib, TC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+    else
+	cp_set_surface_sync(pScrn, accel_state->ib, VC_ACTION_ENA_bit,
+			    accel_state->vb_size, accel_state->vb_mc_addr);
+
+    /* Vertex buffer setup */
+    vtx_res.id              = SQ_VTX_RESOURCE_vs;
+    vtx_res.vtx_size_dw     = 16 / 4;
+    vtx_res.vtx_num_entries = accel_state->vb_size / 4;
+    vtx_res.mem_req_size    = 1;
+    vtx_res.vb_addr         = accel_state->vb_mc_addr;
+    set_vtx_resource        (pScrn, accel_state->ib, &vtx_res);
+
+    draw_conf.prim_type          = DI_PT_RECTLIST;
+    draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
+    draw_conf.num_instances      = 1;
+    draw_conf.num_indices        = vtx_res.vtx_num_entries / vtx_res.vtx_size_dw;
+    draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
+
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_0,            0);	/* ? */
+    ereg  (accel_state->ib, VGT_INSTANCE_STEP_RATE_1,            0);
+
+    ereg  (accel_state->ib, VGT_MAX_VTX_INDX,                    draw_conf.num_indices);
+    ereg  (accel_state->ib, VGT_MIN_VTX_INDX,                    0);
+    ereg  (accel_state->ib, VGT_INDX_OFFSET,                     0);
+
+    draw_auto(pScrn, accel_state->ib, &draw_conf);
+
+    wait_3d_idle_clean(pScrn, accel_state->ib);
+
+    /* sync destination surface: both bits are required, so they are OR-ed
+     * together (a comma here would silently drop CB_ACTION_ENA_bit) */
+    cp_set_surface_sync(pScrn, accel_state->ib, (CB_ACTION_ENA_bit | CB0_DEST_BASE_ENA_bit),
+			accel_state->dst_size, accel_state->dst_mc_addr);
+
+    R600CPFlushIndirect(pScrn, accel_state->ib);
+
+    DamageDamageRegion(pPriv->pDraw, &pPriv->clip);
+}
diff --git a/src/r6xx_accel.c b/src/r6xx_accel.c
new file mode 100644
index 0000000..659d13d
--- /dev/null
+++ b/src/r6xx_accel.c
@@ -0,0 +1,1110 @@
+/*
+ * Copyright 2008 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors: Alex Deucher <alexander.deucher at amd.com>
+ *          Matthias Hopf <mhopf at suse.de>
+ */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "xf86.h"
+
+#include <errno.h>
+
+#include "radeon.h"
+#include "r600_shader.h"
+#include "radeon_reg.h"
+#include "r600_reg.h"
+#include "r600_state.h"
+
+#include "xf86drm.h"
+#include "radeon_drm.h"
+
+/* Append one 32-bit word at the buffer's current fill offset (ib->used, in bytes) and advance it by 4. */
+inline void e32(drmBufPtr ib, uint32_t dword)
+{
+    uint32_t *words = (pointer)(char*)ib->address;
+
+    *(words + (ib->used >> 2)) = dword;	/* byte offset converted to a dword index */
+    ib->used = ib->used + 4;
+}
+
+inline void efloat(drmBufPtr ib, float f)
+{
+    /* Reinterpret the float's storage as a 32-bit word through a union and emit it with e32(). */
+    union {
+	uint32_t bits;
+	float value;
+    } pun = { .value = f };
+    e32(ib, pun.bits);
+}
+
+inline void pack3(drmBufPtr ib, int cmd, unsigned num) /* emit one PACKET3 header dword built from cmd and num */
+{
+    e32 (ib, RADEON_CP_PACKET3 | (cmd << 8) | (((num-1) & 0x3fff) << 16)); /* cmd shifted to bit 8; (num-1), masked to 14 bits, shifted to bit 16 */
+}
+
+/* Emit the header for writing num consecutive registers, starting at byte offset reg. */
+/* A register inside one of the SET_* windows gets the matching PACKET3 opcode plus a dword index relative to that window; anything else gets a plain PACKET0 header. */
+inline void pack0 (drmBufPtr ib, uint32_t reg, int num)
+{
+    if (reg >= SET_CONFIG_REG_offset && reg < SET_CONFIG_REG_end) {
+	pack3 (ib, IT_SET_CONFIG_REG, num+1);	/* num+1: the index dword emitted next is counted too */
+	e32 (ib, (reg-SET_CONFIG_REG_offset) >> 2);
+    } else if (reg >= SET_CONTEXT_REG_offset && reg < SET_CONTEXT_REG_end) {
+	pack3 (ib, IT_SET_CONTEXT_REG, num+1);
+	e32 (ib, (reg-SET_CONTEXT_REG_offset) >> 2);	/* same base as the range check, instead of a bare 0x28000 */
+    } else if (reg >= SET_ALU_CONST_offset && reg < SET_ALU_CONST_end) {
+	pack3 (ib, IT_SET_ALU_CONST, num+1);
+	e32 (ib, (reg-SET_ALU_CONST_offset) >> 2);
+    } else if (reg >= SET_RESOURCE_offset && reg < SET_RESOURCE_end) {
+	pack3 (ib, IT_SET_RESOURCE, num+1);
+	e32 (ib, (reg-SET_RESOURCE_offset) >> 2);
+    } else if (reg >= SET_SAMPLER_offset && reg < SET_SAMPLER_end) {
+	pack3 (ib, IT_SET_SAMPLER, num+1);
+	e32 (ib, (reg-SET_SAMPLER_offset) >> 2);
+    } else if (reg >= SET_CTL_CONST_offset && reg < SET_CTL_CONST_end) {
+	pack3 (ib, IT_SET_CTL_CONST, num+1);
+	e32 (ib, (reg-SET_CTL_CONST_offset) >> 2);
+    } else if (reg >= SET_LOOP_CONST_offset && reg < SET_LOOP_CONST_end) {
+	pack3 (ib, IT_SET_LOOP_CONST, num+1);
+	e32 (ib, (reg-SET_LOOP_CONST_offset) >> 2);
+    } else if (reg >= SET_BOOL_CONST_offset && reg < SET_BOOL_CONST_end) {
+	pack3 (ib, IT_SET_BOOL_CONST, num+1);
+	e32 (ib, (reg-SET_BOOL_CONST_offset) >> 2);
+    } else {
+	e32 (ib, CP_PACKET0 (reg, num-1));	/* outside every SET_* window: plain PACKET0, no index dword */
+    }
+}
+
+/* write a single register: a pack0() header for one register, followed by the value */
+inline void ereg (drmBufPtr ib, uint32_t reg, uint32_t val)
+{
+    pack0 (ib, reg, 1);
+    e32   (ib, val);
+}
+
+/* Flush the indirect buffer to the kernel for submission to the card */
+void R600CPFlushIndirect(ScrnInfoPtr pScrn, drmBufPtr ib)
+{
+    RADEONInfoPtr  info = RADEONPTR(pScrn);
+    drmBufPtr          buffer = ib;
+    int                start  = 0;
+    drm_radeon_indirect_t  indirect;
+
+    if (!buffer) return; /* no buffer, nothing to submit */
+
+    //xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Flushing buffer %d\n",
+    //       buffer->idx);
+
+    while (buffer->used & 0x3c){ /* 0x3c tests bits 2-5 of the byte count; e32() advances it 4 bytes at a time */
+        e32(buffer, CP_PACKET2()); /* fill up to multiple of 16 dwords */
+    }
+
+    //ErrorF("buffer bytes: %d\n", buffer->used);
+
+    indirect.idx     = buffer->idx;
+    indirect.start   = start; /* always 0 here */
+    indirect.end     = buffer->used; /* byte count after padding */
+    indirect.discard = 1;
+
+    drmCommandWriteRead(info->dri->drmFD, DRM_RADEON_INDIRECT,
+			&indirect, sizeof(drm_radeon_indirect_t)); /* NOTE(review): the return value is not checked */
+
+}
+
+void R600IBDiscard(ScrnInfoPtr pScrn, drmBufPtr ib)
+{   /* Zero the fill level, then flush the now-empty buffer; a NULL ib is ignored. */
+    if (ib) {
+	ib->used = 0;
+	R600CPFlushIndirect(pScrn, ib);
+    }
+}
+
+void
+wait_3d_idle_clean(ScrnInfoPtr pScrn, drmBufPtr ib)
+{
+    /* Queues an EVENT_WRITE packet followed by a WAIT_UNTIL register write; pScrn is not used here. */
+    //flush caches, don't generate timestamp
+    pack3 (ib, IT_EVENT_WRITE, 1);
+    e32   (ib, CACHE_FLUSH_AND_INV_EVENT);
+    // wait for 3D idle clean
+    ereg  (ib, WAIT_UNTIL,                          (WAIT_3D_IDLE_bit |
+						     WAIT_3D_IDLECLEAN_bit));
+}
+
+void
+wait_3d_idle(ScrnInfoPtr pScrn, drmBufPtr ib)
+{
+    /* Queues a single WAIT_UNTIL write with only WAIT_3D_IDLE_bit set; pScrn is not used here. */
+    ereg  (ib, WAIT_UNTIL,                          WAIT_3D_IDLE_bit);
+
+}
+
+static void
+reset_cb(ScrnInfoPtr pScrn, drmBufPtr ib)
+{
+    int remaining = 8;	/* eight consecutive registers starting at CB_COLOR0_INFO, all zeroed */
+
+    pack0 (ib, CB_COLOR0_INFO, remaining);
+    while (remaining-- > 0)
+	e32 (ib, 0);
+}
+
+static void
+reset_td_samplers(ScrnInfoPtr pScrn, drmBufPtr ib)
+{
+    int ps_left = 4*TD_PS_SAMPLER0_BORDER_RED_num;	/* dword count of the PS border-colour run */
+    int vs_left = 4*TD_VS_SAMPLER0_BORDER_RED_num;	/* dword count of the VS border-colour run */
+
+    wait_3d_idle(pScrn, ib);	/* an idle wait brackets the writes on both sides */
+    pack0 (ib, TD_PS_SAMPLER0_BORDER_RED, ps_left);
+    while (ps_left-- > 0)
+	e32   (ib, 0);
+    pack0 (ib, TD_VS_SAMPLER0_BORDER_RED, vs_left);
+    while (vs_left-- > 0)
+	e32   (ib, 0);
+
+    wait_3d_idle(pScrn, ib);
+}
+
+static void
+reset_sampler_const (ScrnInfoPtr pScrn, drmBufPtr ib)
+{
+    int sampler = 0;	/* each sampler receives three words: depth-compare LESSEQUAL, MAX_LOD_mask, zero */
+    while (sampler < SQ_TEX_SAMPLER_WORD_all_num) {
+	pack0 (ib, SQ_TEX_SAMPLER_WORD + sampler * SQ_TEX_SAMPLER_WORD_offset, 3);
+	e32   (ib, SQ_TEX_DEPTH_COMPARE_LESSEQUAL << DEPTH_COMPARE_FUNCTION_shift);
+	e32   (ib, MAX_LOD_mask);
+	e32   (ib, 0);
+	sampler++;
+    }
+}
+
+static void
+reset_dx9_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib)
+{
+    /* One header covering SQ_ALU_CONSTANT_all_num * (SQ_ALU_CONSTANT_offset >> 2) dwords, each written as 0.0. */
+    const int total = SQ_ALU_CONSTANT_all_num * (SQ_ALU_CONSTANT_offset >> 2);
+    int left = total;
+
+    pack0 (ib, SQ_ALU_CONSTANT, total);
+    while (left-- > 0)
+	efloat (ib, 0.0);
+}
+
+static void
+reset_bool_loop_const(ScrnInfoPtr pScrn, drmBufPtr ib)
+{
+    int idx = 0;
+
+    /* Boolean constants: one single-register write per entry, entries 4 bytes apart. */
+    while (idx < SQ_BOOL_CONST_0_num)
+	ereg (ib, SQ_BOOL_CONST_0 + (idx++ << 2), 0);
+
+    /* Loop constants: one header, then SQ_LOOP_CONST_all_num zero dwords. */
+    pack0 (ib, SQ_LOOP_CONST, SQ_LOOP_CONST_all_num);
+    for (idx = SQ_LOOP_CONST_all_num; idx > 0; idx--)
+	e32 (ib, 0);
+}
+
+void
+start_3d(ScrnInfoPtr pScrn, drmBufPtr ib)
+{
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    /* START_3D_CMDBUF is only emitted for families ordered before RV770. */
+    if (info->ChipFamily < CHIP_FAMILY_RV770) {
+	pack3 (ib, IT_START_3D_CMDBUF, 1);
+	e32   (ib, 0);
+    }
+    /* CONTEXT_CONTROL carries two dwords, both 0x80000000 (only bit 31 set). */
+    pack3 (ib, IT_CONTEXT_CONTROL, 2);
+    e32   (ib, 0x80000000);
+    e32   (ib, 0x80000000);
+    /* Finish with a cache flush and idle/idle-clean wait. */
+    wait_3d_idle_clean (pScrn, ib);
+}
+
+/*
+ * Setup of functional groups
+ */
+
+// asic stack/thread/gpr limits - need to query the drm
+static void
+sq_setup(ScrnInfoPtr pScrn, drmBufPtr ib, sq_config_t *sq_conf)
+{
+    uint32_t sq_config, sq_gpr_resource_mgmt_1, sq_gpr_resource_mgmt_2;
+    uint32_t sq_thread_resource_mgmt, sq_stack_resource_mgmt_1, sq_stack_resource_mgmt_2;
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    /* VC_ENABLE_bit is set for every family except the four listed here. */
+    if ((info->ChipFamily == CHIP_FAMILY_RV610) ||
+	(info->ChipFamily == CHIP_FAMILY_RV620) ||
+	(info->ChipFamily == CHIP_FAMILY_RS780) ||
+	(info->ChipFamily == CHIP_FAMILY_RV710))
+	sq_config = 0;						// no VC
+    else
+	sq_config = VC_ENABLE_bit;
+    /* Two fixed flags plus the four per-stage priorities from sq_conf. */
+    sq_config |= (DX9_CONSTS_bit |
+		  ALU_INST_PREFER_VECTOR_bit |
+		  (sq_conf->ps_prio << PS_PRIO_shift) |
+		  (sq_conf->vs_prio << VS_PRIO_shift) |
+		  (sq_conf->gs_prio << GS_PRIO_shift) |
+		  (sq_conf->es_prio << ES_PRIO_shift));
+    /* GPR counts: PS, VS and clause-temp go in word 1; GS and ES in word 2. */
+    sq_gpr_resource_mgmt_1 = ((sq_conf->num_ps_gprs << NUM_PS_GPRS_shift) |
+			      (sq_conf->num_vs_gprs << NUM_VS_GPRS_shift) |
+			      (sq_conf->num_temp_gprs << NUM_CLAUSE_TEMP_GPRS_shift));
+    sq_gpr_resource_mgmt_2 = ((sq_conf->num_gs_gprs << NUM_GS_GPRS_shift) |
+			      (sq_conf->num_es_gprs << NUM_ES_GPRS_shift));
+    /* Thread counts for all four stages share one word. */
+    sq_thread_resource_mgmt = ((sq_conf->num_ps_threads << NUM_PS_THREADS_shift) |
+			       (sq_conf->num_vs_threads << NUM_VS_THREADS_shift) |
+			       (sq_conf->num_gs_threads << NUM_GS_THREADS_shift) |
+			       (sq_conf->num_es_threads << NUM_ES_THREADS_shift));
+    /* Stack entries: PS and VS in word 1; GS and ES in word 2. */
+    sq_stack_resource_mgmt_1 = ((sq_conf->num_ps_stack_entries << NUM_PS_STACK_ENTRIES_shift) |
+				(sq_conf->num_vs_stack_entries << NUM_VS_STACK_ENTRIES_shift));
+
+    sq_stack_resource_mgmt_2 = ((sq_conf->num_gs_stack_entries << NUM_GS_STACK_ENTRIES_shift) |
+				(sq_conf->num_es_stack_entries << NUM_ES_STACK_ENTRIES_shift));
+    /* One header for six consecutive registers starting at SQ_CONFIG, written in this order. */
+    pack0 (ib, SQ_CONFIG, 6);
+    e32   (ib, sq_config);
+    e32   (ib, sq_gpr_resource_mgmt_1);
+    e32   (ib, sq_gpr_resource_mgmt_2);
+    e32   (ib, sq_thread_resource_mgmt);
+    e32   (ib, sq_stack_resource_mgmt_1);
+    e32   (ib, sq_stack_resource_mgmt_2);
+
+}
+
+void
+set_render_target(ScrnInfoPtr pScrn, drmBufPtr ib, cb_config_t *cb_conf)
+{
+    uint32_t cb_color_info;
+    int pitch, slice, h;
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    /* Multi-bit fields are shifted into place first; single-bit options are OR'ed in below when set. */
+    cb_color_info = ((cb_conf->endian      << ENDIAN_shift)				|
+		     (cb_conf->format      << CB_COLOR0_INFO__FORMAT_shift)		|
+		     (cb_conf->array_mode  << CB_COLOR0_INFO__ARRAY_MODE_shift)		|
+		     (cb_conf->number_type << NUMBER_TYPE_shift)			|
+		     (cb_conf->comp_swap   << COMP_SWAP_shift)				|
+		     (cb_conf->tile_mode   << CB_COLOR0_INFO__TILE_MODE_shift));
+    if (cb_conf->read_size)
+	cb_color_info |= CB_COLOR0_INFO__READ_SIZE_bit;
+    if (cb_conf->blend_clamp)
+	cb_color_info |= BLEND_CLAMP_bit;
+    if (cb_conf->clear_color)
+	cb_color_info |= CLEAR_COLOR_bit;
+    if (cb_conf->blend_bypass)
+	cb_color_info |= BLEND_BYPASS_bit;
+    if (cb_conf->blend_float32)
+	cb_color_info |= BLEND_FLOAT32_bit;
+    if (cb_conf->simple_float)
+	cb_color_info |= SIMPLE_FLOAT_bit;
+    if (cb_conf->round_mode)
+	cb_color_info |= CB_COLOR0_INFO__ROUND_MODE_bit;
+    if (cb_conf->tile_compact)
+	cb_color_info |= TILE_COMPACT_bit;
+    if (cb_conf->source_format)
+	cb_color_info |= SOURCE_FORMAT_bit;
+    /* pitch = w/8 - 1; h is rounded up to a multiple of 8; slice = (w*h)/64 - 1. */
+    pitch = (cb_conf->w / 8) - 1;
+    h = (cb_conf->h + 7) & ~7;
+    slice = ((cb_conf->w * h) / 64) - 1;
+    /* Every CB_COLOR0_* register below is offset by 4 * id; the base is written as base >> 8. */
+    ereg (ib, (CB_COLOR0_BASE + (4 * cb_conf->id)), (cb_conf->base >> 8));
+
+    // rv6xx workaround
+    if ((info->ChipFamily > CHIP_FAMILY_R600) &&
+	(info->ChipFamily < CHIP_FAMILY_RV770)) {
+	pack3 (ib, IT_SURFACE_BASE_UPDATE, 1);
+	e32   (ib, (2 << cb_conf->id)); /* one bit per target id, starting at bit 1 */
+    }
+
+    // pitch only for ARRAY_LINEAR_GENERAL, other tiling modes require addrlib
+    ereg (ib, (CB_COLOR0_SIZE + (4 * cb_conf->id)), ((pitch << PITCH_TILE_MAX_shift)	|
+						     (slice << SLICE_TILE_MAX_shift)));
+    ereg (ib, (CB_COLOR0_VIEW + (4 * cb_conf->id)), ((0    << SLICE_START_shift)		|
+						     (0    << SLICE_MAX_shift)));
+    ereg (ib, (CB_COLOR0_INFO + (4 * cb_conf->id)), cb_color_info);
+    ereg (ib, (CB_COLOR0_TILE + (4 * cb_conf->id)), (0     >> 8));	// CMASK per-tile data base/256
+    ereg (ib, (CB_COLOR0_FRAG + (4 * cb_conf->id)), (0     >> 8));	// FMASK per-tile data base/256
+    ereg (ib, (CB_COLOR0_MASK + (4 * cb_conf->id)), ((0    << CMASK_BLOCK_MAX_shift)	|
+						     (0    << FMASK_TILE_MAX_shift)));
+}
+
+void
+cp_set_surface_sync(ScrnInfoPtr pScrn, drmBufPtr ib, uint32_t sync_type, uint32_t size, uint64_t mc_addr)
+{
+    uint32_t cp_coher_size;	/* size in 256-byte units, rounded up; 0xffffffff is passed through unchanged */
+    if (size == 0xffffffff)
+	cp_coher_size = 0xffffffff;
+    else
+	cp_coher_size = (size >> 8) + ((size & 0xff) ? 1 : 0);	/* round up without wrapping (size + 255 overflowed above 0xffffff00) */
+    /* Program control, size and base (mc_addr >> 8), then wait on CP_COHER_STATUS masked with STATUS_bit to equal 0. */
+    ereg  (ib, CP_COHER_CNTL,                       sync_type);
+    ereg  (ib, CP_COHER_SIZE,                       cp_coher_size);
+    ereg  (ib, CP_COHER_BASE,                       (mc_addr >> 8));
+    pack3 (ib, IT_WAIT_REG_MEM, 6);
+    e32   (ib, 0x00000003);						// ME, Register, EqualTo
+    e32   (ib, CP_COHER_STATUS >> 2);
+    e32   (ib, 0);
+    e32   (ib, 0);							// Ref value
+    e32   (ib, STATUS_bit);						// Ref mask
+    e32   (ib, 10);							// Wait interval
+}
+
+void
+fs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *fs_conf)
+{
+    /* Resource word: GPR count and stack size, plus the DX10 clamp flag when requested. */
+    uint32_t resources = (fs_conf->num_gprs << NUM_GPRS_shift);
+
+    resources |= (fs_conf->stack_size << STACK_SIZE_shift);
+    if (fs_conf->dx10_clamp)
+	resources |= SQ_PGM_RESOURCES_FS__DX10_CLAMP_bit;
+
+    /* Program start (shader_addr >> 8), resource word, then a zero CF offset. */
+    ereg (ib, SQ_PGM_START_FS, fs_conf->shader_addr >> 8);
+    ereg (ib, SQ_PGM_RESOURCES_FS, resources);
+    ereg (ib, SQ_PGM_CF_OFFSET_FS, 0);
+}
+
+void
+vs_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *vs_conf)
+{
+    /* Shifted fields are merged first; single-bit flags follow. */
+    uint32_t resources = (vs_conf->num_gprs << NUM_GPRS_shift);
+
+    resources |= (vs_conf->stack_size << STACK_SIZE_shift);
+    /* A zero fetch_cache_lines contributes no bits, so it is merged without a guard. */
+    resources |= (vs_conf->fetch_cache_lines << FETCH_CACHE_LINES_shift);
+    if (vs_conf->dx10_clamp)
+	resources |= SQ_PGM_RESOURCES_VS__DX10_CLAMP_bit;
+    if (vs_conf->uncached_first_inst)
+	resources |= UNCACHED_FIRST_INST_bit;
+
+    /* Program start (shader_addr >> 8), resource word, then a zero CF offset. */
+    ereg (ib, SQ_PGM_START_VS, vs_conf->shader_addr >> 8);
+    ereg (ib, SQ_PGM_RESOURCES_VS, resources);
+    ereg (ib, SQ_PGM_CF_OFFSET_VS, 0);
+}
+
+void
+ps_setup(ScrnInfoPtr pScrn, drmBufPtr ib, shader_config_t *ps_conf)
+{
+    /* Shifted fields are merged first; single-bit flags follow. */
+    uint32_t resources = (ps_conf->num_gprs << NUM_GPRS_shift);
+
+    resources |= (ps_conf->stack_size << STACK_SIZE_shift);
+    /* A zero fetch_cache_lines contributes no bits, so it is merged without a guard. */
+    resources |= (ps_conf->fetch_cache_lines << FETCH_CACHE_LINES_shift);
+    if (ps_conf->dx10_clamp)
+	resources |= SQ_PGM_RESOURCES_PS__DX10_CLAMP_bit;
+    if (ps_conf->uncached_first_inst)
+	resources |= UNCACHED_FIRST_INST_bit;
+    if (ps_conf->clamp_consts)
+	resources |= CLAMP_CONSTS_bit;
+
+    /* Program start (shader_addr >> 8), resource word, export mode, then a zero CF offset. */
+    ereg (ib, SQ_PGM_START_PS, ps_conf->shader_addr >> 8);
+    ereg (ib, SQ_PGM_RESOURCES_PS, resources);
+    ereg (ib, SQ_PGM_EXPORTS_PS, ps_conf->export_mode);
+    ereg (ib, SQ_PGM_CF_OFFSET_PS, 0);
+}
+
+void
+set_alu_consts(ScrnInfoPtr pScrn, drmBufPtr ib, int offset, int count, float *const_buf)
+{
+    const int ndwords = count * (SQ_ALU_CONSTANT_offset >> 2);	/* floats read from const_buf */
+    int idx = 0;
+
+    pack0 (ib, SQ_ALU_CONSTANT + offset * SQ_ALU_CONSTANT_offset, ndwords);
+    while (idx < ndwords)
+	efloat (ib, const_buf[idx++]);
+}
+
+void
+set_vtx_resource(ScrnInfoPtr pScrn, drmBufPtr ib, vtx_resource_t *res)
+{
+    uint32_t sq_vtx_constant_word2;
+    /* Word 2 combines the address bits above bit 31, the stride (vtx_size_dw * 4), format, number format and endian swap. */
+    sq_vtx_constant_word2 = ((((res->vb_addr) >> 32) & BASE_ADDRESS_HI_mask) |
+			     ((res->vtx_size_dw << 2) << SQ_VTX_CONSTANT_WORD2_0__STRIDE_shift) |
+			     (res->format << SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_shift) |
+			     (res->num_format_all << SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_shift) |
+			     (res->endian << SQ_VTX_CONSTANT_WORD2_0__ENDIAN_SWAP_shift));
+    if (res->clamp_x)
+	    sq_vtx_constant_word2 |= SQ_VTX_CONSTANT_WORD2_0__CLAMP_X_bit;
+
+    if (res->format_comp_all)
+	    sq_vtx_constant_word2 |= SQ_VTX_CONSTANT_WORD2_0__FORMAT_COMP_ALL_bit;
+
+    if (res->srf_mode_all)
+	    sq_vtx_constant_word2 |= SQ_VTX_CONSTANT_WORD2_0__SRF_MODE_ALL_bit;
+    /* Seven words are written at SQ_VTX_RESOURCE + id * SQ_VTX_RESOURCE_offset. */
+    pack0 (ib, SQ_VTX_RESOURCE + res->id * SQ_VTX_RESOURCE_offset, 7);
+    e32 (ib, res->vb_addr & 0xffffffff);				// 0: BASE_ADDRESS
+    e32 (ib, (res->vtx_num_entries << 2) - 1);			// 1: SIZE
+    e32 (ib, sq_vtx_constant_word2);	// 2: BASE_HI, STRIDE, CLAMP, FORMAT, ENDIAN
+    e32 (ib, res->mem_req_size << MEM_REQUEST_SIZE_shift);		// 3: MEM_REQUEST_SIZE ?!?
+    e32 (ib, 0);							// 4: n/a
+    e32 (ib, 0);							// 5: n/a
+    e32 (ib, SQ_TEX_VTX_VALID_BUFFER << SQ_VTX_CONSTANT_WORD6_0__TYPE_shift);	// 6: TYPE
+}
+
+void
+set_tex_resource(ScrnInfoPtr pScrn, drmBufPtr ib, tex_resource_t *tex_res)
+{
+    uint32_t sq_tex_resource_word0, sq_tex_resource_word1, sq_tex_resource_word4;
+    uint32_t sq_tex_resource_word5, sq_tex_resource_word6;
+    /* Word 0: dimension and tile mode; pitch and width are added only when w is non-zero. */
+    sq_tex_resource_word0 = ((tex_res->dim << DIM_shift) |
+			     (tex_res->tile_mode << SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift));
+
+    if (tex_res->w)
+	sq_tex_resource_word0 |= (((((tex_res->pitch + 7) >> 3) - 1) << PITCH_shift) | /* pitch rounded up to units of 8, minus one */
+				  ((tex_res->w - 1) << TEX_WIDTH_shift));
+
+    if (tex_res->tile_type)
+	sq_tex_resource_word0 |= TILE_TYPE_bit;
+    /* Word 1: data format; height and depth are stored minus one, and only when non-zero. */
+    sq_tex_resource_word1 = (tex_res->format << SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift);
+
+    if (tex_res->h)
+	sq_tex_resource_word1 |= ((tex_res->h - 1) << TEX_HEIGHT_shift);
+    if (tex_res->depth)
+	sq_tex_resource_word1 |= ((tex_res->depth - 1) << TEX_DEPTH_shift);
+    /* Word 4: per-channel format components, number format, endian swap, request size, destination selects, base level. */
+    sq_tex_resource_word4 = ((tex_res->format_comp_x << FORMAT_COMP_X_shift) |
+			     (tex_res->format_comp_y << FORMAT_COMP_Y_shift) |
+			     (tex_res->format_comp_z << FORMAT_COMP_Z_shift) |
+			     (tex_res->format_comp_w << FORMAT_COMP_W_shift) |
+			     (tex_res->num_format_all << SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_shift) |
+			     (tex_res->endian << SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_shift) |
+			     (tex_res->request_size << REQUEST_SIZE_shift) |
+			     (tex_res->dst_sel_x << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift) |
+			     (tex_res->dst_sel_y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift) |
+			     (tex_res->dst_sel_z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift) |
+			     (tex_res->dst_sel_w << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift) |
+			     (tex_res->base_level << BASE_LEVEL_shift));
+
+    if (tex_res->srf_mode_all)
+	sq_tex_resource_word4 |= SQ_TEX_RESOURCE_WORD4_0__SRF_MODE_ALL_bit;
+    if (tex_res->force_degamma)
+	sq_tex_resource_word4 |= SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit;
+    /* Word 5: last level and the base/last array indices. */
+    sq_tex_resource_word5 = ((tex_res->last_level << LAST_LEVEL_shift) |
+			     (tex_res->base_array << BASE_ARRAY_shift) |
+			     (tex_res->last_array << LAST_ARRAY_shift));
+    /* Word 6: MPEG clamp, perf modulation and the fixed SQ_TEX_VTX_VALID_TEXTURE type. */
+    sq_tex_resource_word6 = ((tex_res->mpeg_clamp << MPEG_CLAMP_shift) |
+			     (tex_res->perf_modulation << PERF_MODULATION_shift) |
+			     (SQ_TEX_VTX_VALID_TEXTURE << SQ_TEX_RESOURCE_WORD6_0__TYPE_shift));
+
+    if (tex_res->interlaced)
+	sq_tex_resource_word6 |= INTERLACED_bit;
+    /* Seven words at SQ_TEX_RESOURCE + id * SQ_TEX_RESOURCE_offset; words 2 and 3 are base >> 8 and mip_base >> 8. */
+    pack0 (ib, SQ_TEX_RESOURCE + tex_res->id * SQ_TEX_RESOURCE_offset, 7);
+    e32   (ib, sq_tex_resource_word0);
+    e32   (ib, sq_tex_resource_word1);
+    e32   (ib, ((tex_res->base) >> 8));
+    e32   (ib, ((tex_res->mip_base) >> 8));
+    e32   (ib, sq_tex_resource_word4);
+    e32   (ib, sq_tex_resource_word5);
+    e32   (ib, sq_tex_resource_word6);
+}
+
+void
+set_tex_sampler (ScrnInfoPtr pScrn, drmBufPtr ib, tex_sampler_t *s)
+{
+    uint32_t sq_tex_sampler_word0, sq_tex_sampler_word1, sq_tex_sampler_word2;
+    /* Word 0: clamp modes, filters, border colour type, depth compare and chroma key, plus three optional flags. */
+    sq_tex_sampler_word0 = ((s->clamp_x       << SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_shift)		|
+			    (s->clamp_y       << CLAMP_Y_shift)					|
+			    (s->clamp_z       << CLAMP_Z_shift)					|
+			    (s->xy_mag_filter << XY_MAG_FILTER_shift)				|
+			    (s->xy_min_filter << XY_MIN_FILTER_shift)				|
+			    (s->z_filter      << Z_FILTER_shift)	|
+			    (s->mip_filter    << MIP_FILTER_shift)				|
+			    (s->border_color  << BORDER_COLOR_TYPE_shift)			|
+			    (s->depth_compare << DEPTH_COMPARE_FUNCTION_shift)			|
+			    (s->chroma_key    << CHROMA_KEY_shift));
+    if (s->point_sampling_clamp)
+	sq_tex_sampler_word0 |= POINT_SAMPLING_CLAMP_bit;
+    if (s->tex_array_override)
+	sq_tex_sampler_word0 |= TEX_ARRAY_OVERRIDE_bit;
+    if (s->lod_uses_minor_axis)
+	sq_tex_sampler_word0 |= LOD_USES_MINOR_AXIS_bit;
+    /* Word 1: minimum LOD, maximum LOD and LOD bias. */
+    sq_tex_sampler_word1 = ((s->min_lod       << MIN_LOD_shift)					|
+			    (s->max_lod       << MAX_LOD_shift)					|
+			    (s->lod_bias      << SQ_TEX_SAMPLER_WORD1_0__LOD_BIAS_shift));
+    /* Word 2: secondary LOD bias and the two perf fields, plus six optional flags. */
+    sq_tex_sampler_word2 = ((s->lod_bias2     << LOD_BIAS_SEC_shift)	|
+			    (s->perf_mip      << PERF_MIP_shift)	|
+			    (s->perf_z        << PERF_Z_shift));
+    if (s->mc_coord_truncate)
+	sq_tex_sampler_word2 |= MC_COORD_TRUNCATE_bit;
+    if (s->force_degamma)
+	sq_tex_sampler_word2 |= SQ_TEX_SAMPLER_WORD2_0__FORCE_DEGAMMA_bit;
+    if (s->high_precision_filter)
+	sq_tex_sampler_word2 |= HIGH_PRECISION_FILTER_bit;
+    if (s->fetch_4)
+	sq_tex_sampler_word2 |= FETCH_4_bit;
+    if (s->sample_is_pcf)
+	sq_tex_sampler_word2 |= SAMPLE_IS_PCF_bit;
+    if (s->type)
+	sq_tex_sampler_word2 |= SQ_TEX_SAMPLER_WORD2_0__TYPE_bit;
+    /* Three words at SQ_TEX_SAMPLER_WORD + id * SQ_TEX_SAMPLER_WORD_offset. */
+    pack0 (ib, SQ_TEX_SAMPLER_WORD + s->id * SQ_TEX_SAMPLER_WORD_offset, 3);
+    e32   (ib, sq_tex_sampler_word0);
+    e32   (ib, sq_tex_sampler_word1);
+    e32   (ib, sq_tex_sampler_word2);
+}
+
+//XXX deal with clip offsets in clip setup
+void
+set_screen_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
+{
+    /* Pack each corner as X | Y using that register's own shifts, then write TL followed by BR. */
+    const uint32_t top_left = ((x1 << PA_SC_SCREEN_SCISSOR_TL__TL_X_shift) | (y1 << PA_SC_SCREEN_SCISSOR_TL__TL_Y_shift));
+    const uint32_t bottom_right = ((x2 << PA_SC_SCREEN_SCISSOR_BR__BR_X_shift) | (y2 << PA_SC_SCREEN_SCISSOR_BR__BR_Y_shift));
+    ereg  (ib, PA_SC_SCREEN_SCISSOR_TL, top_left);
+    ereg  (ib, PA_SC_SCREEN_SCISSOR_BR, bottom_right);
+}
+
+void
+set_vport_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x2, int y2)
+{
+    /* id selects the register pair: base register plus id times the matching *_offset stride. */
+    const uint32_t tl_reg = PA_SC_VPORT_SCISSOR_0_TL + id * PA_SC_VPORT_SCISSOR_0_TL_offset;
+    const uint32_t br_reg = PA_SC_VPORT_SCISSOR_0_BR + id * PA_SC_VPORT_SCISSOR_0_BR_offset;
+    const uint32_t top_left = ((x1 << PA_SC_VPORT_SCISSOR_0_TL__TL_X_shift) | (y1 << PA_SC_VPORT_SCISSOR_0_TL__TL_Y_shift) |
+			       WINDOW_OFFSET_DISABLE_bit);
+    const uint32_t bottom_right = ((x2 << PA_SC_VPORT_SCISSOR_0_BR__BR_X_shift) | (y2 << PA_SC_VPORT_SCISSOR_0_BR__BR_Y_shift));
+    ereg  (ib, tl_reg, top_left);
+    ereg  (ib, br_reg, bottom_right);
+}
+
+void
+set_generic_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
+{
+    /* TL gets (x1, y1) plus WINDOW_OFFSET_DISABLE_bit; BR gets (x2, y2); each corner uses its own register's field shifts. */
+    ereg  (ib, PA_SC_GENERIC_SCISSOR_TL,            ((x1 << PA_SC_GENERIC_SCISSOR_TL__TL_X_shift) |
+						 (y1 << PA_SC_GENERIC_SCISSOR_TL__TL_Y_shift) |
+						 WINDOW_OFFSET_DISABLE_bit));
+    ereg  (ib, PA_SC_GENERIC_SCISSOR_BR,            ((x2 << PA_SC_GENERIC_SCISSOR_BR__BR_X_shift) |
+						 (y2 << PA_SC_GENERIC_SCISSOR_BR__BR_Y_shift)));	/* BR field shift, not the TL one */
+}
+
+void
+set_window_scissor(ScrnInfoPtr pScrn, drmBufPtr ib, int x1, int y1, int x2, int y2)
+{
+    /* (x1, y1) is packed into TL together with WINDOW_OFFSET_DISABLE_bit; (x2, y2) into BR. */
+    const uint32_t top_left = ((x1 << PA_SC_WINDOW_SCISSOR_TL__TL_X_shift) | (y1 << PA_SC_WINDOW_SCISSOR_TL__TL_Y_shift) |
+			       WINDOW_OFFSET_DISABLE_bit);
+    const uint32_t bottom_right = ((x2 << PA_SC_WINDOW_SCISSOR_BR__BR_X_shift) | (y2 << PA_SC_WINDOW_SCISSOR_BR__BR_Y_shift));
+    ereg  (ib, PA_SC_WINDOW_SCISSOR_TL, top_left);
+    ereg  (ib, PA_SC_WINDOW_SCISSOR_BR, bottom_right);
+}
+
+void
+set_clip_rect(ScrnInfoPtr pScrn, drmBufPtr ib, int id, int x1, int y1, int x2, int y2)
+{
+    /* id selects the clip rectangle: base register plus id times the matching *_offset stride. */
+    const uint32_t tl_reg = PA_SC_CLIPRECT_0_TL + id * PA_SC_CLIPRECT_0_TL_offset;
+    const uint32_t br_reg = PA_SC_CLIPRECT_0_BR + id * PA_SC_CLIPRECT_0_BR_offset;
+    const uint32_t top_left = ((x1 << PA_SC_CLIPRECT_0_TL__TL_X_shift) | (y1 << PA_SC_CLIPRECT_0_TL__TL_Y_shift));
+    const uint32_t bottom_right = ((x2 << PA_SC_CLIPRECT_0_BR__BR_X_shift) | (y2 << PA_SC_CLIPRECT_0_BR__BR_Y_shift));
+    ereg  (ib, tl_reg, top_left);
+    ereg  (ib, br_reg, bottom_right);
+}
+
+/*
+ * Setup of default state
+ */
+
+void
+set_default_state(ScrnInfoPtr pScrn, drmBufPtr ib)
+{
+    tex_resource_t tex_res;
+    shader_config_t fs_conf;
+    sq_config_t sq_conf;
+    int i;
+    RADEONInfoPtr info = RADEONPTR(pScrn);
+    struct radeon_accel_state *accel_state = info->accel_state;
+
+    memset(&tex_res, 0, sizeof(tex_resource_t));
+    memset(&fs_conf, 0, sizeof(shader_config_t));
+
+#if 1
+    if (accel_state->XInited3D)
+	return;
+#endif
+
+    accel_state->XInited3D = TRUE;
+
+    wait_3d_idle(pScrn, ib);
+
+    // ASIC specific setup, see drm
+    if (info->ChipFamily < CHIP_FAMILY_RV770) {
+	ereg  (ib, TA_CNTL_AUX,                     (( 3 << GRADIENT_CREDIT_shift)		|
+						 (28 << TD_FIFO_CREDIT_shift)));
+	ereg  (ib, VC_ENHANCE,                      0);
+	ereg  (ib, R7xx_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, 0);
+	ereg  (ib, DB_DEBUG,                        0x82000000); /* ? */
+	ereg  (ib, DB_WATERMARKS,		        ((4  << DEPTH_FREE_shift)		|
+						 (16 << DEPTH_FLUSH_shift)		|
+						 (0  << FORCE_SUMMARIZE_shift)		|
+						 (4  << DEPTH_PENDING_FREE_shift)	|
+						 (16 << DEPTH_CACHELINE_FREE_shift)	|
+						 0));
+    } else {
+	ereg (ib, TA_CNTL_AUX,                      (( 2 << GRADIENT_CREDIT_shift)		|
+						 (28 << TD_FIFO_CREDIT_shift)));
+	ereg (ib, VC_ENHANCE,                       0);
+	ereg (ib, R7xx_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, VS_PC_LIMIT_ENABLE_bit);
+	ereg (ib, DB_DEBUG,                         0);
+	ereg (ib, DB_WATERMARKS,                    ((4  << DEPTH_FREE_shift)		|
+						 (16 << DEPTH_FLUSH_shift)		|
+						 (0  << FORCE_SUMMARIZE_shift)		|
+						 (4  << DEPTH_PENDING_FREE_shift)	|
+						 (4  << DEPTH_CACHELINE_FREE_shift)	|
+						 0));
+    }
+
+    reset_td_samplers(pScrn, ib);
+    reset_dx9_alu_consts(pScrn, ib);
+    reset_bool_loop_const (pScrn, ib);
+    reset_sampler_const (pScrn, ib);
+
+    // SQ
+    sq_conf.ps_prio = 0;
+    sq_conf.vs_prio = 1;
+    sq_conf.gs_prio = 2;
+    sq_conf.es_prio = 3;
+    // need to set stack/thread/gpr limits based on the asic
+    // for now just set them low enough so any card will work
+    // see r600_cp.c in the drm
+    switch (info->ChipFamily) {
+    case CHIP_FAMILY_R600:
+	sq_conf.num_ps_gprs = 192;
+	sq_conf.num_vs_gprs = 56;
+	sq_conf.num_temp_gprs = 4;
+	sq_conf.num_gs_gprs = 0;
+	sq_conf.num_es_gprs = 0;
+	sq_conf.num_ps_threads = 136;
+	sq_conf.num_vs_threads = 48;
+	sq_conf.num_gs_threads = 4;
+	sq_conf.num_es_threads = 4;
+	sq_conf.num_ps_stack_entries = 128;
+	sq_conf.num_vs_stack_entries = 128;
+	sq_conf.num_gs_stack_entries = 0;
+	sq_conf.num_es_stack_entries = 0;
+	break;
+    case CHIP_FAMILY_RV630:
+    case CHIP_FAMILY_RV635:
+	sq_conf.num_ps_gprs = 84;
+	sq_conf.num_vs_gprs = 36;
+	sq_conf.num_temp_gprs = 4;
+	sq_conf.num_gs_gprs = 0;
+	sq_conf.num_es_gprs = 0;
+	sq_conf.num_ps_threads = 144;
+	sq_conf.num_vs_threads = 40;
+	sq_conf.num_gs_threads = 4;
+	sq_conf.num_es_threads = 4;
+	sq_conf.num_ps_stack_entries = 40;
+	sq_conf.num_vs_stack_entries = 40;
+	sq_conf.num_gs_stack_entries = 32;
+	sq_conf.num_es_stack_entries = 16;
+	break;
+    case CHIP_FAMILY_RV610:
+    case CHIP_FAMILY_RV620:
+    case CHIP_FAMILY_RS780:
+    default:
+	sq_conf.num_ps_gprs = 84;
+	sq_conf.num_vs_gprs = 36;
+	sq_conf.num_temp_gprs = 4;
+	sq_conf.num_gs_gprs = 0;
+	sq_conf.num_es_gprs = 0;
+	sq_conf.num_ps_threads = 136;
+	sq_conf.num_vs_threads = 48;
+	sq_conf.num_gs_threads = 4;
+	sq_conf.num_es_threads = 4;
+	sq_conf.num_ps_stack_entries = 40;
+	sq_conf.num_vs_stack_entries = 40;
+	sq_conf.num_gs_stack_entries = 32;
+	sq_conf.num_es_stack_entries = 16;
+	break;
+    case CHIP_FAMILY_RV670:
+	sq_conf.num_ps_gprs = 144;
+	sq_conf.num_vs_gprs = 40;
+	sq_conf.num_temp_gprs = 4;
+	sq_conf.num_gs_gprs = 0;
+	sq_conf.num_es_gprs = 0;
+	sq_conf.num_ps_threads = 136;
+	sq_conf.num_vs_threads = 48;
+	sq_conf.num_gs_threads = 4;
+	sq_conf.num_es_threads = 4;
+	sq_conf.num_ps_stack_entries = 40;
+	sq_conf.num_vs_stack_entries = 40;
+	sq_conf.num_gs_stack_entries = 32;
+	sq_conf.num_es_stack_entries = 16;
+	break;
+    case CHIP_FAMILY_RV770:
+	sq_conf.num_ps_gprs = 192;
+	sq_conf.num_vs_gprs = 56;
+	sq_conf.num_temp_gprs = 4;
+	sq_conf.num_gs_gprs = 0;
+	sq_conf.num_es_gprs = 0;
+	sq_conf.num_ps_threads = 188;
+	sq_conf.num_vs_threads = 60;
+	sq_conf.num_gs_threads = 0;
+	sq_conf.num_es_threads = 0;
+	sq_conf.num_ps_stack_entries = 256;
+	sq_conf.num_vs_stack_entries = 256;
+	sq_conf.num_gs_stack_entries = 0;
+	sq_conf.num_es_stack_entries = 0;
+	break;
+    case CHIP_FAMILY_RV730:
+	sq_conf.num_ps_gprs = 84;
+	sq_conf.num_vs_gprs = 36;
+	sq_conf.num_temp_gprs = 4;
+	sq_conf.num_gs_gprs = 0;
+	sq_conf.num_es_gprs = 0;
+	sq_conf.num_ps_threads = 188;
+	sq_conf.num_vs_threads = 60;
+	sq_conf.num_gs_threads = 0;
+	sq_conf.num_es_threads = 0;
+	sq_conf.num_ps_stack_entries = 128;
+	sq_conf.num_vs_stack_entries = 128;
+	sq_conf.num_gs_stack_entries = 0;
+	sq_conf.num_es_stack_entries = 0;
+	break;
+    case CHIP_FAMILY_RV710:
+	sq_conf.num_ps_gprs = 192;
+	sq_conf.num_vs_gprs = 56;
+	sq_conf.num_temp_gprs = 4;
+	sq_conf.num_gs_gprs = 0;
+	sq_conf.num_es_gprs = 0;
+	sq_conf.num_ps_threads = 144;
+	sq_conf.num_vs_threads = 48;
+	sq_conf.num_gs_threads = 0;
+	sq_conf.num_es_threads = 0;
+	sq_conf.num_ps_stack_entries = 128;
+	sq_conf.num_vs_stack_entries = 128;
+	sq_conf.num_gs_stack_entries = 0;
+	sq_conf.num_es_stack_entries = 0;
+	break;
+    }
+
+    sq_setup(pScrn, ib, &sq_conf);
+
+    ereg  (ib, SQ_VTX_BASE_VTX_LOC,                 0);
+    ereg  (ib, SQ_VTX_START_INST_LOC,               0);
+
+    pack0 (ib, SQ_ESGS_RING_ITEMSIZE, 9);
+    e32   (ib, 0);							// SQ_ESGS_RING_ITEMSIZE
+    e32   (ib, 0);							// SQ_GSVS_RING_ITEMSIZE
+    e32   (ib, 0);							// SQ_ESTMP_RING_ITEMSIZE
+    e32   (ib, 0);							// SQ_GSTMP_RING_ITEMSIZE
+    e32   (ib, 0);							// SQ_VSTMP_RING_ITEMSIZE
+    e32   (ib, 0);							// SQ_PSTMP_RING_ITEMSIZE
+    e32   (ib, 0);							// SQ_FBUF_RING_ITEMSIZE
+    e32   (ib, 0);							// SQ_REDUC_RING_ITEMSIZE
+    e32   (ib, 0);							// SQ_GS_VERT_ITEMSIZE
+
+    // DB
+    ereg  (ib, DB_DEPTH_INFO,                       0);
+    ereg  (ib, DB_STENCIL_CLEAR,                    0);
+    ereg  (ib, DB_DEPTH_CLEAR,                      0);
+    ereg  (ib, DB_STENCILREFMASK,                   0);
+    ereg  (ib, DB_STENCILREFMASK_BF,                0);
+    ereg  (ib, DB_DEPTH_CONTROL,                    0);
+    ereg  (ib, DB_RENDER_CONTROL,                   STENCIL_COMPRESS_DISABLE_bit | DEPTH_COMPRESS_DISABLE_bit);
+    if (info->ChipFamily < CHIP_FAMILY_RV770)
+	ereg  (ib, DB_RENDER_OVERRIDE,              FORCE_SHADER_Z_ORDER_bit);
+    else
+	ereg  (ib, DB_RENDER_OVERRIDE,              0);
+    ereg  (ib, DB_ALPHA_TO_MASK,                    ((2 << ALPHA_TO_MASK_OFFSET0_shift)	|
+						 (2 << ALPHA_TO_MASK_OFFSET1_shift)	|
+						 (2 << ALPHA_TO_MASK_OFFSET2_shift)	|
+						 (2 << ALPHA_TO_MASK_OFFSET3_shift)));
+
+    // SX
+    ereg  (ib, SX_ALPHA_TEST_CONTROL,               0);
+    ereg  (ib, SX_ALPHA_REF,                        0);
+
+    // CB
+    reset_cb(pScrn, ib);
+
+    pack0 (ib, CB_BLEND_RED, 4);
+    e32   (ib, 0x00000000);
+    e32   (ib, 0x00000000);
+    e32   (ib, 0x00000000);
+    e32   (ib, 0x00000000);
+
+    /* CB_COLOR_CONTROL.PER_MRT_BLEND is off */
+    // RV6xx+ have per-MRT blend
+    if (info->ChipFamily > CHIP_FAMILY_R600) {
+	pack0 (ib, CB_BLEND0_CONTROL, CB_BLEND0_CONTROL_num);
+	for   (i = 0; i < CB_BLEND0_CONTROL_num; i++)
+	    e32 (ib, 0);
+    }
+
+    ereg  (ib, CB_BLEND_CONTROL,                    0);
+
+    if (info->ChipFamily < CHIP_FAMILY_RV770) {
+	pack0 (ib, CB_FOG_RED, 3);
+	e32   (ib, 0x00000000);
+	e32   (ib, 0x00000000);
+	e32   (ib, 0x00000000);
+    }
+
+    ereg  (ib, CB_COLOR_CONTROL,                    0);
+    pack0 (ib, CB_CLRCMP_CONTROL, 4);
+    e32   (ib, 1 << CLRCMP_FCN_SEL_shift);				// CB_CLRCMP_CONTROL: use CLRCMP_FCN_SRC
+    e32   (ib, 0);							// CB_CLRCMP_SRC
+    e32   (ib, 0);							// CB_CLRCMP_DST
+    e32   (ib, 0);							// CB_CLRCMP_MSK
+
+
+    if (info->ChipFamily < CHIP_FAMILY_RV770) {
+	pack0 (ib, CB_CLEAR_RED, 4);
+	efloat(ib, 1.0);						/* WTF? */
+	efloat(ib, 0.0);
+	efloat(ib, 1.0);
+	efloat(ib, 1.0);
+    }
+    ereg  (ib, CB_TARGET_MASK,                      (0x0f << TARGET0_ENABLE_shift));
+
+    // SC
+    set_generic_scissor(pScrn, ib, 0, 0, 8192, 8192);
+    set_screen_scissor (pScrn, ib, 0, 0, 8192, 8192);
+    ereg  (ib, PA_SC_WINDOW_OFFSET,                 ((0 << WINDOW_X_OFFSET_shift) |
+						 (0 << WINDOW_Y_OFFSET_shift)));
+    set_window_scissor (pScrn, ib, 0, 0, 8192, 8192);
+
+    ereg  (ib, PA_SC_CLIPRECT_RULE,                 CLIP_RULE_mask);
+
+    /* clip boolean is set to always visible -> doesn't matter */
+    for (i = 0; i < PA_SC_CLIPRECT_0_TL_num; i++)
+	set_clip_rect (pScrn, ib, i, 0, 0, 8192, 8192);
+
+    if (info->ChipFamily < CHIP_FAMILY_RV770)
+	ereg  (ib, R7xx_PA_SC_EDGERULE,             0x00000000);
+    else
+	ereg  (ib, R7xx_PA_SC_EDGERULE,             0xAAAAAAAA); /* ? */
+
+    for (i = 0; i < PA_SC_VPORT_SCISSOR_0_TL_num; i++) {
+	set_vport_scissor (pScrn, ib, i, 0, 0, 8192, 8192);
+	pack0 (ib, PA_SC_VPORT_ZMIN_0 + i * PA_SC_VPORT_ZMIN_0_offset, 2);
+	efloat(ib, 0.0);
+	efloat(ib, 1.0);
+    }
+
+    if (info->ChipFamily < CHIP_FAMILY_RV770)
+	ereg  (ib, PA_SC_MODE_CNTL,                 (WALK_ORDER_ENABLE_bit | FORCE_EOV_CNTDWN_ENABLE_bit));
+    else
+	ereg  (ib, PA_SC_MODE_CNTL,                 (FORCE_EOV_CNTDWN_ENABLE_bit | FORCE_EOV_REZ_ENABLE_bit |
+						 0x00500000)); /* ? */
+
+    ereg  (ib, PA_SC_LINE_CNTL,                     0);
+    ereg  (ib, PA_SC_AA_CONFIG,                     0);
+    ereg  (ib, PA_SC_AA_MASK,                       0xFFFFFFFF);
+
+    //XXX: double check this
+    if (info->ChipFamily > CHIP_FAMILY_R600) {
+	ereg  (ib, PA_SC_AA_SAMPLE_LOCS_MCTX,       0);
+	ereg  (ib, PA_SC_AA_SAMPLE_LOCS_8S_WD1_M,   0);
+    }
+
+    ereg  (ib, PA_SC_LINE_STIPPLE,                  0);
+    ereg  (ib, PA_SC_MPASS_PS_CNTL,                 0);
+
+    // CL
+    pack0  (ib, PA_CL_VPORT_XSCALE_0, 6);
+    efloat (ib, 0.0f);						// PA_CL_VPORT_XSCALE
+    efloat (ib, 0.0f);						// PA_CL_VPORT_XOFFSET
+    efloat (ib, 0.0f);						// PA_CL_VPORT_YSCALE
+    efloat (ib, 0.0f);						// PA_CL_VPORT_YOFFSET
+    efloat (ib, 0.0f);						// PA_CL_VPORT_ZSCALE
+    efloat (ib, 0.0f);						// PA_CL_VPORT_ZOFFSET
+    ereg   (ib, PA_CL_CLIP_CNTL,                     (CLIP_DISABLE_bit | DX_CLIP_SPACE_DEF_bit));
+    ereg   (ib, PA_CL_VTE_CNTL,                      0);
+    ereg   (ib, PA_CL_VS_OUT_CNTL,                   0);
+    ereg   (ib, PA_CL_NANINF_CNTL,                   0);
+    pack0  (ib, PA_CL_GB_VERT_CLIP_ADJ, 4);
+    efloat (ib, 1.0);						// PA_CL_GB_VERT_CLIP_ADJ
+    efloat (ib, 1.0);						// PA_CL_GB_VERT_DISC_ADJ
+    efloat (ib, 1.0);						// PA_CL_GB_HORZ_CLIP_ADJ
+    efloat (ib, 1.0);						// PA_CL_GB_HORZ_DISC_ADJ
+
+    /* user clipping planes are disabled by default */
+    pack0 (ib, PA_CL_UCP_0_X, 24);
+    for (i = 0; i < 24; i++)
+	efloat (ib, 0.0);
+
+    // SU
+    ereg  (ib, PA_SU_SC_MODE_CNTL,                  FACE_bit);
+    ereg  (ib, PA_SU_POINT_SIZE,                    0);
+    ereg  (ib, PA_SU_POINT_MINMAX,                  0);
+    ereg  (ib, PA_SU_POLY_OFFSET_DB_FMT_CNTL,       0);
+    ereg  (ib, PA_SU_POLY_OFFSET_BACK_SCALE,        0);
+    ereg  (ib, PA_SU_POLY_OFFSET_FRONT_SCALE,       0);
+    ereg  (ib, PA_SU_POLY_OFFSET_BACK_OFFSET,       0);
+    ereg  (ib, PA_SU_POLY_OFFSET_FRONT_OFFSET,      0);
+
+    ereg  (ib, PA_SU_LINE_CNTL,                     (8 << PA_SU_LINE_CNTL__WIDTH_shift)); /* Line width 1 pixel */
+    ereg  (ib, PA_SU_VTX_CNTL,                      ((2 << PA_SU_VTX_CNTL__ROUND_MODE_shift) |
+						 (5 << QUANT_MODE_shift))); /* Round to Even, fixed point 1/256 */
+    ereg  (ib, PA_SU_POLY_OFFSET_CLAMP,             0);
+
+    // SPI
+    if (info->ChipFamily < CHIP_FAMILY_RV770)
+	ereg  (ib, R7xx_SPI_THREAD_GROUPING,        0);
+    else
+	ereg  (ib, R7xx_SPI_THREAD_GROUPING,        (1 << PS_GROUPING_shift));
+
+    ereg  (ib, SPI_INTERP_CONTROL_0,                ((2 << PNT_SPRITE_OVRD_X_shift)		|
+						 (3 << PNT_SPRITE_OVRD_Y_shift)		|
+						 (0 << PNT_SPRITE_OVRD_Z_shift)		|
+						 (1 << PNT_SPRITE_OVRD_W_shift))); /* s,t,0,1 */
+    ereg  (ib, SPI_INPUT_Z,                         0);
+    ereg  (ib, SPI_FOG_CNTL,                        0);
+    ereg  (ib, SPI_FOG_FUNC_SCALE,                  0);
+    ereg  (ib, SPI_FOG_FUNC_BIAS,                   0);
+
+    pack0 (ib, SPI_VS_OUT_ID_0, SPI_VS_OUT_ID_0_num);
+    for (i = 0; i < SPI_VS_OUT_ID_0_num; i++)		/* identity mapping */
+	e32 (ib, 0x03020100 + i*0x04040404);
+    ereg  (ib, SPI_VS_OUT_CONFIG,                   0);
+
+    // clear FS
+    fs_setup(pScrn, ib, &fs_conf);
+
+    // VGT
+    ereg  (ib, VGT_MAX_VTX_INDX,                    0);
+    ereg  (ib, VGT_MIN_VTX_INDX,                    0);
+    ereg  (ib, VGT_INDX_OFFSET,                     0);
+    ereg  (ib, VGT_INSTANCE_STEP_RATE_0,            0);
+    ereg  (ib, VGT_INSTANCE_STEP_RATE_1,            0);
+
+    ereg  (ib, VGT_MULTI_PRIM_IB_RESET_INDX,        0);
+    ereg  (ib, VGT_OUTPUT_PATH_CNTL,                0);
+    ereg  (ib, VGT_GS_MODE,                         0);
+    ereg  (ib, VGT_HOS_CNTL,                        0);
+    ereg  (ib, VGT_HOS_MAX_TESS_LEVEL,              0);
+    ereg  (ib, VGT_HOS_MIN_TESS_LEVEL,              0);
+    ereg  (ib, VGT_HOS_REUSE_DEPTH,                 0);
+    ereg  (ib, VGT_GROUP_PRIM_TYPE,                 0);
+    ereg  (ib, VGT_GROUP_FIRST_DECR,                0);
+    ereg  (ib, VGT_GROUP_DECR,                      0);
+    ereg  (ib, VGT_GROUP_VECT_0_CNTL,               0);
+    ereg  (ib, VGT_GROUP_VECT_1_CNTL,               0);
+    ereg  (ib, VGT_GROUP_VECT_0_FMT_CNTL,           0);
+    ereg  (ib, VGT_GROUP_VECT_1_FMT_CNTL,           0);
+    ereg  (ib, VGT_PRIMITIVEID_EN,                  0);
+    ereg  (ib, VGT_MULTI_PRIM_IB_RESET_EN,          0);
+    ereg  (ib, VGT_STRMOUT_EN,                      0);
+    ereg  (ib, VGT_REUSE_OFF,                       0);
+    ereg  (ib, VGT_VTX_CNT_EN,                      0);
+    ereg  (ib, VGT_STRMOUT_BUFFER_EN,               0);
+
+    // clear tex resources - PS
+    for (i = 0; i < 16; i++) {
+	tex_res.id = i;
+	set_tex_resource(pScrn, ib, &tex_res);
+    }
+
+    // clear tex resources - VS
+    for (i = 160; i < 164; i++) {
+	tex_res.id = i;
+	set_tex_resource(pScrn, ib, &tex_res);
+    }
+
+    // clear tex resources - FS
+    for (i = 320; i < 335; i++) {
+	tex_res.id = i;
+	set_tex_resource(pScrn, ib, &tex_res);
+    }
+
+}
+
+
+/*
+ * Commands
+ */
+
+void
+draw_immd(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf, uint32_t *indices)
+{
+    uint32_t i, count;
+    /* Emit a draw whose index values are written inline, right after the IT_DRAW_INDEX_IMMD header. */
+    ereg  (ib, VGT_PRIMITIVE_TYPE, draw_conf->prim_type);
+    pack3 (ib, IT_INDEX_TYPE, 1);
+    e32   (ib, draw_conf->index_type);
+    pack3 (ib, IT_NUM_INSTANCES, 1);
+    e32   (ib, draw_conf->num_instances);
+
+    // dwords emitted after the DRAW_INDEX_IMMD header: 2 fixed (index count, draw initiator) + the index data
+    count = 2;
+    if (draw_conf->index_type == DI_INDEX_SIZE_16_BIT)
+	count += (draw_conf->num_indices + 1) / 2;	/* two 16-bit indices per dword, rounded up */
+    else
+	count += draw_conf->num_indices;		/* otherwise one index per dword */
+
+    pack3 (ib, IT_DRAW_INDEX_IMMD, count);
+    e32   (ib, draw_conf->num_indices);
+    e32   (ib, draw_conf->vgt_draw_initiator);
+
+    if (draw_conf->index_type == DI_INDEX_SIZE_16_BIT) {
+	for (i = 0; i < draw_conf->num_indices; i += 2) {
+	    if ((i + 1) == draw_conf->num_indices)
+		e32   (ib, indices[i]);		/* odd count: the last index is emitted on its own */
+	    else
+		e32   (ib, (indices[i] | (indices[i + 1] << 16)));	/* indices[i+1] goes in bits 16-31; no masking, so values presumably fit in 16 bits */
+	}
+    } else {
+	for (i = 0; i < draw_conf->num_indices; i++)
+	    e32   (ib, indices[i]);
+    }
+}
+
+void
+draw_auto(ScrnInfoPtr pScrn, drmBufPtr ib, draw_config_t *draw_conf)
+{
+    /* Same setup as draw_immd, but issues IT_DRAW_INDEX_AUTO with no index data (indices presumably generated by hardware - confirm). */
+    ereg  (ib, VGT_PRIMITIVE_TYPE, draw_conf->prim_type);
+    pack3 (ib, IT_INDEX_TYPE, 1);
+    e32   (ib, draw_conf->index_type);
+    pack3 (ib, IT_NUM_INSTANCES, 1);
+    e32   (ib, draw_conf->num_instances);
+    pack3 (ib, IT_DRAW_INDEX_AUTO, 2);		/* 2 dwords follow: index count and draw initiator */
+    e32   (ib, draw_conf->num_indices);
+    e32   (ib, draw_conf->vgt_draw_initiator);
+}
diff --git a/src/radeon.h b/src/radeon.h
index 66b2330..629e1ff 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -354,6 +354,8 @@ typedef enum {
 
 #define IS_DCE32_VARIANT ((info->ChipFamily >= CHIP_FAMILY_RV730))
 
+#define IS_R600_3D (info->ChipFamily >= CHIP_FAMILY_R600)
+
 #define IS_R500_3D ((info->ChipFamily == CHIP_FAMILY_RV515)  ||  \
 	(info->ChipFamily == CHIP_FAMILY_R520)   ||  \
 	(info->ChipFamily == CHIP_FAMILY_RV530)  ||  \
@@ -569,6 +571,38 @@ struct radeon_dri {
 };
 #endif
 
+#ifdef XF86DRI
+#ifdef USE_EXA
+struct r6xx_solid_vertex {	/* two floats per vertex; presumably the EXA solid-fill vertex layout - confirm in r600_exa.c */
+    float x;			/* presumably the vertex position - confirm */
+    float y;
+};
+
+struct r6xx_copy_vertex {	/* four floats per vertex; presumably the EXA copy vertex layout - confirm in r600_exa.c */
+    float x;			/* presumably the vertex position - confirm */
+    float y;
+    float s;			/* presumably the source texture coordinate - confirm */
+    float t;
+};
+
+struct r6xx_comp_vertex {	/* four floats per vertex, same layout as r6xx_copy_vertex; presumably for composite without a mask - confirm */
+    float x;			/* presumably the vertex position - confirm */
+    float y;
+    float src_s;		/* presumably the source picture coordinate - confirm */
+    float src_t;
+};
+
+struct r6xx_comp_mask_vertex {	/* six floats per vertex: r6xx_comp_vertex plus a second coordinate pair */
+    float x;			/* presumably the vertex position - confirm */
+    float y;
+    float src_s;		/* presumably the source picture coordinate - confirm */
+    float src_t;
+    float mask_s;		/* presumably the mask picture coordinate - confirm */
+    float mask_t;
+};
+#endif
+#endif
+
 struct radeon_accel_state {
     /* common accel data */
     int               fifo_slots;       /* Free slots in the FIFO (64 max)   */
@@ -609,6 +643,44 @@ struct radeon_accel_state {
     Bool              src_tile_height;
 
     Bool              vsync;
+
+    drmBufPtr         ib;
+    int               vb_index;
+
+    // shader storage
+    ExaOffscreenArea  *shaders;
+    uint32_t          solid_vs_offset;
+    uint32_t          solid_ps_offset;
+    uint32_t          copy_vs_offset;
+    uint32_t          copy_ps_offset;
+    uint32_t          comp_vs_offset;
+    uint32_t          comp_ps_offset;
+    uint32_t          comp_mask_vs_offset;
+    uint32_t          comp_mask_ps_offset;
+    uint32_t          xv_vs_offset;
+    uint32_t          xv_ps_offset;
+
+    //size/addr stuff
+    uint32_t          src_size[2];
+    uint64_t          src_mc_addr[2];
+    uint32_t          src_pitch[2];
+    uint32_t          dst_size;
+    uint64_t          dst_mc_addr;
+    uint32_t          dst_pitch;
+    uint32_t          vs_size;
+    uint64_t          vs_mc_addr;
+    uint32_t          ps_size;
+    uint64_t          ps_mc_addr;
+    uint32_t          vb_size;
+    uint64_t          vb_mc_addr;
+
+    // UTS/DFS
+    drmBufPtr         scratch;
+
+    // copy
+    Bool              same_surface;
+    int               rop;
+    uint32_t          planemask;
 #endif
 
 #ifdef USE_XAA
@@ -1032,6 +1104,7 @@ extern void RADEONDoPrepareCopyMMIO(ScrnInfoPtr pScrn,
 				    uint32_t dst_pitch_offset,
 				    uint32_t datatype, int rop,
 				    Pixel planemask);
+extern Bool R600DrawInit(ScreenPtr pScreen);
 #endif
 
 #if defined(XF86DRI) && defined(USE_EXA)
@@ -1116,15 +1189,16 @@ do {									\
 #define RADEONCP_STOP(pScrn, info)					\
 do {									\
     int _ret;								\
-     if (info->cp->CPStarted) {						\
+    if (info->cp->CPStarted) {						\
         _ret = RADEONCPStop(pScrn, info);				\
         if (_ret) {							\
 	    xf86DrvMsg(pScrn->scrnIndex, X_ERROR,			\
 		   "%s: CP stop %d\n", __FUNCTION__, _ret);		\
         }								\
         info->cp->CPStarted = FALSE;                                    \
-   }									\
-    RADEONEngineRestore(pScrn);						\
+    }									\
+    if (info->ChipFamily < CHIP_FAMILY_R600)                            \
+        RADEONEngineRestore(pScrn);					\
     info->cp->CPRuns = FALSE;						\
 } while (0)
 
@@ -1240,19 +1314,26 @@ do {									\
 
 #define RADEON_WAIT_UNTIL_2D_IDLE()					\
 do {									\
-    BEGIN_RING(2);							\
-    OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));				\
-    OUT_RING((RADEON_WAIT_2D_IDLECLEAN |				\
-	      RADEON_WAIT_HOST_IDLECLEAN));				\
-    ADVANCE_RING();							\
+    if (info->ChipFamily < CHIP_FAMILY_R600) {                          \
+	BEGIN_RING(2);                                                  \
+	OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));                     \
+	OUT_RING((RADEON_WAIT_2D_IDLECLEAN |                            \
+		  RADEON_WAIT_HOST_IDLECLEAN));                         \
+	ADVANCE_RING();                                                 \
+    }                                                                   \
 } while (0)
 
 #define RADEON_WAIT_UNTIL_3D_IDLE()					\
 do {									\
     BEGIN_RING(2);							\
-    OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));				\
-    OUT_RING((RADEON_WAIT_3D_IDLECLEAN |				\
-	      RADEON_WAIT_HOST_IDLECLEAN));				\
+    if (info->ChipFamily >= CHIP_FAMILY_R600) {                         \
+	OUT_RING(CP_PACKET0(R600_WAIT_UNTIL, 0));                       \
+	OUT_RING((RADEON_WAIT_3D_IDLECLEAN));                           \
+    } else {                                                            \
+	OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));                     \
+	OUT_RING((RADEON_WAIT_3D_IDLECLEAN |                            \
+		  RADEON_WAIT_HOST_IDLECLEAN));                         \
+    }                                                                   \
     ADVANCE_RING();							\
 } while (0)
 
@@ -1263,17 +1344,25 @@ do {									\
 		   "WAIT_UNTIL_IDLE() in %s\n", __FUNCTION__);		\
     }									\
     BEGIN_RING(2);							\
-    OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));				\
-    OUT_RING((RADEON_WAIT_2D_IDLECLEAN |				\
-	      RADEON_WAIT_3D_IDLECLEAN |				\
-	      RADEON_WAIT_HOST_IDLECLEAN));				\
+    if (info->ChipFamily >= CHIP_FAMILY_R600) {                         \
+	OUT_RING(CP_PACKET0(R600_WAIT_UNTIL, 0));                       \
+	OUT_RING((RADEON_WAIT_3D_IDLECLEAN));                           \
+    } else {                                                            \
+	OUT_RING(CP_PACKET0(RADEON_WAIT_UNTIL, 0));                     \
+	OUT_RING((RADEON_WAIT_2D_IDLECLEAN |                            \
+                  RADEON_WAIT_3D_IDLECLEAN |                            \
+		  RADEON_WAIT_HOST_IDLECLEAN));                         \
+    }                                                                   \
     ADVANCE_RING();							\
 } while (0)
 
 #define RADEON_PURGE_CACHE()						\
 do {									\
     BEGIN_RING(2);							\
-    if (info->ChipFamily <= CHIP_FAMILY_RV280) {                        \
+    if (info->ChipFamily >= CHIP_FAMILY_R600) {                         \
+	OUT_RING(CP_PACKET3(IT_EVENT_WRITE, 0));                        \
+	OUT_RING(CACHE_FLUSH_AND_INV_EVENT);                            \
+    } else if (info->ChipFamily <= CHIP_FAMILY_RV280) {                 \
         OUT_RING(CP_PACKET0(RADEON_RB3D_DSTCACHE_CTLSTAT, 0));		\
         OUT_RING(RADEON_RB3D_DC_FLUSH_ALL);				\
     } else {                                                            \
@@ -1285,15 +1374,17 @@ do {									\
 
 #define RADEON_PURGE_ZCACHE()						\
 do {									\
-    BEGIN_RING(2);							\
-    if (info->ChipFamily <= CHIP_FAMILY_RV280) {                        \
-        OUT_RING(CP_PACKET0(RADEON_RB3D_ZCACHE_CTLSTAT, 0));		\
-        OUT_RING(RADEON_RB3D_ZC_FLUSH_ALL);				\
-    } else {                                                            \
-        OUT_RING(CP_PACKET0(R300_RB3D_ZCACHE_CTLSTAT, 0));		\
-        OUT_RING(R300_ZC_FLUSH_ALL);					\
+    if (info->ChipFamily < CHIP_FAMILY_R600) {                          \
+	BEGIN_RING(2);                                                  \
+	if (info->ChipFamily <= CHIP_FAMILY_RV280) {                    \
+	    OUT_RING(CP_PACKET0(RADEON_RB3D_ZCACHE_CTLSTAT, 0));        \
+	    OUT_RING(RADEON_RB3D_ZC_FLUSH_ALL);                         \
+	} else {                                                        \
+	    OUT_RING(CP_PACKET0(R300_RB3D_ZCACHE_CTLSTAT, 0));          \
+	    OUT_RING(R300_ZC_FLUSH_ALL);                                \
+	}                                                               \
+	ADVANCE_RING();                                                 \
     }                                                                   \
-    ADVANCE_RING();							\
 } while (0)
 
 #endif /* XF86DRI */
diff --git a/src/radeon_accel.c b/src/radeon_accel.c
index 96570e8..2b17cd1 100644
--- a/src/radeon_accel.c
+++ b/src/radeon_accel.c
@@ -78,6 +78,7 @@
 				/* Driver data structures */
 #include "radeon.h"
 #include "radeon_reg.h"
+#include "r600_reg.h"
 #include "radeon_macros.h"
 #include "radeon_probe.h"
 #include "radeon_version.h"
@@ -156,9 +157,6 @@ void RADEONEngineFlush(ScrnInfoPtr pScrn)
     unsigned char *RADEONMMIO = info->MMIO;
     int            i;
 
-    if (info->ChipFamily >= CHIP_FAMILY_R600)
-        return;
-
     if (info->ChipFamily <= CHIP_FAMILY_RV280) {
 	OUTREGP(RADEON_RB3D_DSTCACHE_CTLSTAT,
 		RADEON_RB3D_DC_FLUSH_ALL,
@@ -198,8 +196,6 @@ void RADEONEngineReset(ScrnInfoPtr pScrn)
     uint32_t       rbbm_soft_reset;
     uint32_t       host_path_cntl;
 
-    if (info->ChipFamily >= CHIP_FAMILY_R600)
-        return;
     /* The following RBBM_SOFT_RESET sequence can help un-wedge
      * an R300 after the command processor got stuck.
      */
@@ -310,6 +306,35 @@ void RADEONEngineReset(ScrnInfoPtr pScrn)
     OUTPLL(pScrn, RADEON_MCLK_CNTL, mclk_cntl);
 }
 
+/* Soft-reset the R6xx/R7xx graphics block through GRBM_SOFT_RESET, saving and restoring CP_ME_CNTL and CP_RB_CNTL */
+void R600EngineReset(ScrnInfoPtr pScrn)
+{
+    RADEONInfoPtr  info       = RADEONPTR(pScrn);
+    unsigned char *RADEONMMIO = info->MMIO;	/* presumably referenced implicitly by the INREG/OUTREG macros */
+    uint32_t cp_ptr, cp_me_cntl, cp_rb_cntl;
+    /* Remember the ring write pointer; both ring pointers are set to it after the reset. */
+    cp_ptr = INREG(R600_CP_RB_WPTR);
+    /* Save CP_ME_CNTL, then write 0x10000000 (presumably the micro-engine halt bit - confirm against r600_reg.h). */
+    cp_me_cntl = INREG(R600_CP_ME_CNTL);
+    OUTREG(R600_CP_ME_CNTL, 0x10000000);
+    /* Assert the soft-reset bits, wait 50us, then release; each write is followed by a read-back (presumably to post it). */
+    OUTREG(R600_GRBM_SOFT_RESET, 0x7fff);
+    INREG(R600_GRBM_SOFT_RESET);
+    usleep (50);
+    OUTREG(R600_GRBM_SOFT_RESET, 0);
+    INREG(R600_GRBM_SOFT_RESET);
+    /* Save CP_RB_CNTL, then write 0x80000000 (presumably enables writes to the read pointer - confirm). */
+    OUTREG(R600_CP_RB_WPTR_DELAY, 0);
+    cp_rb_cntl = INREG(R600_CP_RB_CNTL);
+    OUTREG(R600_CP_RB_CNTL, 0x80000000);
+    /* Set read and write pointers to the same saved value, then restore the two saved control registers. */
+    OUTREG(R600_CP_RB_RPTR_WR, cp_ptr);
+    OUTREG(R600_CP_RB_WPTR, cp_ptr);
+    OUTREG(R600_CP_RB_CNTL, cp_rb_cntl);
+    OUTREG(R600_CP_ME_CNTL, cp_me_cntl);
+
+}
+
 /* Restore the acceleration hardware to its previous state */
 void RADEONEngineRestore(ScrnInfoPtr pScrn)
 {
@@ -611,8 +636,12 @@ drmBufPtr RADEONCPGetBuffer(ScrnInfoPtr pScrn)
 
 	xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
 		   "GetBuffer timed out, resetting engine...\n");
-	RADEONEngineReset(pScrn);
-	RADEONEngineRestore(pScrn);
+
+	if (info->ChipFamily < CHIP_FAMILY_R600) {
+	    RADEONEngineReset(pScrn);
+	    RADEONEngineRestore(pScrn);
+	} else
+	    R600EngineReset(pScrn);
 
 	/* Always restart the engine when doing CP 2D acceleration */
 	RADEONCP_RESET(pScrn, info);
@@ -627,6 +656,8 @@ void RADEONCPFlushIndirect(ScrnInfoPtr pScrn, int discard)
     drmBufPtr          buffer = info->cp->indirectBuffer;
     int                start  = info->cp->indirectStart;
     drm_radeon_indirect_t  indirect;
+    RING_LOCALS;
+    RADEONCP_REFRESH(pScrn, info);
 
     if (!buffer) return;
     if (start == buffer->used && !discard) return;
@@ -636,6 +667,14 @@ void RADEONCPFlushIndirect(ScrnInfoPtr pScrn, int discard)
 		   buffer->idx);
     }
 
+    if (info->ChipFamily >= CHIP_FAMILY_R600) {
+	while (buffer->used & 0x3c){
+	    BEGIN_RING(1);
+	    OUT_RING(CP_PACKET2()); /* fill up to multiple of 16 dwords */
+	    ADVANCE_RING();
+	}
+    }
+
     indirect.idx     = buffer->idx;
     indirect.start   = start;
     indirect.end     = buffer->used;
@@ -664,6 +703,19 @@ void RADEONCPReleaseIndirect(ScrnInfoPtr pScrn)
     drmBufPtr          buffer = info->cp->indirectBuffer;
     int                start  = info->cp->indirectStart;
     drm_radeon_indirect_t  indirect;
+    RING_LOCALS;
+    RADEONCP_REFRESH(pScrn, info);
+
+
+    if (info->ChipFamily >= CHIP_FAMILY_R600) {
+	if (buffer) {
+	    while (buffer->used & 0x3c) {
+		BEGIN_RING(1);
+		OUT_RING(CP_PACKET2()); /* fill up to multiple of 16 dwords */
+		ADVANCE_RING();
+	    }
+	}
+    }
 
     info->cp->indirectBuffer = NULL;
     info->cp->indirectStart  = 0;
@@ -926,20 +978,26 @@ Bool RADEONAccelInit(ScreenPtr pScreen)
     ScrnInfoPtr    pScrn = xf86Screens[pScreen->myNum];
     RADEONInfoPtr  info  = RADEONPTR(pScrn);
 
-    if (info->ChipFamily >= CHIP_FAMILY_R600)
-	return FALSE;
-
 #ifdef USE_EXA
     if (info->useEXA) {
 # ifdef XF86DRI
 	if (info->directRenderingEnabled) {
-	    if (!RADEONDrawInitCP(pScreen))
-		return FALSE;
+	    if (info->ChipFamily >= CHIP_FAMILY_R600) {
+		if (!R600DrawInit(pScreen))
+		    return FALSE;
+	    } else {
+		if (!RADEONDrawInitCP(pScreen))
+		    return FALSE;
+	    }
 	} else
 # endif /* XF86DRI */
 	{
-	    if (!RADEONDrawInitMMIO(pScreen))
+	    if (info->ChipFamily >= CHIP_FAMILY_R600)
 		return FALSE;
+	    else {
+		if (!RADEONDrawInitMMIO(pScreen))
+		    return FALSE;
+	    }
 	}
     }
 #endif /* USE_EXA */
@@ -947,6 +1005,9 @@ Bool RADEONAccelInit(ScreenPtr pScreen)
     if (!info->useEXA) {
 	XAAInfoRecPtr  a;
 
+	if (info->ChipFamily >= CHIP_FAMILY_R600)
+	    return FALSE;
+
 	if (!(a = info->accel_state->accel = XAACreateInfoRec())) {
 	    xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "XAACreateInfoRec Error\n");
 	    return FALSE;
diff --git a/src/radeon_commonfuncs.c b/src/radeon_commonfuncs.c
index 0a9f9db..7e00384 100644
--- a/src/radeon_commonfuncs.c
+++ b/src/radeon_commonfuncs.c
@@ -733,8 +733,11 @@ void FUNC_NAME(RADEONWaitForIdle)(ScrnInfoPtr pScrn)
 
 	    xf86DrvMsg(pScrn->scrnIndex, X_ERROR,
 		       "Idle timed out, resetting engine...\n");
-	    RADEONEngineReset(pScrn);
-	    RADEONEngineRestore(pScrn);
+	    if (info->ChipFamily < CHIP_FAMILY_R600) {
+		RADEONEngineReset(pScrn);
+		RADEONEngineRestore(pScrn);
+	    } else
+		R600EngineReset(pScrn);
 
 	    /* Always restart the engine when doing CP 2D acceleration */
 	    RADEONCP_RESET(pScrn, info);
@@ -751,7 +754,7 @@ void FUNC_NAME(RADEONWaitForIdle)(ScrnInfoPtr pScrn)
 #endif
 
     if (info->ChipFamily >= CHIP_FAMILY_R600)
-      return;
+	return;
 
     /* Wait for the engine to go idle */
     RADEONWaitForFifoFunction(pScrn, 64);
diff --git a/src/radeon_dri.c b/src/radeon_dri.c
index 968afe6..4520be5 100644
--- a/src/radeon_dri.c
+++ b/src/radeon_dri.c
@@ -45,6 +45,7 @@
 #include "radeon.h"
 #include "radeon_video.h"
 #include "radeon_reg.h"
+#include "r600_reg.h"
 #include "radeon_macros.h"
 #include "radeon_drm.h"
 #include "radeon_dri.h"
@@ -784,92 +785,96 @@ static Bool RADEONSetAgpMode(RADEONInfoPtr info, ScreenPtr pScreen)
     unsigned long mode   = drmAgpGetMode(info->dri->drmFD);	/* Default mode */
     unsigned int  vendor = drmAgpVendorId(info->dri->drmFD);
     unsigned int  device = drmAgpDeviceId(info->dri->drmFD);
-    /* ignore agp 3.0 mode bit from the chip as it's buggy on some cards with
-       pcie-agp rialto bridge chip - use the one from bridge which must match */
-    uint32_t agp_status = (INREG(RADEON_AGP_STATUS) | RADEON_AGPv3_MODE) & mode;
-    Bool is_v3 = (agp_status & RADEON_AGPv3_MODE);
-    unsigned int defaultMode;
-    MessageType from;
 
-    if (is_v3) {
-	defaultMode = (agp_status & RADEON_AGPv3_8X_MODE) ? 8 : 4;
-    } else {
-	if (agp_status & RADEON_AGP_4X_MODE) defaultMode = 4;
-	else if (agp_status & RADEON_AGP_2X_MODE) defaultMode = 2;
-	else defaultMode = 1;
-    }
-
-    /* Apply AGPMode Quirks */
-    radeon_agpmode_quirk_ptr p = radeon_agpmode_quirk_list;
-    while (p && p->chipDevice != 0) {
-        if (vendor == p->hostbridgeVendor &&
-            device == p->hostbridgeDevice &&
-            PCI_DEV_VENDOR_ID(info->PciInfo) == p->chipVendor &&
-            PCI_DEV_DEVICE_ID(info->PciInfo) == p->chipDevice &&
-            PCI_SUB_VENDOR_ID(info->PciInfo) == p->subsysVendor &&
-            PCI_SUB_DEVICE_ID(info->PciInfo) == p->subsysDevice)
-        {
-            defaultMode = p->defaultMode;
-        }
-        ++p;
-    }
+    if (info->ChipFamily < CHIP_FAMILY_R600) {
+	/* ignore agp 3.0 mode bit from the chip as it's buggy on some cards with
+	   pcie-agp rialto bridge chip - use the one from bridge which must match */
+	uint32_t agp_status = (INREG(RADEON_AGP_STATUS) | RADEON_AGPv3_MODE) & mode;
+	Bool is_v3 = (agp_status & RADEON_AGPv3_MODE);
+	unsigned int defaultMode;
+	MessageType from;
 
-    from = X_DEFAULT;
+	if (is_v3) {
+	    defaultMode = (agp_status & RADEON_AGPv3_8X_MODE) ? 8 : 4;
+	} else {
+	    if (agp_status & RADEON_AGP_4X_MODE) defaultMode = 4;
+	    else if (agp_status & RADEON_AGP_2X_MODE) defaultMode = 2;
+	    else defaultMode = 1;
+	}
 
-    if (xf86GetOptValInteger(info->Options, OPTION_AGP_MODE, &info->dri->agpMode)) {
-	if ((info->dri->agpMode < (is_v3 ? 4 : 1)) ||
-            (info->dri->agpMode > (is_v3 ? 8 : 4)) ||
-	    (info->dri->agpMode & (info->dri->agpMode - 1))) {
-	    xf86DrvMsg(pScreen->myNum, X_ERROR,
-		       "Illegal AGP Mode: %d (valid values: %s), leaving at "
-		       "%dx\n", info->dri->agpMode, is_v3 ? "4, 8" : "1, 2, 4",
-		       defaultMode);
-	    info->dri->agpMode = defaultMode;
+	/* Apply AGPMode Quirks */
+	radeon_agpmode_quirk_ptr p = radeon_agpmode_quirk_list;
+	while (p && p->chipDevice != 0) {
+	    if (vendor == p->hostbridgeVendor &&
+		device == p->hostbridgeDevice &&
+		PCI_DEV_VENDOR_ID(info->PciInfo) == p->chipVendor &&
+		PCI_DEV_DEVICE_ID(info->PciInfo) == p->chipDevice &&
+		PCI_SUB_VENDOR_ID(info->PciInfo) == p->subsysVendor &&
+		PCI_SUB_DEVICE_ID(info->PciInfo) == p->subsysDevice)
+	    {
+		defaultMode = p->defaultMode;
+	    }
+	    ++p;
+	}
+
+	from = X_DEFAULT;
+
+	if (xf86GetOptValInteger(info->Options, OPTION_AGP_MODE, &info->dri->agpMode)) {
+	    if ((info->dri->agpMode < (is_v3 ? 4 : 1)) ||
+		(info->dri->agpMode > (is_v3 ? 8 : 4)) ||
+		(info->dri->agpMode & (info->dri->agpMode - 1))) {
+		xf86DrvMsg(pScreen->myNum, X_ERROR,
+			   "Illegal AGP Mode: %d (valid values: %s), leaving at "
+			   "%dx\n", info->dri->agpMode, is_v3 ? "4, 8" : "1, 2, 4",
+			   defaultMode);
+		info->dri->agpMode = defaultMode;
+	    } else
+		from = X_CONFIG;
 	} else
-	    from = X_CONFIG;
-    } else
-	info->dri->agpMode = defaultMode;
+	    info->dri->agpMode = defaultMode;
 
-    xf86DrvMsg(pScreen->myNum, from, "Using AGP %dx\n", info->dri->agpMode);
+	xf86DrvMsg(pScreen->myNum, from, "Using AGP %dx\n", info->dri->agpMode);
 
-    mode &= ~RADEON_AGP_MODE_MASK;
-    if (is_v3) {
-	/* only set one mode bit for AGPv3 */
-	switch (info->dri->agpMode) {
-	case 8:          mode |= RADEON_AGPv3_8X_MODE; break;
-	case 4: default: mode |= RADEON_AGPv3_4X_MODE;
-	}
-	/*TODO: need to take care of other bits valid for v3 mode
-	 *      currently these bits are not used in all tested cards.
-	 */
-    } else {
-	switch (info->dri->agpMode) {
-	case 4:          mode |= RADEON_AGP_4X_MODE;
-	case 2:          mode |= RADEON_AGP_2X_MODE;
-	case 1: default: mode |= RADEON_AGP_1X_MODE;
+	mode &= ~RADEON_AGP_MODE_MASK;
+	if (is_v3) {
+	    /* only set one mode bit for AGPv3 */
+	    switch (info->dri->agpMode) {
+	    case 8:          mode |= RADEON_AGPv3_8X_MODE; break;
+	    case 4: default: mode |= RADEON_AGPv3_4X_MODE;
+	    }
+	    /*TODO: need to take care of other bits valid for v3 mode
+	     *      currently these bits are not used in all tested cards.
+	     */
+	} else {
+	    switch (info->dri->agpMode) {
+	    case 4:          mode |= RADEON_AGP_4X_MODE;
+	    case 2:          mode |= RADEON_AGP_2X_MODE;
+	    case 1: default: mode |= RADEON_AGP_1X_MODE;
+	    }
 	}
-    }
 
-    /* AGP Fast Writes.
-     * TODO: take into account that certain agp modes don't support fast
-     * writes at all */
-    mode &= ~RADEON_AGP_FW_MODE; /* Disable per default */
-    if (xf86ReturnOptValBool(info->Options, OPTION_AGP_FW, FALSE)) {
-	xf86DrvMsg(pScreen->myNum, X_WARNING,
-		   "WARNING: Using the AGPFastWrite option is not recommended.\n");
-	xf86Msg(X_NONE, "\tThis option does not provide much of a noticable speed"
-		" boost, while it\n\twill probably hard lock your machine."
-		" All bets are off!\n");
-
-	/* Black list some host/AGP bridges. */
-	if ((vendor == PCI_VENDOR_AMD) && (device == PCI_CHIP_AMD761))
-	    xf86DrvMsg(pScreen->myNum, X_PROBED, "Ignoring AGPFastWrite option "
-		       "for the AMD 761 northbridge.\n");
-	else {
-	    xf86DrvMsg(pScreen->myNum, X_CONFIG, "Enabling AGP Fast Writes.\n");
-	    mode |= RADEON_AGP_FW_MODE;
-	}
-    } /* Don't mention this otherwise, so that people don't get funny ideas */
+	/* AGP Fast Writes.
+	 * TODO: take into account that certain agp modes don't support fast
+	 * writes at all */
+	mode &= ~RADEON_AGP_FW_MODE; /* Disable per default */
+	if (xf86ReturnOptValBool(info->Options, OPTION_AGP_FW, FALSE)) {
+	    xf86DrvMsg(pScreen->myNum, X_WARNING,
+		       "WARNING: Using the AGPFastWrite option is not recommended.\n");
+	    xf86Msg(X_NONE, "\tThis option does not provide much of a noticable speed"
+		    " boost, while it\n\twill probably hard lock your machine."
+		    " All bets are off!\n");
+
+	    /* Black list some host/AGP bridges. */
+	    if ((vendor == PCI_VENDOR_AMD) && (device == PCI_CHIP_AMD761))
+		xf86DrvMsg(pScreen->myNum, X_PROBED, "Ignoring AGPFastWrite option "
+			   "for the AMD 761 northbridge.\n");
+	    else {
+		xf86DrvMsg(pScreen->myNum, X_CONFIG, "Enabling AGP Fast Writes.\n");
+		mode |= RADEON_AGP_FW_MODE;
+	    }
+	} /* Don't mention this otherwise, so that people don't get funny ideas */
+    } else
+	info->dri->agpMode = 8; /* doesn't matter at this point */
 
     xf86DrvMsg(pScreen->myNum, X_INFO,
 	       "[agp] Mode 0x%08lx [AGP 0x%04x/0x%04x; Card 0x%04x/0x%04x 0x%04x/0x%04x]\n",
@@ -904,6 +909,9 @@ static void RADEONSetAgpBase(RADEONInfoPtr info, ScreenPtr pScreen)
     ScrnInfoPtr    pScrn = xf86Screens[pScreen->myNum];
     unsigned char *RADEONMMIO = info->MMIO;
 
+    if (info->ChipFamily >= CHIP_FAMILY_R600)
+	return;
+
     /* drm already does this, so we can probably remove this.
      * agp_base_2 ?
      */
@@ -1177,13 +1185,14 @@ static int RADEONDRIKernelInit(RADEONInfoPtr info, ScreenPtr pScreen)
     drm_radeon_init_t  drmInfo;
 
     memset(&drmInfo, 0, sizeof(drm_radeon_init_t));
-    if ( info->ChipFamily >= CHIP_FAMILY_R300 )
-       drmInfo.func             = RADEON_INIT_R300_CP;
+    if ( info->ChipFamily >= CHIP_FAMILY_R600 )
+	drmInfo.func             = RADEON_INIT_R600_CP;
+    else if ( info->ChipFamily >= CHIP_FAMILY_R300 )
+	drmInfo.func             = RADEON_INIT_R300_CP;
+    else if ( info->ChipFamily >= CHIP_FAMILY_R200 )
+	drmInfo.func		= RADEON_INIT_R200_CP;
     else
-    if ( info->ChipFamily >= CHIP_FAMILY_R200 )
-       drmInfo.func		= RADEON_INIT_R200_CP;
-    else
-       drmInfo.func		= RADEON_INIT_CP;
+	drmInfo.func		= RADEON_INIT_CP;
 
     drmInfo.sarea_priv_offset   = sizeof(XF86DRISAREARec);
     drmInfo.is_pci              = (info->cardType!=CARD_AGP);
@@ -1217,7 +1226,8 @@ static int RADEONDRIKernelInit(RADEONInfoPtr info, ScreenPtr pScreen)
      * registers back to their default values, so we need to restore
      * those engine register here.
      */
-    RADEONEngineRestore(pScrn);
+    if (info->ChipFamily < CHIP_FAMILY_R600)
+	RADEONEngineRestore(pScrn);
 
     return TRUE;
 }
@@ -1293,14 +1303,16 @@ static void RADEONDRIIrqInit(RADEONInfoPtr info, ScreenPtr pScreen)
 		       "[drm] falling back to irq-free operation\n");
 	    info->dri->irq = 0;
 	} else {
-	    unsigned char *RADEONMMIO = info->MMIO;
-	    info->ModeReg->gen_int_cntl = INREG( RADEON_GEN_INT_CNTL );
-
-	    /* Let the DRM know it can safely disable the vblank interrupts */
-	    radeon_crtc_modeset_ioctl(XF86_CRTC_CONFIG_PTR(pScrn)->crtc[0],
-				      FALSE);
-	    radeon_crtc_modeset_ioctl(XF86_CRTC_CONFIG_PTR(pScrn)->crtc[0],
-				      TRUE);
+	    if (info->ChipFamily < CHIP_FAMILY_R600) {
+		unsigned char *RADEONMMIO = info->MMIO;
+		info->ModeReg->gen_int_cntl = INREG( RADEON_GEN_INT_CNTL );
+
+		/* Let the DRM know it can safely disable the vblank interrupts */
+		radeon_crtc_modeset_ioctl(XF86_CRTC_CONFIG_PTR(pScrn)->crtc[0],
+					  FALSE);
+		radeon_crtc_modeset_ioctl(XF86_CRTC_CONFIG_PTR(pScrn)->crtc[0],
+					  TRUE);
+	    }
 	}
     }
 
diff --git a/src/radeon_driver.c b/src/radeon_driver.c
index b0817b0..919a9fb 100644
--- a/src/radeon_driver.c
+++ b/src/radeon_driver.c
@@ -674,8 +674,8 @@ static void radeon_write_mc_fb_agp_location(ScrnInfoPtr pScrn, int mask, uint32_
 	if (mask & LOC_FB)
 	    OUTREG(R700_MC_VM_FB_LOCATION, fb_loc);
 	if (mask & LOC_AGP) {
-	    OUTREG(R600_MC_VM_AGP_BOT, agp_loc);
-	    OUTREG(R600_MC_VM_AGP_TOP, agp_loc_hi);
+	    OUTREG(R700_MC_VM_AGP_BOT, agp_loc);
+	    OUTREG(R700_MC_VM_AGP_TOP, agp_loc_hi);
 	}
     } else if (info->ChipFamily >= CHIP_FAMILY_R600) {
 	if (mask & LOC_FB)
@@ -724,8 +724,8 @@ static void radeon_read_mc_fb_agp_location(ScrnInfoPtr pScrn, int mask, uint32_t
 	if (mask & LOC_FB)
 	    *fb_loc = INREG(R700_MC_VM_FB_LOCATION);
 	if (mask & LOC_AGP) {
-	    *agp_loc = INREG(R600_MC_VM_AGP_BOT);
-	    *agp_loc_hi = INREG(R600_MC_VM_AGP_TOP);
+	    *agp_loc = INREG(R700_MC_VM_AGP_BOT);
+	    *agp_loc_hi = INREG(R700_MC_VM_AGP_TOP);
 	}
     } else if (info->ChipFamily >= CHIP_FAMILY_R600) {
 	if (mask & LOC_FB)
@@ -1876,7 +1876,10 @@ static Bool RADEONPreInitChipType(ScrnInfoPtr pScrn)
 
     /* treat PCIE IGP cards as PCI */
     if (info->cardType == CARD_PCIE && info->IsIGP)
-		info->cardType = CARD_PCI;
+	info->cardType = CARD_PCI;
+
+    if ((info->ChipFamily >= CHIP_FAMILY_R600) && info->IsIGP)
+	info->cardType = CARD_PCIE;
 
     if ((s = xf86GetOptValString(info->Options, OPTION_BUS_TYPE))) {
 	if (strcmp(s, "AGP") == 0) {
@@ -1905,6 +1908,7 @@ static Bool RADEONPreInitChipType(ScrnInfoPtr pScrn)
 					     info->Chipset != PCI_CHIP_RN50_5969);
 #endif
 
+#if 0
     if (info->ChipFamily >= CHIP_FAMILY_R600) {
         info->r600_shadow_fb = TRUE;
 	xf86DrvMsg(pScrn->scrnIndex, X_INFO,
@@ -1912,6 +1916,7 @@ static Bool RADEONPreInitChipType(ScrnInfoPtr pScrn)
         if (!xf86LoadSubModule(pScrn, "shadow"))
             return FALSE;
     }
+#endif
 
     return TRUE;
 }
@@ -1989,8 +1994,8 @@ static Bool RADEONPreInitAccel(ScrnInfoPtr pScrn)
 
     if (info->ChipFamily >= CHIP_FAMILY_R600) {
 	xf86DrvMsg(pScrn->scrnIndex, X_DEFAULT,
-	    "No acceleration support available on R600 yet.\n");
-	return TRUE;
+	    "Experimental R6xx/R7xx EXA support.\n");
+	info->useEXA = TRUE;
     }
 
     if (!xf86ReturnOptValBool(info->Options, OPTION_NOACCEL, FALSE)) {
@@ -2336,7 +2341,10 @@ static Bool RADEONPreInitDRI(ScrnInfoPtr pScrn)
     xf86DrvMsg(pScrn->scrnIndex, from, "Page Flipping %sabled%s\n",
 	       info->dri->allowPageFlip ? "en" : "dis", reason);
 
-    info->DMAForXv = TRUE;
+    if (info->ChipFamily >= CHIP_FAMILY_R600)
+	info->DMAForXv = FALSE;
+    else
+	info->DMAForXv = TRUE;
     from = xf86GetOptValBool(info->Options, OPTION_XV_DMA, &info->DMAForXv)
 	 ? X_CONFIG : X_INFO;
     xf86DrvMsg(pScrn->scrnIndex, from,
@@ -3638,11 +3646,9 @@ Bool RADEONScreenInit(int scrnIndex, ScreenPtr pScreen,
     RADEONDGAInit(pScreen);
 
     /* Init Xv */
-    if (info->ChipFamily < CHIP_FAMILY_R600) {
-	xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, RADEON_LOGLEVEL_DEBUG,
-		       "Initializing Xv\n");
-	RADEONInitVideo(pScreen);
-    }
+    xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, RADEON_LOGLEVEL_DEBUG,
+		   "Initializing Xv\n");
+    RADEONInitVideo(pScreen);
 
     if (info->r600_shadow_fb == TRUE) {
         if (!shadowSetup(pScreen)) {
@@ -3952,7 +3958,7 @@ static void RADEONAdjustMemMapRegisters(ScrnInfoPtr pScrn, RADEONSavePtr save)
     }
 
 #ifdef USE_EXA
-    if (info->accelDFS)
+    if (info->accelDFS || (info->ChipFamily >= CHIP_FAMILY_R600))
     {
 	drm_radeon_getparam_t gp;
 	int gart_base;
diff --git a/src/radeon_exa.c b/src/radeon_exa.c
index c4bc1bb..5e23645 100644
--- a/src/radeon_exa.c
+++ b/src/radeon_exa.c
@@ -35,6 +35,7 @@
 
 #include "radeon.h"
 #include "radeon_reg.h"
+#include "r600_reg.h"
 #ifdef XF86DRI
 #include "radeon_drm.h"
 #endif
diff --git a/src/radeon_exa_render.c b/src/radeon_exa_render.c
index ceeee49..0c84384 100644
--- a/src/radeon_exa_render.c
+++ b/src/radeon_exa_render.c
@@ -451,7 +451,7 @@ static Bool FUNC_NAME(R100TextureSetup)(PicturePtr pPict, PixmapPtr pPix,
 
 #ifdef ONLY_ONCE
 
-static PixmapPtr
+PixmapPtr
 RADEONGetDrawablePixmap(DrawablePtr pDrawable)
 {
     if (pDrawable->type == DRAWABLE_WINDOW)
diff --git a/src/radeon_modes.c b/src/radeon_modes.c
index e06f8dd..0a8fa00 100644
--- a/src/radeon_modes.c
+++ b/src/radeon_modes.c
@@ -65,15 +65,19 @@ void RADEONSetPitch (ScrnInfoPtr pScrn)
     align_large = info->allowColorTiling || IS_AVIVO_VARIANT;
 
     /* FIXME: May need to validate line pitch here */
-    switch (pScrn->depth / 8) {
-    case 1: pitch_mask = align_large ? 255 : 127;
-	break;
-    case 2: pitch_mask = align_large ? 127 : 31;
-	break;
-    case 3:
-    case 4: pitch_mask = align_large ? 63 : 15;
-	break;
-    }
+    if (info->ChipFamily < CHIP_FAMILY_R600) {
+	switch (pScrn->depth / 8) {
+	case 1: pitch_mask = align_large ? 255 : 127;
+	    break;
+	case 2: pitch_mask = align_large ? 127 : 31;
+	    break;
+	case 3:
+	case 4: pitch_mask = align_large ? 63 : 15;
+	    break;
+	}
+    } else
+	pitch_mask = 255; /* r6xx/r7xx need 256B alignment for accel */
+
     dummy = (pScrn->virtualX + pitch_mask) & ~pitch_mask;
     pScrn->displayWidth = dummy;
     info->CurrentLayout.displayWidth = pScrn->displayWidth;
diff --git a/src/radeon_reg.h b/src/radeon_reg.h
index 7b8840b..17f8575 100644
--- a/src/radeon_reg.h
+++ b/src/radeon_reg.h
@@ -3984,6 +3984,9 @@
 #define R600_MC_VM_SYSTEM_APERTURE_DEFAULT_ADDR                    0x2198
 
 #define R700_MC_VM_FB_LOCATION                                     0x2024
+#define R700_MC_VM_AGP_TOP                                         0x2028
+#define R700_MC_VM_AGP_BOT                                         0x202c
+#define R700_MC_VM_AGP_BASE                                        0x2030
 
 #define R600_HDP_NONSURFACE_BASE                                0x2c04
 
@@ -5362,4 +5365,27 @@
 
 #define R500_DYN_SCLK_PWMEM_PIPE                        0x000d /* PLL */
 
+/* r6xx/r7xx stuff */
+#define R600_GRBM_SOFT_RESET                                    0x8020
+#       define R600_SOFT_RESET_CP                               (1 << 0)
+
+#define R600_WAIT_UNTIL                                  0x8040
+
+#define R600_CP_ME_CNTL                                         0x86d8
+#       define R600_CP_ME_HALT                                  (1 << 28)
+
+#define R600_CP_RB_BASE                                            0xc100
+#define R600_CP_RB_CNTL                                            0xc104
+#       define R600_RB_NO_UPDATE                                   (1 << 27)
+#       define R600_RB_RPTR_WR_ENA                                 (1 << 31)
+#define R600_CP_RB_RPTR_WR                                         0xc108
+#define R600_CP_RB_RPTR_ADDR                                       0xc10c
+#define R600_CP_RB_RPTR_ADDR_HI                                    0xc110
+#define R600_CP_RB_WPTR                                            0xc114
+#define R600_CP_RB_WPTR_ADDR                                       0xc118
+#define R600_CP_RB_WPTR_ADDR_HI                                    0xc11c
+
+#define R600_CP_RB_RPTR                                            0x8700
+#define R600_CP_RB_WPTR_DELAY                                      0x8704
+
 #endif
diff --git a/src/radeon_textured_video.c b/src/radeon_textured_video.c
index 7712344..16b2c82 100644
--- a/src/radeon_textured_video.c
+++ b/src/radeon_textured_video.c
@@ -36,6 +36,7 @@
 
 #include "radeon.h"
 #include "radeon_reg.h"
+#include "r600_reg.h"
 #include "radeon_macros.h"
 #include "radeon_probe.h"
 #include "radeon_video.h"
@@ -43,12 +44,18 @@
 #include <X11/extensions/Xv.h>
 #include "fourcc.h"
 
+extern void
+R600DisplayTexturedVideo(ScrnInfoPtr pScrn, RADEONPortPrivPtr pPriv);
+
 #define IMAGE_MAX_WIDTH		2048
 #define IMAGE_MAX_HEIGHT	2048
 
 #define IMAGE_MAX_WIDTH_R500	4096
 #define IMAGE_MAX_HEIGHT_R500	4096
 
+#define IMAGE_MAX_WIDTH_R600	8192
+#define IMAGE_MAX_HEIGHT_R600	8192
+
 static Bool
 RADEONTilingEnabled(ScrnInfoPtr pScrn, PixmapPtr pPix)
 {
@@ -146,6 +153,82 @@ static __inline__ uint32_t F_TO_24(float val)
 
 #endif /* XF86DRI */
 
+/*
+ * Repack planar 4:2:0 YUV (separate Y, U and V planes) into an NV12-style
+ * buffer: the Y plane is copied first, then the chroma samples are written
+ * as interleaved V,U byte pairs. srcPitch and srcPitch2 are the luma and
+ * chroma source strides, dstPitch is the stride used for both destination
+ * planes, and w and h are the luma dimensions in pixels.
+ */
+static void
+CopyPlanartoNV12(unsigned char *y_src, unsigned char *u_src, unsigned char *v_src,
+		 unsigned char *dst, int srcPitch, int srcPitch2, int dstPitch,
+		 int w, int h)
+{
+    int i, j;
+
+    /* Y: a single memcpy when the strides match, else one row at a time */
+    if (srcPitch == dstPitch) {
+        memcpy(dst, y_src, srcPitch * h);
+	dst += (dstPitch * h);
+    } else {
+	for (i = 0; i < h; i++, y_src += srcPitch, dst += dstPitch)
+	    memcpy(dst, y_src, srcPitch);
+    }
+
+    /* tex base needs 256B alignment; an odd height skips one extra row */
+    if (h & 1)
+	dst += dstPitch;
+
+    /* UV: a chroma row carries w / 2 samples per plane, so stop there */
+    for (i = 0; i < (h >> 1); i++) {
+	unsigned char *uv = dst;
+
+	for (j = 0; j < (w >> 1); j++) {
+	    *uv++ = v_src[j];
+	    *uv++ = u_src[j];
+	}
+	dst += dstPitch;
+	u_src += srcPitch2;
+	v_src += srcPitch2;
+    }
+}
+
+/*
+ * Split packed 4:2:2 data (UYVY or YUY2) into a Y plane at dst plus an
+ * interleaved V,U plane that begins at dstPitch * h rounded up to a multiple
+ * of 256 bytes. Chroma is written for every source row, with the same
+ * dstPitch stride as the luma rows. w and h are in pixels; id selects the
+ * byte order: FOURCC_UYVY, or YUY2 ordering for any other value.
+ */
+static void
+CopyPackedtoNV12(unsigned char *src, unsigned char *dst,
+		 int srcPitch, int dstPitch,
+		 int w, int h, int id)
+{
+    /* FOURCC_UYVY: U0 Y0 V0 Y1 */
+    /* FOURCC_YUY2: Y0 U0 Y1 V0 */
+    const int y_first = (id == FOURCC_UYVY) ? 1 : 0; /* index of Y0 */
+    const int u_pos = 1 - y_first;                    /* index of U0 */
+    const int chroma_base = ((dstPitch * h) + 255) & ~255;
+    int row, pair;
+
+    for (row = 0; row < h; row++) {
+	unsigned char *luma = dst;
+	unsigned char *chroma = dst + chroma_base;
+	unsigned char *in = src;
+
+	for (pair = 0; pair < (w / 2); pair++, in += 4) {
+	    *luma++ = in[y_first];          /* Y0 */
+	    *luma++ = in[y_first + 2];      /* Y1 */
+	    *chroma++ = in[u_pos + 2];      /* V0 */
+	    *chroma++ = in[u_pos];          /* U0 */
+	}
+	dst += dstPitch;
+	src += srcPitch;
+    }
+}
+
 static int
 RADEONPutImageTextured(ScrnInfoPtr pScrn,
 		       short src_x, short src_y,
@@ -214,7 +297,10 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
 	break;
     }
 
-   dstPitch = (dstPitch + 63) & ~63;
+    if (info->ChipFamily >= CHIP_FAMILY_R600)
+	dstPitch = (dstPitch + 511) & ~511;
+    else
+	dstPitch = (dstPitch + 63) & ~63;
 
     if (pPriv->video_memory != NULL && size != pPriv->size) {
 	radeon_legacy_free_memory(pScrn, pPriv->video_memory);
@@ -222,16 +308,21 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     }
 
     if (pPriv->video_memory == NULL) {
-	pPriv->video_offset = radeon_legacy_allocate_memory(pScrn,
-						            &pPriv->video_memory,
-						            size * 2, 64);
+	if (info->ChipFamily >= CHIP_FAMILY_R600)
+	    pPriv->video_offset = radeon_legacy_allocate_memory(pScrn,
+								&pPriv->video_memory,
+								size * 2, 512);
+	else
+	    pPriv->video_offset = radeon_legacy_allocate_memory(pScrn,
+								&pPriv->video_memory,
+								size * 2, 64);
 	if (pPriv->video_offset == 0)
 	    return BadAlloc;
     }
 
     /* Bicubic filter setup */
     pPriv->bicubic_enabled = (pPriv->bicubic_state != BICUBIC_OFF);
-    if (!(IS_R300_3D || IS_R500_3D))
+    if (!(IS_R300_3D || IS_R500_3D || IS_R600_3D))
 	pPriv->bicubic_enabled = FALSE;
     if (pPriv->bicubic_enabled && (pPriv->bicubic_state == BICUBIC_AUTO)) {
 	/*
@@ -280,7 +371,10 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     npixels = ((((x2 + 0xffff) >> 16) + 1) & ~1) - left;
 
     pPriv->src_offset = pPriv->video_offset + info->fbLocation + pScrn->fbOffset;
-    pPriv->src_addr = (uint8_t *)(info->FB + pPriv->video_offset + (top * dstPitch));
+    if (info->ChipFamily >= CHIP_FAMILY_R600)
+	pPriv->src_addr = (uint8_t *)(info->FB + pPriv->video_offset);
+    else
+	pPriv->src_addr = (uint8_t *)(info->FB + pPriv->video_offset + (top * dstPitch));
     pPriv->src_pitch = dstPitch;
     pPriv->size = size;
     pPriv->pDraw = pDraw;
@@ -294,29 +388,51 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     switch(id) {
     case FOURCC_YV12:
     case FOURCC_I420:
-	top &= ~1;
-	nlines = ((((y2 + 0xffff) >> 16) + 1) & ~1) - top;
-	s2offset = srcPitch * height;
-	s3offset = (srcPitch2 * (height >> 1)) + s2offset;
-	top &= ~1;
-	pPriv->src_addr += left << 1;
-	tmp = ((top >> 1) * srcPitch2) + (left >> 1);
-	s2offset += tmp;
-	s3offset += tmp;
-	if (id == FOURCC_I420) {
-	    tmp = s2offset;
-	    s2offset = s3offset;
-	    s3offset = tmp;
+	if (info->ChipFamily >= CHIP_FAMILY_R600) {
+	    s2offset = srcPitch * height;
+	    s3offset = (srcPitch2 * (height >> 1)) + s2offset;
+	    if (id == FOURCC_YV12)
+		CopyPlanartoNV12(buf, buf + s3offset, buf + s2offset,
+				 pPriv->src_addr,
+				 srcPitch, srcPitch2, pPriv->src_pitch,
+				 width, height);
+	    else
+		CopyPlanartoNV12(buf, buf + s2offset, buf + s3offset,
+				 pPriv->src_addr,
+				 srcPitch, srcPitch2, pPriv->src_pitch,
+				 width, height);
+
+	} else {
+	    top &= ~1;
+	    nlines = ((((y2 + 0xffff) >> 16) + 1) & ~1) - top;
+	    s2offset = srcPitch * height;
+	    s3offset = (srcPitch2 * (height >> 1)) + s2offset;
+	    top &= ~1;
+	    pPriv->src_addr += left << 1;
+	    tmp = ((top >> 1) * srcPitch2) + (left >> 1);
+	    s2offset += tmp;
+	    s3offset += tmp;
+	    if (id == FOURCC_I420) {
+		tmp = s2offset;
+		s2offset = s3offset;
+		s3offset = tmp;
+	    }
+	    RADEONCopyMungedData(pScrn, buf + (top * srcPitch) + left,
+				 buf + s2offset, buf + s3offset, pPriv->src_addr,
+				 srcPitch, srcPitch2, dstPitch, nlines, npixels);
 	}
-	RADEONCopyMungedData(pScrn, buf + (top * srcPitch) + left,
-			     buf + s2offset, buf + s3offset, pPriv->src_addr,
-			     srcPitch, srcPitch2, dstPitch, nlines, npixels);
 	break;
     case FOURCC_UYVY:
     case FOURCC_YUY2:
     default:
-	nlines = ((y2 + 0xffff) >> 16) - top;
-	RADEONCopyData(pScrn, buf, pPriv->src_addr, srcPitch, dstPitch, nlines, npixels, 2);
+	if (info->ChipFamily >= CHIP_FAMILY_R600) {
+	    CopyPackedtoNV12(buf, pPriv->src_addr,
+			     2 * width, pPriv->src_pitch,
+			     width, height, id);
+	} else {
+	    nlines = ((y2 + 0xffff) >> 16) - top;
+	    RADEONCopyData(pScrn, buf, pPriv->src_addr, srcPitch, dstPitch, nlines, npixels, 2);
+	}
 	break;
     }
 
@@ -340,7 +456,9 @@ RADEONPutImageTextured(ScrnInfoPtr pScrn,
     pPriv->h = height;
 
 #ifdef XF86DRI
-    if (info->directRenderingEnabled)
+    if (IS_R600_3D)
+	R600DisplayTexturedVideo(pScrn, pPriv);
+    else if (info->directRenderingEnabled)
 	RADEONDisplayTexturedVideoCP(pScrn, pPriv);
     else
 #endif
@@ -370,6 +488,16 @@ static XF86VideoEncodingRec DummyEncodingR500[1] =
     }
 };
 
+static XF86VideoEncodingRec DummyEncodingR600[1] = /* chosen when IS_R600_3D */
+{
+    {
+	0,
+	"XV_IMAGE",
+	IMAGE_MAX_WIDTH_R600, IMAGE_MAX_HEIGHT_R600, /* 8192 x 8192 */
+	{1, 1}
+    }
+};
+
 #define NUM_FORMATS 3
 
 static XF86VideoFormatRec Formats[NUM_FORMATS] =
@@ -471,7 +599,9 @@ RADEONSetupImageTexturedVideo(ScreenPtr pScreen)
     adapt->flags = 0;
     adapt->name = "Radeon Textured Video";
     adapt->nEncodings = 1;
-    if (IS_R500_3D)
+    if (IS_R600_3D)
+	adapt->pEncodings = DummyEncodingR600;
+    else if (IS_R500_3D)
 	adapt->pEncodings = DummyEncodingR500;
     else
 	adapt->pEncodings = DummyEncoding;
@@ -483,7 +613,7 @@ RADEONSetupImageTexturedVideo(ScreenPtr pScreen)
     pPortPriv =
 	(RADEONPortPrivPtr)(&adapt->pPortPrivates[num_texture_ports]);
 
-    if (IS_R300_3D || IS_R500_3D) {
+    if (IS_R300_3D || IS_R500_3D || IS_R600_3D) {
 	adapt->pAttributes = Attributes_r300;
 	adapt->nAttributes = NUM_ATTRIBUTES_R300;
     } else {


More information about the xorg-commit mailing list