[Mesa-dev] [PATCH 3/6] swr: [rasterizer core] more backend refactoring

Tim Rowley timothy.o.rowley at intel.com
Mon Apr 25 22:12:12 UTC 2016


BackendPixelRate should be easier to read/maintain now hopefully.

Small perf bump by moving some of the pfn's to inline functions
without template params.
---
 .../swr/rasterizer/common/rdtsc_buckets.cpp        |   4 +-
 src/gallium/drivers/swr/rasterizer/core/api.cpp    |  29 +-
 .../drivers/swr/rasterizer/core/backend.cpp        | 689 +++++----------------
 src/gallium/drivers/swr/rasterizer/core/backend.h  | 394 +++++++++++-
 src/gallium/drivers/swr/rasterizer/core/context.h  |   5 -
 .../drivers/swr/rasterizer/core/multisample.h      | 407 +-----------
 .../drivers/swr/rasterizer/core/rdtsc_core.cpp     |   4 +
 .../drivers/swr/rasterizer/core/rdtsc_core.h       |   4 +
 8 files changed, 573 insertions(+), 963 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
index c6768b4..eb038b1 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
@@ -80,7 +80,9 @@ void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint
         "        |-> ",
         "            |-> ",
         "                |-> ",
-        "                    |-> "
+        "                    |-> ",
+        "                        |-> ",
+        "                            |-> ",
     };
 
     // compute percent of total cycles used by this bucket
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 3b02d19..e312792 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -755,14 +755,12 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
         pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
     }
 }
+
 // templated backend function tables
 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
-extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
-extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
+extern PFN_BACKEND_FUNC gBackendSingleSample[2][2][2];
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2][2];
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
 void SetupPipeline(DRAW_CONTEXT *pDC)
 {
     DRAW_STATE* pState = pDC->pState;
@@ -775,13 +773,12 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     if (psState.pfnPixelShader == nullptr)
     {
         backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
-        // always need to generate I & J per sample for Z interpolation
-        backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
     }
     else
     {
         const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
         const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
+        const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
 
         // currently only support 'normal' input coverage
         SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
@@ -797,35 +794,25 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
             {
                 // always need to generate I & J per sample for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
-                backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
+                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ];
             }
             else
             {
                 // always need to generate I & J per pixel for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
-                backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
-                backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
+                backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
             }
             break;
         case SWR_SHADING_RATE_SAMPLE:
             SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
             // always need to generate I & J per sample for Z interpolation
             barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-            backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
-            backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
+            backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ];
             break;
         default:
             SWR_ASSERT(0 && "Invalid shading rate");
             break;
         }
-
-        // setup pointer to function that generates necessary barycentrics required by the PS
-        bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
-        backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
-
-        bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
-        backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
     }
     
     PFN_PROCESS_PRIMS pfnBinner;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 310a7ed..1d923ea 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -29,7 +29,6 @@
 
 #include <smmintrin.h>
 
-#include "rdtsc_core.h"
 #include "backend.h"
 #include "depthstencil.h"
 #include "tilemgr.h"
@@ -459,221 +458,10 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala
     return _simd_movemask_ps(vClipMask);
 }
 
-template<bool bGenerateBarycentrics>
-INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
-    if(bGenerateBarycentrics)
-    {
-        // evaluate I,J
-        psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
-        psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
-        psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
-        psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
-
-        // interpolate 1/w
-        psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
-    }
-}
-
-template<bool bGenerateBarycentrics>
-INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
-    if(bGenerateBarycentrics)
-    {
-        // evaluate I,J
-        psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
-        psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
-        psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
-        psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
-
-        // interpolate 1/w
-        psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
-    }
-}
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Centroid behaves exactly as follows :
-// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to 
-//     have a sample location there).
-// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the 
-//     coverage with the SampleMask Rasterizer State.
-// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is 
-//     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the 
-//     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<typename T>
-INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
-                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
-    uint32_t inputMask[KNOB_SIMD_WIDTH];
-    generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
-
-    // Case (2) - partially covered pixel
-
-    // scan for first covered sample per pixel in the 4x2 span
-    unsigned long sampleNum[KNOB_SIMD_WIDTH];
-    (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
-    (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
-    (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
-    (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
-    (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
-    (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
-    (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
-    (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
-
-    // look up and set the sample offsets from UL pixel corner for first covered sample 
-    __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]),
-                                    T::MultisampleT::X(sampleNum[6]),
-                                    T::MultisampleT::X(sampleNum[5]),
-                                    T::MultisampleT::X(sampleNum[4]),
-                                    T::MultisampleT::X(sampleNum[3]),
-                                    T::MultisampleT::X(sampleNum[2]),
-                                    T::MultisampleT::X(sampleNum[1]),
-                                    T::MultisampleT::X(sampleNum[0]));
-
-    __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]),
-                                    T::MultisampleT::Y(sampleNum[6]),
-                                    T::MultisampleT::Y(sampleNum[5]),
-                                    T::MultisampleT::Y(sampleNum[4]),
-                                    T::MultisampleT::Y(sampleNum[3]),
-                                    T::MultisampleT::Y(sampleNum[2]),
-                                    T::MultisampleT::Y(sampleNum[1]),
-                                    T::MultisampleT::Y(sampleNum[0]));
-    // add sample offset to UL pixel corner
-    vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
-    vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
-
-    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
-    static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
-    __m256i vInputCoveragei =  _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
-    __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
-
-    static const __m256i vZero = _simd_setzero_si();
-    const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
-    __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
-    __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
-    __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
-
-    __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
-
-    // set the centroid position based on results from above
-    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
-    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
-
-    // Case (3a) No samples covered and partial sample mask
-    __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
-    // sample mask should never be all 0's for this case, but handle it anyways
-    unsigned long firstCoveredSampleMaskSample = 0;
-    (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
-
-    __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
-
-    vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample));
-    vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample));
-
-    // blend in case 3a pixel locations
-    psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
-    psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
-}
-
-template<typename T>
-INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
-                                     const uint64_t *const coverageMask, const uint32_t sampleMask,
-                                     const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
-    if(T::bIsStandardPattern)
-    {
-        ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
-        CalcCentroidPos<T>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
-    }
-    else
-    {
-        static const __m256 pixelCenter = _simd_set1_ps(0.5f);
-        psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
-        psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
-    }
-    // evaluate I,J
-    psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
-    psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
-    psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
-    psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
-
-    // interpolate 1/w
-    psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
-}
-
-template<uint32_t NumRT, uint32_t sampleCountT>
-void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-                  const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask)
-{
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
-    uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
-    simdvector blendOut;
-
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        uint8_t *pColorSample;
-        if(sampleCount == SWR_MULTISAMPLE_1X)
-        {
-            pColorSample = pColorBase[rt];
-        }
-        else
-        {
-            pColorSample = pColorBase[rt] + rasterTileColorOffset;
-        }
-
-        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
-        // pfnBlendFunc may not update all channels.  Initialize with PS output.
-        /// TODO: move this into the blend JIT.
-        blendOut = psContext.shaded[rt];
-
-        // Blend outputs and update coverage mask for alpha test
-        if(pfnBlendFunc[rt] != nullptr)
-        {
-            pfnBlendFunc[rt](
-                pBlendState,
-                psContext.shaded[rt],
-                psContext.shaded[1],
-                sample,
-                pColorSample,
-                blendOut,
-                &psContext.oMask,
-                (simdscalari*)&coverageMask);
-        }
-
-        // final write mask 
-        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
-
-        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
-
-        const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
-
-        // store with color mask
-        if(!pRTBlend->writeDisableRed)
-        {
-            _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
-        }
-        if(!pRTBlend->writeDisableGreen)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
-        }
-        if(!pRTBlend->writeDisableBlue)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
-        }
-        if(!pRTBlend->writeDisableAlpha)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
-        }
-    }
-}
-
 template<typename T>
 void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
+    RDTSC_START(BESingleSampleBackend);
     RDTSC_START(BESetup);
 
     SWR_CONTEXT *pContext = pDC->pContext;
@@ -681,7 +469,6 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
     const SWR_RASTSTATE& rastState = state.rastState;
     const SWR_PS_STATE *pPSState = &state.psState;
     const SWR_BLEND_STATE *pBlendState = &state.blendState;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
     uint64_t coverageMask = work.coverageMask[0];
 
     // broadcast scalars
@@ -736,19 +523,19 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
 
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            if(T::bInputCoverage)
-            {
-                generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
-            }
-
             if(coverageMask & MASK)
             {
-                RDTSC_START(BEBarycentric);
                 psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
                 // pixel center
                 psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
-                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+                if(T::bInputCoverage)
+                {
+                    generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+                }
+
+                RDTSC_START(BEBarycentric);
+                CalcPixelBarycentrics(coeffs, psContext);
 
                 if(T::bCentroidPos)
                 {
@@ -763,11 +550,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
                 RDTSC_STOP(BEBarycentric, 0, 0);
 
                 simdmask clipCoverageMask = coverageMask & MASK;
-
                 // interpolate user clip distance if available
                 if(rastState.clipDistanceMask)
                 {
@@ -780,7 +565,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 simdscalar stencilPassMask = vCoverageMask;
 
                 // Early-Z?
-                if(CanEarlyZ(pPSState))
+                if(T::bCanEarlyZ)
                 {
                     RDTSC_START(BEEarlyDepthTest);
                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
@@ -812,7 +597,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
                 // late-Z
-                if(!CanEarlyZ(pPSState))
+                if(!T::bCanEarlyZ)
                 {
                     RDTSC_START(BELateDepthTest);
                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
@@ -834,8 +619,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
 
                 // output merger
                 RDTSC_START(BEOutputMerger);
-                backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc,
-                                             vCoverageMask, depthPassMask);
+                OutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
 
                 // do final depth write after all pixel kills
                 if (!pPSState->forceEarlyZ)
@@ -859,11 +643,13 @@ Endtile:
             RDTSC_STOP(BEEndTile, 0, 0);
         }
     }
+    RDTSC_STOP(BESingleSampleBackend, 0, 0);
 }
 
 template<typename T>
 void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
+    RDTSC_START(BESampleRateBackend);
     RDTSC_START(BESetup);
 
     SWR_CONTEXT *pContext = pDC->pContext;
@@ -871,7 +657,6 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     const SWR_RASTSTATE& rastState = state.rastState;
     const SWR_PS_STATE *pPSState = &state.psState;
     const SWR_BLEND_STATE *pBlendState = &state.blendState;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
 
     // broadcast scalars
     BarycentricCoeffs coeffs;
@@ -915,7 +700,6 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     psContext.recipDet = work.recipDet;
     psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
     psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
-    const uint32_t numSamples = T::MultisampleT::numSamples;
 
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
@@ -931,7 +715,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
             psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
             RDTSC_START(BEBarycentric);
-            backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+            CalcPixelBarycentrics(coeffs, psContext);
             RDTSC_STOP(BEBarycentric, 0, 0);
 
             if(T::bInputCoverage)
@@ -947,25 +731,21 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                 RDTSC_STOP(BEBarycentric, 0, 0);
             }
 
-            for(uint32_t sample = 0; sample < numSamples; sample++)
+            for(uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
             {
-                if (work.coverageMask[sample] & MASK)
+                simdmask coverageMask = work.coverageMask[sample] & MASK;
+                if (coverageMask)
                 {
                     RDTSC_START(BEBarycentric);
-
                     // calculate per sample positions
                     psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
                     psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
-                    
-                    simdmask coverageMask = work.coverageMask[sample] & MASK;
-                    simdscalar vCoverageMask = vMask(coverageMask);
 
-                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+                    CalcSampleBarycentrics(coeffs, psContext);
 
                     // interpolate and quantize z
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
                     RDTSC_STOP(BEBarycentric, 0, 0);
 
                     // interpolate user clip distance if available
@@ -974,16 +754,17 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                         coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
                             psContext.vI.sample, psContext.vJ.sample);
                     }
-                    
+
+                    simdscalar vCoverageMask = vMask(coverageMask);
                     simdscalar depthPassMask = vCoverageMask;
                     simdscalar stencilPassMask = vCoverageMask;
 
                     // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
+                    uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
 
                     // Early-Z?
-                    if (CanEarlyZ(pPSState))
+                    if (T::bCanEarlyZ)
                     {
                         RDTSC_START(BEEarlyDepthTest);
                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
@@ -1016,7 +797,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
                     // late-Z
-                    if (!CanEarlyZ(pPSState))
+                    if (!T::bCanEarlyZ)
                     {
                         RDTSC_START(BELateDepthTest);
                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
@@ -1040,8 +821,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
 
                     // output merger
                     RDTSC_START(BEOutputMerger);
-                    backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, 
-                                                 vCoverageMask, depthPassMask);
+                    OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, pPSState->numRenderTargets);
 
                     // do final depth write after all pixel kills
                     if (!pPSState->forceEarlyZ)
@@ -1064,11 +844,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
             RDTSC_STOP(BEEndTile, 0, 0);
         }
     }
+    RDTSC_STOP(BESampleRateBackend, 0, 0);
 }
 
 template<typename T>
 void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
+    RDTSC_START(BEPixelRateBackend);
     RDTSC_START(BESetup);
 
     SWR_CONTEXT *pContext = pDC->pContext;
@@ -1076,7 +858,6 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     const SWR_RASTSTATE& rastState = state.rastState;
     const SWR_PS_STATE *pPSState = &state.psState;
     const SWR_BLEND_STATE *pBlendState = &state.blendState;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
 
     // broadcast scalars
     BarycentricCoeffs coeffs;
@@ -1120,35 +901,25 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     psContext.pSamplePosX = (const float*)&T::MultisampleT::samplePosX;
     psContext.pSamplePosY = (const float*)&T::MultisampleT::samplePosY;
     psContext.sampleIndex = 0;
-
-    uint32_t numOMSamples;
-    // RT has to be single sample if we're in forcedMSAA mode
-    if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
-    {
-        numOMSamples = 1;
-    }
-    // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
-    else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
-    {
-        numOMSamples = GetNumSamples(pBlendState->sampleCount);
-    }
-    // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
-    else
-    {
-        numOMSamples = T::MultisampleT::numSamples;
-    }
     
+    PixelRateZTestLoop<T> PixelRateZTest(pDC, work, coeffs, state, pDepthBase, pStencilBase, rastState.clipDistanceMask);
+
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
         psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
         psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            simdscalar vZ[T::MultisampleT::numSamples]{ 0 };
+            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+
             psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
             // set pixel center positions
             psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
+            RDTSC_START(BEBarycentric);
+            CalcPixelBarycentrics(coeffs, psContext);
+            RDTSC_STOP(BEBarycentric, 0, 0);
+
             if (T::bInputCoverage)
             {
                 generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
@@ -1162,201 +933,109 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 RDTSC_STOP(BEBarycentric, 0, 0);
             }
 
-            // if oDepth written to, or there is a potential to discard any samples, we need to 
-            // run the PS early, then interp or broadcast Z and test
-            if(pPSState->writesODepth || pPSState->killsPixel)
+			simdscalar activeLanes;
+            if(T::bForcedSampleCount)
             {
-                RDTSC_START(BEBarycentric);
-                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
-
-                // interpolate and quantize z
-                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
-                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-                RDTSC_STOP(BEBarycentric, 0, 0);
+                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
+                activeLanes = _simd_and_ps(vMask(work.anyCoveredSamples & MASK), vSampleMask);
+            }
 
-                // execute pixel shader
-                RDTSC_START(BEPixelShader);
-                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                RDTSC_STOP(BEPixelShader, 0, 0);
+            // Early-Z?
+            if(T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                activeLanes = _simd_setzero_ps();
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
+                UPDATE_STAT(DepthPassCount, depthPassCount);
             }
-            else
+            // if we can't do early z, set the active mask to any samples covered in the current simd
+            else if(!T::bCanEarlyZ && !T::bForcedSampleCount)
             {
-                psContext.activeMask = _simd_set1_epi32(-1);
+                activeLanes = vMask(work.anyCoveredSamples & MASK);
             }
 
-            // need to declare enough space for all samples
-            simdscalar vCoverageMask[T::MultisampleT::numSamples];
-            simdscalar depthPassMask[T::MultisampleT::numSamples]; 
-            simdscalar stencilPassMask[T::MultisampleT::numSamples];
-            simdscalar anyDepthSamplePassed = _simd_setzero_ps();
-            simdscalar anyStencilSamplePassed = _simd_setzero_ps();
-            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+            // if we have no covered samples that passed depth at this point, go to next tile
+            if(!_simd_movemask_ps(activeLanes))
             {
-                vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
-
-                // pull mask back out for any discards and and with coverage
-                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask));
-
-                if (!_simd_movemask_ps(vCoverageMask[sample]))
-                {
-                    vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] =  _simd_setzero_ps();
-                    continue;
-                }
-
-                if(T::bForcedSampleCount)
-                {
-                    // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
-                    const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
-                    anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask));
-                    continue;
-                }
-
-                depthPassMask[sample] = vCoverageMask[sample];
-
-                // if oDepth isn't written to, we need to interpolate Z for each sample
-                // if clip distances are enabled, we need to interpolate for each sample
-                if(!pPSState->writesODepth || rastState.clipDistanceMask)
-                {
-                    RDTSC_START(BEBarycentric);
-                    if(T::bIsStandardPattern)
-                    {
-                        // calculate per sample positions
-                        psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
-                        psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
-                    }
-                    else
-                    {
-                        psContext.vX.sample = psContext.vX.center;
-                        psContext.vY.sample = psContext.vY.center;
-                    }
-
-                    // calc I & J per sample
-                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
-
-                    // interpolate and quantize z
-                    if (!pPSState->writesODepth)
-                    {
-                        vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
-                        vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
-                    }
-                    
-                    ///@todo: perspective correct vs non-perspective correct clipping?
-                    // interpolate clip distances
-                    if (rastState.clipDistanceMask)
-                    {
-                        uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
-                            psContext.vI.sample, psContext.vJ.sample);
-                        vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
-                    }
-                    RDTSC_STOP(BEBarycentric, 0, 0);
-                }
-                // else 'broadcast' and test psContext.vZ written from the PS each sample
-                else
-                {
-                    vZ[sample] = psContext.vZ;
-                }
-
-                // offset depth/stencil buffers current sample
-                uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
-                uint8_t * pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
-
-                // ZTest for this sample
-                RDTSC_START(BEEarlyDepthTest);
-                stencilPassMask[sample] = vCoverageMask[sample];
-                depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing,
-                                        vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
-                RDTSC_STOP(BEEarlyDepthTest, 0, 0);
-
-                anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
-                anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]);
-                uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
-                uint32_t statCount = _mm_popcnt_u32(statMask);
-                UPDATE_STAT(DepthPassCount, statCount);
+                goto Endtile;
             }
 
-            // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
-            if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed))
+            if(pPSState->usesSourceDepth)
             {
                 RDTSC_START(BEBarycentric);
-                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
                 // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                 RDTSC_STOP(BEBarycentric, 0, 0);
+            }
 
-                // execute pixel shader
-                RDTSC_START(BEPixelShader);
-                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                RDTSC_STOP(BEPixelShader, 0, 0);
+            // pixels that are currently active
+            psContext.activeMask = _simd_castps_si(activeLanes);
+            psContext.oMask = T::MultisampleT::FullSampleMask();
+
+            // execute pixel shader
+            RDTSC_START(BEPixelShader);
+            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+            UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+            RDTSC_STOP(BEPixelShader, 0, 0);
+
+            // update active lanes to remove any discarded or oMask'd pixels
+            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
+            if(!_simd_movemask_ps(activeLanes))
+            {
+                goto Endtile;
             }
-            ///@todo: make sure this works for kill pixel
-            else if(!_simd_movemask_ps(anyStencilSamplePassed))
+
+            // late-Z
+            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
+                UPDATE_STAT(DepthPassCount, depthPassCount);
+            }
+
+            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
+            if(!_simd_movemask_ps(activeLanes))
             {
                 goto Endtile;
             }
 
+            // output merger
             // loop over all samples, broadcasting the results of the PS to all passing pixels
-            for(uint32_t sample = 0; sample < numOMSamples; sample++)
+            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(pBlendState->sampleCount); sample++)
             {
-                uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
-                uint8_t * pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
-
-                // output merger
                 RDTSC_START(BEOutputMerger);
-
-                // skip if none of the pixels for this sample passed
-                simdscalar coverageMaskSample;
-                simdscalar depthMaskSample;
-                simdscalar stencilMaskSample;
-                simdscalar vInterpolatedZ;
-
-                // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
-                // depth test is disabled, so just set the z val to 0.
+                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
+                uint32_t coverageSampleNum = (T::bIsStandardPattern) ? sample : 0;
+                simdscalar coverageMask, depthMask;
                 if(T::bForcedSampleCount)
                 {
-                    coverageMaskSample = depthMaskSample = anyDepthSamplePassed;
-                    vInterpolatedZ = _simd_setzero_ps();
-                }
-                else if(T::bIsStandardPattern)
-                {
-                    if(!_simd_movemask_ps(depthPassMask[sample]))
-                    {
-                        depthPassMask[sample] = _simd_setzero_ps();
-                        DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample],
-                                          vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
-                        continue;
-                    }
-                    coverageMaskSample = vCoverageMask[sample];
-                    depthMaskSample = depthPassMask[sample];
-                    stencilMaskSample = stencilPassMask[sample];
-                    vInterpolatedZ = vZ[sample];
+                    coverageMask = depthMask = activeLanes;
                 }
                 else
                 {
-                    // center pattern only needs to use a single depth test as all samples are at the same position
-                    if(!_simd_movemask_ps(depthPassMask[0]))
+                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
+                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
+                    if(!_simd_movemask_ps(depthMask))
                     {
-                        depthPassMask[0] = _simd_setzero_ps();
-                        DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0],
-                                          vCoverageMask[0], pStencilSample, stencilPassMask[0]);
+                        // stencil should already have been written in early/lateZ tests
+                        RDTSC_STOP(BEOutputMerger, 0, 0);
                         continue;
                     }
-                    coverageMaskSample = (vCoverageMask[0]);
-                    depthMaskSample = depthPassMask[0];
-                    stencilMaskSample = stencilPassMask[0];
-                    vInterpolatedZ = vZ[0];
                 }
+                
+                // broadcast the results of the PS to all passing pixels
+                OutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, coverageMask, depthMask, pPSState->numRenderTargets);
 
-                // output merger
-                RDTSC_START(BEOutputMerger);
-                backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc,
-                                             coverageMaskSample, depthMaskSample);
+                if(!pPSState->forceEarlyZ && !T::bForcedSampleCount)
+                {
+                    uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+                    uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
 
-                DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample,
-                                  coverageMaskSample, pStencilSample, stencilMaskSample);
-                RDTSC_STOP(BEOutputMerger, 0, 0);
+                    DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
+                }
+                RDTSC_STOP(BEOutputMerger, 0, 0);        
             }
-
 Endtile:
             RDTSC_START(BEEndTile);
             for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
@@ -1364,6 +1043,7 @@ Endtile:
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
 
+            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
             pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
@@ -1374,18 +1054,19 @@ Endtile:
             RDTSC_STOP(BEEndTile, 0, 0);
         }
     }
+    RDTSC_STOP(BEPixelRateBackend, 0, 0);
 }
 // optimized backend flow with NULL PS
 template<uint32_t sampleCountT>
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
+    RDTSC_START(BENullBackend);
     ///@todo: handle center multisample pattern
     typedef SwrBackendTraits<sampleCountT, SWR_MSAA_STANDARD_PATTERN> T;
     RDTSC_START(BESetup);
 
     SWR_CONTEXT *pContext = pDC->pContext;
     const API_STATE& state = GetApiState(pDC);
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
     const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
 
     // broadcast scalars
@@ -1433,7 +1114,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     psContext.vX.sample = _simd_add_ps(vXSamplePosUL, T::MultisampleT::vX(sample));
                     psContext.vY.sample = _simd_add_ps(vYSamplePosUL, T::MultisampleT::vY(sample));
 
-                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+                    CalcSampleBarycentrics(coeffs, psContext);
 
                     // interpolate and quantize z
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
@@ -1452,8 +1133,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     simdscalar stencilPassMask = vCoverageMask;
 
                     // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBase + T::MultisampleT::RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBase + T::MultisampleT::RasterTileStencilOffset(sample);
+                    uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
 
                     RDTSC_START(BEEarlyDepthTest);
                     simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
@@ -1472,6 +1153,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
             pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
         }
     }
+    RDTSC_STOP(BENullBackend, 0, 0);
 }
 
 void InitClearTilesTable()
@@ -1486,57 +1168,21 @@ void InitClearTilesTable()
 }
 
 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
-PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {};
-PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {};
-PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {};
-PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {};
-PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {};
-PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {};
-
-// Recursive template used to auto-nest conditionals.  Converts dynamic enum function
-// arguments to static template arguments.
-template <uint32_t... ArgsT>
-struct OMChooser
-{
-    // Last Arg Terminator
-    static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg)
-    {
-        switch(tArg)
-        {
-        case SWR_MULTISAMPLE_1X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_1X>; break;
-        case SWR_MULTISAMPLE_2X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_2X>; break;
-        case SWR_MULTISAMPLE_4X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_4X>; break;
-        case SWR_MULTISAMPLE_8X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_8X>; break;
-        case SWR_MULTISAMPLE_16X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_16X>; break;
-        default:
-            SWR_ASSERT(0 && "Invalid sample count\n");
-            return nullptr;
-            break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs)
-    {
-        switch(tArg)
-        {
-        case 0: return OMChooser<ArgsT..., 0>::GetFunc(remainingArgs...); break;
-        case 1: return OMChooser<ArgsT..., 1>::GetFunc(remainingArgs...); break;
-        case 2: return OMChooser<ArgsT..., 2>::GetFunc(remainingArgs...); break;
-        case 3: return OMChooser<ArgsT..., 3>::GetFunc(remainingArgs...); break;
-        case 4: return OMChooser<ArgsT..., 4>::GetFunc(remainingArgs...); break;
-        case 5: return OMChooser<ArgsT..., 5>::GetFunc(remainingArgs...); break;
-        case 6: return OMChooser<ArgsT..., 6>::GetFunc(remainingArgs...); break;
-        case 7: return OMChooser<ArgsT..., 7>::GetFunc(remainingArgs...); break;
-        case 8: return OMChooser<ArgsT..., 8>::GetFunc(remainingArgs...); break;
-        default:
-            SWR_ASSERT(0 && "Invalid RT index\n");
-            return nullptr;
-            break;
-        }
-    }
-};
+PFN_BACKEND_FUNC gBackendSingleSample[2] // input coverage
+                                     [2] // centroid
+                                     [2] // canEarlyZ
+                                     = {};
+PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX]
+                                       [SWR_MSAA_SAMPLE_PATTERN_MAX]
+                                       [SWR_INPUT_COVERAGE_MAX]
+                                       [2] // centroid
+                                       [2] // forcedSampleCount
+                                       [2] // canEarlyZ
+                                       = {};
+PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX]
+                                        [2] // centroid
+                                        [2] // canEarlyZ
+                                        = {};
 
 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
 // arguments to static template arguments.
@@ -1604,83 +1250,72 @@ struct BEChooser
     }
 };
 
-template <uint32_t numRenderTargets, SWR_MULTISAMPLE_COUNT numSampleRates>
-void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates])
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[2][2][2])
 {
-    for(uint32_t rtNum = SWR_ATTACHMENT_COLOR0; rtNum < numRenderTargets; rtNum++)
+    for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
     {
-        for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
+        for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
         {
-            table[rtNum][sampleCount] =
-                OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rtNum, (SWR_MULTISAMPLE_COUNT)sampleCount);
+            for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+            {
+                table[inputCoverage][isCentroid][canEarlyZ] =
+                    BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL),
+                                         (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
+            }
         }
     }
 }
 
-template <SWR_MULTISAMPLE_COUNT numSampleRates>
-void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2], 
-                                   PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2])
+void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX]
+                                                        [2][2][2])
 {
-    pixelTable[0] = CalcPixelBarycentrics<0>;
-    pixelTable[1] = CalcPixelBarycentrics<1>;
-
-    sampleTable[0] = CalcSampleBarycentrics<0>;
-    sampleTable[1] = CalcSampleBarycentrics<1>;
-}
-
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2])
-{
-    gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false, false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-    gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false, true, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-    gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, true, false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-    gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, true, true, false, false,(SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE);
-}
-
-template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes>
-void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2])
-{
-    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
+    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++)
     {
-        for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < numSamplePatterns; samplePattern++)
+        for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_MAX; samplePattern++)
         {
-            for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
+            for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
             {
                 for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
                 {
-                    table[sampleCount][samplePattern][inputCoverage][isCentroid][0] =
-                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), (isCentroid > 0),
-                                                     false, false, SWR_BACKEND_MSAA_PIXEL_RATE);
-                    table[sampleCount][samplePattern][inputCoverage][isCentroid][1] =
-                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), (isCentroid > 0),
-                                             true, false, SWR_BACKEND_MSAA_PIXEL_RATE);
+                    for(uint32_t forcedSampleCount = 0; forcedSampleCount < 2; forcedSampleCount++)
+                    {
+                        for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+                        {
+                            table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] =
+                                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), 
+                                                        (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE);
+                        }
+                    }
                 }
             }
         }
     }
 }
 
-template <uint32_t numSampleRates, uint32_t numCoverageModes>
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2])
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2])
 {
-    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++)
+    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_MAX; sampleCount++)
     {
-        for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++)
+        for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < SWR_INPUT_COVERAGE_MAX; inputCoverage++)
         {
-            table[sampleCount][inputCoverage][0] =
-                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), false, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
-            table[sampleCount][inputCoverage][1] =
-                BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), true, false, false, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+            for(uint32_t centroid = 0; centroid < 2; centroid++)
+            {
+                for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+                {
+                    table[sampleCount][inputCoverage][centroid][canEarlyZ] =
+                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage == SWR_INPUT_COVERAGE_NORMAL), 
+                                             (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+                }
+            }
         }
     }
 }
 
 void InitBackendFuncTables()
 {    
-    InitBackendSampleFuncTable(gBackendSingleSample);
-    InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable);
-    InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable);
-    InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable);
-    InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable);
+    InitBackendSingleFuncTable(gBackendSingleSample);
+    InitBackendPixelFuncTable(gBackendPixelRateTable);
+    InitBackendSampleFuncTable(gBackendSampleRateTable);
 
     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
     gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 022e60a..24ba69e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -31,6 +31,7 @@
 #include "common/os.h"
 #include "core/context.h"
 #include "core/multisample.h"
+#include "rdtsc_core.h"
 
 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
@@ -43,6 +44,7 @@ void InitClearTilesTable();
 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
 void InitBackendFuncTables();
 void InitCPSFuncTables();
+void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
 
 enum SWR_BACKEND_FUNCS
 {
@@ -60,6 +62,78 @@ extern const __m256 vULOffsetsY;
 #define MASK 0xff
 #endif
 
+INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+{
+    static const uint32_t RasterTileColorOffsets[16]
+    { 0,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
+    };
+    assert(sampleNum < 16);
+    return RasterTileColorOffsets[sampleNum];
+}
+
+INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+{
+    static const uint32_t RasterTileDepthOffsets[16]
+    { 0,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
+    };
+    assert(sampleNum < 16);
+    return RasterTileDepthOffsets[sampleNum];
+}
+
+INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+{
+    static const uint32_t RasterTileStencilOffsets[16]
+    { 0,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
+    };
+    assert(sampleNum < 16);
+    return RasterTileStencilOffsets[sampleNum];
+}
+
 template<typename T>
 INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
 {
@@ -209,14 +283,328 @@ INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &in
     inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Centroid behaves exactly as follows :
+// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to 
+//     have a sample location there).
+// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the 
+//     coverage with the SampleMask Rasterizer State.
+// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is 
+//     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the 
+//     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename T>
+INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
+                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+    uint32_t inputMask[KNOB_SIMD_WIDTH];
+    generateInputCoverage<T>(coverageMask, inputMask, sampleMask);
+
+    // Case (2) - partially covered pixel
+
+    // scan for first covered sample per pixel in the 4x2 span
+    unsigned long sampleNum[KNOB_SIMD_WIDTH];
+    (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
+    (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
+    (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
+    (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
+    (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
+    (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
+    (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
+    (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
+
+    // look up and set the sample offsets from UL pixel corner for first covered sample 
+    __m256 vXSample = _mm256_set_ps(T::MultisampleT::X(sampleNum[7]),
+                                    T::MultisampleT::X(sampleNum[6]),
+                                    T::MultisampleT::X(sampleNum[5]),
+                                    T::MultisampleT::X(sampleNum[4]),
+                                    T::MultisampleT::X(sampleNum[3]),
+                                    T::MultisampleT::X(sampleNum[2]),
+                                    T::MultisampleT::X(sampleNum[1]),
+                                    T::MultisampleT::X(sampleNum[0]));
+
+    __m256 vYSample = _mm256_set_ps(T::MultisampleT::Y(sampleNum[7]),
+                                    T::MultisampleT::Y(sampleNum[6]),
+                                    T::MultisampleT::Y(sampleNum[5]),
+                                    T::MultisampleT::Y(sampleNum[4]),
+                                    T::MultisampleT::Y(sampleNum[3]),
+                                    T::MultisampleT::Y(sampleNum[2]),
+                                    T::MultisampleT::Y(sampleNum[1]),
+                                    T::MultisampleT::Y(sampleNum[0]));
+    // add sample offset to UL pixel corner
+    vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
+    vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
+
+    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
+    static const __m256i vFullyCoveredMask = T::MultisampleT::FullSampleMask();
+    __m256i vInputCoveragei =  _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+    __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
+
+    static const __m256i vZero = _simd_setzero_si();
+    const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+    __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+    __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+    __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+
+    __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
+
+    // set the centroid position based on results from above
+    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
+    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
+
+    // Case (3a) No samples covered and partial sample mask
+    __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
+    // sample mask should never be all 0's for this case, but handle it anyways
+    unsigned long firstCoveredSampleMaskSample = 0;
+    (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
+
+    __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
+
+    vXSample = _simd_set1_ps(T::MultisampleT::X(firstCoveredSampleMaskSample));
+    vYSample = _simd_set1_ps(T::MultisampleT::Y(firstCoveredSampleMaskSample));
+
+    // blend in case 3a pixel locations
+    psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
+    psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
+}
+
+template<typename T>
+INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
+                                     const uint64_t *const coverageMask, const uint32_t sampleMask,
+                                     const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+    if(T::bIsStandardPattern)
+    {
+        ///@ todo: don't need to generate input coverage 2x if input coverage and centroid
+        CalcCentroidPos<T>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL);
+    }
+    else
+    {
+        static const __m256 pixelCenter = _simd_set1_ps(0.5f);
+        psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter);
+        psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter);
+    }
+    // evaluate I,J
+    psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
+    psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
+    psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
+    psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
+
+    // interpolate 1/w
+    psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
+}
+
+template<typename T>
+INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
+{
+    // RT has to be single sample if we're in forcedMSAA mode
+    if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
+    {
+        return 1;
+    }
+    // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
+    else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
+    {
+        return GetNumSamples(blendSampleCount);
+    }
+    // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
+    else
+    {
+        return T::MultisampleT::numSamples;
+    }
+}
+
+template<typename T>
+struct PixelRateZTestLoop
+{
+    PixelRateZTestLoop(DRAW_CONTEXT *DC, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState, 
+                       uint8_t*& depthBase, uint8_t*& stencilBase, const uint8_t ClipDistanceMask) :
+                       work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
+                       clipDistanceMask(ClipDistanceMask), pDepthBase(depthBase), pStencilBase(stencilBase) {};
+           
+    INLINE
+    uint32_t operator()(simdscalar& anyDepthSamplePassed, SWR_PS_CONTEXT& psContext, 
+                        const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
+    {
+        uint32_t statCount = 0;
+        for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+        {
+            const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
+            vCoverageMask[sample] = vMask(pCoverageMask[currentSimdIn8x8] & MASK);
+
+            if(!_simd_movemask_ps(vCoverageMask[sample]))
+            {
+                vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
+                continue;
+            }
+
+            RDTSC_START(BEBarycentric);
+            // calculate per sample positions
+            psContext.vX.sample = _simd_add_ps(psContext.vX.UL, T::MultisampleT::vX(sample));
+            psContext.vY.sample = _simd_add_ps(psContext.vY.UL, T::MultisampleT::vY(sample));
+
+            // calc I & J per sample
+            CalcSampleBarycentrics(coeffs, psContext);
+
+            if(psState.writesODepth)
+            {
+                // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
+                vZ[sample] = psContext.vZ;
+            }
+            else
+            {
+                vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
+            }
+            RDTSC_STOP(BEBarycentric, 0, 0);
+
+            ///@todo: perspective correct vs non-perspective correct clipping?
+            // if clip distances are enabled, we need to interpolate for each sample
+            if(clipDistanceMask)
+            {
+                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer,
+                                                       psContext.vI.sample, psContext.vJ.sample);
+                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
+            }
+
+            // offset depth/stencil buffers current sample
+            uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
+            uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);
+
+            // ZTest for this sample
+            RDTSC_START(BEDepthBucket);
+            depthPassMask[sample] = vCoverageMask[sample];
+            stencilPassMask[sample] = vCoverageMask[sample];
+            depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample, 
+                                                     vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
+            RDTSC_STOP(BEDepthBucket, 0, 0);
+
+            // early-exit if no pixels passed depth or earlyZ is forced on
+            if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
+            {
+                DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
+                                  pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
+
+                if(!_simd_movemask_ps(depthPassMask[sample]))
+                {
+                    continue;
+                }
+            }
+            anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
+            uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
+            statCount += _mm_popcnt_u32(statMask);
+        }
+        // return number of samples that passed depth and coverage
+        return statCount;
+    }
+
+    // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
+    simdscalar vZ[T::MultisampleT::numCoverageSamples];
+    simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
+    simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
+    simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
+
+private:
+    // functor inputs
+    const SWR_TRIANGLE_DESC& work;
+    const BarycentricCoeffs& coeffs;
+    const API_STATE& state;
+    const SWR_PS_STATE& psState;
+    const uint8_t clipDistanceMask;
+    uint8_t*& pDepthBase;
+    uint8_t*& pStencilBase;
+};
+
+INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+    // evaluate I,J
+    psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
+    psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
+    psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
+    psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
+
+    // interpolate 1/w
+    psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
+}
+
+INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+    // evaluate I,J
+    psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
+    psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
+    psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
+    psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
+
+    // interpolate 1/w
+    psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
+}
+
+INLINE void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
+                         const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
+{
+    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+    const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
+    simdvector blendOut;
+
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
+
+        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+        // pfnBlendFunc may not update all channels.  Initialize with PS output.
+        /// TODO: move this into the blend JIT.
+        blendOut = psContext.shaded[rt];
+
+        // Blend outputs and update coverage mask for alpha test
+        if(pfnBlendFunc[rt] != nullptr)
+        {
+            pfnBlendFunc[rt](
+                pBlendState,
+                psContext.shaded[rt],
+                psContext.shaded[1],
+                sample,
+                pColorSample,
+                blendOut,
+                &psContext.oMask,
+                (simdscalari*)&coverageMask);
+        }
+
+        // final write mask 
+        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
+
+        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
+        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+
+        const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
+
+        // store with color mask
+        if(!pRTBlend->writeDisableRed)
+        {
+            _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
+        }
+        if(!pRTBlend->writeDisableGreen)
+        {
+            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
+        }
+        if(!pRTBlend->writeDisableBlue)
+        {
+            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
+        }
+        if(!pRTBlend->writeDisableAlpha)
+        {
+            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
+        }
+    }
+}
+
 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SWR_MSAA_STANDARD_PATTERN,
-         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t odepth = 0>
+         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0>
 struct SwrBackendTraits
 {
     static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN);
     static const bool bInputCoverage = (coverage == 1);
     static const bool bCentroidPos = (centroid == 1);
     static const bool bForcedSampleCount = (forced == 1);
-    static const bool bWritesODepth = (odepth == 1);
+    static const bool bCanEarlyZ = (canEarlyZ == 1);
     typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, (bIsStandardPattern) ? SWR_MSAA_STANDARD_PATTERN : SWR_MSAA_CENTER_PATTERN> MultisampleT;
-};
\ No newline at end of file
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 540c690..03e5837 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -357,13 +357,8 @@ typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_C
 struct BACKEND_FUNCS
 {
     PFN_BACKEND_FUNC pfnBackend;
-    PFN_CALC_PIXEL_BARYCENTRICS pfnCalcPixelBarycentrics;
-    PFN_CALC_SAMPLE_BARYCENTRICS pfnCalcSampleBarycentrics;
-    PFN_CALC_CENTROID_BARYCENTRICS pfnCalcCentroidBarycentrics;
-    PFN_OUTPUT_MERGER pfnOutputMerger;
 };
 
-
 // Draw State
 struct DRAW_STATE
 {
diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h
index c5096ed..7213a38 100644
--- a/src/gallium/drivers/swr/rasterizer/core/multisample.h
+++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h
@@ -65,9 +65,6 @@ struct MultisampleTraits
     INLINE static float Y(uint32_t sampleNum) = delete;
     INLINE static __m128i TileSampleOffsetsX() = delete;
     INLINE static __m128i TileSampleOffsetsY() = delete;
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) = delete;
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) = delete;
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) = delete;
     INLINE static simdscalari FullSampleMask() = delete;
 
     static const uint32_t numSamples = 0;
@@ -121,21 +118,6 @@ struct MultisampleTraits<SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN>
         return tileSampleOffsetY;
     }
 
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-    {
-        return 0;
-    }
-
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-    {
-        return 0;
-    }
-
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-    {
-        return 0;
-    }
-
     INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);};
 
     static const uint32_t samplePosXi {0x80};
@@ -185,21 +167,6 @@ struct MultisampleTraits<SWR_MULTISAMPLE_1X, SWR_MSAA_CENTER_PATTERN>
         return _mm_set1_epi32(0x80);
     }
 
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-    {
-        return 0;
-    }
-
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-    {
-        return 0;
-    }
-
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-    {
-        return 0;
-    }
-
     INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);};
     
     static const uint32_t numSamples = 1;
@@ -261,36 +228,6 @@ struct MultisampleTraits<SWR_MULTISAMPLE_2X, SWR_MSAA_STANDARD_PATTERN>
         return tileSampleOffsetY;
     }
 
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileColorOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileColorOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileDepthOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileDepthOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileStencilOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileStencilOffsets[sampleNum];
-    }
-
     INLINE static simdscalari FullSampleMask()
     {
          static const simdscalari mask =_simd_set1_epi32(0x3);
@@ -344,36 +281,6 @@ struct MultisampleTraits<SWR_MULTISAMPLE_2X, SWR_MSAA_CENTER_PATTERN>
         return _mm_set1_epi32(0x80);
     }
 
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileColorOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileColorOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileDepthOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileDepthOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileStencilOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileStencilOffsets[sampleNum];
-    }
-
     INLINE static simdscalari FullSampleMask()
     {
          static const simdscalari mask =_simd_set1_epi32(0x3);
@@ -442,42 +349,6 @@ struct MultisampleTraits<SWR_MULTISAMPLE_4X, SWR_MSAA_STANDARD_PATTERN>
         return tileSampleOffsetY;
     }
 
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileColorOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileColorOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileDepthOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileDepthOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileStencilOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileStencilOffsets[sampleNum];
-    }
-
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xF);
@@ -531,42 +402,6 @@ struct MultisampleTraits<SWR_MULTISAMPLE_4X, SWR_MSAA_CENTER_PATTERN>
         return _mm_set1_epi32(0x80);
     }
 
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileColorOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileColorOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileDepthOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileDepthOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileStencilOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileStencilOffsets[sampleNum];
-    }
-
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xF);
@@ -639,54 +474,6 @@ struct MultisampleTraits<SWR_MULTISAMPLE_8X, SWR_MSAA_STANDARD_PATTERN>
         return tileSampleOffsetY;
     }
 
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileColorOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileColorOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileDepthOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileDepthOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileStencilOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileStencilOffsets[sampleNum];
-    }
-
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xFF);
@@ -740,54 +527,6 @@ struct MultisampleTraits<SWR_MULTISAMPLE_8X, SWR_MSAA_CENTER_PATTERN>
         return _mm_set1_epi32(0x80);
     }
 
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileColorOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileColorOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileDepthOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileDepthOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileStencilOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileStencilOffsets[sampleNum];
-    }
-
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xFF);
@@ -868,78 +607,6 @@ struct MultisampleTraits<SWR_MULTISAMPLE_16X, SWR_MSAA_STANDARD_PATTERN>
         return tileSampleOffsetY;
     }
 
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileColorOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileColorOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileDepthOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileDepthOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileStencilOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileStencilOffsets[sampleNum];
-    }
-
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xFFFF);
@@ -992,79 +659,7 @@ struct MultisampleTraits<SWR_MULTISAMPLE_16X, SWR_MSAA_CENTER_PATTERN>
         // BR,             BL,             UR,          UL
         return _mm_set1_epi32(0x80);
     }
-
-    INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileColorOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileColorOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileDepthOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileDepthOffsets[sampleNum];
-    }
-
-    INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-    {
-        static const uint32_t RasterTileStencilOffsets[numSamples]
-        { 0,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
-          (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
-        };
-        assert(sampleNum < numSamples);
-        return RasterTileStencilOffsets[sampleNum];
-    }
-
+    
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xFFFF);
diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
index 4b6b536..df8bad3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
@@ -77,6 +77,10 @@ BUCKET_DESC gCoreBuckets[] = {
     { "BEBarycentric", "", false, 0xffffffff },
     { "BEEarlyDepthTest", "", false, 0xffffffff },
     { "BEPixelShader", "", false, 0xffffffff },
+    { "BESingleSampleBackend", "", false, 0xffffffff },
+    { "BEPixelRateBackend", "", false, 0xffffffff },
+    { "BESampleRateBackend", "", false, 0xffffffff },
+    { "BENullBackend", "", false, 0xffffffff },
     { "BELateDepthTest", "", false, 0xffffffff },
     { "BEOutputMerger", "", false, 0xffffffff },
     { "BEStoreTiles", "", true, 0xff00cccc },
diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
index 5fcc40b..e1dde61 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
@@ -82,6 +82,10 @@ enum CORE_BUCKETS
     BEBarycentric,
     BEEarlyDepthTest,
     BEPixelShader,
+    BESingleSampleBackend,
+    BEPixelRateBackend,
+    BESampleRateBackend,
+    BENullBackend,
     BELateDepthTest,
     BEOutputMerger,
     BEStoreTiles,
-- 
1.9.1



More information about the mesa-dev mailing list