[Mesa-dev] [PATCH 1/8] swr/rast: Split backend.cpp to improve compile time

Tim Rowley timothy.o.rowley at intel.com
Thu Jun 22 21:13:00 UTC 2017


Hardcode split to four files currently.  Decreases swr build
time on a quad-core by ~10%.
---
 src/gallium/drivers/swr/Makefile.am                |   26 +-
 src/gallium/drivers/swr/Makefile.sources           |    4 +
 src/gallium/drivers/swr/SConscript                 |   19 +-
 .../drivers/swr/rasterizer/codegen/gen_backends.py |   38 +-
 .../drivers/swr/rasterizer/codegen/gen_common.py   |    7 +
 .../rasterizer/codegen/templates/gen_backend.cpp   |    1 +
 .../codegen/templates/gen_header_init.hpp          |   43 +
 src/gallium/drivers/swr/rasterizer/core/api.cpp    |    7 +-
 .../drivers/swr/rasterizer/core/backend.cpp        |  809 +--------------
 src/gallium/drivers/swr/rasterizer/core/backend.h  | 1033 +------------------
 .../drivers/swr/rasterizer/core/backend_clear.cpp  |  281 ++++++
 .../drivers/swr/rasterizer/core/backend_impl.h     | 1067 ++++++++++++++++++++
 .../drivers/swr/rasterizer/core/backend_sample.cpp |  345 +++++++
 .../swr/rasterizer/core/backend_singlesample.cpp   |  321 ++++++
 14 files changed, 2160 insertions(+), 1841 deletions(-)
 create mode 100644 src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
 create mode 100644 src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
 create mode 100644 src/gallium/drivers/swr/rasterizer/core/backend_impl.h
 create mode 100644 src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
 create mode 100644 src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp

diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am
index 6650abd..0daec90 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -34,6 +34,7 @@ COMMON_CXXFLAGS = \
 	$(LLVM_CXXFLAGS) \
 	$(SWR_CXX11_CXXFLAGS) \
 	-I$(builddir)/rasterizer/codegen \
+	-I$(builddir)/rasterizer/core \
 	-I$(builddir)/rasterizer/jitter \
 	-I$(builddir)/rasterizer/archrast \
 	-I$(srcdir)/rasterizer \
@@ -62,7 +63,11 @@ BUILT_SOURCES = \
 	rasterizer/archrast/gen_ar_event.cpp \
 	rasterizer/archrast/gen_ar_eventhandler.hpp \
 	rasterizer/archrast/gen_ar_eventhandlerfile.hpp \
-	rasterizer/core/gen_BackendPixelRate0.cpp
+	rasterizer/core/backends/gen_BackendPixelRate0.cpp \
+	rasterizer/core/backends/gen_BackendPixelRate1.cpp \
+	rasterizer/core/backends/gen_BackendPixelRate2.cpp \
+	rasterizer/core/backends/gen_BackendPixelRate3.cpp \
+	rasterizer/core/backends/gen_BackendPixelRate.hpp
 
 MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
@@ -140,20 +145,30 @@ rasterizer/archrast/gen_ar_eventhandlerfile.hpp: rasterizer/codegen/gen_archrast
 		--output rasterizer/archrast/gen_ar_eventhandlerfile.hpp \
 		--gen_eventhandlerfile_h
 
+rasterizer/core/backends/gen_BackendPixelRate0.cpp \
+rasterizer/core/backends/gen_BackendPixelRate1.cpp \
+rasterizer/core/backends/gen_BackendPixelRate2.cpp \
+rasterizer/core/backends/gen_BackendPixelRate3.cpp \
+rasterizer/core/backends/gen_BackendPixelRate.hpp: \
+backend.intermediate
+
 # 5 SWR_MULTISAMPLE_TYPE_COUNT
 # 2 SWR_MSAA_SAMPLE_PATTERN_COUNT
 # 3 SWR_INPUT_COVERAGE_COUNT
 # 2 centroid
 # 2 forcedSampleCount
 # 2 canEarlyZ
-rasterizer/core/gen_BackendPixelRate0.cpp: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_backend.cpp
+
+.INTERMEDIATE: backend.intermediate
+backend.intermediate: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_backend.cpp rasterizer/codegen/templates/gen_header_init.hpp
 	$(MKDIR_GEN)
 	$(PYTHON_GEN) \
 		$(srcdir)/rasterizer/codegen/gen_backends.py \
-		--outdir rasterizer/core \
+		--outdir rasterizer/core/backends \
 		--dim 5 2 3 2 2 2 \
-		--split 0 \
-		--cpp
+		--numfiles 4 \
+		--cpp \
+		--hpp
 
 COMMON_LIBADD = \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
@@ -227,5 +242,6 @@ EXTRA_DIST = \
 	rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp \
 	rasterizer/codegen/templates/gen_backend.cpp \
 	rasterizer/codegen/templates/gen_builder.hpp \
+	rasterizer/codegen/templates/gen_header_init.hpp \
 	rasterizer/codegen/templates/gen_knobs.cpp \
 	rasterizer/codegen/templates/gen_llvm.hpp
diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources
index a1172b7..d9894c2 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -73,7 +73,11 @@ CORE_CXX_SOURCES := \
 	rasterizer/core/api.h \
 	rasterizer/core/arena.h \
 	rasterizer/core/backend.cpp \
+	rasterizer/core/backend_clear.cpp \
+	rasterizer/core/backend_sample.cpp \
+	rasterizer/core/backend_singlesample.cpp \
 	rasterizer/core/backend.h \
+	rasterizer/core/backend_impl.h \
 	rasterizer/core/binner.cpp \
 	rasterizer/core/binner.h \
 	rasterizer/core/blend.h \
diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript
index cdb85e2..0f3cd6c 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -140,12 +140,22 @@ Depends('rasterizer/jitter/gen_state_llvm.h',
 # 2 centroid
 # 2 forcedSampleCount
 # 2 canEarlyZ
+backendPixelRateFileCount = 4
+backendPixelRateFilePat = "rasterizer/core/backends/gen_BackendPixelRate%s.cpp"
+backendPixelRateFiles = map(lambda x: backendPixelRateFilePat % x,
+                            range(0, backendPixelRateFileCount))
 env.CodeGenerate(
-    target = 'rasterizer/core/gen_BackendPixelRate0.cpp',
+    target = 'rasterizer/core/backends/gen_BackendPixelRate.hpp',
     script = swrroot + 'rasterizer/codegen/gen_backends.py',
     source = '',
-    command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core --dim 5 2 3 2 2 2 --split 0 --cpp'
-)
+    command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core/backends --dim 5 2 3 2 2 2 --numfiles ' + str(backendPixelRateFileCount) + ' --cpp --hpp'
+    )
+Depends(backendPixelRateFiles,
+        ['rasterizer/core/backends/gen_BackendPixelRate.hpp',
+         'rasterizer/archrast/gen_ar_event.hpp',
+         'rasterizer/codegen/gen_knobs.h']
+        )
+
 Depends('rasterizer/jitter/gen_state_llvm.h',
         swrroot + 'rasterizer/codegen/templates/gen_backend.cpp')
 
@@ -153,9 +163,10 @@ Depends('rasterizer/jitter/gen_state_llvm.h',
 built_sources = [
     'rasterizer/codegen/gen_knobs.cpp',
     'rasterizer/archrast/gen_ar_event.cpp',
-    'rasterizer/core/gen_BackendPixelRate0.cpp',
     ]
 
+built_sources += backendPixelRateFiles
+
 source = built_sources
 source += env.ParseSourceList(swrroot + 'Makefile.sources', [
     'CXX_SOURCES',
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
index d9e938a..329a6e7 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
@@ -1,7 +1,7 @@
 # Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the "Software"),
+# copy of this software and associated documentation files (the 'Software'),
 # to deal in the Software without restriction, including without limitation
 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 # and/or sell copies of the Software, and to permit persons to whom the
@@ -11,7 +11,7 @@
 # paragraph) shall be included in all copies or substantial portions of the
 # Software.
 #
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
@@ -31,23 +31,28 @@ from gen_common import ArgumentParser, MakoTemplateWriter
 
 def main(args=sys.argv[1:]):
     thisDir = os.path.dirname(os.path.realpath(__file__))
-    parser = ArgumentParser("Generate files and initialization functions for all permutuations of BackendPixelRate.")
-    parser.add_argument('--dim', help="gBackendPixelRateTable array dimensions", nargs='+', type=int, required=True)
-    parser.add_argument('--outdir', help="output directory", nargs='?', type=str, default=thisDir)
-    parser.add_argument('--split', help="how many lines of initialization per file [0=no split]", nargs='?', type=int, default='512')
-    parser.add_argument('--cpp', help="Generate cpp file(s)", action='store_true', default=False)
-    parser.add_argument('--cmake', help="Generate cmake file", action='store_true', default=False)
+    parser = ArgumentParser('Generate files and initialization functions for all permutuations of BackendPixelRate.')
+    parser.add_argument('--dim', help='gBackendPixelRateTable array dimensions', nargs='+', type=int, required=True)
+    parser.add_argument('--outdir', help='output directory', nargs='?', type=str, default=thisDir)
+    parser.add_argument('--split', help='how many lines of initialization per file [0=no split]', nargs='?', type=int, default='512')
+    parser.add_argument('--numfiles', help='how many output files to generate', nargs='?', type=int, default='0')
+    parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False)
+    parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False)
+    parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False)
 
     args = parser.parse_args(args);
 
     class backendStrs :
         def __init__(self) :
             self.outFileName = 'gen_BackendPixelRate%s.cpp'
+            self.outHeaderName = 'gen_BackendPixelRate.hpp'
             self.functionTableName = 'gBackendPixelRateTable'
             self.funcInstanceHeader = ' = BackendPixelRate<SwrBackendTraits<'
             self.template = 'gen_backend.cpp'
+            self.hpp_template = 'gen_header_init.hpp'
             self.cmakeFileName = 'gen_backends.cmake'
             self.cmakeSrcVar = 'GEN_BACKEND_SOURCES'
+            self.tableName = 'BackendPixelRate'
 
     backend = backendStrs()
 
@@ -77,6 +82,8 @@ def main(args=sys.argv[1:]):
         numFiles = 1
     else:
         numFiles = (len(output_list) + args.split - 1) // args.split
+    if (args.numfiles != 0):
+        numFiles = args.numfiles
     linesPerFile = (len(output_list) + numFiles - 1) // numFiles
     chunkedList = [output_list[x:x+linesPerFile] for x in range(0, len(output_list), linesPerFile)]
 
@@ -95,6 +102,19 @@ def main(args=sys.argv[1:]):
                 fileNum=fileNum,
                 funcList=chunkedList[fileNum])
 
+    if args.hpp:
+        baseHppName = os.path.join(args.outdir, backend.outHeaderName)
+        templateHpp = os.path.join(thisDir, 'templates', backend.hpp_template)
+
+        #print('Generating', filename)
+        MakoTemplateWriter.to_file(
+            templateHpp,
+            baseHppName,
+            cmdline=sys.argv,
+            numFiles=numFiles,
+            filename=backend.outHeaderName,
+            tableName=backend.tableName)
+
     # generate gen_backend.cmake file
     if args.cmake:
         templateCmake = os.path.join(thisDir, 'templates', 'gen_backend.cmake')
@@ -108,7 +128,7 @@ def main(args=sys.argv[1:]):
             numFiles=numFiles,
             baseCppName='${RASTY_GEN_SRC_DIR}/backends/' + os.path.basename(baseCppName))
 
-    #print("Generated %d template instantiations in %d files" % (len(output_list), numFiles))
+    #print('Generated %d template instantiations in %d files' % (len(output_list), numFiles))
 
     return 0
 
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
index 07b455a..7f53ec6 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
@@ -22,6 +22,7 @@
 # Python source
 from __future__ import print_function
 import os
+import errno
 import sys
 import argparse
 from mako.template import Template
@@ -62,6 +63,12 @@ class MakoTemplateWriter:
         '''
             Write template data to a file
         '''
+        if not os.path.exists(os.path.dirname(output_filename)):
+            try:
+                os.makedirs(os.path.dirname(output_filename))
+            except OSError as err:
+                if err.errno != errno.EEXIST:
+                    raise
         with open(output_filename, 'w') as outfile:
             print(MakoTemplateWriter.to_string(template_filename, **kwargs), file=outfile)
 
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
index 4eb4ad4..088b1cd 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
@@ -32,6 +32,7 @@
 //============================================================================
 
 #include "core/backend.h"
+#include "core/backend_impl.h"
 
 void InitBackendPixelRate${fileNum}()
 {
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
new file mode 100644
index 0000000..5625ef8
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
@@ -0,0 +1,43 @@
+//============================================================================
+// Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice (including the next
+// paragraph) shall be included in all copies or substantial portions of the
+// Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+// IN THE SOFTWARE.
+// 
+// @file ${filename}
+// 
+// @brief auto-generated file
+// 
+// DO NOT EDIT
+//
+// Generation Command Line:
+//  ${'\n//    '.join(cmdline)}
+//
+//============================================================================
+
+%for num in range(numFiles):
+void Init${tableName}${num}();
+%endfor
+
+static INLINE void Init${tableName}()
+{
+    %for num in range(numFiles):
+    Init${tableName}${num}();
+    %endfor
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index ae9ced2..cf895fb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -784,9 +784,6 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
 
 
 // templated backend function tables
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2];
 
 void SetupPipeline(DRAW_CONTEXT *pDC)
 {
@@ -838,7 +835,9 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
             break;
         }
     }
-    
+
+    SWR_ASSERT(backendFuncs.pfnBackend);
+
     PFN_PROCESS_PRIMS pfnBinner;
 #if USE_SIMD16_FRONTEND
     PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 8c9449b..fe11cdf 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -30,15 +30,14 @@
 #include <smmintrin.h>
 
 #include "backend.h"
+#include "backend_impl.h"
 #include "tilemgr.h"
 #include "memory/tilingtraits.h"
 #include "core/multisample.h"
+#include "backends/gen_BackendPixelRate.hpp"
 
 #include <algorithm>
 
-typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
-static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
-
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Process compute work.
@@ -103,238 +102,6 @@ void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     SWR_ASSERT(x == 0 && y == 0);
 }
 
-template<SWR_FORMAT format>
-void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
-{
-    auto lambda = [&](int32_t comp)
-    {
-        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
-
-        pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
-    };
-
-    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
-
-    for (uint32_t i = 0; i < numIter; ++i)
-    {
-        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
-    }
-}
-
-#if USE_8x2_TILE_BACKEND
-template<SWR_FORMAT format>
-void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
-{
-    auto lambda = [&](int32_t comp)
-    {
-        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
-
-        pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
-    };
-
-    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
-
-    for (uint32_t i = 0; i < numIter; ++i)
-    {
-        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
-    }
-}
-
-#endif
-template<SWR_FORMAT format>
-INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
-{
-    // convert clear color to hottile format
-    // clear color is in RGBA float/uint32
-#if USE_8x2_TILE_BACKEND
-    simd16vector vClear;
-    for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
-    {
-        simd16scalar vComp;
-        vComp = _simd16_load1_ps((const float*)&clear[comp]);
-        if (FormatTraits<format>::isNormalized(comp))
-        {
-            vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
-            vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
-        }
-        vComp = FormatTraits<format>::pack(comp, vComp);
-        vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
-    }
-
-#else
-    simdvector vClear;
-    for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
-    {
-        simdscalar vComp;
-        vComp = _simd_load1_ps((const float*)&clear[comp]);
-        if (FormatTraits<format>::isNormalized(comp))
-        {
-            vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
-            vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
-        }
-        vComp = FormatTraits<format>::pack(comp, vComp);
-        vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
-    }
-
-#endif
-    uint32_t tileX, tileY;
-    MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
-
-    // Init to full macrotile
-    SWR_RECT clearTile =
-    {
-        KNOB_MACROTILE_X_DIM * int32_t(tileX),
-        KNOB_MACROTILE_Y_DIM * int32_t(tileY),
-        KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
-        KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
-    };
-
-    // intersect with clear rect
-    clearTile &= rect;
-
-    // translate to local hottile origin
-    clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
-
-    // Make maximums inclusive (needed for convert to raster tiles)
-    clearTile.xmax -= 1;
-    clearTile.ymax -= 1;
-
-    // convert to raster tiles
-    clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
-    clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
-    clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
-    clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
-
-    const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
-    // compute steps between raster tile samples / raster tiles / macro tile rows
-    const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
-    const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
-    const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
-    const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
-
-    HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
-    uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
-    uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
-
-    // loop over all raster tiles in the current hot tile
-    for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
-    {
-        uint8_t* pRasterTile = pRasterTileRow;
-        for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
-        {
-            for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
-            {
-                ClearRasterTile<format>(pRasterTile, vClear);
-                pRasterTile += rasterTileSampleStep;
-            }
-        }
-        pRasterTileRow += macroTileRowStep;
-    }
-
-    pHotTile->state = HOTTILE_DIRTY;
-}
-
-
-void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    if (KNOB_FAST_CLEAR)
-    {
-        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
-        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
-        uint32_t numSamples = GetNumSamples(sampleCount);
-
-        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
-
-        AR_BEGIN(BEClear, pDC->drawId);
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
-        {
-            unsigned long rt = 0;
-            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
-            while (_BitScanForward(&rt, mask))
-            {
-                mask &= ~(1 << rt);
-
-                HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
-
-                // All we want to do here is to mark the hot tile as being in a "needs clear" state.
-                pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
-                pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
-                pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
-                pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
-                pHotTile->state = HOTTILE_CLEAR;
-            }
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
-        {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
-            pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
-            pHotTile->state = HOTTILE_CLEAR;
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
-        {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
-
-            pHotTile->clearData[0] = pClear->clearStencil;
-            pHotTile->state = HOTTILE_CLEAR;
-        }
-
-        AR_END(BEClear, 1);
-    }
-    else
-    {
-        // Legacy clear
-        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
-        AR_BEGIN(BEClear, pDC->drawId);
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
-        {
-            DWORD clearData[4];
-            clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
-            clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
-            clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
-            clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
-
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
-            SWR_ASSERT(pfnClearTiles != nullptr);
-
-            unsigned long rt = 0;
-            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
-            while (_BitScanForward(&rt, mask))
-            {
-                mask &= ~(1 << rt);
-
-                pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
-            }
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
-        {
-            DWORD clearData[4];
-            clearData[0] = *(DWORD*)&pClear->clearDepth;
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
-            SWR_ASSERT(pfnClearTiles != nullptr);
-
-            pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
-        }
-
-        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
-        {
-            DWORD clearData[4];
-            clearData[0] = pClear->clearStencil;
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
-
-            pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
-        }
-
-        AR_END(BEClear, 1);
-    }
-}
-
 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, 
     SWR_RENDERTARGET_ATTACHMENT attachment)
 {
@@ -368,7 +135,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
         // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
         if (pHotTile->state == HOTTILE_CLEAR)
         {
-            PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat];
+            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
             SWR_ASSERT(pfnClearTiles != nullptr);
 
             pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
@@ -429,457 +196,6 @@ void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint3
     }
 }
 
-#if KNOB_SIMD_WIDTH == 8
-const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
-const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
-const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-#else
-#error Unsupported vector width
-#endif
-
-simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
-{
-    simdscalar vClipMask = _simd_setzero_ps();
-    uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
-
-    for (uint32_t i = 0; i < numClipDistance; ++i)
-    {
-        // pull triangle clip distance values from clip buffer
-        simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
-        simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
-        simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
-
-        // interpolate
-        simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
-        
-        // clip if interpolated clip distance is < 0 || NAN
-        simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
-
-        vClipMask = _simd_or_ps(vClipMask, vCull);
-    }
-
-    return _simd_movemask_ps(vClipMask);
-}
-
-template<typename T>
-void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    AR_BEGIN(BESingleSampleBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
-    SWR_PS_CONTEXT psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    AR_END(BESetup, 1);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-#if USE_8x2_TILE_BACKEND
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-#endif
-            simdmask coverageMask = work.coverageMask[0] & MASK;
-
-            if (coverageMask)
-            {
-                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                {
-                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                    const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
-
-                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
-                }
-
-                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-                {
-                    const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
-                    generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-                }
-
-                AR_BEGIN(BEBarycentric, pDC->drawId);
-
-                CalcPixelBarycentrics(coeffs, psContext);
-
-                CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-                // interpolate and quantize z
-                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
-                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
-                AR_END(BEBarycentric, 1);
-
-                // interpolate user clip distance if available
-                if (state.rastState.clipDistanceMask)
-                {
-                    coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
-                }
-
-                simdscalar vCoverageMask = vMask(coverageMask);
-                simdscalar depthPassMask = vCoverageMask;
-                simdscalar stencilPassMask = vCoverageMask;
-
-                // Early-Z?
-                if (T::bCanEarlyZ)
-                {
-                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
-                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                     psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
-                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                    AR_END(BEEarlyDepthTest, 0);
-
-                    // early-exit if no pixels passed depth or earlyZ is forced on
-                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
-                    {
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
-
-                        if (!_simd_movemask_ps(depthPassMask))
-                        {
-                            goto Endtile;
-                        }
-                    }
-                }
-
-                psContext.sampleIndex = 0;
-                psContext.activeMask = _simd_castps_si(vCoverageMask);
-
-                // execute pixel shader
-                AR_BEGIN(BEPixelShader, pDC->drawId);
-                UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
-                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                AR_END(BEPixelShader, 0);
-
-                vCoverageMask = _simd_castsi_ps(psContext.activeMask);
-
-                // late-Z
-                if (!T::bCanEarlyZ)
-                {
-                    AR_BEGIN(BELateDepthTest, pDC->drawId);
-                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                        psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
-                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                    AR_END(BELateDepthTest, 0);
-
-                    if (!_simd_movemask_ps(depthPassMask))
-                    {
-                        // need to call depth/stencil write for stencil write
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
-                        goto Endtile;
-                    }
-                } else {
-                    // for early z, consolidate discards from shader
-                    // into depthPassMask
-                    depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
-                }
-
-                uint32_t statMask = _simd_movemask_ps(depthPassMask);
-                uint32_t statCount = _mm_popcnt_u32(statMask);
-                UPDATE_STAT_BE(DepthPassCount, statCount);
-
-                // output merger
-                AR_BEGIN(BEOutputMerger, pDC->drawId);
-#if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
-                OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
-#endif
-
-                // do final depth write after all pixel kills
-                if (!state.psState.forceEarlyZ)
-                {
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                        pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
-                }
-                AR_END(BEOutputMerger, 0);
-            }
-
-Endtile:
-            AR_BEGIN(BEEndTile, pDC->drawId);
-
-            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-#if USE_8x2_TILE_BACKEND
-            if (useAlternateOffset)
-            {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-                {
-                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-#else
-            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-            {
-                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-            }
-#endif
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            AR_END(BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    AR_END(BESingleSampleBackend, 0);
-}
-
-template<typename T>
-void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    AR_BEGIN(BESampleRateBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
-    SWR_PS_CONTEXT psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    AR_END(BESetup, 0);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-#if USE_8x2_TILE_BACKEND
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-
-#endif
-            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-            {
-                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
-                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-            }
-
-            AR_BEGIN(BEBarycentric, pDC->drawId);
-
-            CalcPixelBarycentrics(coeffs, psContext);
-
-            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-            AR_END(BEBarycentric, 0);
-
-            for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
-            {
-                simdmask coverageMask = work.coverageMask[sample] & MASK;
-
-                if (coverageMask)
-                {
-                    // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-                    {
-                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
-
-                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
-                    }
-
-                    AR_BEGIN(BEBarycentric, pDC->drawId);
-
-                    // calculate per sample positions
-                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
-                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
-
-                    CalcSampleBarycentrics(coeffs, psContext);
-
-                    // interpolate and quantize z
-                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
-                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-
-                    AR_END(BEBarycentric, 0);
-
-                    // interpolate user clip distance if available
-                    if (state.rastState.clipDistanceMask)
-                    {
-                        coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
-                    }
-
-                    simdscalar vCoverageMask = vMask(coverageMask);
-                    simdscalar depthPassMask = vCoverageMask;
-                    simdscalar stencilPassMask = vCoverageMask;
-
-                    // Early-Z?
-                    if (T::bCanEarlyZ)
-                    {
-                        AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
-                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                              psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                        AR_END(BEEarlyDepthTest, 0);
-
-                        // early-exit if no samples passed depth or earlyZ is forced on.
-                        if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
-                        {
-                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-
-                            if (!_simd_movemask_ps(depthPassMask))
-                            {
-                                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-                                continue;
-                            }
-                        }
-                    }
-
-                    psContext.sampleIndex = sample;
-                    psContext.activeMask = _simd_castps_si(vCoverageMask);
-
-                    // execute pixel shader
-                    AR_BEGIN(BEPixelShader, pDC->drawId);
-                    UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
-                    state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                    AR_END(BEPixelShader, 0);
-
-                    vCoverageMask = _simd_castsi_ps(psContext.activeMask);
-
-                    // late-Z
-                    if (!T::bCanEarlyZ)
-                    {
-                        AR_BEGIN(BELateDepthTest, pDC->drawId);
-                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                              psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                        AR_END(BELateDepthTest, 0);
-
-                        if (!_simd_movemask_ps(depthPassMask))
-                        {
-                            // need to call depth/stencil write for stencil write
-                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-
-                            work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-                            continue;
-                        }
-                    }
-
-                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
-                    uint32_t statCount = _mm_popcnt_u32(statMask);
-                    UPDATE_STAT_BE(DepthPassCount, statCount);
-
-                    // output merger
-                    AR_BEGIN(BEOutputMerger, pDC->drawId);
-#if USE_8x2_TILE_BACKEND
-                    OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else
-                    OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
-#endif
-
-                    // do final depth write after all pixel kills
-                    if (!state.psState.forceEarlyZ)
-                    {
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-                    }
-                    AR_END(BEOutputMerger, 0);
-                }
-                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-Endtile:
-            ATTR_UNUSED;
-
-            AR_BEGIN(BEEndTile, pDC->drawId);
-
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-#if USE_8x2_TILE_BACKEND
-            if (useAlternateOffset)
-            {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-                {
-                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-#else
-            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-            {
-                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-            }
-#endif
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            AR_END(BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    AR_END(BESampleRateBackend, 0);
-}
-// optimized backend flow with NULL PS
 template<uint32_t sampleCountT>
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
 {
@@ -977,7 +293,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     UPDATE_STAT_BE(DepthPassCount, statCount);
                 }
 
-Endtile:
+            Endtile:
                 ATTR_UNUSED;
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
@@ -994,17 +310,7 @@ Endtile:
     AR_END(BENullBackend, 0);
 }
 
-void InitClearTilesTable()
-{
-    memset(sClearTilesTable, 0, sizeof(sClearTilesTable));
-
-    sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>;
-    sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>;
-    sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>;
-    sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
-    sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>;
-}
-
+PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
                                      [2] // centroid
@@ -1023,113 +329,10 @@ PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
                                         [2] // canEarlyZ
                                         = {};
 
-// Recursive template used to auto-nest conditionals.  Converts dynamic enum function
-// arguments to static template arguments.
-template <uint32_t... ArgsT>
-struct BEChooser
-{
-    // Last Arg Terminator
-    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
-    {
-        switch(tArg)
-        {
-        case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
-        case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<SwrBackendTraits<ArgsT...>>; break;
-        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
-        default:
-            SWR_ASSERT(0 && "Invalid backend func\n");
-            return nullptr;
-            break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
-    {
-        switch(tArg)
-        {
-        case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
-        case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
-        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
-        default:
-        SWR_ASSERT(0 && "Invalid sample pattern\n");
-        return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
-        break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
-    {
-        switch(tArg)
-        {
-        case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
-        default:
-        SWR_ASSERT(0 && "Invalid sample count\n");
-        return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
-        break;
-        }
-    }
-
-    // Recursively parse args
-    template <typename... TArgsT>
-    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
-    {
-        if(tArg == true)
-        {
-            return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...);
-        }
-
-        return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...);
-    }
-};
-
-void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
-{
-    for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
-    {
-        for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
-        {
-            for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
-            {
-                table[inputCoverage][isCentroid][canEarlyZ] =
-                    BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
-                                         (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
-            }
-        }
-    }
-}
-
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
-{
-    for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
-    {
-        for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
-        {
-            for(uint32_t centroid = 0; centroid < 2; centroid++)
-            {
-                for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
-                {
-                    table[sampleCount][inputCoverage][centroid][canEarlyZ] =
-                        BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage, 
-                                             (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
-                }
-            }
-        }
-    }
-}
-
-void InitBackendPixelRate0();
 void InitBackendFuncTables()
 {    
+    InitBackendPixelRate();
     InitBackendSingleFuncTable(gBackendSingleSample);
-    InitBackendPixelRate0();
     InitBackendSampleFuncTable(gBackendSampleRateTable);
 
     gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 0359486..c8c37e6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -40,1022 +40,23 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
-void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
-simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
-void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
 
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
-                                              [2]   // isCenterPattern
-                                              [SWR_INPUT_COVERAGE_COUNT]
-                                              [2]  // centroid
-                                              [2]  // forcedSampleCount
-                                              [2]  // canEarlyZ
-                                              ;
-
-enum SWR_BACKEND_FUNCS
-{
-    SWR_BACKEND_SINGLE_SAMPLE,
-    SWR_BACKEND_MSAA_PIXEL_RATE,
-    SWR_BACKEND_MSAA_SAMPLE_RATE,
-    SWR_BACKEND_FUNCS_MAX,
-};
-
-#if KNOB_SIMD_WIDTH == 8
-extern const simdscalar vCenterOffsetsX;
-extern const simdscalar vCenterOffsetsY;
-extern const simdscalar vULOffsetsX;
-extern const simdscalar vULOffsetsY;
-#define MASK 0xff
-#endif
-
-INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
-{
-    static const uint32_t RasterTileColorOffsets[16]
-    { 0,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
-    };
-    assert(sampleNum < 16);
-    return RasterTileColorOffsets[sampleNum];
-}
-
-INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
-{
-    static const uint32_t RasterTileDepthOffsets[16]
-    { 0,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
-    };
-    assert(sampleNum < 16);
-    return RasterTileDepthOffsets[sampleNum];
-}
-
-INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
-{
-    static const uint32_t RasterTileStencilOffsets[16]
-    { 0,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
-    };
-    assert(sampleNum < 16);
-    return RasterTileStencilOffsets[sampleNum];
-}
-
-template<typename T, uint32_t InputCoverage>
-struct generateInputCoverage
-{
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
-    {
-        // will need to update for avx512
-        assert(KNOB_SIMD_WIDTH == 8);
-
-        simdscalari mask[2];
-        simdscalari sampleCoverage[2];
-        
-        if(T::bIsCenterPattern)
-        {
-            // center coverage is the same for all samples; just broadcast to the sample slots
-            uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
-            if(T::MultisampleT::numSamples == 1)
-            {
-                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
-            }
-            else if(T::MultisampleT::numSamples == 2)
-            {
-                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
-            }
-            else if(T::MultisampleT::numSamples == 4)
-            {
-                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
-            }
-            else if(T::MultisampleT::numSamples == 8)
-            {
-                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-            }
-            else if(T::MultisampleT::numSamples == 16)
-            {
-                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-                sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
-            }
-        }
-        else
-        {
-            __m256i src = _mm256_set1_epi32(0);
-            __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
-
-            if(T::MultisampleT::numSamples == 1)
-            {
-                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
-            }
-            else if(T::MultisampleT::numSamples == 2)
-            {
-                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-            }
-            else if(T::MultisampleT::numSamples == 4)
-            {
-                mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
-            }
-            else if(T::MultisampleT::numSamples == 8)
-            {
-                mask[0] = _mm256_set1_epi32(-1);
-            }
-            else if(T::MultisampleT::numSamples == 16)
-            {
-                mask[0] = _mm256_set1_epi32(-1);
-                mask[1] = _mm256_set1_epi32(-1);
-                index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
-            }
-
-            // gather coverage for samples 0-7
-            sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
-            if(T::MultisampleT::numSamples > 8)
-            {
-                // gather coverage for samples 8-15
-                sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
-            }
-        }
-
-        mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
-                                  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
-        // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
-        simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
-
-        simdscalari packedCoverage1;
-        if(T::MultisampleT::numSamples > 8)
-        {
-            // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
-            packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
-        }
-
-    #if (KNOB_ARCH == KNOB_ARCH_AVX)
-        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-        simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
-        simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-        packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
-        simdscalari packedSampleCoverage;
-        if(T::MultisampleT::numSamples > 8)
-        {
-            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-            hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
-            shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-            shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
-            packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
-            packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
-        }
-        else
-        {
-            packedSampleCoverage = packedCoverage0;
-        }
-    #else
-        simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
-        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-        packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
-
-        simdscalari packedSampleCoverage;
-        if(T::MultisampleT::numSamples > 8)
-        {
-            permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
-            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-            packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
-
-            // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
-            packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
-        }
-        else
-        {
-            packedSampleCoverage = packedCoverage0;
-        }
-    #endif
-
-        for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
-        {
-            // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
-            inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
-            if(!T::bForcedSampleCount)
-            {
-                // input coverage has to be anded with sample mask if MSAA isn't forced on
-                inputMask[i] &= sampleMask;
-            }
-
-            // shift to the next pixel in the 4x2
-            packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
-        }
-    }
-
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
-    {
-        uint32_t inputMask[KNOB_SIMD_WIDTH];
-        generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
-        inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
-    }
-
-};
-
-template<typename T>
-struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
-{
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
-    {
-        // will need to update for avx512
-        assert(KNOB_SIMD_WIDTH == 8);
-        simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
-        const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-        vec = _simd_and_si(vec, bit);
-        vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
-        vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
-        inputCoverage = _simd_castsi_ps(vec);
-    }
-
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
-    {
-        uint32_t simdCoverage = (coverageMask[0] & MASK);
-        static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
-        for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
-        {
-            // set all samples to covered if conservative coverage mask is set for that pixel
-            inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
-        }
-    }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Centroid behaves exactly as follows :
-// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to 
-//     have a sample location there).
-// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the 
-//     coverage with the SampleMask Rasterizer State.
-// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is 
-//     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the 
-//     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<typename T>
-INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
-                            const uint64_t *const coverageMask, const uint32_t sampleMask,
-                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
-    uint32_t inputMask[KNOB_SIMD_WIDTH];
-    generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
-
-    // Case (2) - partially covered pixel
-
-    // scan for first covered sample per pixel in the 4x2 span
-    unsigned long sampleNum[KNOB_SIMD_WIDTH];
-    (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
-    (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
-    (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
-    (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
-    (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
-    (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
-    (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
-    (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
-
-    // look up and set the sample offsets from UL pixel corner for first covered sample 
-    __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
-                                    samplePos.X(sampleNum[6]),
-                                    samplePos.X(sampleNum[5]),
-                                    samplePos.X(sampleNum[4]),
-                                    samplePos.X(sampleNum[3]),
-                                    samplePos.X(sampleNum[2]),
-                                    samplePos.X(sampleNum[1]),
-                                    samplePos.X(sampleNum[0]));
-
-    __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
-                                    samplePos.Y(sampleNum[6]),
-                                    samplePos.Y(sampleNum[5]),
-                                    samplePos.Y(sampleNum[4]),
-                                    samplePos.Y(sampleNum[3]),
-                                    samplePos.Y(sampleNum[2]),
-                                    samplePos.Y(sampleNum[1]),
-                                    samplePos.Y(sampleNum[0]));
-    // add sample offset to UL pixel corner
-    vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
-    vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
-
-    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
-    static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
-    simdscalari vInputCoveragei =  _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
-    simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
-
-    static const simdscalari vZero = _simd_setzero_si();
-    const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
-    simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
-    simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
-    simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
-
-    simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
-
-    // set the centroid position based on results from above
-    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
-    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
-
-    // Case (3a) No samples covered and partial sample mask
-    simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
-    // sample mask should never be all 0's for this case, but handle it anyways
-    unsigned long firstCoveredSampleMaskSample = 0;
-    (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
-
-    simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
-
-    vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
-    vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
-
-    // blend in case 3a pixel locations
-    psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
-    psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
-}
-
-INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
-                                     const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
-{
-    // evaluate I,J
-    psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
-    psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
-    psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
-    psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
-
-    // interpolate 1/w
-    psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
-}
-
-INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
-{
-    const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
-    const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
-
-    return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
-}
-
-template<typename T>
-INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
-{
-    // RT has to be single sample if we're in forcedMSAA mode
-    if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
-    {
-        return 1;
-    }
-    // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
-    else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
-    {
-        return GetNumSamples(blendSampleCount);
-    }
-    // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
-    else
-    {
-        return T::MultisampleT::numSamples;
-    }
-}
-
-inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
-{
-    // broadcast scalars
-
-    coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
-    coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
-    coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
-
-    coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
-    coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
-    coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
-
-    coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
-    coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
-    coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
-
-    coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
-
-    coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
-    coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
-    coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
-}
-
-inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
-{
-    assert(colorBufferCount <= SWR_NUM_RENDERTARGETS);
-
-    if (pColorBuffer)
-    {
-        for (uint32_t index = 0; index < colorBufferCount; index += 1)
-        {
-            pColorBuffer[index] = renderBuffers.pColor[index];
-        }
-    }
-
-    if (pDepthBuffer)
-    {
-        *pDepthBuffer = renderBuffers.pDepth;
-    }
-
-    if (pStencilBuffer)
-    {
-        *pStencilBuffer = renderBuffers.pStencil;;
-    }
-}
-
-template<typename T>
-void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
-{
-    psContext->pAttribs = work.pAttribs;
-    psContext->pPerspAttribs = work.pPerspAttribs;
-    psContext->frontFace = work.triFlags.frontFacing;
-    psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
-
-    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
-    psContext->I = work.I;
-    psContext->J = work.J;
-
-    psContext->recipDet = work.recipDet;
-    psContext->pRecipW = work.pRecipW;
-    psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
-    psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
-    psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
-    psContext->sampleIndex = 0;
-}
-
-template<typename T, bool IsSingleSample>
-void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
-                  const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
-{
-    if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
-    {
-        // for 1x case, centroid is pixel center
-        psContext->vX.centroid = psContext->vX.center;
-        psContext->vY.centroid = psContext->vY.center;
-        psContext->vI.centroid = psContext->vI.center;
-        psContext->vJ.centroid = psContext->vJ.center;
-        psContext->vOneOverW.centroid = psContext->vOneOverW.center;
-    }
-    else
-    {
-        if (T::bCentroidPos)
-        {
-            ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
-            if (T::bIsCenterPattern)
-            {
-                psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
-                psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
-            }
-            else
-            {
-                // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
-                CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
-            }
-
-            CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
-        }
-        else
-        {
-            psContext->vX.centroid = psContext->vX.sample;
-            psContext->vY.centroid = psContext->vY.sample;
-        }
-    }
-}
-
-template<typename T>
-struct PixelRateZTestLoop
-{
-    PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
-                       uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
-                       pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
-                       samplePos(state.rastState.samplePositions),
-                       clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
-
-    INLINE
-    uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, 
-                        const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
-    {
-        SWR_CONTEXT *pContext = pDC->pContext;
-
-        uint32_t statCount = 0;
-        simdscalar anyDepthSamplePassed = _simd_setzero_ps();
-        for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
-        {
-            const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
-            vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
-
-            if(!_simd_movemask_ps(vCoverageMask[sample]))
-            {
-                vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
-                continue;
-            }
-
-            // offset depth/stencil buffers current sample
-            uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-            uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-            if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
-            {
-                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
-
-                const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
-
-                const float minz = state.depthBoundsState.depthBoundsTestMinValue;
-                const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
-
-                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
-            }
-
-            AR_BEGIN(BEBarycentric, pDC->drawId);
-
-            // calculate per sample positions
-            psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
-            psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
-
-            // calc I & J per sample
-            CalcSampleBarycentrics(coeffs, psContext);
-
-            if(psState.writesODepth)
-            {
-                {
-                    // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
-                    vZ[sample] = psContext.vZ;
-                }
-            }
-            else
-            {
-                vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
-                vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
-            }
-
-            AR_END(BEBarycentric, 0);
-
-            ///@todo: perspective correct vs non-perspective correct clipping?
-            // if clip distances are enabled, we need to interpolate for each sample
-            if(clipDistanceMask)
-            {
-                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
-
-                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
-            }
+typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
 
-            // ZTest for this sample
-            ///@todo Need to uncomment out this bucket.
-            //AR_BEGIN(BEDepthBucket, pDC->drawId);
-            depthPassMask[sample] = vCoverageMask[sample];
-            stencilPassMask[sample] = vCoverageMask[sample];
-            depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                     vZ[sample], pDepthSample, vCoverageMask[sample], 
-                                                     pStencilSample, &stencilPassMask[sample]);
-            //AR_END(BEDepthBucket, 0);
-
-            // early-exit if no pixels passed depth or earlyZ is forced on
-            if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
-            {
-                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
-                                  pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
-
-                if(!_simd_movemask_ps(depthPassMask[sample]))
-                {
-                    continue;
-                }
-            }
-            anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
-            uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
-            statCount += _mm_popcnt_u32(statMask);
-        }
-
-        activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
-        // return number of samples that passed depth and coverage
-        return statCount;
-    }
-
-    // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
-    simdscalar vZ[T::MultisampleT::numCoverageSamples];
-    simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
-    simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
-    simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
-
-private:
-    // functor inputs
-    DRAW_CONTEXT* pDC;
-    uint32_t workerId;
-
-    const SWR_TRIANGLE_DESC& work;
-    const BarycentricCoeffs& coeffs;
-    const API_STATE& state;
-    const SWR_PS_STATE& psState;
-    const SWR_MULTISAMPLE_POS& samplePos;
-    const uint8_t clipDistanceMask;
-    uint8_t*& pDepthBuffer;
-    uint8_t*& pStencilBuffer;
-};
-
-INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
-    // evaluate I,J
-    psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
-    psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
-    psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
-    psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
-
-    // interpolate 1/w
-    psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
-}
-
-INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
-{
-    // evaluate I,J
-    psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
-    psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
-    psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
-    psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
-
-    // interpolate 1/w
-    psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
-}
-
-// Merge Output to 4x2 SIMD Tile Format
-INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-    const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
-{
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
-    simdvector blendOut;
-
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
-
-        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
-
-        {
-            // pfnBlendFunc may not update all channels.  Initialize with PS output.
-            /// TODO: move this into the blend JIT.
-            blendOut = psContext.shaded[rt];
-
-            // Blend outputs and update coverage mask for alpha test
-            if(pfnBlendFunc[rt] != nullptr)
-            {
-                pfnBlendFunc[rt](
-                    pBlendState,
-                    psContext.shaded[rt],
-                    psContext.shaded[1],
-                    psContext.shaded[0].w,
-                    sample,
-                    pColorSample,
-                    blendOut,
-                    &psContext.oMask,
-                    (simdscalari*)&coverageMask);
-            }
-        }
-
-        // final write mask 
-        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
-
-        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
-
-        const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
-
-        // store with color mask
-        if(!pRTBlend->writeDisableRed)
-        {
-            _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
-        }
-        if(!pRTBlend->writeDisableGreen)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
-        }
-        if(!pRTBlend->writeDisableBlue)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
-        }
-        if(!pRTBlend->writeDisableAlpha)
-        {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
-        }
-    }
-}
-
-#if USE_8x2_TILE_BACKEND
-// Merge Output to 8x2 SIMD16 Tile Format
-INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-    const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
-{
-    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
-    uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
-
-    if (useAlternateOffset)
-    {
-        rasterTileColorOffset += sizeof(simdscalar);
-    }
-
-    simdvector blendSrc;
-    simdvector blendOut;
-
-    uint32_t colorBufferBit = 1;
-    for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
-    {
-        simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
-
-        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
-
-        if (colorBufferBit & colorBufferEnableMask)
-        {
-            blendSrc[0] = pColorSample[0];
-            blendSrc[1] = pColorSample[2];
-            blendSrc[2] = pColorSample[4];
-            blendSrc[3] = pColorSample[6];
-        }
-
-        {
-            // pfnBlendFunc may not update all channels.  Initialize with PS output.
-            /// TODO: move this into the blend JIT.
-            blendOut = psContext.shaded[rt];
-
-            // Blend outputs and update coverage mask for alpha test
-            if(pfnBlendFunc[rt] != nullptr)
-            {
-                pfnBlendFunc[rt](
-                    pBlendState,
-                    psContext.shaded[rt],
-                    psContext.shaded[1],
-                    psContext.shaded[0].w,
-                    sample,
-                    reinterpret_cast<uint8_t *>(&blendSrc),
-                    blendOut,
-                    &psContext.oMask,
-                    reinterpret_cast<simdscalari *>(&coverageMask));
-            }
-        }
-
-        // final write mask 
-        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
-
-        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
-
-        // store with color mask
-        if (!pRTBlend->writeDisableRed)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
-        }
-        if (!pRTBlend->writeDisableGreen)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
-        }
-        if (!pRTBlend->writeDisableBlue)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
-        }
-        if (!pRTBlend->writeDisableAlpha)
-        {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
-        }
-    }
-}
-
-#endif
-
-template<typename T>
-void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
-{
-    ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
-
-
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    AR_BEGIN(BEPixelRateBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-
-    BarycentricCoeffs coeffs;
-    SetupBarycentricCoeffs(&coeffs, work);
-
-    SWR_PS_CONTEXT psContext;
-    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
-    SetupPixelShaderContext<T>(&psContext, samplePos, work);
-
-    uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
-
-    AR_END(BESetup, 0);
-
-    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
-
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
-    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
-
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
-
-    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
-    {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
-        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
-
-        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
-
-        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
-        {
-#if USE_8x2_TILE_BACKEND
-            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
-#endif
-            simdscalar activeLanes;
-            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
-            activeLanes = vMask(work.anyCoveredSamples & MASK);
-
-            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
-            {
-                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
-
-                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
-            }
-
-            AR_BEGIN(BEBarycentric, pDC->drawId);
-
-            CalcPixelBarycentrics(coeffs, psContext);
-
-            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
-
-            AR_END(BEBarycentric, 0);
-
-            if(T::bForcedSampleCount)
-            {
-                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
-                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
-                activeLanes = _simd_and_ps(activeLanes, vSampleMask);
-            }
-
-            // Early-Z?
-            if(T::bCanEarlyZ && !T::bForcedSampleCount)
-            {
-                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
-                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
-                AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
-            }
-
-            // if we have no covered samples that passed depth at this point, go to next tile
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
-            if(state.psState.usesSourceDepth)
-            {
-                AR_BEGIN(BEBarycentric, pDC->drawId);
-                // interpolate and quantize z
-                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
-                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-                AR_END(BEBarycentric, 0);
-            }
-
-            // pixels that are currently active
-            psContext.activeMask = _simd_castps_si(activeLanes);
-            psContext.oMask = T::MultisampleT::FullSampleMask();
-
-            // execute pixel shader
-            AR_BEGIN(BEPixelShader, pDC->drawId);
-            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
-            AR_END(BEPixelShader, 0);
-
-            // update active lanes to remove any discarded or oMask'd pixels
-            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
-            // late-Z
-            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
-            {
-                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
-                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
-                AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
-            }
-
-            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
-
-            // output merger
-            // loop over all samples, broadcasting the results of the PS to all passing pixels
-            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
-            {
-                AR_BEGIN(BEOutputMerger, pDC->drawId);
-                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
-                uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
-                simdscalar coverageMask, depthMask;
-                if(T::bForcedSampleCount)
-                {
-                    coverageMask = depthMask = activeLanes;
-                }
-                else
-                {
-                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
-                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
-                    if(!_simd_movemask_ps(depthMask))
-                    {
-                        // stencil should already have been written in early/lateZ tests
-                        AR_END(BEOutputMerger, 0);
-                        continue;
-                    }
-                }
-                
-                // broadcast the results of the PS to all passing pixels
-#if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
-#else // USE_8x2_TILE_BACKEND
-                OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
-#endif // USE_8x2_TILE_BACKEND
-
-                if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
-                {
-                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
-
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
-                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
-                }
-                AR_END(BEOutputMerger, 0);
-            }
-Endtile:
-            AR_BEGIN(BEEndTile, pDC->drawId);
-
-            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
-            {
-                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
-            {
-                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            }
-            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-
-#if USE_8x2_TILE_BACKEND
-            if (useAlternateOffset)
-            {
-                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-                {
-                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-                }
-            }
-#else
-            for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
-            {
-                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
-            }
-#endif
-            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
-
-            AR_END(BEEndTile, 0);
-
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
-            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
-        }
-
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
-        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
-    }
-
-    AR_END(BEPixelRateBackend, 0);
-}
+extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS];
+extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
+extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
+                                     [2]  // centroid
+                                     [2]; // canEarlyZ
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
+                                       [2] // isCenterPattern
+                                       [SWR_INPUT_COVERAGE_COUNT]
+                                       [2] // centroid
+                                       [2] // forcedSampleCount
+                                       [2] // canEarlyZ
+                                       ;
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
+                                        [SWR_INPUT_COVERAGE_COUNT]
+                                        [2]  // centroid
+                                        [2]; // canEarlyZ
 
-template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
-         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
-    >
-struct SwrBackendTraits
-{
-    static const bool bIsCenterPattern = (isCenter == 1);
-    static const uint32_t InputCoverage = coverage;
-    static const bool bCentroidPos = (centroid == 1);
-    static const bool bForcedSampleCount = (forced == 1);
-    static const bool bCanEarlyZ = (canEarlyZ == 1);
-    typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
-};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
new file mode 100644
index 0000000..0ef54e2
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
@@ -0,0 +1,281 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend.cpp
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+*        operations.
+*
+******************************************************************************/
+
+#include <smmintrin.h>
+
+#include "backend.h"
+#include "backend_impl.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+#include "core/multisample.h"
+
+#include <algorithm>
+
+template<SWR_FORMAT format>
+void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
+{
+    auto lambda = [&](int32_t comp)
+    {
+        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
+
+        pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
+    };
+
+    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
+
+    for (uint32_t i = 0; i < numIter; ++i)
+    {
+        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
+    }
+}
+
+#if USE_8x2_TILE_BACKEND
+template<SWR_FORMAT format>
+void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
+{
+    auto lambda = [&](int32_t comp)
+    {
+        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
+
+        pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
+    };
+
+    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
+
+    for (uint32_t i = 0; i < numIter; ++i)
+    {
+        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
+    }
+}
+
+#endif
+template<SWR_FORMAT format>
+INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
+{
+    // convert clear color to hottile format
+    // clear color is in RGBA float/uint32
+#if USE_8x2_TILE_BACKEND
+    simd16vector vClear;
+    for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
+    {
+        simd16scalar vComp;
+        vComp = _simd16_load1_ps((const float*)&clear[comp]);
+        if (FormatTraits<format>::isNormalized(comp))
+        {
+            vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
+            vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
+        }
+        vComp = FormatTraits<format>::pack(comp, vComp);
+        vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
+    }
+
+#else
+    simdvector vClear;
+    for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp)
+    {
+        simdscalar vComp;
+        vComp = _simd_load1_ps((const float*)&clear[comp]);
+        if (FormatTraits<format>::isNormalized(comp))
+        {
+            vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
+            vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
+        }
+        vComp = FormatTraits<format>::pack(comp, vComp);
+        vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
+    }
+
+#endif
+    uint32_t tileX, tileY;
+    MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
+
+    // Init to full macrotile
+    SWR_RECT clearTile =
+    {
+        KNOB_MACROTILE_X_DIM * int32_t(tileX),
+        KNOB_MACROTILE_Y_DIM * int32_t(tileY),
+        KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
+        KNOB_MACROTILE_Y_DIM * int32_t(tileY + 1),
+    };
+
+    // intersect with clear rect
+    clearTile &= rect;
+
+    // translate to local hottile origin
+    clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
+
+    // Make maximums inclusive (needed for convert to raster tiles)
+    clearTile.xmax -= 1;
+    clearTile.ymax -= 1;
+
+    // convert to raster tiles
+    clearTile.ymin >>= (KNOB_TILE_Y_DIM_SHIFT);
+    clearTile.ymax >>= (KNOB_TILE_Y_DIM_SHIFT);
+    clearTile.xmin >>= (KNOB_TILE_X_DIM_SHIFT);
+    clearTile.xmax >>= (KNOB_TILE_X_DIM_SHIFT);
+
+    const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
+    // compute steps between raster tile samples / raster tiles / macro tile rows
+    const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
+    const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
+    const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
+    const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
+
+    HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
+    uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
+    uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
+
+    // loop over all raster tiles in the current hot tile
+    for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
+    {
+        uint8_t* pRasterTile = pRasterTileRow;
+        for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
+        {
+            for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
+            {
+                ClearRasterTile<format>(pRasterTile, vClear);
+                pRasterTile += rasterTileSampleStep;
+            }
+        }
+        pRasterTileRow += macroTileRowStep;
+    }
+
+    pHotTile->state = HOTTILE_DIRTY;
+}
+
+
+void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    if (KNOB_FAST_CLEAR)
+    {
+        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
+        uint32_t numSamples = GetNumSamples(sampleCount);
+
+        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
+
+        AR_BEGIN(BEClear, pDC->drawId);
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
+        {
+            unsigned long rt = 0;
+            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
+            while (_BitScanForward(&rt, mask))
+            {
+                mask &= ~(1 << rt);
+
+                HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
+
+                // All we want to do here is to mark the hot tile as being in a "needs clear" state.
+                pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
+                pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
+                pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
+                pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
+                pHotTile->state = HOTTILE_CLEAR;
+            }
+        }
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
+        {
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
+            pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
+            pHotTile->state = HOTTILE_CLEAR;
+        }
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
+        {
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
+
+            pHotTile->clearData[0] = pClear->clearStencil;
+            pHotTile->state = HOTTILE_CLEAR;
+        }
+
+        AR_END(BEClear, 1);
+    }
+    else
+    {
+        // Legacy clear
+        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+        AR_BEGIN(BEClear, pDC->drawId);
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
+        {
+            DWORD clearData[4];
+            clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
+            clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
+            clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
+            clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
+
+            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
+            SWR_ASSERT(pfnClearTiles != nullptr);
+
+            unsigned long rt = 0;
+            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
+            while (_BitScanForward(&rt, mask))
+            {
+                mask &= ~(1 << rt);
+
+                pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+            }
+        }
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
+        {
+            DWORD clearData[4];
+            clearData[0] = *(DWORD*)&pClear->clearDepth;
+            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
+            SWR_ASSERT(pfnClearTiles != nullptr);
+
+            pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+        }
+
+        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
+        {
+            DWORD clearData[4];
+            clearData[0] = pClear->clearStencil;
+            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
+
+            pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+        }
+
+        AR_END(BEClear, 1);
+    }
+}
+
+void InitClearTilesTable()
+{
+    memset(gClearTilesTable, 0, sizeof(gClearTilesTable));
+
+    gClearTilesTable[R8G8B8A8_UNORM]        = ClearMacroTile<R8G8B8A8_UNORM>;
+    gClearTilesTable[B8G8R8A8_UNORM]        = ClearMacroTile<B8G8R8A8_UNORM>;
+    gClearTilesTable[R32_FLOAT]             = ClearMacroTile<R32_FLOAT>;
+    gClearTilesTable[R32G32B32A32_FLOAT]    = ClearMacroTile<R32G32B32A32_FLOAT>;
+    gClearTilesTable[R8_UINT]               = ClearMacroTile<R8_UINT>;
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
new file mode 100644
index 0000000..e151871
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -0,0 +1,1067 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend.h
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+*        operations.
+*
+******************************************************************************/
+#pragma once
+
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
+
+static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
+
+
+enum SWR_BACKEND_FUNCS
+{
+    SWR_BACKEND_SINGLE_SAMPLE,
+    SWR_BACKEND_MSAA_PIXEL_RATE,
+    SWR_BACKEND_MSAA_SAMPLE_RATE,
+    SWR_BACKEND_FUNCS_MAX,
+};
+
+#if KNOB_SIMD_WIDTH == 8
+static const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+static const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+static const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+static const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
+#define MASK 0xff
+#endif
+
+static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
+{
+    simdscalar vClipMask = _simd_setzero_ps();
+    uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
+
+    for (uint32_t i = 0; i < numClipDistance; ++i)
+    {
+        // pull triangle clip distance values from clip buffer
+        simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
+        simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
+        simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
+
+        // interpolate
+        simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
+
+        // clip if interpolated clip distance is < 0 || NAN
+        simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
+
+        vClipMask = _simd_or_ps(vClipMask, vCull);
+    }
+
+    return _simd_movemask_ps(vClipMask);
+}
+
+INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
+{
+    static const uint32_t RasterTileColorOffsets[16]
+    { 0,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
+    };
+    assert(sampleNum < 16);
+    return RasterTileColorOffsets[sampleNum];
+}
+
+INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+{
+    static const uint32_t RasterTileDepthOffsets[16]
+    { 0,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
+    };
+    assert(sampleNum < 16);
+    return RasterTileDepthOffsets[sampleNum];
+}
+
+INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+{
+    static const uint32_t RasterTileStencilOffsets[16]
+    { 0,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
+      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
+    };
+    assert(sampleNum < 16);
+    return RasterTileStencilOffsets[sampleNum];
+}
+
+template<typename T, uint32_t InputCoverage>
+struct generateInputCoverage
+{
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+    {
+        // will need to update for avx512
+        assert(KNOB_SIMD_WIDTH == 8);
+
+        simdscalari mask[2];
+        simdscalari sampleCoverage[2];
+        
+        if(T::bIsCenterPattern)
+        {
+            // center coverage is the same for all samples; just broadcast to the sample slots
+            uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+            if(T::MultisampleT::numSamples == 1)
+            {
+                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 2)
+            {
+                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 4)
+            {
+                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 8)
+            {
+                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+            }
+            else if(T::MultisampleT::numSamples == 16)
+            {
+                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+                sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+            }
+        }
+        else
+        {
+            __m256i src = _mm256_set1_epi32(0);
+            __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+
+            if(T::MultisampleT::numSamples == 1)
+            {
+                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+            }
+            else if(T::MultisampleT::numSamples == 2)
+            {
+                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+            }
+            else if(T::MultisampleT::numSamples == 4)
+            {
+                mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+            }
+            else if(T::MultisampleT::numSamples == 8)
+            {
+                mask[0] = _mm256_set1_epi32(-1);
+            }
+            else if(T::MultisampleT::numSamples == 16)
+            {
+                mask[0] = _mm256_set1_epi32(-1);
+                mask[1] = _mm256_set1_epi32(-1);
+                index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+            }
+
+            // gather coverage for samples 0-7
+            sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+            if(T::MultisampleT::numSamples > 8)
+            {
+                // gather coverage for samples 8-15
+                sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+            }
+        }
+
+        mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+                                  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+        // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+        simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+        simdscalari packedCoverage1;
+        if(T::MultisampleT::numSamples > 8)
+        {
+            // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+            packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+        }
+
+    #if (KNOB_ARCH == KNOB_ARCH_AVX)
+        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
+        simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+        simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+        packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+        simdscalari packedSampleCoverage;
+        if(T::MultisampleT::numSamples > 8)
+        {
+            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+            hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+            shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+            shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+            packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+            packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
+        }
+        else
+        {
+            packedSampleCoverage = packedCoverage0;
+        }
+    #else
+        simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
+        packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
+
+        simdscalari packedSampleCoverage;
+        if(T::MultisampleT::numSamples > 8)
+        {
+            permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+            // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+            packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+
+            // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
+            packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
+        }
+        else
+        {
+            packedSampleCoverage = packedCoverage0;
+        }
+    #endif
+
+        for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+        {
+            // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+            inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
+
+            if(!T::bForcedSampleCount)
+            {
+                // input coverage has to be anded with sample mask if MSAA isn't forced on
+                inputMask[i] &= sampleMask;
+            }
+
+            // shift to the next pixel in the 4x2
+            packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
+        }
+    }
+
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
+    {
+        uint32_t inputMask[KNOB_SIMD_WIDTH];
+        generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
+        inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
+    }
+
+};
+
+template<typename T>
+struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
+{
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
+    {
+        // will need to update for avx512
+        assert(KNOB_SIMD_WIDTH == 8);
+        simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
+        const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+        vec = _simd_and_si(vec, bit);
+        vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
+        vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
+        inputCoverage = _simd_castsi_ps(vec);
+    }
+
+    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+    {
+        uint32_t simdCoverage = (coverageMask[0] & MASK);
+        static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
+        for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
+        {
+            // set all samples to covered if conservative coverage mask is set for that pixel
+            inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
+        }
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Centroid behaves exactly as follows :
+// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to 
+//     have a sample location there).
+// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the 
+//     coverage with the SampleMask Rasterizer State.
+// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is 
+//     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the 
+//     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<typename T>
+INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
+                            const uint64_t *const coverageMask, const uint32_t sampleMask,
+                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+    uint32_t inputMask[KNOB_SIMD_WIDTH];
+    generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
+
+    // Case (2) - partially covered pixel
+
+    // scan for first covered sample per pixel in the 4x2 span
+    unsigned long sampleNum[KNOB_SIMD_WIDTH];
+    (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
+    (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
+    (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
+    (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
+    (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
+    (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
+    (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
+    (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
+
+    // look up and set the sample offsets from UL pixel corner for first covered sample 
+    __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
+                                    samplePos.X(sampleNum[6]),
+                                    samplePos.X(sampleNum[5]),
+                                    samplePos.X(sampleNum[4]),
+                                    samplePos.X(sampleNum[3]),
+                                    samplePos.X(sampleNum[2]),
+                                    samplePos.X(sampleNum[1]),
+                                    samplePos.X(sampleNum[0]));
+
+    __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
+                                    samplePos.Y(sampleNum[6]),
+                                    samplePos.Y(sampleNum[5]),
+                                    samplePos.Y(sampleNum[4]),
+                                    samplePos.Y(sampleNum[3]),
+                                    samplePos.Y(sampleNum[2]),
+                                    samplePos.Y(sampleNum[1]),
+                                    samplePos.Y(sampleNum[0]));
+    // add sample offset to UL pixel corner
+    vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
+    vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
+
+    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
+    static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
+    simdscalari vInputCoveragei =  _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+    simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
+
+    static const simdscalari vZero = _simd_setzero_si();
+    const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+    simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+    simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+    simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+
+    simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
+
+    // set the centroid position based on results from above
+    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
+    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
+
+    // Case (3a) No samples covered and partial sample mask
+    simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
+    // sample mask should never be all 0's for this case, but handle it anyways
+    unsigned long firstCoveredSampleMaskSample = 0;
+    (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
+
+    simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
+
+    vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
+    vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
+
+    // blend in case 3a pixel locations
+    psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
+    psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
+}
+
+INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
+                                     const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+    // evaluate I,J
+    psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
+    psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
+    psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
+    psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
+
+    // interpolate 1/w
+    psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
+}
+
+INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
+{
+    const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
+    const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
+
+    return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
+}
+
+template<typename T>
+INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
+{
+    // RT has to be single sample if we're in forcedMSAA mode
+    if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
+    {
+        return 1;
+    }
+    // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
+    else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
+    {
+        return GetNumSamples(blendSampleCount);
+    }
+    // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
+    else
+    {
+        return T::MultisampleT::numSamples;
+    }
+}
+
+inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
+{
+    // broadcast scalars
+
+    coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
+    coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
+    coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
+
+    coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
+    coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
+    coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
+
+    coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
+    coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
+    coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
+
+    coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
+
+    coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
+    coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
+    coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+}
+
+inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers)
+{
+    assert(colorBufferCount <= SWR_NUM_RENDERTARGETS);
+
+    if (pColorBuffer)
+    {
+        for (uint32_t index = 0; index < colorBufferCount; index += 1)
+        {
+            pColorBuffer[index] = renderBuffers.pColor[index];
+        }
+    }
+
+    if (pDepthBuffer)
+    {
+        *pDepthBuffer = renderBuffers.pDepth;
+    }
+
+    if (pStencilBuffer)
+    {
+        *pStencilBuffer = renderBuffers.pStencil;;
+    }
+}
+
+template<typename T>
+void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
+{
+    psContext->pAttribs = work.pAttribs;
+    psContext->pPerspAttribs = work.pPerspAttribs;
+    psContext->frontFace = work.triFlags.frontFacing;
+    psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
+
+    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
+    psContext->I = work.I;
+    psContext->J = work.J;
+
+    psContext->recipDet = work.recipDet;
+    psContext->pRecipW = work.pRecipW;
+    psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
+    psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
+    psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
+    psContext->sampleIndex = 0;
+}
+
+template<typename T, bool IsSingleSample>
+void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
+                  const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
+{
+    if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
+    {
+        // for 1x case, centroid is pixel center
+        psContext->vX.centroid = psContext->vX.center;
+        psContext->vY.centroid = psContext->vY.center;
+        psContext->vI.centroid = psContext->vI.center;
+        psContext->vJ.centroid = psContext->vJ.center;
+        psContext->vOneOverW.centroid = psContext->vOneOverW.center;
+    }
+    else
+    {
+        if (T::bCentroidPos)
+        {
+            ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
+            if (T::bIsCenterPattern)
+            {
+                psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
+                psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
+            }
+            else
+            {
+                // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
+                CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
+            }
+
+            CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
+        }
+        else
+        {
+            psContext->vX.centroid = psContext->vX.sample;
+            psContext->vY.centroid = psContext->vY.sample;
+        }
+    }
+}
+
+template<typename T>
+struct PixelRateZTestLoop
+{
+    PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
+                       uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
+                       pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
+                       samplePos(state.rastState.samplePositions),
+                       clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
+
+    INLINE
+    uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, 
+                        const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
+    {
+        SWR_CONTEXT *pContext = pDC->pContext;
+
+        uint32_t statCount = 0;
+        simdscalar anyDepthSamplePassed = _simd_setzero_ps();
+        for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+        {
+            const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
+            vCoverageMask[sample] = _simd_and_ps(activeLanes, vMask(pCoverageMask[currentSimdIn8x8] & MASK));
+
+            if(!_simd_movemask_ps(vCoverageMask[sample]))
+            {
+                vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
+                continue;
+            }
+
+            // offset depth/stencil buffers current sample
+            uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+            uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+            if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+            {
+                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+                const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
+
+                const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+                const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(CalcDepthBoundsAcceptMask(z, minz, maxz)));
+            }
+
+            AR_BEGIN(BEBarycentric, pDC->drawId);
+
+            // calculate per sample positions
+            psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
+            psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
+
+            // calc I & J per sample
+            CalcSampleBarycentrics(coeffs, psContext);
+
+            if(psState.writesODepth)
+            {
+                {
+                    // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
+                    vZ[sample] = psContext.vZ;
+                }
+            }
+            else
+            {
+                vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
+            }
+
+            AR_END(BEBarycentric, 0);
+
+            ///@todo: perspective correct vs non-perspective correct clipping?
+            // if clip distances are enabled, we need to interpolate for each sample
+            if(clipDistanceMask)
+            {
+                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
+
+                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
+            }
+
+            // ZTest for this sample
+            ///@todo Need to uncomment out this bucket.
+            //AR_BEGIN(BEDepthBucket, pDC->drawId);
+            depthPassMask[sample] = vCoverageMask[sample];
+            stencilPassMask[sample] = vCoverageMask[sample];
+            depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                                                     vZ[sample], pDepthSample, vCoverageMask[sample], 
+                                                     pStencilSample, &stencilPassMask[sample]);
+            //AR_END(BEDepthBucket, 0);
+
+            // early-exit if no pixels passed depth or earlyZ is forced on
+            if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
+            {
+                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
+                                  pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
+
+                if(!_simd_movemask_ps(depthPassMask[sample]))
+                {
+                    continue;
+                }
+            }
+            anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
+            uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
+            statCount += _mm_popcnt_u32(statMask);
+        }
+
+        activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
+        // return number of samples that passed depth and coverage
+        return statCount;
+    }
+
+    // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
+    simdscalar vZ[T::MultisampleT::numCoverageSamples];
+    simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
+    simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
+    simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];
+
+private:
+    // functor inputs
+    DRAW_CONTEXT* pDC;
+    uint32_t workerId;
+
+    const SWR_TRIANGLE_DESC& work;
+    const BarycentricCoeffs& coeffs;
+    const API_STATE& state;
+    const SWR_PS_STATE& psState;
+    const SWR_MULTISAMPLE_POS& samplePos;
+    const uint8_t clipDistanceMask;
+    uint8_t*& pDepthBuffer;
+    uint8_t*& pStencilBuffer;
+};
+
+INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+    // evaluate I,J
+    psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
+    psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
+    psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
+    psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
+
+    // interpolate 1/w
+    psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
+}
+
+static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+{
+    // evaluate I,J
+    psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
+    psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
+    psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
+    psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
+
+    // interpolate 1/w
+    psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
+}
+
+// Merge Output to 4x2 SIMD Tile Format
+INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
+    const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT)
+{
+    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+    const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
+    simdvector blendOut;
+
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
+
+        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+
+        {
+            // pfnBlendFunc may not update all channels.  Initialize with PS output.
+            /// TODO: move this into the blend JIT.
+            blendOut = psContext.shaded[rt];
+
+            // Blend outputs and update coverage mask for alpha test
+            if(pfnBlendFunc[rt] != nullptr)
+            {
+                pfnBlendFunc[rt](
+                    pBlendState,
+                    psContext.shaded[rt],
+                    psContext.shaded[1],
+                    psContext.shaded[0].w,
+                    sample,
+                    pColorSample,
+                    blendOut,
+                    &psContext.oMask,
+                    (simdscalari*)&coverageMask);
+            }
+        }
+
+        // final write mask 
+        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
+
+        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
+        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+
+        const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
+
+        // store with color mask
+        if(!pRTBlend->writeDisableRed)
+        {
+            _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
+        }
+        if(!pRTBlend->writeDisableGreen)
+        {
+            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
+        }
+        if(!pRTBlend->writeDisableBlue)
+        {
+            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
+        }
+        if(!pRTBlend->writeDisableAlpha)
+        {
+            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
+        }
+    }
+}
+
+#if USE_8x2_TILE_BACKEND
+// Merge Output to 8x2 SIMD16 Tile Format
+INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
+    const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset)
+{
+    // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
+    uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
+
+    if (useAlternateOffset)
+    {
+        rasterTileColorOffset += sizeof(simdscalar);
+    }
+
+    simdvector blendSrc;
+    simdvector blendOut;
+
+    uint32_t colorBufferBit = 1;
+    for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1)
+    {
+        simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
+
+        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+
+        if (colorBufferBit & colorBufferEnableMask)
+        {
+            blendSrc[0] = pColorSample[0];
+            blendSrc[1] = pColorSample[2];
+            blendSrc[2] = pColorSample[4];
+            blendSrc[3] = pColorSample[6];
+        }
+
+        {
+            // pfnBlendFunc may not update all channels.  Initialize with PS output.
+            /// TODO: move this into the blend JIT.
+            blendOut = psContext.shaded[rt];
+
+            // Blend outputs and update coverage mask for alpha test
+            if(pfnBlendFunc[rt] != nullptr)
+            {
+                pfnBlendFunc[rt](
+                    pBlendState,
+                    psContext.shaded[rt],
+                    psContext.shaded[1],
+                    psContext.shaded[0].w,
+                    sample,
+                    reinterpret_cast<uint8_t *>(&blendSrc),
+                    blendOut,
+                    &psContext.oMask,
+                    reinterpret_cast<simdscalari *>(&coverageMask));
+            }
+        }
+
+        // final write mask 
+        simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
+
+        ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
+        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+
+        // store with color mask
+        if (!pRTBlend->writeDisableRed)
+        {
+            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
+        }
+        if (!pRTBlend->writeDisableGreen)
+        {
+            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
+        }
+        if (!pRTBlend->writeDisableBlue)
+        {
+            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
+        }
+        if (!pRTBlend->writeDisableAlpha)
+        {
+            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
+        }
+    }
+}
+
+#endif
+
+template<typename T>
+void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
+
+
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(BEPixelRateBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
+
+    const API_STATE &state = GetApiState(pDC);
+
+    BarycentricCoeffs coeffs;
+    SetupBarycentricCoeffs(&coeffs, work);
+
+    SWR_PS_CONTEXT psContext;
+    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
+    SetupPixelShaderContext<T>(&psContext, samplePos, work);
+
+    uint8_t *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+
+    AR_END(BESetup, 0);
+
+    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask);
+
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+#if USE_8x2_TILE_BACKEND
+            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
+#endif
+            simdscalar activeLanes;
+            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+            activeLanes = vMask(work.anyCoveredSamples & MASK);
+
+            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+            {
+                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+            }
+
+            AR_BEGIN(BEBarycentric, pDC->drawId);
+
+            CalcPixelBarycentrics(coeffs, psContext);
+
+            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+            AR_END(BEBarycentric, 0);
+
+            if(T::bForcedSampleCount)
+            {
+                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
+                activeLanes = _simd_and_ps(activeLanes, vSampleMask);
+            }
+
+            // Early-Z?
+            if(T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
+                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+                AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+            }
+
+            // if we have no covered samples that passed depth at this point, go to next tile
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            if(state.psState.usesSourceDepth)
+            {
+                AR_BEGIN(BEBarycentric, pDC->drawId);
+                // interpolate and quantize z
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+                AR_END(BEBarycentric, 0);
+            }
+
+            // pixels that are currently active
+            psContext.activeMask = _simd_castps_si(activeLanes);
+            psContext.oMask = T::MultisampleT::FullSampleMask();
+
+            // execute pixel shader
+            AR_BEGIN(BEPixelShader, pDC->drawId);
+            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
+            AR_END(BEPixelShader, 0);
+
+            // update active lanes to remove any discarded or oMask'd pixels
+            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            // late-Z
+            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
+            {
+                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
+                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
+                AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
+            }
+
+            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
+            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+
+            // output merger
+            // loop over all samples, broadcasting the results of the PS to all passing pixels
+            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
+            {
+                AR_BEGIN(BEOutputMerger, pDC->drawId);
+                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
+                uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
+                simdscalar coverageMask, depthMask;
+                if(T::bForcedSampleCount)
+                {
+                    coverageMask = depthMask = activeLanes;
+                }
+                else
+                {
+                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
+                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
+                    if(!_simd_movemask_ps(depthMask))
+                    {
+                        // stencil should already have been written in early/lateZ tests
+                        AR_END(BEOutputMerger, 0);
+                        continue;
+                    }
+                }
+                
+                // broadcast the results of the PS to all passing pixels
+#if USE_8x2_TILE_BACKEND
+                OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+#else // USE_8x2_TILE_BACKEND
+                OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets);
+#endif // USE_8x2_TILE_BACKEND
+
+                if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
+                {
+                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
+                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
+                }
+                AR_END(BEOutputMerger, 0);
+            }
+Endtile:
+            AR_BEGIN(BEEndTile, pDC->drawId);
+
+            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+            {
+                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+
+            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+
+#if USE_8x2_TILE_BACKEND
+            if (useAlternateOffset)
+            {
+                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+                {
+                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                }
+            }
+#else
+            for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+            {
+                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+            }
+#endif
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+            AR_END(BEEndTile, 0);
+
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+        }
+
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+    }
+
+    AR_END(BEPixelRateBackend, 0);
+}
+
+template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
+         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
+    >
+struct SwrBackendTraits
+{
+    static const bool bIsCenterPattern = (isCenter == 1);
+    static const uint32_t InputCoverage = coverage;
+    static const bool bCentroidPos = (centroid == 1);
+    static const bool bForcedSampleCount = (forced == 1);
+    static const bool bCanEarlyZ = (canEarlyZ == 1);
+    typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
new file mode 100644
index 0000000..0f75ec2
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
@@ -0,0 +1,345 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend.cpp
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+*        operations.
+*
+******************************************************************************/
+
+#include <smmintrin.h>
+
+#include "backend.h"
+#include "backend_impl.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+#include "core/multisample.h"
+
+#include <algorithm>
+
+template<typename T>
+void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(BESampleRateBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
+
+    const API_STATE &state = GetApiState(pDC);
+
+    BarycentricCoeffs coeffs;
+    SetupBarycentricCoeffs(&coeffs, work);
+
+    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+
+    SWR_PS_CONTEXT psContext;
+    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
+    SetupPixelShaderContext<T>(&psContext, samplePos, work);
+
+    AR_END(BESetup, 0);
+
+    psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+#if USE_8x2_TILE_BACKEND
+            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
+
+#endif
+            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+            {
+                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+            }
+
+            AR_BEGIN(BEBarycentric, pDC->drawId);
+
+            CalcPixelBarycentrics(coeffs, psContext);
+
+            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+            AR_END(BEBarycentric, 0);
+
+            for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
+            {
+                simdmask coverageMask = work.coverageMask[sample] & MASK;
+
+                if (coverageMask)
+                {
+                    // offset depth/stencil buffers current sample
+                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+
+                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+                    {
+                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
+
+                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+                    }
+
+                    AR_BEGIN(BEBarycentric, pDC->drawId);
+
+                    // calculate per sample positions
+                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
+                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
+
+                    CalcSampleBarycentrics(coeffs, psContext);
+
+                    // interpolate and quantize z
+                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
+                    AR_END(BEBarycentric, 0);
+
+                    // interpolate user clip distance if available
+                    if (state.rastState.clipDistanceMask)
+                    {
+                        coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
+                    }
+
+                    simdscalar vCoverageMask = vMask(coverageMask);
+                    simdscalar depthPassMask = vCoverageMask;
+                    simdscalar stencilPassMask = vCoverageMask;
+
+                    // Early-Z?
+                    if (T::bCanEarlyZ)
+                    {
+                        AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                        AR_END(BEEarlyDepthTest, 0);
+
+                        // early-exit if no samples passed depth or earlyZ is forced on.
+                        if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
+                        {
+                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+
+                            if (!_simd_movemask_ps(depthPassMask))
+                            {
+                                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+                                continue;
+                            }
+                        }
+                    }
+
+                    psContext.sampleIndex = sample;
+                    psContext.activeMask = _simd_castps_si(vCoverageMask);
+
+                    // execute pixel shader
+                    AR_BEGIN(BEPixelShader, pDC->drawId);
+                    UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
+                    state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                    AR_END(BEPixelShader, 0);
+
+                    vCoverageMask = _simd_castsi_ps(psContext.activeMask);
+
+                    // late-Z
+                    if (!T::bCanEarlyZ)
+                    {
+                        AR_BEGIN(BELateDepthTest, pDC->drawId);
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                        AR_END(BELateDepthTest, 0);
+
+                        if (!_simd_movemask_ps(depthPassMask))
+                        {
+                            // need to call depth/stencil write for stencil write
+                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+
+                            work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+                            continue;
+                        }
+                    }
+
+                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
+                    uint32_t statCount = _mm_popcnt_u32(statMask);
+                    UPDATE_STAT_BE(DepthPassCount, statCount);
+
+                    // output merger
+                    AR_BEGIN(BEOutputMerger, pDC->drawId);
+#if USE_8x2_TILE_BACKEND
+                    OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+#else
+                    OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
+#endif
+
+                    // do final depth write after all pixel kills
+                    if (!state.psState.forceEarlyZ)
+                    {
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                            pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+                    }
+                    AR_END(BEOutputMerger, 0);
+                }
+                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+
+        Endtile:
+            ATTR_UNUSED;
+
+            AR_BEGIN(BEEndTile, pDC->drawId);
+
+            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+
+#if USE_8x2_TILE_BACKEND
+            if (useAlternateOffset)
+            {
+                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+                {
+                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                }
+            }
+#else
+            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+            {
+                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+            }
+#endif
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+            AR_END(BEEndTile, 0);
+
+            psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+        }
+
+        psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+    }
+
+    AR_END(BESampleRateBackend, 0);
+}
+
+// Recursive template used to auto-nest conditionals.  Converts dynamic enum function
+// arguments to static template arguments.
+template <uint32_t... ArgsT>
+struct BEChooserSampleRate
+{
+    // Last Arg Terminator
+    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
+    {
+        switch (tArg)
+        {
+        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
+        case SWR_BACKEND_SINGLE_SAMPLE:
+        case SWR_BACKEND_MSAA_PIXEL_RATE:
+            SWR_ASSERT(0 && "Invalid backend func\n");
+            return nullptr;
+            break;
+        default:
+            SWR_ASSERT(0 && "Invalid backend func\n");
+            return nullptr;
+            break;
+        }
+    }
+
+    // Recursively parse args
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
+    {
+        switch (tArg)
+        {
+        case SWR_INPUT_COVERAGE_NONE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
+        case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
+        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
+        default:
+            SWR_ASSERT(0 && "Invalid sample pattern\n");
+            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
+            break;
+        }
+    }
+
+    // Recursively parse args
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
+    {
+        switch (tArg)
+        {
+        case SWR_MULTISAMPLE_1X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_2X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_4X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_8X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_16X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
+        default:
+            SWR_ASSERT(0 && "Invalid sample count\n");
+            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+            break;
+        }
+    }
+
+    // Recursively parse args
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
+    {
+        if (tArg == true)
+        {
+            return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
+        }
+
+        return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
+    }
+};
+
+void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
+{
+    for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
+    {
+        for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
+        {
+            for (uint32_t centroid = 0; centroid < 2; centroid++)
+            {
+                for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+                {
+                    table[sampleCount][inputCoverage][centroid][canEarlyZ] =
+                        BEChooserSampleRate<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage,
+                        (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+                }
+            }
+        }
+    }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
new file mode 100644
index 0000000..0eecc25
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
@@ -0,0 +1,321 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file backend.cpp
+*
+* @brief Backend handles rasterization, pixel shading and output merger
+*        operations.
+*
+******************************************************************************/
+
+#include <smmintrin.h>
+
+#include "backend.h"
+#include "backend_impl.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+#include "core/multisample.h"
+
+#include <algorithm>
+
+template<typename T>
+void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    AR_BEGIN(BESingleSampleBackend, pDC->drawId);
+    AR_BEGIN(BESetup, pDC->drawId);
+
+    const API_STATE &state = GetApiState(pDC);
+
+    BarycentricCoeffs coeffs;
+    SetupBarycentricCoeffs(&coeffs, work);
+
+    uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer;
+    SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers);
+
+    SWR_PS_CONTEXT psContext;
+    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
+    SetupPixelShaderContext<T>(&psContext, samplePos, work);
+
+    AR_END(BESetup, 1);
+
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+
+    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+
+    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+
+        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
+
+        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+#if USE_8x2_TILE_BACKEND
+            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
+
+#endif
+            simdmask coverageMask = work.coverageMask[0] & MASK;
+
+            if (coverageMask)
+            {
+                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
+                {
+                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+
+                    const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
+
+                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
+                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
+
+                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
+                }
+
+                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
+                {
+                    const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+
+                    generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+                }
+
+                AR_BEGIN(BEBarycentric, pDC->drawId);
+
+                CalcPixelBarycentrics(coeffs, psContext);
+
+                CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+
+                // interpolate and quantize z
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
+                AR_END(BEBarycentric, 1);
+
+                // interpolate user clip distance if available
+                if (state.rastState.clipDistanceMask)
+                {
+                    coverageMask &= ~ComputeUserClipMask(state.rastState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
+                }
+
+                simdscalar vCoverageMask = vMask(coverageMask);
+                simdscalar depthPassMask = vCoverageMask;
+                simdscalar stencilPassMask = vCoverageMask;
+
+                // Early-Z?
+                if (T::bCanEarlyZ)
+                {
+                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                                                     psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
+                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                    AR_END(BEEarlyDepthTest, 0);
+
+                    // early-exit if no pixels passed depth or earlyZ is forced on
+                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
+                    {
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+
+                        if (!_simd_movemask_ps(depthPassMask))
+                        {
+                            goto Endtile;
+                        }
+                    }
+                }
+
+                psContext.sampleIndex = 0;
+                psContext.activeMask = _simd_castps_si(vCoverageMask);
+
+                // execute pixel shader
+                AR_BEGIN(BEPixelShader, pDC->drawId);
+                UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
+                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                AR_END(BEPixelShader, 0);
+
+                vCoverageMask = _simd_castsi_ps(psContext.activeMask);
+
+                // late-Z
+                if (!T::bCanEarlyZ)
+                {
+                    AR_BEGIN(BELateDepthTest, pDC->drawId);
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
+                                                        psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
+                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                    AR_END(BELateDepthTest, 0);
+
+                    if (!_simd_movemask_ps(depthPassMask))
+                    {
+                        // need to call depth/stencil write for stencil write
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+                        goto Endtile;
+                    }
+                } else {
+                    // for early z, consolidate discards from shader
+                    // into depthPassMask
+                    depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
+                }
+
+                uint32_t statMask = _simd_movemask_ps(depthPassMask);
+                uint32_t statCount = _mm_popcnt_u32(statMask);
+                UPDATE_STAT_BE(DepthPassCount, statCount);
+
+                // output merger
+                AR_BEGIN(BEOutputMerger, pDC->drawId);
+#if USE_8x2_TILE_BACKEND
+                OutputMerger8x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset);
+#else
+                OutputMerger4x2(psContext, pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets);
+#endif
+
+                // do final depth write after all pixel kills
+                if (!state.psState.forceEarlyZ)
+                {
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
+                        pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+                }
+                AR_END(BEOutputMerger, 0);
+            }
+
+Endtile:
+            AR_BEGIN(BEEndTile, pDC->drawId);
+
+            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            {
+                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
+            }
+
+#if USE_8x2_TILE_BACKEND
+            if (useAlternateOffset)
+            {
+                for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+                {
+                    pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                }
+            }
+#else
+            for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt)
+            {
+                pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+            }
+#endif
+            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+
+            AR_END(BEEndTile, 0);
+
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
+        }
+
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
+    }
+
+    AR_END(BESingleSampleBackend, 0);
+}
+
+// Recursive template used to auto-nest conditionals.  Converts dynamic enum function
+// arguments to static template arguments.
+template <uint32_t... ArgsT>
+struct BEChooserSingleSample
+{
+    // Last Arg Terminator
+    static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
+    {
+        switch(tArg)
+        {
+        case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
+        case SWR_BACKEND_MSAA_PIXEL_RATE:
+        case SWR_BACKEND_MSAA_SAMPLE_RATE:
+        default:
+            SWR_ASSERT(0 && "Invalid backend func\n");
+            return nullptr;
+            break;
+        }
+    }
+
+    // Recursively parse args
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
+    {
+        switch(tArg)
+        {
+        case SWR_INPUT_COVERAGE_NONE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
+        case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
+        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
+        default:
+        SWR_ASSERT(0 && "Invalid sample pattern\n");
+        return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
+        break;
+        }
+    }
+
+    // Recursively parse args
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
+    {
+        switch(tArg)
+        {
+        case SWR_MULTISAMPLE_1X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_2X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_4X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_8X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_16X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
+        default:
+        SWR_ASSERT(0 && "Invalid sample count\n");
+        return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+        break;
+        }
+    }
+
+    // Recursively parse args
+    template <typename... TArgsT>
+    static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
+    {
+        if(tArg == true)
+        {
+            return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
+        }
+
+        return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
+    }
+};
+
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
+{
+    for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
+    {
+        for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
+        {
+            for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+            {
+                table[inputCoverage][isCentroid][canEarlyZ] =
+                    BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
+                                         (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
+            }
+        }
+    }
+}
-- 
2.7.4



More information about the mesa-dev mailing list