[Mesa-dev] [PATCH v2 07/13] swr/rast: Split rasterizer.cpp to improve compile time

Tim Rowley timothy.o.rowley at intel.com
Tue Jun 27 18:09:09 UTC 2017


Hardcode split to four files currently.  Decreases swr build
time on KNL by over 50%.
---
 src/gallium/drivers/swr/Makefile.am                |   39 +-
 src/gallium/drivers/swr/Makefile.sources           |    2 +-
 src/gallium/drivers/swr/SConscript                 |   24 +-
 .../drivers/swr/rasterizer/codegen/gen_backends.py |   12 +
 .../codegen/templates/gen_rasterizer.cpp           |   42 +
 src/gallium/drivers/swr/rasterizer/core/api.cpp    |    1 +
 .../drivers/swr/rasterizer/core/multisample.cpp    |   48 -
 .../drivers/swr/rasterizer/core/rasterizer.cpp     | 1788 +++-----------------
 .../drivers/swr/rasterizer/core/rasterizer.h       |   31 +-
 .../drivers/swr/rasterizer/core/rasterizer_impl.h  | 1376 +++++++++++++++
 10 files changed, 1739 insertions(+), 1624 deletions(-)
 create mode 100644 src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp
 delete mode 100644 src/gallium/drivers/swr/rasterizer/core/multisample.cpp
 create mode 100644 src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h

diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am
index 578f159..4b4bd37 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -67,7 +67,12 @@ BUILT_SOURCES = \
 	rasterizer/core/backends/gen_BackendPixelRate1.cpp \
 	rasterizer/core/backends/gen_BackendPixelRate2.cpp \
 	rasterizer/core/backends/gen_BackendPixelRate3.cpp \
-	rasterizer/core/backends/gen_BackendPixelRate.hpp
+	rasterizer/core/backends/gen_BackendPixelRate.hpp \
+	rasterizer/core/backends/gen_rasterizer0.cpp \
+	rasterizer/core/backends/gen_rasterizer1.cpp \
+	rasterizer/core/backends/gen_rasterizer2.cpp \
+	rasterizer/core/backends/gen_rasterizer3.cpp \
+	rasterizer/core/backends/gen_rasterizer.hpp
 
 MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
@@ -173,6 +178,35 @@ backend.intermediate: rasterizer/codegen/gen_backends.py rasterizer/codegen/temp
 		--cpp \
 		--hpp
 
+rasterizer/core/backends/gen_rasterizer0.cpp \
+rasterizer/core/backends/gen_rasterizer1.cpp \
+rasterizer/core/backends/gen_rasterizer2.cpp \
+rasterizer/core/backends/gen_rasterizer3.cpp \
+rasterizer/core/backends/gen_rasterizer.hpp: \
+rasterizer.intermediate
+
+# 5 SWR_MULTISAMPLE_TYPE_COUNT
+# 2 CenterPattern
+# 2 Conservative
+# 3 SWR_INPUT_COVERAGE_COUNT
+# 5 STATE_VALID_TRI_EDGE_COUNT
+# 2 RasterScissorEdges
+
+# use intermediate rule to tell make that all files can be
+# generated in one invocation of gen_backends.py (prevents
+# parallel make race condition)
+.INTERMEDIATE: rasterizer.intermediate
+rasterizer.intermediate: rasterizer/codegen/gen_backends.py rasterizer/codegen/templates/gen_rasterizer.cpp rasterizer/codegen/templates/gen_header_init.hpp
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) \
+		$(srcdir)/rasterizer/codegen/gen_backends.py \
+		--outdir rasterizer/core/backends \
+		--rast \
+		--dim 5 2 2 3 5 2 \
+		--numfiles 4 \
+		--cpp \
+		--hpp
+
 COMMON_LIBADD = \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/mesa/libmesagallium.la \
@@ -247,4 +281,5 @@ EXTRA_DIST = \
 	rasterizer/codegen/templates/gen_builder.hpp \
 	rasterizer/codegen/templates/gen_header_init.hpp \
 	rasterizer/codegen/templates/gen_knobs.cpp \
-	rasterizer/codegen/templates/gen_llvm.hpp
+	rasterizer/codegen/templates/gen_llvm.hpp \
+	rasterizer/codegen/templates/gen_rasterizer.cpp
diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources
index d9894c2..12a5e7d 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -95,12 +95,12 @@ CORE_CXX_SOURCES := \
 	rasterizer/core/frontend.h \
 	rasterizer/core/knobs.h \
 	rasterizer/core/knobs_init.h \
-	rasterizer/core/multisample.cpp \
 	rasterizer/core/multisample.h \
 	rasterizer/core/pa_avx.cpp \
 	rasterizer/core/pa.h \
 	rasterizer/core/rasterizer.cpp \
 	rasterizer/core/rasterizer.h \
+	rasterizer/core/rasterizer_impl.h \
 	rasterizer/core/rdtsc_core.cpp \
 	rasterizer/core/rdtsc_core.h \
 	rasterizer/core/ringbuffer.h \
diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript
index 0f3cd6c..512269a 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -156,6 +156,28 @@ Depends(backendPixelRateFiles,
          'rasterizer/codegen/gen_knobs.h']
         )
 
+# 5 SWR_MULTISAMPLE_TYPE_COUNT
+# 2 CenterPattern
+# 2 Conservative
+# 3 SWR_INPUT_COVERAGE_COUNT
+# 5 STATE_VALID_TRI_EDGE_COUNT
+# 2 RasterScissorEdges
+genRasterizerFileCount = 4
+genRasterizerFilePat = "rasterizer/core/backends/gen_rasterizer%s.cpp"
+genRasterizerFiles = map(lambda x: genRasterizerFilePat % x,
+                         range(0, genRasterizerFileCount))
+env.CodeGenerate(
+    target = 'rasterizer/core/backends/gen_rasterizer.hpp',
+    script = swrroot + 'rasterizer/codegen/gen_backends.py',
+    source = '',
+    command = python_cmd + ' $SCRIPT --outdir ' + bldroot + '/rasterizer/core/backends --rast --dim 5 2 2 3 5 2 --numfiles ' + str(genRasterizerFileCount) + ' --cpp --hpp'
+    )
+Depends(genRasterizerFiles,
+        ['rasterizer/core/backends/gen_rasterizer.hpp',
+         'rasterizer/archrast/gen_ar_event.hpp',
+         'rasterizer/codegen/gen_knobs.h']
+        )
+
 Depends('rasterizer/jitter/gen_state_llvm.h',
         swrroot + 'rasterizer/codegen/templates/gen_backend.cpp')
 
@@ -165,7 +187,7 @@ built_sources = [
     'rasterizer/archrast/gen_ar_event.cpp',
     ]
 
-built_sources += backendPixelRateFiles
+built_sources += [backendPixelRateFiles, genRasterizerFiles]
 
 source = built_sources
 source += env.ParseSourceList(swrroot + 'Makefile.sources', [
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
index 2fc91d1..414a04e 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_backends.py
@@ -39,6 +39,7 @@ def main(args=sys.argv[1:]):
     parser.add_argument('--cpp', help='Generate cpp file(s)', action='store_true', default=False)
     parser.add_argument('--hpp', help='Generate hpp file', action='store_true', default=False)
     parser.add_argument('--cmake', help='Generate cmake file', action='store_true', default=False)
+    parser.add_argument('--rast', help='Generate rasterizer functions instead of normal backend', action='store_true', default=False)
 
     args = parser.parse_args(args)
 
@@ -55,6 +56,17 @@ def main(args=sys.argv[1:]):
             self.cmakeSrcVar = 'GEN_BACKEND_SOURCES'
             self.tableName = 'BackendPixelRate'
 
+            if args.rast:
+                self.outFileName = 'gen_rasterizer%s.cpp'
+                self.outHeaderName = 'gen_rasterizer.hpp'
+                self.functionTableName = 'gRasterizerFuncs'
+                self.funcInstanceHeader = ' = RasterizeTriangle<RasterizerTraits<'
+                self.template = 'gen_rasterizer.cpp'
+                self.cmakeFileName = 'gen_rasterizer.cmake'
+                self.cmakeSrcVar = 'GEN_RASTERIZER_SOURCES'
+                self.tableName = 'RasterizerFuncs'
+
+
     backend = backendStrs()
 
     output_list = []
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp
new file mode 100644
index 0000000..06c8762
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp
@@ -0,0 +1,42 @@
+//============================================================================
+// Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice (including the next
+// paragraph) shall be included in all copies or substantial portions of the
+// Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+// IN THE SOFTWARE.
+// 
+// @file gen_rasterizer${fileNum}.cpp
+// 
+// @brief auto-generated file
+// 
+// DO NOT EDIT
+//
+// Generation Command Line:
+//  ${'\n//    '.join(cmdline)}
+//
+//============================================================================
+
+#include "core/rasterizer.h"
+#include "core/rasterizer_impl.h"
+
+void InitRasterizerFuncs${fileNum}()
+{
+    %for func in funcList:
+    ${func}
+    %endfor
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index cf895fb..d3d80e4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1651,6 +1651,7 @@ void SwrInit()
 
     InitClearTilesTable();
     InitBackendFuncTables();
+    InitRasterizerFunctions();
 }
 
 void SwrGetInterface(SWR_INTERFACE &out_funcs)
diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.cpp b/src/gallium/drivers/swr/rasterizer/core/multisample.cpp
deleted file mode 100644
index 8b20f7a..0000000
--- a/src/gallium/drivers/swr/rasterizer/core/multisample.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file multisample.cpp
-*
-******************************************************************************/
-
-#include "multisample.h"
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosXi[1];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosYi[1];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi[2];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi[2];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi[4];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi[4];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi[8];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi[8];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi[16];
-constexpr uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi[16];
-
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX[1];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY[1];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosX[2];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosY[2];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosX[4];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosY[4];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosX[8];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosY[8];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosX[16];
-constexpr float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosY[16];
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index 4df146e..a3ff557 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -30,1539 +30,16 @@
 #include <algorithm>
 
 #include "rasterizer.h"
+#include "backends/gen_rasterizer.hpp"
 #include "rdtsc_core.h"
 #include "backend.h"
 #include "utils.h"
 #include "frontend.h"
 #include "tilemgr.h"
 #include "memory/tilingtraits.h"
+#include "rasterizer_impl.h"
 
-template <uint32_t numSamples = 1>
-void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
-template <typename RT>
-void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
-template <typename RT>
-void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
-
-#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
-const __m256d gMaskToVecpd[] =
-{
-    MASKTOVEC(0, 0, 0, 0),
-    MASKTOVEC(0, 0, 0, 1),
-    MASKTOVEC(0, 0, 1, 0),
-    MASKTOVEC(0, 0, 1, 1),
-    MASKTOVEC(0, 1, 0, 0),
-    MASKTOVEC(0, 1, 0, 1),
-    MASKTOVEC(0, 1, 1, 0),
-    MASKTOVEC(0, 1, 1, 1),
-    MASKTOVEC(1, 0, 0, 0),
-    MASKTOVEC(1, 0, 0, 1),
-    MASKTOVEC(1, 0, 1, 0),
-    MASKTOVEC(1, 0, 1, 1),
-    MASKTOVEC(1, 1, 0, 0),
-    MASKTOVEC(1, 1, 0, 1),
-    MASKTOVEC(1, 1, 1, 0),
-    MASKTOVEC(1, 1, 1, 1),
-};
-
-struct POS
-{
-    int32_t x, y;
-};
-
-struct EDGE
-{
-    double a, b;                // a, b edge coefficients in fix8
-    double stepQuadX;           // step to adjacent horizontal quad in fix16
-    double stepQuadY;           // step to adjacent vertical quad in fix16
-    double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16
-    double stepRasterTileY;     // step to adjacent vertical raster tile in fix16
-
-    __m256d vQuadOffsets;       // offsets for 4 samples of a quad
-    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief rasterize a raster tile partially covered by the triangle
-/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
-/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C)
-/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
-///        Used to step between quads when sweeping over the raster tile.
-template<uint32_t NumEdges, typename EdgeMaskT>
-INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
-{
-    uint64_t coverageMask = 0;
-
-    __m256d vEdges[NumEdges];
-    __m256d vStepX[NumEdges];
-    __m256d vStepY[NumEdges];
-
-    for (uint32_t e = 0; e < NumEdges; ++e)
-    {
-        // Step to the pixel sample locations of the 1st quad
-        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
-
-        // compute step to next quad (mul by 2 in x and y direction)
-        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
-        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
-    }
-
-    // fast unrolled version for 8x8 tile
-#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
-    int edgeMask[NumEdges];
-    uint64_t mask;
-
-    auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
-    auto update_lambda = [&](int e){mask &= edgeMask[e];};
-    auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
-    auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
-    auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
-
-// evaluate which pixels in the quad are covered
-#define EVAL \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
-
-    // update coverage mask
-    // if edge 0 is degenerate and will be skipped; init the mask
-#define UPDATE_MASK(bit) \
-            if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
-                mask = 0xf;\
-            }\
-            else{\
-                mask = edgeMask[0]; \
-            }\
-            UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
-            coverageMask |= (mask << bit);
-
-    // step in the +x direction to the next quad 
-#define INCX \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
-
-    // step in the +y direction to the next quad 
-#define INCY \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
-
-    // step in the -x direction to the next quad 
-#define DECX \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
-
-    // sweep 2x2 quad back and forth through the raster tile, 
-    // computing coverage masks for the entire tile
-
-    // raster tile
-    // 0  1  2  3  4  5  6  7 
-    // x  x
-    // x  x ------------------>  
-    //                   x  x  |
-    // <-----------------x  x  V
-    // ..
-
-    // row 0
-    EVAL;
-    UPDATE_MASK(0);
-    INCX;
-    EVAL;
-    UPDATE_MASK(4);
-    INCX;
-    EVAL;
-    UPDATE_MASK(8);
-    INCX;
-    EVAL;
-    UPDATE_MASK(12);
-    INCY;
-
-    //row 1
-    EVAL;
-    UPDATE_MASK(28);
-    DECX;
-    EVAL;
-    UPDATE_MASK(24);
-    DECX;
-    EVAL;
-    UPDATE_MASK(20);
-    DECX;
-    EVAL;
-    UPDATE_MASK(16);
-    INCY;
-
-    // row 2
-    EVAL;
-    UPDATE_MASK(32);
-    INCX;
-    EVAL;
-    UPDATE_MASK(36);
-    INCX;
-    EVAL;
-    UPDATE_MASK(40);
-    INCX;
-    EVAL;
-    UPDATE_MASK(44);
-    INCY;
-
-    // row 3
-    EVAL;
-    UPDATE_MASK(60);
-    DECX;
-    EVAL;
-    UPDATE_MASK(56);
-    DECX;
-    EVAL;
-    UPDATE_MASK(52);
-    DECX;
-    EVAL;
-    UPDATE_MASK(48);
-#else
-    uint32_t bit = 0;
-    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
-    {
-        __m256d vStartOfRowEdge[NumEdges];
-        for (uint32_t e = 0; e < NumEdges; ++e)
-        {
-            vStartOfRowEdge[e] = vEdges[e];
-        }
-
-        for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
-        {
-            int edgeMask[NumEdges];
-            for (uint32_t e = 0; e < NumEdges; ++e)
-            {
-                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
-            }
-
-            uint64_t mask = edgeMask[0];
-            for (uint32_t e = 1; e < NumEdges; ++e)
-            {
-                mask &= edgeMask[e];
-            }
-            coverageMask |= (mask << bit);
-
-            // step to the next pixel in the x
-            for (uint32_t e = 0; e < NumEdges; ++e)
-            {
-                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
-            }
-            bit+=4;
-        }
-
-        // step to the next row
-        for (uint32_t e = 0; e < NumEdges; ++e)
-        {
-            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
-        }
-    }
-#endif
-    return coverageMask;
-
-}
-// Top left rule:
-// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
-// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
-// Top left: a sample is in if it is a top or left edge.
-// Out: !(horizontal && above) = !horizontal && below
-// Out: !horizontal && left = !(!horizontal && left) = horizontal and right 
-INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge) 
-{
-    // if vA < 0, vC--
-    // if vA == 0 && vB < 0, vC--
-
-    __m256d vEdgeOut = vEdge;
-    __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
-
-    // if vA < 0 (line is not horizontal and below)
-    int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
-
-    // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
-    __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
-    int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
-    msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
-
-    // if either of these are true and we're on the line (edge == 0), bump it outside the line
-    vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief calculates difference in precision between the result of manh
-/// calculation and the edge precision, based on compile time trait values
-template<typename RT>
-constexpr int64_t ManhToEdgePrecisionAdjust()
-{
-    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
-                  "Inadequate precision of result of manh calculation ");
-    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct adjustEdgeConservative
-/// @brief Primary template definition used for partially specializing 
-/// the adjustEdgeConservative function. This struct should never
-/// be instantiated.
-/// @tparam RT: rasterizer traits
-/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
-template <typename RT, typename ConservativeEdgeOffsetT>
-struct adjustEdgeConservative
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Performs calculations to adjust each edge of a triangle away
-    /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-    /// direction. 
-    ///
-    /// Uncertainty regions arise from fixed point rounding, which
-    /// can snap a vertex +/- by min fixed point value.
-    /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
-    /// This allows the rasterizer to test for coverage only at the pixel center, 
-    /// instead of having to test individual pixel corners for conservative coverage
-    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
-    {
-        // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away 
-        // from the pixel center (in the direction of the edge normal A/B)
-
-        // edge = Ax + Bx + C - (manh/e)
-        // manh = manhattan distance = abs(A) + abs(B)
-        // e = absolute rounding error from snapping from float to fixed point precision
-
-        // 'fixed point' multiply (in double to be avx1 friendly) 
-        // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
-        __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
-        __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
-                                     _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
-
-        static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
-                      "Inadequate precision of result of manh calculation ");
-
-        // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
-        // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
-        manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
-
-        // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
-        // this allows the rasterizer to do a single conservative coverage test to see if the primitive
-        // intersects the pixel at all
-        vEdge = _mm256_sub_pd(vEdge, manh);
-    };
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief adjustEdgeConservative specialization where no edge offset is needed
-template <typename RT>
-struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
-{
-    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief calculates the distance a degenerate BBox needs to be adjusted 
-/// for conservative rast based on compile time trait values
-template<typename RT>
-constexpr int64_t ConservativeScissorOffset()
-{
-    static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
-    // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
-    typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
-    // 1/2 pixel edge offset + conservative offset - degenerateTriangle
-    return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Performs calculations to adjust each a vector of evaluated edges out
-/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-/// direction. 
-template <typename RT>
-INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
-{
-    int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
-    int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
-    vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Performs calculations to adjust each a scalar evaluated edge out
-/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-/// direction. 
-template <typename RT, typename OffsetT>
-INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
-{
-    int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
-    int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
-    return (Edge - manh);
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Perform any needed adjustments to evaluated triangle edges
-template <typename RT, typename EdgeOffsetT>
-struct adjustEdgesFix16
-{
-    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
-    {
-        static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
-                      "Edge equation expected to be in x.16 fixed point");
-
-        static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
-
-        // need to apply any edge offsets before applying the top-left rule
-        adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
-
-        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Perform top left adjustments to evaluated triangle edges
-template <typename RT>
-struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
-{
-    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
-    {
-        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
-    }
-};
-
-// max(abs(dz/dx), abs(dz,dy)
-INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
-{
-    /*
-    // evaluate i,j at (0,0)
-    float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
-    float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
-
-    // evaluate i,j at (1,0)
-    float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
-    float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
-
-    // compute dz/dx
-    float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
-    float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
-    float dzdx = abs(d10 - d00);
-
-    // evaluate i,j at (0,1)
-    float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
-    float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
-
-    float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
-    float dzdy = abs(d01 - d00);
-    */
-
-    // optimized version of above
-    float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
-    float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
-
-    return std::max(dzdx, dzdy);
-}
-
-INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
-{
-    if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
-    {
-        return (1.0f / (1 << 24));
-    }
-    else if (pState->depthFormat == R16_UNORM)
-    {
-        return (1.0f / (1 << 16));
-    }
-    else
-    {
-        SWR_ASSERT(pState->depthFormat == R32_FLOAT);
-
-        // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
-        float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
-        uint32_t zMaxInt = *(uint32_t*)&zMax;
-        zMaxInt &= 0x7f800000;
-        zMax = *(float*)&zMaxInt;
-
-        return zMax * (1.0f / (1 << 23));
-    }
-}
-
-INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
-{
-    if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
-    {
-        return 0.0f;
-    }
-
-    float scale = pState->slopeScaledDepthBias;
-    if (scale != 0.0f)
-    {
-        scale *= ComputeMaxDepthSlope(pTri);
-    }
-
-    float bias = pState->depthBias;
-    if (!pState->depthBiasPreAdjusted)
-    {
-        bias *= ComputeBiasFactor(pState, pTri, z);
-    }
-    bias += scale;
-
-    if (pState->depthBiasClamp > 0.0f)
-    {
-        bias = std::min(bias, pState->depthBiasClamp);
-    }
-    else if (pState->depthBiasClamp < 0.0f)
-    {
-        bias = std::max(bias, pState->depthBiasClamp);
-    }
-
-    return bias;
-}
-
-// Prevent DCE by writing coverage mask from rasterizer to volatile
-#if KNOB_ENABLE_TOSS_POINTS
-__declspec(thread) volatile uint64_t gToss;
-#endif
-
-static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
-// try to avoid _chkstk insertions; make this thread local
-static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
-
-INLINE
-void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
-{
-    edge.a = a;
-    edge.b = b;
-
-    // compute constant steps to adjacent quads
-    edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
-    edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
-
-    // compute constant steps to adjacent raster tiles
-    edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
-    edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
-
-    // compute quad offsets
-    const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
-    const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
-
-    __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
-    __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
-    edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
-
-    // compute raster tile offsets
-    const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
-    const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
-
-    __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
-    __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
-    edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
-}
-
-INLINE
-void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
-{
-    ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary template definition used for partially specializing 
-/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel 
-/// corner to sample position, and test for coverage
-/// @tparam sampleCount: multisample count
-template <typename NumSamplesT>
-INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
-                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
-{
-    __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
-    // evaluate edge equations at the tile multisample bounding box
-    vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
-    vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
-    vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
-    mask0 = _mm256_movemask_pd(vSampleBboxTest0);
-    mask1 = _mm256_movemask_pd(vSampleBboxTest1);
-    mask2 = _mm256_movemask_pd(vSampleBboxTest2);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
-/// when only rasterizing a single coverage test point
-template <>
-INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
-                                           int32_t &mask0, int32_t &mask1, int32_t &mask2)
-{
-    mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
-    mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
-    mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @struct ComputeScissorEdges
-/// @brief Primary template definition. Allows the function to be generically
-/// called. When paired with below specializations, will result in an empty 
-/// inlined function if scissor is not enabled
-/// @tparam RasterScissorEdgesT: is scissor enabled?
-/// @tparam IsConservativeT: is conservative rast enabled?
-/// @tparam RT: rasterizer traits
-template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
-struct ComputeScissorEdges
-{
-    INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 
-                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial 
-/// specialization. Instantiated when conservative rast and scissor are enabled
-template <typename RT>
-struct ComputeScissorEdges<std::true_type, std::true_type, RT>
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, 
-    /// evaluate edge equations and offset them away from pixel center.
-    INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
-                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
-    {
-        // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
-        SWR_RECT scissor;
-        scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
-        scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
-        scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
-        scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
-
-        POS topLeft{scissor.xmin, scissor.ymin};
-        POS bottomLeft{scissor.xmin, scissor.ymax};
-        POS topRight{scissor.xmax, scissor.ymin};
-        POS bottomRight{scissor.xmax, scissor.ymax};
-
-        // construct 4 scissor edges in ccw direction
-        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
-        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
-        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
-        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
-
-        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
-        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
-        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
-        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
-
-        // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
-        adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
-        adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
-        adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
-        adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
-
-        // Upper left rule for scissor
-        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
-        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial 
-/// specialization. Instantiated when scissor is enabled and conservative rast
-/// is disabled.
-template <typename RT>
-struct ComputeScissorEdges<std::true_type, std::false_type, RT>
-{
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Compute scissor edge vectors and evaluate edge equations
-    INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
-                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
-    {
-        const SWR_RECT &scissor = scissorBBox;
-        POS topLeft{scissor.xmin, scissor.ymin};
-        POS bottomLeft{scissor.xmin, scissor.ymax};
-        POS topRight{scissor.xmax, scissor.ymin};
-        POS bottomRight{scissor.xmax, scissor.ymax};
-
-        // construct 4 scissor edges in ccw direction
-        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
-        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
-        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
-        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
-
-        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
-        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
-        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
-        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
-
-        // Upper left rule for scissor
-        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
-        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for TrivialRejectTest. Should
-/// never be called, but TemplateUnroller instantiates a few unused values,
-/// so it calls a runtime assert instead of a static_assert.
-template <typename ValidEdgeMaskT>
-INLINE bool TrivialRejectTest(const int, const int, const int)
-{
-    SWR_INVALID("Primary templated function should never be called");
-    return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
-/// and edge 1 for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
-{
-    return (!(mask0 && mask1)) ? true : false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
-/// and edge 2 for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
-{
-    return (!(mask0 && mask2)) ? true : false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
-/// and edge 2 for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
-{
-    return (!(mask1 && mask2)) ? true : false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
-/// primitive edges for trivial coverage reject
-template <>
-INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
-{
-    return (!(mask0 && mask1 && mask2)) ? true : false;;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
-/// point, so return false and rasterize against conservative BBox
-template <>
-INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
-{
-    return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for TrivialAcceptTest. Always returns
-/// false, since it will only be called for degenerate tris, and as such 
-/// will never cover the entire raster tile
-template <typename ScissorEnableT>
-INLINE bool TrivialAcceptTest(const int, const int, const int)
-{
-    return false;
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
-/// edge masks for a fully covered raster tile
-template <>
-INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
-{
-    return ((mask0 & mask1 & mask2) == 0xf);
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for GenerateSVInnerCoverage. Results
-/// in an empty function call if SVInnerCoverage isn't requested
-template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
-struct GenerateSVInnerCoverage
-{
-    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of GenerateSVInnerCoverage where all edges
-/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated 
-/// edge values from OuterConservative to InnerConservative and rasterizes.
-template <typename RT>
-struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
-{
-    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
-    {
-        SWR_CONTEXT *pContext = pDC->pContext;
-
-        double startQuadEdgesAdj[RT::NumEdgesT::value];
-        for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-        {
-            startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
-        }
-
-        // not trivial accept or reject, must rasterize full tile
-        AR_BEGIN(BERasterizePartial, pDC->drawId);
-        innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
-        AR_END(BERasterizePartial, 0);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
-/// in an empty function call if SVInnerCoverage isn't requested
-template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
-struct UpdateEdgeMasksInnerConservative
-{
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
-                                           const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
-/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges 
-/// evaluated at raster tile corners to inner conservative position and 
-/// updates edge masks
-template <typename RT>
-struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
-{
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
-                                           const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
-    {
-        __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
-
-        // instead of keeping 2 copies of evaluated edges around, just compensate for the outer 
-        // conservative evaluated edge when adjusting the edge in for inner conservative tests
-        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
-        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
-        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
-
-        UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
-    }
-};
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage 
-/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot 
-/// cover an entire raster tile, set mask0 to 0 to force it down the
-/// rastierizePartialTile path
-template <typename RT, typename ValidEdgeMaskT>
-struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
-{
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
-                                   const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
-    {
-        // set one mask to zero to force the triangle down the rastierizePartialTile path
-        mask0 = 0;
-    }
-};
-
-template <typename RT>
-void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
-#if KNOB_ENABLE_TOSS_POINTS
-    if (KNOB_TOSS_BIN_TRIS)
-    {
-        return;
-    }
-#endif
-    AR_BEGIN(BERasterizeTriangle, pDC->drawId);
-    AR_BEGIN(BETriangleSetup, pDC->drawId);
-
-    const API_STATE &state = GetApiState(pDC);
-    const SWR_RASTSTATE &rastState = state.rastState;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
-
-    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
-    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
-
-    __m128 vX, vY, vZ, vRecipW;
-    
-    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
-    // eg: vX = [x0 x1 x2 dc]
-    vX = _mm_load_ps(workDesc.pTriBuffer);
-    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
-    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
-    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
-
-    // convert to fixed point
-    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
-    __m128i vXi = fpToFixedPoint(vX);
-    __m128i vYi = fpToFixedPoint(vY);
-
-    // quantize floating point position to fixed point precision
-    // to prevent attribute creep around the triangle vertices
-    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
-    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
-
-    // triangle setup - A and B edge equation coefs
-    __m128 vA, vB;
-    triangleSetupAB(vX, vY, vA, vB);
-
-    __m128i vAi, vBi;
-    triangleSetupABInt(vXi, vYi, vAi, vBi);
-    
-    // determinant
-    float det = calcDeterminantInt(vAi, vBi);
-
-    // Verts in Pixel Coordinate Space at this point
-    // Det > 0 = CW winding order 
-    // Convert CW triangles to CCW
-    if (det > 0.0)
-    {
-        vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
-        vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
-        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
-        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
-        det = -det;
-    }
-
-    __m128 vC;
-    // Finish triangle setup - C edge coef
-    triangleSetupC(vX, vY, vA, vB, vC);
-
-    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
-    {
-        // If we have degenerate edge(s) to rasterize, set I and J coefs 
-        // to 0 for constant interpolation of attributes
-        triDesc.I[0] = 0.0f;
-        triDesc.I[1] = 0.0f;
-        triDesc.I[2] = 0.0f;
-        triDesc.J[0] = 0.0f;
-        triDesc.J[1] = 0.0f;
-        triDesc.J[2] = 0.0f;
-
-        // Degenerate triangles have no area
-        triDesc.recipDet = 0.0f;
-    }
-    else
-    {
-        // only extract coefs for 2 of the barycentrics; the 3rd can be 
-        // determined from the barycentric equation:
-        // i + j + k = 1 <=> k = 1 - j - i
-        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
-        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
-        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
-        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
-        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
-        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
-
-        // compute recipDet, used to calculate barycentric i and j in the backend
-        triDesc.recipDet = 1.0f/det;
-    }
-
-    OSALIGNSIMD(float) oneOverW[4];
-    _mm_store_ps(oneOverW, vRecipW);
-    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
-    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
-    triDesc.OneOverW[2] = oneOverW[2];
-
-    // calculate perspective correct coefs per vertex attrib 
-    float* pPerspAttribs = perspAttribsTLS;
-    float* pAttribs = workDesc.pAttribs;
-    triDesc.pPerspAttribs = pPerspAttribs;
-    triDesc.pAttribs = pAttribs;
-    float *pRecipW = workDesc.pTriBuffer + 12;
-    triDesc.pRecipW = pRecipW;
-    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
-    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
-    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
-    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
-    {
-        __m128 attribA = _mm_load_ps(pAttribs);
-        __m128 attribB = _mm_load_ps(pAttribs+=4);
-        __m128 attribC = _mm_load_ps(pAttribs+=4);
-        pAttribs+=4;
-
-        attribA = _mm_mul_ps(attribA, vOneOverWV0);
-        attribB = _mm_mul_ps(attribB, vOneOverWV1);
-        attribC = _mm_mul_ps(attribC, vOneOverWV2);
-
-        _mm_store_ps(pPerspAttribs, attribA);
-        _mm_store_ps(pPerspAttribs+=4, attribB);
-        _mm_store_ps(pPerspAttribs+=4, attribC);
-        pPerspAttribs+=4;
-    }
-
-    // compute bary Z
-    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
-    OSALIGNSIMD(float) a[4];
-    _mm_store_ps(a, vZ);
-    triDesc.Z[0] = a[0] - a[2];
-    triDesc.Z[1] = a[1] - a[2];
-    triDesc.Z[2] = a[2];
-        
-    // add depth bias
-    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
-
-    // Calc bounding box of triangle
-    OSALIGNSIMD(SWR_RECT) bbox;
-    calcBoundingBoxInt(vXi, vYi, bbox);
-
-    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
-
-    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
-    {
-        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
-        bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
-        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
-                   "Conservative rast degenerate handling requires a valid scissor rect");
-    }
-
-    // Intersect with scissor/viewport
-    OSALIGNSIMD(SWR_RECT) intersect;
-    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
-    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
-    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
-    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
-
-    triDesc.triFlags = workDesc.triFlags;
-
-    // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
-    uint32_t macroX, macroY;
-    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
-    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
-    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
-    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
-    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
-
-    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
-    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
-    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
-    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
-
-    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
-
-    AR_END(BETriangleSetup, 0);
-
-    // update triangle desc
-    uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t numTilesX = maxTileX - minTileX + 1;
-    uint32_t numTilesY = maxTileY - minTileY + 1;
-
-    if (numTilesX == 0 || numTilesY == 0) 
-    {
-        RDTSC_EVENT(BEEmptyTriangle, 1, 0);
-        AR_END(BERasterizeTriangle, 1);
-        return;
-    }
-
-    AR_BEGIN(BEStepSetup, pDC->drawId);
-
-    // Step to pixel center of top-left pixel of the triangle bbox
-    // Align intersect bbox (top/left) to raster tile's (top/left).
-    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
-    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
-
-    // convenience typedef
-    typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
-
-    // single sample rasterization evaluates edges at pixel center,
-    // multisample evaluates edges UL pixel corner and steps to each sample position
-    if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
-    {
-        // Add 0.5, in fixed point, to offset to pixel center
-        x += (FIXED_POINT_SCALE / 2);
-        y += (FIXED_POINT_SCALE / 2);
-    }
-
-    __m128i vTopLeftX = _mm_set1_epi32(x);
-    __m128i vTopLeftY = _mm_set1_epi32(y);
-
-    // evaluate edge equations at top-left pixel using 64bit math
-    // 
-    // line = Ax + By + C
-    // solving for C:
-    // C = -Ax - By
-    // we know x0 and y0 are on the line; plug them in:
-    // C = -Ax0 - By0
-    // plug C back into line equation:
-    // line = Ax - By - Ax0 - By0
-    // line = A(x - x0) + B(y - y0)
-    // dX = (x-x0), dY = (y-y0)
-    // so all this simplifies to 
-    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
-
-    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
-    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
-
-    // evaluate A(dx) and B(dY) for all points
-    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
-    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
-    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
-    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
-
-    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
-    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
-    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
-
-    // apply any edge adjustments(top-left, crast, etc)
-    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
-
-    // broadcast respective edge results to all lanes
-    double* pEdge = (double*)&vEdge;
-    __m256d vEdgeFix16[7];
-    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
-    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
-    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
-
-    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
-    _mm_store_si128((__m128i*)aAi, vAi);
-    _mm_store_si128((__m128i*)aBi, vBi);
-    EDGE rastEdges[RT::NumEdgesT::value];
-
-    // Compute and store triangle edge data
-    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
-    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
-    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
-
-    // Compute and store triangle edge data if scissor needs to rasterized
-    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
-                       (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
-
-    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
-    // used to for testing if entire raster tile is inside a triangle
-    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-    {
-        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
-    }
-
-    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
-    // step sample positions to the raster tile bbox of multisample points
-    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
-    //                             |      |
-    //                             |      |
-    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
-    __m256d vEdgeTileBbox[3];
-    if (NumCoverageSamplesT::value > 1)
-    {
-        const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
-        const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
-        const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
-
-        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
-        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
-
-        // step edge equation tests from Tile
-        // used to for testing if entire raster tile is inside a triangle
-        for (uint32_t e = 0; e < 3; ++e)
-        {
-            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
-            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
-            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
-            // adjust for msaa tile bbox edges outward for conservative rast, if enabled
-            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
-        }
-    }
-
-    AR_END(BEStepSetup, 0);
-
-    uint32_t tY = minTileY;
-    uint32_t tX = minTileX;
-    uint32_t maxY = maxTileY;
-    uint32_t maxX = maxTileX;
-
-    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
-    GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
-    currentRenderBufferRow = renderBuffers;
-
-    // rasterize and generate coverage masks per sample
-    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
-    {
-        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
-        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-        {
-            vStartOfRowEdge[e] = vEdgeFix16[e];
-        }
-
-        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
-        {
-            triDesc.anyCoveredSamples = 0;
-
-            // is the corner of the edge outside of the raster tile? (vEdge < 0)
-            int mask0, mask1, mask2;
-            UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
-
-            for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
-            {
-                // trivial reject, at least one edge has all 4 corners of raster tile outside
-                bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
-
-                if (!trivialReject)
-                {
-                    // trivial accept mask
-                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
-
-                    // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
-                    UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
-                        (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
-
-                    // @todo Make this a bit smarter to allow use of trivial accept when:
-                    //   1) scissor/vp intersection rect is raster tile aligned
-                    //   2) raster tile is entirely within scissor/vp intersection rect
-                    if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
-                    {
-                        // trivial accept, all 4 corners of all 3 edges are negative 
-                        // i.e. raster tile completely inside triangle
-                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
-                        if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
-                        {
-                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
-                        }
-                        RDTSC_EVENT(BETrivialAccept, 1, 0);
-                    }
-                    else
-                    {
-                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
-                        if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
-                        {
-                            // should get optimized out for single sample case (global value numbering or copy propagation)
-                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-                            {
-                                vEdgeAtSample[e] = vEdgeFix16[e];
-                            }
-                        }
-                        else
-                        {
-                            const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
-                            __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
-                            __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
-                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
-                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
-
-                            // step edge equation tests from UL tile corner to pixel sample position
-                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-                            {
-                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
-                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
-                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
-                            }
-                        }
-
-                        double startQuadEdges[RT::NumEdgesT::value];
-                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-                        {
-                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
-                        }
-
-                        // not trivial accept or reject, must rasterize full tile
-                        AR_BEGIN(BERasterizePartial, pDC->drawId);
-                        triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
-                        AR_END(BERasterizePartial, 0);
-
-                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
-                        
-                        // Output SV InnerCoverage, if needed
-                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
-                    }
-                }
-                else
-                {
-                    // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
-                    if(NumCoverageSamplesT::value > 1)
-                    {
-                        triDesc.coverageMask[sampleNum] = 0;
-                    }
-                    RDTSC_EVENT(BETrivialReject, 1, 0);
-                }
-            }
-
-#if KNOB_ENABLE_TOSS_POINTS
-            if(KNOB_TOSS_RS)
-            {
-                gToss = triDesc.coverageMask[0];
-            }
-            else
-#endif
-            if(triDesc.anyCoveredSamples)
-            {
-                // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
-                // copy conservative coverage result to all samples
-                if(RT::IsConservativeT::value)
-                {
-                    auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
-                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
-                }
-
-                AR_BEGIN(BEPixelBackend, pDC->drawId);
-                backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
-                AR_END(BEPixelBackend, 0);
-            }
-
-            // step to the next tile in X
-            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-            {
-                vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
-            }
-            StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
-        }
-
-        // step to the next tile in Y
-        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
-        {
-            vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
-        }
-        StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
-    }
-
-    AR_END(BERasterizeTriangle, 1);
-}
-
-void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
-{
-    const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
-    const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
-    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
-
-    bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
-
-    // load point vertex
-    float x = *workDesc.pTriBuffer;
-    float y = *(workDesc.pTriBuffer + 1);
-    float z = *(workDesc.pTriBuffer + 2);
-
-    // create a copy of the triangle buffer to write our adjusted vertices to
-    OSALIGNSIMD(float) newTriBuffer[4 * 4];
-    TRIANGLE_WORK_DESC newWorkDesc = workDesc;
-    newWorkDesc.pTriBuffer = &newTriBuffer[0];
-
-    // create a copy of the attrib buffer to write our adjusted attribs to
-    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
-    newWorkDesc.pAttribs = &newAttribBuffer[0];
-
-    newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
-    newWorkDesc.numAttribs = workDesc.numAttribs;
-    newWorkDesc.triFlags = workDesc.triFlags;
-
-    // construct two tris by bloating point by point size
-    float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
-    float lowerX = x - halfPointSize;
-    float upperX = x + halfPointSize;
-    float lowerY = y - halfPointSize;
-    float upperY = y + halfPointSize;
-
-    // tri 0
-    float *pBuf = &newTriBuffer[0];
-    *pBuf++ = lowerX;
-    *pBuf++ = lowerX;
-    *pBuf++ = upperX;
-    pBuf++;
-    *pBuf++ = lowerY;
-    *pBuf++ = upperY;
-    *pBuf++ = upperY;
-    pBuf++;
-    _mm_store_ps(pBuf, _mm_set1_ps(z));
-    _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f));
-
-    // setup triangle rasterizer function
-    PFN_WORK_FUNC pfnTriRast;
-    // conservative rast not supported for points/lines
-    pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, 
-                                   SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
-
-    // overwrite texcoords for point sprites
-    if (isPointSpriteTexCoordEnabled)
-    {
-        // copy original attribs
-        memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
-        newWorkDesc.pAttribs = &newAttribBuffer[0];
-
-        // overwrite texcoord for point sprites
-        uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
-        DWORD texCoordAttrib = 0;
-
-        while (_BitScanForward(&texCoordAttrib, texCoordMask))
-        {
-            texCoordMask &= ~(1 << texCoordAttrib);
-            __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
-            if (rastState.pointSpriteTopOrigin)
-            {
-                pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
-                pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
-                pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
-            }
-            else
-            {
-                pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
-                pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
-                pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
-            }
-        }
-    }
-    else
-    {
-        // no texcoord overwrite, can reuse the attrib buffer from frontend
-        newWorkDesc.pAttribs = workDesc.pAttribs;
-    }
-
-    pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
-
-    // tri 1
-    pBuf = &newTriBuffer[0];
-    *pBuf++ = lowerX;
-    *pBuf++ = upperX;
-    *pBuf++ = upperX;
-    pBuf++;
-    *pBuf++ = lowerY;
-    *pBuf++ = upperY;
-    *pBuf++ = lowerY;
-    // z, w unchanged
-
-    if (isPointSpriteTexCoordEnabled)
-    {
-        uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
-        DWORD texCoordAttrib = 0;
-
-        while (_BitScanForward(&texCoordAttrib, texCoordMask))
-        {
-            texCoordMask &= ~(1 << texCoordAttrib);
-            __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
-            if (rastState.pointSpriteTopOrigin)
-            {
-                pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
-                pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
-                pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
-
-            }
-            else
-            {
-                pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
-                pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
-                pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
-            }
-        }
-    }
-
-    pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
-}
-
-void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
-{
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-#if KNOB_ENABLE_TOSS_POINTS
-    if (KNOB_TOSS_BIN_TRIS)
-    {
-        return;
-    }
-#endif
-
-    const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
-
-    // map x,y relative offsets from start of raster tile to bit position in 
-    // coverage mask for the point
-    static const uint32_t coverageMap[8][8] = {
-        { 0, 1, 4, 5, 8, 9, 12, 13 },
-        { 2, 3, 6, 7, 10, 11, 14, 15 },
-        { 16, 17, 20, 21, 24, 25, 28, 29 },
-        { 18, 19, 22, 23, 26, 27, 30, 31 },
-        { 32, 33, 36, 37, 40, 41, 44, 45 },
-        { 34, 35, 38, 39, 42, 43, 46, 47 },
-        { 48, 49, 52, 53, 56, 57, 60, 61 },
-        { 50, 51, 54, 55, 58, 59, 62, 63 }
-    };
-
-    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
-
-    // pull point information from triangle buffer
-    // @todo use structs for readability
-    uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
-    uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
-    float z = *(workDesc.pTriBuffer + 2);
-
-    // construct triangle descriptor for point
-    // no interpolation, set up i,j for constant interpolation of z and attribs
-    // @todo implement an optimized backend that doesn't require triangle information
-
-    // compute coverage mask from x,y packed into the coverageMask flag
-    // mask indices by the maximum valid index for x/y of coveragemap.
-    uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
-    uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
-    // todo: multisample points?
-    triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
-
-    // no persp divide needed for points
-    triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
-    triDesc.triFlags = workDesc.triFlags;
-    triDesc.recipDet = 1.0f;
-    triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
-    triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
-    triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
-    triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
-
-    RenderOutputBuffers renderBuffers;
-    GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, 
-        renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
-
-    AR_BEGIN(BEPixelBackend, pDC->drawId);
-    backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
-    AR_END(BEPixelBackend, 0);
-}
-
-// Get pointers to hot tile memory for color RT, depth, stencil
-template <uint32_t numSamples>
-void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
-{
-    const API_STATE& state = GetApiState(pDC);
-    SWR_CONTEXT *pContext = pDC->pContext;
-
-    uint32_t mx, my;
-    MacroTileMgr::getTileIndices(macroID, mx, my);
-    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
-    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
-
-    // compute tile offset for active hottile buffers
-    const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
-    uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
-    offset*=numSamples;
-
-    unsigned long rtSlot = 0;
-    uint32_t colorHottileEnableMask = state.colorHottileEnable;
-    while(_BitScanForward(&rtSlot, colorHottileEnableMask))
-    {
-        HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, 
-            numSamples, renderTargetArrayIndex);
-        pColor->state = HOTTILE_DIRTY;
-        renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
-        
-        colorHottileEnableMask &= ~(1 << rtSlot);
-    }
-    if(state.depthHottileEnable)
-    {
-        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
-        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
-        offset*=numSamples;
-        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, 
-            numSamples, renderTargetArrayIndex);
-        pDepth->state = HOTTILE_DIRTY;
-        SWR_ASSERT(pDepth->pBuffer != nullptr);
-        renderBuffers.pDepth = pDepth->pBuffer + offset;
-    }
-    if(state.stencilHottileEnable)
-    {
-        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
-        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
-        offset*=numSamples;
-        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, 
-            numSamples, renderTargetArrayIndex);
-        pStencil->state = HOTTILE_DIRTY;
-        SWR_ASSERT(pStencil->pBuffer != nullptr);
-        renderBuffers.pStencil = pStencil->pBuffer + offset;
-    }
-}
-
-template <typename RT>
-INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
-{
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        buffers.pColor[rt] += RT::colorRasterTileStep;
-    }
-    
-    buffers.pDepth += RT::depthRasterTileStep;
-    buffers.pStencil += RT::stencilRasterTileStep;
-}
-
-template <typename RT>
-INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
-{
-    for(uint32_t rt = 0; rt < NumRT; ++rt)
-    {
-        startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
-        buffers.pColor[rt] = startBufferRow.pColor[rt];
-    }
-    startBufferRow.pDepth += RT::depthRasterTileRowStep;
-    buffers.pDepth = startBufferRow.pDepth;
-
-    startBufferRow.pStencil += RT::stencilRasterTileRowStep;
-    buffers.pStencil = startBufferRow.pStencil;
-}
+PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
 
 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
 {
@@ -1638,12 +115,12 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     // tri0 needs v0, v0, v1
     for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
     {
-        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]);
-        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]);
+        __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
+        __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
 
-        _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0);
-        _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0);
-        _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1);
+        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
+        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
+        _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
     }
 
     // Store user clip distances for triangle 0
@@ -1675,8 +152,8 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     // setup triangle rasterizer function
     PFN_WORK_FUNC pfnTriRast;
     // conservative rast not supported for points/lines
-    pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false, 
-                                   SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
+    pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
+        SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
 
     // make sure this macrotile intersects the triangle
     __m128i vXai = fpToFixedPoint(vXa);
@@ -1685,13 +162,13 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     calcBoundingBoxInt(vXai, vYai, bboxA);
 
     if (!(bboxA.xmin > macroBoxRight ||
-          bboxA.xmin > scissorInFixedPoint.xmax ||
-          bboxA.xmax - 1 < macroBoxLeft ||
-          bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
-          bboxA.ymin > macroBoxBottom ||
-          bboxA.ymin > scissorInFixedPoint.ymax ||
-          bboxA.ymax - 1 < macroBoxTop ||
-          bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
+        bboxA.xmin > scissorInFixedPoint.xmax ||
+        bboxA.xmax - 1 < macroBoxLeft ||
+        bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
+        bboxA.ymin > macroBoxBottom ||
+        bboxA.ymin > scissorInFixedPoint.ymax ||
+        bboxA.ymax - 1 < macroBoxTop ||
+        bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
         // rasterize triangle
         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
     }
@@ -1758,13 +235,13 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     calcBoundingBoxInt(vXai, vYai, bboxA);
 
     if (!(bboxA.xmin > macroBoxRight ||
-          bboxA.xmin > scissorInFixedPoint.xmax ||
-          bboxA.xmax - 1 < macroBoxLeft ||
-          bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
-          bboxA.ymin > macroBoxBottom ||
-          bboxA.ymin > scissorInFixedPoint.ymax ||
-          bboxA.ymax - 1 < macroBoxTop ||
-          bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
+        bboxA.xmin > scissorInFixedPoint.xmax ||
+        bboxA.xmax - 1 < macroBoxLeft ||
+        bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
+        bboxA.ymin > macroBoxBottom ||
+        bboxA.ymin > scissorInFixedPoint.ymax ||
+        bboxA.ymax - 1 < macroBoxTop ||
+        bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
         // rasterize triangle
         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
     }
@@ -1772,32 +249,219 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     AR_END(BERasterizeLine, 1);
 }
 
-struct RasterizerChooser
+void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+#if KNOB_ENABLE_TOSS_POINTS
+    if (KNOB_TOSS_BIN_TRIS)
+    {
+        return;
+    }
+#endif
+
+    const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
+    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+    // map x,y relative offsets from start of raster tile to bit position in 
+    // coverage mask for the point
+    static const uint32_t coverageMap[8][8] = {
+        { 0, 1, 4, 5, 8, 9, 12, 13 },
+        { 2, 3, 6, 7, 10, 11, 14, 15 },
+        { 16, 17, 20, 21, 24, 25, 28, 29 },
+        { 18, 19, 22, 23, 26, 27, 30, 31 },
+        { 32, 33, 36, 37, 40, 41, 44, 45 },
+        { 34, 35, 38, 39, 42, 43, 46, 47 },
+        { 48, 49, 52, 53, 56, 57, 60, 61 },
+        { 50, 51, 54, 55, 58, 59, 62, 63 }
+    };
+
+    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
+
+    // pull point information from triangle buffer
+    // @todo use structs for readability
+    uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
+    uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
+    float z = *(workDesc.pTriBuffer + 2);
+
+    // construct triangle descriptor for point
+    // no interpolation, set up i,j for constant interpolation of z and attribs
+    // @todo implement an optimized backend that doesn't require triangle information
+
+    // compute coverage mask from x,y packed into the coverageMask flag
+    // mask indices by the maximum valid index for x/y of coveragemap.
+    uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
+    uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
+    // todo: multisample points?
+    triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
+
+    // no persp divide needed for points
+    triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
+    triDesc.triFlags = workDesc.triFlags;
+    triDesc.recipDet = 1.0f;
+    triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
+    triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
+    triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
+    triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
+
+    RenderOutputBuffers renderBuffers;
+    GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, 
+        renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
+
+    AR_BEGIN(BEPixelBackend, pDC->drawId);
+    backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
+    AR_END(BEPixelBackend, 0);
+}
+
+void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
 {
-    typedef PFN_WORK_FUNC FuncType;
+    const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
+    const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
+    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
+
+    bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
+
+    // load point vertex
+    float x = *workDesc.pTriBuffer;
+    float y = *(workDesc.pTriBuffer + 1);
+    float z = *(workDesc.pTriBuffer + 2);
+
+    // create a copy of the triangle buffer to write our adjusted vertices to
+    OSALIGNSIMD(float) newTriBuffer[4 * 4];
+    TRIANGLE_WORK_DESC newWorkDesc = workDesc;
+    newWorkDesc.pTriBuffer = &newTriBuffer[0];
+
+    // create a copy of the attrib buffer to write our adjusted attribs to
+    OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
+    newWorkDesc.pAttribs = &newAttribBuffer[0];
+
+    newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
+    newWorkDesc.numAttribs = workDesc.numAttribs;
+    newWorkDesc.triFlags = workDesc.triFlags;
+
+    // construct two tris by bloating point by point size
+    float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
+    float lowerX = x - halfPointSize;
+    float upperX = x + halfPointSize;
+    float lowerY = y - halfPointSize;
+    float upperY = y + halfPointSize;
+
+    // tri 0
+    float *pBuf = &newTriBuffer[0];
+    *pBuf++ = lowerX;
+    *pBuf++ = lowerX;
+    *pBuf++ = upperX;
+    pBuf++;
+    *pBuf++ = lowerY;
+    *pBuf++ = upperY;
+    *pBuf++ = upperY;
+    pBuf++;
+    _mm_store_ps(pBuf, _mm_set1_ps(z));
+    _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
+
+    // setup triangle rasterizer function
+    PFN_WORK_FUNC pfnTriRast;
+    // conservative rast not supported for points/lines
+    pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
+        SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
+
+    // overwrite texcoords for point sprites
+    if (isPointSpriteTexCoordEnabled)
+    {
+        // copy original attribs
+        memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
+        newWorkDesc.pAttribs = &newAttribBuffer[0];
+
+        // overwrite texcoord for point sprites
+        uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
+        DWORD texCoordAttrib = 0;
+
+        while (_BitScanForward(&texCoordAttrib, texCoordMask))
+        {
+            texCoordMask &= ~(1 << texCoordAttrib);
+            __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
+            if (rastState.pointSpriteTopOrigin)
+            {
+                pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
+                pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
+                pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
+            }
+            else
+            {
+                pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
+                pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
+                pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
+            }
+        }
+    }
+    else
+    {
+        // no texcoord overwrite, can reuse the attrib buffer from frontend
+        newWorkDesc.pAttribs = workDesc.pAttribs;
+    }
+
+    pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
+
+    // tri 1
+    pBuf = &newTriBuffer[0];
+    *pBuf++ = lowerX;
+    *pBuf++ = upperX;
+    *pBuf++ = upperX;
+    pBuf++;
+    *pBuf++ = lowerY;
+    *pBuf++ = upperY;
+    *pBuf++ = lowerY;
+    // z, w unchanged
 
-    template <typename... ArgsB>
-    static FuncType GetFunc()
+    if (isPointSpriteTexCoordEnabled)
     {
-        return RasterizeTriangle<RasterizerTraits<ArgsB...>>;
+        uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
+        DWORD texCoordAttrib = 0;
+
+        while (_BitScanForward(&texCoordAttrib, texCoordMask))
+        {
+            texCoordMask &= ~(1 << texCoordAttrib);
+            __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
+            if (rastState.pointSpriteTopOrigin)
+            {
+                pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
+                pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
+                pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
+
+            }
+            else
+            {
+                pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
+                pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
+                pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
+            }
+        }
     }
-};
+
+    pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
+}
+
+void InitRasterizerFunctions()
+{
+    InitRasterizerFuncs();
+}
 
 // Selector for correct templated RasterizeTriangle function
 PFN_WORK_FUNC GetRasterizerFunc(
-    uint32_t numSamples,
+    SWR_MULTISAMPLE_COUNT numSamples,
     bool IsCenter,
     bool IsConservative,
-    uint32_t InputCoverage,
+    SWR_INPUT_COVERAGE InputCoverage,
     uint32_t EdgeEnable,
     bool RasterizeScissorEdges
 )
 {
-    return TemplateArgUnroller<RasterizerChooser>::GetFunc(
-        IntArg<SWR_MULTISAMPLE_1X,SWR_MULTISAMPLE_TYPE_COUNT-1>{numSamples},
-        IsCenter,
-        IsConservative,
-        IntArg<SWR_INPUT_COVERAGE_NONE, SWR_INPUT_COVERAGE_COUNT-1>{InputCoverage},
-        IntArg<0, STATE_VALID_TRI_EDGE_COUNT-1>{EdgeEnable},
-        RasterizeScissorEdges);
+    SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
+    SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
+    SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
+
+    PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges];
+    SWR_ASSERT(func);
+
+    return func;
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
index e99920a..414d0f0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
@@ -35,6 +35,7 @@
 void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
+void InitRasterizerFunctions();
 
 INLINE
 __m128i fpToFixedPoint(const __m128 vIn)
@@ -43,15 +44,6 @@ __m128i fpToFixedPoint(const __m128 vIn)
     return _mm_cvtps_epi32(vFixed);
 }
 
-// Selector for correct templated RasterizeTriangle function
-PFN_WORK_FUNC GetRasterizerFunc(
-    uint32_t numSamples,
-    bool IsCenter,
-    bool IsConservative,
-    uint32_t InputCoverage,
-    uint32_t EdgeEnable,
-    bool RasterizeScissorEdges);
-
 enum TriEdgesStates
 {
     STATE_NO_VALID_EDGES = 0,
@@ -72,6 +64,15 @@ enum TriEdgesValues
     VALID_TRI_EDGE_COUNT,
 };
 
+// Selector for correct templated RasterizeTriangle function
+PFN_WORK_FUNC GetRasterizerFunc(
+    SWR_MULTISAMPLE_COUNT numSamples,
+    bool IsCenter,
+    bool IsConservative,
+    SWR_INPUT_COVERAGE InputCoverage,
+    uint32_t EdgeEnable,
+    bool RasterizeScissorEdges);
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief ValidTriEdges convenience typedefs used for templated function 
 /// specialization supported Fixed Point precisions
@@ -173,7 +174,7 @@ struct RasterEdgeTraits<std::false_type, std::false_type, EdgeMaskT>
 /// (only used with conservative rasterization)
 /// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor?
 template <typename NumSamplesT, typename CenterPatternT, typename ConservativeT, typename InputCoverageT, typename EdgeEnableT, typename RasterScissorEdgesT>
-struct RasterizerTraits final : public ConservativeRastBETraits<ConservativeT, InputCoverageT>,
+struct _RasterizerTraits : public ConservativeRastBETraits<ConservativeT, InputCoverageT>,
                                 public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT>
 {
     typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value), CenterPatternT::value> MT;
@@ -197,3 +198,13 @@ struct RasterizerTraits final : public ConservativeRastBETraits<ConservativeT, I
     static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep};
     static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep};
 };
+
+template <uint32_t NumSamplesT, uint32_t CenterPatternT, uint32_t ConservativeT, uint32_t InputCoverageT, uint32_t EdgeEnableT, uint32_t RasterScissorEdgesT>
+struct RasterizerTraits final : public _RasterizerTraits <
+    std::integral_constant<uint32_t, NumSamplesT>,
+    std::integral_constant<bool, CenterPatternT != 0>,
+    std::integral_constant<bool, ConservativeT != 0>,
+    std::integral_constant<uint32_t, InputCoverageT>,
+    std::integral_constant<uint32_t, EdgeEnableT>,
+    std::integral_constant<bool, RasterScissorEdgesT != 0> >
+{};
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
new file mode 100644
index 0000000..b73a99b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
@@ -0,0 +1,1376 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file rasterizer.cpp
+*
+* @brief Implementation for the rasterizer.
+*
+******************************************************************************/
+
+#include <vector>
+#include <algorithm>
+
+#include "rasterizer.h"
+#include "rdtsc_core.h"
+#include "backend.h"
+#include "utils.h"
+#include "frontend.h"
+#include "tilemgr.h"
+#include "memory/tilingtraits.h"
+
+extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
+
+template <uint32_t numSamples = 1>
+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
+template <typename RT>
+void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers);
+template <typename RT>
+void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
+
+#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
+static const __m256d gMaskToVecpd[] =
+{
+    MASKTOVEC(0, 0, 0, 0),
+    MASKTOVEC(0, 0, 0, 1),
+    MASKTOVEC(0, 0, 1, 0),
+    MASKTOVEC(0, 0, 1, 1),
+    MASKTOVEC(0, 1, 0, 0),
+    MASKTOVEC(0, 1, 0, 1),
+    MASKTOVEC(0, 1, 1, 0),
+    MASKTOVEC(0, 1, 1, 1),
+    MASKTOVEC(1, 0, 0, 0),
+    MASKTOVEC(1, 0, 0, 1),
+    MASKTOVEC(1, 0, 1, 0),
+    MASKTOVEC(1, 0, 1, 1),
+    MASKTOVEC(1, 1, 0, 0),
+    MASKTOVEC(1, 1, 0, 1),
+    MASKTOVEC(1, 1, 1, 0),
+    MASKTOVEC(1, 1, 1, 1),
+};
+
+struct POS
+{
+    int32_t x, y;
+};
+
+struct EDGE
+{
+    double a, b;                // a, b edge coefficients in fix8
+    double stepQuadX;           // step to adjacent horizontal quad in fix16
+    double stepQuadY;           // step to adjacent vertical quad in fix16
+    double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16
+    double stepRasterTileY;     // step to adjacent vertical raster tile in fix16
+
+    __m256d vQuadOffsets;       // offsets for 4 samples of a quad
+    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief rasterize a raster tile partially covered by the triangle
+/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
+/// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C)
+/// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
+///        Used to step between quads when sweeping over the raster tile.
+template<uint32_t NumEdges, typename EdgeMaskT>
+INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
+{
+    uint64_t coverageMask = 0;
+
+    __m256d vEdges[NumEdges];
+    __m256d vStepX[NumEdges];
+    __m256d vStepY[NumEdges];
+
+    for (uint32_t e = 0; e < NumEdges; ++e)
+    {
+        // Step to the pixel sample locations of the 1st quad
+        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);
+
+        // compute step to next quad (mul by 2 in x and y direction)
+        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
+        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
+    }
+
+    // fast unrolled version for 8x8 tile
+#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
+    int edgeMask[NumEdges];
+    uint64_t mask;
+
+    auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
+    auto update_lambda = [&](int e){mask &= edgeMask[e];};
+    auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
+    auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
+    auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
+
+// evaluate which pixels in the quad are covered
+#define EVAL \
+            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
+
+    // update coverage mask
+    // if edge 0 is degenerate and will be skipped; init the mask
+#define UPDATE_MASK(bit) \
+            if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
+                mask = 0xf;\
+            }\
+            else{\
+                mask = edgeMask[0]; \
+            }\
+            UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
+            coverageMask |= (mask << bit);
+
+    // step in the +x direction to the next quad 
+#define INCX \
+            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
+
+    // step in the +y direction to the next quad 
+#define INCY \
+            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
+
+    // step in the -x direction to the next quad 
+#define DECX \
+            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
+
+    // sweep 2x2 quad back and forth through the raster tile, 
+    // computing coverage masks for the entire tile
+
+    // raster tile
+    // 0  1  2  3  4  5  6  7 
+    // x  x
+    // x  x ------------------>  
+    //                   x  x  |
+    // <-----------------x  x  V
+    // ..
+
+    // row 0
+    EVAL;
+    UPDATE_MASK(0);
+    INCX;
+    EVAL;
+    UPDATE_MASK(4);
+    INCX;
+    EVAL;
+    UPDATE_MASK(8);
+    INCX;
+    EVAL;
+    UPDATE_MASK(12);
+    INCY;
+
+    //row 1
+    EVAL;
+    UPDATE_MASK(28);
+    DECX;
+    EVAL;
+    UPDATE_MASK(24);
+    DECX;
+    EVAL;
+    UPDATE_MASK(20);
+    DECX;
+    EVAL;
+    UPDATE_MASK(16);
+    INCY;
+
+    // row 2
+    EVAL;
+    UPDATE_MASK(32);
+    INCX;
+    EVAL;
+    UPDATE_MASK(36);
+    INCX;
+    EVAL;
+    UPDATE_MASK(40);
+    INCX;
+    EVAL;
+    UPDATE_MASK(44);
+    INCY;
+
+    // row 3
+    EVAL;
+    UPDATE_MASK(60);
+    DECX;
+    EVAL;
+    UPDATE_MASK(56);
+    DECX;
+    EVAL;
+    UPDATE_MASK(52);
+    DECX;
+    EVAL;
+    UPDATE_MASK(48);
+#else
+    uint32_t bit = 0;
+    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
+    {
+        __m256d vStartOfRowEdge[NumEdges];
+        for (uint32_t e = 0; e < NumEdges; ++e)
+        {
+            vStartOfRowEdge[e] = vEdges[e];
+        }
+
+        for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
+        {
+            int edgeMask[NumEdges];
+            for (uint32_t e = 0; e < NumEdges; ++e)
+            {
+                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
+            }
+
+            uint64_t mask = edgeMask[0];
+            for (uint32_t e = 1; e < NumEdges; ++e)
+            {
+                mask &= edgeMask[e];
+            }
+            coverageMask |= (mask << bit);
+
+            // step to the next pixel in the x
+            for (uint32_t e = 0; e < NumEdges; ++e)
+            {
+                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
+            }
+            bit+=4;
+        }
+
+        // step to the next row
+        for (uint32_t e = 0; e < NumEdges; ++e)
+        {
+            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
+        }
+    }
+#endif
+    return coverageMask;
+
+}
+// Top left rule:
+// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
+// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
+// Top left: a sample is in if it is a top or left edge.
+// Out: !(horizontal && above) = !horizontal && below
+// Out: !horizontal && left = !(!horizontal && left) = horizontal and right 
+INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge) 
+{
+    // if vA < 0, vC--
+    // if vA == 0 && vB < 0, vC--
+
+    __m256d vEdgeOut = vEdge;
+    __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
+
+    // if vA < 0 (line is not horizontal and below)
+    int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));
+
+    // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
+    __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
+    int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
+    msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
+
+    // if either of these are true and we're on the line (edge == 0), bump it outside the line
+    vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief calculates difference in precision between the result of manh
+/// calculation and the edge precision, based on compile time trait values
+template<typename RT>
+constexpr int64_t ManhToEdgePrecisionAdjust()
+{
+    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
+                  "Inadequate precision of result of manh calculation ");
+    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @struct adjustEdgeConservative
+/// @brief Primary template definition used for partially specializing 
+/// the adjustEdgeConservative function. This struct should never
+/// be instantiated.
+/// @tparam RT: rasterizer traits
+/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
+template <typename RT, typename ConservativeEdgeOffsetT>
+struct adjustEdgeConservative
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs calculations to adjust each edge of a triangle away
+    /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
+    /// direction. 
+    ///
+    /// Uncertainty regions arise from fixed point rounding, which
+    /// can snap a vertex +/- by min fixed point value.
+    /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
+    /// This allows the rasterizer to test for coverage only at the pixel center, 
+    /// instead of having to test individual pixel corners for conservative coverage
+    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    {
+        // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away 
+        // from the pixel center (in the direction of the edge normal A/B)
+
+        // edge = Ax + Bx + C - (manh/e)
+        // manh = manhattan distance = abs(A) + abs(B)
+        // e = absolute rounding error from snapping from float to fixed point precision
+
+        // 'fixed point' multiply (in double to be avx1 friendly) 
+        // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
+        __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
+        __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
+                                     _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
+
+        static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
+                      "Inadequate precision of result of manh calculation ");
+
+        // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
+        // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
+        manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
+
+        // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
+        // this allows the rasterizer to do a single conservative coverage test to see if the primitive
+        // intersects the pixel at all
+        vEdge = _mm256_sub_pd(vEdge, manh);
+    };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief adjustEdgeConservative specialization where no edge offset is needed
+template <typename RT>
+struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
+{
+    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief calculates the distance a degenerate BBox needs to be adjusted 
+/// for conservative rast based on compile time trait values
+template<typename RT>
+constexpr int64_t ConservativeScissorOffset()
+{
+    static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
+    // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
+    typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
+    // 1/2 pixel edge offset + conservative offset - degenerateTriangle
+    return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Performs calculations to adjust each a vector of evaluated edges out
+/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
+/// direction. 
+template <typename RT>
+INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
+{
+    int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
+    int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
+    vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Performs calculations to adjust each a scalar evaluated edge out
+/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
+/// direction. 
+template <typename RT, typename OffsetT>
+INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
+{
+    int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
+    int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
+    return (Edge - manh);
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Perform any needed adjustments to evaluated triangle edges
+template <typename RT, typename EdgeOffsetT>
+struct adjustEdgesFix16
+{
+    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    {
+        static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
+                      "Edge equation expected to be in x.16 fixed point");
+
+        static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
+
+        // need to apply any edge offsets before applying the top-left rule
+        adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
+
+        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Perform top left adjustments to evaluated triangle edges
+template <typename RT>
+struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
+{
+    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    {
+        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
+    }
+};
+
+// max(abs(dz/dx), abs(dz,dy)
+INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
+{
+    /*
+    // evaluate i,j at (0,0)
+    float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
+    float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
+
+    // evaluate i,j at (1,0)
+    float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
+    float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
+
+    // compute dz/dx
+    float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
+    float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
+    float dzdx = abs(d10 - d00);
+
+    // evaluate i,j at (0,1)
+    float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
+    float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
+
+    float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
+    float dzdy = abs(d01 - d00);
+    */
+
+    // optimized version of above
+    float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
+    float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
+
+    return std::max(dzdx, dzdy);
+}
+
+INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
+{
+    if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
+    {
+        return (1.0f / (1 << 24));
+    }
+    else if (pState->depthFormat == R16_UNORM)
+    {
+        return (1.0f / (1 << 16));
+    }
+    else
+    {
+        SWR_ASSERT(pState->depthFormat == R32_FLOAT);
+
+        // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
+        float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
+        uint32_t zMaxInt = *(uint32_t*)&zMax;
+        zMaxInt &= 0x7f800000;
+        zMax = *(float*)&zMaxInt;
+
+        return zMax * (1.0f / (1 << 23));
+    }
+}
+
+INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
+{
+    if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
+    {
+        return 0.0f;
+    }
+
+    float scale = pState->slopeScaledDepthBias;
+    if (scale != 0.0f)
+    {
+        scale *= ComputeMaxDepthSlope(pTri);
+    }
+
+    float bias = pState->depthBias;
+    if (!pState->depthBiasPreAdjusted)
+    {
+        bias *= ComputeBiasFactor(pState, pTri, z);
+    }
+    bias += scale;
+
+    if (pState->depthBiasClamp > 0.0f)
+    {
+        bias = std::min(bias, pState->depthBiasClamp);
+    }
+    else if (pState->depthBiasClamp < 0.0f)
+    {
+        bias = std::max(bias, pState->depthBiasClamp);
+    }
+
+    return bias;
+}
+
+// Prevent DCE by writing coverage mask from rasterizer to volatile
+#if KNOB_ENABLE_TOSS_POINTS
+__declspec(thread) volatile uint64_t gToss;
+#endif
+
+static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
+// try to avoid _chkstk insertions; make this thread local
+static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
+
+INLINE
+void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
+{
+    edge.a = a;
+    edge.b = b;
+
+    // compute constant steps to adjacent quads
+    edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
+    edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));
+
+    // compute constant steps to adjacent raster tiles
+    edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
+    edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));
+
+    // compute quad offsets
+    const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
+    const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);
+
+    __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
+    __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
+    edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
+
+    // compute raster tile offsets
+    const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
+    const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
+
+    __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
+    __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
+    edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
+}
+
+INLINE
+void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
+{
+    ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary template definition used for partially specializing 
+/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel 
+/// corner to sample position, and test for coverage
+/// @tparam sampleCount: multisample count
+template <typename NumSamplesT>
+INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
+                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
+{
+    __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
+    // evaluate edge equations at the tile multisample bounding box
+    vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
+    vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
+    vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
+    mask0 = _mm256_movemask_pd(vSampleBboxTest0);
+    mask1 = _mm256_movemask_pd(vSampleBboxTest1);
+    mask2 = _mm256_movemask_pd(vSampleBboxTest2);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
+/// when only rasterizing a single coverage test point
+template <>
+INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
+                                           int32_t &mask0, int32_t &mask1, int32_t &mask2)
+{
+    mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
+    mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
+    mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @struct ComputeScissorEdges
+/// @brief Primary template definition. Allows the function to be generically
+/// called. When paired with below specializations, will result in an empty 
+/// inlined function if scissor is not enabled
+/// @tparam RasterScissorEdgesT: is scissor enabled?
+/// @tparam IsConservativeT: is conservative rast enabled?
+/// @tparam RT: rasterizer traits
+template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
+struct ComputeScissorEdges
+{
+    INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 
+                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial 
+/// specialization. Instantiated when conservative rast and scissor are enabled
+template <typename RT>
+struct ComputeScissorEdges<std::true_type, std::true_type, RT>
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, 
+    /// evaluate edge equations and offset them away from pixel center.
+    INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
+                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
+    {
+        // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
+        SWR_RECT scissor;
+        scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
+        scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
+        scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
+        scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);
+
+        POS topLeft{scissor.xmin, scissor.ymin};
+        POS bottomLeft{scissor.xmin, scissor.ymax};
+        POS topRight{scissor.xmax, scissor.ymin};
+        POS bottomRight{scissor.xmax, scissor.ymax};
+
+        // construct 4 scissor edges in ccw direction
+        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
+        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
+        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
+        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
+
+        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
+        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
+        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
+        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
+
+        // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
+        adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
+        adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
+        adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
+        adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);
+
+        // Upper left rule for scissor
+        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
+        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial 
+/// specialization. Instantiated when scissor is enabled and conservative rast
+/// is disabled.
+template <typename RT>
+struct ComputeScissorEdges<std::true_type, std::false_type, RT>
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Compute scissor edge vectors and evaluate edge equations
+    INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
+                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
+    {
+        const SWR_RECT &scissor = scissorBBox;
+        POS topLeft{scissor.xmin, scissor.ymin};
+        POS bottomLeft{scissor.xmin, scissor.ymax};
+        POS topRight{scissor.xmax, scissor.ymin};
+        POS bottomRight{scissor.xmax, scissor.ymax};
+
+        // construct 4 scissor edges in ccw direction
+        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
+        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
+        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
+        ComputeEdgeData(topRight, topLeft, rastEdges[6]);
+
+        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
+        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
+        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
+        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
+
+        // Upper left rule for scissor
+        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
+        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for TrivialRejectTest. Should
+/// never be called, but TemplateUnroller instantiates a few unused values,
+/// so it calls a runtime assert instead of a static_assert.
+template <typename ValidEdgeMaskT>
+INLINE bool TrivialRejectTest(const int, const int, const int)
+{
+    SWR_INVALID("Primary templated function should never be called");
+    return false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
+/// and edge 1 for trivial coverage reject
+template <>
+INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
+{
+    return (!(mask0 && mask1)) ? true : false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
+/// and edge 2 for trivial coverage reject
+template <>
+INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
+{
+    return (!(mask0 && mask2)) ? true : false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
+/// and edge 2 for trivial coverage reject
+template <>
+INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
+{
+    return (!(mask1 && mask2)) ? true : false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
+/// primitive edges for trivial coverage reject
+template <>
+INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
+{
+    return (!(mask0 && mask1 && mask2)) ? true : false;;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
+/// point, so return false and rasterize against conservative BBox
+template <>
+INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
+{
+    return false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for TrivialAcceptTest. Always returns
+/// false, since it will only be called for degenerate tris, and as such 
+/// will never cover the entire raster tile
+template <typename ScissorEnableT>
+INLINE bool TrivialAcceptTest(const int, const int, const int)
+{
+    return false;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
+/// edge masks for a fully covered raster tile
+template <>
+INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
+{
+    return ((mask0 & mask1 & mask2) == 0xf);
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for GenerateSVInnerCoverage. Results
+/// in an empty function call if SVInnerCoverage isn't requested
+template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
+struct GenerateSVInnerCoverage
+{
+    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of GenerateSVInnerCoverage where all edges
+/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated 
+/// edge values from OuterConservative to InnerConservative and rasterizes.
+template <typename RT>
+struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
+{
+    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
+    {
+        SWR_CONTEXT *pContext = pDC->pContext;
+
+        double startQuadEdgesAdj[RT::NumEdgesT::value];
+        for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        {
+            startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
+        }
+
+        // not trivial accept or reject, must rasterize full tile
+        AR_BEGIN(BERasterizePartial, pDC->drawId);
+        innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
+        AR_END(BERasterizePartial, 0);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
+/// in an empty function call if SVInnerCoverage isn't requested
+template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
+struct UpdateEdgeMasksInnerConservative
+{
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
+                                           const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
+/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges 
+/// evaluated at raster tile corners to inner conservative position and 
+/// updates edge masks
+template <typename RT>
+struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
+{
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
+                                           const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
+    {
+        __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
+
+        // instead of keeping 2 copies of evaluated edges around, just compensate for the outer 
+        // conservative evaluated edge when adjusting the edge in for inner conservative tests
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
+
+        UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage 
+/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot 
+/// cover an entire raster tile, set mask0 to 0 to force it down the
+/// rastierizePartialTile path
+template <typename RT, typename ValidEdgeMaskT>
+struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
+{
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
+                                   const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
+    {
+        // set one mask to zero to force the triangle down the rastierizePartialTile path
+        mask0 = 0;
+    }
+};
+
+template <typename RT>
+void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
+{
+    SWR_CONTEXT *pContext = pDC->pContext;
+    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
+#if KNOB_ENABLE_TOSS_POINTS
+    if (KNOB_TOSS_BIN_TRIS)
+    {
+        return;
+    }
+#endif
+    AR_BEGIN(BERasterizeTriangle, pDC->drawId);
+    AR_BEGIN(BETriangleSetup, pDC->drawId);
+
+    const API_STATE &state = GetApiState(pDC);
+    const SWR_RASTSTATE &rastState = state.rastState;
+    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+
+    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
+    triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
+
+    __m128 vX, vY, vZ, vRecipW;
+    
+    // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
+    // eg: vX = [x0 x1 x2 dc]
+    vX = _mm_load_ps(workDesc.pTriBuffer);
+    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
+    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
+    vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
+
+    // convert to fixed point
+    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
+    __m128i vXi = fpToFixedPoint(vX);
+    __m128i vYi = fpToFixedPoint(vY);
+
+    // quantize floating point position to fixed point precision
+    // to prevent attribute creep around the triangle vertices
+    vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
+    vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
+
+    // triangle setup - A and B edge equation coefs
+    __m128 vA, vB;
+    triangleSetupAB(vX, vY, vA, vB);
+
+    __m128i vAi, vBi;
+    triangleSetupABInt(vXi, vYi, vAi, vBi);
+    
+    // determinant
+    float det = calcDeterminantInt(vAi, vBi);
+
+    // Verts in Pixel Coordinate Space at this point
+    // Det > 0 = CW winding order 
+    // Convert CW triangles to CCW
+    if (det > 0.0)
+    {
+        vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
+        vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
+        vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
+        vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
+        det = -det;
+    }
+
+    __m128 vC;
+    // Finish triangle setup - C edge coef
+    triangleSetupC(vX, vY, vA, vB, vC);
+
+    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
+    {
+        // If we have degenerate edge(s) to rasterize, set I and J coefs 
+        // to 0 for constant interpolation of attributes
+        triDesc.I[0] = 0.0f;
+        triDesc.I[1] = 0.0f;
+        triDesc.I[2] = 0.0f;
+        triDesc.J[0] = 0.0f;
+        triDesc.J[1] = 0.0f;
+        triDesc.J[2] = 0.0f;
+
+        // Degenerate triangles have no area
+        triDesc.recipDet = 0.0f;
+    }
+    else
+    {
+        // only extract coefs for 2 of the barycentrics; the 3rd can be 
+        // determined from the barycentric equation:
+        // i + j + k = 1 <=> k = 1 - j - i
+        _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
+        _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
+        _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
+        _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
+        _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
+        _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
+
+        // compute recipDet, used to calculate barycentric i and j in the backend
+        triDesc.recipDet = 1.0f/det;
+    }
+
+    OSALIGNSIMD(float) oneOverW[4];
+    _mm_store_ps(oneOverW, vRecipW);
+    triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
+    triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
+    triDesc.OneOverW[2] = oneOverW[2];
+
+    // calculate perspective correct coefs per vertex attrib 
+    float* pPerspAttribs = perspAttribsTLS;
+    float* pAttribs = workDesc.pAttribs;
+    triDesc.pPerspAttribs = pPerspAttribs;
+    triDesc.pAttribs = pAttribs;
+    float *pRecipW = workDesc.pTriBuffer + 12;
+    triDesc.pRecipW = pRecipW;
+    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
+    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
+    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
+    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
+    {
+        __m128 attribA = _mm_load_ps(pAttribs);
+        __m128 attribB = _mm_load_ps(pAttribs+=4);
+        __m128 attribC = _mm_load_ps(pAttribs+=4);
+        pAttribs+=4;
+
+        attribA = _mm_mul_ps(attribA, vOneOverWV0);
+        attribB = _mm_mul_ps(attribB, vOneOverWV1);
+        attribC = _mm_mul_ps(attribC, vOneOverWV2);
+
+        _mm_store_ps(pPerspAttribs, attribA);
+        _mm_store_ps(pPerspAttribs+=4, attribB);
+        _mm_store_ps(pPerspAttribs+=4, attribC);
+        pPerspAttribs+=4;
+    }
+
+    // compute bary Z
+    // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
+    OSALIGNSIMD(float) a[4];
+    _mm_store_ps(a, vZ);
+    triDesc.Z[0] = a[0] - a[2];
+    triDesc.Z[1] = a[1] - a[2];
+    triDesc.Z[2] = a[2];
+        
+    // add depth bias
+    triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
+
+    // Calc bounding box of triangle
+    OSALIGNSIMD(SWR_RECT) bbox;
+    calcBoundingBoxInt(vXi, vYi, bbox);
+
+    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
+
+    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
+    {
+        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
+        bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
+        SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
+                   "Conservative rast degenerate handling requires a valid scissor rect");
+    }
+
+    // Intersect with scissor/viewport
+    OSALIGNSIMD(SWR_RECT) intersect;
+    intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
+    intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
+    intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
+    intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
+
+    triDesc.triFlags = workDesc.triFlags;
+
+    // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
+    uint32_t macroX, macroY;
+    MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
+    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
+    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
+    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
+    int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
+
+    intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
+    intersect.ymin = std::max(intersect.ymin, macroBoxTop);
+    intersect.xmax = std::min(intersect.xmax, macroBoxRight);
+    intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
+
+    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
+
+    AR_END(BETriangleSetup, 0);
+
+    // update triangle desc
+    uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t numTilesX = maxTileX - minTileX + 1;
+    uint32_t numTilesY = maxTileY - minTileY + 1;
+
+    if (numTilesX == 0 || numTilesY == 0) 
+    {
+        RDTSC_EVENT(BEEmptyTriangle, 1, 0);
+        AR_END(BERasterizeTriangle, 1);
+        return;
+    }
+
+    AR_BEGIN(BEStepSetup, pDC->drawId);
+
+    // Step to pixel center of top-left pixel of the triangle bbox
+    // Align intersect bbox (top/left) to raster tile's (top/left).
+    int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
+    int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
+
+    // convenience typedef
+    typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
+
+    // single sample rasterization evaluates edges at pixel center,
+    // multisample evaluates edges UL pixel corner and steps to each sample position
+    if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+    {
+        // Add 0.5, in fixed point, to offset to pixel center
+        x += (FIXED_POINT_SCALE / 2);
+        y += (FIXED_POINT_SCALE / 2);
+    }
+
+    __m128i vTopLeftX = _mm_set1_epi32(x);
+    __m128i vTopLeftY = _mm_set1_epi32(y);
+
+    // evaluate edge equations at top-left pixel using 64bit math
+    // 
+    // line = Ax + By + C
+    // solving for C:
+    // C = -Ax - By
+    // we know x0 and y0 are on the line; plug them in:
+    // C = -Ax0 - By0
+    // plug C back into line equation:
+    // line = Ax - By - Ax0 - By0
+    // line = A(x - x0) + B(y - y0)
+    // dX = (x-x0), dY = (y-y0)
+    // so all this simplifies to 
+    // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
+
+    __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
+    __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
+
+    // evaluate A(dx) and B(dY) for all points
+    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
+    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
+    __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
+    __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
+
+    __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
+    __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
+    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
+
+    // apply any edge adjustments(top-left, crast, etc)
+    adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
+
+    // broadcast respective edge results to all lanes
+    double* pEdge = (double*)&vEdge;
+    __m256d vEdgeFix16[7];
+    vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
+    vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
+    vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
+
+    OSALIGNSIMD(int32_t) aAi[4], aBi[4];
+    _mm_store_si128((__m128i*)aAi, vAi);
+    _mm_store_si128((__m128i*)aBi, vBi);
+    EDGE rastEdges[RT::NumEdgesT::value];
+
+    // Compute and store triangle edge data
+    ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
+    ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
+    ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
+
+    // Compute and store triangle edge data if scissor needs to rasterized
+    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
+                       (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
+
+    // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
+    // used to for testing if entire raster tile is inside a triangle
+    for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+    {
+        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
+    }
+
+    // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
+    // step sample positions to the raster tile bbox of multisample points
+    // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
+    //                             |      |
+    //                             |      |
+    // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
+    __m256d vEdgeTileBbox[3];
+    if (NumCoverageSamplesT::value > 1)
+    {
+        const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
+        const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
+        const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
+
+        __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
+        __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
+
+        // step edge equation tests from Tile
+        // used to for testing if entire raster tile is inside a triangle
+        for (uint32_t e = 0; e < 3; ++e)
+        {
+            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
+            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
+            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+
+            // adjust for msaa tile bbox edges outward for conservative rast, if enabled
+            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
+        }
+    }
+
+    AR_END(BEStepSetup, 0);
+
+    uint32_t tY = minTileY;
+    uint32_t tX = minTileX;
+    uint32_t maxY = maxTileY;
+    uint32_t maxX = maxTileX;
+
+    RenderOutputBuffers renderBuffers, currentRenderBufferRow;
+    GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
+    currentRenderBufferRow = renderBuffers;
+
+    // rasterize and generate coverage masks per sample
+    for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
+    {
+        __m256d vStartOfRowEdge[RT::NumEdgesT::value];
+        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        {
+            vStartOfRowEdge[e] = vEdgeFix16[e];
+        }
+
+        for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
+        {
+            triDesc.anyCoveredSamples = 0;
+
+            // is the corner of the edge outside of the raster tile? (vEdge < 0)
+            int mask0, mask1, mask2;
+            UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
+
+            for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
+            {
+                // trivial reject, at least one edge has all 4 corners of raster tile outside
+                bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
+
+                if (!trivialReject)
+                {
+                    // trivial accept mask
+                    triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
+
+                    // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
+                    UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
+                        (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
+
+                    // @todo Make this a bit smarter to allow use of trivial accept when:
+                    //   1) scissor/vp intersection rect is raster tile aligned
+                    //   2) raster tile is entirely within scissor/vp intersection rect
+                    if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
+                    {
+                        // trivial accept, all 4 corners of all 3 edges are negative 
+                        // i.e. raster tile completely inside triangle
+                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
+                        if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
+                        {
+                            triDesc.innerCoverageMask = 0xffffffffffffffffULL;
+                        }
+                        RDTSC_EVENT(BETrivialAccept, 1, 0);
+                    }
+                    else
+                    {
+                        __m256d vEdgeAtSample[RT::NumEdgesT::value];
+                        if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+                        {
+                            // should get optimized out for single sample case (global value numbering or copy propagation)
+                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+                            {
+                                vEdgeAtSample[e] = vEdgeFix16[e];
+                            }
+                        }
+                        else
+                        {
+                            const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
+                            __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
+                            __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
+                            __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
+                            __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
+
+                            // step edge equation tests from UL tile corner to pixel sample position
+                            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+                            {
+                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
+                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
+                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
+                            }
+                        }
+
+                        double startQuadEdges[RT::NumEdgesT::value];
+                        const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+                        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+                        {
+                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
+                        }
+
+                        // not trivial accept or reject, must rasterize full tile
+                        AR_BEGIN(BERasterizePartial, pDC->drawId);
+                        triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
+                        AR_END(BERasterizePartial, 0);
+
+                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
+                        
+                        // Output SV InnerCoverage, if needed
+                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
+                    }
+                }
+                else
+                {
+                    // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
+                    if(NumCoverageSamplesT::value > 1)
+                    {
+                        triDesc.coverageMask[sampleNum] = 0;
+                    }
+                    RDTSC_EVENT(BETrivialReject, 1, 0);
+                }
+            }
+
+#if KNOB_ENABLE_TOSS_POINTS
+            if(KNOB_TOSS_RS)
+            {
+                gToss = triDesc.coverageMask[0];
+            }
+            else
+#endif
+            if(triDesc.anyCoveredSamples)
+            {
+                // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
+                // copy conservative coverage result to all samples
+                if(RT::IsConservativeT::value)
+                {
+                    auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
+                    UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
+                }
+
+                AR_BEGIN(BEPixelBackend, pDC->drawId);
+                backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
+                AR_END(BEPixelBackend, 0);
+            }
+
+            // step to the next tile in X
+            for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+            {
+                vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
+            }
+            StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers);
+        }
+
+        // step to the next tile in Y
+        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        {
+            vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
+        }
+        StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow);
+    }
+
+    AR_END(BERasterizeTriangle, 1);
+}
+
+// Get pointers to hot tile memory for color RT, depth, stencil
+template <uint32_t numSamples>
+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
+{
+    const API_STATE& state = GetApiState(pDC);
+    SWR_CONTEXT *pContext = pDC->pContext;
+
+    uint32_t mx, my;
+    MacroTileMgr::getTileIndices(macroID, mx, my);
+    tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
+    tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
+
+    // compute tile offset for active hottile buffers
+    const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
+    uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
+    offset*=numSamples;
+
+    unsigned long rtSlot = 0;
+    uint32_t colorHottileEnableMask = state.colorHottileEnable;
+    while(_BitScanForward(&rtSlot, colorHottileEnableMask))
+    {
+        HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, 
+            numSamples, renderTargetArrayIndex);
+        pColor->state = HOTTILE_DIRTY;
+        renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
+        
+        colorHottileEnableMask &= ~(1 << rtSlot);
+    }
+    if(state.depthHottileEnable)
+    {
+        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
+        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
+        offset*=numSamples;
+        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, 
+            numSamples, renderTargetArrayIndex);
+        pDepth->state = HOTTILE_DIRTY;
+        SWR_ASSERT(pDepth->pBuffer != nullptr);
+        renderBuffers.pDepth = pDepth->pBuffer + offset;
+    }
+    if(state.stencilHottileEnable)
+    {
+        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
+        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
+        offset*=numSamples;
+        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, 
+            numSamples, renderTargetArrayIndex);
+        pStencil->state = HOTTILE_DIRTY;
+        SWR_ASSERT(pStencil->pBuffer != nullptr);
+        renderBuffers.pStencil = pStencil->pBuffer + offset;
+    }
+}
+
+template <typename RT>
+INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers)
+{
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        buffers.pColor[rt] += RT::colorRasterTileStep;
+    }
+    
+    buffers.pDepth += RT::depthRasterTileStep;
+    buffers.pStencil += RT::stencilRasterTileStep;
+}
+
+template <typename RT>
+INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
+{
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
+        buffers.pColor[rt] = startBufferRow.pColor[rt];
+    }
+    startBufferRow.pDepth += RT::depthRasterTileRowStep;
+    buffers.pDepth = startBufferRow.pDepth;
+
+    startBufferRow.pStencil += RT::stencilRasterTileRowStep;
+    buffers.pStencil = startBufferRow.pStencil;
+}
+
-- 
2.7.4



More information about the mesa-dev mailing list