[Mesa-dev] [PATCH 02/13] swr/rast: simdlib better separation of core vs. knights avx512

Mon Jul 31 19:40:00 UTC 2017

---
 src/gallium/drivers/swr/Makefile.am                |   2 +-
 src/gallium/drivers/swr/Makefile.sources           |   8 +
 .../drivers/swr/rasterizer/common/simdlib.hpp      |  21 ++-
 .../swr/rasterizer/common/simdlib_128_avx512.inl   | 108 +++---------
 .../rasterizer/common/simdlib_128_avx512_core.inl  | 193 +++++++++++++++++++++
 .../common/simdlib_128_avx512_knights.inl          |  35 ++++
 .../swr/rasterizer/common/simdlib_256_avx512.inl   | 128 +++-----------
 .../rasterizer/common/simdlib_256_avx512_core.inl  | 127 ++++++++++++++
 .../common/simdlib_256_avx512_knights.inl          |  35 ++++
 .../swr/rasterizer/common/simdlib_512_avx512.inl   |  79 +++------
 .../rasterizer/common/simdlib_512_avx512_core.inl  | 181 +++++++++++++++++++
 .../common/simdlib_512_avx512_knights.inl          | 183 +++++++++++++++++++
 .../common/simdlib_512_avx512_masks_core.inl       |  27 +++
 .../common/simdlib_512_avx512_masks_knights.inl    |  27 +++
 .../swr/rasterizer/common/simdlib_types.hpp        |   2 +-
 15 files changed, 911 insertions(+), 245 deletions(-)
 create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
 create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
 create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl
 create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl
 create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl
 create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
 create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl
 create mode 100644 src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl

diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am
index 05fc3b3..73fe904 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -285,7 +285,7 @@ lib_LTLIBRARIES += libswrKNL.la
 libswrKNL_la_CXXFLAGS = \
 	$(PTHREAD_CFLAGS) \
 	$(SWR_KNL_CXXFLAGS) \
-	-DKNOB_ARCH=KNOB_ARCH_AVX512 -DAVX512F_STRICT \
+	-DKNOB_ARCH=KNOB_ARCH_AVX512 -DSIMD_ARCH_KNIGHTS \
 	$(COMMON_CXXFLAGS)
 
 libswrKNL_la_SOURCES = \
diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources
index 3c1118b..53f8bf0 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -69,11 +69,19 @@ COMMON_CXX_SOURCES := \
 	rasterizer/common/simdlib_128_avx.inl \
 	rasterizer/common/simdlib_128_avx2.inl \
 	rasterizer/common/simdlib_128_avx512.inl \
+	rasterizer/common/simdlib_128_avx512_core.inl \
+	rasterizer/common/simdlib_128_avx512_knights.inl \
 	rasterizer/common/simdlib_256_avx.inl \
 	rasterizer/common/simdlib_256_avx2.inl \
 	rasterizer/common/simdlib_256_avx512.inl \
+	rasterizer/common/simdlib_256_avx512_core.inl \
+	rasterizer/common/simdlib_256_avx512_knights.inl \
 	rasterizer/common/simdlib_512_avx512.inl \
+	rasterizer/common/simdlib_512_avx512_core.inl \
+	rasterizer/common/simdlib_512_avx512_knights.inl \
 	rasterizer/common/simdlib_512_avx512_masks.inl \
+	rasterizer/common/simdlib_512_avx512_masks_core.inl \
+	rasterizer/common/simdlib_512_avx512_masks_knights.inl \
 	rasterizer/common/simdlib_512_emu.inl \
 	rasterizer/common/simdlib_512_emu_masks.inl \
 	rasterizer/common/simdlib_interface.hpp \
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
index fb11132..0c79cdd 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -55,6 +55,11 @@ namespace SIMDImpl
         {
 #define __SIMD_LIB_AVX512_HPP__
 #include "simdlib_128_avx512.inl"
+#if defined(SIMD_ARCH_KNIGHTS)
+#include "simdlib_128_avx512_knights.inl"
+#else // optimize for core
+#include "simdlib_128_avx512_core.inl"
+#endif // defined(SIMD_ARCH_KNIGHTS)
 #undef __SIMD_LIB_AVX512_HPP__
         }; // struct AVX2Impl
 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
@@ -105,6 +110,11 @@ namespace SIMDImpl
         {
 #define __SIMD_LIB_AVX512_HPP__
 #include "simdlib_256_avx512.inl"
+#if defined(SIMD_ARCH_KNIGHTS)
+#include "simdlib_256_avx512_knights.inl"
+#else // optimize for core
+#include "simdlib_256_avx512_core.inl"
+#endif // defined(SIMD_ARCH_KNIGHTS)
 #undef __SIMD_LIB_AVX512_HPP__
         }; // struct AVX2Impl
 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
@@ -150,13 +160,20 @@ namespace SIMDImpl
 
 
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
-        struct AVX512Impl
+        struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
         {
 #define __SIMD_LIB_AVX512_HPP__
 #include "simdlib_512_avx512.inl"
 #include "simdlib_512_avx512_masks.inl"
+#if defined(SIMD_ARCH_KNIGHTS)
+#include "simdlib_512_avx512_knights.inl"
+#include "simdlib_512_avx512_masks_knights.inl"
+#else // optimize for core
+#include "simdlib_512_avx512_core.inl"
+#include "simdlib_512_avx512_masks_core.inl"
+#endif // defined(SIMD_ARCH_KNIGHTS)
 #undef __SIMD_LIB_AVX512_HPP__
-        }; // struct AVX512Impl
+        }; // struct AVX512Impl
 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
 
         struct Traits : SIMDImpl::Traits
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
index 012f310..66e8309 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
@@ -78,34 +78,6 @@ public:
     }
 #define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
 
-#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Double SIMDCALL op(Double a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
-    }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
-#endif
-
-#define SIMD_DWRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Double SIMDCALL op(Double a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
-    }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
-#endif
-
-#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
-    }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
-#endif
-
 #define SIMD_DWRAPPER_2I(op)  \
     template<int ImmT>\
     static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
@@ -119,11 +91,6 @@ public:
         return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
     }
 #define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
-#endif
 
 #define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
     template<int ImmT> \
@@ -132,11 +99,6 @@ public:
         return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
     }
 #define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
-#endif
 
 #define SIMD_IWRAPPER_2_(op, intrin, mask)  \
     static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
@@ -144,11 +106,6 @@ public:
         return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
     }
 #define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
-#endif
 
 #define SIMD_IWRAPPER_2I(op)  \
     template<int ImmT>\
@@ -182,12 +139,8 @@ SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
 SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
 SIMD_IWRAPPER_2_32(mul_epi32);  // return a * b (int32)
 
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
-SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
-
-#endif
+// SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
+// SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
 
 // return (a * b) & 0xFFFFFFFF
 //
@@ -196,12 +149,8 @@ SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uin
 SIMD_IWRAPPER_2_32(mullo_epi32);
 SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
 
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
-SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
-
-#endif
+// SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
+// SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
 
 //-----------------------------------------------------------------------
 // Logical operations
@@ -253,14 +202,10 @@ SIMD_IWRAPPER_2_32(srlv_epi32);                // return a >> b      (uint32)
 //-----------------------------------------------------------------------
 // Blend / shuffle / permute operations
 //-----------------------------------------------------------------------
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-
-#endif
+// SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+// SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+// SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+// SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
 // SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
 
 //static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
@@ -278,16 +223,12 @@ SIMD_IWRAPPER_1I_32(shuffle_epi32);
 SIMD_IWRAPPER_2_32(unpackhi_epi32);
 SIMD_IWRAPPER_2_32(unpacklo_epi32);
 
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_16(unpackhi_epi16);
-SIMD_IWRAPPER_2_64(unpackhi_epi64);
-SIMD_IWRAPPER_2_8(unpackhi_epi8);
-SIMD_IWRAPPER_2_16(unpacklo_epi16);
-SIMD_IWRAPPER_2_64(unpacklo_epi64);
-SIMD_IWRAPPER_2_8(unpacklo_epi8);
-
-#endif
+// SIMD_IWRAPPER_2_16(unpackhi_epi16);
+// SIMD_IWRAPPER_2_64(unpackhi_epi64);
+// SIMD_IWRAPPER_2_8(unpackhi_epi8);
+// SIMD_IWRAPPER_2_16(unpacklo_epi16);
+// SIMD_IWRAPPER_2_64(unpacklo_epi64);
+// SIMD_IWRAPPER_2_8(unpacklo_epi8);
 
 //-----------------------------------------------------------------------
 // Load / store operations
@@ -338,16 +279,12 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In
                     static_cast<int>(ScaleT)));
 }
 
-#if !defined(AVX512F_STRICT)
-
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
-{
-    __mmask64 m = 0xffffull;
-    return static_cast<uint32_t>(
-        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
-}
-
-#endif
+// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+// {
+//     __mmask64 m = 0xffffull;
+//     return static_cast<uint32_t>(
+//         _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+// }
 
 static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
 {
@@ -366,6 +303,11 @@ static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
     _mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a));
 }
 
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)   // expand the low 4 bits of mask into a per-lane vector mask
+{
+    return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xf), -1)));   // 32-bit lane i = all ones if bit i of mask is set, else 0
+}
+
 //=======================================================================
 // Legacy interface (available only in SIMD256 width)
 //=======================================================================
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
new file mode 100644
index 0000000..a4ecd09
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
@@ -0,0 +1,193 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (512) implementation for Core processors
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+#define SIMD_WRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_WRAPPER_3_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+    }
+#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
+
+#define SIMD_DWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))   // two-operand integer op, mask enables the low 16 lanes (8-bit elements)
+#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))   // two-operand integer op, mask enables the low 8 lanes (16-bit elements)
+#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))   // two-operand integer op, mask enables the low 2 lanes (64-bit elements)
+
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
+SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2_16(unpackhi_epi16);
+SIMD_IWRAPPER_2_64(unpackhi_epi64);
+SIMD_IWRAPPER_2_8(unpackhi_epi8);
+SIMD_IWRAPPER_2_16(unpacklo_epi16);
+SIMD_IWRAPPER_2_64(unpacklo_epi64);
+SIMD_IWRAPPER_2_8(unpacklo_epi8);
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)   // return a bitmask with bit i set when the top bit of byte i of a is set
+{
+    __mmask64 m = 0xffffull;   // only the low 16 byte lanes take part in the test
+    return static_cast<uint32_t>(
+        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));   // per-byte (a & 0x80) != 0
+}
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_1I_
+#undef SIMD_WRAPPER_1I
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_32
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_32
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_32
+#undef SIMD_IWRAPPER_2_64
+#undef SIMD_IWRAPPER_2I
+//#undef SIMD_IWRAPPER_2I_8
+//#undef SIMD_IWRAPPER_2I_16
+//#undef SIMD_IWRAPPER_2I_32
+//#undef SIMD_IWRAPPER_2I_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
new file mode 100644
index 0000000..b0cae50
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
@@ -0,0 +1,35 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (512) implementation for Knights Family
+//
+// Since this implementation inherits from the AVX512Base implementation,
+// the only operations below are ones that replace AVX512F / AVX512CD operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
index a8d2a4b..3f93cfb 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
@@ -78,34 +78,6 @@ public:
     }
 #define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
 
-#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Double SIMDCALL op(Double a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
-    }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
-#endif
-
-#define SIMD_DWRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Double SIMDCALL op(Double a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
-    }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
-#endif
-
-#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
-    }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
-#endif
-
 #define SIMD_DWRAPPER_2I(op)  \
     template<int ImmT>\
     static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
@@ -119,11 +91,6 @@ public:
         return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
     }
 #define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
-#endif
 
 #define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
     template<int ImmT> \
@@ -132,11 +99,6 @@ public:
         return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
     }
 #define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
-#endif
 
 #define SIMD_IWRAPPER_2_(op, intrin, mask)  \
     static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
@@ -144,11 +106,6 @@ public:
         return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
     }
 #define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
-#endif
 
 #define SIMD_IWRAPPER_2I(op)  \
     template<int ImmT>\
@@ -182,12 +139,8 @@ SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
 SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
 SIMD_IWRAPPER_2_32(mul_epi32);  // return a * b (int32)
 
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
-SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
-
-#endif
+// SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
+// SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
 
 // return (a * b) & 0xFFFFFFFF
 //
@@ -196,12 +149,8 @@ SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uin
 SIMD_IWRAPPER_2_32(mullo_epi32);
 SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
 
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
-SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
-
-#endif
+// SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
+// SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
 
 //-----------------------------------------------------------------------
 // Logical operations
@@ -253,14 +202,10 @@ SIMD_IWRAPPER_2_32(srlv_epi32);                // return a >> b      (uint32)
 //-----------------------------------------------------------------------
 // Blend / shuffle / permute operations
 //-----------------------------------------------------------------------
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-
-#endif
+// SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+// SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+// SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+// SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
 
 // SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
 
@@ -279,16 +224,12 @@ SIMD_IWRAPPER_1I_32(shuffle_epi32);
 SIMD_IWRAPPER_2_32(unpackhi_epi32);
 SIMD_IWRAPPER_2_32(unpacklo_epi32);
 
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_16(unpackhi_epi16);
-SIMD_IWRAPPER_2_64(unpackhi_epi64);
-SIMD_IWRAPPER_2_8(unpackhi_epi8);
-SIMD_IWRAPPER_2_16(unpacklo_epi16);
-SIMD_IWRAPPER_2_64(unpacklo_epi64);
-SIMD_IWRAPPER_2_8(unpacklo_epi8);
-
-#endif
+// SIMD_IWRAPPER_2_16(unpackhi_epi16);
+// SIMD_IWRAPPER_2_64(unpackhi_epi64);
+// SIMD_IWRAPPER_2_8(unpackhi_epi8);
+// SIMD_IWRAPPER_2_16(unpacklo_epi16);
+// SIMD_IWRAPPER_2_64(unpacklo_epi64);
+// SIMD_IWRAPPER_2_8(unpacklo_epi8);
 
 //-----------------------------------------------------------------------
 // Load / store operations
@@ -339,16 +280,12 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In
                     static_cast<int>(ScaleT)));
 }
 
-#if !defined(AVX512F_STRICT)
-
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
-{
-    __mmask64 m = 0xffffffffull;
-    return static_cast<uint32_t>(
-        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
-}
-
-#endif
+// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+// {
+//     __mmask64 m = 0xffffffffull;
+//     return static_cast<uint32_t>(
+//         _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+// }
 
 static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
 {
@@ -367,6 +304,11 @@ static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
     _mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a));
 }
 
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)   // expand the low 8 bits of mask into a per-lane vector mask
+{
+    return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xff), -1)));   // 32-bit lane i = all ones if bit i of mask is set, else 0
+}
+
 //=======================================================================
 // Legacy interface (available only in SIMD256 width)
 //=======================================================================
@@ -380,30 +322,10 @@ static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
 #undef SIMD_WRAPPER_2I
 #undef SIMD_WRAPPER_3_
 #undef SIMD_WRAPPER_3
-#undef SIMD_DWRAPPER_1_
-#undef SIMD_DWRAPPER_1
-#undef SIMD_DWRAPPER_1I_
-#undef SIMD_DWRAPPER_1I
-#undef SIMD_DWRAPPER_2_
-#undef SIMD_DWRAPPER_2
-#undef SIMD_DWRAPPER_2I
 #undef SIMD_IWRAPPER_1_
-#undef SIMD_IWRAPPER_1_8
-#undef SIMD_IWRAPPER_1_16
 #undef SIMD_IWRAPPER_1_32
-#undef SIMD_IWRAPPER_1_64
 #undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_1I_8
-#undef SIMD_IWRAPPER_1I_16
 #undef SIMD_IWRAPPER_1I_32
-#undef SIMD_IWRAPPER_1I_64
 #undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2_8
-#undef SIMD_IWRAPPER_2_16
 #undef SIMD_IWRAPPER_2_32
-#undef SIMD_IWRAPPER_2_64
 #undef SIMD_IWRAPPER_2I
-//#undef SIMD_IWRAPPER_2I_8
-//#undef SIMD_IWRAPPER_2I_16
-//#undef SIMD_IWRAPPER_2I_32
-//#undef SIMD_IWRAPPER_2I_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl
new file mode 100644
index 0000000..6ffe7c2
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl
@@ -0,0 +1,127 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (512) implementation for Core processors
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask)  /* Double op(Double a), via zero-masked _mm512_maskz_<intrin> */ \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))   // 0xf: 4 mask bits, one per 64-bit lane
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask)  /* as SIMD_DWRAPPER_1_, plus a template immediate ImmT passed as last argument */ \
+    template<int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))   // 0xf: 4 mask bits, one per 64-bit lane
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask)  /* Double op(Double a, Double b), via zero-masked _mm512_maskz_<intrin> */ \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))   // 0xf: 4 mask bits, one per 64-bit lane
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask)  /* Integer op(Integer a), via zero-masked _mm512_maskz_<intrin> */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))   // 32 mask bits, one per 8-bit lane
+#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))          // 16 mask bits, one per 16-bit lane
+#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))              // 4 mask bits, one per 64-bit lane
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)  /* as SIMD_IWRAPPER_1_, plus a template immediate ImmT passed as last argument */ \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))   // 32 mask bits, one per 8-bit lane
+#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))          // 16 mask bits, one per 16-bit lane
+#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))              // 4 mask bits, one per 64-bit lane
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask)  /* Integer op(Integer a, Integer b), via zero-masked _mm512_maskz_<intrin> */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))   // 32 mask bits, one per 8-bit result lane
+#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))          // 16 mask bits, one per 16-bit result lane
+#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))              // 4 mask bits, one per 64-bit result lane
+
+
+SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
+SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2_16(unpackhi_epi16);     // See documentation for _mm256_unpackhi_epi16 and _mm512_unpackhi_epi16
+SIMD_IWRAPPER_2_64(unpackhi_epi64);     // See documentation for _mm256_unpackhi_epi64 and _mm512_unpackhi_epi64
+SIMD_IWRAPPER_2_8(unpackhi_epi8);       // See documentation for _mm256_unpackhi_epi8 and _mm512_unpackhi_epi8
+SIMD_IWRAPPER_2_16(unpacklo_epi16);     // See documentation for _mm256_unpacklo_epi16 and _mm512_unpacklo_epi16
+SIMD_IWRAPPER_2_64(unpacklo_epi64);     // See documentation for _mm256_unpacklo_epi64 and _mm512_unpacklo_epi64
+SIMD_IWRAPPER_2_8(unpacklo_epi8);       // See documentation for _mm256_unpacklo_epi8 and _mm512_unpacklo_epi8
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)   // result bit i = (byte i of a) & 0x80 is non-zero
+{
+    __mmask64 m = 0xffffffffull;   // restrict the test to the low 32 byte lanes
+    return static_cast<uint32_t>(
+        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));   // 0x80 selects the top bit of every byte
+}
+
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl
new file mode 100644
index 0000000..acd8ffd
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl
@@ -0,0 +1,35 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (512) implementation for Knights Family
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
index 7447d35..1f93da7 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -25,7 +25,7 @@
 #endif
 
 #if defined(__GNUC__) && !defined( __clang__) && !defined(__INTEL_COMPILER)
-// gcc missing these intrinsics
+// gcc as of 7.1 was missing these intrinsics
 #ifndef _mm512_cmpneq_ps_mask
 #define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ)
 #endif
@@ -37,14 +37,13 @@
 #ifndef _mm512_cmplt_pd_mask
 #define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS)
 #endif
+
 #endif
 
 //============================================================================
-// SIMD16 AVX512 (F) implementation
+// SIMD16 AVX512 (F) implementation (compatible with Knights and Core
+// processors)
 //
-//  TODO: Optimize for KNL / KNH or for SKX??
-//      For now probably optimizing more for KNL as that's where
-//      immediate customers are.
 //============================================================================
 
 static const int TARGET_SIMD_WIDTH = 16;
@@ -153,34 +152,11 @@ using SIMD256T = SIMD256Impl::AVX2Impl;
     }
 #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
 
-#define SIMD_EMU_IWRAPPER_2(op) \
-    static SIMDINLINE \
-    Integer SIMDCALL op(Integer a, Integer b)\
-    {\
-        return Integer\
-        {\
-            SIMD256T::op(a.v8[0], b.v8[0]),\
-            SIMD256T::op(a.v8[1], b.v8[1]),\
-        };\
-    }
-
 private:
-    static SIMDINLINE Integer vmask(__mmask8 m)
-    {
-        return _mm512_maskz_set1_epi64(m, -1LL);
-    }
     static SIMDINLINE Integer vmask(__mmask16 m)
     {
         return _mm512_maskz_set1_epi32(m, -1);
     }
-    static SIMDINLINE Integer vmask(__mmask32 m)
-    {
-        return _mm512_maskz_set1_epi16(m, -1);
-    }
-    static SIMDINLINE Integer vmask(__mmask64 m)
-    {
-        return _mm512_maskz_set1_epi8(m, -1);
-    }
 
 public:
 //-----------------------------------------------------------------------
@@ -236,21 +212,10 @@ SIMD_IWRAPPER_2_(andnot_si, andnot_si512);  // return (~a) & b    (int)
 SIMD_IWRAPPER_2_(or_si, or_si512);          // return a | b       (int)
 SIMD_IWRAPPER_2_(xor_si, xor_si512);        // return a ^ b       (int)
 
-#if defined(AVX512F_STRICT)
-
-SIMD_WRAPPERI_2_(and_ps, and_epi32);          // return a & b       (float treated as int)
-SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32);    // return (~a) & b    (float treated as int)
-SIMD_WRAPPERI_2_(or_ps, or_epi32);            // return a | b       (float treated as int)
-SIMD_WRAPPERI_2_(xor_ps, xor_epi32);          // return a ^ b       (float treated as int)
-
-#else
-
-SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
-SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
-SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
-SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)
-
-#endif
+// SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
+// SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
+// SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
+// SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)
 
 
 //-----------------------------------------------------------------------
@@ -260,6 +225,17 @@ SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
 SIMD_IWRAPPER_2(sllv_epi32);
 SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
 SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
+
+#if 0
+SIMD_IWRAPPER_1I_(srli_si, srli_si512);     // return a >> (ImmT*8) (uint)
+
+template<int ImmT>                              // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+#endif
+
 SIMD_IWRAPPER_2(srlv_epi32);
 
 //-----------------------------------------------------------------------
@@ -461,17 +437,10 @@ static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
     return _mm512_inserti64x4(a, b, imm);
 }
 
-#if !defined(AVX512F_STRICT)
-SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
-#else
-SIMD_EMU_IWRAPPER_2(packs_epi16)
-SIMD_EMU_IWRAPPER_2(packs_epi32)
-SIMD_EMU_IWRAPPER_2(packus_epi16)
-SIMD_EMU_IWRAPPER_2(packus_epi32)
-#endif
+// SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
+// SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
+// SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
+// SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
 
 static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
 {
@@ -704,4 +673,4 @@ static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
 #undef SIMD_IWRAPPER_2
 #undef SIMD_IWRAPPER_2_
 #undef SIMD_IWRAPPER_2I
-#undef SIMD_EMU_IWRAPPER_2
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl
new file mode 100644
index 0000000..5063c52
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl
@@ -0,0 +1,181 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX512 (F) implementation for Core processors
+//
+//============================================================================
+
+#define SIMD_WRAPPER_1_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return intrin(a);\
+    }
+
+#define SIMD_WRAPPER_1(op)  \
+    SIMD_WRAPPER_1_(op, _mm512_##op)
+
+#define SIMD_WRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_##intrin(a, b);\
+    }
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
+
+#define SIMD_WRAPPERI_2_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_castsi512_ps(_mm512_##intrin(\
+            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+    }
+
+#define SIMD_DWRAPPER_2(op)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm512_##op(a, b);\
+    }
+
+#define SIMD_WRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_WRAPPER_2I(op)  SIMD_WRAPPER_2I_(op, op)
+
+#define SIMD_DWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_DWRAPPER_2I(op)  SIMD_DWRAPPER_2I_(op, op)
+
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm512_##op(a, b, c);\
+    }
+
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+#define SIMD_IWRAPPER_1_8(op)  \
+    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1_4(op)  \
+    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1I_(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return intrin(a, ImmT);\
+    }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
+
+#define SIMD_IWRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm512_##intrin(a, b);\
+    }
+#define SIMD_IWRAPPER_2(op)  SIMD_IWRAPPER_2_(op, op)
+
+#define SIMD_IWRAPPER_2_CMP(op, cmp)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return cmp(a, b);\
+    }
+
+#define SIMD_IFWRAPPER_2(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+
+#define SIMD_IWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
+
+private:
+    static SIMDINLINE Integer vmask(__mmask8 m)    // 64-bit lanes: -1 where the matching bit of m is set, 0 elsewhere
+    {
+        return _mm512_maskz_set1_epi64(m, -1LL);
+    }
+    static SIMDINLINE Integer vmask(__mmask32 m)   // 16-bit lanes: -1 where the matching bit of m is set, 0 elsewhere
+    {
+        return _mm512_maskz_set1_epi16(m, -1);
+    }
+    static SIMDINLINE Integer vmask(__mmask64 m)   // 8-bit lanes: -1 where the matching bit of m is set, 0 elsewhere
+    {
+        return _mm512_maskz_set1_epi8(m, -1);
+    }
+
+public:
+SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
+SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
+SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
+SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)
+
+SIMD_IWRAPPER_2(packs_epi16);   // int16 --> int8    See documentation for _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);   // int32 --> int16   See documentation for _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);  // uint16 --> uint8  See documentation for _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);  // uint32 --> uint16 See documentation for _mm512_packus_epi32
+
+#undef SIMD_WRAPPER_1_   // every helper macro #defined in this file is #undef'd exactly once below
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPERI_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_DWRAPPER_2I_
+#undef SIMD_WRAPPER_2I_
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_4
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_CMP
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
new file mode 100644
index 0000000..310f154
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
@@ -0,0 +1,183 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX512 (F) implementation for Knights Family Processors
+//
+//============================================================================
+
+static const int TARGET_SIMD_WIDTH = 16;
+using SIMD256T = SIMD256Impl::AVX2Impl;
+
+#define SIMD_WRAPPER_1_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return intrin(a);\
+    }
+
+#define SIMD_WRAPPER_1(op)  \
+    SIMD_WRAPPER_1_(op, _mm512_##op)
+
+#define SIMD_WRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_##intrin(a, b);\
+    }
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
+
+#define SIMD_WRAPPERI_2_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_castsi512_ps(_mm512_##intrin(\
+            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+    }
+
+#define SIMD_DWRAPPER_2(op)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm512_##op(a, b);\
+    }
+
+#define SIMD_WRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_WRAPPER_2I(op)  SIMD_WRAPPER_2I_(op, op)
+
+#define SIMD_DWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_DWRAPPER_2I(op)  SIMD_DWRAPPER_2I_(op, op)
+
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm512_##op(a, b, c);\
+    }
+
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+#define SIMD_IWRAPPER_1_8(op)  \
+    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1_4(op)  \
+    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1I_(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return intrin(a, ImmT);\
+    }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
+
+#define SIMD_IWRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm512_##intrin(a, b);\
+    }
+#define SIMD_IWRAPPER_2(op)  SIMD_IWRAPPER_2_(op, op)
+
+#define SIMD_IWRAPPER_2_CMP(op, cmp)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return cmp(a, b);\
+    }
+
+#define SIMD_IFWRAPPER_2(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+
+#define SIMD_IWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
+
+private:
+    static SIMDINLINE Integer vmask(__mmask8 m)    // 64-bit lanes: -1 where the matching bit of m is set, 0 elsewhere
+    {
+        return _mm512_maskz_set1_epi64(m, -1LL);
+    }
+    static SIMDINLINE Integer vmask(__mmask16 m)   // 32-bit lanes: -1 where the matching bit of m is set, 0 elsewhere
+    {
+        return _mm512_maskz_set1_epi32(m, -1);
+    }
+    static SIMDINLINE Integer vmask(__mmask32 m)   // 16-bit lanes: -1 where the matching bit of m is set, 0 elsewhere
+    {
+        return _mm512_maskz_set1_epi16(m, -1);     // NOTE(review): AVX512BW intrinsic - confirm it is usable on Knights (AVX512F-only) targets
+    }
+    static SIMDINLINE Integer vmask(__mmask64 m)   // 8-bit lanes: -1 where the matching bit of m is set, 0 elsewhere
+    {
+        return _mm512_maskz_set1_epi8(m, -1);      // NOTE(review): AVX512BW intrinsic - confirm it is usable on Knights (AVX512F-only) targets
+    }
+
+public:
+SIMD_WRAPPERI_2_(and_ps, and_epi32);          // return a & b       (float treated as int)
+SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32);    // return (~a) & b    (float treated as int)
+SIMD_WRAPPERI_2_(or_ps, or_epi32);            // return a | b       (float treated as int)
+SIMD_WRAPPERI_2_(xor_ps, xor_epi32);          // return a ^ b       (float treated as int)
+
+#undef SIMD_WRAPPER_1_   // every helper macro #defined in this file is #undef'd exactly once below
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPERI_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_DWRAPPER_2I_
+#undef SIMD_WRAPPER_2I_
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_4
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_CMP
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl
new file mode 100644
index 0000000..3e36ce5
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl
@@ -0,0 +1,27 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// Implement mask-enabled SIMD functions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl
new file mode 100644
index 0000000..3e36ce5
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl
@@ -0,0 +1,27 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// Implement mask-enabled SIMD functions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
index bc23867..236257f 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
@@ -262,7 +262,7 @@ namespace SIMDImpl
 
     namespace SIMD512Impl
     {
-#if !defined(__AVX512F__)
+#if !(defined(__AVX512F__) || defined(_MM_K0_REG))
         // Define AVX512 types if not included via immintrin.h.
         // All data members of these types are ONLY to viewed
         // in a debugger.  Do NOT access them via code!
-- 
2.7.4