[Mesa-dev] [PATCH 11/13] swr/rast: fixes for 32-bit builds

Mon Jul 31 19:40:09 UTC 2017

Still doesn't compile in 32-bit with the Intel Compiler.
---
 .../drivers/swr/rasterizer/common/simdintrin.h     |  14 +-
 .../drivers/swr/rasterizer/common/simdlib.hpp      |  10 +-
 .../swr/rasterizer/common/simdlib_256_avx.inl      | 130 +++++++++----------
 .../swr/rasterizer/common/simdlib_256_avx2.inl     |  32 ++---
 .../swr/rasterizer/common/simdlib_512_emu.inl      | 143 +++++++++++----------
 .../swr/rasterizer/common/simdlib_types.hpp        |  76 +++++------
 .../drivers/swr/rasterizer/core/backend_impl.h     |  12 +-
 src/gallium/drivers/swr/rasterizer/core/binner.cpp |  36 +++---
 src/gallium/drivers/swr/rasterizer/core/binner.h   |   4 +-
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   |  12 +-
 src/gallium/drivers/swr/rasterizer/core/clip.h     |  66 +++++-----
 src/gallium/drivers/swr/rasterizer/core/context.h  |   8 +-
 .../drivers/swr/rasterizer/core/depthstencil.h     |  12 +-
 .../swr/rasterizer/core/format_conversion.h        |  18 ++-
 .../drivers/swr/rasterizer/core/format_types.h     |  71 +++++-----
 .../drivers/swr/rasterizer/core/frontend.cpp       |   4 +-
 src/gallium/drivers/swr/rasterizer/core/frontend.h |  12 +-
 src/gallium/drivers/swr/rasterizer/core/state.h    |   2 +-
 18 files changed, 339 insertions(+), 323 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index f4b9e10..fce360d 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -184,7 +184,7 @@ typedef SIMD256                             SIMD;
 #define _simd_vmask_ps                      SIMD::vmask_ps
 
 template<int mask> SIMDINLINE
-SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b)
+SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const &a, SIMD128::Integer const &b)
 {
     return SIMD128::castps_si(SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
 }
@@ -242,7 +242,7 @@ void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Compute plane equation vA * vX + vB * vY + vC
-SIMDINLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
+SIMDINLINE simdscalar vplaneps(simdscalar const &vA, simdscalar const &vB, simdscalar const &vC, simdscalar const &vX, simdscalar const &vY)
 {
     simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
     vOut = _simd_fmadd_ps(vB, vY, vOut);
@@ -251,7 +251,7 @@ SIMDINLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simd
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Compute plane equation vA * vX + vB * vY + vC
-SIMDINLINE simd4scalar vplaneps(simd4scalar vA, simd4scalar vB, simd4scalar vC, simd4scalar &vX, simd4scalar &vY)
+SIMDINLINE simd4scalar vplaneps(simd4scalar const &vA, simd4scalar const &vB, simd4scalar const &vC, simd4scalar const &vX, simd4scalar const &vY)
 {
     simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
     vOut = _simd128_fmadd_ps(vB, vY, vOut);
@@ -264,7 +264,7 @@ SIMDINLINE simd4scalar vplaneps(simd4scalar vA, simd4scalar vB, simd4scalar vC,
 /// @param vJ - barycentric J
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
+static SIMDINLINE simdscalar InterpolateComponent(simdscalar const &vI, simdscalar const &vJ, const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
@@ -299,7 +299,7 @@ static SIMDINLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer
 /// @param vJ - barycentric J
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar vI, simd4scalar vJ, const float *pInterpBuffer)
+static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const &vI, simd4scalar const &vJ, const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
@@ -315,13 +315,13 @@ static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar vI, simd4scalar v
     return vplaneps(vA, vB, vC, vI, vJ);
 }
 
-static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar a)
+static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const &a)
 {
     simd4scalari ai = SIMD128::castps_si(a);
     return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
 }
 
-static SIMDINLINE simdscalar _simd_abs_ps(simdscalar a)
+static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const &a)
 {
     simdscalari ai = _simd_castps_si(a);
     return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
index a4b5854..22d7da4 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -225,7 +225,7 @@ struct SIMDBase : Traits::IsaImpl
     }
 
     static SIMDINLINE
-    void vec4_set1_vps(Vec4& r, Float s)
+    void vec4_set1_vps(Vec4& r, Float const &s)
     {
         r[0] = s;
         r[1] = s;
@@ -285,7 +285,7 @@ struct SIMDBase : Traits::IsaImpl
     }
 
     static SIMDINLINE
-    void vec4_mul_ps(Vec4& r, const Vec4& v, Float s)
+    void vec4_mul_ps(Vec4& r, const Vec4& v, Float const &s)
     {
         r[0] = SIMD::mul_ps(v[0], s);
         r[1] = SIMD::mul_ps(v[1], s);
@@ -303,7 +303,7 @@ struct SIMDBase : Traits::IsaImpl
     }
 
     static SIMDINLINE
-    void vec4_add_ps(Vec4& r, const Vec4& v0, Float s)
+    void vec4_add_ps(Vec4& r, const Vec4& v0, Float const &s)
     {
         r[0] = SIMD::add_ps(v0[0], s);
         r[1] = SIMD::add_ps(v0[1], s);
@@ -321,7 +321,7 @@ struct SIMDBase : Traits::IsaImpl
     }
 
     static SIMDINLINE
-    void vec4_min_ps(Vec4& r, const Vec4& v0, Float s)
+    void vec4_min_ps(Vec4& r, const Vec4& v0, Float const &s)
     {
         r[0] = SIMD::min_ps(v0[0], s);
         r[1] = SIMD::min_ps(v0[1], s);
@@ -330,7 +330,7 @@ struct SIMDBase : Traits::IsaImpl
     }
 
     static SIMDINLINE
-    void vec4_max_ps(Vec4& r, const Vec4& v0, Float s)
+    void vec4_max_ps(Vec4& r, const Vec4& v0, Float const &s)
     {
         r[0] = SIMD::max_ps(v0[0], s);
         r[1] = SIMD::max_ps(v0[1], s);
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
index 7708611..42b4552 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
@@ -31,78 +31,78 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 //============================================================================
 
 #define SIMD_WRAPPER_1(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    static SIMDINLINE Float SIMDCALL op(Float const &a)   \
     {\
         return _mm256_##op(a);\
     }
 
 #define SIMD_WRAPPER_2(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
     {\
         return _mm256_##op(a, b);\
     }
 
 #define SIMD_DWRAPPER_2(op)  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b)   \
     {\
         return _mm256_##op(a, b);\
     }
 
 #define SIMD_WRAPPER_2I(op)  \
     template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
     {\
         return  _mm256_##op(a, b, ImmT);\
     }
 
 #define SIMD_DWRAPPER_2I(op)  \
     template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b)   \
     {\
         return _mm256_##op(a, b, ImmT);\
     }
 
 #define SIMD_WRAPPER_3(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c)   \
     {\
         return _mm256_##op(a, b, c);\
     }
 
 #define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
     {\
         return _mm256_##op(a);\
     }
 
 #define SIMD_IWRAPPER_2(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return _mm256_##op(a, b);\
     }
 
 #define SIMD_IFWRAPPER_2(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
     }
 
 #define SIMD_IFWRAPPER_2I(op, intrin)  \
     template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
     }
 
 #define SIMD_IWRAPPER_2I_(op, intrin)  \
     template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return _mm256_##intrin(a, b, ImmT);\
     }
 #define SIMD_IWRAPPER_2I(op)  SIMD_IWRAPPER_2I_(op, op)
 
 #define SIMD_IWRAPPER_3(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c)   \
     {\
         return _mm256_##op(a, b, c);\
     }
@@ -110,7 +110,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 // emulated integer simd
 #define SIMD_EMU_IWRAPPER_1(op) \
     static SIMDINLINE \
-    Integer SIMDCALL op(Integer a)\
+    Integer SIMDCALL op(Integer const &a)\
     {\
         return Integer\
         {\
@@ -120,7 +120,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
     }
 #define SIMD_EMU_IWRAPPER_1L(op, shift) \
     static SIMDINLINE \
-    Integer SIMDCALL op(Integer a)\
+    Integer SIMDCALL op(Integer const &a)\
     {\
         return Integer \
         {\
@@ -129,7 +129,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
         };\
     }\
     static SIMDINLINE \
-    Integer SIMDCALL op(SIMD128Impl::Integer a)\
+    Integer SIMDCALL op(SIMD128Impl::Integer const &a)\
     {\
         return Integer \
         {\
@@ -140,7 +140,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 
 #define SIMD_EMU_IWRAPPER_1I(op) \
     template <int ImmT> static SIMDINLINE \
-    Integer SIMDCALL op(Integer a)\
+    Integer SIMDCALL op(Integer const &a)\
     {\
         return Integer\
         {\
@@ -151,7 +151,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 
 #define SIMD_EMU_IWRAPPER_2(op) \
     static SIMDINLINE \
-    Integer SIMDCALL op(Integer a, Integer b)\
+    Integer SIMDCALL op(Integer const &a, Integer const &b)\
     {\
         return Integer\
         {\
@@ -162,7 +162,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 
 #define SIMD_EMU_IWRAPPER_2I(op) \
     template <int ImmT> static SIMDINLINE \
-    Integer SIMDCALL op(Integer a, Integer b)\
+    Integer SIMDCALL op(Integer const &a, Integer const &b)\
     {\
         return Integer\
         {\
@@ -177,12 +177,12 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 SIMD_WRAPPER_2(add_ps);     // return a + b
 SIMD_WRAPPER_2(div_ps);     // return a / b
 
-static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float const &c) // return (a * b) + c
 {
     return add_ps(mul_ps(a, b), c);
 }
 
-static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float const &a, Float const &b, Float const &c) // return (a * b) - c
 {
     return sub_ps(mul_ps(a, b), c);
 }
@@ -195,13 +195,13 @@ SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
 SIMD_WRAPPER_2(sub_ps);     // return a - b
 
 template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float a)
+static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
 {
     return _mm256_round_ps(a, static_cast<int>(RMT));
 }
 
-static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
-static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
 
 //-----------------------------------------------------------------------
 // Integer (various width) arithmetic operations
@@ -243,7 +243,7 @@ SIMD_EMU_IWRAPPER_2(xor_si);    // return a ^ b       (int)
 //-----------------------------------------------------------------------
 SIMD_EMU_IWRAPPER_1I(slli_epi32);               // return a << ImmT
 
-static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vCount) // return a << b      (uint32)
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const &vA, Integer const &vCount) // return a << b      (uint32)
 {
     int32_t aHi, aLow, countHi, countLow;
     __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
@@ -302,12 +302,12 @@ SIMD_EMU_IWRAPPER_1I(srli_epi32);   // return a >> ImmT   (uint32)
 SIMD_EMU_IWRAPPER_1I(srli_si);      // return a >> (ImmT*8) (uint)
 
 template<int ImmT>                              // same as srli_si, but with Float cast to int
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)
 {
     return castsi_ps(srli_si<ImmT>(castps_si(a)));
 }
 
-static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vCount) // return a >> b      (uint32)
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const &vA, Integer const &vCount) // return a >> b      (uint32)
 {
     int32_t aHi, aLow, countHi, countLow;
     __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
@@ -366,37 +366,37 @@ static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vCount) // ret
 //-----------------------------------------------------------------------
 // Conversion operations
 //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a)   // return *(Float*)(&a)
 {
     return _mm256_castpd_ps(a);
 }
 
-static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castps_si(Float const &a)   // return *(Integer*)(&a)
 {
     return _mm256_castps_si256(a);
 }
 
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a)   // return *(Double*)(&a)
 {
     return _mm256_castsi256_pd(a);
 }
 
-static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castps_pd(Float const &a)   // return *(Double*)(&a)
 {
     return _mm256_castps_pd(a);
 }
 
-static SIMDINLINE Integer SIMDCALL castpd_si(Double a)   // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castpd_si(Double const &a)   // return *(Integer*)(&a)
 {
     return _mm256_castpd_si256(a);
 }
 
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a)   // return *(Float*)(&a)
 {
     return _mm256_castsi256_ps(a);
 }
 
-static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a    (int32 --> float)
 {
     return _mm256_cvtepi32_ps(a);
 }
@@ -407,12 +407,12 @@ SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8);                 // return (int32)a    (
 SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4);                 // return (int64)a    (uint16 --> int64)
 SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8);                 // return (int64)a    (uint32 --> int64)
 
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a)            // return (int32)a    (float --> int32)
 {
     return _mm256_cvtps_epi32(a);
 }
 
-static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a)           // return (int32)a    (rnd_to_zero(float) --> int32)
 {
     return _mm256_cvttps_epi32(a);
 }
@@ -421,16 +421,16 @@ static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (in
 // Comparison operations
 //-----------------------------------------------------------------------
 template<CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
 {
     return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
 }
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
 
 SIMD_EMU_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
 SIMD_EMU_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
@@ -442,12 +442,12 @@ SIMD_EMU_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
 SIMD_EMU_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
 SIMD_EMU_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
 
-static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
 {
     return  0 != _mm256_testz_ps(a, b);
 }
 
-static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+static SIMDINLINE bool SIMDCALL testz_si(Integer const &a, Integer const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
 {
     return  0 != _mm256_testz_si256(a, b);
 }
@@ -459,12 +459,12 @@ SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
 SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps);  // return ImmT ? b : a  (int32)
 SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
 
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
 {
     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
 }
 
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
 {
     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
 }
@@ -479,7 +479,7 @@ SIMD_EMU_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32
 SIMD_EMU_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
 SIMD_EMU_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
 
-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
 {
     Integer result;
 
@@ -496,7 +496,7 @@ static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // ret
     return result;
 }
 
-static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
 {
     Float result;
 
@@ -521,7 +521,7 @@ SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
 SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
 
 template<int ImmT>
-static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const &a, Integer const &b)
 {
     return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
 }
@@ -545,7 +545,7 @@ SIMD_WRAPPER_2(unpacklo_ps);
 // Load / store operations
 //-----------------------------------------------------------------------
 template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
     uint32_t *pOffsets = (uint32_t*)&idx;
     Float vResult;
@@ -587,7 +587,7 @@ static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (s
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
 template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
 {
     uint32_t *pOffsets = (uint32_t*)&idx;
     Float vResult = old;
@@ -605,22 +605,22 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In
     return vResult;
 }
 
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
 {
     _mm256_maskstore_ps(p, mask, src);
 }
 
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const &a)
 {
     return SIMD128T::movemask_epi8(a.v4[0]) |
            (SIMD128T::movemask_epi8(a.v4[1]) << 16);
 }
 
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
 {
     return static_cast<uint32_t>(_mm256_movemask_pd(a));
 }
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
 {
     return static_cast<uint32_t>(_mm256_movemask_ps(a));
 }
@@ -650,17 +650,17 @@ static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
     return _mm256_setzero_si256();
 }
 
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a)    // *p = a   (stores all elements contiguously in memory)
 {
     _mm256_store_ps(p, a);
 }
 
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a)   // *p = a
 {
     _mm256_store_si256(&p->v, a);
 }
 
-static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
 {
     _mm256_stream_ps(p, a);
 }
@@ -675,37 +675,37 @@ static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p)
 }
 
 template<int ImmT>
-static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double a)
+static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const &a)
 {
     return _mm256_extractf128_pd(a, ImmT);
 }
 
 template<int ImmT>
-static SIMDINLINE SIMD128Impl::Float  SIMDCALL extractf128_ps(Float a)
+static SIMDINLINE SIMD128Impl::Float  SIMDCALL extractf128_ps(Float const &a)
 {
     return _mm256_extractf128_ps(a, ImmT);
 }
 
 template<int ImmT>
-static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer a)
+static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const &a)
 {
     return _mm256_extractf128_si256(a, ImmT);
 }
 
 template<int ImmT>
-static SIMDINLINE Double SIMDCALL insertf128_pd(Double a, SIMD128Impl::Double b)
+static SIMDINLINE Double SIMDCALL insertf128_pd(Double const &a, SIMD128Impl::Double const &b)
 {
     return _mm256_insertf128_pd(a, b, ImmT);
 }
 
 template<int ImmT>
-static SIMDINLINE Float SIMDCALL insertf128_ps(Float a, SIMD128Impl::Float b)
+static SIMDINLINE Float SIMDCALL insertf128_ps(Float const &a, SIMD128Impl::Float const &b)
 {
     return _mm256_insertf128_ps(a, b, ImmT);
 }
 
 template<int ImmT>
-static SIMDINLINE Integer SIMDCALL insertf128_si(Integer a, SIMD128Impl::Integer b)
+static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const &a, SIMD128Impl::Integer const &b)
 {
     return _mm256_insertf128_si256(a, b, ImmT);
 }
@@ -736,7 +736,7 @@ static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4,
     return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
 }
 
-static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer src)
+static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer const &src)
 {
     _mm256_storeu2_m128i(&phi->v, &plo->v, src);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
index 0a81203..9cd0a64 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
@@ -33,53 +33,53 @@
 //============================================================================
 
 #define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
     {\
         return _mm256_##op(a);\
     }
 
 #define SIMD_IWRAPPER_1L(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
     {\
         return _mm256_##op(_mm256_castsi256_si128(a));\
     }\
 
 #define SIMD_IWRAPPER_1I(op)  \
     template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
     {\
         return _mm256_##op(a, ImmT);\
     }
 
 #define SIMD_IWRAPPER_1I_(op, intrin)  \
     template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
     {\
         return _mm256_##intrin(a, ImmT);\
     }
 
 #define SIMD_IWRAPPER_2_(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return _mm256_##intrin(a, b);\
     }
 
 #define SIMD_IWRAPPER_2(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return _mm256_##op(a, b);\
     }
 
 #define SIMD_IWRAPPER_2I(op)  \
     template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return _mm256_##op(a, b, ImmT);\
     }
 
 #define SIMD_IWRAPPER_2I(op)  \
     template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return _mm256_##op(a, b, ImmT);\
     }
@@ -87,7 +87,7 @@
 //-----------------------------------------------------------------------
 // Floating point arithmetic operations
 //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c)   // return (a * b) + c
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float const &c)   // return (a * b) + c
 {
     return _mm256_fmadd_ps(a, b, c);
 }
@@ -134,7 +134,7 @@ SIMD_IWRAPPER_2(srlv_epi32);                // return a >> b      (uint32)
 SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
 
 template<int ImmT>                          // same as srli_si, but with Float cast to int
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)
 {
     return castsi_ps(srli_si<ImmT>(castps_si(a)));
 }
@@ -161,7 +161,7 @@ SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
 SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
 SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
 
-static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const &a, Integer const &b)   // return a < b (int32)
 {
     return cmpgt_epi32(b, a);
 }
@@ -176,14 +176,14 @@ SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and
 SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
 SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
 
-static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
 {
     return _mm256_permutevar8x32_ps(a, swiz);
 }
 
 SIMD_IWRAPPER_1I(shuffle_epi32);
 template<int ImmT>
-static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const &a, Integer const &b)
 {
     return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
 }
@@ -201,21 +201,21 @@ SIMD_IWRAPPER_2(unpacklo_epi8);
 // Load / store operations
 //-----------------------------------------------------------------------
 template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
     return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
 }
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
 template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
 {
 	// g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
 	// Only for this intrinsic - not sure why. :(
     return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
 }
 
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const &a)
 {
     return static_cast<uint32_t>(_mm256_movemask_epi8(a));
 }
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
index c414d75..d6af7b1 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -32,7 +32,7 @@ static const int TARGET_SIMD_WIDTH = 8;
 using SIMD128T = SIMD128Impl::AVXImpl;
 
 #define SIMD_WRAPPER_1(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    static SIMDINLINE Float SIMDCALL op(Float const &a)   \
     {\
         return Float\
         {\
@@ -42,7 +42,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
     }
 
 #define SIMD_WRAPPER_2(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)    \
     {\
         return Float\
         {\
@@ -53,7 +53,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 
 #define SIMD_WRAPPER_2I(op)  \
     template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
     {\
         return Float\
         {\
@@ -64,7 +64,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 
 #define SIMD_WRAPPER_2I_1(op)  \
     template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
     {\
         return Float\
         {\
@@ -74,7 +74,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
     }
 
 #define SIMD_WRAPPER_3(op)  \
-        static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+        static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c)   \
         {\
             return Float\
             {\
@@ -84,7 +84,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
         }
 
 #define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
     {\
         return Integer\
         {\
@@ -94,7 +94,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
     }
 
 #define SIMD_IWRAPPER_2(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return Integer\
         {\
@@ -105,7 +105,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 
 #define SIMD_IWRAPPER_2I(op)  \
     template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return Integer\
         {\
@@ -116,7 +116,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 
 #define SIMD_IWRAPPER_2I_1(op)  \
     template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return Integer\
         {\
@@ -127,7 +127,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
 
 #define SIMD_IWRAPPER_2I_2(op)  \
     template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
     {\
         return Integer\
         {\
@@ -137,7 +137,7 @@ using SIMD128T = SIMD128Impl::AVXImpl;
     }
 
 #define SIMD_IWRAPPER_3(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c)   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c)   \
     {\
         return Integer\
         {\
@@ -161,7 +161,7 @@ SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
 SIMD_WRAPPER_2(sub_ps);     // return a - b
 
 template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float a)
+static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
 {
     return Float
     {
@@ -170,8 +170,8 @@ static SIMDINLINE Float SIMDCALL round_ps(Float a)
     };
 }
 
-static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
-static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
 
 //-----------------------------------------------------------------------
 // Integer (various width) arithmetic operations
@@ -212,7 +212,7 @@ SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)
 // Shift operations
 //-----------------------------------------------------------------------
 template<int ImmT>
-static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a)      // return a << ImmT
+static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const &a)      // return a << ImmT
 {
     return Integer
     {
@@ -224,7 +224,7 @@ static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a)      // return a << Imm
 SIMD_IWRAPPER_2(sllv_epi32);                                // return a << b      (uint32)
 
 template<int ImmT>
-static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a)      // return a >> ImmT   (int32)
+static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const &a)      // return a >> ImmT   (int32)
 {
     return Integer
     {
@@ -234,7 +234,7 @@ static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a)      // return a >> Imm
 }
 
 template<int ImmT>
-static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a)      // return a >> ImmT   (uint32)
+static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const &a)      // return a >> ImmT   (uint32)
 {
     return Integer
     {
@@ -244,7 +244,7 @@ static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a)      // return a >> Imm
 }
 
 template<int ImmT>                                          // for each 128-bit lane:
-static SIMDINLINE Integer SIMDCALL srli_si(Integer a)         //  return a >> (ImmT*8) (uint)
+static SIMDINLINE Integer SIMDCALL srli_si(Integer const &a)         //  return a >> (ImmT*8) (uint)
 {
     return Integer
     {
@@ -253,7 +253,7 @@ static SIMDINLINE Integer SIMDCALL srli_si(Integer a)         //  return a >> (I
     };
 }
 template<int ImmT>
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)       // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)       // same as srli_si, but with Float cast to int
 {
     return Float
     {
@@ -267,7 +267,7 @@ SIMD_IWRAPPER_2(srlv_epi32);                                // return a >> b
 //-----------------------------------------------------------------------
 // Conversion operations
 //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double a)              // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a)              // return *(Float*)(&a)
 {
     return Float
     {
@@ -276,7 +276,7 @@ static SIMDINLINE Float SIMDCALL castpd_ps(Double a)              // return *(Fl
     };
 }
 
-static SIMDINLINE Integer SIMDCALL castps_si(Float a)              // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castps_si(Float const &a)              // return *(Integer*)(&a)
 {
     return Integer
     {
@@ -285,7 +285,7 @@ static SIMDINLINE Integer SIMDCALL castps_si(Float a)              // return *(I
     };
 }
 
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)              // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a)              // return *(Double*)(&a)
 {
     return Double
     {
@@ -294,7 +294,7 @@ static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)              // return *(
     };
 }
 
-static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castps_pd(Float const &a)   // return *(Double*)(&a)
 {
     return Double
     {
@@ -303,7 +303,7 @@ static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
     };
 }
 
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)              // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a)              // return *(Float*)(&a)
 {
     return Float
     {
@@ -312,7 +312,7 @@ static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)              // return *(F
     };
 }
 
-static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a)            // return (float)a    (int32 --> float)
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a)            // return (float)a    (int32 --> float)
 {
     return Float
     {
@@ -321,7 +321,7 @@ static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a)            // return (fl
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a)          // return (int16)a    (uint8 --> int16)
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const &a)          // return (int16)a    (uint8 --> int16)
 {
     return Integer
     {
@@ -330,7 +330,7 @@ static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a)
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a)          // return (int32)a    (uint8 --> int32)
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const &a)          // return (int32)a    (uint8 --> int32)
 {
     return Integer
 	{
@@ -339,7 +339,7 @@ static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a)
 	};
 }
 
-static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a)         // return (int32)a    (uint16 --> int32)
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const &a)         // return (int32)a    (uint16 --> int32)
 {
     return Integer
     {
@@ -348,7 +348,7 @@ static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a)
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a)         // return (int64)a    (uint16 --> int64)
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const &a)         // return (int64)a    (uint16 --> int64)
 {
     return Integer
     {
@@ -357,7 +357,7 @@ static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a)
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a)         // return (int64)a    (uint32 --> int64)
+static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const &a)         // return (int64)a    (uint32 --> int64)
 {
     return Integer
     {
@@ -366,7 +366,7 @@ static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a)
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a)            // return (int32)a    (float --> int32)
 {
     return Integer
     {
@@ -375,7 +375,7 @@ static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (in
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a)           // return (int32)a    (rnd_to_zero(float) --> int32)
 {
     return Integer
     {
@@ -388,7 +388,7 @@ static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (in
 // Comparison operations
 //-----------------------------------------------------------------------
 template<CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
 {
     return Float
     {
@@ -396,15 +396,15 @@ static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT)
         SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
     };
 }
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
 
 template<CompareType CmpTypeT>
-static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const &a, Float const &b)
 {
     return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
 }
@@ -420,13 +420,13 @@ SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
 SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
 SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
 
-static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
 {
     return  0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
                   SIMD256T::testz_ps(a.v8[1], b.v8[1]));
 }
 
-static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+static SIMDINLINE int SIMDCALL testz_si(Integer const &a, Integer const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
 {
     return  0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
                   SIMD256T::testz_si(a.v8[1], b.v8[1]));
@@ -438,7 +438,7 @@ static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b)  // return all_lan
 SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
 SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
 SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
 {
     return Integer
     {
@@ -447,7 +447,7 @@ static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask
     };
 }
 
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
 {
     return Integer
     {
@@ -467,48 +467,51 @@ static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)         // return
 }
 
 template<int imm>
-static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const &a)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
     return a.v8[imm];
 }
 
 template<int imm>
-static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const &a)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
     return a.v8[imm];
 }
 
 template<int imm>
-static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const &a)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
     return a.v8[imm];
 }
 
 template<int imm>
-static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
+static SIMDINLINE Float SIMDCALL insert_ps(Float const &a, SIMD256Impl::Float const &b)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    a.v8[imm] = b;
-    return a;
+    Float r = a;
+    r.v8[imm] = b;
+    return r;
 }
 
 template<int imm>
-static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
+static SIMDINLINE Double SIMDCALL insert_pd(Double const &a, SIMD256Impl::Double const &b)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    a.v8[imm] = b;
-    return a;
+    Double r = a;
+    r.v8[imm] = b;
+    return r;
 }
 
 template<int imm>
-static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
+static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Integer const &b)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    a.v8[imm] = b;
-    return a;
+    Integer r = a;
+    r.v8[imm] = b;
+    return r;
 }
 
 SIMD_IWRAPPER_2(packs_epi16);      // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
@@ -516,7 +519,7 @@ SIMD_IWRAPPER_2(packs_epi32);      // See documentation for _mm256_packs_epi32 a
 SIMD_IWRAPPER_2(packus_epi16);     // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
 SIMD_IWRAPPER_2(packus_epi32);     // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
 
-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
 {
     Integer result;
 
@@ -533,7 +536,7 @@ static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // ret
     return result;
 }
 
-static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
 {
     Float result;
 
@@ -573,7 +576,7 @@ static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return
 // AVX instructions for emulation.
 //
 template <int shuf>
-static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
+static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const &a, Float const &b)
 {
     return Float
     {
@@ -583,7 +586,7 @@ static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
 }
 
 template <int shuf>
-static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
+static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const &a, Double const &b)
 {
     return Double
     {
@@ -593,7 +596,7 @@ static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
 }
 
 template <int shuf>
-static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
+static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const &a, Integer const &b)
 {
     return Integer
 	{
@@ -624,7 +627,7 @@ SIMD_WRAPPER_2(unpacklo_ps);
 // Load / store operations
 //-----------------------------------------------------------------------
 template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
     return Float
     {
@@ -676,7 +679,7 @@ static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (s
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
 template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
 {
     return Float
     {
@@ -685,13 +688,13 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In
     };
 }
 
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
 {
     SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
     SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
 }
 
-static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const &a)
 {
     uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
              mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
@@ -699,14 +702,14 @@ static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
     return mask;
 }
 
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
 {
     uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
              mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
 
     return mask;
 }
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
 {
     uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
              mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
@@ -759,19 +762,19 @@ static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
     };
 }
 
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a)    // *p = a   (stores all elements contiguously in memory)
 {
     SIMD256T::store_ps(p, a.v8[0]);
     SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
 }
 
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a)   // *p = a
 {
     SIMD256T::store_si(&p->v8[0], a.v8[0]);
     SIMD256T::store_si(&p->v8[1], a.v8[1]);
 }
 
-static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
 {
     SIMD256T::stream_ps(p, a.v8[0]);
     SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
index 236257f..0fad0e1 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
@@ -153,9 +153,9 @@ namespace SIMDImpl
             Float  z;
             Float  w;
         };
-        SIMDINLINE Float& operator[] (const int i) { return v[i]; }
-        SIMDINLINE Float const & operator[] (const int i) const { return v[i]; }
-        SIMDINLINE Vec4& operator=(Vec4 const & in)
+        SIMDINLINE Float& SIMDCALL operator[] (const int i) { return v[i]; }
+        SIMDINLINE Float const & SIMDCALL operator[] (const int i) const { return v[i]; }
+        SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const & in)
         {
             v[0] = in.v[0];
             v[1] = in.v[1];
@@ -171,9 +171,9 @@ namespace SIMDImpl
         {
             SIMDINLINE Float() = default;
             SIMDINLINE Float(__m128 in) : v(in) {}
-            SIMDINLINE Float& operator=(__m128 in) { v = in; return *this; }
-            SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
-            SIMDINLINE operator __m128() const { return v; }
+            SIMDINLINE Float& SIMDCALL operator=(__m128 in) { v = in; return *this; }
+            SIMDINLINE Float& SIMDCALL operator=(Float const & in) { v = in.v; return *this; }
+            SIMDINLINE SIMDCALL operator __m128() const { return v; }
 
             SIMDALIGN(__m128, 16) v;
         };
@@ -182,9 +182,10 @@ namespace SIMDImpl
         {
             SIMDINLINE Integer() = default;
             SIMDINLINE Integer(__m128i in) : v(in) {}
-            SIMDINLINE Integer& operator=(__m128i in) { v = in; return *this; }
-            SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
-            SIMDINLINE operator __m128i() const { return v; }
+            SIMDINLINE Integer& SIMDCALL operator=(__m128i in) { v = in; return *this; }
+            SIMDINLINE Integer& SIMDCALL operator=(Integer const & in) { v = in.v; return *this; }
+            SIMDINLINE SIMDCALL operator __m128i() const { return v; }
+
             SIMDALIGN(__m128i, 16) v;
         };
 
@@ -192,9 +193,10 @@ namespace SIMDImpl
         {
             SIMDINLINE Double() = default;
             SIMDINLINE Double(__m128d in) : v(in) {}
-            SIMDINLINE Double& operator=(__m128d in) { v = in; return *this; }
-            SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
-            SIMDINLINE operator __m128d() const { return v; }
+            SIMDINLINE Double& SIMDCALL operator=(__m128d in) { v = in; return *this; }
+            SIMDINLINE Double& SIMDCALL operator=(Double const & in) { v = in.v; return *this; }
+            SIMDINLINE SIMDCALL operator __m128d() const { return v; }
+
             SIMDALIGN(__m128d, 16) v;
         };
 
@@ -210,13 +212,13 @@ namespace SIMDImpl
         {
             SIMDINLINE Float() = default;
             SIMDINLINE Float(__m256 in) : v(in) {}
-            SIMDINLINE Float(SIMD128Impl::Float in_lo, SIMD128Impl::Float in_hi = _mm_setzero_ps())
+            SIMDINLINE Float(SIMD128Impl::Float const &in_lo, SIMD128Impl::Float const &in_hi = _mm_setzero_ps())
             {
                 v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
             }
-            SIMDINLINE Float& operator=(__m256 in) { v = in; return *this; }
-            SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
-            SIMDINLINE operator __m256() const { return v; }
+            SIMDINLINE Float& SIMDCALL operator=(__m256 in) { v = in; return *this; }
+            SIMDINLINE Float& SIMDCALL operator=(Float const & in) { v = in.v; return *this; }
+            SIMDINLINE SIMDCALL operator __m256() const { return v; }
 
             SIMDALIGN(__m256, 32) v;
             SIMD128Impl::Float v4[2];
@@ -226,13 +228,13 @@ namespace SIMDImpl
         {
             SIMDINLINE Integer() = default;
             SIMDINLINE Integer(__m256i in) : v(in) {}
-            SIMDINLINE Integer(SIMD128Impl::Integer in_lo, SIMD128Impl::Integer in_hi = _mm_setzero_si128())
+            SIMDINLINE Integer(SIMD128Impl::Integer const &in_lo, SIMD128Impl::Integer const &in_hi = _mm_setzero_si128())
             {
                 v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
             }
-            SIMDINLINE Integer& operator=(__m256i in) { v = in; return *this; }
-            SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
-            SIMDINLINE operator __m256i() const { return v; }
+            SIMDINLINE Integer& SIMDCALL operator=(__m256i in) { v = in; return *this; }
+            SIMDINLINE Integer& SIMDCALL operator=(Integer const & in) { v = in.v; return *this; }
+            SIMDINLINE SIMDCALL operator __m256i() const { return v; }
 
             SIMDALIGN(__m256i, 32) v;
             SIMD128Impl::Integer v4[2];
@@ -241,14 +243,14 @@ namespace SIMDImpl
         union Double
         {
             SIMDINLINE Double() = default;
-            SIMDINLINE Double(__m256d in) : v(in) {}
-            SIMDINLINE Double(SIMD128Impl::Double in_lo, SIMD128Impl::Double in_hi = _mm_setzero_pd())
+            SIMDINLINE Double(__m256d const &in) : v(in) {}
+            SIMDINLINE Double(SIMD128Impl::Double const &in_lo, SIMD128Impl::Double const &in_hi = _mm_setzero_pd())
             {
                 v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
             }
-            SIMDINLINE Double& operator=(__m256d in) { v = in; return *this; }
-            SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
-            SIMDINLINE operator __m256d() const { return v; }
+            SIMDINLINE Double& SIMDCALL operator=(__m256d in) { v = in; return *this; }
+            SIMDINLINE Double& SIMDCALL operator=(Double const & in) { v = in.v; return *this; }
+            SIMDINLINE SIMDCALL operator __m256d() const { return v; }
 
             SIMDALIGN(__m256d, 32) v;
             SIMD128Impl::Double v4[2];
@@ -303,9 +305,9 @@ namespace SIMDImpl
         {
             SIMDINLINE Float() = default;
             SIMDINLINE Float(__m512 in) : v(in) {}
-            SIMDINLINE Float(SIMD256Impl::Float in_lo, SIMD256Impl::Float in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
-            SIMDINLINE Float& operator=(__m512 in) { v = in; return *this; }
-            SIMDINLINE Float& operator=(Float const & in)
+            SIMDINLINE Float(SIMD256Impl::Float const &in_lo, SIMD256Impl::Float const &in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
+            SIMDINLINE Float& SIMDCALL operator=(__m512 in) { v = in; return *this; }
+            SIMDINLINE Float& SIMDCALL operator=(Float const & in)
             {
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
                 v = in.v;
@@ -315,7 +317,7 @@ namespace SIMDImpl
 #endif
                 return *this;
             }
-            SIMDINLINE operator __m512() const { return v; }
+            SIMDINLINE SIMDCALL operator __m512() const { return v; }
 
             SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
             SIMD256Impl::Float v8[2];
@@ -325,9 +327,9 @@ namespace SIMDImpl
         {
             SIMDINLINE Integer() = default;
             SIMDINLINE Integer(__m512i in) : v(in) {}
-            SIMDINLINE Integer(SIMD256Impl::Integer in_lo, SIMD256Impl::Integer in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
-            SIMDINLINE Integer& operator=(__m512i in) { v = in; return *this; }
-            SIMDINLINE Integer& operator=(Integer const & in)
+            SIMDINLINE Integer(SIMD256Impl::Integer const &in_lo, SIMD256Impl::Integer const &in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
+            SIMDINLINE Integer& SIMDCALL operator=(__m512i in) { v = in; return *this; }
+            SIMDINLINE Integer& SIMDCALL operator=(Integer const & in)
             {
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
                 v = in.v;
@@ -338,7 +340,7 @@ namespace SIMDImpl
                 return *this;
             }
 
-            SIMDINLINE operator __m512i() const { return v; }
+            SIMDINLINE SIMDCALL operator __m512i() const { return v; }
 
             SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
             SIMD256Impl::Integer v8[2];
@@ -348,9 +350,9 @@ namespace SIMDImpl
         {
             SIMDINLINE Double() = default;
             SIMDINLINE Double(__m512d in) : v(in) {}
-            SIMDINLINE Double(SIMD256Impl::Double in_lo, SIMD256Impl::Double in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
-            SIMDINLINE Double& operator=(__m512d in) { v = in; return *this; }
-            SIMDINLINE Double& operator=(Double const & in)
+            SIMDINLINE Double(SIMD256Impl::Double const &in_lo, SIMD256Impl::Double const &in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
+            SIMDINLINE Double& SIMDCALL operator=(__m512d in) { v = in; return *this; }
+            SIMDINLINE Double& SIMDCALL operator=(Double const & in)
             {
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
                 v = in.v;
@@ -361,7 +363,7 @@ namespace SIMDImpl
                 return *this;
             }
 
-            SIMDINLINE operator __m512d() const { return v; }
+            SIMDINLINE SIMDCALL operator __m512d() const { return v; }
 
             SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
             SIMD256Impl::Double v8[2];
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
index 97ca0ef..0f430ef 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -50,7 +50,7 @@ static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}
 #define MASK 0xff
 #endif
 
-static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
+static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar const &vI, simdscalar const &vJ)
 {
     simdscalar vClipMask = _simd_setzero_ps();
     uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
@@ -338,7 +338,7 @@ struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
 template<typename T>
 INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
                             const uint64_t *const coverageMask, const uint32_t sampleMask,
-                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+                            simdscalar const &vXSamplePosUL, simdscalar const &vYSamplePosUL)
 {
     uint32_t inputMask[KNOB_SIMD_WIDTH];
     generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
@@ -412,7 +412,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
 }
 
 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
-                                     const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+                                     const simdscalar &vXSamplePosUL, const simdscalar &vYSamplePosUL)
 {
     // evaluate I,J
     psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
@@ -424,7 +424,7 @@ INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CON
     psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
 }
 
-INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
+INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const &z, float minz, float maxz)
 {
     const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
     const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
@@ -711,7 +711,7 @@ static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_P
 
 // Merge Output to 4x2 SIMD Tile Format
 INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-    const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask)
+    const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask)
 {
     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
     const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
@@ -777,7 +777,7 @@ INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW
 #if USE_8x2_TILE_BACKEND
 // Merge Output to 8x2 SIMD16 Tile Format
 INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-    const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset)
+    const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset)
 {
     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
     uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index c1f0f07..9fe1b01 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -36,12 +36,12 @@
 #include "tilemgr.h"
 
 // Function Prototype
-void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
-void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
+void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx);
+void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx);
 
 #if USE_SIMD16_FRONTEND
-void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
-void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx);
+void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx);
 #endif
 
 //////////////////////////////////////////////////////////////////////////
@@ -433,7 +433,7 @@ void BinTriangles(
     uint32_t workerId,
     simdvector tri[3],
     uint32_t triMask,
-    simdscalari primID)
+    simdscalari const &primID)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -878,7 +878,7 @@ void SIMDCALL BinTriangles_simd16(
     uint32_t workerId,
     simd16vector tri[3],
     uint32_t triMask,
-    simd16scalari primID)
+    simd16scalari const &primID)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -1386,8 +1386,8 @@ void BinPostSetupPoints(
     uint32_t workerId,
     simdvector prim[],
     uint32_t primMask,
-    simdscalari primID,
-    simdscalari viewportIdx)
+    simdscalari const &primID,
+    simdscalari const &viewportIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -1703,7 +1703,7 @@ void BinPoints(
     uint32_t workerId,
     simdvector prim[3],
     uint32_t primMask,
-    simdscalari primID)
+    simdscalari const &primID)
 {
     simdvector& primVerts = prim[0];
 
@@ -1767,8 +1767,8 @@ void BinPostSetupPoints_simd16(
     uint32_t workerId,
     simd16vector prim[],
     uint32_t primMask,
-    simd16scalari primID,
-    simd16scalari viewportIdx)
+    simd16scalari const &primID,
+    simd16scalari const &viewportIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -2086,7 +2086,7 @@ void SIMDCALL BinPoints_simd16(
     uint32_t workerId,
     simd16vector prim[3],
     uint32_t primMask,
-    simd16scalari primID)
+    simd16scalari const &primID)
 {
     simd16vector& primVerts = prim[0];
 
@@ -2160,8 +2160,8 @@ void BinPostSetupLines(
     simdvector prim[],
     simdscalar recipW[],
     uint32_t primMask,
-    simdscalari primID,
-    simdscalari viewportIdx)
+    simdscalari const &primID,
+    simdscalari const &viewportIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -2365,8 +2365,8 @@ void BinPostSetupLines_simd16(
     simd16vector prim[],
     simd16scalar recipW[],
     uint32_t primMask,
-    simd16scalari primID,
-    simd16scalari viewportIdx)
+    simd16scalari const &primID,
+    simd16scalari const &viewportIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -2596,7 +2596,7 @@ void BinLines(
     uint32_t workerId,
     simdvector prim[],
     uint32_t primMask,
-    simdscalari primID)
+    simdscalari const &primID)
 {
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
@@ -2670,7 +2670,7 @@ void SIMDCALL BinLines_simd16(
     uint32_t workerId,
     simd16vector prim[3],
     uint32_t primMask,
-    simd16scalari primID)
+    simd16scalari const &primID)
 {
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h b/src/gallium/drivers/swr/rasterizer/core/binner.h
index 875e0b7..1616143 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.h
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.h
@@ -49,7 +49,7 @@ static const simd16scalar g_pixelOffsets_simd16[SWR_PIXEL_LOCATION_UL + 1] =
 /// @brief Convert the X,Y coords of a triangle to the requested Fixed 
 /// Point precision from FP32.
 template <typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
+INLINE simdscalari fpToFixedPointVertical(const simdscalar &vIn)
 {
     simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
     return _simd_cvtps_epi32(vFixed);
@@ -57,7 +57,7 @@ INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
 
 #if USE_SIMD16_FRONTEND
 template <typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE simd16scalari fpToFixedPointVertical(const simd16scalar vIn)
+INLINE simd16scalari fpToFixedPointVertical(const simd16scalar &vIn)
 {
     simd16scalar vFixed = _simd16_mul_ps(vIn, _simd16_set1_ps(PT::ScaleT::value));
     return _simd16_cvtps_epi32(vFixed);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index bf542f1..4b5512c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -160,7 +160,7 @@ int ClipTriToPlane( const float *pInPts, int numInPts,
     return i;
 }
 
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipTriangles, pDC->drawId);
@@ -169,7 +169,7 @@ void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvecto
     AR_END(FEClipTriangles, 1);
 }
 
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipLines, pDC->drawId);
@@ -178,7 +178,7 @@ void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector pr
     AR_END(FEClipLines, 1);
 }
 
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipPoints, pDC->drawId);
@@ -188,7 +188,7 @@ void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector p
 }
 
 #if USE_SIMD16_FRONTEND
-void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipTriangles, pDC->drawId);
@@ -203,7 +203,7 @@ void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t wor
     AR_END(FEClipTriangles, 1);
 }
 
-void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipLines, pDC->drawId);
@@ -218,7 +218,7 @@ void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerI
     AR_END(FEClipLines, 1);
 }
 
-void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipPoints, pDC->drawId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index ca6596e..ffc69c4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -62,7 +62,7 @@ enum SWR_CLIPCODES
 #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
 
 INLINE
-void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari viewportIndexes)
+void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari const &viewportIndexes)
 {
     clipCodes = _simd_setzero_ps();
 
@@ -131,7 +131,7 @@ void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscal
 
 #if USE_SIMD16_FRONTEND
 INLINE
-void ComputeClipCodes(const API_STATE& state, const simd16vector& vertex, simd16scalar& clipCodes, simd16scalari viewportIndexes)
+void ComputeClipCodes(const API_STATE& state, const simd16vector& vertex, simd16scalar& clipCodes, simd16scalari const &viewportIndexes)
 {
     clipCodes = _simd16_setzero_ps();
 
@@ -203,13 +203,13 @@ template<uint32_t NumVertsPerPrim>
 class Clipper
 {
 public:
-    Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
+    INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
     {
         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
     }
 
-    void ComputeClipCodes(simdvector vertex[], simdscalari viewportIndexes)
+    INLINE void ComputeClipCodes(simdvector vertex[], simdscalari const &viewportIndexes)
     {
         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
         {
@@ -218,7 +218,7 @@ public:
     }
 
 #if USE_SIMD16_FRONTEND
-    void ComputeClipCodes(simd16vector vertex[], simd16scalari viewportIndexes)
+    INLINE void ComputeClipCodes(simd16vector vertex[], simd16scalari const &viewportIndexes)
     {
         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
         {
@@ -227,7 +227,7 @@ public:
     }
 
 #endif
-    simdscalar ComputeClipCodeIntersection()
+    INLINE simdscalar ComputeClipCodeIntersection()
     {
         simdscalar result = this->clipCodes[0];
         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
@@ -238,7 +238,7 @@ public:
     }
 
 #if USE_SIMD16_FRONTEND
-    simd16scalar ComputeClipCodeIntersection_simd16()
+    INLINE simd16scalar ComputeClipCodeIntersection_simd16()
     {
         simd16scalar result = this->clipCodes_simd16[0];
         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
@@ -249,7 +249,7 @@ public:
     }
 
 #endif
-    simdscalar ComputeClipCodeUnion()
+    INLINE simdscalar ComputeClipCodeUnion()
     {
         simdscalar result = this->clipCodes[0];
         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
@@ -260,7 +260,7 @@ public:
     }
 
 #if USE_SIMD16_FRONTEND
-    simd16scalar ComputeClipCodeUnion_simd16()
+    INLINE simd16scalar ComputeClipCodeUnion_simd16()
     {
         simd16scalar result = this->clipCodes_simd16[0];
         for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
@@ -271,14 +271,14 @@ public:
     }
 
 #endif
-    int ComputeNegWMask()
+    INLINE int ComputeNegWMask()
     {
         simdscalar clipCodeUnion = ComputeClipCodeUnion();
         clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
         return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
     }
 
-    int ComputeClipMask()
+    INLINE int ComputeClipMask()
     {
         simdscalar clipUnion = ComputeClipCodeUnion();
         clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
@@ -286,7 +286,7 @@ public:
     }
 
 #if USE_SIMD16_FRONTEND
-    int ComputeClipMask_simd16()
+    INLINE int ComputeClipMask_simd16()
     {
         simd16scalar clipUnion = ComputeClipCodeUnion_simd16();
         clipUnion = _simd16_and_ps(clipUnion, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_CLIP_MASK)));
@@ -295,7 +295,7 @@ public:
 
 #endif
     // clipper is responsible for culling any prims with NAN coordinates
-    int ComputeNaNMask(simdvector prim[])
+    INLINE int ComputeNaNMask(simdvector prim[])
     {
         simdscalar vNanMask = _simd_setzero_ps();
         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
@@ -310,7 +310,7 @@ public:
     }
 
 #if USE_SIMD16_FRONTEND
-    int ComputeNaNMask(simd16vector prim[])
+    INLINE int ComputeNaNMask(simd16vector prim[])
     {
         simd16scalar vNanMask = _simd16_setzero_ps();
         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
@@ -325,7 +325,7 @@ public:
     }
 
 #endif
-    int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
+    INLINE int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
     {
         uint8_t cullMask = this->state.rastState.cullDistanceMask;
         simdscalar vClipCullMask = _simd_setzero_ps();
@@ -391,7 +391,7 @@ public:
     }
 
 #if USE_SIMD16_FRONTEND
-    int ComputeUserClipCullMask(PA_STATE& pa, simd16vector prim[])
+    INLINE int ComputeUserClipCullMask(PA_STATE& pa, simd16vector prim[])
     {
         uint8_t cullMask = this->state.rastState.cullDistanceMask;
         simd16scalar vClipCullMask = _simd16_setzero_ps();
@@ -459,7 +459,7 @@ public:
 
 #endif
     // clip SIMD primitives
-    void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
+    INLINE void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
     {
         // input/output vertex store for clipper
         simdvertex vertices[7]; // maximum 7 verts generated per triangle
@@ -943,7 +943,7 @@ public:
 
 #endif
     // execute the clipper stage
-    void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
+    void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari const &primId)
     {
         SWR_ASSERT(this->pDC != nullptr);
         SWR_CONTEXT* pContext = this->pDC->pContext;
@@ -1027,7 +1027,7 @@ public:
     }
 
 #if USE_SIMD16_FRONTEND
-    void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId)
+    void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari const &primId)
     {
         SWR_ASSERT(pa.pDC != nullptr);
         SWR_CONTEXT* pContext = pa.pDC->pContext;
@@ -1110,19 +1110,19 @@ public:
 
 #endif
 private:
-    inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
+    inline simdscalar ComputeInterpFactor(simdscalar const &boundaryCoord0, simdscalar const &boundaryCoord1)
     {
         return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
     }
 
 #if USE_SIMD16_FRONTEND
-    inline simd16scalar ComputeInterpFactor(simd16scalar boundaryCoord0, simd16scalar boundaryCoord1)
+    inline simd16scalar ComputeInterpFactor(simd16scalar const &boundaryCoord0, simd16scalar const &boundaryCoord1)
     {
         return _simd16_div_ps(boundaryCoord0, _simd16_sub_ps(boundaryCoord0, boundaryCoord1));
     }
 
 #endif
-    inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
+    inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari const &vIndices, uint32_t component)
     {
         const uint32_t simdVertexStride = sizeof(simdvertex);
         const uint32_t componentStride = sizeof(simdscalar);
@@ -1143,7 +1143,7 @@ private:
     }
 
 #if USE_SIMD16_FRONTEND
-    inline simd16scalari ComputeOffsets(uint32_t attrib, simd16scalari vIndices, uint32_t component)
+    inline simd16scalari ComputeOffsets(uint32_t attrib, simd16scalari const &vIndices, uint32_t component)
     {
         const uint32_t simdVertexStride = sizeof(simd16vertex);
         const uint32_t componentStride = sizeof(simd16scalar);
@@ -1168,7 +1168,7 @@ private:
 
 #endif
     // gathers a single component for a given attribute for each SIMD lane
-    inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
+    inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar const &vMask, simdscalari const &vIndices, uint32_t component)
     {
         simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
         simdscalar vSrc = _mm256_undefined_ps();
@@ -1176,7 +1176,7 @@ private:
     }
 
 #if USE_SIMD16_FRONTEND
-    inline simd16scalar GatherComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component)
+    inline simd16scalar GatherComponent(const float* pBuffer, uint32_t attrib, simd16scalar const &vMask, simd16scalari const &vIndices, uint32_t component)
     {
         simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
         simd16scalar vSrc = _simd16_setzero_ps();
@@ -1184,7 +1184,7 @@ private:
     }
 
 #endif
-    inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
+    inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar const &vMask, simdscalari const &vIndices, uint32_t component, simdscalar const &vSrc)
     {
         simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
 
@@ -1201,7 +1201,7 @@ private:
     }
 
 #if USE_SIMD16_FRONTEND
-    inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component, simd16scalar vSrc)
+    inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simd16scalar const &vMask, simd16scalari const &vIndices, uint32_t component, simd16scalar const &vSrc)
     {
         simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
 
@@ -1891,12 +1891,12 @@ private:
 
 
 // pipeline stage functions
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
 #if USE_SIMD16_FRONTEND
-void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
-void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
-void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index a694f2d..131b3cb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -214,12 +214,12 @@ struct PA_STATE;
 
 // function signature for pipeline stages that execute after primitive assembly
 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], 
-    uint32_t primMask, simdscalari primID);
+    uint32_t primMask, simdscalari const &primID);
 
 #if ENABLE_AVX512_SIMD16
 // function signature for pipeline stages that execute after primitive assembly
 typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
-    uint32_t primMask, simd16scalari primID);
+    uint32_t primMask, simd16scalari const &primID);
 
 #endif
 OSALIGNLINE(struct) API_STATE
@@ -343,11 +343,11 @@ struct BarycentricCoeffs
 // pipeline function pointer types
 typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
 typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
-                                 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
+                                 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar const &);
 typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
 typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
 typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
-                                              const simdscalar, const simdscalar);
+                                              simdscalar const &, simdscalar const &);
 
 struct BACKEND_FUNCS
 {
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
index 590c569..fafc36d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
@@ -30,7 +30,7 @@
 #include "format_conversion.h"
 
 INLINE
-void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simdscalar &stencilps)
+void StencilOp(SWR_STENCILOP op, simdscalar const &mask, simdscalar const &stencilRefps, simdscalar &stencilps)
 {
     simdscalari stencil = _simd_castps_si(stencilps);
 
@@ -81,7 +81,7 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds
 
 
 template<SWR_FORMAT depthFormatT>
-simdscalar QuantizeDepth(simdscalar depth)
+simdscalar QuantizeDepth(simdscalar const &depth)
 {
     SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0);
     uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0);
@@ -117,7 +117,7 @@ simdscalar QuantizeDepth(simdscalar depth)
 
 INLINE
 simdscalar DepthStencilTest(const API_STATE* pState,
-                 bool frontFacing, uint32_t viewportIndex, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask,
+                 bool frontFacing, uint32_t viewportIndex, simdscalar const &iZ, uint8_t* pDepthBase, simdscalar const &coverageMask,
                  uint8_t *pStencilBase, simdscalar* pStencilMask)
 {
     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
@@ -132,7 +132,7 @@ simdscalar DepthStencilTest(const API_STATE* pState,
     // clamp Z to viewport [minZ..maxZ]
     simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
     simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
-    interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ));
+    simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
     
     if (pDSState->depthTestEnable)
     {
@@ -215,7 +215,7 @@ simdscalar DepthStencilTest(const API_STATE* pState,
 
 INLINE
 void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
-        bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, 
+        bool frontFacing, simdscalar const &iZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask,
         uint8_t *pStencilBase, const simdscalar& stencilMask)
 {
     if (pDSState->depthWriteEnable)
@@ -223,7 +223,7 @@ void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_ST
         // clamp Z to viewport [minZ..maxZ]
         simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
         simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
-        interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ));
+        simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
 
         simdscalar vMask = _simd_and_ps(depthMask, coverageMask);
         _simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(vMask), interpZ);
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
index 4e642f8..72843f5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
@@ -79,8 +79,9 @@ INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst)
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
 template<SWR_FORMAT Format>
-INLINE simdscalar Clamp(simdscalar vComp, uint32_t Component)
+INLINE simdscalar Clamp(simdscalar const &vC, uint32_t Component)
 {
+    simdscalar vComp = vC;
     if (FormatTraits<Format>::isNormalized(Component))
     {
         if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
@@ -125,8 +126,9 @@ INLINE simdscalar Clamp(simdscalar vComp, uint32_t Component)
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
 template<SWR_FORMAT Format>
-INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component)
+INLINE simdscalar Normalize(simdscalar const &vC, uint32_t Component)
 {
+    simdscalar vComp = vC;
     if (FormatTraits<Format>::isNormalized(Component))
     {
         vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<Format>::fromFloat(Component)));
@@ -247,8 +249,9 @@ INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst)
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
 template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDCALL Clamp(simd16scalar vComp, uint32_t Component)
+INLINE simd16scalar SIMDCALL Clamp(simd16scalar const &v, uint32_t Component)
 {
+    simd16scalar vComp = v;
     if (FormatTraits<Format>::isNormalized(Component))
     {
         if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
@@ -293,14 +296,15 @@ INLINE simd16scalar SIMDCALL Clamp(simd16scalar vComp, uint32_t Component)
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
 template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDCALL Normalize(simd16scalar vComp, uint32_t Component)
+INLINE simd16scalar SIMDCALL Normalize(simd16scalar const &vComp, uint32_t Component)
 {
+    simd16scalar r = vComp;
     if (FormatTraits<Format>::isNormalized(Component))
     {
-        vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<Format>::fromFloat(Component)));
-        vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
+        r = _simd16_mul_ps(r, _simd16_set1_ps(FormatTraits<Format>::fromFloat(Component)));
+        r = _simd16_castsi_ps(_simd16_cvtps_epi32(r));
     }
-    return vComp;
+    return r;
 }
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index 43053b6..c3327c1 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -38,12 +38,12 @@ struct PackTraits
 {
     static const uint32_t MyNumBits = NumBits;
     static simdscalar loadSOA(const uint8_t *pSrc) = delete;
-    static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
+    static void storeSOA(uint8_t *pDst, simdscalar const &src) = delete;
     static simdscalar unpack(simdscalar &in) = delete;
     static simdscalar pack(simdscalar &in) = delete;
 #if ENABLE_AVX512_SIMD16
     static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) = delete;
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) = delete;
     static simd16scalar unpack(simd16scalar &in) = delete;
     static simd16scalar pack(simd16scalar &in) = delete;
 #endif
@@ -58,12 +58,12 @@ struct PackTraits<0, false>
     static const uint32_t MyNumBits = 0;
 
     static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
-    static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
+    static void storeSOA(uint8_t *pDst, simdscalar const &src) { return; }
     static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
     static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
 #if ENABLE_AVX512_SIMD16
     static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) { return; }
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) { return; }
     static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
     static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
 #endif
@@ -88,7 +88,7 @@ struct PackTraits<8, false>
 #endif
     }
 
-    static void storeSOA(uint8_t *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar const &src)
     {
         // store simd bytes
 #if KNOB_SIMD_WIDTH == 8
@@ -144,7 +144,7 @@ struct PackTraits<8, false>
         return result;
     }
 
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
     {
         // store simd16 bytes
         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@@ -203,7 +203,7 @@ struct PackTraits<8, true>
 #endif
     }
 
-    static void storeSOA(uint8_t *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar const &src)
     {
         // store simd bytes
 #if KNOB_SIMD_WIDTH == 8
@@ -260,7 +260,7 @@ struct PackTraits<8, true>
         return result;
     }
 
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
     {
         // store simd16 bytes
         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@@ -319,7 +319,7 @@ struct PackTraits<16, false>
 #endif
     }
 
-    static void storeSOA(uint8_t *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar const &src)
     {
 #if KNOB_SIMD_WIDTH == 8
         // store 16B (2B * 8)
@@ -372,7 +372,7 @@ struct PackTraits<16, false>
         return result;
     }
 
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
     {
         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
     }
@@ -417,7 +417,7 @@ struct PackTraits<16, true>
 #endif
     }
 
-    static void storeSOA(uint8_t *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar const &src)
     {
 #if KNOB_SIMD_WIDTH == 8
         // store 16B (2B * 8)
@@ -471,7 +471,7 @@ struct PackTraits<16, true>
         return result;
     }
 
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
     {
         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
     }
@@ -506,7 +506,7 @@ struct PackTraits<32, false>
     static const uint32_t MyNumBits = 32;
 
     static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
-    static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
+    static void storeSOA(uint8_t *pDst, simdscalar const &src) { _simd_store_ps((float*)pDst, src); }
     static simdscalar unpack(simdscalar &in) { return in; }
     static simdscalar pack(simdscalar &in) { return in; }
 #if ENABLE_AVX512_SIMD16
@@ -516,7 +516,7 @@ struct PackTraits<32, false>
         return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
     }
 
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
     {
         _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
     }
@@ -814,7 +814,7 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src)
 
 #if ENABLE_AVX512_SIMD16
 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
-inline static simd16scalar SIMDCALL fastpow(simd16scalar value)
+inline static simd16scalar SIMDCALL fastpow(simd16scalar const &value)
 {
     static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
         * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
@@ -836,7 +836,7 @@ inline static simd16scalar SIMDCALL fastpow(simd16scalar value)
     return result;
 }
 
-inline static simd16scalar SIMDCALL pow512_4(simd16scalar arg)
+inline static simd16scalar SIMDCALL pow512_4(simd16scalar const &arg)
 {
     // 5/12 is too small, so compute the 4th root of 20/12 instead.
     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
@@ -857,7 +857,7 @@ inline static simd16scalar SIMDCALL pow512_4(simd16scalar arg)
     return xavg;
 }
 
-inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar base, float exp)
+inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar &base, float exp)
 {
     const float *f = reinterpret_cast<const float *>(&base);
 
@@ -1322,7 +1322,7 @@ struct ComponentTraits
         return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
     }
 
-    INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
+    INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar const &src)
     {
         switch (comp)
         {
@@ -1340,41 +1340,48 @@ struct ComponentTraits
             return;
         }
         SWR_INVALID("Invalid component: %d", comp);
-        TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
     }
 
     INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
     {
+        simdscalar out;
         switch (comp)
         {
         case 0:
-            return TypeTraits<X, NumBitsX>::unpack(in);
+            out = TypeTraits<X, NumBitsX>::unpack(in); break;
         case 1:
-            return TypeTraits<Y, NumBitsY>::unpack(in);
+            out = TypeTraits<Y, NumBitsY>::unpack(in); break;
         case 2:
-            return TypeTraits<Z, NumBitsZ>::unpack(in);
+            out = TypeTraits<Z, NumBitsZ>::unpack(in); break;
         case 3:
-            return TypeTraits<W, NumBitsW>::unpack(in);
+            out = TypeTraits<W, NumBitsW>::unpack(in); break;
+        default:
+            SWR_INVALID("Invalid component: %d", comp);
+            out = in;
+            break;
         }
-        SWR_INVALID("Invalid component: %d", comp);
-        return TypeTraits<X, NumBitsX>::unpack(in);
+        return out;
     }
 
     INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
     {
+        simdscalar out;
         switch (comp)
         {
         case 0:
-            return TypeTraits<X, NumBitsX>::pack(in);
+            out = TypeTraits<X, NumBitsX>::pack(in); break;
         case 1:
-            return TypeTraits<Y, NumBitsY>::pack(in);
+            out = TypeTraits<Y, NumBitsY>::pack(in); break;
         case 2:
-            return TypeTraits<Z, NumBitsZ>::pack(in);
+            out = TypeTraits<Z, NumBitsZ>::pack(in); break;
         case 3:
-            return TypeTraits<W, NumBitsW>::pack(in);
+            out = TypeTraits<W, NumBitsW>::pack(in); break;
+        default:
+            SWR_INVALID("Invalid component: %d", comp);
+            out = in;
+            break;
         }
-        SWR_INVALID("Invalid component: %d", comp);
-        return TypeTraits<X, NumBitsX>::pack(in);
+        return out;
     }
 
     INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
@@ -1412,7 +1419,7 @@ struct ComponentTraits
         return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
     }
 
-    INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
+    INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar const &src)
     {
         switch (comp)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index daea088..406a0e0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -770,7 +770,7 @@ static void GeometryShaderStage(
 #if USE_SIMD16_FRONTEND
     uint32_t numPrims_simd8,
 #endif
-    simdscalari primID)
+    simdscalari const &primID)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
@@ -1069,7 +1069,7 @@ static void TessellationStages(
 #if USE_SIMD16_FRONTEND
     uint32_t numPrims_simd8,
 #endif
-    simdscalari primID)
+    simdscalari const &primID)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     const API_STATE& state = GetApiState(pDC);
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index 3d7b26d..5cb2f87 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -275,7 +275,7 @@ void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices
 #endif
 template<uint32_t NumVerts>
 INLINE
-void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx)
+void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari const &vViewportIdx)
 {
     // perform a gather of each matrix element based on the viewport array indexes
     simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
@@ -296,7 +296,7 @@ void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices,
 #if USE_SIMD16_FRONTEND
 template<uint32_t NumVerts>
 INLINE
-void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari vViewportIdx)
+void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari const &vViewportIdx)
 {
     // perform a gather of each matrix element based on the viewport array indexes
     const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
@@ -388,10 +388,10 @@ PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
 #endif
 
 struct PA_STATE_BASE;  // forward decl
-void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
-void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
+void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID);
+void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID);
 #if USE_SIMD16_FRONTEND
-void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
-void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
+void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID);
+void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 9e63955..2e52698 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -852,7 +852,7 @@ typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pC
 typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, 
     simdvector& vSrc, simdvector& vSrc1, simdscalar& vSrc0Alpha, uint32_t sample, 
     uint8_t* pDst, simdvector& vResult, simdscalari* vOMask, simdscalari* vCoverageMask);
-typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar);
+typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar const &);
 
 
 
-- 
2.7.4