[Mesa-dev] [PATCH 07/11] swr: [rasterizer] Interpolation utility functions

Roland Scheidegger sroland at vmware.com
Thu Apr 14 21:42:39 UTC 2016


Am 14.04.2016 um 21:53 schrieb Tim Rowley:
> ---
>  .../drivers/swr/rasterizer/common/simdintrin.h     | 51 ++++++++++++++++++++--
>  src/gallium/drivers/swr/rasterizer/core/frontend.h | 12 +++++
>  src/gallium/drivers/swr/rasterizer/core/state.h    |  2 -
>  3 files changed, 59 insertions(+), 6 deletions(-)
> 
> diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
> index fa792b4..72fe15a 100644
> --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
> +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
> @@ -915,16 +915,25 @@ INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscal
>  }
>  
>  //////////////////////////////////////////////////////////////////////////
> +/// @brief Compute plane equation vA * vX + vB * vY + vC
> +INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &vY)
> +{
> +    __m128 vOut = _simd128_fmadd_ps(vA, vX, vC);
> +    vOut = _simd128_fmadd_ps(vB, vY, vOut);
> +    return vOut;
> +}
> +
> +//////////////////////////////////////////////////////////////////////////
>  /// @brief Interpolates a single component.
>  /// @param vI - barycentric I
>  /// @param vJ - barycentric J
>  /// @param pInterpBuffer - pointer to attribute barycentric coeffs
> -template<UINT Attrib, UINT Comp>
> +template<UINT Attrib, UINT Comp, UINT numComponents = 4>
>  static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
>  {
> -    const float *pInterpA = &pInterpBuffer[Attrib * 12 + 0 + Comp];
> -    const float *pInterpB = &pInterpBuffer[Attrib * 12 + 4 + Comp];
> -    const float *pInterpC = &pInterpBuffer[Attrib * 12 + 8 + Comp];
> +    const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
> +    const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
> +    const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
>  
>      simdscalar vA = _simd_broadcast_ss(pInterpA);
>      simdscalar vB = _simd_broadcast_ss(pInterpB);
> @@ -936,6 +945,40 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons
>      return vplaneps(vA, vB, vC, vI, vJ);
>  }
>  
> +//////////////////////////////////////////////////////////////////////////
> +/// @brief Interpolates a single component.
> +/// @param vI - barycentric I
> +/// @param vJ - barycentric J
> +/// @param pInterpBuffer - pointer to attribute barycentric coeffs
> +template<UINT Attrib, UINT Comp, UINT numComponents = 4>
> +static INLINE __m128 InterpolateComponent(__m128 vI, __m128 vJ, const float *pInterpBuffer)
> +{
> +    const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
> +    const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
> +    const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
> +
> +    __m128 vA = _mm_broadcast_ss(pInterpA);
> +    __m128 vB = _mm_broadcast_ss(pInterpB);
> +    __m128 vC = _mm_broadcast_ss(pInterpC);
> +
> +    __m128 vk = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), vI), vJ);
> +    vC = _mm_mul_ps(vk, vC);
> +
> +    return vplaneps128(vA, vB, vC, vI, vJ);
> +}
> +
> +static INLINE __m128 _simd128_abs_ps(__m128 a)
> +{
> +    __m128i ai = _mm_castps_si128(a);
> +    return _mm_castsi128_ps(_mm_and_si128(ai, _mm_set1_epi32(0x7fffffff)));
> +}
> +
> +static INLINE simdscalar _simd_abs_ps(simdscalar a)
> +{
> +    simdscalari ai = _simd_castps_si(a);
> +    return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
> +}
> +
>  INLINE
>  UINT pdep_u32(UINT a, UINT mask)
>  {
> diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
> index 8307c0b..12e7ae4 100644
> --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
> +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
> @@ -307,6 +307,18 @@ bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
>              !state.rastState.pointSpriteEnable);
>  }
>  
> +INLINE
> +bool vIsNaN(const __m128& vec)
> +{
> +    const __m128i& veci = _mm_castps_si128(vec);
> +    const __m128i fraction = _mm_and_si128(veci, _mm_set1_epi32(0x007fffff));
> +    const __m128i exponent = _mm_and_si128(veci, _mm_set1_epi32(0x7f800000));
> +    __m128i result = _mm_cmpeq_epi32(exponent, _mm_set1_epi32(0));
> +    result = _mm_andnot_si128(_mm_cmpeq_epi32(fraction, _mm_set1_epi32(0)), result);
> +    int32_t mask = _mm_movemask_ps(_mm_castsi128_ps(result));
> +    return (mask > 0);
> +}
You could do this more simply by just taking the abs of the source (which
is a single and) followed by a single _mm_cmpgt_epi32() against the max
exponent (0x7f800000).
Or do what lp_build_isnan does: just use _mm_cmp_ps with ordered/eq
(using the same source twice) and invert the bits. (Albeit I think we're
not using the integer comparisons, which are nominally faster, in that
code because we might have 8-wide vectors, hence when avx is available
but avx2 isn't this would be quite suboptimal.)
That said, I'm actually wondering why not just do a simple single
unordered comparison; that should give the right result without having
to invert the bits (though it's possible llvm does this on its own in
the gallivm code).

Roland




>  uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
>  uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
>  
> diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
> index 5036106..88ec4b0 100644
> --- a/src/gallium/drivers/swr/rasterizer/core/state.h
> +++ b/src/gallium/drivers/swr/rasterizer/core/state.h
> @@ -197,8 +197,6 @@ enum SWR_OUTER_TESSFACTOR_ID
>  #define VERTEX_CLIPCULL_DIST_LO_SLOT 35 // VS writes lower 4 clip/cull dist
>  #define VERTEX_CLIPCULL_DIST_HI_SLOT 36 // VS writes upper 4 clip/cull dist
>  #define VERTEX_POINT_SIZE_SLOT 37       // VS writes point size here
> -static_assert(VERTEX_POINT_SIZE_SLOT < KNOB_NUM_ATTRIBUTES, "Mismatched attribute slot size");
> -
>  // SoAoSoA
>  struct simdvertex
>  {
> 



More information about the mesa-dev mailing list