[Mesa-stable] [Mesa-dev] [PATCH] swr/rast: Faster emulated simd16 permute

Tue Nov 14 17:36:34 UTC 2017

Reviewed-by: Bruce Cherniak <bruce.cherniak at intel.com> 

> On Nov 13, 2017, at 8:03 PM, Tim Rowley <timothy.o.rowley at intel.com> wrote:
> 
> Speed up simd16 frontend (default) on avx/avx2 platforms;
> fixes performance regression caused by switch to simdlib.
> 
> Cc: mesa-stable at lists.freedesktop.org
> ---
> .../swr/rasterizer/common/simdlib_512_emu.inl      | 34 +++++++---------------
> 1 file changed, 11 insertions(+), 23 deletions(-)
> 
> diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
> index d6af7b1c64..44eba0b126 100644
> --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
> +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
> @@ -521,36 +521,24 @@ SIMD_IWRAPPER_2(packus_epi32);     // See documentation for _mm256_packus_epi32
> 
> static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
> {
> -    Integer result;
> -
> -    // Ugly slow implementation
> -    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
> -    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
> -    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
> -
> -    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
> -    {
> -        pResult[i] = pA[0xF & pSwiz[i]];
> -    }
> -
> -    return result;
> +    return castps_si(permute_ps(castsi_ps(a), swiz));
> }
> 
> static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
> {
> -    Float result;
> +    const auto mask = SIMD256T::set1_epi32(7);
> 
> -    // Ugly slow implementation
> -    float const *pA = reinterpret_cast<float const*>(&a);
> -    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
> -    float *pResult = reinterpret_cast<float *>(&result);
> +    auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
> +    auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
> 
> -    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
> -    {
> -        pResult[i] = pA[0xF & pSwiz[i]];
> -    }
> +    auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
> +    auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
> 
> -    return result;
> +    return Float
> +    {
> +        SIMD256T::blendv_ps(lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
> +        SIMD256T::blendv_ps(hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
> +    };
> }
> 
> // All of the 512-bit permute2f128_XX intrinsics do the following:
> -- 
> 2.14.1
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev