[Mesa-dev] [PATCH 3/3] util: add avx2 and xop detection to cpu detection code

Tue Aug 20 12:33:11 PDT 2013

Series looks good to me.

Jose

----- Original Message -----
> From: Roland Scheidegger <sroland at vmware.com>
> 
> Going to need this soon (not going to bother with avx2 intrinsics at this
> time, but don't want to do workarounds for true vector shifts if llvm itself
> can use them just fine and won't need the gazillion-instruction emulation).
> Not really tested other than my cpu returns 0 for these features...
> (I have no idea if llvm would actually emit avx2/xop instructions either...)
> ---
>  src/gallium/auxiliary/gallivm/lp_bld_init.c |   11 ++++--
>  src/gallium/auxiliary/util/u_cpu_detect.c   |   48 +++++++++++++++++++++++++++
>  src/gallium/auxiliary/util/u_cpu_detect.h   |    2 ++
>  3 files changed, 59 insertions(+), 2 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c
> b/src/gallium/auxiliary/gallivm/lp_bld_init.c
> index 61eadb8..61b561f 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
> @@ -461,12 +461,15 @@ lp_build_init(void)
>                                                   lp_native_vector_width);
>  
>     if (lp_native_vector_width <= 128) {
> -      /* Hide AVX support, as often LLVM AVX instrinsics are only guarded by
> +      /* Hide AVX support, as often LLVM AVX intrinsics are only guarded by
>         * "util_cpu_caps.has_avx" predicate, and lack the
>         * "lp_native_vector_width > 128" predicate. And also to ensure a more
>         * consistent behavior, allowing one to test SSE2 on AVX machines.
> +       * XXX: should not play games with util_cpu_caps directly as it might
> +       * get used for other things outside llvm too.
>         */
>        util_cpu_caps.has_avx = 0;
> +      util_cpu_caps.has_avx2 = 0;
>     }
>  
>     if (!HAVE_AVX) {
> @@ -476,13 +479,17 @@ lp_build_init(void)
>         * omit it unnecessarily on amd cpus, see above).
>         */
>        util_cpu_caps.has_f16c = 0;
> +      util_cpu_caps.has_xop = 0;
>     }
>  
>  #ifdef PIPE_ARCH_PPC_64
>     /* Set the NJ bit in VSCR to 0 so denormalized values are handled as
> -    * specified by IEEE standard (PowerISA 2.06 - Section 6.3). This
> garantees
> +    * specified by IEEE standard (PowerISA 2.06 - Section 6.3). This
> guarantees
>      * that some rounding and half-float to float handling does not round
>      * incorrectly to 0.
> +    * XXX: should eventually follow same logic on all platforms.
> +    * Right now denorms get explicitly disabled (but elsewhere) for x86,
> +    * whereas ppc64 explicitly enables them...
>      */
>     if (util_cpu_caps.has_altivec) {
>        unsigned short mask[] = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
> diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c
> b/src/gallium/auxiliary/util/u_cpu_detect.c
> index 87ad780..2ff40bb 100644
> --- a/src/gallium/auxiliary/util/u_cpu_detect.c
> +++ b/src/gallium/auxiliary/util/u_cpu_detect.c
> @@ -212,6 +212,44 @@ cpuid(uint32_t ax, uint32_t *p)
>  #endif
>  }
>  
> +/**
> + * @sa cpuid.h included in gcc-4.4 onwards.
> + * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
> + */
> +static INLINE void
> +cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
> +{
> +#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) &&
> defined(PIPE_ARCH_X86)
> +   __asm __volatile (
> +     "xchgl %%ebx, %1\n\t"
> +     "cpuid\n\t"
> +     "xchgl %%ebx, %1"
> +     : "=a" (p[0]),
> +       "=S" (p[1]),
> +       "=c" (p[2]),
> +       "=d" (p[3])
> +     : "0" (ax), "2" (cx)
> +   );
> +#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) &&
> defined(PIPE_ARCH_X86_64)
> +   __asm __volatile (
> +     "cpuid\n\t"
> +     : "=a" (p[0]),
> +       "=b" (p[1]),
> +       "=c" (p[2]),
> +       "=d" (p[3])
> +     : "0" (ax), "2" (cx)
> +   );
> +#elif defined(PIPE_CC_MSVC)
> +   __cpuidex(p, ax, cx);
> +#else
> +   p[0] = 0;
> +   p[1] = 0;
> +   p[2] = 0;
> +   p[3] = 0;
> +#endif
> +}
> +
> +
>  static INLINE uint64_t xgetbv(void)
>  {
>  #if defined(PIPE_CC_GCC)
> @@ -341,6 +379,11 @@ util_cpu_detect(void)
>           if (cacheline > 0)
>              util_cpu_caps.cacheline = cacheline;
>        }
> +      if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) {
> +         uint32_t regs7[4];
> +         cpuid_count(0x00000007, 0x00000000, regs7);
> +         util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1;
> +      }
>  
>        if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] ==
>        0x49656e69) {
>           /* GenuineIntel */
> @@ -357,6 +400,9 @@ util_cpu_detect(void)
>           util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1;
>           util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1;
>           util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1;
> +
> +         util_cpu_caps.has_xop = util_cpu_caps.has_avx &&
> +                                 ((regs2[2] >> 11) & 1);
>        }
>  
>        if (regs[0] >= 0x80000006) {
> @@ -394,10 +440,12 @@ util_cpu_detect(void)
>        debug_printf("util_cpu_caps.has_sse4_1 = %u\n",
>        util_cpu_caps.has_sse4_1);
>        debug_printf("util_cpu_caps.has_sse4_2 = %u\n",
>        util_cpu_caps.has_sse4_2);
>        debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
> +      debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
>        debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
>        debug_printf("util_cpu_caps.has_popcnt = %u\n",
>        util_cpu_caps.has_popcnt);
>        debug_printf("util_cpu_caps.has_3dnow = %u\n",
>        util_cpu_caps.has_3dnow);
>        debug_printf("util_cpu_caps.has_3dnow_ext = %u\n",
>        util_cpu_caps.has_3dnow_ext);
> +      debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
>        debug_printf("util_cpu_caps.has_altivec = %u\n",
>        util_cpu_caps.has_altivec);
>        debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
>     }
> diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h
> b/src/gallium/auxiliary/util/u_cpu_detect.h
> index cc3e0ce..5ccfc93 100644
> --- a/src/gallium/auxiliary/util/u_cpu_detect.h
> +++ b/src/gallium/auxiliary/util/u_cpu_detect.h
> @@ -64,9 +64,11 @@ struct util_cpu_caps {
>     unsigned has_sse4_2:1;
>     unsigned has_popcnt:1;
>     unsigned has_avx:1;
> +   unsigned has_avx2:1;
>     unsigned has_f16c:1;
>     unsigned has_3dnow:1;
>     unsigned has_3dnow_ext:1;
> +   unsigned has_xop:1;
>     unsigned has_altivec:1;
>     unsigned has_daz:1;
>  };
> --
> 1.7.9.5
>