mesa: update to the latest u_math.h

uif is now defined, and util_is_power_of_two has been replaced by other variants (util_is_power_of_two_or_zero and util_is_power_of_two_nonzero).
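
For callers, the migration looks roughly like this (a sketch with hypothetical call sites; util_is_power_of_two_or_zero and util_is_power_of_two_nonzero are defined in the new bitscan.h below, while uif()'s exact definition is not shown in this excerpt -- it is mesa's bit-cast from uint32_t to float):

    /* before */
    if (util_is_power_of_two(size)) { /* ... */ }

    /* after: pick the variant with the intended treatment of zero */
    if (util_is_power_of_two_nonzero(size)) { /* ... */ }       /* zero is not a power of two */
    if (util_is_power_of_two_or_zero(alignment)) { /* ... */ }  /* zero counts as one */

    /* uif() reinterprets a bit pattern as a float */
    float one = uif(0x3f800000);  /* 1.0f */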

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
Reviewed-by: Yiwei Zhang <zzyiwei@chromium.org>
Reviewed-by: Ryan Neph <ryanneph@google.com>
Acked-by: Gert Wollny <gert.wollny@collabora.com>
macos/master
Chia-I Wu 3 years ago
parent 9526a95d47
commit 10b89464a3
Changed files (changed line counts in parentheses):
  1. config.h.meson (8)
  2. meson.build (3)
  3. src/gallium/auxiliary/util/u_cpu_detect.c (458)
  4. src/gallium/auxiliary/util/u_math.c (139)
  5. src/gallium/meson.build (4)
  6. src/mesa/meson.build (3)
  7. src/mesa/util/bitscan.c (80)
  8. src/mesa/util/bitscan.h (356)
  9. src/mesa/util/u_cpu_detect.c (865)
  10. src/mesa/util/u_cpu_detect.h (71)
  11. src/mesa/util/u_math.c (311)
  12. src/mesa/util/u_math.h (663)
  13. src/venus/vkr_common.h (2)
  14. src/venus/vkr_ring.c (2)
  15. src/venus/vkr_transport.c (3)
  16. src/vrend_decode.c (7)

@@ -1,7 +1,15 @@
 #mesondefine VERSION
 #mesondefine _GNU_SOURCE
 #mesondefine VIRGL_RENDERER_UNSTABLE_APIS
+#mesondefine HAVE___BUILTIN_BSWAP32
+#mesondefine HAVE___BUILTIN_BSWAP64
+#mesondefine HAVE___BUILTIN_CLZ
+#mesondefine HAVE___BUILTIN_CLZLL
 #mesondefine HAVE___BUILTIN_EXPECT
+#mesondefine HAVE___BUILTIN_FFS
+#mesondefine HAVE___BUILTIN_FFSLL
+#mesondefine HAVE___BUILTIN_POPCOUNT
+#mesondefine HAVE___BUILTIN_POPCOUNTLL
 #mesondefine HAVE___BUILTIN_TYPES_COMPATIBLE_P
 #mesondefine HAVE___BUILTIN_UNREACHABLE
 #mesondefine HAVE_FUNC_ATTRIBUTE_CONST

@@ -129,7 +129,8 @@ if cc.has_header('sys/select.h')
 conf_data.set('HAVE_SYS_SELECT_H', 1)
 endif
-foreach b : ['expect', 'types_compatible_p', 'unreachable']
+foreach b : ['bswap32', 'bswap64', 'clz', 'clzll', 'expect', 'ffs', 'ffsll',
+             'popcount', 'popcountll', 'types_compatible_p', 'unreachable']
 if cc.has_function(b)
 conf_data.set('HAVE___BUILTIN_@0@'.format(b.to_upper()), 1)
 endif

@@ -1,458 +0,0 @@
/**************************************************************************
*
* Copyright 2008 Dennis Smit
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/**
* @file
* CPU feature detection.
*
* @author Dennis Smit
* @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
*/
#include "pipe/p_config.h"
#include "u_debug.h"
#include "u_cpu_detect.h"
#if defined(PIPE_ARCH_PPC)
#if defined(PIPE_OS_APPLE)
#include <sys/sysctl.h>
#else
#include <signal.h>
#include <setjmp.h>
#endif
#endif
#if defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif
#if defined(PIPE_OS_FREEBSD)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if defined(PIPE_OS_LINUX)
#include <signal.h>
#endif
#ifdef PIPE_OS_UNIX
#include <unistd.h>
#endif
#if defined(PIPE_OS_WINDOWS)
#include <windows.h>
#if defined(PIPE_CC_MSVC)
#include <intrin.h>
#endif
#endif
#ifdef DEBUG
DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE)
#endif
struct util_cpu_caps util_cpu_caps;
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
static int has_cpuid(void);
#endif
#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE)
static jmp_buf __lv_powerpc_jmpbuf;
static volatile sig_atomic_t __lv_powerpc_canjump = 0;
static void
sigill_handler(int sig)
{
if (!__lv_powerpc_canjump) {
signal (sig, SIG_DFL);
raise (sig);
}
__lv_powerpc_canjump = 0;
longjmp(__lv_powerpc_jmpbuf, 1);
}
#endif
#if defined(PIPE_ARCH_PPC)
static void
check_os_altivec_support(void)
{
#if defined(PIPE_OS_APPLE)
int sels[2] = {CTL_HW, HW_VECTORUNIT};
int has_vu = 0;
int len = sizeof (has_vu);
int err;
err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
if (err == 0) {
if (has_vu != 0) {
util_cpu_caps.has_altivec = 1;
}
}
#else /* !PIPE_OS_APPLE */
/* not on Apple/Darwin, do it the brute-force way */
/* this is borrowed from the libmpeg2 library */
signal(SIGILL, sigill_handler);
if (setjmp(__lv_powerpc_jmpbuf)) {
signal(SIGILL, SIG_DFL);
} else {
__lv_powerpc_canjump = 1;
__asm __volatile
("mtspr 256, %0\n\t"
"vand %%v0, %%v0, %%v0"
:
: "r" (-1));
signal(SIGILL, SIG_DFL);
util_cpu_caps.has_altivec = 1;
}
#endif /* !PIPE_OS_APPLE */
}
#endif /* PIPE_ARCH_PPC */
#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
static int has_cpuid(void)
{
#if defined(PIPE_ARCH_X86)
#if defined(PIPE_OS_GCC)
int a, c;
__asm __volatile
("pushf\n"
"popl %0\n"
"movl %0, %1\n"
"xorl $0x200000, %0\n"
"push %0\n"
"popf\n"
"pushf\n"
"popl %0\n"
: "=a" (a), "=c" (c)
:
: "cc");
return a != c;
#else
/* FIXME */
return 1;
#endif
#elif defined(PIPE_ARCH_X86_64)
return 1;
#else
return 0;
#endif
}
/**
* @sa cpuid.h included in gcc-4.3 onwards.
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
*/
static inline void
cpuid(uint32_t ax, uint32_t *p)
{
#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86)
__asm __volatile (
"xchgl %%ebx, %1\n\t"
"cpuid\n\t"
"xchgl %%ebx, %1"
: "=a" (p[0]),
"=S" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax)
);
#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86_64)
__asm __volatile (
"cpuid\n\t"
: "=a" (p[0]),
"=b" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax)
);
#elif defined(PIPE_CC_MSVC)
__cpuid(p, ax);
#else
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
#endif
}
/**
* @sa cpuid.h included in gcc-4.4 onwards.
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
*/
static inline void
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
{
#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86)
__asm __volatile (
"xchgl %%ebx, %1\n\t"
"cpuid\n\t"
"xchgl %%ebx, %1"
: "=a" (p[0]),
"=S" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax), "2" (cx)
);
#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86_64)
__asm __volatile (
"cpuid\n\t"
: "=a" (p[0]),
"=b" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax), "2" (cx)
);
#elif defined(PIPE_CC_MSVC)
__cpuidex(p, ax, cx);
#else
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
#endif
}
static inline uint64_t xgetbv(void)
{
#if defined(PIPE_CC_GCC)
uint32_t eax, edx;
__asm __volatile (
".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
: "=a"(eax),
"=d"(edx)
: "c"(0)
);
return ((uint64_t)edx << 32) | eax;
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
return _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#else
return 0;
#endif
}
#if defined(PIPE_ARCH_X86)
static inline boolean sse2_has_daz(void)
{
struct {
uint32_t pad1[7];
uint32_t mxcsr_mask;
uint32_t pad2[128-8];
} PIPE_ALIGN_VAR(16) fxarea;
fxarea.mxcsr_mask = 0;
#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO))
__asm __volatile ("fxsave %0" : "+m" (fxarea));
#elif (defined(PIPE_CC_MSVC) && _MSC_VER >= 1700) || defined(PIPE_CC_ICL)
/* 1700 = Visual Studio 2012 */
_fxsave(&fxarea);
#else
fxarea.mxcsr_mask = 0;
#endif
return !!(fxarea.mxcsr_mask & (1 << 6));
}
#endif
#endif /* X86 or X86_64 */
void
util_cpu_detect(void)
{
static boolean util_cpu_detect_initialized = FALSE;
if(util_cpu_detect_initialized)
return;
memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
/* Count the number of CPUs in system */
#if defined(PIPE_OS_WINDOWS)
{
SYSTEM_INFO system_info;
GetSystemInfo(&system_info);
util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors;
}
#elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
if (util_cpu_caps.nr_cpus == -1)
util_cpu_caps.nr_cpus = 1;
#elif defined(PIPE_OS_BSD)
{
int mib[2], ncpu;
int len;
mib[0] = CTL_HW;
mib[1] = HW_NCPU;
len = sizeof (ncpu);
sysctl(mib, 2, &ncpu, &len, NULL, 0);
util_cpu_caps.nr_cpus = ncpu;
}
#else
util_cpu_caps.nr_cpus = 1;
#endif
/* Make the fallback cacheline size nonzero so that it can be
* safely passed to align().
*/
util_cpu_caps.cacheline = sizeof(void *);
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
if (has_cpuid()) {
uint32_t regs[4];
uint32_t regs2[4];
util_cpu_caps.cacheline = 32;
/* Get max cpuid level */
cpuid(0x00000000, regs);
if (regs[0] >= 0x00000001) {
unsigned int cacheline;
cpuid (0x00000001, regs2);
util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
if (util_cpu_caps.x86_cpu_type == 0xf)
util_cpu_caps.x86_cpu_type = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */
/* general feature flags */
util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */
util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */
util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */
util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */
util_cpu_caps.has_sse3 = (regs2[2] >> 0) & 1; /* 0x0000001 */
util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000020 */
util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1;
util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX
((regs2[2] >> 27) & 1) && // OSXSAVE
((xgetbv() & 6) == 6); // XMM & YMM
util_cpu_caps.has_f16c = (regs2[2] >> 29) & 1;
util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
#if defined(PIPE_ARCH_X86_64)
util_cpu_caps.has_daz = 1;
#else
util_cpu_caps.has_daz = util_cpu_caps.has_sse3 ||
(util_cpu_caps.has_sse2 && sse2_has_daz());
#endif
cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
if (cacheline > 0)
util_cpu_caps.cacheline = cacheline;
}
if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) {
uint32_t regs7[4];
cpuid_count(0x00000007, 0x00000000, regs7);
util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1;
}
if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
/* GenuineIntel */
util_cpu_caps.has_intel = 1;
}
cpuid(0x80000000, regs);
if (regs[0] >= 0x80000001) {
cpuid(0x80000001, regs2);
util_cpu_caps.has_mmx |= (regs2[3] >> 23) & 1;
util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1;
util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1;
util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1;
util_cpu_caps.has_xop = util_cpu_caps.has_avx &&
((regs2[2] >> 11) & 1);
}
if (regs[0] >= 0x80000006) {
cpuid(0x80000006, regs2);
util_cpu_caps.cacheline = regs2[2] & 0xFF;
}
if (!util_cpu_caps.has_sse) {
util_cpu_caps.has_sse2 = 0;
util_cpu_caps.has_sse3 = 0;
util_cpu_caps.has_ssse3 = 0;
util_cpu_caps.has_sse4_1 = 0;
}
}
#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
#if defined(PIPE_ARCH_PPC)
check_os_altivec_support();
#endif /* PIPE_ARCH_PPC */
#ifdef DEBUG
if (debug_get_option_dump_cpu()) {
debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
debug_printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
debug_printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
}
#endif
util_cpu_detect_initialized = TRUE;
}

@@ -1,139 +0,0 @@
/**************************************************************************
*
* Copyright 2008 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#include "pipe/p_config.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
/* This is defined in pmmintrin.h, but it can only be included when -msse3 is
* used, so just define it here to avoid pulling in pmmintrin.h. */
#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif
#endif
#if 0
/** 2^x, for x in [-1.0, 1.0) */
float pow2_table[POW2_TABLE_SIZE];
static void
init_pow2_table(void)
{
int i;
for (i = 0; i < POW2_TABLE_SIZE; i++)
pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
}
/** log2(x), for x in [1.0, 2.0) */
float log2_table[LOG2_TABLE_SIZE];
static void
init_log2_table(void)
{
unsigned i;
for (i = 0; i < LOG2_TABLE_SIZE; i++)
log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE));
}
#endif
/**
* One time init for math utilities.
*/
void
util_init_math(void)
{
static boolean initialized = FALSE;
if (!initialized) {
// init_pow2_table();
/* init_log2_table();*/
initialized = TRUE;
}
}
/**
* Fetches the contents of the fpstate (mxcsr on x86) register.
*
* On platforms without support for it, this just returns 0.
*/
unsigned
util_fpstate_get(void)
{
unsigned mxcsr = 0;
#if defined(PIPE_ARCH_SSE)
if (util_cpu_caps.has_sse) {
mxcsr = _mm_getcsr();
}
#endif
return mxcsr;
}
/**
* Make sure that the fp treats the denormalized floating
* point numbers as zero.
*
* This is the behavior required by D3D10. OpenGL doesn't care.
*/
unsigned
util_fpstate_set_denorms_to_zero(unsigned current_mxcsr)
{
#if defined(PIPE_ARCH_SSE)
if (util_cpu_caps.has_sse) {
/* Enable flush to zero mode */
current_mxcsr |= _MM_FLUSH_ZERO_MASK;
if (util_cpu_caps.has_daz) {
/* Enable denormals are zero mode */
current_mxcsr |= _MM_DENORMALS_ZERO_MASK;
}
util_fpstate_set(current_mxcsr);
}
#endif
return current_mxcsr;
}
/**
* Set the state of the fpstate (mxcsr on x86) register.
*
* On platforms without support for it, this is a noop.
*/
void
util_fpstate_set(unsigned mxcsr)
{
#if defined(PIPE_ARCH_SSE)
if (util_cpu_caps.has_sse) {
_mm_setcsr(mxcsr);
}
#endif
}

@@ -32,7 +32,6 @@ sources_libgallium = [
 'auxiliary/util/u_format.h',
 'auxiliary/util/u_rect.h',
 'auxiliary/util/u_surface.h',
-'auxiliary/util/u_math.h',
 'auxiliary/util/rgtc.h',
 'auxiliary/util/u_format.c',
 'auxiliary/util/u_inlines.h',
@@ -44,16 +43,13 @@ sources_libgallium = [
 'auxiliary/util/u_texture.h',
 'auxiliary/util/u_hash_table.h',
 'auxiliary/util/u_box.h',
-'auxiliary/util/u_cpu_detect.c',
 'auxiliary/util/u_pack_color.h',
 'auxiliary/util/u_double_list.h',
 'auxiliary/util/u_debug_refcnt.h',
 'auxiliary/util/u_bitmask.c',
-'auxiliary/util/u_cpu_detect.h',
 'auxiliary/util/u_bitmask.h',
 'auxiliary/util/u_format_s3tc.h',
 'auxiliary/util/u_surface.c',
-'auxiliary/util/u_math.c',
 'auxiliary/util/u_half.h',
 'auxiliary/util/u_prim.h',
 'auxiliary/util/u_debug_describe.c',

@@ -4,9 +4,12 @@
 inc_mesa = include_directories('.', 'compat', 'pipe', 'util')
 files_mesa = files(
+'util/bitscan.c',
 'util/os_file.c',
 'util/os_misc.c',
+'util/u_cpu_detect.c',
 'util/u_debug.c',
+'util/u_math.c',
 )
 deps_mesa = [

@@ -0,0 +1,80 @@
/**************************************************************************
*
* Copyright 2008 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#include "bitscan.h"
#ifdef HAVE___BUILTIN_FFS
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
#else
int
ffs(int i)
{
int bit = 0;
if (!i)
return bit;
if (!(i & 0xffff)) {
bit += 16;
i >>= 16;
}
if (!(i & 0xff)) {
bit += 8;
i >>= 8;
}
if (!(i & 0xf)) {
bit += 4;
i >>= 4;
}
if (!(i & 0x3)) {
bit += 2;
i >>= 2;
}
if (!(i & 0x1))
bit += 1;
return bit + 1;
}
#endif
#ifdef HAVE___BUILTIN_FFSLL
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
#else
int
ffsll(long long int val)
{
int bit;
bit = ffs((unsigned) (val & 0xffffffff));
if (bit != 0)
return bit;
bit = ffs((unsigned) (val >> 32));
if (bit != 0)
return 32 + bit;
return 0;
}
#endif
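
A quick sanity check of the fallbacks above (a sketch, not part of the commit; assumes <assert.h>):

    /* 0x90 = 10010000b: i & 0xf == 0 peels 4 bits (bit = 4, i = 0x9),
     * then 0x9 & 0x3 != 0 and 0x9 & 0x1 != 0, so the result is 4 + 1 = 5,
     * i.e. the lowest set bit (0x10) reported 1-based. */
    assert(ffs(0x90) == 5);
    /* the low 32 bits are zero, so ffsll() falls through to the high word */
    assert(ffsll(0x100000000ll) == 33);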

@@ -0,0 +1,356 @@
/**************************************************************************
*
* Copyright 2008 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#ifndef BITSCAN_H
#define BITSCAN_H
#include <assert.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#if defined(__POPCNT__)
#include <popcntintrin.h>
#endif
#include "c99_compat.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* Find first bit set in word. Least significant bit is 1.
* Return 0 if no bits set.
*/
#ifdef HAVE___BUILTIN_FFS
#define ffs __builtin_ffs
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
static inline
int ffs(int i)
{
unsigned long index;
if (_BitScanForward(&index, i))
return index + 1;
else
return 0;
}
#else
extern
int ffs(int i);
#endif
#ifdef HAVE___BUILTIN_FFSLL
#define ffsll __builtin_ffsll
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
static inline int
ffsll(long long int i)
{
unsigned long index;
if (_BitScanForward64(&index, i))
return index + 1;
else
return 0;
}
#else
extern int
ffsll(long long int val);
#endif
/* Destructively loop over all of the bits in a mask as in:
*
* while (mymask) {
* int i = u_bit_scan(&mymask);
* ... process element i
* }
*
*/
static inline int
u_bit_scan(unsigned *mask)
{
const int i = ffs(*mask) - 1;
*mask ^= (1u << i);
return i;
}
#define u_foreach_bit(b, dword) \
for (uint32_t __dword = (dword), b; \
((b) = ffs(__dword) - 1, __dword); \
__dword &= ~(1 << (b)))
static inline int
u_bit_scan64(uint64_t *mask)
{
const int i = ffsll(*mask) - 1;
*mask ^= (((uint64_t)1) << i);
return i;
}
#define u_foreach_bit64(b, dword) \
for (uint64_t __dword = (dword), b; \
((b) = ffsll(__dword) - 1, __dword); \
__dword &= ~(1ull << (b)))
/* Determine if an unsigned value is a power of two.
*
* \note
* Zero is treated as a power of two.
*/
static inline bool
util_is_power_of_two_or_zero(unsigned v)
{
return (v & (v - 1)) == 0;
}
/* Determine if an uint64_t value is a power of two.
*
* \note
* Zero is treated as a power of two.
*/
static inline bool
util_is_power_of_two_or_zero64(uint64_t v)
{
return (v & (v - 1)) == 0;
}
/* Determine if an unsigned value is a power of two.
*
* \note
* Zero is \b not treated as a power of two.
*/
static inline bool
util_is_power_of_two_nonzero(unsigned v)
{
/* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT. The latter
* indicates the existence of the __builtin_popcount function. The former
* indicates that _mm_popcnt_u32 exists and is a native instruction.
*
* The other alternative is to use SSE 4.2 compile-time flags. This has
* two drawbacks. First, there is currently no build infrastructure for
* SSE 4.2 (only 4.1), so that would have to be added. Second, some AMD
* CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona).
*/
#ifdef __POPCNT__
return _mm_popcnt_u32(v) == 1;
#else
return v != 0 && (v & (v - 1)) == 0;
#endif
}
/* For looping over a bitmask when you want to loop over consecutive bits
* manually, for example:
*
* while (mask) {
* int start, count, i;
*
* u_bit_scan_consecutive_range(&mask, &start, &count);
*
* for (i = 0; i < count; i++)
* ... process element (start+i)
* }
*/
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
if (*mask == 0xffffffff) {
*start = 0;
*count = 32;
*mask = 0;
return;
}
*start = ffs(*mask) - 1;
*count = ffs(~(*mask >> *start)) - 1;
*mask &= ~(((1u << *count) - 1) << *start);
}
static inline void
u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
if (*mask == ~0ull) {
*start = 0;
*count = 64;
*mask = 0;
return;
}
*start = ffsll(*mask) - 1;
*count = ffsll(~(*mask >> *start)) - 1;
*mask &= ~(((((uint64_t)1) << *count) - 1) << *start);
}
/**
* Find last bit set in a word. The least significant bit is 1.
* Return 0 if no bits are set.
* Essentially ffs() in the reverse direction.
*/
static inline unsigned
util_last_bit(unsigned u)
{
#if defined(HAVE___BUILTIN_CLZ)
return u == 0 ? 0 : 32 - __builtin_clz(u);
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
unsigned long index;
if (_BitScanReverse(&index, u))
return index + 1;
else
return 0;
#else
unsigned r = 0;
while (u) {
r++;
u >>= 1;
}
return r;
#endif
}
/**
* Find last bit set in a word. The least significant bit is 1.
* Return 0 if no bits are set.
* Essentially ffsll() in the reverse direction.
*/
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
return u == 0 ? 0 : 64 - __builtin_clzll(u);
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
unsigned long index;
if (_BitScanReverse64(&index, u))
return index + 1;
else
return 0;
#else
unsigned r = 0;
while (u) {
r++;
u >>= 1;
}
return r;
#endif
}
/**
* Find last bit in a word that does not match the sign bit. The least
* significant bit is 1.
* Return 0 if no bits are set.
*/
static inline unsigned
util_last_bit_signed(int i)
{
if (i >= 0)
return util_last_bit(i);
else
return util_last_bit(~(unsigned)i);
}
/* Returns a bitfield in which the first count bits starting at start are
* set.
*/
static inline unsigned
u_bit_consecutive(unsigned start, unsigned count)
{
assert(start + count <= 32);
if (count == 32)
return ~0;
return ((1u << count) - 1) << start;
}
static inline uint64_t
u_bit_consecutive64(unsigned start, unsigned count)
{
assert(start + count <= 64);
if (count == 64)
return ~(uint64_t)0;
return (((uint64_t)1 << count) - 1) << start;
}
/**
* Return number of bits set in n.
*/
static inline unsigned
util_bitcount(unsigned n)
{
#if defined(HAVE___BUILTIN_POPCOUNT)
return __builtin_popcount(n);
#else
/* K&R classic bitcount.
*
* For each iteration, clear the LSB from the bitfield.
* Requires only one iteration per set bit, instead of
* one iteration per bit less than highest set bit.
*/
unsigned bits;
for (bits = 0; n; bits++) {
n &= n - 1;
}
return bits;
#endif
}
/**
* Return the number of bits set in n using the native popcnt instruction.
* The caller is responsible for ensuring that popcnt is supported by the CPU.
*
* gcc doesn't emit it unless -mpopcnt, or an -march= that includes popcnt, is given.
*
*/
static inline unsigned
util_popcnt_inline_asm(unsigned n)
{
#if defined(USE_X86_64_ASM) || defined(USE_X86_ASM)
uint32_t out;
__asm volatile("popcnt %1, %0" : "=r"(out) : "r"(n));
return out;
#else
/* We should never get here by accident, but I'm sure it'll happen. */
return util_bitcount(n);
#endif
}
static inline unsigned
util_bitcount64(uint64_t n)
{
#ifdef HAVE___BUILTIN_POPCOUNTLL
return __builtin_popcountll(n);
#else
return util_bitcount(n) + util_bitcount(n >> 32);
#endif
}
#ifdef __cplusplus
}
#endif
#endif /* BITSCAN_H */
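
Typical use of the scan helpers above (a sketch; the mask value and printf output are only for illustration, assuming <stdio.h>):

    unsigned mask = 0x15;           /* bits 0, 2, 4 set */
    while (mask) {
       int i = u_bit_scan(&mask);   /* yields 0, then 2, then 4, clearing each bit */
       printf("bit %d\n", i);
    }

    /* the same iteration without destroying a live mask */
    u_foreach_bit(b, 0x15)
       printf("bit %u\n", b);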

@@ -0,0 +1,865 @@
/**************************************************************************
*
* Copyright 2008 Dennis Smit
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/**
* @file
* CPU feature detection.
*
* @author Dennis Smit
* @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
*/
#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_debug.h"
#include "u_cpu_detect.h"
#include "u_math.h"
#include "c11/threads.h"
#include <stdio.h>
#include <inttypes.h>
#if defined(PIPE_ARCH_PPC)
#if defined(PIPE_OS_APPLE)
#include <sys/sysctl.h>
#else
#include <signal.h>
#include <setjmp.h>
#endif
#endif
#if defined(PIPE_OS_BSD)
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif
#if defined(PIPE_OS_FREEBSD)
#if __has_include(<sys/auxv.h>)
#include <sys/auxv.h>
#define HAVE_ELF_AUX_INFO
#endif
#endif
#if defined(PIPE_OS_LINUX)
#include <signal.h>
#include <fcntl.h>
#include <elf.h>
#endif
#ifdef PIPE_OS_UNIX
#include <unistd.h>
#endif
#if defined(HAS_ANDROID_CPUFEATURES)
#include <cpu-features.h>
#endif
#if defined(PIPE_OS_WINDOWS)
#include <windows.h>
#if defined(PIPE_CC_MSVC)
#include <intrin.h>
#endif
#endif
#if defined(HAS_SCHED_H)
#include <sched.h>
#endif
DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
struct util_cpu_caps_t util_cpu_caps;
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
static int has_cpuid(void);
#endif
#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) && !defined(PIPE_OS_BSD) && !defined(PIPE_OS_LINUX)
static jmp_buf __lv_powerpc_jmpbuf;
static volatile sig_atomic_t __lv_powerpc_canjump = 0;
static void
sigill_handler(int sig)
{
if (!__lv_powerpc_canjump) {
signal (sig, SIG_DFL);
raise (sig);
}
__lv_powerpc_canjump = 0;
longjmp(__lv_powerpc_jmpbuf, 1);
}
#endif
#if defined(PIPE_ARCH_PPC)
static void
check_os_altivec_support(void)
{
#if defined(__ALTIVEC__)
util_cpu_caps.has_altivec = 1;
#endif
#if defined(__VSX__)
util_cpu_caps.has_vsx = 1;
#endif
#if defined(__ALTIVEC__) && defined(__VSX__)
/* Do nothing */
#elif defined(PIPE_OS_APPLE) || defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
#ifdef HW_VECTORUNIT
int sels[2] = {CTL_HW, HW_VECTORUNIT};
#else
int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC};
#endif
int has_vu = 0;
int len = sizeof (has_vu);
int err;
err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
if (err == 0) {
if (has_vu != 0) {
util_cpu_caps.has_altivec = 1;
}
}
#elif defined(PIPE_OS_FREEBSD) /* !PIPE_OS_APPLE && !PIPE_OS_NETBSD && !PIPE_OS_OPENBSD */
unsigned long hwcap = 0;
#ifdef HAVE_ELF_AUX_INFO
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
size_t len = sizeof(hwcap);
sysctlbyname("hw.cpu_features", &hwcap, &len, NULL, 0);
#endif
if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
util_cpu_caps.has_altivec = 1;
if (hwcap & PPC_FEATURE_HAS_VSX)
util_cpu_caps.has_vsx = 1;
#elif defined(PIPE_OS_LINUX) /* !PIPE_OS_FREEBSD */
#if defined(PIPE_ARCH_PPC_64)
Elf64_auxv_t aux;
#else
Elf32_auxv_t aux;
#endif
int fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
if (fd >= 0) {
while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
if (aux.a_type == AT_HWCAP) {
char *env_vsx = getenv("GALLIVM_VSX");
uint64_t hwcap = aux.a_un.a_val;
util_cpu_caps.has_altivec = (hwcap >> 28) & 1;
if (!env_vsx || env_vsx[0] != '0') {
util_cpu_caps.has_vsx = (hwcap >> 7) & 1;
}
break;
}
}
close(fd);
}
#else /* !PIPE_OS_APPLE && !PIPE_OS_BSD && !PIPE_OS_LINUX */
/* not on Apple/Darwin or Linux, do it the brute-force way */
/* this is borrowed from the libmpeg2 library */
signal(SIGILL, sigill_handler);
if (setjmp(__lv_powerpc_jmpbuf)) {
signal(SIGILL, SIG_DFL);
} else {
boolean enable_altivec = TRUE; /* Default: enable if available, and if not overridden */
boolean enable_vsx = TRUE;
#ifdef DEBUG
/* Disabling Altivec code generation is not the same as disabling VSX code generation,
* which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf.
* lp_build_create_jit_compiler_for_module().
* If you want to disable Altivec code generation, the best place to do it is here.
*/
char *env_control = getenv("GALLIVM_ALTIVEC"); /* 1=enable (default); 0=disable */
if (env_control && env_control[0] == '0') {
enable_altivec = FALSE;
}
#endif
/* VSX instructions can be explicitly enabled/disabled via GALLIVM_VSX=1 or 0 */
char *env_vsx = getenv("GALLIVM_VSX");
if (env_vsx && env_vsx[0] == '0') {
enable_vsx = FALSE;
}
if (enable_altivec) {
__lv_powerpc_canjump = 1;
__asm __volatile
("mtspr 256, %0\n\t"
"vand %%v0, %%v0, %%v0"
:
: "r" (-1));
util_cpu_caps.has_altivec = 1;
if (enable_vsx) {
__asm __volatile("xxland %vs0, %vs0, %vs0");
util_cpu_caps.has_vsx = 1;
}
signal(SIGILL, SIG_DFL);
} else {
util_cpu_caps.has_altivec = 0;
}
}
#endif /* !PIPE_OS_APPLE && !PIPE_OS_LINUX */
}
#endif /* PIPE_ARCH_PPC */
#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
static int has_cpuid(void)
{
#if defined(PIPE_ARCH_X86)
#if defined(PIPE_OS_GCC)
int a, c;
__asm __volatile
("pushf\n"
"popl %0\n"
"movl %0, %1\n"
"xorl $0x200000, %0\n"
"push %0\n"
"popf\n"
"pushf\n"
"popl %0\n"
: "=a" (a), "=c" (c)
:
: "cc");
return a != c;
#else
/* FIXME */
return 1;
#endif
#elif defined(PIPE_ARCH_X86_64)
return 1;
#else
return 0;
#endif
}
/**
* @sa cpuid.h included in gcc-4.3 onwards.
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
*/
static inline void
cpuid(uint32_t ax, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
__asm __volatile (
"xchgl %%ebx, %1\n\t"
"cpuid\n\t"
"xchgl %%ebx, %1"
: "=a" (p[0]),
"=S" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax)
);
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
__asm __volatile (
"cpuid\n\t"
: "=a" (p[0]),
"=b" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax)
);
#elif defined(PIPE_CC_MSVC)
__cpuid(p, ax);
#else
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
#endif
}
/**
* @sa cpuid.h included in gcc-4.4 onwards.
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
*/
static inline void
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
__asm __volatile (
"xchgl %%ebx, %1\n\t"
"cpuid\n\t"
"xchgl %%ebx, %1"
: "=a" (p[0]),
"=S" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax), "2" (cx)
);
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
__asm __volatile (
"cpuid\n\t"
: "=a" (p[0]),
"=b" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax), "2" (cx)
);
#elif defined(PIPE_CC_MSVC)
__cpuidex(p, ax, cx);
#else
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
#endif
}
static inline uint64_t xgetbv(void)
{
#if defined(PIPE_CC_GCC)
uint32_t eax, edx;
__asm __volatile (
".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
: "=a"(eax),
"=d"(edx)
: "c"(0)
);
return ((uint64_t)edx << 32) | eax;
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
return _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#else
return 0;
#endif
}
#if defined(PIPE_ARCH_X86)
PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void)
{
struct {
uint32_t pad1[7];
uint32_t mxcsr_mask;
uint32_t pad2[128-8];
} PIPE_ALIGN_VAR(16) fxarea;
fxarea.mxcsr_mask = 0;
#if defined(PIPE_CC_GCC)
__asm __volatile ("fxsave %0" : "+m" (fxarea));
#elif defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL)
_fxsave(&fxarea);
#else
fxarea.mxcsr_mask = 0;
#endif
return !!(fxarea.mxcsr_mask & (1 << 6));
}
#endif
#endif /* X86 or X86_64 */
#if defined(PIPE_ARCH_ARM)
static void
check_os_arm_support(void)
{
/*
* On Android, the cpufeatures library is preferred way of checking
* CPU capabilities. However, it is not available for standalone Mesa
* builds, i.e. when Android build system (Android.mk-based) is not
* used. Because of this we cannot use PIPE_OS_ANDROID here, but rather
* have a separate macro that only gets enabled from respective Android.mk.
*/
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
util_cpu_caps.has_neon = 1;
#elif defined(PIPE_OS_FREEBSD) && defined(HAVE_ELF_AUX_INFO)
unsigned long hwcap = 0;
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
if (hwcap & HWCAP_NEON)
util_cpu_caps.has_neon = 1;
#elif defined(HAS_ANDROID_CPUFEATURES)
AndroidCpuFamily cpu_family = android_getCpuFamily();
uint64_t cpu_features = android_getCpuFeatures();
if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON)
util_cpu_caps.has_neon = 1;
}
#elif defined(PIPE_OS_LINUX)
Elf32_auxv_t aux;
int fd;
fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
if (fd >= 0) {
while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) {
if (aux.a_type == AT_HWCAP) {
uint32_t hwcap = aux.a_un.a_val;
util_cpu_caps.has_neon = (hwcap >> 12) & 1;
break;
}
}
close (fd);
}
#endif /* PIPE_OS_LINUX */
}
#elif defined(PIPE_ARCH_AARCH64)
static void
check_os_arm_support(void)
{
util_cpu_caps.has_neon = true;
}
#endif /* PIPE_ARCH_ARM || PIPE_ARCH_AARCH64 */
#if defined(PIPE_ARCH_MIPS64)
static void
check_os_mips64_support(void)
{
Elf64_auxv_t aux;
int fd;
fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
if (fd >= 0) {
while (read(fd, &aux, sizeof(Elf64_auxv_t)) == sizeof(Elf64_auxv_t)) {
if (aux.a_type == AT_HWCAP) {
uint64_t hwcap = aux.a_un.a_val;
util_cpu_caps.has_msa = (hwcap >> 1) & 1;
break;
}
}
close (fd);
}
}
#endif /* PIPE_ARCH_MIPS64 */
static void
get_cpu_topology(void)
{
/* Default. This is OK if L3 is not present or there is only one. */
util_cpu_caps.num_L3_caches = 1;
memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3));
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
/* AMD Zen */
if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 &&
util_cpu_caps.family < CPU_AMD_LAST) {
uint32_t regs[4];
uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
bool saved = false;
uint32_t L3_found[UTIL_MAX_CPUS] = {0};
uint32_t num_L3_caches = 0;
util_affinity_mask *L3_affinity_masks = NULL;
/* Query APIC IDs from each CPU core.
*
* An APIC ID is a logical ID of the CPU with respect to the cache
* hierarchy, meaning that consecutive APIC IDs are neighbours in
* the hierarchy, e.g. sharing the same cache.
*
* For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1,
* which means that both CPU 0 and 12 are next to each other.
* (e.g. they are 2 threads belonging to 1 SMT2 core)
*
* We need to find out which CPUs share the same L3 cache and they can
* be all over the place.
*
* Querying the APIC ID can only be done by pinning the current thread
* to each core. The original affinity mask is saved.
*
* Loop over all possible CPUs even though some may be offline.
*/
for (int16_t i = 0; i < util_cpu_caps.max_cpus && i < UTIL_MAX_CPUS; i++) {
uint32_t cpu_bit = 1u << (i % 32);
mask[i / 32] = cpu_bit;
/* The assumption is that trying to bind the thread to a CPU that is
* offline will fail.
*/
if (util_set_current_thread_affinity(mask,
!saved ? saved_mask : NULL,
util_cpu_caps.num_cpu_mask_bits)) {
saved = true;
/* Query the APIC ID of the current core. */
cpuid(0x00000001, regs);
unsigned apic_id = regs[1] >> 24;
/* Query the total core count for the CPU */
uint32_t core_count = 1;
if (regs[3] & (1 << 28))
core_count = (regs[1] >> 16) & 0xff;
core_count = util_next_power_of_two(core_count);
/* Query the L3 cache count. */
cpuid_count(0x8000001D, 3, regs);
unsigned cache_level = (regs[0] >> 5) & 0x7;
unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
if (cache_level != 3)
continue;
unsigned local_core_id = apic_id & (core_count - 1);
unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count);
unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3);
#define L3_ID(p, i) (p << 16 | i << 1 | 1);
unsigned l3_id = L3_ID(phys_id, local_l3_cache_index);
int idx = -1;
for (unsigned c = 0; c < num_L3_caches; c++) {
if (L3_found[c] == l3_id) {
idx = c;
break;
}
}
if (idx == -1) {
idx = num_L3_caches;
L3_found[num_L3_caches++] = l3_id;
L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches);
if (!L3_affinity_masks)
return;
memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask));
}
util_cpu_caps.cpu_to_L3[i] = idx;
L3_affinity_masks[idx][i / 32] |= cpu_bit;
}
mask[i / 32] = 0;
}
util_cpu_caps.num_L3_caches = num_L3_caches;
util_cpu_caps.L3_affinity_mask = L3_affinity_masks;
if (saved) {
if (debug_get_option_dump_cpu()) {
fprintf(stderr, "CPU <-> L3 cache mapping:\n");
for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
fprintf(stderr, " - L3 %u mask = ", i);
for (int j = util_cpu_caps.max_cpus - 1; j >= 0; j -= 32)
fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]);
fprintf(stderr, "\n");
}
}
/* Restore the original affinity mask. */
util_set_current_thread_affinity(saved_mask, NULL,
util_cpu_caps.num_cpu_mask_bits);
} else {
if (debug_get_option_dump_cpu())
fprintf(stderr, "Cannot set thread affinity for any thread.\n");
}
}
#endif
}
static void
util_cpu_detect_once(void)
{
int available_cpus = 0;
int total_cpus = 0;
memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
/* Count the number of CPUs in system */
#if defined(PIPE_OS_WINDOWS)
{
SYSTEM_INFO system_info;
GetSystemInfo(&system_info);
available_cpus = MAX2(1, system_info.dwNumberOfProcessors);
}
#elif defined(PIPE_OS_UNIX)
# if defined(HAS_SCHED_GETAFFINITY)
{
/* sched_setaffinity() can be used to further restrict the number of
* CPUs on which the process can run. Use sched_getaffinity() to
* determine the true number of available CPUs.
*
* FIXME: The Linux manual page for sched_getaffinity describes how this
* simple implementation will fail with > 1024 CPUs, and we'll fall back
* to the _SC_NPROCESSORS_ONLN path. Support for > 1024 CPUs can be
* added to this path once someone has such a system for testing.
*/
cpu_set_t affin;
if (sched_getaffinity(getpid(), sizeof(affin), &affin) == 0)
available_cpus = CPU_COUNT(&affin);
}
# endif
/* Linux, FreeBSD, DragonFly, and Mac OS X should have
* _SC_NPROCESSORS_ONLN. NetBSD and OpenBSD should have HW_NCPUONLINE.
* This is what FFmpeg uses on those platforms.
*/
# if defined(PIPE_OS_BSD) && defined(HW_NCPUONLINE)
if (available_cpus == 0) {
const int mib[] = { CTL_HW, HW_NCPUONLINE };
int ncpu;
int len = sizeof(ncpu);
sysctl(mib, 2, &ncpu, &len, NULL, 0);
available_cpus = ncpu;
}
# elif defined(_SC_NPROCESSORS_ONLN)
if (available_cpus == 0) {
available_cpus = sysconf(_SC_NPROCESSORS_ONLN);
if (available_cpus == ~0)
available_cpus = 1;
}
# elif defined(PIPE_OS_BSD)
if (available_cpus == 0) {
const int mib[] = { CTL_HW, HW_NCPU };
int ncpu;
int len = sizeof(ncpu);
sysctl(mib, 2, &ncpu, &len, NULL, 0);
available_cpus = ncpu;
}
# endif /* defined(PIPE_OS_BSD) */
/* Determine the maximum number of CPUs configured in the system. This is
* used to properly set num_cpu_mask_bits below. On BSDs that don't have
* HW_NCPUONLINE, it was not clear whether HW_NCPU is the number of
* configured or the number of online CPUs. For that reason, prefer the
* _SC_NPROCESSORS_CONF path on all BSDs.
*/
# if defined(_SC_NPROCESSORS_CONF)
total_cpus = sysconf(_SC_NPROCESSORS_CONF);
if (total_cpus == ~0)
total_cpus = 1;
# elif defined(PIPE_OS_BSD)
{
const int mib[] = { CTL_HW, HW_NCPU };
int ncpu;
int len = sizeof(ncpu);
sysctl(mib, 2, &ncpu, &len, NULL, 0);
total_cpus = ncpu;
}
# endif /* defined(PIPE_OS_BSD) */
#endif /* defined(PIPE_OS_UNIX) */
util_cpu_caps.nr_cpus = MAX2(1, available_cpus);
total_cpus = MAX2(total_cpus, util_cpu_caps.nr_cpus);
util_cpu_caps.max_cpus = total_cpus;
util_cpu_caps.num_cpu_mask_bits = align(total_cpus, 32);
/* Make the fallback cacheline size nonzero so that it can be
* safely passed to align().
*/
util_cpu_caps.cacheline = sizeof(void *);
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
if (has_cpuid()) {
uint32_t regs[4];
uint32_t regs2[4];
util_cpu_caps.cacheline = 32;
/* Get max cpuid level */
cpuid(0x00000000, regs);
if (regs[0] >= 0x00000001) {
unsigned int cacheline;
cpuid (0x00000001, regs2);
util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
/* Add "extended family". */
if (util_cpu_caps.x86_cpu_type == 0xf)
util_cpu_caps.x86_cpu_type += ((regs2[0] >> 20) & 0xff);
switch (util_cpu_caps.x86_cpu_type) {
case 0x17:
util_cpu_caps.family = CPU_AMD_ZEN1_ZEN2;
break;
case 0x18:
util_cpu_caps.family = CPU_AMD_ZEN_HYGON;
break;
case 0x19:
util_cpu_caps.family = CPU_AMD_ZEN3;
break;
default:
if (util_cpu_caps.x86_cpu_type > 0x19)
util_cpu_caps.family = CPU_AMD_ZEN_NEXT;
}
/* general feature flags */
util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */
util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */
util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */
util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */
util_cpu_caps.has_sse3 = (regs2[2] >> 0) & 1; /* 0x0000001 */
util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000020 */
util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1;
util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX
((regs2[2] >> 27) & 1) && // OSXSAVE
((xgetbv() & 6) == 6); // XMM & YMM
util_cpu_caps.has_f16c = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx;
util_cpu_caps.has_fma = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx;
util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
#if defined(PIPE_ARCH_X86_64)
util_cpu_caps.has_daz = 1;
#else
util_cpu_caps.has_daz = util_cpu_caps.has_sse3 ||
(util_cpu_caps.has_sse2 && sse2_has_daz());
#endif
cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
if (cacheline > 0)
util_cpu_caps.cacheline = cacheline;
}
if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) {
uint32_t regs7[4];
cpuid_count(0x00000007, 0x00000000, regs7);
util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1;
}
// check for avx512
if (((regs2[2] >> 27) & 1) && // OSXSAVE
(xgetbv() & (0x7 << 5)) && // OPMASK: upper-256 enabled by OS
((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS
uint32_t regs3[4];
cpuid_count(0x00000007, 0x00000000, regs3);
util_cpu_caps.has_avx512f = (regs3[1] >> 16) & 1;
util_cpu_caps.has_avx512dq = (regs3[1] >> 17) & 1;
util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1;
util_cpu_caps.has_avx512pf = (regs3[1] >> 26) & 1;
util_cpu_caps.has_avx512er = (regs3[1] >> 27) & 1;
util_cpu_caps.has_avx512cd = (regs3[1] >> 28) & 1;
util_cpu_caps.has_avx512bw = (regs3[1] >> 30) & 1;
util_cpu_caps.has_avx512vl = (regs3[1] >> 31) & 1;
util_cpu_caps.has_avx512vbmi = (regs3[2] >> 1) & 1;
}
if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
/* GenuineIntel */
util_cpu_caps.has_intel = 1;
}
cpuid(0x80000000, regs);
if (regs[0] >= 0x80000001) {
cpuid(0x80000001, regs2);
util_cpu_caps.has_mmx |= (regs2[3] >> 23) & 1;
util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1;
util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1;
util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1;
util_cpu_caps.has_xop = util_cpu_caps.has_avx &&
((regs2[2] >> 11) & 1);
}
if (regs[0] >= 0x80000006) {
/* should we really do this if the clflush size above worked? */
unsigned int cacheline;
cpuid(0x80000006, regs2);
cacheline = regs2[2] & 0xFF;
if (cacheline > 0)
util_cpu_caps.cacheline = cacheline;
}
if (!util_cpu_caps.has_sse) {
util_cpu_caps.has_sse2 = 0;
util_cpu_caps.has_sse3 = 0;
util_cpu_caps.has_ssse3 = 0;
util_cpu_caps.has_sse4_1 = 0;
}
}
#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
#if defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64)
check_os_arm_support();
#endif
#if defined(PIPE_ARCH_PPC)
check_os_altivec_support();
#endif /* PIPE_ARCH_PPC */
#if defined(PIPE_ARCH_MIPS64)
check_os_mips64_support();
#endif /* PIPE_ARCH_MIPS64 */
get_cpu_topology();
if (debug_get_option_dump_cpu()) {
printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
printf("util_cpu_caps.has_vsx = %u\n", util_cpu_caps.has_vsx);
printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon);
printf("util_cpu_caps.has_msa = %u\n", util_cpu_caps.has_msa);
printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f);
printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq);
printf("util_cpu_caps.has_avx512ifma = %u\n", util_cpu_caps.has_avx512ifma);
printf("util_cpu_caps.has_avx512pf = %u\n", util_cpu_caps.has_avx512pf);
printf("util_cpu_caps.has_avx512er = %u\n", util_cpu_caps.has_avx512er);
printf("util_cpu_caps.has_avx512cd = %u\n", util_cpu_caps.has_avx512cd);
printf("util_cpu_caps.has_avx512bw = %u\n", util_cpu_caps.has_avx512bw);
printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl);
printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi);
printf("util_cpu_caps.num_L3_caches = %u\n", util_cpu_caps.num_L3_caches);
printf("util_cpu_caps.num_cpu_mask_bits = %u\n", util_cpu_caps.num_cpu_mask_bits);
}
}
static once_flag cpu_once_flag = ONCE_FLAG_INIT;
void
util_cpu_detect(void)
{
call_once(&cpu_once_flag, util_cpu_detect_once);
}

@@ -36,17 +36,45 @@
 #define _UTIL_CPU_DETECT_H
 #include "pipe/p_compiler.h"
 #include "pipe/p_config.h"
+#include "util/u_thread.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
+enum cpu_family {
+CPU_UNKNOWN,
-struct util_cpu_caps {
-int nr_cpus;
+CPU_AMD_ZEN1_ZEN2,
+CPU_AMD_ZEN_HYGON,
+CPU_AMD_ZEN3,
+CPU_AMD_ZEN_NEXT,
+CPU_AMD_LAST,
+};
+typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32];
+struct util_cpu_caps_t {
+/**
+* Number of CPUs available to the process.
+*
+* This will be less than or equal to \c max_cpus. This is the number of
+* CPUs that are online and available to the process.
+*/
+int16_t nr_cpus;
+/**
+* Maximum number of CPUs that can be online in the system.
+*
+* This will be greater than or equal to \c nr_cpus. This is the number of
+* CPUs installed in the system. \c nr_cpus will be less if some CPUs are
+* offline.
+*/
+int16_t max_cpus;
+enum cpu_family family;
 /* Feature flags */
 int x86_cpu_type;
@@ -66,15 +94,48 @@ struct util_cpu_caps {
 unsigned has_avx:1;
 unsigned has_avx2:1;
 unsigned has_f16c:1;
+unsigned has_fma:1;
 unsigned has_3dnow:1;
 unsigned has_3dnow_ext:1;
 unsigned has_xop:1;
 unsigned has_altivec:1;
+unsigned has_vsx:1;
 unsigned has_daz:1;
+unsigned has_neon:1;
+unsigned has_msa:1;
+unsigned has_avx512f:1;
+unsigned has_avx512dq:1;
+unsigned has_avx512ifma:1;
+unsigned has_avx512pf:1;
+unsigned has_avx512er:1;
+unsigned has_avx512cd:1;
+unsigned has_avx512bw:1;
+unsigned has_avx512vl:1;
+unsigned has_avx512vbmi:1;
+unsigned num_L3_caches;
+unsigned num_cpu_mask_bits;
+uint16_t cpu_to_L3[UTIL_MAX_CPUS];
+/* Affinity masks for each L3 cache. */
+util_affinity_mask *L3_affinity_mask;
 };
-extern struct util_cpu_caps
-util_cpu_caps;
+#define U_CPU_INVALID_L3 0xffff
+static inline const struct util_cpu_caps_t *
+util_get_cpu_caps(void)
+{
+extern struct util_cpu_caps_t util_cpu_caps;
+/* If you hit this assert, it means that something is using the
+* cpu-caps without having first called util_cpu_detect()
+*/
+assert(util_cpu_caps.nr_cpus >= 1);
+return &util_cpu_caps;
+}
 void util_cpu_detect(void);
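
The intended call sequence for the new accessor (a sketch; has_avx2 stands in for any of the flags above):

    util_cpu_detect();  /* must run first, or the assert in util_get_cpu_caps() fires */
    const struct util_cpu_caps_t *caps = util_get_cpu_caps();
    if (caps->has_avx2) {
       /* select an AVX2 code path */
    }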

@@ -0,0 +1,311 @@
/**************************************************************************
*
* Copyright 2008 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#include "pipe/p_config.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
/* This is defined in pmmintrin.h, but it can only be included when -msse3 is
* used, so just define it here to avoid pulling in pmmintrin.h. */
#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif
#endif
/** log2(x), for x in [1.0, 2.0) */
float log2_table[LOG2_TABLE_SIZE];
static void
init_log2_table(void)
{
unsigned i;
for (i = 0; i < LOG2_TABLE_SIZE; i++)
log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE));
}
/**
* One time init for math utilities.
*/
void
util_init_math(void)
{
static bool initialized = false;
if (!initialized) {
init_log2_table();
initialized = true;
}
}
/**
* Fetches the contents of the fpstate (mxcsr on x86) register.
*
* On platforms without support for it, this just returns 0.
*/
unsigned
util_fpstate_get(void)
{
unsigned mxcsr = 0;
#if defined(PIPE_ARCH_SSE)
if (util_get_cpu_caps()->has_sse) {
mxcsr = _mm_getcsr();
}
#endif
return mxcsr;
}
/**
* Make sure that the fp treats the denormalized floating
* point numbers as zero.
*
* This is the behavior required by D3D10. OpenGL doesn't care.
*/
unsigned
util_fpstate_set_denorms_to_zero(unsigned current_mxcsr)
{
#if defined(PIPE_ARCH_SSE)
if (util_get_cpu_caps()->has_sse) {
/* Enable flush to zero mode */
current_mxcsr |= _MM_FLUSH_ZERO_MASK;
if (util_get_cpu_caps()->has_daz) {
/* Enable denormals are zero mode */
current_mxcsr |= _MM_DENORMALS_ZERO_MASK;
}
util_fpstate_set(current_mxcsr);
}
#endif
return current_mxcsr;
}
/**
* Set the state of the fpstate (mxcsr on x86) register.
*
* On platforms without support for it, this is a noop.
*/
void
util_fpstate_set(unsigned mxcsr)
{
#if defined(PIPE_ARCH_SSE)
if (util_get_cpu_caps()->has_sse) {
_mm_setcsr(mxcsr);
}
#endif
}
/**
* Compute inverse of 4x4 matrix.
*
* \return false if the source matrix is singular.
*
* \author
* Code contributed by Jacques Leroy jle@star.be
*
* Calculates the inverse matrix by performing the gaussian matrix reduction
* with partial pivoting followed by back/substitution with the loops manually
* unrolled.
*/
bool
util_invert_mat4x4(float *out, const float *m)
{
float wtmp[4][8];
float m0, m1, m2, m3, s;
float *r0, *r1, *r2, *r3;
#define MAT(m, r, c) (m)[(c)*4 + (r)]
#define SWAP_ROWS(a, b) \
{ \
float *_tmp = a; \
(a) = (b); \
(b) = _tmp; \
}
r0 = wtmp[0], r1 = wtmp[1], r2 = wtmp[2], r3 = wtmp[3];
r0[0] = MAT(m, 0, 0), r0[1] = MAT(m, 0, 1), r0[2] = MAT(m, 0, 2), r0[3] = MAT(m, 0, 3),
r0[4] = 1.0, r0[5] = r0[6] = r0[7] = 0.0,
r1[0] = MAT(m, 1, 0), r1[1] = MAT(m, 1, 1), r1[2] = MAT(m, 1, 2), r1[3] = MAT(m, 1, 3),
r1[5] = 1.0, r1[4] = r1[6] = r1[7] = 0.0,
r2[0] = MAT(m, 2, 0), r2[1] = MAT(m, 2, 1), r2[2] = MAT(m, 2, 2), r2[3] = MAT(m, 2, 3),
r2[6] = 1.0, r2[4] = r2[5] = r2[7] = 0.0,
r3[0] = MAT(m, 3, 0), r3[1] = MAT(m, 3, 1), r3[2] = MAT(m, 3, 2), r3[3] = MAT(m, 3, 3),
r3[7] = 1.0, r3[4] = r3[5] = r3[6] = 0.0;
/* choose pivot - or die */
if (fabsf(r3[0]) > fabsf(r2[0]))
SWAP_ROWS(r3, r2);
if (fabsf(r2[0]) > fabsf(r1[0]))
SWAP_ROWS(r2, r1);
if (fabsf(r1[0]) > fabsf(r0[0]))
SWAP_ROWS(r1, r0);
if (0.0F == r0[0])
return false;
/* eliminate first variable */
m1 = r1[0] / r0[0];
m2 = r2[0] / r0[0];
m3 = r3[0] / r0[0];
s = r0[1];
r1[1] -= m1 * s;
r2[1] -= m2 * s;
r3[1] -= m3 * s;
s = r0[2];
r1[2] -= m1 * s;
r2[2] -= m2 * s;
r3[2] -= m3 * s;
s = r0[3];
r1[3] -= m1 * s;
r2[3] -= m2 * s;
r3[3] -= m3 * s;
s = r0[4];
if (s != 0.0F) {
r1[4] -= m1 * s;
r2[4] -= m2 * s;
r3[4] -= m3 * s;
}
s = r0[5];
if (s != 0.0F) {
r1[5] -= m1 * s;
r2[5] -= m2 * s;
r3[5] -= m3 * s;
}
s = r0[6];
if (s != 0.0F) {
r1[6] -= m1 * s;
r2[6] -= m2 * s;
r3[6] -= m3 * s;
}
s = r0[7];
if (s != 0.0F) {
r1[7] -= m1 * s;
r2[7] -= m2 * s;
r3[7] -= m3 * s;
}
/* choose pivot - or die */
if (fabsf(r3[1]) > fabsf(r2[1]))
SWAP_ROWS(r3, r2);
if (fabsf(r2[1]) > fabsf(r1[1]))
SWAP_ROWS(r2, r1);
if (0.0F == r1[1])
return false;
/* eliminate second variable */
m2 = r2[1] / r1[1];
m3 = r3[1] / r1[1];
r2[2] -= m2 * r1[2];
r3[2] -= m3 * r1[2];
r2[3] -= m2 * r1[3];
r3[3] -= m3 * r1[3];
s = r1[4];
if (0.0F != s) {
r2[4] -= m2 * s;
r3[4] -= m3 * s;
}
s = r1[5];
if (0.0F != s) {
r2[5] -= m2 * s;
r3[5] -= m3 * s;
}
s = r1[6];
if (0.0F != s) {
r2[6] -= m2 * s;
r3[6] -= m3 * s;
}
s = r1[7];
if (0.0F != s) {
r2[7] -= m2 * s;
r3[7] -= m3 * s;
}
/* choose pivot - or die */
if (fabsf(r3[2]) > fabsf(r2[2]))
SWAP_ROWS(r3, r2);
if (0.0F == r2[2])
return false;
/* eliminate third variable */
m3 = r3[2] / r2[2];
r3[3] -= m3 * r2[3], r3[4] -= m3 * r2[4], r3[5] -= m3 * r2[5], r3[6] -= m3 * r2[6],
r3[7] -= m3 * r2[7];
/* last check */
if (0.0F == r3[3])
return false;
s = 1.0F / r3[3]; /* now back substitute row 3 */
r3[4] *= s;
r3[5] *= s;
r3[6] *= s;
r3[7] *= s;
m2 = r2[3]; /* now back substitute row 2 */
s = 1.0F / r2[2];
r2[4] = s * (r2[4] - r3[4] * m2), r2[5] = s * (r2[5] - r3[5] * m2),
r2[6] = s * (r2[6] - r3[6] * m2), r2[7] = s * (r2[7] - r3[7] * m2);
m1 = r1[3];
r1[4] -= r3[4] * m1, r1[5] -= r3[5] * m1, r1[6] -= r3[6] * m1, r1[7] -= r3[7] * m1;
m0 = r0[3];
r0[4] -= r3[4] * m0, r0[5] -= r3[5] * m0, r0[6] -= r3[6] * m0, r0[7] -= r3[7] * m0;
m1 = r1[2]; /* now back substitute row 1 */
s = 1.0F / r1[1];
r1[4] = s * (r1[4] - r2[4] * m1), r1[5] = s * (r1[5] - r2[5] * m1),
r1[6] = s * (r1[6] - r2[6] * m1), r1[7] = s * (r1[7] - r2[7] * m1);
m0 = r0[2];
r0[4] -= r2[4] * m0, r0[5] -= r2[5] * m0, r0[6] -= r2[6] * m0, r0[7] -= r2[7] * m0;
m0 = r0[1]; /* now back substitute row 0 */
s = 1.0F / r0[0];
r0[4] = s * (r0[4] - r1[4] * m0), r0[5] = s * (r0[5] - r1[5] * m0),
r0[6] = s * (r0[6] - r1[6] * m0), r0[7] = s * (r0[7] - r1[7] * m0);
MAT(out, 0, 0) = r0[4];
MAT(out, 0, 1) = r0[5], MAT(out, 0, 2) = r0[6];
MAT(out, 0, 3) = r0[7], MAT(out, 1, 0) = r1[4];
MAT(out, 1, 1) = r1[5], MAT(out, 1, 2) = r1[6];
MAT(out, 1, 3) = r1[7], MAT(out, 2, 0) = r2[4];
MAT(out, 2, 1) = r2[5], MAT(out, 2, 2) = r2[6];
MAT(out, 2, 3) = r2[7], MAT(out, 3, 0) = r3[4];
MAT(out, 3, 1) = r3[5], MAT(out, 3, 2) = r3[6];
MAT(out, 3, 3) = r3[7];
#undef MAT
#undef SWAP_ROWS
return true;
}
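/* Sanity-check sketch (hypothetical, using the column-major layout the
 * MAT() macro above implies): invert a pure translation by (5, 6, 7). */
static inline bool
invert_translation_example(void)
{
   const float t[16] = {
      1, 0, 0, 0,    /* column 0 */
      0, 1, 0, 0,    /* column 1 */
      0, 0, 1, 0,    /* column 2 */
      5, 6, 7, 1     /* column 3: the translation */
   };
   float inv[16];
   /* on success inv translates by (-5, -6, -7); false means singular */
   return util_invert_mat4x4(inv, t);
}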

@ -39,177 +39,24 @@
#define U_MATH_H
#include "pipe/p_compiler.h"
#include "c99_math.h"
#include <assert.h>
#include <float.h>
#include <stdarg.h>
#include "bitscan.h"
#include "u_endian.h" /* for UTIL_ARCH_BIG_ENDIAN */
#ifdef __cplusplus
extern "C" {
#endif
#include <math.h>
#include <float.h>
#include <stdarg.h>
#ifdef PIPE_OS_UNIX
#include <strings.h> /* for ffs */
#endif
#ifndef M_SQRT2
#define M_SQRT2 1.41421356237309504880
#endif
#if defined(_MSC_VER)
#if _MSC_VER < 1400 && !defined(__cplusplus)
static inline float cosf( float f )
{
return (float) cos( (double) f );
}
static inline float sinf( float f )
{
return (float) sin( (double) f );
}
static inline float ceilf( float f )
{
return (float) ceil( (double) f );
}
static inline float floorf( float f )
{
return (float) floor( (double) f );
}
static inline float powf( float f, float g )
{
return (float) pow( (double) f, (double) g );
}
static inline float sqrtf( float f )
{
return (float) sqrt( (double) f );
}
static inline float fabsf( float f )
{
return (float) fabs( (double) f );
}
static inline float logf( float f )
{
return (float) log( (double) f );
}
#else
/* Work around an extra semicolon in the VS 2005 logf definition */
#ifdef logf
#undef logf
#define logf(x) ((float)log((double)(x)))
#endif /* logf */
#if _MSC_VER < 1800
#define isfinite(x) _finite((double)(x))
#define isnan(x) _isnan((double)(x))
#endif /* _MSC_VER < 1800 */
#endif /* _MSC_VER < 1400 && !defined(__cplusplus) */
#if _MSC_VER < 1800
static inline double log2( double x )
{
const double invln2 = 1.442695041;
return log( x ) * invln2;
}
static inline double
round(double x)
{
return x >= 0.0 ? floor(x + 0.5) : ceil(x - 0.5);
}
static inline float
roundf(float x)
{
return x >= 0.0f ? floorf(x + 0.5f) : ceilf(x - 0.5f);
}
#endif
#ifndef INFINITY
#define INFINITY (DBL_MAX + DBL_MAX)
#endif
#ifndef NAN
#define NAN (INFINITY - INFINITY)
#endif
#endif /* _MSC_VER */
#if __STDC_VERSION__ < 199901L && (!defined(__cplusplus) || defined(_MSC_VER))
static inline long int
lrint(double d)
{
long int rounded = (long int)(d + 0.5);
if (d - floor(d) == 0.5) {
if (rounded % 2 != 0)
rounded += (d > 0) ? -1 : 1;
}
return rounded;
}
static inline long int
lrintf(float f)
{
long int rounded = (long int)(f + 0.5f);
if (f - floorf(f) == 0.5f) {
if (rounded % 2 != 0)
rounded += (f > 0) ? -1 : 1;
}
return rounded;
}
static inline long long int
llrint(double d)
{
long long int rounded = (long long int)(d + 0.5);
if (d - floor(d) == 0.5) {
if (rounded % 2 != 0)
rounded += (d > 0) ? -1 : 1;
}
return rounded;
}
static inline long long int
llrintf(float f)
{
long long int rounded = (long long int)(f + 0.5f);
if (f - floorf(f) == 0.5f) {
if (rounded % 2 != 0)
rounded += (f > 0) ? -1 : 1;
}
return rounded;
}
#endif /* C99 */
#define POW2_TABLE_SIZE_LOG2 9
#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2)
#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2)
#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2))
extern float pow2_table[POW2_TABLE_SIZE];
/**
* Initialize math module. This should be called before using any
* other functions in this module.
@ -236,7 +83,8 @@ union di {
* Extract the IEEE float32 exponent.
*/
static inline signed
util_get_float32_exponent(float x) {
util_get_float32_exponent(float x)
{
union fi f;
f.f = x;
@ -245,57 +93,7 @@ util_get_float32_exponent(float x) {
}
/**
* Fast version of 2^x
* Identity: exp2(a + b) = exp2(a) * exp2(b)
* Let ipart = int(x)
* Let fpart = x - ipart;
* So, exp2(x) = exp2(ipart) * exp2(fpart)
* Compute exp2(ipart) with i << ipart
* Compute exp2(fpart) with lookup table.
*/
static inline float
util_fast_exp2(float x)
{
int32_t ipart;
float fpart, mpart;
union fi epart;
if(x > 129.00000f)
return 3.402823466e+38f;
if (x < -126.99999f)
return 0.0f;
ipart = (int32_t) x;
fpart = x - (float) ipart;
/* same as
* epart.f = (float) (1 << ipart)
* but faster and without integer overflow for ipart > 31
*/
epart.i = (ipart + 127 ) << 23;
mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)];
return epart.f * mpart;
}
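/*
 * Worked example of the identity above: for x = 3.5, ipart = 3 and
 * fpart = 0.5, so
 *   exp2(3.5) = exp2(3) * exp2(0.5) = 8 * 1.41421... ~= 11.3137
 * epart builds 8.0f directly in the exponent bits ((3 + 127) << 23) and
 * mpart looks up an approximation of exp2(0.5) in pow2_table.
 */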
/**
* Fast approximation to exp(x).
*/
static inline float
util_fast_exp(float x)
{
const float k = 1.44269f; /* = log2(e) */
return util_fast_exp2(k * x);
}
#if 0
#define LOG2_TABLE_SIZE_LOG2 16
#define LOG2_TABLE_SIZE_LOG2 8
#define LOG2_TABLE_SCALE (1 << LOG2_TABLE_SIZE_LOG2)
#define LOG2_TABLE_SIZE (LOG2_TABLE_SCALE + 1)
extern float log2_table[LOG2_TABLE_SIZE];
@ -317,30 +115,29 @@ util_fast_log2(float x)
}
/**
* Fast approximation to x^y.
*/
static inline float
util_fast_pow(float x, float y)
{
return util_fast_exp2(util_fast_log2(x) * y);
}
#endif
/* Note that this counts zero as a power of two.
*/
static inline boolean
util_is_power_of_two( unsigned v )
{
return (v & (v-1)) == 0;
}
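/*
 * Worked examples of the (v & (v-1)) trick:
 *   8 & 7 == 0b1000 & 0b0111 == 0       -> power of two
 *   6 & 5 == 0b0110 & 0b0101 == 0b0100  -> not a power of two
 *   0 & 0xffffffff == 0, so zero also passes, which is why the
 *   replacement helpers are named "_or_zero" and "_nonzero".
 */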
/**
* Floor(x), returned as int.
*/
static inline int
util_ifloor(float f)
{
#if defined(USE_X86_ASM) && defined(__GNUC__) && defined(__i386__)
/*
* IEEE floor for computers that round to nearest or even.
* 'f' must be between -4194304 and 4194303.
* This floor operation is done by "(iround(f + .5) + iround(f - .5)) >> 1",
* but uses some IEEE specific tricks for better speed.
* Contributed by Josh Vanderhoof
*/
int ai, bi;
double af, bf;
af = (3 << 22) + 0.5 + (double)f;
bf = (3 << 22) + 0.5 - (double)f;
/* GCC generates an extra fstp/fld without this. */
__asm__ ("fstps %0" : "=m" (ai) : "t" (af) : "st");
__asm__ ("fstps %0" : "=m" (bi) : "t" (bf) : "st");
return (ai - bi) >> 1;
#else
int ai, bi;
double af, bf;
union fi u;
@ -349,6 +146,7 @@ util_ifloor(float f)
u.f = (float) af; ai = u.i;
u.f = (float) bf; bi = u.i;
return (ai - bi) >> 1;
#endif
}
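/*
 * Worked example (assuming the default round-to-nearest mode): for
 * f = -1.3, af = 12582912.5 - 1.3 rounds to the float 12582911 and
 * bf = 12582912.5 + 1.3 rounds to 12582914.  In [2^23, 2^24) float bit
 * patterns grow by exactly 1 per unit, so ai - bi == -3 and the
 * arithmetic shift yields floor(-1.3) == -2.
 */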
@ -381,10 +179,10 @@ util_iround(float f)
/**
* Approximate floating point comparison
*/
static inline boolean
static inline bool
util_is_approx(float a, float b, float tol)
{
return fabs(b - a) <= tol;
return fabsf(b - a) <= tol;
}
@ -400,7 +198,7 @@ util_is_approx(float a, float b, float tol)
/**
* Single-float
*/
static inline boolean
static inline bool
util_is_inf_or_nan(float x)
{
union fi tmp;
@ -409,7 +207,7 @@ util_is_inf_or_nan(float x)
}
static inline boolean
static inline bool
util_is_nan(float x)
{
union fi tmp;
@ -434,7 +232,7 @@ util_inf_sign(float x)
/**
* Double-float
*/
static inline boolean
static inline bool
util_is_double_inf_or_nan(double x)
{
union di tmp;
@ -443,7 +241,7 @@ util_is_double_inf_or_nan(double x)
}
static inline boolean
static inline bool
util_is_double_nan(double x)
{
union di tmp;
@ -468,14 +266,14 @@ util_double_inf_sign(double x)
/**
* Half-float
*/
static inline boolean
static inline bool
util_is_half_inf_or_nan(int16_t x)
{
return (x & 0x7c00) == 0x7c00;
}
static inline boolean
static inline bool
util_is_half_nan(int16_t x)
{
return (x & 0x7fff) > 0x7c00;
@ -494,163 +292,84 @@ util_half_inf_sign(int16_t x)
/**
* Find first bit set in word. Least significant bit is 1.
* Return 0 if no bits set.
* Return float bits.
*/
#ifndef FFS_DEFINED
#define FFS_DEFINED 1
#if defined(_MSC_VER) && _MSC_VER >= 1300 && (_M_IX86 || _M_AMD64 || _M_IA64)
unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask);
#pragma intrinsic(_BitScanForward)
static inline
unsigned long ffs( unsigned long u )
{
unsigned long i;
if (_BitScanForward(&i, u))
return i + 1;
else
return 0;
}
#elif defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
static inline
unsigned ffs( unsigned u )
static inline unsigned
fui( float f )
{
unsigned i;
if (u == 0) {
return 0;
union fi fi;
fi.f = f;
return fi.ui;
}
__asm bsf eax, [u]
__asm inc eax
__asm mov [i], eax
return i;
static inline float
uif(uint32_t ui)
{
union fi fi;
fi.ui = ui;
return fi.f;
}
#elif defined(__MINGW32__) || defined(PIPE_OS_ANDROID)
#define ffs __builtin_ffs
#endif
#endif /* FFS_DEFINED */
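/*
 * Round-trip sketch: fui() and uif() just reinterpret bits, e.g.
 *   fui(1.0f)       == 0x3f800000
 *   uif(0x3f800000) == 1.0f
 * so uif(fui(x)) == x bit-for-bit (comparing NaNs with == still fails,
 * as always).
 */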
/**
* Find last bit set in a word. The least significant bit is 1.
* Return 0 if no bits are set.
* Convert uint8_t to float in [0, 1].
*/
static inline unsigned util_last_bit(unsigned u)
static inline float
ubyte_to_float(uint8_t ub)
{
#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304)
return u == 0 ? 0 : 32 - __builtin_clz(u);
#else
unsigned r = 0;
while (u) {
r++;
u >>= 1;
}
return r;
#endif
return (float) ub * (1.0f / 255.0f);
}
/**
* Find last bit in a word that does not match the sign bit. The least
* significant bit is 1.
* Return 0 if no bits are set.
*/
static inline unsigned util_last_bit_signed(int i)
{
#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 407)
return 31 - __builtin_clrsb(i);
#else
if (i >= 0)
return util_last_bit(i);
else
return util_last_bit(~(unsigned)i);
#endif
}
/* Destructively loop over all of the bits in a mask as in:
*
* while (mymask) {
* int i = u_bit_scan(&mymask);
* ... process element i
* }
*
/**
* Convert float in [0,1] to uint8_t in [0,255] with clamping.
*/
static inline int u_bit_scan(unsigned *mask)
static inline uint8_t
float_to_ubyte(float f)
{
int i = ffs(*mask) - 1;
*mask &= ~(1 << i);
return i;
/* return 0 for NaN too */
if (!(f > 0.0f)) {
return (uint8_t) 0;
}
/* For looping over a bitmask when you want to loop over consecutive bits
* manually, for example:
*
* while (mask) {
* int start, count, i;
*
* u_bit_scan_consecutive_range(&mask, &start, &count);
*
* for (i = 0; i < count; i++)
* ... process element (start+i)
* }
*/
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
if (*mask == 0xffffffff) {
*start = 0;
*count = 32;
*mask = 0;
return;
else if (f >= 1.0f) {
return (uint8_t) 255;
}
*start = ffs(*mask) - 1;
*count = ffs(~(*mask >> *start)) - 1;
*mask &= ~(((1u << *count) - 1) << *start);
else {
union fi tmp;
tmp.f = f;
tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f;
return (uint8_t) tmp.i;
}
/**
* Return float bits.
*/
static inline unsigned
fui( float f )
{
union fi fi;
fi.f = f;
return fi.ui;
}
/**
* Convert ubyte to float in [0, 1].
* XXX a 256-entry lookup table would be slightly faster.
* Convert uint16_t to float in [0, 1].
*/
static inline float
ubyte_to_float(ubyte ub)
ushort_to_float(uint16_t us)
{
return (float) ub * (1.0f / 255.0f);
return (float) us * (1.0f / 65535.0f);
}
/**
* Convert float in [0,1] to ubyte in [0,255] with clamping.
* Convert float in [0,1] to uint16_t in [0,65535] with clamping.
*/
static inline ubyte
float_to_ubyte(float f)
static inline uint16_t
float_to_ushort(float f)
{
union fi tmp;
tmp.f = f;
if (tmp.i < 0) {
return (ubyte) 0;
/* return 0 for NaN too */
if (!(f > 0.0f)) {
return (uint16_t) 0;
}
else if (tmp.i >= 0x3f800000 /* 1.0f */) {
return (ubyte) 255;
else if (f >= 1.0f) {
return (uint16_t) 65535;
}
else {
tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f;
return (ubyte) tmp.i;
union fi tmp;
tmp.f = f;
tmp.f = tmp.f * (65535.0f/65536.0f) + 128.0f;
return (uint16_t) tmp.i;
}
}
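/*
 * Worked example of the bias trick (default round-to-nearest assumed):
 * float_to_ubyte(0.5f) computes 0.5 * 255/256 + 32768 = 32768.498...;
 * 32768.0f is 2^15, so one float ulp there is 2^15 / 2^23 = 1/256, the
 * sum rounds to 128 ulps above 2^15, and the uint8_t cast of the low
 * bits returns 128.  float_to_ushort plays the same game around 128.0f,
 * where one ulp is 1/65536.
 */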
@ -672,7 +391,7 @@ float_to_byte_tex(float f)
static inline unsigned
util_logbase2(unsigned n)
{
#if defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 304)
#if defined(HAVE___BUILTIN_CLZ)
return ((sizeof(unsigned) * 8 - 1) - __builtin_clz(n | 1));
#else
unsigned pos = 0;
@ -685,6 +404,44 @@ util_logbase2(unsigned n)
#endif
}
static inline uint64_t
util_logbase2_64(uint64_t n)
{
#if defined(HAVE___BUILTIN_CLZLL)
return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1));
#else
uint64_t pos = 0ull;
if (n >= 1ull<<32) { n >>= 32; pos += 32; }
if (n >= 1ull<<16) { n >>= 16; pos += 16; }
if (n >= 1ull<< 8) { n >>= 8; pos += 8; }
if (n >= 1ull<< 4) { n >>= 4; pos += 4; }
if (n >= 1ull<< 2) { n >>= 2; pos += 2; }
if (n >= 1ull<< 1) { pos += 1; }
return pos;
#endif
}
/**
* Returns the ceiling of log n base 2, and 0 when n == 0. Equivalently,
* returns the smallest x such that n <= 2**x.
*/
static inline unsigned
util_logbase2_ceil(unsigned n)
{
if (n <= 1)
return 0;
return 1 + util_logbase2(n - 1);
}
static inline uint64_t
util_logbase2_ceil64(uint64_t n)
{
if (n <= 1)
return 0;
return 1ull + util_logbase2_64(n - 1);
}
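/*
 * Worked examples:
 *   util_logbase2(8)      == 3    (floor of log2)
 *   util_logbase2(9)      == 3
 *   util_logbase2_ceil(9) == 4    (smallest x with 9 <= 2^x)
 *   util_logbase2(0)      == 0    (the n | 1 keeps __builtin_clz away
 *                                  from its undefined n == 0 case)
 */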
/**
* Returns the smallest power of two >= x
@ -692,7 +449,7 @@ util_logbase2(unsigned n)
static inline unsigned
util_next_power_of_two(unsigned x)
{
#if defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 304)
#if defined(HAVE___BUILTIN_CLZ)
if (x <= 1)
return 1;
@ -703,7 +460,7 @@ util_next_power_of_two(unsigned x)
if (x <= 1)
return 1;
if (util_is_power_of_two(x))
if (util_is_power_of_two_or_zero(x))
return x;
val--;
@ -717,27 +474,32 @@ util_next_power_of_two(unsigned x)
#endif
}
/**
* Return number of bits set in n.
*/
static inline unsigned
util_bitcount(unsigned n)
static inline uint64_t
util_next_power_of_two64(uint64_t x)
{
#if defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 304)
return __builtin_popcount(n);
#if defined(HAVE___BUILTIN_CLZLL)
if (x <= 1)
return 1;
return (1ull << ((sizeof(uint64_t) * 8) - __builtin_clzll(x - 1)));
#else
/* K&R classic bitcount.
*
* For each iteration, clear the LSB from the bitfield.
* Requires only one iteration per set bit, instead of
* one iteration per bit less than highest set bit.
*/
unsigned bits = 0;
for (bits; n; bits++) {
n &= n - 1;
}
return bits;
uint64_t val = x;
if (x <= 1)
return 1;
if (util_is_power_of_two_or_zero64(x))
return x;
val--;
val = (val >> 1) | val;
val = (val >> 2) | val;
val = (val >> 4) | val;
val = (val >> 8) | val;
val = (val >> 16) | val;
val = (val >> 32) | val;
val++;
return val;
#endif
}
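/*
 * Worked examples:
 *   util_next_power_of_two(5) == 8
 *   util_next_power_of_two(8) == 8   (already a power of two)
 *   util_next_power_of_two(0) == 1   (x <= 1 short-circuits to 1)
 * The 64-bit variant adds the extra (val >> 32) fold for the same effect
 * above 2^32.
 */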
@ -781,8 +543,7 @@ util_bitreverse(unsigned n)
static inline uint32_t
util_bswap32(uint32_t n)
{
/* We need the gcc version checks for non-autoconf build system */
#if defined(HAVE___BUILTIN_BSWAP32) || (defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 403))
#if defined(HAVE___BUILTIN_BSWAP32)
return __builtin_bswap32(n);
#else
return (n >> 24) |
@ -801,7 +562,7 @@ util_bswap64(uint64_t n)
#if defined(HAVE___BUILTIN_BSWAP64)
return __builtin_bswap64(n);
#else
return ((uint64_t)util_bswap32(n) << 32) |
return ((uint64_t)util_bswap32((uint32_t)n) << 32) |
util_bswap32((n >> 32));
#endif
}
@ -817,6 +578,37 @@ util_bswap16(uint16_t n)
(n << 8);
}
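/*
 * Worked examples:
 *   util_bswap32(0x12345678) == 0x78563412
 *   util_bswap16(0x1234)     == 0x3412
 * The 64-bit fallback swaps the two 32-bit halves and byte-swaps each,
 * which is why the low half is cast to uint32_t first.
 */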
/**
* Extend sign.
*/
static inline int64_t
util_sign_extend(uint64_t val, unsigned width)
{
assert(width > 0);
if (val & (UINT64_C(1) << (width - 1))) {
return -(int64_t)((UINT64_C(1) << width) - val);
} else {
return val;
}
}
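/*
 * Worked examples for an 8-bit field:
 *   util_sign_extend(0xff, 8) == -1    (bit 7 set: -(0x100 - 0xff))
 *   util_sign_extend(0x7f, 8) == 127   (sign bit clear: unchanged)
 */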
static inline void*
util_memcpy_cpu_to_le32(void * restrict dest, const void * restrict src, size_t n)
{
#if UTIL_ARCH_BIG_ENDIAN
size_t i, e;
assert(n % 4 == 0);
for (i = 0, e = n / 4; i < e; i++) {
uint32_t * restrict d = (uint32_t* restrict)dest;
const uint32_t * restrict s = (const uint32_t* restrict)src;
d[i] = util_bswap32(s[i]);
}
return dest;
#else
return memcpy(dest, src, n);
#endif
}
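/*
 * Usage sketch (hypothetical buffer names): n must be a multiple of 4;
 * on big-endian builds every 32-bit word is swapped so the destination
 * is little-endian either way.
 *   uint32_t cmds[2] = { 0x11223344, 0x55667788 };
 *   util_memcpy_cpu_to_le32(ring_dst, cmds, sizeof(cmds));
 */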
/**
* Clamp X to [MIN, MAX].
@ -825,6 +617,9 @@ util_bswap16(uint16_t n)
*/
#define CLAMP( X, MIN, MAX ) ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
/* Syntax sugar occurring frequently in graphics code */
#define SATURATE( X ) CLAMP(X, 0.0f, 1.0f)
#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) )
#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) )
@ -835,6 +630,56 @@ util_bswap16(uint16_t n)
#define MAX4( A, B, C, D ) ((A) > (B) ? MAX3(A, C, D) : MAX3(B, C, D))
/**
* Align a value up to an alignment value
*
* If \c value is not already aligned to the requested alignment value, it
* will be rounded up.
*
* \param value Value to be rounded
* \param alignment Alignment value to be used. This must be a power of two.
*
* \sa ROUND_DOWN_TO()
*/
#if defined(ALIGN)
#undef ALIGN
#endif
static inline uintptr_t
ALIGN(uintptr_t value, int32_t alignment)
{
assert(util_is_power_of_two_nonzero(alignment));
return (((value) + (alignment) - 1) & ~((alignment) - 1));
}
/**
* Like ALIGN(), but works with a non-power-of-two alignment.
*/
static inline uintptr_t
ALIGN_NPOT(uintptr_t value, int32_t alignment)
{
assert(alignment > 0);
return (value + alignment - 1) / alignment * alignment;
}
/**
* Align a value down to an alignment value
*
* If \c value is not already aligned to the requested alignment value, it
* will be rounded down.
*
* \param value Value to be rounded
* \param alignment Alignment value to be used. This must be a power of two.
*
* \sa ALIGN()
*/
static inline uint64_t
ROUND_DOWN_TO(uint64_t value, int32_t alignment)
{
assert(util_is_power_of_two_nonzero(alignment));
return ((value) & ~(alignment - 1));
}
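/*
 * Worked examples:
 *   ALIGN(13, 8)         == 16   ((13 + 7) & ~7)
 *   ALIGN_NPOT(13, 6)    == 18   ((13 + 5) / 6 * 6)
 *   ROUND_DOWN_TO(13, 8) == 8    (13 & ~7)
 */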
/**
* Align a value; only works with power-of-two alignments.
*/
@ -844,6 +689,12 @@ align(int value, int alignment)
return (value + alignment - 1) & ~(alignment - 1);
}
static inline uint64_t
align64(uint64_t value, unsigned alignment)
{
return (value + alignment - 1) & ~((uint64_t)alignment - 1);
}
/**
* Works like align but on npot alignments.
*/
@ -888,12 +739,14 @@ do { \
#endif
static inline uint32_t util_unsigned_fixed(float value, unsigned frac_bits)
static inline uint32_t
util_unsigned_fixed(float value, unsigned frac_bits)
{
return value < 0 ? 0 : (uint32_t)(value * (1<<frac_bits));
}
static inline int32_t util_signed_fixed(float value, unsigned frac_bits)
static inline int32_t
util_signed_fixed(float value, unsigned frac_bits)
{
return (int32_t)(value * (1<<frac_bits));
}
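/*
 * Worked examples with 8 fractional bits (a 24.8 fixed-point sketch):
 *   util_unsigned_fixed(1.5f, 8)  == 384   (1.5 * 256)
 *   util_unsigned_fixed(-2.0f, 8) == 0     (negatives clamp to zero)
 *   util_signed_fixed(-1.5f, 8)   == -384  (truncating cast keeps sign)
 */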
@ -905,7 +758,41 @@ util_fpstate_set_denorms_to_zero(unsigned current_fpstate);
void
util_fpstate_set(unsigned fpstate);
/**
* For indexed draw calls, return true if the vertex count to be drawn is
* much lower than the vertex count that has to be uploaded, meaning
* that the driver should flatten indices instead of trying to upload
* too large a range.
*
* This is used by vertex upload code in u_vbuf and glthread.
*/
static inline bool
util_is_vbo_upload_ratio_too_large(unsigned draw_vertex_count,
unsigned upload_vertex_count)
{
if (draw_vertex_count > 1024)
return upload_vertex_count > draw_vertex_count * 4;
else if (draw_vertex_count > 32)
return upload_vertex_count > draw_vertex_count * 8;
else
return upload_vertex_count > draw_vertex_count * 16;
}
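/*
 * Worked example: an indexed draw of 100 vertices whose indices span a
 * 2000-vertex range falls in the middle bucket (draw_vertex_count > 32),
 * so the limit is 8x; 2000 > 100 * 8, hence the upload is judged too
 * large and the caller should flatten the indices instead.
 */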
bool util_invert_mat4x4(float *out, const float *m);
/* Quantize the lod bias value to reduce the number of sampler state
* variants in gallium because apps use it for smooth mipmap transitions,
* thrashing cso_cache and degrading performance.
*
* This quantization matches the AMD hw specification, so having more
* precision would have no effect anyway.
*/
static inline float
util_quantize_lod_bias(float lod)
{
lod = CLAMP(lod, -16, 16);
return roundf(lod * 256) / 256;
}
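/*
 * Worked example: util_quantize_lod_bias(0.3f) returns
 * roundf(0.3 * 256) / 256 = 77 / 256 = 0.30078125, so any bias within
 * half a 1/256 step maps to the same sampler state.
 */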
#ifdef __cplusplus
}

@ -258,7 +258,7 @@ vkr_region_size(const struct vkr_region *region)
static inline bool
vkr_region_is_aligned(const struct vkr_region *region, size_t align)
{
assert(align && util_is_power_of_two(align));
assert(util_is_power_of_two_nonzero(align));
return !((region->begin | region->end) & (align - 1));
}

@ -66,7 +66,7 @@ vkr_ring_init_buffer(struct vkr_ring *ring, const struct vkr_ring_layout *layout
&buf->base_iov_offset);
buf->size = vkr_region_size(&layout->buffer);
assert(buf->size && util_is_power_of_two(buf->size));
assert(util_is_power_of_two_nonzero(buf->size));
buf->mask = buf->size - 1;
buf->cur = 0;

@ -217,8 +217,7 @@ vkr_ring_layout_init(struct vkr_ring_layout *layout,
}
const size_t buf_size = vkr_region_size(&layout->buffer);
if (!buf_size || buf_size > VKR_RING_BUFFER_MAX_SIZE ||
!util_is_power_of_two(buf_size)) {
if (buf_size > VKR_RING_BUFFER_MAX_SIZE || !util_is_power_of_two_nonzero(buf_size)) {
vkr_log("ring buffer size (%lu) must be a power of two and not exceed %lu",
buf_size, VKR_RING_BUFFER_MAX_SIZE);
return false;

@ -233,13 +233,6 @@ static int vrend_decode_clear_texture(struct vrend_context *ctx, const uint32_t
return 0;
}
static float uif(unsigned int ui)
{
union { float f; unsigned int ui; } myuif;
myuif.ui = ui;
return myuif.f;
}
static int vrend_decode_set_viewport_state(struct vrend_context *ctx, const uint32_t *buf, uint32_t length)
{
struct pipe_viewport_state vps[PIPE_MAX_VIEWPORTS];
