mesa: update to the latest u_math.h

uif() is now defined.  util_is_power_of_two() is replaced by other variants
(util_is_power_of_two_or_zero() and util_is_power_of_two_nonzero()).

Signed-off-by: Chia-I Wu <olvaffe@gmail.com>
Reviewed-by: Yiwei Zhang <zzyiwei@chromium.org>
Reviewed-by: Ryan Neph <ryanneph@google.com>
Acked-by: Gert Wollny <gert.wollny@collabora.com>
Branch: macos/master
Author: Chia-I Wu, 3 years ago
Parent: 9526a95d47
Commit: 10b89464a3
16 files changed (lines changed per file):

  1. config.h.meson | 8
  2. meson.build | 3
  3. src/gallium/auxiliary/util/u_cpu_detect.c | 458
  4. src/gallium/auxiliary/util/u_math.c | 139
  5. src/gallium/meson.build | 4
  6. src/mesa/meson.build | 3
  7. src/mesa/util/bitscan.c | 80
  8. src/mesa/util/bitscan.h | 356
  9. src/mesa/util/u_cpu_detect.c | 865
 10. src/mesa/util/u_cpu_detect.h | 71
 11. src/mesa/util/u_math.c | 311
 12. src/mesa/util/u_math.h | 667
 13. src/venus/vkr_common.h | 2
 14. src/venus/vkr_ring.c | 2
 15. src/venus/vkr_transport.c | 3
 16. src/vrend_decode.c | 7

config.h.meson
@@ -1,7 +1,15 @@
#mesondefine VERSION
#mesondefine _GNU_SOURCE
#mesondefine VIRGL_RENDERER_UNSTABLE_APIS
#mesondefine HAVE___BUILTIN_BSWAP32
#mesondefine HAVE___BUILTIN_BSWAP64
#mesondefine HAVE___BUILTIN_CLZ
#mesondefine HAVE___BUILTIN_CLZLL
#mesondefine HAVE___BUILTIN_EXPECT
#mesondefine HAVE___BUILTIN_FFS
#mesondefine HAVE___BUILTIN_FFSLL
#mesondefine HAVE___BUILTIN_POPCOUNT
#mesondefine HAVE___BUILTIN_POPCOUNTLL
#mesondefine HAVE___BUILTIN_TYPES_COMPATIBLE_P
#mesondefine HAVE___BUILTIN_UNREACHABLE
#mesondefine HAVE_FUNC_ATTRIBUTE_CONST

meson.build
@@ -129,7 +129,8 @@ if cc.has_header('sys/select.h')
conf_data.set('HAVE_SYS_SELECT_H', 1)
endif
foreach b : ['expect', 'types_compatible_p', 'unreachable']
foreach b : ['bswap32', 'bswap64', 'clz', 'clzll', 'expect', 'ffs', 'ffsll',
'popcount', 'popcountll', 'types_compatible_p', 'unreachable']
if cc.has_function(b)
conf_data.set('HAVE___BUILTIN_@0@'.format(b.to_upper()), 1)
endif

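Context, not part of the diff: the HAVE___BUILTIN_* macros checked here land in config.h and are consumed in the pattern bitscan.h uses below. A minimal sketch (last_bit() is a hypothetical name, mirroring util_last_bit() from bitscan.h):

#include "config.h"

/* Prefer the compiler builtin when meson detected it; otherwise fall
 * back to a portable loop. */
static inline unsigned
last_bit(unsigned u)
{
#ifdef HAVE___BUILTIN_CLZ
   return u == 0 ? 0 : 32 - __builtin_clz(u);
#else
   unsigned r = 0;
   while (u) {
      r++;
      u >>= 1;
   }
   return r;
#endif
}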
src/gallium/auxiliary/util/u_cpu_detect.c (file deleted)
@@ -1,458 +0,0 @@
/**************************************************************************
*
* Copyright 2008 Dennis Smit
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/**
* @file
* CPU feature detection.
*
* @author Dennis Smit
* @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
*/
#include "pipe/p_config.h"
#include "u_debug.h"
#include "u_cpu_detect.h"
#if defined(PIPE_ARCH_PPC)
#if defined(PIPE_OS_APPLE)
#include <sys/sysctl.h>
#else
#include <signal.h>
#include <setjmp.h>
#endif
#endif
#if defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif
#if defined(PIPE_OS_FREEBSD)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if defined(PIPE_OS_LINUX)
#include <signal.h>
#endif
#ifdef PIPE_OS_UNIX
#include <unistd.h>
#endif
#if defined(PIPE_OS_WINDOWS)
#include <windows.h>
#if defined(PIPE_CC_MSVC)
#include <intrin.h>
#endif
#endif
#ifdef DEBUG
DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE)
#endif
struct util_cpu_caps util_cpu_caps;
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
static int has_cpuid(void);
#endif
#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE)
static jmp_buf __lv_powerpc_jmpbuf;
static volatile sig_atomic_t __lv_powerpc_canjump = 0;
static void
sigill_handler(int sig)
{
if (!__lv_powerpc_canjump) {
signal (sig, SIG_DFL);
raise (sig);
}
__lv_powerpc_canjump = 0;
longjmp(__lv_powerpc_jmpbuf, 1);
}
#endif
#if defined(PIPE_ARCH_PPC)
static void
check_os_altivec_support(void)
{
#if defined(PIPE_OS_APPLE)
int sels[2] = {CTL_HW, HW_VECTORUNIT};
int has_vu = 0;
int len = sizeof (has_vu);
int err;
err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
if (err == 0) {
if (has_vu != 0) {
util_cpu_caps.has_altivec = 1;
}
}
#else /* !PIPE_OS_APPLE */
/* not on Apple/Darwin, do it the brute-force way */
/* this is borrowed from the libmpeg2 library */
signal(SIGILL, sigill_handler);
if (setjmp(__lv_powerpc_jmpbuf)) {
signal(SIGILL, SIG_DFL);
} else {
__lv_powerpc_canjump = 1;
__asm __volatile
("mtspr 256, %0\n\t"
"vand %%v0, %%v0, %%v0"
:
: "r" (-1));
signal(SIGILL, SIG_DFL);
util_cpu_caps.has_altivec = 1;
}
#endif /* !PIPE_OS_APPLE */
}
#endif /* PIPE_ARCH_PPC */
#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
static int has_cpuid(void)
{
#if defined(PIPE_ARCH_X86)
#if defined(PIPE_OS_GCC)
int a, c;
__asm __volatile
("pushf\n"
"popl %0\n"
"movl %0, %1\n"
"xorl $0x200000, %0\n"
"push %0\n"
"popf\n"
"pushf\n"
"popl %0\n"
: "=a" (a), "=c" (c)
:
: "cc");
return a != c;
#else
/* FIXME */
return 1;
#endif
#elif defined(PIPE_ARCH_X86_64)
return 1;
#else
return 0;
#endif
}
/**
* @sa cpuid.h included in gcc-4.3 onwards.
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
*/
static inline void
cpuid(uint32_t ax, uint32_t *p)
{
#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86)
__asm __volatile (
"xchgl %%ebx, %1\n\t"
"cpuid\n\t"
"xchgl %%ebx, %1"
: "=a" (p[0]),
"=S" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax)
);
#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86_64)
__asm __volatile (
"cpuid\n\t"
: "=a" (p[0]),
"=b" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax)
);
#elif defined(PIPE_CC_MSVC)
__cpuid(p, ax);
#else
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
#endif
}
/**
* @sa cpuid.h included in gcc-4.4 onwards.
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
*/
static inline void
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
{
#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86)
__asm __volatile (
"xchgl %%ebx, %1\n\t"
"cpuid\n\t"
"xchgl %%ebx, %1"
: "=a" (p[0]),
"=S" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax), "2" (cx)
);
#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86_64)
__asm __volatile (
"cpuid\n\t"
: "=a" (p[0]),
"=b" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax), "2" (cx)
);
#elif defined(PIPE_CC_MSVC)
__cpuidex(p, ax, cx);
#else
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
#endif
}
static inline uint64_t xgetbv(void)
{
#if defined(PIPE_CC_GCC)
uint32_t eax, edx;
__asm __volatile (
".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
: "=a"(eax),
"=d"(edx)
: "c"(0)
);
return ((uint64_t)edx << 32) | eax;
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
return _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#else
return 0;
#endif
}
#if defined(PIPE_ARCH_X86)
static inline boolean sse2_has_daz(void)
{
struct {
uint32_t pad1[7];
uint32_t mxcsr_mask;
uint32_t pad2[128-8];
} PIPE_ALIGN_VAR(16) fxarea;
fxarea.mxcsr_mask = 0;
#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO))
__asm __volatile ("fxsave %0" : "+m" (fxarea));
#elif (defined(PIPE_CC_MSVC) && _MSC_VER >= 1700) || defined(PIPE_CC_ICL)
/* 1700 = Visual Studio 2012 */
_fxsave(&fxarea);
#else
fxarea.mxcsr_mask = 0;
#endif
return !!(fxarea.mxcsr_mask & (1 << 6));
}
#endif
#endif /* X86 or X86_64 */
void
util_cpu_detect(void)
{
static boolean util_cpu_detect_initialized = FALSE;
if(util_cpu_detect_initialized)
return;
memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
/* Count the number of CPUs in system */
#if defined(PIPE_OS_WINDOWS)
{
SYSTEM_INFO system_info;
GetSystemInfo(&system_info);
util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors;
}
#elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
if (util_cpu_caps.nr_cpus == -1)
util_cpu_caps.nr_cpus = 1;
#elif defined(PIPE_OS_BSD)
{
int mib[2], ncpu;
int len;
mib[0] = CTL_HW;
mib[1] = HW_NCPU;
len = sizeof (ncpu);
sysctl(mib, 2, &ncpu, &len, NULL, 0);
util_cpu_caps.nr_cpus = ncpu;
}
#else
util_cpu_caps.nr_cpus = 1;
#endif
/* Make the fallback cacheline size nonzero so that it can be
* safely passed to align().
*/
util_cpu_caps.cacheline = sizeof(void *);
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
if (has_cpuid()) {
uint32_t regs[4];
uint32_t regs2[4];
util_cpu_caps.cacheline = 32;
/* Get max cpuid level */
cpuid(0x00000000, regs);
if (regs[0] >= 0x00000001) {
unsigned int cacheline;
cpuid (0x00000001, regs2);
util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
if (util_cpu_caps.x86_cpu_type == 0xf)
util_cpu_caps.x86_cpu_type = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */
/* general feature flags */
util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */
util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */
util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */
util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */
util_cpu_caps.has_sse3 = (regs2[2] >> 0) & 1; /* 0x0000001 */
util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000020 */
util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1;
util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX
((regs2[2] >> 27) & 1) && // OSXSAVE
((xgetbv() & 6) == 6); // XMM & YMM
util_cpu_caps.has_f16c = (regs2[2] >> 29) & 1;
util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE CPUs support mmxext too */
#if defined(PIPE_ARCH_X86_64)
util_cpu_caps.has_daz = 1;
#else
util_cpu_caps.has_daz = util_cpu_caps.has_sse3 ||
(util_cpu_caps.has_sse2 && sse2_has_daz());
#endif
cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
if (cacheline > 0)
util_cpu_caps.cacheline = cacheline;
}
if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) {
uint32_t regs7[4];
cpuid_count(0x00000007, 0x00000000, regs7);
util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1;
}
if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
/* GenuineIntel */
util_cpu_caps.has_intel = 1;
}
cpuid(0x80000000, regs);
if (regs[0] >= 0x80000001) {
cpuid(0x80000001, regs2);
util_cpu_caps.has_mmx |= (regs2[3] >> 23) & 1;
util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1;
util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1;
util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1;
util_cpu_caps.has_xop = util_cpu_caps.has_avx &&
((regs2[2] >> 11) & 1);
}
if (regs[0] >= 0x80000006) {
cpuid(0x80000006, regs2);
util_cpu_caps.cacheline = regs2[2] & 0xFF;
}
if (!util_cpu_caps.has_sse) {
util_cpu_caps.has_sse2 = 0;
util_cpu_caps.has_sse3 = 0;
util_cpu_caps.has_ssse3 = 0;
util_cpu_caps.has_sse4_1 = 0;
}
}
#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
#if defined(PIPE_ARCH_PPC)
check_os_altivec_support();
#endif /* PIPE_ARCH_PPC */
#ifdef DEBUG
if (debug_get_option_dump_cpu()) {
debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
debug_printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
debug_printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
}
#endif
util_cpu_detect_initialized = TRUE;
}

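For reference, a sketch (not part of the commit, written as if appended to the file above so it can use the cpuid() helper) of how the GenuineIntel check further down works: the 12-byte vendor string comes back in EBX, EDX, ECX, i.e. regs[1], regs[3], regs[2].

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void
print_cpu_vendor(void)
{
   uint32_t regs[4];
   char vendor[13];

   cpuid(0x00000000, regs); /* the cpuid() helper defined above */

   /* Vendor string is returned in EBX, EDX, ECX order. */
   memcpy(vendor + 0, &regs[1], 4);
   memcpy(vendor + 4, &regs[3], 4);
   memcpy(vendor + 8, &regs[2], 4);
   vendor[12] = '\0';

   printf("vendor: %s\n", vendor); /* e.g. "GenuineIntel" */
}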
src/gallium/auxiliary/util/u_math.c (file deleted)
@@ -1,139 +0,0 @@
/**************************************************************************
*
* Copyright 2008 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#include "pipe/p_config.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
/* This is defined in pmmintrin.h, but it can only be included when -msse3 is
* used, so just define it here to avoid further mess. */
#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif
#endif
#if 0
/** 2^x, for x in [-1.0, 1.0) */
float pow2_table[POW2_TABLE_SIZE];
static void
init_pow2_table(void)
{
int i;
for (i = 0; i < POW2_TABLE_SIZE; i++)
pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
}
/** log2(x), for x in [1.0, 2.0) */
float log2_table[LOG2_TABLE_SIZE];
static void
init_log2_table(void)
{
unsigned i;
for (i = 0; i < LOG2_TABLE_SIZE; i++)
log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE));
}
#endif
/**
* One time init for math utilities.
*/
void
util_init_math(void)
{
static boolean initialized = FALSE;
if (!initialized) {
// init_pow2_table();
/* init_log2_table();*/
initialized = TRUE;
}
}
/**
* Fetches the contents of the fpstate (mxcsr on x86) register.
*
* On platforms without support for it, this just returns 0.
*/
unsigned
util_fpstate_get(void)
{
unsigned mxcsr = 0;
#if defined(PIPE_ARCH_SSE)
if (util_cpu_caps.has_sse) {
mxcsr = _mm_getcsr();
}
#endif
return mxcsr;
}
/**
* Make sure that the FPU treats denormalized floating
* point numbers as zero.
*
* This is the behavior required by D3D10. OpenGL doesn't care.
*/
unsigned
util_fpstate_set_denorms_to_zero(unsigned current_mxcsr)
{
#if defined(PIPE_ARCH_SSE)
if (util_cpu_caps.has_sse) {
/* Enable flush to zero mode */
current_mxcsr |= _MM_FLUSH_ZERO_MASK;
if (util_cpu_caps.has_daz) {
/* Enable denormals are zero mode */
current_mxcsr |= _MM_DENORMALS_ZERO_MASK;
}
util_fpstate_set(current_mxcsr);
}
#endif
return current_mxcsr;
}
/**
* Set the state of the fpstate (mxcsr on x86) register.
*
* On platforms without support for it, this is a no-op.
*/
void
util_fpstate_set(unsigned mxcsr)
{
#if defined(PIPE_ARCH_SSE)
if (util_cpu_caps.has_sse) {
_mm_setcsr(mxcsr);
}
#endif
}

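A sketch (not part of the commit) of the usual save/modify/restore pattern for the fpstate helpers above; run_with_denorms_flushed() is a hypothetical wrapper:

#include "util/u_math.h"

static void
run_with_denorms_flushed(void (*fn)(void *), void *data)
{
   /* Save mxcsr, force FTZ/DAZ for the duration of fn(), then restore. */
   unsigned saved = util_fpstate_get();
   util_fpstate_set_denorms_to_zero(saved);
   fn(data);
   util_fpstate_set(saved);
}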
src/gallium/meson.build
@@ -32,7 +32,6 @@ sources_libgallium = [
'auxiliary/util/u_format.h',
'auxiliary/util/u_rect.h',
'auxiliary/util/u_surface.h',
'auxiliary/util/u_math.h',
'auxiliary/util/rgtc.h',
'auxiliary/util/u_format.c',
'auxiliary/util/u_inlines.h',
@@ -44,16 +43,13 @@ sources_libgallium = [
'auxiliary/util/u_texture.h',
'auxiliary/util/u_hash_table.h',
'auxiliary/util/u_box.h',
'auxiliary/util/u_cpu_detect.c',
'auxiliary/util/u_pack_color.h',
'auxiliary/util/u_double_list.h',
'auxiliary/util/u_debug_refcnt.h',
'auxiliary/util/u_bitmask.c',
'auxiliary/util/u_cpu_detect.h',
'auxiliary/util/u_bitmask.h',
'auxiliary/util/u_format_s3tc.h',
'auxiliary/util/u_surface.c',
'auxiliary/util/u_math.c',
'auxiliary/util/u_half.h',
'auxiliary/util/u_prim.h',
'auxiliary/util/u_debug_describe.c',

src/mesa/meson.build
@@ -4,9 +4,12 @@
inc_mesa = include_directories('.', 'compat', 'pipe', 'util')
files_mesa = files(
'util/bitscan.c',
'util/os_file.c',
'util/os_misc.c',
'util/u_cpu_detect.c',
'util/u_debug.c',
'util/u_math.c',
)
deps_mesa = [

src/mesa/util/bitscan.c (new file)
@@ -0,0 +1,80 @@
/**************************************************************************
*
* Copyright 2008 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#include "bitscan.h"
#ifdef HAVE___BUILTIN_FFS
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
#else
int
ffs(int i)
{
int bit = 0;
if (!i)
return bit;
if (!(i & 0xffff)) {
bit += 16;
i >>= 16;
}
if (!(i & 0xff)) {
bit += 8;
i >>= 8;
}
if (!(i & 0xf)) {
bit += 4;
i >>= 4;
}
if (!(i & 0x3)) {
bit += 2;
i >>= 2;
}
if (!(i & 0x1))
bit += 1;
return bit + 1;
}
#endif
#ifdef HAVE___BUILTIN_FFSLL
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
#else
int
ffsll(long long int val)
{
int bit;
bit = ffs((unsigned) (val & 0xffffffff));
if (bit != 0)
return bit;
bit = ffs((unsigned) (val >> 32));
if (bit != 0)
return 32 + bit;
return 0;
}
#endif

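A quick sketch (not part of the commit) of the contract these fallbacks implement; the builtin and MSVC paths selected in bitscan.h satisfy the same one:

#include <assert.h>
#include "bitscan.h"

static void
check_ffs_contract(void)
{
   /* ffs()/ffsll() return the 1-based index of the least significant
    * set bit, or 0 when no bit is set. */
   assert(ffs(0) == 0);
   assert(ffs(1) == 1);
   assert(ffs(8) == 4);
   assert(ffsll(1ull << 40) == 41);
}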
src/mesa/util/bitscan.h (new file)
@@ -0,0 +1,356 @@
/**************************************************************************
*
* Copyright 2008 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#ifndef BITSCAN_H
#define BITSCAN_H
#include <assert.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#if defined(__POPCNT__)
#include <popcntintrin.h>
#endif
#include "c99_compat.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* Find first bit set in word. Least significant bit is 1.
* Return 0 if no bits set.
*/
#ifdef HAVE___BUILTIN_FFS
#define ffs __builtin_ffs
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
static inline
int ffs(int i)
{
unsigned long index;
if (_BitScanForward(&index, i))
return index + 1;
else
return 0;
}
#else
extern
int ffs(int i);
#endif
#ifdef HAVE___BUILTIN_FFSLL
#define ffsll __builtin_ffsll
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
static inline int
ffsll(long long int i)
{
unsigned long index;
if (_BitScanForward64(&index, i))
return index + 1;
else
return 0;
}
#else
extern int
ffsll(long long int val);
#endif
/* Destructively loop over all of the bits in a mask as in:
*
* while (mymask) {
* int i = u_bit_scan(&mymask);
* ... process element i
* }
*
*/
static inline int
u_bit_scan(unsigned *mask)
{
const int i = ffs(*mask) - 1;
*mask ^= (1u << i);
return i;
}
#define u_foreach_bit(b, dword) \
for (uint32_t __dword = (dword), b; \
((b) = ffs(__dword) - 1, __dword); \
__dword &= ~(1 << (b)))
static inline int
u_bit_scan64(uint64_t *mask)
{
const int i = ffsll(*mask) - 1;
*mask ^= (((uint64_t)1) << i);
return i;
}
#define u_foreach_bit64(b, dword) \
for (uint64_t __dword = (dword), b; \
((b) = ffsll(__dword) - 1, __dword); \
__dword &= ~(1ull << (b)))
/* Determine if an unsigned value is a power of two.
*
* \note
* Zero is treated as a power of two.
*/
static inline bool
util_is_power_of_two_or_zero(unsigned v)
{
return (v & (v - 1)) == 0;
}
/* Determine if an uint64_t value is a power of two.
*
* \note
* Zero is treated as a power of two.
*/
static inline bool
util_is_power_of_two_or_zero64(uint64_t v)
{
return (v & (v - 1)) == 0;
}
/* Determine if an unsigned value is a power of two.
*
* \note
* Zero is \b not treated as a power of two.
*/
static inline bool
util_is_power_of_two_nonzero(unsigned v)
{
/* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT. The latter
* indicates the existence of the __builtin_popcount function. The former
* indicates that _mm_popcnt_u32 exists and is a native instruction.
*
* The other alternative is to use SSE 4.2 compile-time flags. This has
* two drawbacks. First, there is currently no build infrastructure for
* SSE 4.2 (only 4.1), so that would have to be added. Second, some AMD
* CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona).
*/
#ifdef __POPCNT__
return _mm_popcnt_u32(v) == 1;
#else
return v != 0 && (v & (v - 1)) == 0;
#endif
}
/* For looping over a bitmask when you want to loop over consecutive bits
* manually, for example:
*
* while (mask) {
* int start, count, i;
*
* u_bit_scan_consecutive_range(&mask, &start, &count);
*
* for (i = 0; i < count; i++)
* ... process element (start+i)
* }
*/
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
if (*mask == 0xffffffff) {
*start = 0;
*count = 32;
*mask = 0;
return;
}
*start = ffs(*mask) - 1;
*count = ffs(~(*mask >> *start)) - 1;
*mask &= ~(((1u << *count) - 1) << *start);
}
static inline void
u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count)
{
if (*mask == ~0ull) {
*start = 0;
*count = 64;
*mask = 0;
return;
}
*start = ffsll(*mask) - 1;
*count = ffsll(~(*mask >> *start)) - 1;
*mask &= ~(((((uint64_t)1) << *count) - 1) << *start);
}
/**
* Find last bit set in a word. The least significant bit is 1.
* Return 0 if no bits are set.
* Essentially ffs() in the reverse direction.
*/
static inline unsigned
util_last_bit(unsigned u)
{
#if defined(HAVE___BUILTIN_CLZ)
return u == 0 ? 0 : 32 - __builtin_clz(u);
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
unsigned long index;
if (_BitScanReverse(&index, u))
return index + 1;
else
return 0;
#else
unsigned r = 0;
while (u) {
r++;
u >>= 1;
}
return r;
#endif
}
/**
* Find last bit set in a word. The least significant bit is 1.
* Return 0 if no bits are set.
* Essentially ffsll() in the reverse direction.
*/
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
return u == 0 ? 0 : 64 - __builtin_clzll(u);
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
unsigned long index;
if (_BitScanReverse64(&index, u))
return index + 1;
else
return 0;
#else
unsigned r = 0;
while (u) {
r++;
u >>= 1;
}
return r;
#endif
}
/**
* Find last bit in a word that does not match the sign bit. The least
* significant bit is 1.
* Return 0 if no bits are set.
*/
static inline unsigned
util_last_bit_signed(int i)
{
if (i >= 0)
return util_last_bit(i);
else
return util_last_bit(~(unsigned)i);
}
/* Returns a bitfield in which the first count bits starting at start are
* set.
*/
static inline unsigned
u_bit_consecutive(unsigned start, unsigned count)
{
assert(start + count <= 32);
if (count == 32)
return ~0;
return ((1u << count) - 1) << start;
}
static inline uint64_t
u_bit_consecutive64(unsigned start, unsigned count)
{
assert(start + count <= 64);
if (count == 64)
return ~(uint64_t)0;
return (((uint64_t)1 << count) - 1) << start;
}
/**
* Return number of bits set in n.
*/
static inline unsigned
util_bitcount(unsigned n)
{
#if defined(HAVE___BUILTIN_POPCOUNT)
return __builtin_popcount(n);
#else
/* K&R classic bitcount.
*
* For each iteration, clear the LSB from the bitfield.
* Requires only one iteration per set bit, instead of
* one iteration per bit less than highest set bit.
*/
unsigned bits;
for (bits = 0; n; bits++) {
n &= n - 1;
}
return bits;
#endif
}
/**
* Return the number of bits set in n using the native popcnt instruction.
* The caller is responsible for ensuring that popcnt is supported by the CPU.
*
* gcc won't emit it unless -mpopcnt or an -march= that includes popcnt is passed.
*
*/
static inline unsigned
util_popcnt_inline_asm(unsigned n)
{
#if defined(USE_X86_64_ASM) || defined(USE_X86_ASM)
uint32_t out;
__asm volatile("popcnt %1, %0" : "=r"(out) : "r"(n));
return out;
#else
/* We should never get here by accident, but I'm sure it'll happen. */
return util_bitcount(n);
#endif
}
static inline unsigned
util_bitcount64(uint64_t n)
{
#ifdef HAVE___BUILTIN_POPCOUNTLL
return __builtin_popcountll(n);
#else
return util_bitcount(n) + util_bitcount(n >> 32);
#endif
}
#ifdef __cplusplus
}
#endif
#endif /* BITSCAN_H */

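A short usage sketch (not part of the commit) tying the helpers above together; walk_mask() is a hypothetical function:

#include <stdio.h>
#include "bitscan.h"

static void
walk_mask(void)
{
   unsigned mask = 0x000000f5; /* bits 0, 2, 4, 5, 6, 7 */

   /* Destructive scan: pops one set bit per iteration. */
   while (mask) {
      int i = u_bit_scan(&mask);
      printf("bit %d set\n", i);
   }

   /* Consecutive-range scan: bits 4..7 come out as start=4, count=4. */
   unsigned ranges = 0x000000f0;
   while (ranges) {
      int start, count;
      u_bit_scan_consecutive_range(&ranges, &start, &count);
      printf("%d consecutive bits from %d\n", count, start);
   }

   /* Power-of-two predicates: zero is special-cased. */
   printf("%d %d\n",
          util_is_power_of_two_or_zero(0),   /* 1 */
          util_is_power_of_two_nonzero(0));  /* 0 */
}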
src/mesa/util/u_cpu_detect.c (new file)
@@ -0,0 +1,865 @@
/**************************************************************************
*
* Copyright 2008 Dennis Smit
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/**
* @file
* CPU feature detection.
*
* @author Dennis Smit
* @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
*/
#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_debug.h"
#include "u_cpu_detect.h"
#include "u_math.h"
#include "c11/threads.h"
#include <stdio.h>
#include <inttypes.h>
#if defined(PIPE_ARCH_PPC)
#if defined(PIPE_OS_APPLE)
#include <sys/sysctl.h>
#else
#include <signal.h>
#include <setjmp.h>
#endif
#endif
#if defined(PIPE_OS_BSD)
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif
#if defined(PIPE_OS_FREEBSD)
#if __has_include(<sys/auxv.h>)
#include <sys/auxv.h>
#define HAVE_ELF_AUX_INFO
#endif
#endif
#if defined(PIPE_OS_LINUX)
#include <signal.h>
#include <fcntl.h>
#include <elf.h>
#endif
#ifdef PIPE_OS_UNIX
#include <unistd.h>
#endif
#if defined(HAS_ANDROID_CPUFEATURES)
#include <cpu-features.h>
#endif
#if defined(PIPE_OS_WINDOWS)
#include <windows.h>
#if defined(PIPE_CC_MSVC)
#include <intrin.h>
#endif
#endif
#if defined(HAS_SCHED_H)
#include <sched.h>
#endif
DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
struct util_cpu_caps_t util_cpu_caps;
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
static int has_cpuid(void);
#endif
#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) && !defined(PIPE_OS_BSD) && !defined(PIPE_OS_LINUX)
static jmp_buf __lv_powerpc_jmpbuf;
static volatile sig_atomic_t __lv_powerpc_canjump = 0;
static void
sigill_handler(int sig)
{
if (!__lv_powerpc_canjump) {
signal (sig, SIG_DFL);
raise (sig);
}
__lv_powerpc_canjump = 0;
longjmp(__lv_powerpc_jmpbuf, 1);
}
#endif
#if defined(PIPE_ARCH_PPC)
static void
check_os_altivec_support(void)
{
#if defined(__ALTIVEC__)
util_cpu_caps.has_altivec = 1;
#endif
#if defined(__VSX__)
util_cpu_caps.has_vsx = 1;
#endif
#if defined(__ALTIVEC__) && defined(__VSX__)
/* Do nothing */
#elif defined(PIPE_OS_APPLE) || defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
#ifdef HW_VECTORUNIT
int sels[2] = {CTL_HW, HW_VECTORUNIT};
#else
int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC};
#endif
int has_vu = 0;
int len = sizeof (has_vu);
int err;
err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
if (err == 0) {
if (has_vu != 0) {
util_cpu_caps.has_altivec = 1;
}
}
#elif defined(PIPE_OS_FREEBSD) /* !PIPE_OS_APPLE && !PIPE_OS_NETBSD && !PIPE_OS_OPENBSD */
unsigned long hwcap = 0;
#ifdef HAVE_ELF_AUX_INFO
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
size_t len = sizeof(hwcap);
sysctlbyname("hw.cpu_features", &hwcap, &len, NULL, 0);
#endif
if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
util_cpu_caps.has_altivec = 1;
if (hwcap & PPC_FEATURE_HAS_VSX)
util_cpu_caps.has_vsx = 1;
#elif defined(PIPE_OS_LINUX) /* !PIPE_OS_FREEBSD */
#if defined(PIPE_ARCH_PPC_64)
Elf64_auxv_t aux;
#else
Elf32_auxv_t aux;
#endif
int fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
if (fd >= 0) {
while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
if (aux.a_type == AT_HWCAP) {
char *env_vsx = getenv("GALLIVM_VSX");
uint64_t hwcap = aux.a_un.a_val;
util_cpu_caps.has_altivec = (hwcap >> 28) & 1;
if (!env_vsx || env_vsx[0] != '0') {
util_cpu_caps.has_vsx = (hwcap >> 7) & 1;
}
break;
}
}
close(fd);
}
#else /* !PIPE_OS_APPLE && !PIPE_OS_BSD && !PIPE_OS_LINUX */
/* not on Apple/Darwin or Linux, do it the brute-force way */
/* this is borrowed from the libmpeg2 library */
signal(SIGILL, sigill_handler);
if (setjmp(__lv_powerpc_jmpbuf)) {
signal(SIGILL, SIG_DFL);
} else {
boolean enable_altivec = TRUE; /* Default: enable if available, and if not overridden */
boolean enable_vsx = TRUE;
#ifdef DEBUG
/* Disabling Altivec code generation is not the same as disabling VSX code generation,
* which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf.
* lp_build_create_jit_compiler_for_module().
* If you want to disable Altivec code generation, the best place to do it is here.
*/
char *env_control = getenv("GALLIVM_ALTIVEC"); /* 1=enable (default); 0=disable */
if (env_control && env_control[0] == '0') {
enable_altivec = FALSE;
}
#endif
/* VSX instructions can be explicitly enabled/disabled via GALLIVM_VSX=1 or 0 */
char *env_vsx = getenv("GALLIVM_VSX");
if (env_vsx && env_vsx[0] == '0') {
enable_vsx = FALSE;
}
if (enable_altivec) {
__lv_powerpc_canjump = 1;
__asm __volatile
("mtspr 256, %0\n\t"
"vand %%v0, %%v0, %%v0"
:
: "r" (-1));
util_cpu_caps.has_altivec = 1;
if (enable_vsx) {
__asm __volatile("xxland %vs0, %vs0, %vs0");
util_cpu_caps.has_vsx = 1;
}
signal(SIGILL, SIG_DFL);
} else {
util_cpu_caps.has_altivec = 0;
}
}
#endif /* !PIPE_OS_APPLE && !PIPE_OS_LINUX */
}
#endif /* PIPE_ARCH_PPC */
#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
static int has_cpuid(void)
{
#if defined(PIPE_ARCH_X86)
#if defined(PIPE_OS_GCC)
int a, c;
__asm __volatile
("pushf\n"
"popl %0\n"
"movl %0, %1\n"
"xorl $0x200000, %0\n"
"push %0\n"
"popf\n"
"pushf\n"
"popl %0\n"
: "=a" (a), "=c" (c)
:
: "cc");
return a != c;
#else
/* FIXME */
return 1;
#endif
#elif defined(PIPE_ARCH_X86_64)
return 1;
#else
return 0;
#endif
}
/**
* @sa cpuid.h included in gcc-4.3 onwards.
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
*/
static inline void
cpuid(uint32_t ax, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
__asm __volatile (
"xchgl %%ebx, %1\n\t"
"cpuid\n\t"
"xchgl %%ebx, %1"
: "=a" (p[0]),
"=S" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax)
);
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
__asm __volatile (
"cpuid\n\t"
: "=a" (p[0]),
"=b" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax)
);
#elif defined(PIPE_CC_MSVC)
__cpuid(p, ax);
#else
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
#endif
}
/**
* @sa cpuid.h included in gcc-4.4 onwards.
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
*/
static inline void
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
__asm __volatile (
"xchgl %%ebx, %1\n\t"
"cpuid\n\t"
"xchgl %%ebx, %1"
: "=a" (p[0]),
"=S" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax), "2" (cx)
);
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
__asm __volatile (
"cpuid\n\t"
: "=a" (p[0]),
"=b" (p[1]),
"=c" (p[2]),
"=d" (p[3])
: "0" (ax), "2" (cx)
);
#elif defined(PIPE_CC_MSVC)
__cpuidex(p, ax, cx);
#else
p[0] = 0;
p[1] = 0;
p[2] = 0;
p[3] = 0;
#endif
}
static inline uint64_t xgetbv(void)
{
#if defined(PIPE_CC_GCC)
uint32_t eax, edx;
__asm __volatile (
".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
: "=a"(eax),
"=d"(edx)
: "c"(0)
);
return ((uint64_t)edx << 32) | eax;
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
return _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#else
return 0;
#endif
}
#if defined(PIPE_ARCH_X86)
PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void)
{
struct {
uint32_t pad1[7];
uint32_t mxcsr_mask;
uint32_t pad2[128-8];
} PIPE_ALIGN_VAR(16) fxarea;
fxarea.mxcsr_mask = 0;
#if defined(PIPE_CC_GCC)
__asm __volatile ("fxsave %0" : "+m" (fxarea));
#elif defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL)
_fxsave(&fxarea);
#else
fxarea.mxcsr_mask = 0;
#endif
return !!(fxarea.mxcsr_mask & (1 << 6));
}
#endif
#endif /* X86 or X86_64 */
#if defined(PIPE_ARCH_ARM)
static void
check_os_arm_support(void)
{
/*
* On Android, the cpufeatures library is the preferred way of checking
* CPU capabilities. However, it is not available for standalone Mesa
* builds, i.e. when the Android build system (Android.mk-based) is not
* used. Because of this we cannot use PIPE_OS_ANDROID here, but rather
* have a separate macro that only gets enabled from the respective Android.mk.
*/
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
util_cpu_caps.has_neon = 1;
#elif defined(PIPE_OS_FREEBSD) && defined(HAVE_ELF_AUX_INFO)
unsigned long hwcap = 0;
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
if (hwcap & HWCAP_NEON)
util_cpu_caps.has_neon = 1;
#elif defined(HAS_ANDROID_CPUFEATURES)
AndroidCpuFamily cpu_family = android_getCpuFamily();
uint64_t cpu_features = android_getCpuFeatures();
if (cpu_family == ANDROID_CPU_FAMILY_ARM) {
if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON)
util_cpu_caps.has_neon = 1;
}
#elif defined(PIPE_OS_LINUX)
Elf32_auxv_t aux;
int fd;
fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
if (fd >= 0) {
while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) {
if (aux.a_type == AT_HWCAP) {
uint32_t hwcap = aux.a_un.a_val;
util_cpu_caps.has_neon = (hwcap >> 12) & 1;
break;
}
}
close (fd);
}
#endif /* PIPE_OS_LINUX */
}
#elif defined(PIPE_ARCH_AARCH64)
static void
check_os_arm_support(void)
{
util_cpu_caps.has_neon = true;
}
#endif /* PIPE_ARCH_ARM || PIPE_ARCH_AARCH64 */
#if defined(PIPE_ARCH_MIPS64)
static void
check_os_mips64_support(void)
{
Elf64_auxv_t aux;
int fd;
fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
if (fd >= 0) {
while (read(fd, &aux, sizeof(Elf64_auxv_t)) == sizeof(Elf64_auxv_t)) {
if (aux.a_type == AT_HWCAP) {
uint64_t hwcap = aux.a_un.a_val;
util_cpu_caps.has_msa = (hwcap >> 1) & 1;
break;
}
}
close (fd);
}
}
#endif /* PIPE_ARCH_MIPS64 */
static void
get_cpu_topology(void)
{
/* Default. This is OK if L3 is not present or there is only one. */
util_cpu_caps.num_L3_caches = 1;
memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3));
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
/* AMD Zen */
if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 &&
util_cpu_caps.family < CPU_AMD_LAST) {
uint32_t regs[4];
uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
bool saved = false;
uint32_t L3_found[UTIL_MAX_CPUS] = {0};
uint32_t num_L3_caches = 0;
util_affinity_mask *L3_affinity_masks = NULL;
/* Query APIC IDs from each CPU core.
*
* An APIC ID is a logical ID of the CPU with respect to the cache
* hierarchy, meaning that consecutive APIC IDs are neighbours in
* the hierarchy, e.g. sharing the same cache.
*
* For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1,
* which means that both CPU 0 and 12 are next to each other.
* (e.g. they are 2 threads belonging to 1 SMT2 core)
*
* We need to find out which CPUs share the same L3 cache and they can
* be all over the place.
*
* Querying the APIC ID can only be done by pinning the current thread
* to each core. The original affinity mask is saved.
*
* Loop over all possible CPUs even though some may be offline.
*/
for (int16_t i = 0; i < util_cpu_caps.max_cpus && i < UTIL_MAX_CPUS; i++) {
uint32_t cpu_bit = 1u << (i % 32);
mask[i / 32] = cpu_bit;
/* The assumption is that trying to bind the thread to a CPU that is
* offline will fail.
*/
if (util_set_current_thread_affinity(mask,
!saved ? saved_mask : NULL,
util_cpu_caps.num_cpu_mask_bits)) {
saved = true;
/* Query the APIC ID of the current core. */
cpuid(0x00000001, regs);
unsigned apic_id = regs[1] >> 24;
/* Query the total core count for the CPU */
uint32_t core_count = 1;
if (regs[3] & (1 << 28))
core_count = (regs[1] >> 16) & 0xff;
core_count = util_next_power_of_two(core_count);
/* Query the L3 cache count. */
cpuid_count(0x8000001D, 3, regs);
unsigned cache_level = (regs[0] >> 5) & 0x7;
unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
if (cache_level != 3)
continue;
unsigned local_core_id = apic_id & (core_count - 1);
unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count);
unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3);
#define L3_ID(p, i) (p << 16 | i << 1 | 1);
unsigned l3_id = L3_ID(phys_id, local_l3_cache_index);
int idx = -1;
for (unsigned c = 0; c < num_L3_caches; c++) {
if (L3_found[c] == l3_id) {
idx = c;
break;
}
}
if (idx == -1) {
idx = num_L3_caches;
L3_found[num_L3_caches++] = l3_id;
L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches);
if (!L3_affinity_masks)
return;
memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask));
}
util_cpu_caps.cpu_to_L3[i] = idx;
L3_affinity_masks[idx][i / 32] |= cpu_bit;
}
mask[i / 32] = 0;
}
util_cpu_caps.num_L3_caches = num_L3_caches;
util_cpu_caps.L3_affinity_mask = L3_affinity_masks;
if (saved) {
if (debug_get_option_dump_cpu()) {
fprintf(stderr, "CPU <-> L3 cache mapping:\n");
for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
fprintf(stderr, " - L3 %u mask = ", i);
for (int j = util_cpu_caps.max_cpus - 1; j >= 0; j -= 32)
fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]);
fprintf(stderr, "\n");
}
}
/* Restore the original affinity mask. */
util_set_current_thread_affinity(saved_mask, NULL,
util_cpu_caps.num_cpu_mask_bits);
} else {
if (debug_get_option_dump_cpu())
fprintf(stderr, "Cannot set thread affinity for any thread.\n");
}
}
#endif
}
static void
util_cpu_detect_once(void)
{
int available_cpus = 0;
int total_cpus = 0;
memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
/* Count the number of CPUs in system */
#if defined(PIPE_OS_WINDOWS)
{
SYSTEM_INFO system_info;
GetSystemInfo(&system_info);
available_cpus = MAX2(1, system_info.dwNumberOfProcessors);
}
#elif defined(PIPE_OS_UNIX)
# if defined(HAS_SCHED_GETAFFINITY)
{
/* sched_setaffinity() can be used to further restrict the number of
* CPUs on which the process can run. Use sched_getaffinity() to
* determine the true number of available CPUs.
*
* FIXME: The Linux manual page for sched_getaffinity describes how this
* simple implementation will fail with > 1024 CPUs, and we'll fall back
* to the _SC_NPROCESSORS_ONLN path. Support for > 1024 CPUs can be
* added to this path once someone has such a system for testing.
*/
cpu_set_t affin;
if (sched_getaffinity(getpid(), sizeof(affin), &affin) == 0)
available_cpus = CPU_COUNT(&affin);
}
# endif
/* Linux, FreeBSD, DragonFly, and Mac OS X should have
* _SC_NPROCESSORS_ONLN. NetBSD and OpenBSD should have HW_NCPUONLINE.
* This is what FFmpeg uses on those platforms.
*/
# if defined(PIPE_OS_BSD) && defined(HW_NCPUONLINE)
if (available_cpus == 0) {
const int mib[] = { CTL_HW, HW_NCPUONLINE };
int ncpu;
int len = sizeof(ncpu);
sysctl(mib, 2, &ncpu, &len, NULL, 0);
available_cpus = ncpu;
}
# elif defined(_SC_NPROCESSORS_ONLN)
if (available_cpus == 0) {
available_cpus = sysconf(_SC_NPROCESSORS_ONLN);
if (available_cpus == ~0)
available_cpus = 1;
}
# elif defined(PIPE_OS_BSD)
if (available_cpus == 0) {
const int mib[] = { CTL_HW, HW_NCPU };
int ncpu;
int len = sizeof(ncpu);
sysctl(mib, 2, &ncpu, &len, NULL, 0);
available_cpus = ncpu;
}
# endif /* defined(PIPE_OS_BSD) */
/* Determine the maximum number of CPUs configured in the system. This is
* used to properly set num_cpu_mask_bits below. On BSDs that don't have
* HW_NCPUONLINE, it was not clear whether HW_NCPU is the number of
* configured or the number of online CPUs. For that reason, prefer the
* _SC_NPROCESSORS_CONF path on all BSDs.
*/
# if defined(_SC_NPROCESSORS_CONF)
total_cpus = sysconf(_SC_NPROCESSORS_CONF);
if (total_cpus == ~0)
total_cpus = 1;
# elif defined(PIPE_OS_BSD)
{
const int mib[] = { CTL_HW, HW_NCPU };
int ncpu;
int len = sizeof(ncpu);
sysctl(mib, 2, &ncpu, &len, NULL, 0);
total_cpus = ncpu;
}
# endif /* defined(PIPE_OS_BSD) */
#endif /* defined(PIPE_OS_UNIX) */
util_cpu_caps.nr_cpus = MAX2(1, available_cpus);
total_cpus = MAX2(total_cpus, util_cpu_caps.nr_cpus);
util_cpu_caps.max_cpus = total_cpus;
util_cpu_caps.num_cpu_mask_bits = align(total_cpus, 32);
/* Make the fallback cacheline size nonzero so that it can be
* safely passed to align().
*/
util_cpu_caps.cacheline = sizeof(void *);
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
if (has_cpuid()) {
uint32_t regs[4];
uint32_t regs2[4];
util_cpu_caps.cacheline = 32;
/* Get max cpuid level */
cpuid(0x00000000, regs);
if (regs[0] >= 0x00000001) {
unsigned int cacheline;
cpuid (0x00000001, regs2);
util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
/* Add "extended family". */
if (util_cpu_caps.x86_cpu_type == 0xf)
util_cpu_caps.x86_cpu_type += ((regs2[0] >> 20) & 0xff);
switch (util_cpu_caps.x86_cpu_type) {
case 0x17:
util_cpu_caps.family = CPU_AMD_ZEN1_ZEN2;
break;
case 0x18:
util_cpu_caps.family = CPU_AMD_ZEN_HYGON;
break;
case 0x19:
util_cpu_caps.family = CPU_AMD_ZEN3;
break;
default:
if (util_cpu_caps.x86_cpu_type > 0x19)
util_cpu_caps.family = CPU_AMD_ZEN_NEXT;
}
/* general feature flags */
util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */
util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */
util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */
util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */
util_cpu_caps.has_sse3 = (regs2[2] >> 0) & 1; /* 0x0000001 */
util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000020 */
util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1;
util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX
((regs2[2] >> 27) & 1) && // OSXSAVE
((xgetbv() & 6) == 6); // XMM & YMM
util_cpu_caps.has_f16c = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx;
util_cpu_caps.has_fma = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx;
util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE CPUs support mmxext too */
#if defined(PIPE_ARCH_X86_64)
util_cpu_caps.has_daz = 1;
#else
util_cpu_caps.has_daz = util_cpu_caps.has_sse3 ||
(util_cpu_caps.has_sse2 && sse2_has_daz());
#endif
cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
if (cacheline > 0)
util_cpu_caps.cacheline = cacheline;
}
if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) {
uint32_t regs7[4];
cpuid_count(0x00000007, 0x00000000, regs7);
util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1;
}
// check for avx512
if (((regs2[2] >> 27) & 1) && // OSXSAVE
(xgetbv() & (0x7 << 5)) && // OPMASK: upper-256 enabled by OS
((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS
uint32_t regs3[4];
cpuid_count(0x00000007, 0x00000000, regs3);
util_cpu_caps.has_avx512f = (regs3[1] >> 16) & 1;
util_cpu_caps.has_avx512dq = (regs3[1] >> 17) & 1;
util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1;
util_cpu_caps.has_avx512pf = (regs3[1] >> 26) & 1;
util_cpu_caps.has_avx512er = (regs3[1] >> 27) & 1;
util_cpu_caps.has_avx512cd = (regs3[1] >> 28) & 1;
util_cpu_caps.has_avx512bw = (regs3[1] >> 30) & 1;
util_cpu_caps.has_avx512vl = (regs3[1] >> 31) & 1;
util_cpu_caps.has_avx512vbmi = (regs3[2] >> 1) & 1;
}
if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
/* GenuineIntel */
util_cpu_caps.has_intel = 1;
}
cpuid(0x80000000, regs);
if (regs[0] >= 0x80000001) {
cpuid(0x80000001, regs2);
util_cpu_caps.has_mmx |= (regs2[3] >> 23) & 1;
util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1;
util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1;
util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1;
util_cpu_caps.has_xop = util_cpu_caps.has_avx &&
((regs2[2] >> 11) & 1);
}
if (regs[0] >= 0x80000006) {
/* should we really do this if the clflush size above worked? */
unsigned int cacheline;
cpuid(0x80000006, regs2);
cacheline = regs2[2] & 0xFF;
if (cacheline > 0)
util_cpu_caps.cacheline = cacheline;
}
if (!util_cpu_caps.has_sse) {
util_cpu_caps.has_sse2 = 0;
util_cpu_caps.has_sse3 = 0;
util_cpu_caps.has_ssse3 = 0;
util_cpu_caps.has_sse4_1 = 0;
}
}
#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
#if defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64)
check_os_arm_support();
#endif
#if defined(PIPE_ARCH_PPC)
check_os_altivec_support();
#endif /* PIPE_ARCH_PPC */
#if defined(PIPE_ARCH_MIPS64)
check_os_mips64_support();
#endif /* PIPE_ARCH_MIPS64 */
get_cpu_topology();
if (debug_get_option_dump_cpu()) {
printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
printf("util_cpu_caps.has_vsx = %u\n", util_cpu_caps.has_vsx);
printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon);
printf("util_cpu_caps.has_msa = %u\n", util_cpu_caps.has_msa);
printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f);
printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq);
printf("util_cpu_caps.has_avx512ifma = %u\n", util_cpu_caps.has_avx512ifma);
printf("util_cpu_caps.has_avx512pf = %u\n", util_cpu_caps.has_avx512pf);
printf("util_cpu_caps.has_avx512er = %u\n", util_cpu_caps.has_avx512er);
printf("util_cpu_caps.has_avx512cd = %u\n", util_cpu_caps.has_avx512cd);
printf("util_cpu_caps.has_avx512bw = %u\n", util_cpu_caps.has_avx512bw);
printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl);
printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi);
printf("util_cpu_caps.num_L3_caches = %u\n", util_cpu_caps.num_L3_caches);
printf("util_cpu_caps.num_cpu_mask_bits = %u\n", util_cpu_caps.num_cpu_mask_bits);
}
}
static once_flag cpu_once_flag = ONCE_FLAG_INIT;
void
util_cpu_detect(void)
{
call_once(&cpu_once_flag, util_cpu_detect_once);
}

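A sketch (not part of the commit) of how a caller is expected to drive this: call util_cpu_detect() once (it is idempotent thanks to call_once), then read the caps through util_get_cpu_caps() from the u_cpu_detect.h hunk below. pick_simd_path() is a hypothetical function:

#include <stdio.h>
#include "util/u_cpu_detect.h"

static void
pick_simd_path(void)
{
   util_cpu_detect(); /* safe to call from multiple threads */

   const struct util_cpu_caps_t *caps = util_get_cpu_caps();
   if (caps->has_avx2)
      printf("using AVX2 path\n");
   else if (caps->has_sse2)
      printf("using SSE2 path\n");
   else
      printf("using scalar path\n");
}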
src/mesa/util/u_cpu_detect.h
@@ -36,17 +36,45 @@
#define _UTIL_CPU_DETECT_H
#include "pipe/p_compiler.h"
#include "pipe/p_config.h"
#include "util/u_thread.h"
#ifdef __cplusplus
extern "C" {
#endif
enum cpu_family {
CPU_UNKNOWN,
struct util_cpu_caps {
int nr_cpus;
CPU_AMD_ZEN1_ZEN2,
CPU_AMD_ZEN_HYGON,
CPU_AMD_ZEN3,
CPU_AMD_ZEN_NEXT,
CPU_AMD_LAST,
};
typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32];
struct util_cpu_caps_t {
/**
* Number of CPUs available to the process.
*
* This will be less than or equal to \c max_cpus. This is the number of
* CPUs that are online and available to the process.
*/
int16_t nr_cpus;
/**
* Maximum number of CPUs that can be online in the system.
*
* This will be greater than or equal to \c nr_cpus. This is the number of
* CPUs installed in the system. \c nr_cpus will be less if some CPUs are
* offline.
*/
int16_t max_cpus;
enum cpu_family family;
/* Feature flags */
int x86_cpu_type;
@@ -66,15 +94,48 @@ struct util_cpu_caps {
unsigned has_avx:1;
unsigned has_avx2:1;
unsigned has_f16c:1;
unsigned has_fma:1;
unsigned has_3dnow:1;
unsigned has_3dnow_ext:1;
unsigned has_xop:1;
unsigned has_altivec:1;
unsigned has_vsx:1;
unsigned has_daz:1;
unsigned has_neon:1;
unsigned has_msa:1;
unsigned has_avx512f:1;
unsigned has_avx512dq:1;
unsigned has_avx512ifma:1;
unsigned has_avx512pf:1;
unsigned has_avx512er:1;
unsigned has_avx512cd:1;
unsigned has_avx512bw:1;
unsigned has_avx512vl:1;
unsigned has_avx512vbmi:1;
unsigned num_L3_caches;
unsigned num_cpu_mask_bits;
uint16_t cpu_to_L3[UTIL_MAX_CPUS];
/* Affinity masks for each L3 cache. */
util_affinity_mask *L3_affinity_mask;
};
extern struct util_cpu_caps
util_cpu_caps;
#define U_CPU_INVALID_L3 0xffff
static inline const struct util_cpu_caps_t *
util_get_cpu_caps(void)
{
extern struct util_cpu_caps_t util_cpu_caps;
/* If you hit this assert, it means that something is using the
* cpu-caps without having first called util_cpu_detect()
*/
assert(util_cpu_caps.nr_cpus >= 1);
return &util_cpu_caps;
}
void util_cpu_detect(void);

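A sketch (not part of the commit) of the new topology fields; report_l3() is a hypothetical helper and assumes util_cpu_detect() has already run, since util_get_cpu_caps() asserts on that:

#include <stdio.h>
#include "util/u_cpu_detect.h"

/* Report which L3 cache a CPU belongs to, using the mapping filled in
 * by get_cpu_topology() in u_cpu_detect.c above. */
static void
report_l3(int16_t cpu)
{
   const struct util_cpu_caps_t *caps = util_get_cpu_caps();

   if (cpu < caps->max_cpus && caps->cpu_to_L3[cpu] != U_CPU_INVALID_L3)
      printf("CPU %d -> L3 cache %u of %u\n", (int)cpu,
             (unsigned)caps->cpu_to_L3[cpu], caps->num_L3_caches);
   else
      printf("CPU %d has no L3 mapping\n", (int)cpu);
}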
src/mesa/util/u_math.c (new file)
@@ -0,0 +1,311 @@
/**************************************************************************
*
* Copyright 2008 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
#include "pipe/p_config.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
/* This is defined in pmmintrin.h, but it can only be included when -msse3 is
* used, so just define it here to avoid further mess. */
#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif
#endif
/** log2(x), for x in [1.0, 2.0) */
float log2_table[LOG2_TABLE_SIZE];
static void
init_log2_table(void)
{
unsigned i;
for (i = 0; i < LOG2_TABLE_SIZE; i++)
log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE));
}
/**
* One time init for math utilities.
*/
void
util_init_math(void)
{
static bool initialized = false;
if (!initialized) {
init_log2_table();
initialized = true;
}
}
/**
* Fetches the contents of the fpstate (mxcsr on x86) register.
*
* On platforms without support for it, this just returns 0.
*/
unsigned
util_fpstate_get(void)
{
unsigned mxcsr = 0;
#if defined(PIPE_ARCH_SSE)
if (util_get_cpu_caps()->has_sse) {
mxcsr = _mm_getcsr();
}
#endif
return mxcsr;
}
/**
* Make sure that the FPU treats denormalized floating
* point numbers as zero.
*
* This is the behavior required by D3D10. OpenGL doesn't care.
*/
unsigned
util_fpstate_set_denorms_to_zero(unsigned current_mxcsr)
{
#if defined(PIPE_ARCH_SSE)
if (util_get_cpu_caps()->has_sse) {
/* Enable flush to zero mode */
current_mxcsr |= _MM_FLUSH_ZERO_MASK;
if (util_get_cpu_caps()->has_daz) {
/* Enable denormals are zero mode */
current_mxcsr |= _MM_DENORMALS_ZERO_MASK;
}
util_fpstate_set(current_mxcsr);
}
#endif
return current_mxcsr;
}
/**
* Set the state of the fpstate (mxcsr on x86) register.
*
* On platforms without support for it, this is a no-op.
*/
void
util_fpstate_set(unsigned mxcsr)
{
#if defined(PIPE_ARCH_SSE)
if (util_get_cpu_caps()->has_sse) {
_mm_setcsr(mxcsr);
}
#endif
}
/**
* Compute inverse of 4x4 matrix.
*
* \return false if the source matrix is singular.
*
* \author
* Code contributed by Jacques Leroy jle@star.be
*
* Calculates the inverse matrix by performing Gaussian elimination with
* partial pivoting followed by back-substitution, with the loops manually
* unrolled.
*/
bool
util_invert_mat4x4(float *out, const float *m)
{
float wtmp[4][8];
float m0, m1, m2, m3, s;
float *r0, *r1, *r2, *r3;
#define MAT(m, r, c) (m)[(c)*4 + (r)]
#define SWAP_ROWS(a, b) \
{ \
float *_tmp = a; \
(a) = (b); \
(b) = _tmp; \
}
r0 = wtmp[0], r1 = wtmp[1], r2 = wtmp[2], r3 = wtmp[3];
r0[0] = MAT(m, 0, 0), r0[1] = MAT(m, 0, 1), r0[2] = MAT(m, 0, 2), r0[3] = MAT(m, 0, 3),
r0[4] = 1.0, r0[5] = r0[6] = r0[7] = 0.0,
r1[0] = MAT(m, 1, 0), r1[1] = MAT(m, 1, 1), r1[2] = MAT(m, 1, 2), r1[3] = MAT(m, 1, 3),
r1[5] = 1.0, r1[4] = r1[6] = r1[7] = 0.0,
r2[0] = MAT(m, 2, 0), r2[1] = MAT(m, 2, 1), r2[2] = MAT(m, 2, 2), r2[3] = MAT(m, 2, 3),
r2[6] = 1.0, r2[4] = r2[5] = r2[7] = 0.0,
r3[0] = MAT(m, 3, 0), r3[1] = MAT(m, 3, 1), r3[2] = MAT(m, 3, 2), r3[3] = MAT(m, 3, 3),
r3[7] = 1.0, r3[4] = r3[5] = r3[6] = 0.0;
/* choose pivot - or die */
if (fabsf(r3[0]) > fabsf(r2[0]))
SWAP_ROWS(r3, r2);
if (fabsf(r2[0]) > fabsf(r1[0]))
SWAP_ROWS(r2, r1);
if (fabsf(r1[0]) > fabsf(r0[0]))
SWAP_ROWS(r1, r0);
if (0.0F == r0[0])
return false;
/* eliminate first variable */
m1 = r1[0] / r0[0];
m2 = r2[0] / r0[0];
m3 = r3[0] / r0[0];
s = r0[1];
r1[1] -= m1 * s;
r2[1] -= m2 * s;
r3[1] -= m3 * s;
s = r0[2];
r1[2] -= m1 * s;
r2[2] -= m2 * s;
r3[2] -= m3 * s;
s = r0[3];
r1[3] -= m1 * s;
r2[3] -= m2 * s;
r3[3] -= m3 * s;
s = r0[4];
if (s != 0.0F) {
r1[4] -= m1 * s;
r2[4] -= m2 * s;
r3[4] -= m3 * s;
}
s = r0[5];
if (s != 0.0F) {
r1[5] -= m1 * s;
r2[5] -= m2 * s;
r3[5] -= m3 * s;
}
s = r0[6];
if (s != 0.0F) {
r1[6] -= m1 * s;
r2[6] -= m2 * s;
r3[6] -= m3 * s;
}
s = r0[7];
if (s != 0.0F) {
r1[7] -= m1 * s;
r2[7] -= m2 * s;
r3[7] -= m3 * s;
}
/* choose pivot - or die */
if (fabsf(r3[1]) > fabsf(r2[1]))
SWAP_ROWS(r3, r2);
if (fabsf(r2[1]) > fabsf(r1[1]))
SWAP_ROWS(r2, r1);
if (0.0F == r1[1])
return false;
/* eliminate second variable */
m2 = r2[1] / r1[1];
m3 = r3[1] / r1[1];
r2[2] -= m2 * r1[2];
r3[2] -= m3 * r1[2];
r2[3] -= m2 * r1[3];
r3[3] -= m3 * r1[3];
s = r1[4];
if (0.0F != s) {
r2[4] -= m2 * s;
r3[4] -= m3 * s;
}
s = r1[5];
if (0.0F != s) {
r2[5] -= m2 * s;
r3[5] -= m3 * s;
}
s = r1[6];
if (0.0F != s) {
r2[6] -= m2 * s;
r3[6] -= m3 * s;
}
s = r1[7];
if (0.0F != s) {
r2[7] -= m2 * s;
r3[7] -= m3 * s;
}
/* choose pivot - or die */
if (fabsf(r3[2]) > fabsf(r2[2]))
SWAP_ROWS(r3, r2);
if (0.0F == r2[2])
return false;
/* eliminate third variable */
m3 = r3[2] / r2[2];
r3[3] -= m3 * r2[3], r3[4] -= m3 * r2[4], r3[5] -= m3 * r2[5], r3[6] -= m3 * r2[6],
r3[7] -= m3 * r2[7];
/* last check */
if (0.0F == r3[3])
return false;
s = 1.0F / r3[3]; /* now back substitute row 3 */
r3[4] *= s;
r3[5] *= s;
r3[6] *= s;
r3[7] *= s;
m2 = r2[3]; /* now back substitute row 2 */
s = 1.0F / r2[2];
r2[4] = s * (r2[4] - r3[4] * m2), r2[5] = s * (r2[5] - r3[5] * m2),
r2[6] = s * (r2[6] - r3[6] * m2), r2[7] = s * (r2[7] - r3[7] * m2);
m1 = r1[3];
r1[4] -= r3[4] * m1, r1[5] -= r3[5] * m1, r1[6] -= r3[6] * m1, r1[7] -= r3[7] * m1;
m0 = r0[3];
r0[4] -= r3[4] * m0, r0[5] -= r3[5] * m0, r0[6] -= r3[6] * m0, r0[7] -= r3[7] * m0;
m1 = r1[2]; /* now back substitute row 1 */
s = 1.0F / r1[1];
r1[4] = s * (r1[4] - r2[4] * m1), r1[5] = s * (r1[5] - r2[5] * m1),
r1[6] = s * (r1[6] - r2[6] * m1), r1[7] = s * (r1[7] - r2[7] * m1);
m0 = r0[2];
r0[4] -= r2[4] * m0, r0[5] -= r2[5] * m0, r0[6] -= r2[6] * m0, r0[7] -= r2[7] * m0;
m0 = r0[1]; /* now back substitute row 0 */
s = 1.0F / r0[0];
r0[4] = s * (r0[4] - r1[4] * m0), r0[5] = s * (r0[5] - r1[5] * m0),
r0[6] = s * (r0[6] - r1[6] * m0), r0[7] = s * (r0[7] - r1[7] * m0);
MAT(out, 0, 0) = r0[4];
MAT(out, 0, 1) = r0[5], MAT(out, 0, 2) = r0[6];
MAT(out, 0, 3) = r0[7], MAT(out, 1, 0) = r1[4];
MAT(out, 1, 1) = r1[5], MAT(out, 1, 2) = r1[6];
MAT(out, 1, 3) = r1[7], MAT(out, 2, 0) = r2[4];
MAT(out, 2, 1) = r2[5], MAT(out, 2, 2) = r2[6];
MAT(out, 2, 3) = r2[7], MAT(out, 3, 0) = r3[4];
MAT(out, 3, 1) = r3[5], MAT(out, 3, 2) = r3[6];
MAT(out, 3, 3) = r3[7];
#undef MAT
#undef SWAP_ROWS
return true;
}
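/* Illustrative check (the function name is hypothetical): inverting a pure
 * scale matrix must give the reciprocal scales, and a singular input makes
 * util_invert_mat4x4() return false.  A diagonal matrix reads the same in
 * row- and column-major order, so the MAT() layout does not matter here.
 */
static void
example_invert_mat4x4(void)
{
   static const float scale[16] = {
      2.0f, 0.0f, 0.0f, 0.0f,
      0.0f, 4.0f, 0.0f, 0.0f,
      0.0f, 0.0f, 8.0f, 0.0f,
      0.0f, 0.0f, 0.0f, 1.0f,
   };
   float inv[16];
   if (util_invert_mat4x4(inv, scale)) {
      /* inv[0] == 0.5f, inv[5] == 0.25f, inv[10] == 0.125f */
   }
}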

@ -39,177 +39,24 @@
#define U_MATH_H
#include "pipe/p_compiler.h"
#include "c99_math.h"
#include <assert.h>
#include <float.h>
#include <stdarg.h>
#include "bitscan.h"
#include "u_endian.h" /* for UTIL_ARCH_BIG_ENDIAN */
#ifdef __cplusplus
extern "C" {
#endif
#include <math.h>
#include <float.h>
#include <stdarg.h>
#ifdef PIPE_OS_UNIX
#include <strings.h> /* for ffs */
#endif
#ifndef M_SQRT2
#define M_SQRT2 1.41421356237309504880
#endif
#if defined(_MSC_VER)
#if _MSC_VER < 1400 && !defined(__cplusplus)
static inline float cosf( float f )
{
return (float) cos( (double) f );
}
static inline float sinf( float f )
{
return (float) sin( (double) f );
}
static inline float ceilf( float f )
{
return (float) ceil( (double) f );
}
static inline float floorf( float f )
{
return (float) floor( (double) f );
}
static inline float powf( float f, float g )
{
return (float) pow( (double) f, (double) g );
}
static inline float sqrtf( float f )
{
return (float) sqrt( (double) f );
}
static inline float fabsf( float f )
{
return (float) fabs( (double) f );
}
static inline float logf( float f )
{
return (float) log( (double) f );
}
#else
/* Work-around an extra semi-colon in VS 2005 logf definition */
#ifdef logf
#undef logf
#define logf(x) ((float)log((double)(x)))
#endif /* logf */
#if _MSC_VER < 1800
#define isfinite(x) _finite((double)(x))
#define isnan(x) _isnan((double)(x))
#endif /* _MSC_VER < 1800 */
#endif /* _MSC_VER < 1400 && !defined(__cplusplus) */
#if _MSC_VER < 1800
static inline double log2( double x )
{
const double invln2 = 1.442695041;
return log( x ) * invln2;
}
static inline double
round(double x)
{
return x >= 0.0 ? floor(x + 0.5) : ceil(x - 0.5);
}
static inline float
roundf(float x)
{
return x >= 0.0f ? floorf(x + 0.5f) : ceilf(x - 0.5f);
}
#endif
#ifndef INFINITY
#define INFINITY (DBL_MAX + DBL_MAX)
#endif
#ifndef NAN
#define NAN (INFINITY - INFINITY)
#endif
#endif /* _MSC_VER */
#if __STDC_VERSION__ < 199901L && (!defined(__cplusplus) || defined(_MSC_VER))
static inline long int
lrint(double d)
{
long int rounded = (long int)(d + 0.5);
if (d - floor(d) == 0.5) {
if (rounded % 2 != 0)
rounded += (d > 0) ? -1 : 1;
}
return rounded;
}
static inline long int
lrintf(float f)
{
long int rounded = (long int)(f + 0.5f);
if (f - floorf(f) == 0.5f) {
if (rounded % 2 != 0)
rounded += (f > 0) ? -1 : 1;
}
return rounded;
}
static inline long long int
llrint(double d)
{
long long int rounded = (long long int)(d + 0.5);
if (d - floor(d) == 0.5) {
if (rounded % 2 != 0)
rounded += (d > 0) ? -1 : 1;
}
return rounded;
}
static inline long long int
llrintf(float f)
{
long long int rounded = (long long int)(f + 0.5f);
if (f - floorf(f) == 0.5f) {
if (rounded % 2 != 0)
rounded += (f > 0) ? -1 : 1;
}
return rounded;
}
#endif /* C99 */
#define POW2_TABLE_SIZE_LOG2 9
#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2)
#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2)
#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2))
extern float pow2_table[POW2_TABLE_SIZE];
/**
* Initialize math module. This should be called before using any
* other functions in this module.
@ -236,7 +83,8 @@ union di {
* Extract the IEEE float32 exponent.
*/
static inline signed
util_get_float32_exponent(float x) {
util_get_float32_exponent(float x)
{
union fi f;
f.f = x;
@ -245,57 +93,7 @@ util_get_float32_exponent(float x) {
}
/**
* Fast version of 2^x
* Identity: exp2(a + b) = exp2(a) * exp2(b)
* Let ipart = int(x)
* Let fpart = x - ipart;
* So, exp2(x) = exp2(ipart) * exp2(fpart)
* Compute exp2(ipart) with i << ipart
* Compute exp2(fpart) with lookup table.
*/
static inline float
util_fast_exp2(float x)
{
int32_t ipart;
float fpart, mpart;
union fi epart;
if(x > 129.00000f)
return 3.402823466e+38f;
if (x < -126.99999f)
return 0.0f;
ipart = (int32_t) x;
fpart = x - (float) ipart;
/* same as
* epart.f = (float) (1 << ipart)
* but faster and without integer overflow for ipart > 31
*/
epart.i = (ipart + 127 ) << 23;
mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)];
return epart.f * mpart;
}
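/* Worked example (illustrative): for x = 2.5f, ipart = 2 and fpart = 0.5f.
 * epart.i = (2 + 127) << 23 is exactly the IEEE-754 bit pattern of 4.0f,
 * and the table entry approximates exp2(0.5) ~= 1.41421, so the result is
 * ~5.657, matching exp2(2.5).
 */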
/**
* Fast approximation to exp(x).
*/
static inline float
util_fast_exp(float x)
{
const float k = 1.44269f; /* = log2(e) */
return util_fast_exp2(k * x);
}
#if 0
#define LOG2_TABLE_SIZE_LOG2 16
#define LOG2_TABLE_SIZE_LOG2 8
#define LOG2_TABLE_SCALE (1 << LOG2_TABLE_SIZE_LOG2)
#define LOG2_TABLE_SIZE (LOG2_TABLE_SCALE + 1)
extern float log2_table[LOG2_TABLE_SIZE];
@ -317,30 +115,29 @@ util_fast_log2(float x)
}
/**
* Fast approximation to x^y.
*/
static inline float
util_fast_pow(float x, float y)
{
return util_fast_exp2(util_fast_log2(x) * y);
}
#endif
/* Note that this counts zero as a power of two.
*/
static inline boolean
util_is_power_of_two( unsigned v )
{
return (v & (v-1)) == 0;
}
/**
* Floor(x), returned as int.
*/
static inline int
util_ifloor(float f)
{
#if defined(USE_X86_ASM) && defined(__GNUC__) && defined(__i386__)
/*
* IEEE floor for computers that round to nearest or even.
* 'f' must be between -4194304 and 4194303.
* This floor operation is done by "(iround(f + .5) + iround(f - .5)) >> 1",
* but uses some IEEE specific tricks for better speed.
* Contributed by Josh Vanderhoof
*/
int ai, bi;
double af, bf;
af = (3 << 22) + 0.5 + (double)f;
bf = (3 << 22) + 0.5 - (double)f;
/* GCC generates an extra fstp/fld without this. */
__asm__ ("fstps %0" : "=m" (ai) : "t" (af) : "st");
__asm__ ("fstps %0" : "=m" (bi) : "t" (bf) : "st");
return (ai - bi) >> 1;
#else
int ai, bi;
double af, bf;
union fi u;
@ -349,6 +146,7 @@ util_ifloor(float f)
u.f = (float) af; ai = u.i;
u.f = (float) bf; bi = u.i;
return (ai - bi) >> 1;
#endif
}
@ -381,10 +179,10 @@ util_iround(float f)
/**
* Approximate floating point comparison
*/
static inline boolean
static inline bool
util_is_approx(float a, float b, float tol)
{
return fabs(b - a) <= tol;
return fabsf(b - a) <= tol;
}
@ -400,7 +198,7 @@ util_is_approx(float a, float b, float tol)
/**
* Single-float
*/
static inline boolean
static inline bool
util_is_inf_or_nan(float x)
{
union fi tmp;
@ -409,7 +207,7 @@ util_is_inf_or_nan(float x)
}
static inline boolean
static inline bool
util_is_nan(float x)
{
union fi tmp;
@ -434,7 +232,7 @@ util_inf_sign(float x)
/**
* Double-float
*/
static inline boolean
static inline bool
util_is_double_inf_or_nan(double x)
{
union di tmp;
@ -443,7 +241,7 @@ util_is_double_inf_or_nan(double x)
}
static inline boolean
static inline bool
util_is_double_nan(double x)
{
union di tmp;
@ -468,14 +266,14 @@ util_double_inf_sign(double x)
/**
* Half-float
*/
static inline boolean
static inline bool
util_is_half_inf_or_nan(int16_t x)
{
return (x & 0x7c00) == 0x7c00;
}
static inline boolean
static inline bool
util_is_half_nan(int16_t x)
{
return (x & 0x7fff) > 0x7c00;
@ -494,163 +292,84 @@ util_half_inf_sign(int16_t x)
/**
* Find first bit set in word. Least significant bit is 1.
* Return 0 if no bits set.
* Return float bits.
*/
#ifndef FFS_DEFINED
#define FFS_DEFINED 1
#if defined(_MSC_VER) && _MSC_VER >= 1300 && (_M_IX86 || _M_AMD64 || _M_IA64)
unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask);
#pragma intrinsic(_BitScanForward)
static inline
unsigned long ffs( unsigned long u )
{
unsigned long i;
if (_BitScanForward(&i, u))
return i + 1;
else
return 0;
}
#elif defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
static inline
unsigned ffs( unsigned u )
static inline unsigned
fui( float f )
{
unsigned i;
if (u == 0) {
return 0;
}
__asm bsf eax, [u]
__asm inc eax
__asm mov [i], eax
return i;
union fi fi;
fi.f = f;
return fi.ui;
}
#elif defined(__MINGW32__) || defined(PIPE_OS_ANDROID)
#define ffs __builtin_ffs
#endif
#endif /* FFS_DEFINED */
/**
* Find last bit set in a word. The least significant bit is 1.
* Return 0 if no bits are set.
*/
static inline unsigned util_last_bit(unsigned u)
static inline float
uif(uint32_t ui)
{
#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304)
return u == 0 ? 0 : 32 - __builtin_clz(u);
#else
unsigned r = 0;
while (u) {
r++;
u >>= 1;
}
return r;
#endif
union fi fi;
fi.ui = ui;
return fi.f;
}
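/* Illustrative round trip (the function name is hypothetical): fui() and
 * uif() are plain bit casts through the fi union, so they preserve every
 * bit pattern, including NaNs and signed zeros.
 */
static inline void
example_fui_uif(void)
{
   uint32_t bits = fui(1.0f); /* 0x3f800000 on IEEE-754 targets */
   float f = uif(bits);       /* bit-exact 1.0f again */
   (void)f;
}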
/**
* Find last bit in a word that does not match the sign bit. The least
* significant bit is 1.
* Return 0 if no bits are set.
*/
static inline unsigned util_last_bit_signed(int i)
{
#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 407)
return 31 - __builtin_clrsb(i);
#else
if (i >= 0)
return util_last_bit(i);
else
return util_last_bit(~(unsigned)i);
#endif
}
/* Destructively loop over all of the bits in a mask as in:
*
* while (mymask) {
* int i = u_bit_scan(&mymask);
* ... process element i
* }
*
/**
* Convert uint8_t to float in [0, 1].
*/
static inline int u_bit_scan(unsigned *mask)
static inline float
ubyte_to_float(uint8_t ub)
{
int i = ffs(*mask) - 1;
*mask &= ~(1 << i);
return i;
return (float) ub * (1.0f / 255.0f);
}
/* For looping over a bitmask when you want to loop over consecutive bits
* manually, for example:
*
* while (mask) {
* int start, count, i;
*
* u_bit_scan_consecutive_range(&mask, &start, &count);
*
* for (i = 0; i < count; i++)
* ... process element (start+i)
* }
*/
static inline void
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
{
if (*mask == 0xffffffff) {
*start = 0;
*count = 32;
*mask = 0;
return;
}
*start = ffs(*mask) - 1;
*count = ffs(~(*mask >> *start)) - 1;
*mask &= ~(((1u << *count) - 1) << *start);
}
/**
* Return float bits.
* Convert float in [0,1] to uint8_t in [0,255] with clamping.
*/
static inline unsigned
fui( float f )
static inline uint8_t
float_to_ubyte(float f)
{
union fi fi;
fi.f = f;
return fi.ui;
/* return 0 for NaN too */
if (!(f > 0.0f)) {
return (uint8_t) 0;
}
else if (f >= 1.0f) {
return (uint8_t) 255;
}
else {
union fi tmp;
tmp.f = f;
tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f;
return (uint8_t) tmp.i;
}
}
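/* Why the bias works (illustrative): adding 32768.0f (2^15) pushes a value
 * in [0, 255/256) into a binade whose ulp is exactly 1/256, so the scaled
 * fraction lands in the low 8 bits of the mantissa, and the (uint8_t) cast
 * of tmp.i extracts round-to-nearest(f * 255) for free.
 */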
/**
* Convert ubyte to float in [0, 1].
* XXX a 256-entry lookup table would be slightly faster.
* Convert uint16_t to float in [0, 1].
*/
static inline float
ubyte_to_float(ubyte ub)
ushort_to_float(uint16_t us)
{
return (float) ub * (1.0f / 255.0f);
return (float) us * (1.0f / 65535.0f);
}
/**
* Convert float in [0,1] to ubyte in [0,255] with clamping.
* Convert float in [0,1] to uint16_t in [0,65535] with clamping.
*/
static inline ubyte
float_to_ubyte(float f)
static inline uint16_t
float_to_ushort(float f)
{
union fi tmp;
tmp.f = f;
if (tmp.i < 0) {
return (ubyte) 0;
/* return 0 for NaN too */
if (!(f > 0.0f)) {
return (uint16_t) 0;
}
else if (tmp.i >= 0x3f800000 /* 1.0f */) {
return (ubyte) 255;
else if (f >= 1.0f) {
return (uint16_t) 65535;
}
else {
tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f;
return (ubyte) tmp.i;
union fi tmp;
tmp.f = f;
tmp.f = tmp.f * (65535.0f/65536.0f) + 128.0f;
return (uint16_t) tmp.i;
}
}
@ -672,7 +391,7 @@ float_to_byte_tex(float f)
static inline unsigned
util_logbase2(unsigned n)
{
#if defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 304)
#if defined(HAVE___BUILTIN_CLZ)
return ((sizeof(unsigned) * 8 - 1) - __builtin_clz(n | 1));
#else
unsigned pos = 0;
@ -685,6 +404,44 @@ util_logbase2(unsigned n)
#endif
}
static inline uint64_t
util_logbase2_64(uint64_t n)
{
#if defined(HAVE___BUILTIN_CLZLL)
return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1));
#else
uint64_t pos = 0ull;
if (n >= 1ull<<32) { n >>= 32; pos += 32; }
if (n >= 1ull<<16) { n >>= 16; pos += 16; }
if (n >= 1ull<< 8) { n >>= 8; pos += 8; }
if (n >= 1ull<< 4) { n >>= 4; pos += 4; }
if (n >= 1ull<< 2) { n >>= 2; pos += 2; }
if (n >= 1ull<< 1) { pos += 1; }
return pos;
#endif
}
/**
* Returns the ceiling of log n base 2, and 0 when n == 0. Equivalently,
* returns the smallest x such that n <= 2**x.
*/
static inline unsigned
util_logbase2_ceil(unsigned n)
{
if (n <= 1)
return 0;
return 1 + util_logbase2(n - 1);
}
static inline uint64_t
util_logbase2_ceil64(uint64_t n)
{
if (n <= 1)
return 0;
return 1ull + util_logbase2_64(n - 1);
}
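/* Illustrative values: util_logbase2(9) == 3 (floor semantics), while
 * util_logbase2_ceil(9) == 4, the smallest x with 9 <= 2^x; both map 0
 * to 0, and the 64-bit variants behave the same on uint64_t.
 */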
/**
* Returns the smallest power of two >= x
@ -692,7 +449,7 @@ util_logbase2(unsigned n)
static inline unsigned
util_next_power_of_two(unsigned x)
{
#if defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 304)
#if defined(HAVE___BUILTIN_CLZ)
if (x <= 1)
return 1;
@ -703,7 +460,7 @@ util_next_power_of_two(unsigned x)
if (x <= 1)
return 1;
if (util_is_power_of_two(x))
if (util_is_power_of_two_or_zero(x))
return x;
val--;
@ -717,27 +474,32 @@ util_next_power_of_two(unsigned x)
#endif
}
/**
* Return number of bits set in n.
*/
static inline unsigned
util_bitcount(unsigned n)
static inline uint64_t
util_next_power_of_two64(uint64_t x)
{
#if defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 304)
return __builtin_popcount(n);
#if defined(HAVE___BUILTIN_CLZLL)
if (x <= 1)
return 1;
return (1ull << ((sizeof(uint64_t) * 8) - __builtin_clzll(x - 1)));
#else
/* K&R classic bitcount.
*
* For each iteration, clear the LSB from the bitfield.
* Requires only one iteration per set bit, instead of
* one iteration per bit less than highest set bit.
*/
unsigned bits = 0;
for (bits; n; bits++) {
n &= n - 1;
}
return bits;
uint64_t val = x;
if (x <= 1)
return 1;
if (util_is_power_of_two_or_zero64(x))
return x;
val--;
val = (val >> 1) | val;
val = (val >> 2) | val;
val = (val >> 4) | val;
val = (val >> 8) | val;
val = (val >> 16) | val;
val = (val >> 32) | val;
val++;
return val;
#endif
}
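/* How the fallback smears bits (illustrative), e.g. for x = 17: after
 * val-- the value is 16 = 10000b, the shift-and-OR steps produce 11000b,
 * 11110b and finally 11111b, and val++ then yields 32, the next power
 * of two.
 */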
@ -781,8 +543,7 @@ util_bitreverse(unsigned n)
static inline uint32_t
util_bswap32(uint32_t n)
{
/* We need the gcc version checks for non-autoconf build system */
#if defined(HAVE___BUILTIN_BSWAP32) || (defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 403))
#if defined(HAVE___BUILTIN_BSWAP32)
return __builtin_bswap32(n);
#else
return (n >> 24) |
@ -801,7 +562,7 @@ util_bswap64(uint64_t n)
#if defined(HAVE___BUILTIN_BSWAP64)
return __builtin_bswap64(n);
#else
return ((uint64_t)util_bswap32(n) << 32) |
return ((uint64_t)util_bswap32((uint32_t)n) << 32) |
util_bswap32((n >> 32));
#endif
}
@ -817,6 +578,37 @@ util_bswap16(uint16_t n)
(n << 8);
}
/**
* Sign-extend a width-bit value to a 64-bit signed integer.
*/
static inline int64_t
util_sign_extend(uint64_t val, unsigned width)
{
assert(width > 0);
if (val & (UINT64_C(1) << (width - 1))) {
return -(int64_t)((UINT64_C(1) << width) - val);
} else {
return val;
}
}
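/* Illustrative values: util_sign_extend(0xff, 8) == -1 and
 * util_sign_extend(0x7f, 8) == 127; bit (width - 1) acts as the sign bit
 * of a width-bit two's-complement value.
 */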
static inline void*
util_memcpy_cpu_to_le32(void * restrict dest, const void * restrict src, size_t n)
{
#if UTIL_ARCH_BIG_ENDIAN
size_t i, e;
assert(n % 4 == 0);
for (i = 0, e = n / 4; i < e; i++) {
uint32_t * restrict d = (uint32_t* restrict)dest;
const uint32_t * restrict s = (const uint32_t* restrict)src;
d[i] = util_bswap32(s[i]);
}
return dest;
#else
return memcpy(dest, src, n);
#endif
}
/**
* Clamp X to [MIN, MAX].
@ -825,6 +617,9 @@ util_bswap16(uint16_t n)
*/
#define CLAMP( X, MIN, MAX ) ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
/* Syntax sugar occurring frequently in graphics code */
#define SATURATE( X ) CLAMP(X, 0.0f, 1.0f)
#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) )
#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) )
@ -835,6 +630,56 @@ util_bswap16(uint16_t n)
#define MAX4( A, B, C, D ) ((A) > (B) ? MAX3(A, C, D) : MAX3(B, C, D))
/**
* Align a value up to an alignment value
*
* If \c value is not already aligned to the requested alignment value, it
* will be rounded up.
*
* \param value Value to be rounded
* \param alignment Alignment value to be used. This must be a power of two.
*
* \sa ROUND_DOWN_TO()
*/
#if defined(ALIGN)
#undef ALIGN
#endif
static inline uintptr_t
ALIGN(uintptr_t value, int32_t alignment)
{
assert(util_is_power_of_two_nonzero(alignment));
return (((value) + (alignment) - 1) & ~((alignment) - 1));
}
/**
* Like ALIGN(), but works with a non-power-of-two alignment.
*/
static inline uintptr_t
ALIGN_NPOT(uintptr_t value, int32_t alignment)
{
assert(alignment > 0);
return (value + alignment - 1) / alignment * alignment;
}
/**
* Align a value down to an alignment value
*
* If \c value is not already aligned to the requested alignment value, it
* will be rounded down.
*
* \param value Value to be rounded
* \param alignment Alignment value to be used. This must be a power of two.
*
* \sa ALIGN()
*/
static inline uint64_t
ROUND_DOWN_TO(uint64_t value, int32_t alignment)
{
assert(util_is_power_of_two_nonzero(alignment));
return ((value) & ~(alignment - 1));
}
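/* Illustrative values: ALIGN(13, 8) == 16 and ROUND_DOWN_TO(13, 8) == 8
 * via the power-of-two mask, while ALIGN_NPOT(13, 12) == 24 pays for a
 * divide to handle the non-power-of-two case.
 */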
/**
* Align a value; only works with power-of-two (pot) alignments.
*/
@ -844,6 +689,12 @@ align(int value, int alignment)
return (value + alignment - 1) & ~(alignment - 1);
}
static inline uint64_t
align64(uint64_t value, unsigned alignment)
{
return (value + alignment - 1) & ~((uint64_t)alignment - 1);
}
/**
* Works like align but on npot alignments.
*/
@ -888,12 +739,14 @@ do { \
#endif
static inline uint32_t util_unsigned_fixed(float value, unsigned frac_bits)
static inline uint32_t
util_unsigned_fixed(float value, unsigned frac_bits)
{
return value < 0 ? 0 : (uint32_t)(value * (1<<frac_bits));
}
static inline int32_t util_signed_fixed(float value, unsigned frac_bits)
static inline int32_t
util_signed_fixed(float value, unsigned frac_bits)
{
return (int32_t)(value * (1<<frac_bits));
}
@ -905,7 +758,41 @@ util_fpstate_set_denorms_to_zero(unsigned current_fpstate);
void
util_fpstate_set(unsigned fpstate);
/**
* For indexed draw calls, return true if the vertex count to be drawn is
* much lower than the vertex count that has to be uploaded, meaning
* that the driver should flatten indices instead of trying to upload
* too large a range.
*
* This is used by vertex upload code in u_vbuf and glthread.
*/
static inline bool
util_is_vbo_upload_ratio_too_large(unsigned draw_vertex_count,
unsigned upload_vertex_count)
{
if (draw_vertex_count > 1024)
return upload_vertex_count > draw_vertex_count * 4;
else if (draw_vertex_count > 32)
return upload_vertex_count > draw_vertex_count * 8;
else
return upload_vertex_count > draw_vertex_count * 16;
}
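/* Illustrative values: drawing 100 indexed vertices whose index range
 * spans 2000 uploaded vertices exceeds the 8x threshold of that size
 * class, so this returns true and the caller should flatten indices.
 */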
bool util_invert_mat4x4(float *out, const float *m);
/* Quantize the lod bias value to reduce the number of sampler state
 * variants in gallium; apps use it for smooth mipmap transitions, which
 * would otherwise thrash cso_cache and degrade performance.
*
* This quantization matches the AMD hw specification, so having more
* precision would have no effect anyway.
*/
static inline float
util_quantize_lod_bias(float lod)
{
lod = CLAMP(lod, -16, 16);
return roundf(lod * 256) / 256;
}
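/* Illustrative value: a bias of 0.1234f becomes roundf(31.5904f) / 256 =
 * 0.125f, i.e. steps of 1/256, so slightly different per-frame biases
 * collapse onto the same quantized sampler state.
 */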
#ifdef __cplusplus
}

@ -258,7 +258,7 @@ vkr_region_size(const struct vkr_region *region)
static inline bool
vkr_region_is_aligned(const struct vkr_region *region, size_t align)
{
assert(align && util_is_power_of_two(align));
assert(util_is_power_of_two_nonzero(align));
return !((region->begin | region->end) & (align - 1));
}
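/* Why a single mask test suffices (illustrative): align is a power of two,
 * so (align - 1) masks the low bits, and OR-ing begin and end checks both
 * offsets for alignment with one AND.
 */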

@ -66,7 +66,7 @@ vkr_ring_init_buffer(struct vkr_ring *ring, const struct vkr_ring_layout *layout
&buf->base_iov_offset);
buf->size = vkr_region_size(&layout->buffer);
assert(buf->size && util_is_power_of_two(buf->size));
assert(util_is_power_of_two_nonzero(buf->size));
buf->mask = buf->size - 1;
buf->cur = 0;

@ -217,8 +217,7 @@ vkr_ring_layout_init(struct vkr_ring_layout *layout,
}
const size_t buf_size = vkr_region_size(&layout->buffer);
if (!buf_size || buf_size > VKR_RING_BUFFER_MAX_SIZE ||
!util_is_power_of_two(buf_size)) {
if (buf_size > VKR_RING_BUFFER_MAX_SIZE || !util_is_power_of_two_nonzero(buf_size)) {
vkr_log("ring buffer size (%lu) must be a power of two and not exceed %lu",
buf_size, VKR_RING_BUFFER_MAX_SIZE);
return false;

@ -233,13 +233,6 @@ static int vrend_decode_clear_texture(struct vrend_context *ctx, const uint32_t
return 0;
}
static float uif(unsigned int ui)
{
union { float f; unsigned int ui; } myuif;
myuif.ui = ui;
return myuif.f;
}
static int vrend_decode_set_viewport_state(struct vrend_context *ctx, const uint32_t *buf, uint32_t length)
{
struct pipe_viewport_state vps[PIPE_MAX_VIEWPORTS];
