From 10b89464a31357f8e1362e13de1c50ca0cc6c3e5 Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Fri, 24 Sep 2021 13:36:06 -0700 Subject: [PATCH] mesa: update to the latest u_math.h uif is now defined. util_is_power_of_two is replaced by other variants. Signed-off-by: Chia-I Wu Reviewed-by: Yiwei Zhang Reviewed-by: Ryan Neph Acked-by: Gert Wollny --- config.h.meson | 8 + meson.build | 3 +- src/gallium/auxiliary/util/u_cpu_detect.c | 458 ---------- src/gallium/auxiliary/util/u_math.c | 139 --- src/gallium/meson.build | 4 - src/mesa/meson.build | 3 + src/mesa/util/bitscan.c | 80 ++ src/mesa/util/bitscan.h | 356 +++++++ src/mesa/util/u_cpu_detect.c | 865 ++++++++++++++++++ .../auxiliary => mesa}/util/u_cpu_detect.h | 71 +- src/mesa/util/u_math.c | 311 +++++++ src/{gallium/auxiliary => mesa}/util/u_math.h | 667 ++++++-------- src/venus/vkr_common.h | 2 +- src/venus/vkr_ring.c | 2 +- src/venus/vkr_transport.c | 3 +- src/vrend_decode.c | 7 - 16 files changed, 1971 insertions(+), 1008 deletions(-) delete mode 100644 src/gallium/auxiliary/util/u_cpu_detect.c delete mode 100644 src/gallium/auxiliary/util/u_math.c create mode 100644 src/mesa/util/bitscan.c create mode 100644 src/mesa/util/bitscan.h create mode 100644 src/mesa/util/u_cpu_detect.c rename src/{gallium/auxiliary => mesa}/util/u_cpu_detect.h (57%) create mode 100644 src/mesa/util/u_math.c rename src/{gallium/auxiliary => mesa}/util/u_math.h (57%) diff --git a/config.h.meson b/config.h.meson index 10e46cc..599dc8d 100644 --- a/config.h.meson +++ b/config.h.meson @@ -1,7 +1,15 @@ #mesondefine VERSION #mesondefine _GNU_SOURCE #mesondefine VIRGL_RENDERER_UNSTABLE_APIS +#mesondefine HAVE___BUILTIN_BSWAP32 +#mesondefine HAVE___BUILTIN_BSWAP64 +#mesondefine HAVE___BUILTIN_CLZ +#mesondefine HAVE___BUILTIN_CLZLL #mesondefine HAVE___BUILTIN_EXPECT +#mesondefine HAVE___BUILTIN_FFS +#mesondefine HAVE___BUILTIN_FFSLL +#mesondefine HAVE___BUILTIN_POPCOUNT +#mesondefine HAVE___BUILTIN_POPCOUNTLL #mesondefine HAVE___BUILTIN_TYPES_COMPATIBLE_P #mesondefine HAVE___BUILTIN_UNREACHABLE #mesondefine HAVE_FUNC_ATTRIBUTE_CONST diff --git a/meson.build b/meson.build index 5190cdb..082e0b5 100644 --- a/meson.build +++ b/meson.build @@ -129,7 +129,8 @@ if cc.has_header('sys/select.h') conf_data.set('HAVE_SYS_SELECT_H', 1) endif -foreach b : ['expect', 'types_compatible_p', 'unreachable'] +foreach b : ['bswap32', 'bswap64', 'clz', 'clzll', 'expect', 'ffs', 'ffsll', + 'popcount', 'popcountll', 'types_compatible_p', 'unreachable'] if cc.has_function(b) conf_data.set('HAVE___BUILTIN_@0@'.format(b.to_upper()), 1) endif diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c deleted file mode 100644 index 0b4b83a..0000000 --- a/src/gallium/auxiliary/util/u_cpu_detect.c +++ /dev/null @@ -1,458 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 Dennis Smit - * All Rights Reserved. 
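The HAVE___BUILTIN_* macros that meson writes into config.h are consumed with the usual guarded-fallback pattern; a minimal sketch (hypothetical helper name, not part of this patch):

#include "config.h"
#include <stdint.h>

/* floor(log2(x)) for x > 0: use the builtin when meson detected it,
 * otherwise fall back to a portable loop. */
static inline unsigned
example_log2(uint32_t x)
{
#ifdef HAVE___BUILTIN_CLZ
   return 31 - __builtin_clz(x);
#else
   unsigned r = 0;
   while (x >>= 1)
      r++;
   return r;
#endif
}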
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * CPU feature detection.
- *
- * @author Dennis Smit
- * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
- */
-
-#include "pipe/p_config.h"
-
-#include "u_debug.h"
-#include "u_cpu_detect.h"
-
-#if defined(PIPE_ARCH_PPC)
-#if defined(PIPE_OS_APPLE)
-#include <sys/sysctl.h>
-#else
-#include <signal.h>
-#include <setjmp.h>
-#endif
-#endif
-
-#if defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
-#include <sys/param.h>
-#include <sys/sysctl.h>
-#include <machine/cpu.h>
-#endif
-
-#if defined(PIPE_OS_FREEBSD)
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#endif
-
-#if defined(PIPE_OS_LINUX)
-#include <signal.h>
-#endif
-
-#ifdef PIPE_OS_UNIX
-#include <unistd.h>
-#endif
-
-#if defined(PIPE_OS_WINDOWS)
-#include <windows.h>
-#if defined(PIPE_CC_MSVC)
-#include <intrin.h>
-#endif
-#endif
-
-
-#ifdef DEBUG
-DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE)
-#endif
-
-
-struct util_cpu_caps util_cpu_caps;
-
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-static int has_cpuid(void);
-#endif
-
-
-#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE)
-static jmp_buf  __lv_powerpc_jmpbuf;
-static volatile sig_atomic_t __lv_powerpc_canjump = 0;
-
-static void
-sigill_handler(int sig)
-{
-   if (!__lv_powerpc_canjump) {
-      signal (sig, SIG_DFL);
-      raise (sig);
-   }
-
-   __lv_powerpc_canjump = 0;
-   longjmp(__lv_powerpc_jmpbuf, 1);
-}
-#endif
-
-#if defined(PIPE_ARCH_PPC)
-static void
-check_os_altivec_support(void)
-{
-#if defined(PIPE_OS_APPLE)
-   int sels[2] = {CTL_HW, HW_VECTORUNIT};
-   int has_vu = 0;
-   int len = sizeof (has_vu);
-   int err;
-
-   err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
-
-   if (err == 0) {
-      if (has_vu != 0) {
-         util_cpu_caps.has_altivec = 1;
-      }
-   }
-#else /* !PIPE_OS_APPLE */
-   /* not on Apple/Darwin, do it the brute-force way */
-   /* this is borrowed from the libmpeg2 library */
-   signal(SIGILL, sigill_handler);
-   if (setjmp(__lv_powerpc_jmpbuf)) {
-      signal(SIGILL, SIG_DFL);
-   } else {
-      __lv_powerpc_canjump = 1;
-
-      __asm __volatile
-         ("mtspr 256, %0\n\t"
-          "vand %%v0, %%v0, %%v0"
-          :
-          : "r" (-1));
-
-      signal(SIGILL, SIG_DFL);
-      util_cpu_caps.has_altivec = 1;
-   }
-#endif /* !PIPE_OS_APPLE */
-}
-#endif /* PIPE_ARCH_PPC */
-
-
-#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
-static int has_cpuid(void)
-{
-#if defined(PIPE_ARCH_X86)
-#if defined(PIPE_OS_GCC)
-   int a, c;
-
-   __asm __volatile
-      ("pushf\n"
-       "popl %0\n"
-       "movl %0, %1\n"
-       "xorl $0x200000, %0\n"
-       "push %0\n"
-
"popf\n" - "pushf\n" - "popl %0\n" - : "=a" (a), "=c" (c) - : - : "cc"); - - return a != c; -#else - /* FIXME */ - return 1; -#endif -#elif defined(PIPE_ARCH_X86_64) - return 1; -#else - return 0; -#endif -} - - -/** - * @sa cpuid.h included in gcc-4.3 onwards. - * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx - */ -static inline void -cpuid(uint32_t ax, uint32_t *p) -{ -#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86) - __asm __volatile ( - "xchgl %%ebx, %1\n\t" - "cpuid\n\t" - "xchgl %%ebx, %1" - : "=a" (p[0]), - "=S" (p[1]), - "=c" (p[2]), - "=d" (p[3]) - : "0" (ax) - ); -#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86_64) - __asm __volatile ( - "cpuid\n\t" - : "=a" (p[0]), - "=b" (p[1]), - "=c" (p[2]), - "=d" (p[3]) - : "0" (ax) - ); -#elif defined(PIPE_CC_MSVC) - __cpuid(p, ax); -#else - p[0] = 0; - p[1] = 0; - p[2] = 0; - p[3] = 0; -#endif -} - -/** - * @sa cpuid.h included in gcc-4.4 onwards. - * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx - */ -static inline void -cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p) -{ -#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86) - __asm __volatile ( - "xchgl %%ebx, %1\n\t" - "cpuid\n\t" - "xchgl %%ebx, %1" - : "=a" (p[0]), - "=S" (p[1]), - "=c" (p[2]), - "=d" (p[3]) - : "0" (ax), "2" (cx) - ); -#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86_64) - __asm __volatile ( - "cpuid\n\t" - : "=a" (p[0]), - "=b" (p[1]), - "=c" (p[2]), - "=d" (p[3]) - : "0" (ax), "2" (cx) - ); -#elif defined(PIPE_CC_MSVC) - __cpuidex(p, ax, cx); -#else - p[0] = 0; - p[1] = 0; - p[2] = 0; - p[3] = 0; -#endif -} - - -static inline uint64_t xgetbv(void) -{ -#if defined(PIPE_CC_GCC) - uint32_t eax, edx; - - __asm __volatile ( - ".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4 - : "=a"(eax), - "=d"(edx) - : "c"(0) - ); - - return ((uint64_t)edx << 32) | eax; -#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) - return _xgetbv(_XCR_XFEATURE_ENABLED_MASK); -#else - return 0; -#endif -} - - -#if defined(PIPE_ARCH_X86) -static inline boolean sse2_has_daz(void) -{ - struct { - uint32_t pad1[7]; - uint32_t mxcsr_mask; - uint32_t pad2[128-8]; - } PIPE_ALIGN_VAR(16) fxarea; - - fxarea.mxcsr_mask = 0; -#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) - __asm __volatile ("fxsave %0" : "+m" (fxarea)); -#elif (defined(PIPE_CC_MSVC) && _MSC_VER >= 1700) || defined(PIPE_CC_ICL) - /* 1700 = Visual Studio 2012 */ - _fxsave(&fxarea); -#else - fxarea.mxcsr_mask = 0; -#endif - return !!(fxarea.mxcsr_mask & (1 << 6)); -} -#endif - -#endif /* X86 or X86_64 */ - -void -util_cpu_detect(void) -{ - static boolean util_cpu_detect_initialized = FALSE; - - if(util_cpu_detect_initialized) - return; - - memset(&util_cpu_caps, 0, sizeof util_cpu_caps); - - /* Count the number of CPUs in system */ -#if defined(PIPE_OS_WINDOWS) - { - SYSTEM_INFO system_info; - GetSystemInfo(&system_info); - util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors; - } -#elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN) - util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - if (util_cpu_caps.nr_cpus == -1) - util_cpu_caps.nr_cpus = 1; -#elif defined(PIPE_OS_BSD) - { - int mib[2], ncpu; - int len; - - mib[0] = CTL_HW; - mib[1] = HW_NCPU; - - len = sizeof (ncpu); - sysctl(mib, 2, &ncpu, &len, NULL, 0); - util_cpu_caps.nr_cpus = ncpu; - } -#else - util_cpu_caps.nr_cpus = 1; -#endif - - /* 
Make the fallback cacheline size nonzero so that it can be - * safely passed to align(). - */ - util_cpu_caps.cacheline = sizeof(void *); - -#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) - if (has_cpuid()) { - uint32_t regs[4]; - uint32_t regs2[4]; - - util_cpu_caps.cacheline = 32; - - /* Get max cpuid level */ - cpuid(0x00000000, regs); - - if (regs[0] >= 0x00000001) { - unsigned int cacheline; - - cpuid (0x00000001, regs2); - - util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf; - if (util_cpu_caps.x86_cpu_type == 0xf) - util_cpu_caps.x86_cpu_type = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */ - - /* general feature flags */ - util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */ - util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */ - util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */ - util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */ - util_cpu_caps.has_sse3 = (regs2[2] >> 0) & 1; /* 0x0000001 */ - util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000020 */ - util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1; - util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1; - util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1; - util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX - ((regs2[2] >> 27) & 1) && // OSXSAVE - ((xgetbv() & 6) == 6); // XMM & YMM - util_cpu_caps.has_f16c = (regs2[2] >> 29) & 1; - util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */ -#if defined(PIPE_ARCH_X86_64) - util_cpu_caps.has_daz = 1; -#else - util_cpu_caps.has_daz = util_cpu_caps.has_sse3 || - (util_cpu_caps.has_sse2 && sse2_has_daz()); -#endif - - cacheline = ((regs2[1] >> 8) & 0xFF) * 8; - if (cacheline > 0) - util_cpu_caps.cacheline = cacheline; - } - if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) { - uint32_t regs7[4]; - cpuid_count(0x00000007, 0x00000000, regs7); - util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1; - } - - if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) { - /* GenuineIntel */ - util_cpu_caps.has_intel = 1; - } - - cpuid(0x80000000, regs); - - if (regs[0] >= 0x80000001) { - - cpuid(0x80000001, regs2); - - util_cpu_caps.has_mmx |= (regs2[3] >> 23) & 1; - util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1; - util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1; - util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1; - - util_cpu_caps.has_xop = util_cpu_caps.has_avx && - ((regs2[2] >> 11) & 1); - } - - if (regs[0] >= 0x80000006) { - cpuid(0x80000006, regs2); - util_cpu_caps.cacheline = regs2[2] & 0xFF; - } - - if (!util_cpu_caps.has_sse) { - util_cpu_caps.has_sse2 = 0; - util_cpu_caps.has_sse3 = 0; - util_cpu_caps.has_ssse3 = 0; - util_cpu_caps.has_sse4_1 = 0; - } - } -#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ - -#if defined(PIPE_ARCH_PPC) - check_os_altivec_support(); -#endif /* PIPE_ARCH_PPC */ - -#ifdef DEBUG - if (debug_get_option_dump_cpu()) { - debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus); - - debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type); - debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline); - - debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc); - debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx); - debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2); - debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse); - debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2); - debug_printf("util_cpu_caps.has_sse3 = %u\n", 
util_cpu_caps.has_sse3);
-      debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
-      debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
-      debug_printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
-      debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
-      debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
-      debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
-      debug_printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
-      debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
-      debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
-      debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
-      debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
-      debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
-   }
-#endif
-
-   util_cpu_detect_initialized = TRUE;
-}
diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c
deleted file mode 100644
index 79c31e1..0000000
--- a/src/gallium/auxiliary/util/u_math.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-
-#include "pipe/p_config.h"
-#include "util/u_math.h"
-#include "util/u_cpu_detect.h"
-
-#if defined(PIPE_ARCH_SSE)
-#include <xmmintrin.h>
-/* This is defined in pmmintrin.h, but it can only be included when -msse3 is
- * used, so just define it here to avoid further mess. */
-#ifndef _MM_DENORMALS_ZERO_MASK
-#define _MM_DENORMALS_ZERO_MASK 0x0040
-#endif
-#endif
-
-#if 0
-/** 2^x, for x in [-1.0, 1.0) */
-float pow2_table[POW2_TABLE_SIZE];
-
-
-static void
-init_pow2_table(void)
-{
-   int i;
-   for (i = 0; i < POW2_TABLE_SIZE; i++)
-      pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
-}
-
-
-/** log2(x), for x in [1.0, 2.0) */
-float log2_table[LOG2_TABLE_SIZE];
-
-
-static void
-init_log2_table(void)
-{
-   unsigned i;
-   for (i = 0; i < LOG2_TABLE_SIZE; i++)
-      log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE));
-}
-#endif
-
-/**
- * One time init for math utilities.
- */ -void -util_init_math(void) -{ - static boolean initialized = FALSE; - if (!initialized) { - // init_pow2_table(); - /* init_log2_table();*/ - initialized = TRUE; - } -} - -/** - * Fetches the contents of the fpstate (mxcsr on x86) register. - * - * On platforms without support for it just returns 0. - */ -unsigned -util_fpstate_get(void) -{ - unsigned mxcsr = 0; - -#if defined(PIPE_ARCH_SSE) - if (util_cpu_caps.has_sse) { - mxcsr = _mm_getcsr(); - } -#endif - - return mxcsr; -} - -/** - * Make sure that the fp treats the denormalized floating - * point numbers as zero. - * - * This is the behavior required by D3D10. OpenGL doesn't care. - */ -unsigned -util_fpstate_set_denorms_to_zero(unsigned current_mxcsr) -{ -#if defined(PIPE_ARCH_SSE) - if (util_cpu_caps.has_sse) { - /* Enable flush to zero mode */ - current_mxcsr |= _MM_FLUSH_ZERO_MASK; - if (util_cpu_caps.has_daz) { - /* Enable denormals are zero mode */ - current_mxcsr |= _MM_DENORMALS_ZERO_MASK; - } - util_fpstate_set(current_mxcsr); - } -#endif - return current_mxcsr; -} - -/** - * Set the state of the fpstate (mxcsr on x86) register. - * - * On platforms without support for it's a noop. - */ -void -util_fpstate_set(unsigned mxcsr) -{ -#if defined(PIPE_ARCH_SSE) - if (util_cpu_caps.has_sse) { - _mm_setcsr(mxcsr); - } -#endif -} diff --git a/src/gallium/meson.build b/src/gallium/meson.build index 8fd5388..a7276df 100644 --- a/src/gallium/meson.build +++ b/src/gallium/meson.build @@ -32,7 +32,6 @@ sources_libgallium = [ 'auxiliary/util/u_format.h', 'auxiliary/util/u_rect.h', 'auxiliary/util/u_surface.h', - 'auxiliary/util/u_math.h', 'auxiliary/util/rgtc.h', 'auxiliary/util/u_format.c', 'auxiliary/util/u_inlines.h', @@ -44,16 +43,13 @@ sources_libgallium = [ 'auxiliary/util/u_texture.h', 'auxiliary/util/u_hash_table.h', 'auxiliary/util/u_box.h', - 'auxiliary/util/u_cpu_detect.c', 'auxiliary/util/u_pack_color.h', 'auxiliary/util/u_double_list.h', 'auxiliary/util/u_debug_refcnt.h', 'auxiliary/util/u_bitmask.c', - 'auxiliary/util/u_cpu_detect.h', 'auxiliary/util/u_bitmask.h', 'auxiliary/util/u_format_s3tc.h', 'auxiliary/util/u_surface.c', - 'auxiliary/util/u_math.c', 'auxiliary/util/u_half.h', 'auxiliary/util/u_prim.h', 'auxiliary/util/u_debug_describe.c', diff --git a/src/mesa/meson.build b/src/mesa/meson.build index a582f23..0d1f57a 100644 --- a/src/mesa/meson.build +++ b/src/mesa/meson.build @@ -4,9 +4,12 @@ inc_mesa = include_directories('.', 'compat', 'pipe', 'util') files_mesa = files( + 'util/bitscan.c', 'util/os_file.c', 'util/os_misc.c', + 'util/u_cpu_detect.c', 'util/u_debug.c', + 'util/u_math.c', ) deps_mesa = [ diff --git a/src/mesa/util/bitscan.c b/src/mesa/util/bitscan.c new file mode 100644 index 0000000..88d7f94 --- /dev/null +++ b/src/mesa/util/bitscan.c @@ -0,0 +1,80 @@ +/************************************************************************** + * + * Copyright 2008 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "bitscan.h" + +#ifdef HAVE___BUILTIN_FFS +#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64) +#else +int +ffs(int i) +{ + int bit = 0; + if (!i) + return bit; + if (!(i & 0xffff)) { + bit += 16; + i >>= 16; + } + if (!(i & 0xff)) { + bit += 8; + i >>= 8; + } + if (!(i & 0xf)) { + bit += 4; + i >>= 4; + } + if (!(i & 0x3)) { + bit += 2; + i >>= 2; + } + if (!(i & 0x1)) + bit += 1; + return bit + 1; +} +#endif + +#ifdef HAVE___BUILTIN_FFSLL +#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64) +#else +int +ffsll(long long int val) +{ + int bit; + + bit = ffs((unsigned) (val & 0xffffffff)); + if (bit != 0) + return bit; + + bit = ffs((unsigned) (val >> 32)); + if (bit != 0) + return 32 + bit; + + return 0; +} +#endif diff --git a/src/mesa/util/bitscan.h b/src/mesa/util/bitscan.h new file mode 100644 index 0000000..105b7ba --- /dev/null +++ b/src/mesa/util/bitscan.h @@ -0,0 +1,356 @@ +/************************************************************************** + * + * Copyright 2008 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
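The fallbacks above follow the POSIX convention: bit numbering starts at 1 and a zero input yields 0. A few illustrative checks (hypothetical test function, not part of this patch):

#include <assert.h>

static void
example_ffs_semantics(void)
{
   assert(ffs(0) == 0);               /* no bits set */
   assert(ffs(1) == 1);               /* least significant bit is bit 1 */
   assert(ffs(0x10) == 5);
   assert(ffsll(1ull << 40) == 41);   /* 64-bit variant */
}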
+ *
+ **************************************************************************/
+
+
+#ifndef BITSCAN_H
+#define BITSCAN_H
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdbool.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+#if defined(__POPCNT__)
+#include <popcntintrin.h>
+#endif
+
+#include "c99_compat.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/**
+ * Find first bit set in word.  Least significant bit is 1.
+ * Return 0 if no bits set.
+ */
+#ifdef HAVE___BUILTIN_FFS
+#define ffs __builtin_ffs
+#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64)
+static inline
+int ffs(int i)
+{
+   unsigned long index;
+   if (_BitScanForward(&index, i))
+      return index + 1;
+   else
+      return 0;
+}
+#else
+extern
+int ffs(int i);
+#endif
+
+#ifdef HAVE___BUILTIN_FFSLL
+#define ffsll __builtin_ffsll
+#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64)
+static inline int
+ffsll(long long int i)
+{
+   unsigned long index;
+   if (_BitScanForward64(&index, i))
+      return index + 1;
+   else
+      return 0;
+}
+#else
+extern int
+ffsll(long long int val);
+#endif
+
+
+/* Destructively loop over all of the bits in a mask as in:
+ *
+ * while (mymask) {
+ *   int i = u_bit_scan(&mymask);
+ *   ... process element i
+ * }
+ *
+ */
+static inline int
+u_bit_scan(unsigned *mask)
+{
+   const int i = ffs(*mask) - 1;
+   *mask ^= (1u << i);
+   return i;
+}
+
+#define u_foreach_bit(b, dword)                          \
+   for (uint32_t __dword = (dword), b;                   \
+        ((b) = ffs(__dword) - 1, __dword);               \
+        __dword &= ~(1 << (b)))
+
+static inline int
+u_bit_scan64(uint64_t *mask)
+{
+   const int i = ffsll(*mask) - 1;
+   *mask ^= (((uint64_t)1) << i);
+   return i;
+}
+
+#define u_foreach_bit64(b, dword)                        \
+   for (uint64_t __dword = (dword), b;                   \
+        ((b) = ffsll(__dword) - 1, __dword);             \
+        __dword &= ~(1ull << (b)))
+
+/* Determine if an unsigned value is a power of two.
+ *
+ * \note
+ * Zero is treated as a power of two.
+ */
+static inline bool
+util_is_power_of_two_or_zero(unsigned v)
+{
+   return (v & (v - 1)) == 0;
+}
+
+/* Determine if an uint64_t value is a power of two.
+ *
+ * \note
+ * Zero is treated as a power of two.
+ */
+static inline bool
+util_is_power_of_two_or_zero64(uint64_t v)
+{
+   return (v & (v - 1)) == 0;
+}
+
+/* Determine if an unsigned value is a power of two.
+ *
+ * \note
+ * Zero is \b not treated as a power of two.
+ */
+static inline bool
+util_is_power_of_two_nonzero(unsigned v)
+{
+   /* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT.  The latter
+    * indicates the existence of the __builtin_popcount function.  The former
+    * indicates that _mm_popcnt_u32 exists and is a native instruction.
+    *
+    * The other alternative is to use SSE 4.2 compile-time flags.  This has
+    * two drawbacks.  First, there is currently no build infrastructure for
+    * SSE 4.2 (only 4.1), so that would have to be added.  Second, some AMD
+    * CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona).
+    */
+#ifdef __POPCNT__
+   return _mm_popcnt_u32(v) == 1;
+#else
+   return v != 0 && (v & (v - 1)) == 0;
+#endif
+}
+
+/* For looping over a bitmask when you want to loop over consecutive bits
+ * manually, for example:
+ *
+ * while (mask) {
+ *    int start, count, i;
+ *
+ *    u_bit_scan_consecutive_range(&mask, &start, &count);
+ *
+ *    for (i = 0; i < count; i++)
+ *       ...
process element (start+i) + * } + */ +static inline void +u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count) +{ + if (*mask == 0xffffffff) { + *start = 0; + *count = 32; + *mask = 0; + return; + } + *start = ffs(*mask) - 1; + *count = ffs(~(*mask >> *start)) - 1; + *mask &= ~(((1u << *count) - 1) << *start); +} + +static inline void +u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count) +{ + if (*mask == ~0ull) { + *start = 0; + *count = 64; + *mask = 0; + return; + } + *start = ffsll(*mask) - 1; + *count = ffsll(~(*mask >> *start)) - 1; + *mask &= ~(((((uint64_t)1) << *count) - 1) << *start); +} + + +/** + * Find last bit set in a word. The least significant bit is 1. + * Return 0 if no bits are set. + * Essentially ffs() in the reverse direction. + */ +static inline unsigned +util_last_bit(unsigned u) +{ +#if defined(HAVE___BUILTIN_CLZ) + return u == 0 ? 0 : 32 - __builtin_clz(u); +#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64) + unsigned long index; + if (_BitScanReverse(&index, u)) + return index + 1; + else + return 0; +#else + unsigned r = 0; + while (u) { + r++; + u >>= 1; + } + return r; +#endif +} + +/** + * Find last bit set in a word. The least significant bit is 1. + * Return 0 if no bits are set. + * Essentially ffsll() in the reverse direction. + */ +static inline unsigned +util_last_bit64(uint64_t u) +{ +#if defined(HAVE___BUILTIN_CLZLL) + return u == 0 ? 0 : 64 - __builtin_clzll(u); +#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64) + unsigned long index; + if (_BitScanReverse64(&index, u)) + return index + 1; + else + return 0; +#else + unsigned r = 0; + while (u) { + r++; + u >>= 1; + } + return r; +#endif +} + +/** + * Find last bit in a word that does not match the sign bit. The least + * significant bit is 1. + * Return 0 if no bits are set. + */ +static inline unsigned +util_last_bit_signed(int i) +{ + if (i >= 0) + return util_last_bit(i); + else + return util_last_bit(~(unsigned)i); +} + +/* Returns a bitfield in which the first count bits starting at start are + * set. + */ +static inline unsigned +u_bit_consecutive(unsigned start, unsigned count) +{ + assert(start + count <= 32); + if (count == 32) + return ~0; + return ((1u << count) - 1) << start; +} + +static inline uint64_t +u_bit_consecutive64(unsigned start, unsigned count) +{ + assert(start + count <= 64); + if (count == 64) + return ~(uint64_t)0; + return (((uint64_t)1 << count) - 1) << start; +} + +/** + * Return number of bits set in n. + */ +static inline unsigned +util_bitcount(unsigned n) +{ +#if defined(HAVE___BUILTIN_POPCOUNT) + return __builtin_popcount(n); +#else + /* K&R classic bitcount. + * + * For each iteration, clear the LSB from the bitfield. + * Requires only one iteration per set bit, instead of + * one iteration per bit less than highest set bit. + */ + unsigned bits; + for (bits = 0; n; bits++) { + n &= n - 1; + } + return bits; +#endif +} + +/** + * Return the number of bits set in n using the native popcnt instruction. + * The caller is responsible for ensuring that popcnt is supported by the CPU. + * + * gcc doesn't use it if -mpopcnt or -march= that has popcnt is missing. + * + */ +static inline unsigned +util_popcnt_inline_asm(unsigned n) +{ +#if defined(USE_X86_64_ASM) || defined(USE_X86_ASM) + uint32_t out; + __asm volatile("popcnt %1, %0" : "=r"(out) : "r"(n)); + return out; +#else + /* We should never get here by accident, but I'm sure it'll happen. 
*/ + return util_bitcount(n); +#endif +} + +static inline unsigned +util_bitcount64(uint64_t n) +{ +#ifdef HAVE___BUILTIN_POPCOUNTLL + return __builtin_popcountll(n); +#else + return util_bitcount(n) + util_bitcount(n >> 32); +#endif +} + +#ifdef __cplusplus +} +#endif + +#endif /* BITSCAN_H */ diff --git a/src/mesa/util/u_cpu_detect.c b/src/mesa/util/u_cpu_detect.c new file mode 100644 index 0000000..955d087 --- /dev/null +++ b/src/mesa/util/u_cpu_detect.c @@ -0,0 +1,865 @@ +/************************************************************************** + * + * Copyright 2008 Dennis Smit + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * CPU feature detection. 
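Typical usage of the scan helpers above; a minimal sketch (hypothetical function, not part of this patch):

#include "util/bitscan.h"
#include <stdio.h>

static void
example_bit_iteration(void)
{
   unsigned mask = 0xb001u;   /* bits 0, 12, 13, 15 */

   /* Destructive scan: yields 0, 12, 13, 15 and leaves mask == 0. */
   while (mask) {
      int i = u_bit_scan(&mask);
      printf("bit %d set\n", i);
   }

   /* Non-destructive form of the same loop. */
   u_foreach_bit(b, 0xb001u)
      printf("bit %u set\n", b);
}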
+ *
+ * @author Dennis Smit
+ * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
+ */
+
+#include "pipe/p_config.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_debug.h"
+#include "u_cpu_detect.h"
+#include "u_math.h"
+#include "c11/threads.h"
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#if defined(PIPE_ARCH_PPC)
+#if defined(PIPE_OS_APPLE)
+#include <sys/sysctl.h>
+#else
+#include <signal.h>
+#include <setjmp.h>
+#endif
+#endif
+
+#if defined(PIPE_OS_BSD)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#endif
+
+#if defined(PIPE_OS_FREEBSD)
+#if __has_include(<sys/auxv.h>)
+#include <sys/auxv.h>
+#define HAVE_ELF_AUX_INFO
+#endif
+#endif
+
+#if defined(PIPE_OS_LINUX)
+#include <signal.h>
+#include <fcntl.h>
+#include <elf.h>
+#endif
+
+#ifdef PIPE_OS_UNIX
+#include <unistd.h>
+#endif
+
+#if defined(HAS_ANDROID_CPUFEATURES)
+#include <cpu-features.h>
+#endif
+
+#if defined(PIPE_OS_WINDOWS)
+#include <windows.h>
+#if defined(PIPE_CC_MSVC)
+#include <intrin.h>
+#endif
+#endif
+
+#if defined(HAS_SCHED_H)
+#include <sched.h>
+#endif
+
+DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)
+
+
+struct util_cpu_caps_t util_cpu_caps;
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+static int has_cpuid(void);
+#endif
+
+
+#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) && !defined(PIPE_OS_BSD) && !defined(PIPE_OS_LINUX)
+static jmp_buf  __lv_powerpc_jmpbuf;
+static volatile sig_atomic_t __lv_powerpc_canjump = 0;
+
+static void
+sigill_handler(int sig)
+{
+   if (!__lv_powerpc_canjump) {
+      signal (sig, SIG_DFL);
+      raise (sig);
+   }
+
+   __lv_powerpc_canjump = 0;
+   longjmp(__lv_powerpc_jmpbuf, 1);
+}
+#endif
+
+#if defined(PIPE_ARCH_PPC)
+static void
+check_os_altivec_support(void)
+{
+#if defined(__ALTIVEC__)
+   util_cpu_caps.has_altivec = 1;
+#endif
+#if defined(__VSX__)
+   util_cpu_caps.has_vsx = 1;
+#endif
+#if defined(__ALTIVEC__) && defined(__VSX__)
+/* Do nothing */
+#elif defined(PIPE_OS_APPLE) || defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
+#ifdef HW_VECTORUNIT
+   int sels[2] = {CTL_HW, HW_VECTORUNIT};
+#else
+   int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC};
+#endif
+   int has_vu = 0;
+   int len = sizeof (has_vu);
+   int err;
+
+   err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+   if (err == 0) {
+      if (has_vu != 0) {
+         util_cpu_caps.has_altivec = 1;
+      }
+   }
+#elif defined(PIPE_OS_FREEBSD) /* !PIPE_OS_APPLE && !PIPE_OS_NETBSD && !PIPE_OS_OPENBSD */
+   unsigned long hwcap = 0;
+#ifdef HAVE_ELF_AUX_INFO
+   elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+#else
+   size_t len = sizeof(hwcap);
+   sysctlbyname("hw.cpu_features", &hwcap, &len, NULL, 0);
+#endif
+   if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
+      util_cpu_caps.has_altivec = 1;
+   if (hwcap & PPC_FEATURE_HAS_VSX)
+      util_cpu_caps.has_vsx = 1;
+#elif defined(PIPE_OS_LINUX) /* !PIPE_OS_FREEBSD */
+#if defined(PIPE_ARCH_PPC_64)
+   Elf64_auxv_t aux;
+#else
+   Elf32_auxv_t aux;
+#endif
+   int fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
+   if (fd >= 0) {
+      while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
+         if (aux.a_type == AT_HWCAP) {
+            char *env_vsx = getenv("GALLIVM_VSX");
+            uint64_t hwcap = aux.a_un.a_val;
+            util_cpu_caps.has_altivec = (hwcap >> 28) & 1;
+            if (!env_vsx || env_vsx[0] != '0') {
+               util_cpu_caps.has_vsx = (hwcap >> 7) & 1;
+            }
+            break;
+         }
+      }
+      close(fd);
+   }
+#else /* !PIPE_OS_APPLE && !PIPE_OS_BSD && !PIPE_OS_LINUX */
+   /* not on Apple/Darwin or Linux, do it the brute-force way */
+   /* this is borrowed from the libmpeg2 library */
+   signal(SIGILL, sigill_handler);
+   if (setjmp(__lv_powerpc_jmpbuf)) {
+      signal(SIGILL, SIG_DFL);
+   } else {
+      boolean enable_altivec = TRUE;    /* Default: enable  if available, and if not overridden */
+      boolean enable_vsx = TRUE;
+#ifdef DEBUG
+      /* Disabling Altivec code
generation is not the same as disabling VSX code generation, + * which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf. + * lp_build_create_jit_compiler_for_module(). + * If you want to disable Altivec code generation, the best place to do it is here. + */ + char *env_control = getenv("GALLIVM_ALTIVEC"); /* 1=enable (default); 0=disable */ + if (env_control && env_control[0] == '0') { + enable_altivec = FALSE; + } +#endif + /* VSX instructions can be explicitly enabled/disabled via GALLIVM_VSX=1 or 0 */ + char *env_vsx = getenv("GALLIVM_VSX"); + if (env_vsx && env_vsx[0] == '0') { + enable_vsx = FALSE; + } + if (enable_altivec) { + __lv_powerpc_canjump = 1; + + __asm __volatile + ("mtspr 256, %0\n\t" + "vand %%v0, %%v0, %%v0" + : + : "r" (-1)); + + util_cpu_caps.has_altivec = 1; + + if (enable_vsx) { + __asm __volatile("xxland %vs0, %vs0, %vs0"); + util_cpu_caps.has_vsx = 1; + } + signal(SIGILL, SIG_DFL); + } else { + util_cpu_caps.has_altivec = 0; + } + } +#endif /* !PIPE_OS_APPLE && !PIPE_OS_LINUX */ +} +#endif /* PIPE_ARCH_PPC */ + + +#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) +static int has_cpuid(void) +{ +#if defined(PIPE_ARCH_X86) +#if defined(PIPE_OS_GCC) + int a, c; + + __asm __volatile + ("pushf\n" + "popl %0\n" + "movl %0, %1\n" + "xorl $0x200000, %0\n" + "push %0\n" + "popf\n" + "pushf\n" + "popl %0\n" + : "=a" (a), "=c" (c) + : + : "cc"); + + return a != c; +#else + /* FIXME */ + return 1; +#endif +#elif defined(PIPE_ARCH_X86_64) + return 1; +#else + return 0; +#endif +} + + +/** + * @sa cpuid.h included in gcc-4.3 onwards. + * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx + */ +static inline void +cpuid(uint32_t ax, uint32_t *p) +{ +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) + __asm __volatile ( + "xchgl %%ebx, %1\n\t" + "cpuid\n\t" + "xchgl %%ebx, %1" + : "=a" (p[0]), + "=S" (p[1]), + "=c" (p[2]), + "=d" (p[3]) + : "0" (ax) + ); +#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64) + __asm __volatile ( + "cpuid\n\t" + : "=a" (p[0]), + "=b" (p[1]), + "=c" (p[2]), + "=d" (p[3]) + : "0" (ax) + ); +#elif defined(PIPE_CC_MSVC) + __cpuid(p, ax); +#else + p[0] = 0; + p[1] = 0; + p[2] = 0; + p[3] = 0; +#endif +} + +/** + * @sa cpuid.h included in gcc-4.4 onwards. 
+ * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx + */ +static inline void +cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p) +{ +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) + __asm __volatile ( + "xchgl %%ebx, %1\n\t" + "cpuid\n\t" + "xchgl %%ebx, %1" + : "=a" (p[0]), + "=S" (p[1]), + "=c" (p[2]), + "=d" (p[3]) + : "0" (ax), "2" (cx) + ); +#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64) + __asm __volatile ( + "cpuid\n\t" + : "=a" (p[0]), + "=b" (p[1]), + "=c" (p[2]), + "=d" (p[3]) + : "0" (ax), "2" (cx) + ); +#elif defined(PIPE_CC_MSVC) + __cpuidex(p, ax, cx); +#else + p[0] = 0; + p[1] = 0; + p[2] = 0; + p[3] = 0; +#endif +} + + +static inline uint64_t xgetbv(void) +{ +#if defined(PIPE_CC_GCC) + uint32_t eax, edx; + + __asm __volatile ( + ".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4 + : "=a"(eax), + "=d"(edx) + : "c"(0) + ); + + return ((uint64_t)edx << 32) | eax; +#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) + return _xgetbv(_XCR_XFEATURE_ENABLED_MASK); +#else + return 0; +#endif +} + + +#if defined(PIPE_ARCH_X86) +PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void) +{ + struct { + uint32_t pad1[7]; + uint32_t mxcsr_mask; + uint32_t pad2[128-8]; + } PIPE_ALIGN_VAR(16) fxarea; + + fxarea.mxcsr_mask = 0; +#if defined(PIPE_CC_GCC) + __asm __volatile ("fxsave %0" : "+m" (fxarea)); +#elif defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL) + _fxsave(&fxarea); +#else + fxarea.mxcsr_mask = 0; +#endif + return !!(fxarea.mxcsr_mask & (1 << 6)); +} +#endif + +#endif /* X86 or X86_64 */ + +#if defined(PIPE_ARCH_ARM) +static void +check_os_arm_support(void) +{ + /* + * On Android, the cpufeatures library is preferred way of checking + * CPU capabilities. However, it is not available for standalone Mesa + * builds, i.e. when Android build system (Android.mk-based) is not + * used. Because of this we cannot use PIPE_OS_ANDROID here, but rather + * have a separate macro that only gets enabled from respective Android.mk. 
+ */ +#if defined(__ARM_NEON) || defined(__ARM_NEON__) + util_cpu_caps.has_neon = 1; +#elif defined(PIPE_OS_FREEBSD) && defined(HAVE_ELF_AUX_INFO) + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + if (hwcap & HWCAP_NEON) + util_cpu_caps.has_neon = 1; +#elif defined(HAS_ANDROID_CPUFEATURES) + AndroidCpuFamily cpu_family = android_getCpuFamily(); + uint64_t cpu_features = android_getCpuFeatures(); + + if (cpu_family == ANDROID_CPU_FAMILY_ARM) { + if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) + util_cpu_caps.has_neon = 1; + } +#elif defined(PIPE_OS_LINUX) + Elf32_auxv_t aux; + int fd; + + fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); + if (fd >= 0) { + while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) { + if (aux.a_type == AT_HWCAP) { + uint32_t hwcap = aux.a_un.a_val; + + util_cpu_caps.has_neon = (hwcap >> 12) & 1; + break; + } + } + close (fd); + } +#endif /* PIPE_OS_LINUX */ +} + +#elif defined(PIPE_ARCH_AARCH64) +static void +check_os_arm_support(void) +{ + util_cpu_caps.has_neon = true; +} +#endif /* PIPE_ARCH_ARM || PIPE_ARCH_AARCH64 */ + +#if defined(PIPE_ARCH_MIPS64) +static void +check_os_mips64_support(void) +{ + Elf64_auxv_t aux; + int fd; + + fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); + if (fd >= 0) { + while (read(fd, &aux, sizeof(Elf64_auxv_t)) == sizeof(Elf64_auxv_t)) { + if (aux.a_type == AT_HWCAP) { + uint64_t hwcap = aux.a_un.a_val; + + util_cpu_caps.has_msa = (hwcap >> 1) & 1; + break; + } + } + close (fd); + } +} +#endif /* PIPE_ARCH_MIPS64 */ + + +static void +get_cpu_topology(void) +{ + /* Default. This is OK if L3 is not present or there is only one. */ + util_cpu_caps.num_L3_caches = 1; + + memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3)); + +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* AMD Zen */ + if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 && + util_cpu_caps.family < CPU_AMD_LAST) { + uint32_t regs[4]; + + uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0}; + uint32_t mask[UTIL_MAX_CPUS / 32] = {0}; + bool saved = false; + + uint32_t L3_found[UTIL_MAX_CPUS] = {0}; + uint32_t num_L3_caches = 0; + util_affinity_mask *L3_affinity_masks = NULL; + + /* Query APIC IDs from each CPU core. + * + * An APIC ID is a logical ID of the CPU with respect to the cache + * hierarchy, meaning that consecutive APIC IDs are neighbours in + * the hierarchy, e.g. sharing the same cache. + * + * For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1, + * which means that both CPU 0 and 12 are next to each other. + * (e.g. they are 2 threads belonging to 1 SMT2 core) + * + * We need to find out which CPUs share the same L3 cache and they can + * be all over the place. + * + * Querying the APIC ID can only be done by pinning the current thread + * to each core. The original affinity mask is saved. + * + * Loop over all possible CPUs even though some may be offline. + */ + for (int16_t i = 0; i < util_cpu_caps.max_cpus && i < UTIL_MAX_CPUS; i++) { + uint32_t cpu_bit = 1u << (i % 32); + + mask[i / 32] = cpu_bit; + + /* The assumption is that trying to bind the thread to a CPU that is + * offline will fail. + */ + if (util_set_current_thread_affinity(mask, + !saved ? saved_mask : NULL, + util_cpu_caps.num_cpu_mask_bits)) { + saved = true; + + /* Query the APIC ID of the current core. 
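As a worked example of the derivation below, with hypothetical values:

/* Assume apic_id = 0x1b, core_count = 8, cores_per_L3 = 4. */
unsigned apic_id       = 0x1b;
unsigned core_count    = 8;                                    /* already a power of two */
unsigned local_core_id = apic_id & (core_count - 1);           /* 0x1b & 7  = 3 */
unsigned phys_id       = (apic_id & ~(core_count - 1)) >> 3;   /* 0x18 >> 3 = 3 */
unsigned l3_index      = local_core_id / 4;                    /* 3 / 4     = 0 */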
*/ + cpuid(0x00000001, regs); + unsigned apic_id = regs[1] >> 24; + + /* Query the total core count for the CPU */ + uint32_t core_count = 1; + if (regs[3] & (1 << 28)) + core_count = (regs[1] >> 16) & 0xff; + + core_count = util_next_power_of_two(core_count); + + /* Query the L3 cache count. */ + cpuid_count(0x8000001D, 3, regs); + unsigned cache_level = (regs[0] >> 5) & 0x7; + unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1; + + if (cache_level != 3) + continue; + + unsigned local_core_id = apic_id & (core_count - 1); + unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count); + unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3); +#define L3_ID(p, i) (p << 16 | i << 1 | 1); + + unsigned l3_id = L3_ID(phys_id, local_l3_cache_index); + int idx = -1; + for (unsigned c = 0; c < num_L3_caches; c++) { + if (L3_found[c] == l3_id) { + idx = c; + break; + } + } + if (idx == -1) { + idx = num_L3_caches; + L3_found[num_L3_caches++] = l3_id; + L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches); + if (!L3_affinity_masks) + return; + memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask)); + } + util_cpu_caps.cpu_to_L3[i] = idx; + L3_affinity_masks[idx][i / 32] |= cpu_bit; + + } + mask[i / 32] = 0; + } + + util_cpu_caps.num_L3_caches = num_L3_caches; + util_cpu_caps.L3_affinity_mask = L3_affinity_masks; + + if (saved) { + if (debug_get_option_dump_cpu()) { + fprintf(stderr, "CPU <-> L3 cache mapping:\n"); + for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) { + fprintf(stderr, " - L3 %u mask = ", i); + for (int j = util_cpu_caps.max_cpus - 1; j >= 0; j -= 32) + fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]); + fprintf(stderr, "\n"); + } + } + + /* Restore the original affinity mask. */ + util_set_current_thread_affinity(saved_mask, NULL, + util_cpu_caps.num_cpu_mask_bits); + } else { + if (debug_get_option_dump_cpu()) + fprintf(stderr, "Cannot set thread affinity for any thread.\n"); + } + } +#endif +} + +static void +util_cpu_detect_once(void) +{ + int available_cpus = 0; + int total_cpus = 0; + + memset(&util_cpu_caps, 0, sizeof util_cpu_caps); + + /* Count the number of CPUs in system */ +#if defined(PIPE_OS_WINDOWS) + { + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + available_cpus = MAX2(1, system_info.dwNumberOfProcessors); + } +#elif defined(PIPE_OS_UNIX) +# if defined(HAS_SCHED_GETAFFINITY) + { + /* sched_setaffinity() can be used to further restrict the number of + * CPUs on which the process can run. Use sched_getaffinity() to + * determine the true number of available CPUs. + * + * FIXME: The Linux manual page for sched_getaffinity describes how this + * simple implementation will fail with > 1024 CPUs, and we'll fall back + * to the _SC_NPROCESSORS_ONLN path. Support for > 1024 CPUs can be + * added to this path once someone has such a system for testing. + */ + cpu_set_t affin; + if (sched_getaffinity(getpid(), sizeof(affin), &affin) == 0) + available_cpus = CPU_COUNT(&affin); + } +# endif + + /* Linux, FreeBSD, DragonFly, and Mac OS X should have + * _SC_NOPROCESSORS_ONLN. NetBSD and OpenBSD should have HW_NCPUONLINE. + * This is what FFmpeg uses on those platforms. 
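The affinity-based count described above reduces to this pattern; a minimal Linux-only sketch (not part of this patch):

#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>

static int
example_available_cpus(void)
{
   /* CPUs usable by this process; may be fewer than the online count
    * if an affinity mask was set (e.g. via taskset). */
   cpu_set_t affin;
   if (sched_getaffinity(getpid(), sizeof(affin), &affin) == 0)
      return CPU_COUNT(&affin);
   return (int) sysconf(_SC_NPROCESSORS_ONLN);   /* fallback: online CPUs */
}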
+ */ +# if defined(PIPE_OS_BSD) && defined(HW_NCPUONLINE) + if (available_cpus == 0) { + const int mib[] = { CTL_HW, HW_NCPUONLINE }; + int ncpu; + int len = sizeof(ncpu); + + sysctl(mib, 2, &ncpu, &len, NULL, 0); + available_cpus = ncpu; + } +# elif defined(_SC_NPROCESSORS_ONLN) + if (available_cpus == 0) { + available_cpus = sysconf(_SC_NPROCESSORS_ONLN); + if (available_cpus == ~0) + available_cpus = 1; + } +# elif defined(PIPE_OS_BSD) + if (available_cpus == 0) { + const int mib[] = { CTL_HW, HW_NCPU }; + int ncpu; + int len = sizeof(ncpu); + + sysctl(mib, 2, &ncpu, &len, NULL, 0); + available_cpus = ncpu; + } +# endif /* defined(PIPE_OS_BSD) */ + + /* Determine the maximum number of CPUs configured in the system. This is + * used to properly set num_cpu_mask_bits below. On BSDs that don't have + * HW_NCPUONLINE, it was not clear whether HW_NCPU is the number of + * configured or the number of online CPUs. For that reason, prefer the + * _SC_NPROCESSORS_CONF path on all BSDs. + */ +# if defined(_SC_NPROCESSORS_CONF) + total_cpus = sysconf(_SC_NPROCESSORS_CONF); + if (total_cpus == ~0) + total_cpus = 1; +# elif defined(PIPE_OS_BSD) + { + const int mib[] = { CTL_HW, HW_NCPU }; + int ncpu; + int len = sizeof(ncpu); + + sysctl(mib, 2, &ncpu, &len, NULL, 0); + total_cpus = ncpu; + } +# endif /* defined(PIPE_OS_BSD) */ +#endif /* defined(PIPE_OS_UNIX) */ + + util_cpu_caps.nr_cpus = MAX2(1, available_cpus); + total_cpus = MAX2(total_cpus, util_cpu_caps.nr_cpus); + + util_cpu_caps.max_cpus = total_cpus; + util_cpu_caps.num_cpu_mask_bits = align(total_cpus, 32); + + /* Make the fallback cacheline size nonzero so that it can be + * safely passed to align(). + */ + util_cpu_caps.cacheline = sizeof(void *); + +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + if (has_cpuid()) { + uint32_t regs[4]; + uint32_t regs2[4]; + + util_cpu_caps.cacheline = 32; + + /* Get max cpuid level */ + cpuid(0x00000000, regs); + + if (regs[0] >= 0x00000001) { + unsigned int cacheline; + + cpuid (0x00000001, regs2); + + util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf; + /* Add "extended family". 
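For instance, a representative Zen 3 part reports CPUID leaf 1 EAX values like 0x00A20F10 (hypothetical but typical encoding):

uint32_t eax    = 0x00A20F10;          /* CPUID.1.EAX */
unsigned family = (eax >> 8) & 0xf;    /* base family = 0xf */
family         += (eax >> 20) & 0xff;  /* + extended family 0x0a -> 0x19 */
/* 0x19 selects CPU_AMD_ZEN3 in the switch below. */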
*/ + if (util_cpu_caps.x86_cpu_type == 0xf) + util_cpu_caps.x86_cpu_type += ((regs2[0] >> 20) & 0xff); + + switch (util_cpu_caps.x86_cpu_type) { + case 0x17: + util_cpu_caps.family = CPU_AMD_ZEN1_ZEN2; + break; + case 0x18: + util_cpu_caps.family = CPU_AMD_ZEN_HYGON; + break; + case 0x19: + util_cpu_caps.family = CPU_AMD_ZEN3; + break; + default: + if (util_cpu_caps.x86_cpu_type > 0x19) + util_cpu_caps.family = CPU_AMD_ZEN_NEXT; + } + + /* general feature flags */ + util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */ + util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */ + util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */ + util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */ + util_cpu_caps.has_sse3 = (regs2[2] >> 0) & 1; /* 0x0000001 */ + util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000020 */ + util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1; + util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1; + util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1; + util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX + ((regs2[2] >> 27) & 1) && // OSXSAVE + ((xgetbv() & 6) == 6); // XMM & YMM + util_cpu_caps.has_f16c = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx; + util_cpu_caps.has_fma = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx; + util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */ +#if defined(PIPE_ARCH_X86_64) + util_cpu_caps.has_daz = 1; +#else + util_cpu_caps.has_daz = util_cpu_caps.has_sse3 || + (util_cpu_caps.has_sse2 && sse2_has_daz()); +#endif + + cacheline = ((regs2[1] >> 8) & 0xFF) * 8; + if (cacheline > 0) + util_cpu_caps.cacheline = cacheline; + } + if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) { + uint32_t regs7[4]; + cpuid_count(0x00000007, 0x00000000, regs7); + util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1; + } + + // check for avx512 + if (((regs2[2] >> 27) & 1) && // OSXSAVE + (xgetbv() & (0x7 << 5)) && // OPMASK: upper-256 enabled by OS + ((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS + uint32_t regs3[4]; + cpuid_count(0x00000007, 0x00000000, regs3); + util_cpu_caps.has_avx512f = (regs3[1] >> 16) & 1; + util_cpu_caps.has_avx512dq = (regs3[1] >> 17) & 1; + util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1; + util_cpu_caps.has_avx512pf = (regs3[1] >> 26) & 1; + util_cpu_caps.has_avx512er = (regs3[1] >> 27) & 1; + util_cpu_caps.has_avx512cd = (regs3[1] >> 28) & 1; + util_cpu_caps.has_avx512bw = (regs3[1] >> 30) & 1; + util_cpu_caps.has_avx512vl = (regs3[1] >> 31) & 1; + util_cpu_caps.has_avx512vbmi = (regs3[2] >> 1) & 1; + } + + if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) { + /* GenuineIntel */ + util_cpu_caps.has_intel = 1; + } + + cpuid(0x80000000, regs); + + if (regs[0] >= 0x80000001) { + + cpuid(0x80000001, regs2); + + util_cpu_caps.has_mmx |= (regs2[3] >> 23) & 1; + util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1; + util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1; + util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1; + + util_cpu_caps.has_xop = util_cpu_caps.has_avx && + ((regs2[2] >> 11) & 1); + } + + if (regs[0] >= 0x80000006) { + /* should we really do this if the clflush size above worked? 
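The GenuineIntel test above compares the raw registers from leaf 0; the magic constants are just the vendor string split into little-endian 32-bit words. An equivalent string-based sketch (relies on the static cpuid() helper defined earlier in this file; not part of this patch):

#include <string.h>

static bool
example_is_genuine_intel(void)
{
   uint32_t regs[4];
   char vendor[13];

   cpuid(0x00000000, regs);           /* leaf 0: max level + vendor string */
   memcpy(vendor + 0, &regs[1], 4);   /* EBX = "Genu" = 0x756e6547 */
   memcpy(vendor + 4, &regs[3], 4);   /* EDX = "ineI" = 0x49656e69 */
   memcpy(vendor + 8, &regs[2], 4);   /* ECX = "ntel" = 0x6c65746e */
   vendor[12] = '\0';

   return strcmp(vendor, "GenuineIntel") == 0;
}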
*/ + unsigned int cacheline; + cpuid(0x80000006, regs2); + cacheline = regs2[2] & 0xFF; + if (cacheline > 0) + util_cpu_caps.cacheline = cacheline; + } + + if (!util_cpu_caps.has_sse) { + util_cpu_caps.has_sse2 = 0; + util_cpu_caps.has_sse3 = 0; + util_cpu_caps.has_ssse3 = 0; + util_cpu_caps.has_sse4_1 = 0; + } + } +#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ + +#if defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64) + check_os_arm_support(); +#endif + +#if defined(PIPE_ARCH_PPC) + check_os_altivec_support(); +#endif /* PIPE_ARCH_PPC */ + +#if defined(PIPE_ARCH_MIPS64) + check_os_mips64_support(); +#endif /* PIPE_ARCH_MIPS64 */ + + get_cpu_topology(); + + if (debug_get_option_dump_cpu()) { + printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus); + + printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type); + printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline); + + printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc); + printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx); + printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2); + printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse); + printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2); + printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3); + printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3); + printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1); + printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2); + printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx); + printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2); + printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c); + printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt); + printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow); + printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext); + printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop); + printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec); + printf("util_cpu_caps.has_vsx = %u\n", util_cpu_caps.has_vsx); + printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon); + printf("util_cpu_caps.has_msa = %u\n", util_cpu_caps.has_msa); + printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz); + printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f); + printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq); + printf("util_cpu_caps.has_avx512ifma = %u\n", util_cpu_caps.has_avx512ifma); + printf("util_cpu_caps.has_avx512pf = %u\n", util_cpu_caps.has_avx512pf); + printf("util_cpu_caps.has_avx512er = %u\n", util_cpu_caps.has_avx512er); + printf("util_cpu_caps.has_avx512cd = %u\n", util_cpu_caps.has_avx512cd); + printf("util_cpu_caps.has_avx512bw = %u\n", util_cpu_caps.has_avx512bw); + printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl); + printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi); + printf("util_cpu_caps.num_L3_caches = %u\n", util_cpu_caps.num_L3_caches); + printf("util_cpu_caps.num_cpu_mask_bits = %u\n", util_cpu_caps.num_cpu_mask_bits); + } +} + +static once_flag cpu_once_flag = ONCE_FLAG_INIT; + +void +util_cpu_detect(void) +{ + call_once(&cpu_once_flag, util_cpu_detect_once); +} diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/mesa/util/u_cpu_detect.h similarity index 57% rename from src/gallium/auxiliary/util/u_cpu_detect.h rename to src/mesa/util/u_cpu_detect.h index 01f3896..59dd230 
100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.h +++ b/src/mesa/util/u_cpu_detect.h @@ -36,17 +36,45 @@ #define _UTIL_CPU_DETECT_H -#include "pipe/p_compiler.h" #include "pipe/p_config.h" +#include "util/u_thread.h" #ifdef __cplusplus extern "C" { #endif +enum cpu_family { + CPU_UNKNOWN, -struct util_cpu_caps { - int nr_cpus; + CPU_AMD_ZEN1_ZEN2, + CPU_AMD_ZEN_HYGON, + CPU_AMD_ZEN3, + CPU_AMD_ZEN_NEXT, + CPU_AMD_LAST, +}; + +typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32]; + +struct util_cpu_caps_t { + /** + * Number of CPUs available to the process. + * + * This will be less than or equal to \c max_cpus. This is the number of + * CPUs that are online and available to the process. + */ + int16_t nr_cpus; + + /** + * Maximum number of CPUs that can be online in the system. + * + * This will be greater than or equal to \c nr_cpus. This is the number of + * CPUs installed in the system. \c nr_cpus will be less if some CPUs are + * offline. + */ + int16_t max_cpus; + + enum cpu_family family; /* Feature flags */ int x86_cpu_type; @@ -66,15 +94,48 @@ struct util_cpu_caps { unsigned has_avx:1; unsigned has_avx2:1; unsigned has_f16c:1; + unsigned has_fma:1; unsigned has_3dnow:1; unsigned has_3dnow_ext:1; unsigned has_xop:1; unsigned has_altivec:1; + unsigned has_vsx:1; unsigned has_daz:1; + unsigned has_neon:1; + unsigned has_msa:1; + + unsigned has_avx512f:1; + unsigned has_avx512dq:1; + unsigned has_avx512ifma:1; + unsigned has_avx512pf:1; + unsigned has_avx512er:1; + unsigned has_avx512cd:1; + unsigned has_avx512bw:1; + unsigned has_avx512vl:1; + unsigned has_avx512vbmi:1; + + unsigned num_L3_caches; + unsigned num_cpu_mask_bits; + + uint16_t cpu_to_L3[UTIL_MAX_CPUS]; + /* Affinity masks for each L3 cache. */ + util_affinity_mask *L3_affinity_mask; }; -extern struct util_cpu_caps -util_cpu_caps; +#define U_CPU_INVALID_L3 0xffff + +static inline const struct util_cpu_caps_t * +util_get_cpu_caps(void) +{ + extern struct util_cpu_caps_t util_cpu_caps; + + /* If you hit this assert, it means that something is using the + * cpu-caps without having first called util_cpu_detect() + */ + assert(util_cpu_caps.nr_cpus >= 1); + + return &util_cpu_caps; +} void util_cpu_detect(void); diff --git a/src/mesa/util/u_math.c b/src/mesa/util/u_math.c new file mode 100644 index 0000000..7913285 --- /dev/null +++ b/src/mesa/util/u_math.c @@ -0,0 +1,311 @@ +/************************************************************************** + * + * Copyright 2008 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
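Call sites follow the detect-then-query pattern that the assert in util_get_cpu_caps() enforces; a minimal sketch (not part of this patch):

#include "util/u_cpu_detect.h"

static void
example_query_caps(void)
{
   util_cpu_detect();   /* idempotent; guarded by call_once() */

   const struct util_cpu_caps_t *caps = util_get_cpu_caps();
   if (caps->has_avx2) {
      /* ... select an AVX2 code path ... */
   }
}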
+
+static inline const struct util_cpu_caps_t *
+util_get_cpu_caps(void)
+{
+   extern struct util_cpu_caps_t util_cpu_caps;
+
+   /* If you hit this assert, it means that something is using the
+    * cpu-caps without having first called util_cpu_detect()
+    */
+   assert(util_cpu_caps.nr_cpus >= 1);
+
+   return &util_cpu_caps;
+}
 
 void util_cpu_detect(void);
diff --git a/src/mesa/util/u_math.c b/src/mesa/util/u_math.c
new file mode 100644
index 0000000..7913285
--- /dev/null
+++ b/src/mesa/util/u_math.c
@@ -0,0 +1,311 @@
+/**************************************************************************
+ *
+ * Copyright 2008 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_config.h"
+#include "util/u_math.h"
+#include "util/u_cpu_detect.h"
+
+#if defined(PIPE_ARCH_SSE)
+#include <xmmintrin.h>
+/* This is defined in pmmintrin.h, but it can only be included when -msse3
+ * is used, so just define it here to avoid requiring that flag. */
+#ifndef _MM_DENORMALS_ZERO_MASK
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#endif
+#endif
+
+
+/** log2(x), for x in [1.0, 2.0) */
+float log2_table[LOG2_TABLE_SIZE];
+
+
+static void
+init_log2_table(void)
+{
+   unsigned i;
+   for (i = 0; i < LOG2_TABLE_SIZE; i++)
+      log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE));
+}
+
+
+/**
+ * One-time init for math utilities.
+ */
+void
+util_init_math(void)
+{
+   static bool initialized = false;
+   if (!initialized) {
+      init_log2_table();
+      initialized = true;
+   }
+}
+
+/**
+ * Fetches the contents of the fpstate (mxcsr on x86) register.
+ *
+ * On platforms without support for it, this just returns 0.
+ */
+unsigned
+util_fpstate_get(void)
+{
+   unsigned mxcsr = 0;
+
+#if defined(PIPE_ARCH_SSE)
+   if (util_get_cpu_caps()->has_sse) {
+      mxcsr = _mm_getcsr();
+   }
+#endif
+
+   return mxcsr;
+}
+
+/**
+ * Make sure that the FPU treats denormalized floating point numbers as
+ * zero.
+ *
+ * This is the behavior required by D3D10.  OpenGL doesn't care.
+ */
+unsigned
+util_fpstate_set_denorms_to_zero(unsigned current_mxcsr)
+{
+#if defined(PIPE_ARCH_SSE)
+   if (util_get_cpu_caps()->has_sse) {
+      /* Enable flush to zero mode */
+      current_mxcsr |= _MM_FLUSH_ZERO_MASK;
+      if (util_get_cpu_caps()->has_daz) {
+         /* Enable denormals are zero mode */
+         current_mxcsr |= _MM_DENORMALS_ZERO_MASK;
+      }
+      util_fpstate_set(current_mxcsr);
+   }
+#endif
+   return current_mxcsr;
+}
+
+/**
+ * Set the state of the fpstate (mxcsr on x86) register.
+ *
+ * On platforms without support for it, this is a no-op.
+ */
+void
+util_fpstate_set(unsigned mxcsr)
+{
+#if defined(PIPE_ARCH_SSE)
+   if (util_get_cpu_caps()->has_sse) {
+      _mm_setcsr(mxcsr);
+   }
+#endif
+}
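+
+/* Illustrative save/modify/restore use of the three helpers above:
+ *
+ *    unsigned saved = util_fpstate_get();
+ *    util_fpstate_set_denorms_to_zero(saved);
+ *    ... run code that must flush denormals ...
+ *    util_fpstate_set(saved);
+ */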
+
+/**
+ * Compute the inverse of a 4x4 matrix.
+ *
+ * \return false if the source matrix is singular.
+ *
+ * \author
+ * Code contributed by Jacques Leroy jle@star.be
+ *
+ * Calculates the inverse matrix by performing Gaussian elimination with
+ * partial pivoting, followed by back-substitution, with the loops manually
+ * unrolled.
+ */
+bool
+util_invert_mat4x4(float *out, const float *m)
+{
+   float wtmp[4][8];
+   float m0, m1, m2, m3, s;
+   float *r0, *r1, *r2, *r3;
+
+#define MAT(m, r, c) (m)[(c)*4 + (r)]
+#define SWAP_ROWS(a, b)        \
+   {                           \
+      float *_tmp = a;         \
+      (a) = (b);               \
+      (b) = _tmp;              \
+   }
+
+   r0 = wtmp[0], r1 = wtmp[1], r2 = wtmp[2], r3 = wtmp[3];
+
+   r0[0] = MAT(m, 0, 0), r0[1] = MAT(m, 0, 1), r0[2] = MAT(m, 0, 2), r0[3] = MAT(m, 0, 3),
+   r0[4] = 1.0, r0[5] = r0[6] = r0[7] = 0.0,
+
+   r1[0] = MAT(m, 1, 0), r1[1] = MAT(m, 1, 1), r1[2] = MAT(m, 1, 2), r1[3] = MAT(m, 1, 3),
+   r1[5] = 1.0, r1[4] = r1[6] = r1[7] = 0.0,
+
+   r2[0] = MAT(m, 2, 0), r2[1] = MAT(m, 2, 1), r2[2] = MAT(m, 2, 2), r2[3] = MAT(m, 2, 3),
+   r2[6] = 1.0, r2[4] = r2[5] = r2[7] = 0.0,
+
+   r3[0] = MAT(m, 3, 0), r3[1] = MAT(m, 3, 1), r3[2] = MAT(m, 3, 2), r3[3] = MAT(m, 3, 3),
+   r3[7] = 1.0, r3[4] = r3[5] = r3[6] = 0.0;
+
+   /* choose pivot - or die */
+   if (fabsf(r3[0]) > fabsf(r2[0]))
+      SWAP_ROWS(r3, r2);
+   if (fabsf(r2[0]) > fabsf(r1[0]))
+      SWAP_ROWS(r2, r1);
+   if (fabsf(r1[0]) > fabsf(r0[0]))
+      SWAP_ROWS(r1, r0);
+   if (0.0F == r0[0])
+      return false;
+
+   /* eliminate first variable */
+   m1 = r1[0] / r0[0];
+   m2 = r2[0] / r0[0];
+   m3 = r3[0] / r0[0];
+   s = r0[1];
+   r1[1] -= m1 * s;
+   r2[1] -= m2 * s;
+   r3[1] -= m3 * s;
+   s = r0[2];
+   r1[2] -= m1 * s;
+   r2[2] -= m2 * s;
+   r3[2] -= m3 * s;
+   s = r0[3];
+   r1[3] -= m1 * s;
+   r2[3] -= m2 * s;
+   r3[3] -= m3 * s;
+   s = r0[4];
+   if (s != 0.0F) {
+      r1[4] -= m1 * s;
+      r2[4] -= m2 * s;
+      r3[4] -= m3 * s;
+   }
+   s = r0[5];
+   if (s != 0.0F) {
+      r1[5] -= m1 * s;
+      r2[5] -= m2 * s;
+      r3[5] -= m3 * s;
+   }
+   s = r0[6];
+   if (s != 0.0F) {
+      r1[6] -= m1 * s;
+      r2[6] -= m2 * s;
+      r3[6] -= m3 * s;
+   }
+   s = r0[7];
+   if (s != 0.0F) {
+      r1[7] -= m1 * s;
+      r2[7] -= m2 * s;
+      r3[7] -= m3 * s;
+   }
+
+   /* choose pivot - or die */
+   if (fabsf(r3[1]) > fabsf(r2[1]))
+      SWAP_ROWS(r3, r2);
+   if (fabsf(r2[1]) > fabsf(r1[1]))
+      SWAP_ROWS(r2, r1);
+   if (0.0F == r1[1])
+      return false;
+
+   /* eliminate second variable */
+   m2 = r2[1] / r1[1];
+   m3 = r3[1] / r1[1];
+   r2[2] -= m2 * r1[2];
+   r3[2] -= m3 * r1[2];
+   r2[3] -= m2 * r1[3];
+   r3[3] -= m3 * r1[3];
+   s = r1[4];
+   if (0.0F != s) {
+      r2[4] -= m2 * s;
+      r3[4] -= m3 * s;
+   }
+   s = r1[5];
+   if (0.0F != s) {
+      r2[5] -= m2 * s;
+      r3[5] -= m3 * s;
+   }
+   s = r1[6];
+   if (0.0F != s) {
+      r2[6] -= m2 * s;
+      r3[6] -= m3 * s;
+   }
+   s = r1[7];
+   if (0.0F != s) {
+      r2[7] -= m2 * s;
+      r3[7] -= m3 * s;
+   }
+
+   /* choose pivot - or die */
+   if (fabsf(r3[2]) > fabsf(r2[2]))
+      SWAP_ROWS(r3, r2);
+   if (0.0F == r2[2])
+      return false;
+
+   /* eliminate third variable */
+   m3 = r3[2] / r2[2];
+   r3[3] -= m3 * r2[3], r3[4] -= m3 * r2[4], r3[5] -= m3 * r2[5],
+   r3[6] -= m3 * r2[6], r3[7] -= m3 * r2[7];
+
+   /* last check */
+   if (0.0F == r3[3])
+      return false;
+
+   s = 1.0F / r3[3]; /* now back substitute row 3 */
+   r3[4] *= s;
+   r3[5] *= s;
+   r3[6] *= s;
+   r3[7] *= s;
+
+   m2 = r2[3]; /* now back substitute row 2 */
+   s = 1.0F / r2[2];
+   r2[4] = s * (r2[4] - r3[4] * m2), r2[5] = s * (r2[5] - r3[5] * m2),
+   r2[6] = s * (r2[6] - r3[6] * m2), r2[7] = s * (r2[7] - r3[7] * m2);
+   m1 = r1[3];
+   r1[4] -= r3[4] * m1, r1[5] -= r3[5] * m1, r1[6] -= r3[6] * m1, r1[7] -= r3[7] * m1;
+   m0 = r0[3];
+   r0[4] -= r3[4] * m0, r0[5] -= r3[5] * m0, r0[6] -= r3[6] * m0, r0[7] -= r3[7] * m0;
+
+   m1 = r1[2]; /* now back substitute row 1 */
+   s = 1.0F / r1[1];
+   r1[4] = s * (r1[4] - r2[4] * m1), r1[5] = s * (r1[5] - r2[5] * m1),
+   r1[6] = s * (r1[6] - r2[6] * m1), r1[7] = s * (r1[7] - r2[7] * m1);
+   m0 = r0[2];
+   r0[4] -= r2[4] * m0, r0[5] -= r2[5] * m0, r0[6] -= r2[6] * m0, r0[7] -= r2[7] * m0;
+
+   m0 = r0[1]; /* now back substitute row 0 */
+   s = 1.0F / r0[0];
+   r0[4] = s * (r0[4] - r1[4] * m0), r0[5] = s * (r0[5] - r1[5] * m0),
+   r0[6] = s * (r0[6] - r1[6] * m0), r0[7] = s * (r0[7] - r1[7] * m0);
+
+   MAT(out, 0, 0) = r0[4];
+   MAT(out, 0, 1) = r0[5], MAT(out, 0, 2) = r0[6];
+   MAT(out, 0, 3) = r0[7], MAT(out, 1, 0) = r1[4];
+   MAT(out, 1, 1) = r1[5], MAT(out, 1, 2) = r1[6];
+   MAT(out, 1, 3) = r1[7], MAT(out, 2, 0) = r2[4];
+   MAT(out, 2, 1) = r2[5], MAT(out, 2, 2) = r2[6];
+   MAT(out, 2, 3) = r2[7], MAT(out, 3, 0) = r3[4];
+   MAT(out, 3, 1) = r3[5], MAT(out, 3, 2) = r3[6];
+   MAT(out, 3, 3) = r3[7];
+
+#undef MAT
+#undef SWAP_ROWS
+
+   return true;
+}
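+
+/* Illustrative call ('mv' is a hypothetical 4x4 matrix in column-major
+ * order, as the MAT() indexing above implies):
+ *
+ *    float inv[16];
+ *    if (!util_invert_mat4x4(inv, mv))
+ *       ... handle the singular case ...
+ */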
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/mesa/util/u_math.h
similarity index 57%
rename from src/gallium/auxiliary/util/u_math.h
rename to src/mesa/util/u_math.h
index 1b9c51a..7c989a3 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/mesa/util/u_math.h
@@ -39,177 +39,24 @@
 #define U_MATH_H
 
-#include "pipe/p_compiler.h"
+#include "c99_math.h"
+#include <assert.h>
+#include <float.h>
+#include <stdarg.h>
+
+#include "bitscan.h"
+#include "u_endian.h" /* for UTIL_ARCH_BIG_ENDIAN */
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include <math.h>
-#include <float.h>
-#include <stdarg.h>
-
-#ifdef PIPE_OS_UNIX
-#include <strings.h> /* for ffs */
-#endif
-
 
 #ifndef M_SQRT2
 #define M_SQRT2 1.41421356237309504880
 #endif
 
-#if defined(_MSC_VER)
-
-#if _MSC_VER < 1400 && !defined(__cplusplus)
-
-static inline float cosf( float f )
-{
-   return (float) cos( (double) f );
-}
-
-static inline float sinf( float f )
-{
-   return (float) sin( (double) f );
-}
-
-static inline float ceilf( float f )
-{
-   return (float) ceil( (double) f );
-}
-
-static inline float floorf( float f )
-{
-   return (float) floor( (double) f );
-}
-
-static inline float powf( float f, float g )
-{
-   return (float) pow( (double) f, (double) g );
-}
-
-static inline float sqrtf( float f )
-{
-   return (float) sqrt( (double) f );
-}
-
-static inline float fabsf( float f )
-{
-   return (float) fabs( (double) f );
-}
-
-static inline float logf( float f )
-{
-   return (float) log( (double) f );
-}
-
-#else
-/* Work-around an extra semi-colon in VS 2005 logf definition */
-#ifdef logf
-#undef logf
-#define logf(x) ((float)log((double)(x)))
-#endif /* logf */
-
-#if _MSC_VER < 1800
-#define isfinite(x) _finite((double)(x))
-#define isnan(x) _isnan((double)(x))
-#endif /* _MSC_VER < 1800 */
-#endif /* _MSC_VER < 1400 && !defined(__cplusplus) */
-
-#if _MSC_VER < 1800
-static inline double log2( double x )
-{
-   const double invln2 = 1.442695041;
-   return log( x ) * invln2;
-}
-
-static inline double
-round(double x)
-{
-   return x >= 0.0 ? floor(x + 0.5) : ceil(x - 0.5);
-}
-
-static inline float
-roundf(float x)
-{
-   return x >= 0.0f ? floorf(x + 0.5f) : ceilf(x - 0.5f);
-}
-#endif
-
-#ifndef INFINITY
-#define INFINITY (DBL_MAX + DBL_MAX)
-#endif
-
-#ifndef NAN
-#define NAN (INFINITY - INFINITY)
-#endif
-
-#endif /* _MSC_VER */
-
-
-#if __STDC_VERSION__ < 199901L && (!defined(__cplusplus) || defined(_MSC_VER))
-static inline long int
-lrint(double d)
-{
-   long int rounded = (long int)(d + 0.5);
-
-   if (d - floor(d) == 0.5) {
-      if (rounded % 2 != 0)
-         rounded += (d > 0) ? -1 : 1;
-   }
-
-   return rounded;
-}
-
-static inline long int
-lrintf(float f)
-{
-   long int rounded = (long int)(f + 0.5f);
-
-   if (f - floorf(f) == 0.5f) {
-      if (rounded % 2 != 0)
-         rounded += (f > 0) ? -1 : 1;
-   }
-
-   return rounded;
-}
-
-static inline long long int
-llrint(double d)
-{
-   long long int rounded = (long long int)(d + 0.5);
-
-   if (d - floor(d) == 0.5) {
-      if (rounded % 2 != 0)
-         rounded += (d > 0) ? -1 : 1;
-   }
-
-   return rounded;
-}
-
-static inline long long int
-llrintf(float f)
-{
-   long long int rounded = (long long int)(f + 0.5f);
-
-   if (f - floorf(f) == 0.5f) {
-      if (rounded % 2 != 0)
-         rounded += (f > 0) ? -1 : 1;
-   }
-
-   return rounded;
-}
-#endif /* C99 */
-
-#define POW2_TABLE_SIZE_LOG2 9
-#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2)
-#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2)
-#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2))
-extern float pow2_table[POW2_TABLE_SIZE];
-
-
 /**
  * Initialize math module. This should be called before using any
  * other functions in this module.
@@ -236,7 +83,8 @@ union di {
  * Extract the IEEE float32 exponent.
  */
 static inline signed
-util_get_float32_exponent(float x) {
+util_get_float32_exponent(float x)
+{
    union fi f;
 
    f.f = x;
@@ -245,57 +93,7 @@ util_get_float32_exponent(float x) {
 }
 
-/**
- * Fast version of 2^x
- * Identity: exp2(a + b) = exp2(a) * exp2(b)
- * Let ipart = int(x)
- * Let fpart = x - ipart;
- * So, exp2(x) = exp2(ipart) * exp2(fpart)
- * Compute exp2(ipart) with i << ipart
- * Compute exp2(fpart) with lookup table.
- */
-static inline float
-util_fast_exp2(float x)
-{
-   int32_t ipart;
-   float fpart, mpart;
-   union fi epart;
-
-   if(x > 129.00000f)
-      return 3.402823466e+38f;
-
-   if (x < -126.99999f)
-      return 0.0f;
-
-   ipart = (int32_t) x;
-   fpart = x - (float) ipart;
-
-   /* same as
-    *   epart.f = (float) (1 << ipart)
-    * but faster and without integer overflow for ipart > 31
-    */
-   epart.i = (ipart + 127 ) << 23;
-
-   mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)];
-
-   return epart.f * mpart;
-}
-
-
-/**
- * Fast approximation to exp(x).
- */
-static inline float
-util_fast_exp(float x)
-{
-   const float k = 1.44269f; /* = log2(e) */
-   return util_fast_exp2(k * x);
-}
-
-
-#if 0
-
-#define LOG2_TABLE_SIZE_LOG2 16
+#define LOG2_TABLE_SIZE_LOG2 8
 #define LOG2_TABLE_SCALE (1 << LOG2_TABLE_SIZE_LOG2)
 #define LOG2_TABLE_SIZE (LOG2_TABLE_SCALE + 1)
 extern float log2_table[LOG2_TABLE_SIZE];
@@ -317,30 +115,29 @@ util_fast_log2(float x)
 }
 
-/**
- * Fast approximation to x^y.
- */
-static inline float
-util_fast_pow(float x, float y)
-{
-   return util_fast_exp2(util_fast_log2(x) * y);
-}
-#endif
 
-/* Note that this counts zero as a power of two.
- */
-static inline boolean
-util_is_power_of_two( unsigned v )
-{
-   return (v & (v-1)) == 0;
-}
-
-
 /**
  * Floor(x), returned as int.
  */
 static inline int
 util_ifloor(float f)
 {
+#if defined(USE_X86_ASM) && defined(__GNUC__) && defined(__i386__)
+   /*
+    * IEEE floor for computers that round to nearest or even.
+    * 'f' must be between -4194304 and 4194303.
+    * This floor operation is done by "(iround(f + .5) + iround(f - .5)) >> 1",
+    * but uses some IEEE specific tricks for better speed.
+    * Contributed by Josh Vanderhoof
+    */
+   int ai, bi;
+   double af, bf;
+   af = (3 << 22) + 0.5 + (double)f;
+   bf = (3 << 22) + 0.5 - (double)f;
+   /* GCC generates an extra fstp/fld without this. */
+   __asm__ ("fstps %0" : "=m" (ai) : "t" (af) : "st");
+   __asm__ ("fstps %0" : "=m" (bi) : "t" (bf) : "st");
+   return (ai - bi) >> 1;
+#else
   int ai, bi;
   double af, bf;
   union fi u;
@@ -349,6 +146,7 @@ util_ifloor(float f)
   u.f = (float) af; ai = u.i;
   u.f = (float) bf; bi = u.i;
   return (ai - bi) >> 1;
+#endif
 }
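+
+/* e.g. util_ifloor(-0.5f) == -1 and util_ifloor(1.9f) == 1; both variants
+ * rely on 'f' staying within the +/-2^22 range noted in the comment above.
+ */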
 
@@ -381,10 +179,10 @@ util_iround(float f)
 /**
  * Approximate floating point comparison
  */
-static inline boolean
+static inline bool
 util_is_approx(float a, float b, float tol)
 {
-   return fabs(b - a) <= tol;
+   return fabsf(b - a) <= tol;
 }
 
@@ -400,7 +198,7 @@ util_is_approx(float a, float b, float tol)
 /**
  * Single-float
  */
-static inline boolean
+static inline bool
 util_is_inf_or_nan(float x)
 {
    union fi tmp;
@@ -409,7 +207,7 @@
 }
 
-static inline boolean
+static inline bool
 util_is_nan(float x)
 {
    union fi tmp;
@@ -434,7 +232,7 @@ util_inf_sign(float x)
 /**
  * Double-float
  */
-static inline boolean
+static inline bool
 util_is_double_inf_or_nan(double x)
 {
    union di tmp;
@@ -443,7 +241,7 @@
 }
 
-static inline boolean
+static inline bool
 util_is_double_nan(double x)
 {
    union di tmp;
@@ -468,14 +266,14 @@ util_double_inf_sign(double x)
 /**
  * Half-float
  */
-static inline boolean
+static inline bool
 util_is_half_inf_or_nan(int16_t x)
 {
    return (x & 0x7c00) == 0x7c00;
 }
 
-static inline boolean
+static inline bool
 util_is_half_nan(int16_t x)
 {
    return (x & 0x7fff) > 0x7c00;
@@ -494,163 +292,84 @@ util_half_inf_sign(int16_t x)
 
 /**
- * Find first bit set in word. Least significant bit is 1.
- * Return 0 if no bits set.
+ * Return float bits.
  */
-#ifndef FFS_DEFINED
-#define FFS_DEFINED 1
-
-#if defined(_MSC_VER) && _MSC_VER >= 1300 && (_M_IX86 || _M_AMD64 || _M_IA64)
-unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask);
-#pragma intrinsic(_BitScanForward)
-static inline
-unsigned long ffs( unsigned long u )
-{
-   unsigned long i;
-   if (_BitScanForward(&i, u))
-      return i + 1;
-   else
-      return 0;
-}
-#elif defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
-static inline
-unsigned ffs( unsigned u )
+static inline unsigned
+fui( float f )
 {
-   unsigned i;
-
-   if (u == 0) {
-      return 0;
-   }
-
-   __asm bsf eax, [u]
-   __asm inc eax
-   __asm mov [i], eax
-
-   return i;
+   union fi fi;
+   fi.f = f;
+   return fi.ui;
 }
-#elif defined(__MINGW32__) || defined(PIPE_OS_ANDROID)
-#define ffs __builtin_ffs
-#endif
-
-#endif /* FFS_DEFINED */
 
-/**
- * Find last bit set in a word. The least significant bit is 1.
- * Return 0 if no bits are set.
- */
-static inline unsigned util_last_bit(unsigned u)
+static inline float
+uif(uint32_t ui)
 {
-#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304)
-   return u == 0 ? 0 : 32 - __builtin_clz(u);
-#else
-   unsigned r = 0;
-   while (u) {
-      r++;
-      u >>= 1;
-   }
-   return r;
-#endif
+   union fi fi;
+   fi.ui = ui;
+   return fi.f;
 }
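+
+/* fui()/uif() reinterpret the same 32 bits in both directions, so they are
+ * exact inverses; e.g. fui(1.0f) == 0x3f800000 and uif(0x3f800000) == 1.0f.
+ */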
 
-/**
- * Find last bit in a word that does not match the sign bit. The least
- * significant bit is 1.
- * Return 0 if no bits are set.
- */
-static inline unsigned util_last_bit_signed(int i)
-{
-#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 407)
-   return 31 - __builtin_clrsb(i);
-#else
-   if (i >= 0)
-      return util_last_bit(i);
-   else
-      return util_last_bit(~(unsigned)i);
-#endif
-}
 
-/* Destructively loop over all of the bits in a mask as in:
- *
- * while (mymask) {
- *   int i = u_bit_scan(&mymask);
- *   ... process element i
- * }
- *
+/**
+ * Convert uint8_t to float in [0, 1].
  */
-static inline int u_bit_scan(unsigned *mask)
+static inline float
+ubyte_to_float(uint8_t ub)
 {
-   int i = ffs(*mask) - 1;
-   *mask &= ~(1 << i);
-   return i;
+   return (float) ub * (1.0f / 255.0f);
 }
 
-/* For looping over a bitmask when you want to loop over consecutive bits
- * manually, for example:
- *
- * while (mask) {
- *    int start, count, i;
- *
- *    u_bit_scan_consecutive_range(&mask, &start, &count);
- *
- *    for (i = 0; i < count; i++)
- *       ... process element (start+i)
- * }
- */
-static inline void
-u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count)
-{
-   if (*mask == 0xffffffff) {
-      *start = 0;
-      *count = 32;
-      *mask = 0;
-      return;
-   }
-   *start = ffs(*mask) - 1;
-   *count = ffs(~(*mask >> *start)) - 1;
-   *mask &= ~(((1u << *count) - 1) << *start);
-}
 
 /**
- * Return float bits.
+ * Convert float in [0,1] to uint8_t in [0,255] with clamping.
  */
-static inline unsigned
-fui( float f )
+static inline uint8_t
+float_to_ubyte(float f)
 {
-   union fi fi;
-   fi.f = f;
-   return fi.ui;
+   /* return 0 for NaN too */
+   if (!(f > 0.0f)) {
+      return (uint8_t) 0;
+   }
+   else if (f >= 1.0f) {
+      return (uint8_t) 255;
+   }
+   else {
+      union fi tmp;
+      tmp.f = f;
+      tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f;
+      return (uint8_t) tmp.i;
+   }
 }
 
 /**
- * Convert ubyte to float in [0, 1].
- * XXX a 256-entry lookup table would be slightly faster.
+ * Convert uint16_t to float in [0, 1].
  */
 static inline float
-ubyte_to_float(ubyte ub)
+ushort_to_float(uint16_t us)
 {
-   return (float) ub * (1.0f / 255.0f);
+   return (float) us * (1.0f / 65535.0f);
 }
 
 /**
- * Convert float in [0,1] to ubyte in [0,255] with clamping.
+ * Convert float in [0,1] to uint16_t in [0,65535] with clamping.
 */
-static inline ubyte
-float_to_ubyte(float f)
+static inline uint16_t
+float_to_ushort(float f)
 {
-   union fi tmp;
-
-   tmp.f = f;
-   if (tmp.i < 0) {
-      return (ubyte) 0;
+   /* return 0 for NaN too */
+   if (!(f > 0.0f)) {
+      return (uint16_t) 0;
    }
-   else if (tmp.i >= 0x3f800000 /* 1.0f */) {
-      return (ubyte) 255;
+   else if (f >= 1.0f) {
+      return (uint16_t) 65535;
   }
   else {
-      tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f;
-      return (ubyte) tmp.i;
+      union fi tmp;
+      tmp.f = f;
+      tmp.f = tmp.f * (65535.0f/65536.0f) + 128.0f;
+      return (uint16_t) tmp.i;
   }
 }
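+
+/* Illustrative values for the conversions above:
+ *
+ *    ubyte_to_float(255)   == 1.0f
+ *    float_to_ubyte(0.5f)  == 128
+ *    float_to_ushort(0.5f) == 32768
+ *    float_to_ubyte(NAN)   == 0    (caught by the !(f > 0.0f) test)
+ */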
 
@@ -672,7 +391,7 @@ float_to_byte_tex(float f)
 static inline unsigned
 util_logbase2(unsigned n)
 {
-#if defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 304)
+#if defined(HAVE___BUILTIN_CLZ)
    return ((sizeof(unsigned) * 8 - 1) - __builtin_clz(n | 1));
 #else
    unsigned pos = 0;
@@ -685,6 +404,44 @@ util_logbase2(unsigned n)
 #endif
 }
 
+static inline uint64_t
+util_logbase2_64(uint64_t n)
+{
+#if defined(HAVE___BUILTIN_CLZLL)
+   return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1));
+#else
+   uint64_t pos = 0ull;
+   if (n >= 1ull<<32) { n >>= 32; pos += 32; }
+   if (n >= 1ull<<16) { n >>= 16; pos += 16; }
+   if (n >= 1ull<< 8) { n >>=  8; pos +=  8; }
+   if (n >= 1ull<< 4) { n >>=  4; pos +=  4; }
+   if (n >= 1ull<< 2) { n >>=  2; pos +=  2; }
+   if (n >= 1ull<< 1) { pos +=  1; }
+   return pos;
+#endif
+}
+
+/**
+ * Returns the ceiling of log n base 2, and 0 when n == 0.  Equivalently,
+ * returns the smallest x such that n <= 2**x.
+ */
+static inline unsigned
+util_logbase2_ceil(unsigned n)
+{
+   if (n <= 1)
+      return 0;
+
+   return 1 + util_logbase2(n - 1);
+}
+
+static inline uint64_t
+util_logbase2_ceil64(uint64_t n)
+{
+   if (n <= 1)
+      return 0;
+
+   return 1ull + util_logbase2_64(n - 1);
+}
 
 /**
  * Returns the smallest power of two >= x
  */
 static inline unsigned
 util_next_power_of_two(unsigned x)
 {
-#if defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 304)
+#if defined(HAVE___BUILTIN_CLZ)
    if (x <= 1)
       return 1;
 
@@ -703,7 +460,7 @@ util_next_power_of_two(unsigned x)
    if (x <= 1)
       return 1;
 
-   if (util_is_power_of_two(x))
+   if (util_is_power_of_two_or_zero(x))
       return x;
 
   val--;
@@ -717,27 +474,32 @@
 #endif
 }
 
-
-/**
- * Return number of bits set in n.
- */
-static inline unsigned
-util_bitcount(unsigned n)
+static inline uint64_t
+util_next_power_of_two64(uint64_t x)
 {
-#if defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 304)
-   return __builtin_popcount(n);
+#if defined(HAVE___BUILTIN_CLZLL)
+   if (x <= 1)
+      return 1;
+
+   return (1ull << ((sizeof(uint64_t) * 8) - __builtin_clzll(x - 1)));
 #else
-   /* K&R classic bitcount.
-    *
-    * For each iteration, clear the LSB from the bitfield.
-    * Requires only one iteration per set bit, instead of
-    * one iteration per bit less than highest set bit.
-    */
-   unsigned bits = 0;
-   for (bits; n; bits++) {
-      n &= n - 1;
-   }
-   return bits;
+   uint64_t val = x;
+
+   if (x <= 1)
+      return 1;
+
+   if (util_is_power_of_two_or_zero64(x))
+      return x;
+
+   val--;
+   val = (val >> 1)  | val;
+   val = (val >> 2)  | val;
+   val = (val >> 4)  | val;
+   val = (val >> 8)  | val;
+   val = (val >> 16) | val;
+   val = (val >> 32) | val;
+   val++;
+   return val;
 #endif
 }
 
@@ -781,8 +543,7 @@ util_bitreverse(unsigned n)
 static inline uint32_t
 util_bswap32(uint32_t n)
 {
-/* We need the gcc version checks for non-autoconf build system */
-#if defined(HAVE___BUILTIN_BSWAP32) || (defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 403))
+#if defined(HAVE___BUILTIN_BSWAP32)
    return __builtin_bswap32(n);
 #else
    return (n >> 24) |
@@ -801,7 +562,7 @@ util_bswap64(uint64_t n)
 #if defined(HAVE___BUILTIN_BSWAP64)
    return __builtin_bswap64(n);
 #else
-   return ((uint64_t)util_bswap32(n) << 32) |
+   return ((uint64_t)util_bswap32((uint32_t)n) << 32) |
          util_bswap32((n >> 32));
 #endif
 }
@@ -817,6 +578,37 @@ util_bswap16(uint16_t n)
          (n << 8);
 }
 
+/**
+ * Extend sign.
+ */
+static inline int64_t
+util_sign_extend(uint64_t val, unsigned width)
+{
+   assert(width > 0);
+   if (val & (UINT64_C(1) << (width - 1))) {
+      return -(int64_t)((UINT64_C(1) << width) - val);
+   } else {
+      return val;
+   }
+}
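+
+/* Illustrative: the low 'width' bits of 'val' are read as two's complement,
+ * so util_sign_extend(0x1f, 5) == -1 while util_sign_extend(0x0f, 5) == 15.
+ */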
+
+static inline void*
+util_memcpy_cpu_to_le32(void * restrict dest, const void * restrict src, size_t n)
+{
+#if UTIL_ARCH_BIG_ENDIAN
+   size_t i, e;
+   assert(n % 4 == 0);
+
+   for (i = 0, e = n / 4; i < e; i++) {
+      uint32_t * restrict d = (uint32_t* restrict)dest;
+      const uint32_t * restrict s = (const uint32_t* restrict)src;
+      d[i] = util_bswap32(s[i]);
+   }
+   return dest;
+#else
+   return memcpy(dest, src, n);
+#endif
+}
 
 /**
  * Clamp X to [MIN, MAX].
@@ -825,6 +617,9 @@ util_bswap16(uint16_t n)
  */
 #define CLAMP( X, MIN, MAX )  ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) )
 
+/* Syntax sugar occurring frequently in graphics code */
+#define SATURATE( X ) CLAMP(X, 0.0f, 1.0f)
+
 #define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
 #define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
 
@@ -835,6 +630,56 @@
 #define MAX4( A, B, C, D ) ((A) > (B) ? MAX3(A, C, D) : MAX3(B, C, D))
 
+/**
+ * Align a value up to an alignment value
+ *
+ * If \c value is not already aligned to the requested alignment value, it
+ * will be rounded up.
+ *
+ * \param value      Value to be rounded
+ * \param alignment  Alignment value to be used.  This must be a power of
+ *                   two.
+ *
+ * \sa ROUND_DOWN_TO()
+ */
+
+#if defined(ALIGN)
+#undef ALIGN
+#endif
+static inline uintptr_t
+ALIGN(uintptr_t value, int32_t alignment)
+{
+   assert(util_is_power_of_two_nonzero(alignment));
+   return (((value) + (alignment) - 1) & ~((alignment) - 1));
+}
+
+/**
+ * Like ALIGN(), but works with a non-power-of-two alignment.
+ */
+static inline uintptr_t
+ALIGN_NPOT(uintptr_t value, int32_t alignment)
+{
+   assert(alignment > 0);
+   return (value + alignment - 1) / alignment * alignment;
+}
+
+/**
+ * Align a value down to an alignment value
+ *
+ * If \c value is not already aligned to the requested alignment value, it
+ * will be rounded down.
+ *
+ * \param value      Value to be rounded
+ * \param alignment  Alignment value to be used.  This must be a power of
+ *                   two.
+ *
+ * \sa ALIGN()
+ */
+static inline uint64_t
+ROUND_DOWN_TO(uint64_t value, int32_t alignment)
+{
+   assert(util_is_power_of_two_nonzero(alignment));
+   return ((value) & ~(alignment - 1));
+}
+
 /**
  * Align a value; only works with pot alignments.
  */
@@ -844,6 +689,12 @@ static inline int
 align(int value, int alignment)
 {
    return (value + alignment - 1) & ~(alignment - 1);
 }
 
+static inline uint64_t
+align64(uint64_t value, unsigned alignment)
+{
+   return (value + alignment - 1) & ~((uint64_t)alignment - 1);
+}
+
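+/* Illustrative values for the alignment helpers above:
+ *
+ *    ALIGN(13, 8)         == 16
+ *    ROUND_DOWN_TO(13, 8) == 8
+ *    ALIGN_NPOT(10, 12)   == 12
+ *    align64(3, 4)        == 4
+ */
+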
 /**
  * Works like align but on npot alignments.
  */
@@ -888,12 +739,14 @@ do { \
 #endif
 
-static inline uint32_t util_unsigned_fixed(float value, unsigned frac_bits)
+static inline uint32_t
+util_unsigned_fixed(float value, unsigned frac_bits)
 {
    return value < 0 ? 0 : (uint32_t)(value * (1<<frac_bits));
 }
 
+static inline bool
+util_is_vbo_upload_ratio_too_large(unsigned draw_vertex_count,
+                                   unsigned upload_vertex_count)
+{
+   if (draw_vertex_count > 1024)
+      return upload_vertex_count > draw_vertex_count * 4;
+   else if (draw_vertex_count > 32)
+      return upload_vertex_count > draw_vertex_count * 8;
+   else
+      return upload_vertex_count > draw_vertex_count * 16;
+}
+bool util_invert_mat4x4(float *out, const float *m);
+
+/* Quantize the lod bias value to reduce the number of sampler state
+ * variants in gallium because apps use it for smooth mipmap transitions,
+ * thrashing cso_cache and degrading performance.
+ *
+ * This quantization matches the AMD hw specification, so having more
+ * precision would have no effect anyway.
+ */
+static inline float
+util_quantize_lod_bias(float lod)
+{
+   lod = CLAMP(lod, -16, 16);
+   return roundf(lod * 256) / 256;
+}
 
 #ifdef __cplusplus
 }
diff --git a/src/venus/vkr_common.h b/src/venus/vkr_common.h
index f9c5daa..73816e6 100644
--- a/src/venus/vkr_common.h
+++ b/src/venus/vkr_common.h
@@ -258,7 +258,7 @@ vkr_region_size(const struct vkr_region *region)
 static inline bool
 vkr_region_is_aligned(const struct vkr_region *region, size_t align)
 {
-   assert(align && util_is_power_of_two(align));
+   assert(util_is_power_of_two_nonzero(align));
 
    return !((region->begin | region->end) & (align - 1));
 }
diff --git a/src/venus/vkr_ring.c b/src/venus/vkr_ring.c
index b42ec1f..884a26f 100644
--- a/src/venus/vkr_ring.c
+++ b/src/venus/vkr_ring.c
@@ -66,7 +66,7 @@ vkr_ring_init_buffer(struct vkr_ring *ring, const struct vkr_ring_layout *layout
                                 &buf->base_iov_offset);
 
    buf->size = vkr_region_size(&layout->buffer);
-   assert(buf->size && util_is_power_of_two(buf->size));
+   assert(util_is_power_of_two_nonzero(buf->size));
    buf->mask = buf->size - 1;
    buf->cur = 0;
diff --git a/src/venus/vkr_transport.c b/src/venus/vkr_transport.c
index c0f3e3e..0523a87 100644
--- a/src/venus/vkr_transport.c
+++ b/src/venus/vkr_transport.c
@@ -217,8 +217,7 @@ vkr_ring_layout_init(struct vkr_ring_layout *layout,
    }
 
    const size_t buf_size = vkr_region_size(&layout->buffer);
-   if (!buf_size || buf_size > VKR_RING_BUFFER_MAX_SIZE ||
-       !util_is_power_of_two(buf_size)) {
+   if (buf_size > VKR_RING_BUFFER_MAX_SIZE || !util_is_power_of_two_nonzero(buf_size)) {
       vkr_log("ring buffer size (%lu) must be a power of two and not exceed %lu",
              buf_size, VKR_RING_BUFFER_MAX_SIZE);
      return false;
diff --git a/src/vrend_decode.c b/src/vrend_decode.c
index 25a9204..934c8b0 100644
--- a/src/vrend_decode.c
+++ b/src/vrend_decode.c
@@ -233,13 +233,6 @@ static int vrend_decode_clear_texture(struct vrend_context *ctx, const uint32_t
    return 0;
 }
 
-static float uif(unsigned int ui)
-{
-   union { float f; unsigned int ui; } myuif;
-   myuif.ui = ui;
-   return myuif.f;
-}
-
 static int vrend_decode_set_viewport_state(struct vrend_context *ctx, const uint32_t *buf, uint32_t length)
 {
    struct pipe_viewport_state vps[PIPE_MAX_VIEWPORTS];