uif is now defined. util_is_power_of_two is replaced by other variants. Signed-off-by: Chia-I Wu <olvaffe@gmail.com> Reviewed-by: Yiwei Zhang <zzyiwei@chromium.org> Reviewed-by: Ryan Neph <ryanneph@google.com> Acked-by: Gert Wollny <gert.wollny@collabora.com>macos/master
parent
9526a95d47
commit
10b89464a3
@ -1,458 +0,0 @@ |
||||
/**************************************************************************
|
||||
*
|
||||
* Copyright 2008 Dennis Smit |
||||
* All Rights Reserved. |
||||
* |
||||
* Permission is hereby granted, free of charge, to any person obtaining a |
||||
* copy of this software and associated documentation files (the "Software"), |
||||
* to deal in the Software without restriction, including without limitation |
||||
* on the rights to use, copy, modify, merge, publish, distribute, sub |
||||
* license, and/or sell copies of the Software, and to permit persons to whom |
||||
* the Software is furnished to do so, subject to the following conditions: |
||||
* |
||||
* The above copyright notice and this permission notice (including the next |
||||
* paragraph) shall be included in all copies or substantial portions of the |
||||
* Software. |
||||
* |
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL |
||||
* AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, |
||||
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
||||
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
||||
* USE OR OTHER DEALINGS IN THE SOFTWARE. |
||||
*
|
||||
**************************************************************************/ |
||||
|
||||
/**
|
||||
* @file |
||||
* CPU feature detection. |
||||
* |
||||
* @author Dennis Smit |
||||
* @author Based on the work of Eric Anholt <anholt@FreeBSD.org> |
||||
*/ |
||||
|
||||
#include "pipe/p_config.h" |
||||
|
||||
#include "u_debug.h" |
||||
#include "u_cpu_detect.h" |
||||
|
||||
#if defined(PIPE_ARCH_PPC) |
||||
#if defined(PIPE_OS_APPLE) |
||||
#include <sys/sysctl.h> |
||||
#else |
||||
#include <signal.h> |
||||
#include <setjmp.h> |
||||
#endif |
||||
#endif |
||||
|
||||
#if defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD) |
||||
#include <sys/param.h> |
||||
#include <sys/sysctl.h> |
||||
#include <machine/cpu.h> |
||||
#endif |
||||
|
||||
#if defined(PIPE_OS_FREEBSD) |
||||
#include <sys/types.h> |
||||
#include <sys/sysctl.h> |
||||
#endif |
||||
|
||||
#if defined(PIPE_OS_LINUX) |
||||
#include <signal.h> |
||||
#endif |
||||
|
||||
#ifdef PIPE_OS_UNIX |
||||
#include <unistd.h> |
||||
#endif |
||||
|
||||
#if defined(PIPE_OS_WINDOWS) |
||||
#include <windows.h> |
||||
#if defined(PIPE_CC_MSVC) |
||||
#include <intrin.h> |
||||
#endif |
||||
#endif |
||||
|
||||
|
||||
#ifdef DEBUG |
||||
DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE) |
||||
#endif |
||||
|
||||
|
||||
struct util_cpu_caps util_cpu_caps; |
||||
|
||||
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) |
||||
static int has_cpuid(void); |
||||
#endif |
||||
|
||||
|
||||
#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) |
||||
static jmp_buf __lv_powerpc_jmpbuf; |
||||
static volatile sig_atomic_t __lv_powerpc_canjump = 0; |
||||
|
||||
static void |
||||
sigill_handler(int sig) |
||||
{ |
||||
if (!__lv_powerpc_canjump) { |
||||
signal (sig, SIG_DFL); |
||||
raise (sig); |
||||
} |
||||
|
||||
__lv_powerpc_canjump = 0; |
||||
longjmp(__lv_powerpc_jmpbuf, 1); |
||||
} |
||||
#endif |
||||
|
||||
#if defined(PIPE_ARCH_PPC) |
||||
static void |
||||
check_os_altivec_support(void) |
||||
{ |
||||
#if defined(PIPE_OS_APPLE) |
||||
int sels[2] = {CTL_HW, HW_VECTORUNIT}; |
||||
int has_vu = 0; |
||||
int len = sizeof (has_vu); |
||||
int err; |
||||
|
||||
err = sysctl(sels, 2, &has_vu, &len, NULL, 0); |
||||
|
||||
if (err == 0) { |
||||
if (has_vu != 0) { |
||||
util_cpu_caps.has_altivec = 1; |
||||
} |
||||
} |
||||
#else /* !PIPE_OS_APPLE */ |
||||
/* not on Apple/Darwin, do it the brute-force way */ |
||||
/* this is borrowed from the libmpeg2 library */ |
||||
signal(SIGILL, sigill_handler); |
||||
if (setjmp(__lv_powerpc_jmpbuf)) { |
||||
signal(SIGILL, SIG_DFL); |
||||
} else { |
||||
__lv_powerpc_canjump = 1; |
||||
|
||||
__asm __volatile |
||||
("mtspr 256, %0\n\t" |
||||
"vand %%v0, %%v0, %%v0" |
||||
: |
||||
: "r" (-1)); |
||||
|
||||
signal(SIGILL, SIG_DFL); |
||||
util_cpu_caps.has_altivec = 1; |
||||
} |
||||
#endif /* !PIPE_OS_APPLE */ |
||||
} |
||||
#endif /* PIPE_ARCH_PPC */ |
||||
|
||||
|
||||
#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) |
||||
static int has_cpuid(void) |
||||
{ |
||||
#if defined(PIPE_ARCH_X86) |
||||
#if defined(PIPE_OS_GCC) |
||||
int a, c; |
||||
|
||||
__asm __volatile |
||||
("pushf\n" |
||||
"popl %0\n" |
||||
"movl %0, %1\n" |
||||
"xorl $0x200000, %0\n" |
||||
"push %0\n" |
||||
"popf\n" |
||||
"pushf\n" |
||||
"popl %0\n" |
||||
: "=a" (a), "=c" (c) |
||||
: |
||||
: "cc"); |
||||
|
||||
return a != c; |
||||
#else |
||||
/* FIXME */ |
||||
return 1; |
||||
#endif |
||||
#elif defined(PIPE_ARCH_X86_64) |
||||
return 1; |
||||
#else |
||||
return 0; |
||||
#endif |
||||
} |
||||
|
||||
|
||||
/**
|
||||
* @sa cpuid.h included in gcc-4.3 onwards. |
||||
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
|
||||
*/ |
||||
static inline void |
||||
cpuid(uint32_t ax, uint32_t *p) |
||||
{ |
||||
#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86) |
||||
__asm __volatile ( |
||||
"xchgl %%ebx, %1\n\t" |
||||
"cpuid\n\t" |
||||
"xchgl %%ebx, %1" |
||||
: "=a" (p[0]), |
||||
"=S" (p[1]), |
||||
"=c" (p[2]), |
||||
"=d" (p[3]) |
||||
: "0" (ax) |
||||
); |
||||
#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86_64) |
||||
__asm __volatile ( |
||||
"cpuid\n\t" |
||||
: "=a" (p[0]), |
||||
"=b" (p[1]), |
||||
"=c" (p[2]), |
||||
"=d" (p[3]) |
||||
: "0" (ax) |
||||
); |
||||
#elif defined(PIPE_CC_MSVC) |
||||
__cpuid(p, ax); |
||||
#else |
||||
p[0] = 0; |
||||
p[1] = 0; |
||||
p[2] = 0; |
||||
p[3] = 0; |
||||
#endif |
||||
} |
||||
|
||||
/**
|
||||
* @sa cpuid.h included in gcc-4.4 onwards. |
||||
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
|
||||
*/ |
||||
static inline void |
||||
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p) |
||||
{ |
||||
#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86) |
||||
__asm __volatile ( |
||||
"xchgl %%ebx, %1\n\t" |
||||
"cpuid\n\t" |
||||
"xchgl %%ebx, %1" |
||||
: "=a" (p[0]), |
||||
"=S" (p[1]), |
||||
"=c" (p[2]), |
||||
"=d" (p[3]) |
||||
: "0" (ax), "2" (cx) |
||||
); |
||||
#elif (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) && defined(PIPE_ARCH_X86_64) |
||||
__asm __volatile ( |
||||
"cpuid\n\t" |
||||
: "=a" (p[0]), |
||||
"=b" (p[1]), |
||||
"=c" (p[2]), |
||||
"=d" (p[3]) |
||||
: "0" (ax), "2" (cx) |
||||
); |
||||
#elif defined(PIPE_CC_MSVC) |
||||
__cpuidex(p, ax, cx); |
||||
#else |
||||
p[0] = 0; |
||||
p[1] = 0; |
||||
p[2] = 0; |
||||
p[3] = 0; |
||||
#endif |
||||
} |
||||
|
||||
|
||||
static inline uint64_t xgetbv(void) |
||||
{ |
||||
#if defined(PIPE_CC_GCC) |
||||
uint32_t eax, edx; |
||||
|
||||
__asm __volatile ( |
||||
".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
|
||||
: "=a"(eax), |
||||
"=d"(edx) |
||||
: "c"(0) |
||||
); |
||||
|
||||
return ((uint64_t)edx << 32) | eax; |
||||
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) |
||||
return _xgetbv(_XCR_XFEATURE_ENABLED_MASK); |
||||
#else |
||||
return 0; |
||||
#endif |
||||
} |
||||
|
||||
|
||||
#if defined(PIPE_ARCH_X86) |
||||
static inline boolean sse2_has_daz(void) |
||||
{ |
||||
struct { |
||||
uint32_t pad1[7]; |
||||
uint32_t mxcsr_mask; |
||||
uint32_t pad2[128-8]; |
||||
} PIPE_ALIGN_VAR(16) fxarea; |
||||
|
||||
fxarea.mxcsr_mask = 0; |
||||
#if (defined(PIPE_CC_GCC) || defined(PIPE_CC_SUNPRO)) |
||||
__asm __volatile ("fxsave %0" : "+m" (fxarea)); |
||||
#elif (defined(PIPE_CC_MSVC) && _MSC_VER >= 1700) || defined(PIPE_CC_ICL) |
||||
/* 1700 = Visual Studio 2012 */ |
||||
_fxsave(&fxarea); |
||||
#else |
||||
fxarea.mxcsr_mask = 0; |
||||
#endif |
||||
return !!(fxarea.mxcsr_mask & (1 << 6)); |
||||
} |
||||
#endif |
||||
|
||||
#endif /* X86 or X86_64 */ |
||||
|
||||
void |
||||
util_cpu_detect(void) |
||||
{ |
||||
static boolean util_cpu_detect_initialized = FALSE; |
||||
|
||||
if(util_cpu_detect_initialized) |
||||
return; |
||||
|
||||
memset(&util_cpu_caps, 0, sizeof util_cpu_caps); |
||||
|
||||
/* Count the number of CPUs in system */ |
||||
#if defined(PIPE_OS_WINDOWS) |
||||
{ |
||||
SYSTEM_INFO system_info; |
||||
GetSystemInfo(&system_info); |
||||
util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors; |
||||
} |
||||
#elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN) |
||||
util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); |
||||
if (util_cpu_caps.nr_cpus == -1) |
||||
util_cpu_caps.nr_cpus = 1; |
||||
#elif defined(PIPE_OS_BSD) |
||||
{ |
||||
int mib[2], ncpu; |
||||
int len; |
||||
|
||||
mib[0] = CTL_HW; |
||||
mib[1] = HW_NCPU; |
||||
|
||||
len = sizeof (ncpu); |
||||
sysctl(mib, 2, &ncpu, &len, NULL, 0); |
||||
util_cpu_caps.nr_cpus = ncpu; |
||||
} |
||||
#else |
||||
util_cpu_caps.nr_cpus = 1; |
||||
#endif |
||||
|
||||
/* Make the fallback cacheline size nonzero so that it can be
|
||||
* safely passed to align(). |
||||
*/ |
||||
util_cpu_caps.cacheline = sizeof(void *); |
||||
|
||||
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) |
||||
if (has_cpuid()) { |
||||
uint32_t regs[4]; |
||||
uint32_t regs2[4]; |
||||
|
||||
util_cpu_caps.cacheline = 32; |
||||
|
||||
/* Get max cpuid level */ |
||||
cpuid(0x00000000, regs); |
||||
|
||||
if (regs[0] >= 0x00000001) { |
||||
unsigned int cacheline; |
||||
|
||||
cpuid (0x00000001, regs2); |
||||
|
||||
util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf; |
||||
if (util_cpu_caps.x86_cpu_type == 0xf) |
||||
util_cpu_caps.x86_cpu_type = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */ |
||||
|
||||
/* general feature flags */ |
||||
util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */ |
||||
util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */ |
||||
util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */ |
||||
util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */ |
||||
util_cpu_caps.has_sse3 = (regs2[2] >> 0) & 1; /* 0x0000001 */ |
||||
util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000020 */ |
||||
util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1; |
||||
util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1; |
||||
util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1; |
||||
util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX
|
||||
((regs2[2] >> 27) & 1) && // OSXSAVE
|
||||
((xgetbv() & 6) == 6); // XMM & YMM
|
||||
util_cpu_caps.has_f16c = (regs2[2] >> 29) & 1; |
||||
util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */ |
||||
#if defined(PIPE_ARCH_X86_64) |
||||
util_cpu_caps.has_daz = 1; |
||||
#else |
||||
util_cpu_caps.has_daz = util_cpu_caps.has_sse3 || |
||||
(util_cpu_caps.has_sse2 && sse2_has_daz()); |
||||
#endif |
||||
|
||||
cacheline = ((regs2[1] >> 8) & 0xFF) * 8; |
||||
if (cacheline > 0) |
||||
util_cpu_caps.cacheline = cacheline; |
||||
} |
||||
if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) { |
||||
uint32_t regs7[4]; |
||||
cpuid_count(0x00000007, 0x00000000, regs7); |
||||
util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1; |
||||
} |
||||
|
||||
if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) { |
||||
/* GenuineIntel */ |
||||
util_cpu_caps.has_intel = 1; |
||||
} |
||||
|
||||
cpuid(0x80000000, regs); |
||||
|
||||
if (regs[0] >= 0x80000001) { |
||||
|
||||
cpuid(0x80000001, regs2); |
||||
|
||||
util_cpu_caps.has_mmx |= (regs2[3] >> 23) & 1; |
||||
util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1; |
||||
util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1; |
||||
util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1; |
||||
|
||||
util_cpu_caps.has_xop = util_cpu_caps.has_avx && |
||||
((regs2[2] >> 11) & 1); |
||||
} |
||||
|
||||
if (regs[0] >= 0x80000006) { |
||||
cpuid(0x80000006, regs2); |
||||
util_cpu_caps.cacheline = regs2[2] & 0xFF; |
||||
} |
||||
|
||||
if (!util_cpu_caps.has_sse) { |
||||
util_cpu_caps.has_sse2 = 0; |
||||
util_cpu_caps.has_sse3 = 0; |
||||
util_cpu_caps.has_ssse3 = 0; |
||||
util_cpu_caps.has_sse4_1 = 0; |
||||
} |
||||
} |
||||
#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ |
||||
|
||||
#if defined(PIPE_ARCH_PPC) |
||||
check_os_altivec_support(); |
||||
#endif /* PIPE_ARCH_PPC */ |
||||
|
||||
#ifdef DEBUG |
||||
if (debug_get_option_dump_cpu()) { |
||||
debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus); |
||||
|
||||
debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type); |
||||
debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline); |
||||
|
||||
debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc); |
||||
debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx); |
||||
debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2); |
||||
debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse); |
||||
debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2); |
||||
debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3); |
||||
debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3); |
||||
debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1); |
||||
debug_printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2); |
||||
debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx); |
||||
debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2); |
||||
debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c); |
||||
debug_printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt); |
||||
debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow); |
||||
debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext); |
||||
debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop); |
||||
debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec); |
||||
debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz); |
||||
} |
||||
#endif |
||||
|
||||
util_cpu_detect_initialized = TRUE; |
||||
} |
@ -1,139 +0,0 @@ |
||||
/**************************************************************************
|
||||
*
|
||||
* Copyright 2008 VMware, Inc. |
||||
* All Rights Reserved. |
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a |
||||
* copy of this software and associated documentation files (the |
||||
* "Software"), to deal in the Software without restriction, including |
||||
* without limitation the rights to use, copy, modify, merge, publish, |
||||
* distribute, sub license, and/or sell copies of the Software, and to |
||||
* permit persons to whom the Software is furnished to do so, subject to |
||||
* the following conditions: |
||||
*
|
||||
* The above copyright notice and this permission notice (including the |
||||
* next paragraph) shall be included in all copies or substantial portions |
||||
* of the Software. |
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. |
||||
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR |
||||
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
||||
*
|
||||
**************************************************************************/ |
||||
|
||||
|
||||
|
||||
#include "pipe/p_config.h" |
||||
#include "util/u_math.h" |
||||
#include "util/u_cpu_detect.h" |
||||
|
||||
#if defined(PIPE_ARCH_SSE) |
||||
#include <xmmintrin.h> |
||||
/* This is defined in pmmintrin.h, but it can only be included when -msse3 is
|
||||
* used, so just define it here to avoid further. */ |
||||
#ifndef _MM_DENORMALS_ZERO_MASK |
||||
#define _MM_DENORMALS_ZERO_MASK 0x0040 |
||||
#endif |
||||
#endif |
||||
|
||||
#if 0 |
||||
/** 2^x, for x in [-1.0, 1.0) */ |
||||
float pow2_table[POW2_TABLE_SIZE]; |
||||
|
||||
|
||||
static void |
||||
init_pow2_table(void) |
||||
{ |
||||
int i; |
||||
for (i = 0; i < POW2_TABLE_SIZE; i++) |
||||
pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE); |
||||
} |
||||
|
||||
|
||||
/** log2(x), for x in [1.0, 2.0) */ |
||||
float log2_table[LOG2_TABLE_SIZE]; |
||||
|
||||
|
||||
static void
|
||||
init_log2_table(void) |
||||
{ |
||||
unsigned i; |
||||
for (i = 0; i < LOG2_TABLE_SIZE; i++) |
||||
log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE)); |
||||
} |
||||
#endif |
||||
|
||||
/**
|
||||
* One time init for math utilities. |
||||
*/ |
||||
void |
||||
util_init_math(void) |
||||
{ |
||||
static boolean initialized = FALSE; |
||||
if (!initialized) { |
||||
// init_pow2_table();
|
||||
/* init_log2_table();*/ |
||||
initialized = TRUE; |
||||
} |
||||
} |
||||
|
||||
/**
|
||||
* Fetches the contents of the fpstate (mxcsr on x86) register. |
||||
* |
||||
* On platforms without support for it just returns 0. |
||||
*/ |
||||
unsigned |
||||
util_fpstate_get(void) |
||||
{ |
||||
unsigned mxcsr = 0; |
||||
|
||||
#if defined(PIPE_ARCH_SSE) |
||||
if (util_cpu_caps.has_sse) { |
||||
mxcsr = _mm_getcsr(); |
||||
} |
||||
#endif |
||||
|
||||
return mxcsr; |
||||
} |
||||
|
||||
/**
|
||||
* Make sure that the fp treats the denormalized floating |
||||
* point numbers as zero. |
||||
* |
||||
* This is the behavior required by D3D10. OpenGL doesn't care. |
||||
*/ |
||||
unsigned |
||||
util_fpstate_set_denorms_to_zero(unsigned current_mxcsr) |
||||
{ |
||||
#if defined(PIPE_ARCH_SSE) |
||||
if (util_cpu_caps.has_sse) { |
||||
/* Enable flush to zero mode */ |
||||
current_mxcsr |= _MM_FLUSH_ZERO_MASK; |
||||
if (util_cpu_caps.has_daz) { |
||||
/* Enable denormals are zero mode */ |
||||
current_mxcsr |= _MM_DENORMALS_ZERO_MASK; |
||||
} |
||||
util_fpstate_set(current_mxcsr); |
||||
} |
||||
#endif |
||||
return current_mxcsr; |
||||
} |
||||
|
||||
/**
|
||||
* Set the state of the fpstate (mxcsr on x86) register. |
||||
* |
||||
* On platforms without support for it's a noop. |
||||
*/ |
||||
void |
||||
util_fpstate_set(unsigned mxcsr) |
||||
{ |
||||
#if defined(PIPE_ARCH_SSE) |
||||
if (util_cpu_caps.has_sse) { |
||||
_mm_setcsr(mxcsr); |
||||
} |
||||
#endif |
||||
} |
@ -0,0 +1,80 @@ |
||||
/**************************************************************************
|
||||
* |
||||
* Copyright 2008 VMware, Inc. |
||||
* All Rights Reserved. |
||||
* |
||||
* Permission is hereby granted, free of charge, to any person obtaining a |
||||
* copy of this software and associated documentation files (the |
||||
* "Software"), to deal in the Software without restriction, including |
||||
* without limitation the rights to use, copy, modify, merge, publish, |
||||
* distribute, sub license, and/or sell copies of the Software, and to |
||||
* permit persons to whom the Software is furnished to do so, subject to |
||||
* the following conditions: |
||||
* |
||||
* The above copyright notice and this permission notice (including the |
||||
* next paragraph) shall be included in all copies or substantial portions |
||||
* of the Software. |
||||
* |
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. |
||||
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR |
||||
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
||||
* |
||||
**************************************************************************/ |
||||
|
||||
|
||||
#include "bitscan.h" |
||||
|
||||
#ifdef HAVE___BUILTIN_FFS |
||||
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64) |
||||
#else |
||||
int |
||||
ffs(int i) |
||||
{ |
||||
int bit = 0; |
||||
if (!i) |
||||
return bit; |
||||
if (!(i & 0xffff)) { |
||||
bit += 16; |
||||
i >>= 16; |
||||
} |
||||
if (!(i & 0xff)) { |
||||
bit += 8; |
||||
i >>= 8; |
||||
} |
||||
if (!(i & 0xf)) { |
||||
bit += 4; |
||||
i >>= 4; |
||||
} |
||||
if (!(i & 0x3)) { |
||||
bit += 2; |
||||
i >>= 2; |
||||
} |
||||
if (!(i & 0x1)) |
||||
bit += 1; |
||||
return bit + 1; |
||||
} |
||||
#endif |
||||
|
||||
#ifdef HAVE___BUILTIN_FFSLL |
||||
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64) |
||||
#else |
||||
int |
||||
ffsll(long long int val) |
||||
{ |
||||
int bit; |
||||
|
||||
bit = ffs((unsigned) (val & 0xffffffff)); |
||||
if (bit != 0) |
||||
return bit; |
||||
|
||||
bit = ffs((unsigned) (val >> 32)); |
||||
if (bit != 0) |
||||
return 32 + bit; |
||||
|
||||
return 0; |
||||
} |
||||
#endif |
@ -0,0 +1,356 @@ |
||||
/**************************************************************************
|
||||
* |
||||
* Copyright 2008 VMware, Inc. |
||||
* All Rights Reserved. |
||||
* |
||||
* Permission is hereby granted, free of charge, to any person obtaining a |
||||
* copy of this software and associated documentation files (the |
||||
* "Software"), to deal in the Software without restriction, including |
||||
* without limitation the rights to use, copy, modify, merge, publish, |
||||
* distribute, sub license, and/or sell copies of the Software, and to |
||||
* permit persons to whom the Software is furnished to do so, subject to |
||||
* the following conditions: |
||||
* |
||||
* The above copyright notice and this permission notice (including the |
||||
* next paragraph) shall be included in all copies or substantial portions |
||||
* of the Software. |
||||
* |
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. |
||||
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR |
||||
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
||||
* |
||||
**************************************************************************/ |
||||
|
||||
|
||||
#ifndef BITSCAN_H |
||||
#define BITSCAN_H |
||||
|
||||
#include <assert.h> |
||||
#include <stdint.h> |
||||
#include <stdbool.h> |
||||
#include <string.h> |
||||
|
||||
#if defined(_MSC_VER) |
||||
#include <intrin.h> |
||||
#endif |
||||
|
||||
#if defined(__POPCNT__) |
||||
#include <popcntintrin.h> |
||||
#endif |
||||
|
||||
#include "c99_compat.h" |
||||
|
||||
#ifdef __cplusplus |
||||
extern "C" { |
||||
#endif |
||||
|
||||
|
||||
/**
|
||||
* Find first bit set in word. Least significant bit is 1. |
||||
* Return 0 if no bits set. |
||||
*/ |
||||
#ifdef HAVE___BUILTIN_FFS |
||||
#define ffs __builtin_ffs |
||||
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64) |
||||
static inline |
||||
int ffs(int i) |
||||
{ |
||||
unsigned long index; |
||||
if (_BitScanForward(&index, i)) |
||||
return index + 1; |
||||
else |
||||
return 0; |
||||
} |
||||
#else |
||||
extern |
||||
int ffs(int i); |
||||
#endif |
||||
|
||||
#ifdef HAVE___BUILTIN_FFSLL |
||||
#define ffsll __builtin_ffsll |
||||
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64) |
||||
static inline int |
||||
ffsll(long long int i) |
||||
{ |
||||
unsigned long index; |
||||
if (_BitScanForward64(&index, i)) |
||||
return index + 1; |
||||
else |
||||
return 0; |
||||
} |
||||
#else |
||||
extern int |
||||
ffsll(long long int val); |
||||
#endif |
||||
|
||||
|
||||
/* Destructively loop over all of the bits in a mask as in:
|
||||
* |
||||
* while (mymask) { |
||||
* int i = u_bit_scan(&mymask); |
||||
* ... process element i |
||||
* } |
||||
* |
||||
*/ |
||||
static inline int |
||||
u_bit_scan(unsigned *mask) |
||||
{ |
||||
const int i = ffs(*mask) - 1; |
||||
*mask ^= (1u << i); |
||||
return i; |
||||
} |
||||
|
||||
#define u_foreach_bit(b, dword) \ |
||||
for (uint32_t __dword = (dword), b; \
|
||||
((b) = ffs(__dword) - 1, __dword); \
|
||||
__dword &= ~(1 << (b))) |
||||
|
||||
static inline int |
||||
u_bit_scan64(uint64_t *mask) |
||||
{ |
||||
const int i = ffsll(*mask) - 1; |
||||
*mask ^= (((uint64_t)1) << i); |
||||
return i; |
||||
} |
||||
|
||||
#define u_foreach_bit64(b, dword) \ |
||||
for (uint64_t __dword = (dword), b; \
|
||||
((b) = ffsll(__dword) - 1, __dword); \
|
||||
__dword &= ~(1ull << (b))) |
||||
|
||||
/* Determine if an unsigned value is a power of two.
|
||||
* |
||||
* \note |
||||
* Zero is treated as a power of two. |
||||
*/ |
||||
static inline bool |
||||
util_is_power_of_two_or_zero(unsigned v) |
||||
{ |
||||
return (v & (v - 1)) == 0; |
||||
} |
||||
|
||||
/* Determine if an uint64_t value is a power of two.
|
||||
* |
||||
* \note |
||||
* Zero is treated as a power of two. |
||||
*/ |
||||
static inline bool |
||||
util_is_power_of_two_or_zero64(uint64_t v) |
||||
{ |
||||
return (v & (v - 1)) == 0; |
||||
} |
||||
|
||||
/* Determine if an unsigned value is a power of two.
|
||||
* |
||||
* \note |
||||
* Zero is \b not treated as a power of two. |
||||
*/ |
||||
static inline bool |
||||
util_is_power_of_two_nonzero(unsigned v) |
||||
{ |
||||
/* __POPCNT__ is different from HAVE___BUILTIN_POPCOUNT. The latter
|
||||
* indicates the existence of the __builtin_popcount function. The former |
||||
* indicates that _mm_popcnt_u32 exists and is a native instruction. |
||||
* |
||||
* The other alternative is to use SSE 4.2 compile-time flags. This has |
||||
* two drawbacks. First, there is currently no build infrastructure for |
||||
* SSE 4.2 (only 4.1), so that would have to be added. Second, some AMD |
||||
* CPUs support POPCNT but not SSE 4.2 (e.g., Barcelona). |
||||
*/ |
||||
#ifdef __POPCNT__ |
||||
return _mm_popcnt_u32(v) == 1; |
||||
#else |
||||
return v != 0 && (v & (v - 1)) == 0; |
||||
#endif |
||||
} |
||||
|
||||
/* For looping over a bitmask when you want to loop over consecutive bits
|
||||
* manually, for example: |
||||
* |
||||
* while (mask) { |
||||
* int start, count, i; |
||||
* |
||||
* u_bit_scan_consecutive_range(&mask, &start, &count); |
||||
* |
||||
* for (i = 0; i < count; i++) |
||||
* ... process element (start+i) |
||||
* } |
||||
*/ |
||||
static inline void |
||||
u_bit_scan_consecutive_range(unsigned *mask, int *start, int *count) |
||||
{ |
||||
if (*mask == 0xffffffff) { |
||||
*start = 0; |
||||
*count = 32; |
||||
*mask = 0; |
||||
return; |
||||
} |
||||
*start = ffs(*mask) - 1; |
||||
*count = ffs(~(*mask >> *start)) - 1; |
||||
*mask &= ~(((1u << *count) - 1) << *start); |
||||
} |
||||
|
||||
static inline void |
||||
u_bit_scan_consecutive_range64(uint64_t *mask, int *start, int *count) |
||||
{ |
||||
if (*mask == ~0ull) { |
||||
*start = 0; |
||||
*count = 64; |
||||
*mask = 0; |
||||
return; |
||||
} |
||||
*start = ffsll(*mask) - 1; |
||||
*count = ffsll(~(*mask >> *start)) - 1; |
||||
*mask &= ~(((((uint64_t)1) << *count) - 1) << *start); |
||||
} |
||||
|
||||
|
||||
/**
|
||||
* Find last bit set in a word. The least significant bit is 1. |
||||
* Return 0 if no bits are set. |
||||
* Essentially ffs() in the reverse direction. |
||||
*/ |
||||
static inline unsigned |
||||
util_last_bit(unsigned u) |
||||
{ |
||||
#if defined(HAVE___BUILTIN_CLZ) |
||||
return u == 0 ? 0 : 32 - __builtin_clz(u); |
||||
#elif defined(_MSC_VER) && (_M_IX86 || _M_ARM || _M_AMD64 || _M_IA64) |
||||
unsigned long index; |
||||
if (_BitScanReverse(&index, u)) |
||||
return index + 1; |
||||
else |
||||
return 0; |
||||
#else |
||||
unsigned r = 0; |
||||
while (u) { |
||||
r++; |
||||
u >>= 1; |
||||
} |
||||
return r; |
||||
#endif |
||||
} |
||||
|
||||
/**
|
||||
* Find last bit set in a word. The least significant bit is 1. |
||||
* Return 0 if no bits are set. |
||||
* Essentially ffsll() in the reverse direction. |
||||
*/ |
||||
static inline unsigned |
||||
util_last_bit64(uint64_t u) |
||||
{ |
||||
#if defined(HAVE___BUILTIN_CLZLL) |
||||
return u == 0 ? 0 : 64 - __builtin_clzll(u); |
||||
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM64 || _M_IA64) |
||||
unsigned long index; |
||||
if (_BitScanReverse64(&index, u)) |
||||
return index + 1; |
||||
else |
||||
return 0; |
||||
#else |
||||
unsigned r = 0; |
||||
while (u) { |
||||
r++; |
||||
u >>= 1; |
||||
} |
||||
return r; |
||||
#endif |
||||
} |
||||
|
||||
/**
|
||||
* Find last bit in a word that does not match the sign bit. The least |
||||
* significant bit is 1. |
||||
* Return 0 if no bits are set. |
||||
*/ |
||||
static inline unsigned |
||||
util_last_bit_signed(int i) |
||||
{ |
||||
if (i >= 0) |
||||
return util_last_bit(i); |
||||
else |
||||
return util_last_bit(~(unsigned)i); |
||||
} |
||||
|
||||
/* Returns a bitfield in which the first count bits starting at start are
|
||||
* set. |
||||
*/ |
||||
static inline unsigned |
||||
u_bit_consecutive(unsigned start, unsigned count) |
||||
{ |
||||
assert(start + count <= 32); |
||||
if (count == 32) |
||||
return ~0; |
||||
return ((1u << count) - 1) << start; |
||||
} |
||||
|
||||
static inline uint64_t |
||||
u_bit_consecutive64(unsigned start, unsigned count) |
||||
{ |
||||
assert(start + count <= 64); |
||||
if (count == 64) |
||||
return ~(uint64_t)0; |
||||
return (((uint64_t)1 << count) - 1) << start; |
||||
} |
||||
|
||||
/**
|
||||
* Return number of bits set in n. |
||||
*/ |
||||
static inline unsigned |
||||
util_bitcount(unsigned n) |
||||
{ |
||||
#if defined(HAVE___BUILTIN_POPCOUNT) |
||||
return __builtin_popcount(n); |
||||
#else |
||||
/* K&R classic bitcount.
|
||||
* |
||||
* For each iteration, clear the LSB from the bitfield. |
||||
* Requires only one iteration per set bit, instead of |
||||
* one iteration per bit less than highest set bit. |
||||
*/ |
||||
unsigned bits; |
||||
for (bits = 0; n; bits++) { |
||||
n &= n - 1; |
||||
} |
||||
return bits; |
||||
#endif |
||||
} |
||||
|
||||
/**
|
||||
* Return the number of bits set in n using the native popcnt instruction. |
||||
* The caller is responsible for ensuring that popcnt is supported by the CPU. |
||||
* |
||||
* gcc doesn't use it if -mpopcnt or -march= that has popcnt is missing. |
||||
* |
||||
*/ |
||||
static inline unsigned |
||||
util_popcnt_inline_asm(unsigned n) |
||||
{ |
||||
#if defined(USE_X86_64_ASM) || defined(USE_X86_ASM) |
||||
uint32_t out; |
||||
__asm volatile("popcnt %1, %0" : "=r"(out) : "r"(n)); |
||||
return out; |
||||
#else |
||||
/* We should never get here by accident, but I'm sure it'll happen. */ |
||||
return util_bitcount(n); |
||||
#endif |
||||
} |
||||
|
||||
static inline unsigned |
||||
util_bitcount64(uint64_t n) |
||||
{ |
||||
#ifdef HAVE___BUILTIN_POPCOUNTLL |
||||
return __builtin_popcountll(n); |
||||
#else |
||||
return util_bitcount(n) + util_bitcount(n >> 32); |
||||
#endif |
||||
} |
||||
|
||||
#ifdef __cplusplus |
||||
} |
||||
#endif |
||||
|
||||
#endif /* BITSCAN_H */ |
@ -0,0 +1,865 @@ |
||||
/**************************************************************************
|
||||
*
|
||||
* Copyright 2008 Dennis Smit |
||||
* All Rights Reserved. |
||||
* |
||||
* Permission is hereby granted, free of charge, to any person obtaining a |
||||
* copy of this software and associated documentation files (the "Software"), |
||||
* to deal in the Software without restriction, including without limitation |
||||
* on the rights to use, copy, modify, merge, publish, distribute, sub |
||||
* license, and/or sell copies of the Software, and to permit persons to whom |
||||
* the Software is furnished to do so, subject to the following conditions: |
||||
* |
||||
* The above copyright notice and this permission notice (including the next |
||||
* paragraph) shall be included in all copies or substantial portions of the |
||||
* Software. |
||||
* |
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL |
||||
* AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, |
||||
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
||||
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
||||
* USE OR OTHER DEALINGS IN THE SOFTWARE. |
||||
*
|
||||
**************************************************************************/ |
||||
|
||||
/**
|
||||
* @file |
||||
* CPU feature detection. |
||||
* |
||||
* @author Dennis Smit |
||||
* @author Based on the work of Eric Anholt <anholt@FreeBSD.org> |
||||
*/ |
||||
|
||||
#include "pipe/p_config.h" |
||||
#include "pipe/p_compiler.h" |
||||
|
||||
#include "util/u_debug.h" |
||||
#include "u_cpu_detect.h" |
||||
#include "u_math.h" |
||||
#include "c11/threads.h" |
||||
|
||||
#include <stdio.h> |
||||
#include <inttypes.h> |
||||
|
||||
#if defined(PIPE_ARCH_PPC) |
||||
#if defined(PIPE_OS_APPLE) |
||||
#include <sys/sysctl.h> |
||||
#else |
||||
#include <signal.h> |
||||
#include <setjmp.h> |
||||
#endif |
||||
#endif |
||||
|
||||
#if defined(PIPE_OS_BSD) |
||||
#include <sys/param.h> |
||||
#include <sys/sysctl.h> |
||||
#include <machine/cpu.h> |
||||
#endif |
||||
|
||||
#if defined(PIPE_OS_FREEBSD) |
||||
#if __has_include(<sys/auxv.h>) |
||||
#include <sys/auxv.h> |
||||
#define HAVE_ELF_AUX_INFO |
||||
#endif |
||||
#endif |
||||
|
||||
#if defined(PIPE_OS_LINUX) |
||||
#include <signal.h> |
||||
#include <fcntl.h> |
||||
#include <elf.h> |
||||
#endif |
||||
|
||||
#ifdef PIPE_OS_UNIX |
||||
#include <unistd.h> |
||||
#endif |
||||
|
||||
#if defined(HAS_ANDROID_CPUFEATURES) |
||||
#include <cpu-features.h> |
||||
#endif |
||||
|
||||
#if defined(PIPE_OS_WINDOWS) |
||||
#include <windows.h> |
||||
#if defined(PIPE_CC_MSVC) |
||||
#include <intrin.h> |
||||
#endif |
||||
#endif |
||||
|
||||
#if defined(HAS_SCHED_H) |
||||
#include <sched.h> |
||||
#endif |
||||
|
||||
DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false) |
||||
|
||||
|
||||
struct util_cpu_caps_t util_cpu_caps; |
||||
|
||||
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) |
||||
static int has_cpuid(void); |
||||
#endif |
||||
|
||||
|
||||
#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) && !defined(PIPE_OS_BSD) && !defined(PIPE_OS_LINUX) |
||||
static jmp_buf __lv_powerpc_jmpbuf; |
||||
static volatile sig_atomic_t __lv_powerpc_canjump = 0; |
||||
|
||||
static void |
||||
sigill_handler(int sig) |
||||
{ |
||||
if (!__lv_powerpc_canjump) { |
||||
signal (sig, SIG_DFL); |
||||
raise (sig); |
||||
} |
||||
|
||||
__lv_powerpc_canjump = 0; |
||||
longjmp(__lv_powerpc_jmpbuf, 1); |
||||
} |
||||
#endif |
||||
|
||||
#if defined(PIPE_ARCH_PPC) |
||||
static void |
||||
check_os_altivec_support(void) |
||||
{ |
||||
#if defined(__ALTIVEC__) |
||||
util_cpu_caps.has_altivec = 1; |
||||
#endif |
||||
#if defined(__VSX__) |
||||
util_cpu_caps.has_vsx = 1; |
||||
#endif |
||||
#if defined(__ALTIVEC__) && defined(__VSX__) |
||||
/* Do nothing */ |
||||
#elif defined(PIPE_OS_APPLE) || defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD) |
||||
#ifdef HW_VECTORUNIT |
||||
int sels[2] = {CTL_HW, HW_VECTORUNIT}; |
||||
#else |
||||
int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC}; |
||||
#endif |
||||
int has_vu = 0; |
||||
int len = sizeof (has_vu); |
||||
int err; |
||||
|
||||
err = sysctl(sels, 2, &has_vu, &len, NULL, 0); |
||||
|
||||
if (err == 0) { |
||||
if (has_vu != 0) { |
||||
util_cpu_caps.has_altivec = 1; |
||||
} |
||||
} |
||||
#elif defined(PIPE_OS_FREEBSD) /* !PIPE_OS_APPLE && !PIPE_OS_NETBSD && !PIPE_OS_OPENBSD */ |
||||
unsigned long hwcap = 0; |
||||
#ifdef HAVE_ELF_AUX_INFO |
||||
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); |
||||
#else |
||||
size_t len = sizeof(hwcap); |
||||
sysctlbyname("hw.cpu_features", &hwcap, &len, NULL, 0); |
||||
#endif |
||||
if (hwcap & PPC_FEATURE_HAS_ALTIVEC) |
||||
util_cpu_caps.has_altivec = 1; |
||||
if (hwcap & PPC_FEATURE_HAS_VSX) |
||||
util_cpu_caps.has_vsx = 1; |
||||
#elif defined(PIPE_OS_LINUX) /* !PIPE_OS_FREEBSD */ |
||||
#if defined(PIPE_ARCH_PPC_64) |
||||
Elf64_auxv_t aux; |
||||
#else |
||||
Elf32_auxv_t aux; |
||||
#endif |
||||
int fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); |
||||
if (fd >= 0) { |
||||
while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) { |
||||
if (aux.a_type == AT_HWCAP) { |
||||
char *env_vsx = getenv("GALLIVM_VSX"); |
||||
uint64_t hwcap = aux.a_un.a_val; |
||||
util_cpu_caps.has_altivec = (hwcap >> 28) & 1; |
||||
if (!env_vsx || env_vsx[0] != '0') { |
||||
util_cpu_caps.has_vsx = (hwcap >> 7) & 1; |
||||
} |
||||
break; |
||||
} |
||||
} |
||||
close(fd); |
||||
} |
||||
#else /* !PIPE_OS_APPLE && !PIPE_OS_BSD && !PIPE_OS_LINUX */ |
||||
/* not on Apple/Darwin or Linux, do it the brute-force way */ |
||||
/* this is borrowed from the libmpeg2 library */ |
||||
signal(SIGILL, sigill_handler); |
||||
if (setjmp(__lv_powerpc_jmpbuf)) { |
||||
signal(SIGILL, SIG_DFL); |
||||
} else { |
||||
boolean enable_altivec = TRUE; /* Default: enable if available, and if not overridden */ |
||||
boolean enable_vsx = TRUE; |
||||
#ifdef DEBUG |
||||
/* Disabling Altivec code generation is not the same as disabling VSX code generation,
|
||||
* which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf. |
||||
* lp_build_create_jit_compiler_for_module(). |
||||
* If you want to disable Altivec code generation, the best place to do it is here. |
||||
*/ |
||||
char *env_control = getenv("GALLIVM_ALTIVEC"); /* 1=enable (default); 0=disable */ |
||||
if (env_control && env_control[0] == '0') { |
||||
enable_altivec = FALSE; |
||||
} |
||||
#endif |
||||
/* VSX instructions can be explicitly enabled/disabled via GALLIVM_VSX=1 or 0 */ |
||||
char *env_vsx = getenv("GALLIVM_VSX"); |
||||
if (env_vsx && env_vsx[0] == '0') { |
||||
enable_vsx = FALSE; |
||||
} |
||||
if (enable_altivec) { |
||||
__lv_powerpc_canjump = 1; |
||||
|
||||
__asm __volatile |
||||
("mtspr 256, %0\n\t" |
||||
"vand %%v0, %%v0, %%v0" |
||||
: |
||||
: "r" (-1)); |
||||
|
||||
util_cpu_caps.has_altivec = 1; |
||||
|
||||
if (enable_vsx) { |
||||
__asm __volatile("xxland %vs0, %vs0, %vs0"); |
||||
util_cpu_caps.has_vsx = 1; |
||||
} |
||||
signal(SIGILL, SIG_DFL); |
||||
} else { |
||||
util_cpu_caps.has_altivec = 0; |
||||
} |
||||
} |
||||
#endif /* !PIPE_OS_APPLE && !PIPE_OS_LINUX */ |
||||
} |
||||
#endif /* PIPE_ARCH_PPC */ |
||||
|
||||
|
||||
#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) |
||||
static int has_cpuid(void) |
||||
{ |
||||
#if defined(PIPE_ARCH_X86) |
||||
#if defined(PIPE_OS_GCC) |
||||
int a, c; |
||||
|
||||
__asm __volatile |
||||
("pushf\n" |
||||
"popl %0\n" |
||||
"movl %0, %1\n" |
||||
"xorl $0x200000, %0\n" |
||||
"push %0\n" |
||||
"popf\n" |
||||
"pushf\n" |
||||
"popl %0\n" |
||||
: "=a" (a), "=c" (c) |
||||
: |
||||
: "cc"); |
||||
|
||||
return a != c; |
||||
#else |
||||
/* FIXME */ |
||||
return 1; |
||||
#endif |
||||
#elif defined(PIPE_ARCH_X86_64) |
||||
return 1; |
||||
#else |
||||
return 0; |
||||
#endif |
||||
} |
||||
|
||||
|
||||
/**
|
||||
* @sa cpuid.h included in gcc-4.3 onwards. |
||||
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
|
||||
*/ |
||||
static inline void |
||||
cpuid(uint32_t ax, uint32_t *p) |
||||
{ |
||||
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) |
||||
__asm __volatile ( |
||||
"xchgl %%ebx, %1\n\t" |
||||
"cpuid\n\t" |
||||
"xchgl %%ebx, %1" |
||||
: "=a" (p[0]), |
||||
"=S" (p[1]), |
||||
"=c" (p[2]), |
||||
"=d" (p[3]) |
||||
: "0" (ax) |
||||
); |
||||
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64) |
||||
__asm __volatile ( |
||||
"cpuid\n\t" |
||||
: "=a" (p[0]), |
||||
"=b" (p[1]), |
||||
"=c" (p[2]), |
||||
"=d" (p[3]) |
||||
: "0" (ax) |
||||
); |
||||
#elif defined(PIPE_CC_MSVC) |
||||
__cpuid(p, ax); |
||||
#else |
||||
p[0] = 0; |
||||
p[1] = 0; |
||||
p[2] = 0; |
||||
p[3] = 0; |
||||
#endif |
||||
} |
||||
|
||||
/**
|
||||
* @sa cpuid.h included in gcc-4.4 onwards. |
||||
* @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
|
||||
*/ |
||||
static inline void |
||||
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p) |
||||
{ |
||||
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) |
||||
__asm __volatile ( |
||||
"xchgl %%ebx, %1\n\t" |
||||
"cpuid\n\t" |
||||
"xchgl %%ebx, %1" |
||||
: "=a" (p[0]), |
||||
"=S" (p[1]), |
||||
"=c" (p[2]), |
||||
"=d" (p[3]) |
||||
: "0" (ax), "2" (cx) |
||||
); |
||||
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64) |
||||
__asm __volatile ( |
||||
"cpuid\n\t" |
||||
: "=a" (p[0]), |
||||
"=b" (p[1]), |
||||
"=c" (p[2]), |
||||
"=d" (p[3]) |
||||
: "0" (ax), "2" (cx) |
||||
); |
||||
#elif defined(PIPE_CC_MSVC) |
||||
__cpuidex(p, ax, cx); |
||||
#else |
||||
p[0] = 0; |
||||
p[1] = 0; |
||||
p[2] = 0; |
||||
p[3] = 0; |
||||
#endif |
||||
} |
||||
|
||||
|
||||
static inline uint64_t xgetbv(void) |
||||
{ |
||||
#if defined(PIPE_CC_GCC) |
||||
uint32_t eax, edx; |
||||
|
||||
__asm __volatile ( |
||||
".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
|
||||
: "=a"(eax), |
||||
"=d"(edx) |
||||
: "c"(0) |
||||
); |
||||
|
||||
return ((uint64_t)edx << 32) | eax; |
||||
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK) |
||||
return _xgetbv(_XCR_XFEATURE_ENABLED_MASK); |
||||
#else |
||||
return 0; |
||||
#endif |
||||
} |
||||
|
||||
|
||||
#if defined(PIPE_ARCH_X86) |
||||
PIPE_ALIGN_STACK static inline boolean sse2_has_daz(void) |
||||
{ |
||||
struct { |
||||
uint32_t pad1[7]; |
||||
uint32_t mxcsr_mask; |
||||
uint32_t pad2[128-8]; |
||||
} PIPE_ALIGN_VAR(16) fxarea; |
||||
|
||||
fxarea.mxcsr_mask = 0; |
||||
#if defined(PIPE_CC_GCC) |
||||
__asm __volatile ("fxsave %0" : "+m" (fxarea)); |
||||
#elif defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL) |
||||
_fxsave(&fxarea); |
||||
#else |
||||
fxarea.mxcsr_mask = 0; |
||||
#endif |
||||
return !!(fxarea.mxcsr_mask & (1 << 6)); |
||||
} |
||||
#endif |
||||
|
||||
#endif /* X86 or X86_64 */ |
||||
|
||||
#if defined(PIPE_ARCH_ARM) |
||||
static void |
||||
check_os_arm_support(void) |
||||
{ |
||||
/*
|
||||
* On Android, the cpufeatures library is preferred way of checking |
||||
* CPU capabilities. However, it is not available for standalone Mesa |
||||
* builds, i.e. when Android build system (Android.mk-based) is not |
||||
* used. Because of this we cannot use PIPE_OS_ANDROID here, but rather |
||||
* have a separate macro that only gets enabled from respective Android.mk. |
||||
*/ |
||||
#if defined(__ARM_NEON) || defined(__ARM_NEON__) |
||||
util_cpu_caps.has_neon = 1; |
||||
#elif defined(PIPE_OS_FREEBSD) && defined(HAVE_ELF_AUX_INFO) |
||||
unsigned long hwcap = 0; |
||||
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); |
||||
if (hwcap & HWCAP_NEON) |
||||
util_cpu_caps.has_neon = 1; |
||||
#elif defined(HAS_ANDROID_CPUFEATURES) |
||||
AndroidCpuFamily cpu_family = android_getCpuFamily(); |
||||
uint64_t cpu_features = android_getCpuFeatures(); |
||||
|
||||
if (cpu_family == ANDROID_CPU_FAMILY_ARM) { |
||||
if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) |
||||
util_cpu_caps.has_neon = 1; |
||||
} |
||||
#elif defined(PIPE_OS_LINUX) |
||||
Elf32_auxv_t aux; |
||||
int fd; |
||||
|
||||
fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); |
||||
if (fd >= 0) { |
||||
while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) { |
||||
if (aux.a_type == AT_HWCAP) { |
||||
uint32_t hwcap = aux.a_un.a_val; |
||||
|
||||
util_cpu_caps.has_neon = (hwcap >> 12) & 1; |
||||
break; |
||||
} |
||||
} |
||||
close (fd); |
||||
} |
||||
#endif /* PIPE_OS_LINUX */ |
||||
} |
||||
|
||||
#elif defined(PIPE_ARCH_AARCH64) |
||||
static void |
||||
check_os_arm_support(void) |
||||
{ |
||||
util_cpu_caps.has_neon = true; |
||||
} |
||||
#endif /* PIPE_ARCH_ARM || PIPE_ARCH_AARCH64 */ |
||||
|
||||
#if defined(PIPE_ARCH_MIPS64) |
||||
static void |
||||
check_os_mips64_support(void) |
||||
{ |
||||
Elf64_auxv_t aux; |
||||
int fd; |
||||
|
||||
fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC); |
||||
if (fd >= 0) { |
||||
while (read(fd, &aux, sizeof(Elf64_auxv_t)) == sizeof(Elf64_auxv_t)) { |
||||
if (aux.a_type == AT_HWCAP) { |
||||
uint64_t hwcap = aux.a_un.a_val; |
||||
|
||||
util_cpu_caps.has_msa = (hwcap >> 1) & 1; |
||||
break; |
||||
} |
||||
} |
||||
close (fd); |
||||
} |
||||
} |
||||
#endif /* PIPE_ARCH_MIPS64 */ |
||||
|
||||
|
||||
static void |
||||
get_cpu_topology(void) |
||||
{ |
||||
/* Default. This is OK if L3 is not present or there is only one. */ |
||||
util_cpu_caps.num_L3_caches = 1; |
||||
|
||||
memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3)); |
||||
|
||||
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) |
||||
/* AMD Zen */ |
||||
if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 && |
||||
util_cpu_caps.family < CPU_AMD_LAST) { |
||||
uint32_t regs[4]; |
||||
|
||||
uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0}; |
||||
uint32_t mask[UTIL_MAX_CPUS / 32] = {0}; |
||||
bool saved = false; |
||||
|
||||
uint32_t L3_found[UTIL_MAX_CPUS] = {0}; |
||||
uint32_t num_L3_caches = 0; |
||||
util_affinity_mask *L3_affinity_masks = NULL; |
||||
|
||||
/* Query APIC IDs from each CPU core.
|
||||
* |
||||
* An APIC ID is a logical ID of the CPU with respect to the cache |
||||
* hierarchy, meaning that consecutive APIC IDs are neighbours in |
||||
* the hierarchy, e.g. sharing the same cache. |
||||
* |
||||
* For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1, |
||||
* which means that both CPU 0 and 12 are next to each other. |
||||
* (e.g. they are 2 threads belonging to 1 SMT2 core) |
||||
* |
||||
* We need to find out which CPUs share the same L3 cache and they can |
||||
* be all over the place. |
||||
* |
||||
* Querying the APIC ID can only be done by pinning the current thread |
||||
* to each core. The original affinity mask is saved. |
||||
* |
||||
* Loop over all possible CPUs even though some may be offline. |
||||
*/ |
||||
for (int16_t i = 0; i < util_cpu_caps.max_cpus && i < UTIL_MAX_CPUS; i++) { |
||||
uint32_t cpu_bit = 1u << (i % 32); |
||||
|
||||
mask[i / 32] = cpu_bit; |
||||
|
||||
/* The assumption is that trying to bind the thread to a CPU that is
|
||||
* offline will fail. |
||||
*/ |
||||
if (util_set_current_thread_affinity(mask, |
||||
!saved ? saved_mask : NULL, |
||||
util_cpu_caps.num_cpu_mask_bits)) { |
||||
saved = true; |
||||
|
||||
/* Query the APIC ID of the current core. */ |
||||
cpuid(0x00000001, regs); |
||||
unsigned apic_id = regs[1] >> 24; |
||||
|
||||
/* Query the total core count for the CPU */ |
||||
uint32_t core_count = 1; |
||||
if (regs[3] & (1 << 28)) |
||||
core_count = (regs[1] >> 16) & 0xff; |
||||
|
||||
core_count = util_next_power_of_two(core_count); |
||||
|
||||
/* Query the L3 cache count. */ |
||||
cpuid_count(0x8000001D, 3, regs); |
||||
unsigned cache_level = (regs[0] >> 5) & 0x7; |
||||
unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1; |
||||
|
||||
if (cache_level != 3) |
||||
continue; |
||||
|
||||
unsigned local_core_id = apic_id & (core_count - 1); |
||||
unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count); |
||||
unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3); |
||||
#define L3_ID(p, i) (p << 16 | i << 1 | 1); |
||||
|
||||
unsigned l3_id = L3_ID(phys_id, local_l3_cache_index); |
||||
int idx = -1; |
||||
for (unsigned c = 0; c < num_L3_caches; c++) { |
||||
if (L3_found[c] == l3_id) { |
||||
idx = c; |
||||
break; |
||||
} |
||||
} |
||||
if (idx == -1) { |
||||
idx = num_L3_caches; |
||||
L3_found[num_L3_caches++] = l3_id; |
||||
L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches); |
||||
if (!L3_affinity_masks) |
||||
return; |
||||
memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask)); |
||||
} |
||||
util_cpu_caps.cpu_to_L3[i] = idx; |
||||
L3_affinity_masks[idx][i / 32] |= cpu_bit; |
||||
|
||||
} |
||||
mask[i / 32] = 0; |
||||
} |
||||
|
||||
util_cpu_caps.num_L3_caches = num_L3_caches; |
||||
util_cpu_caps.L3_affinity_mask = L3_affinity_masks; |
||||
|
||||
if (saved) { |
||||
if (debug_get_option_dump_cpu()) { |
||||
fprintf(stderr, "CPU <-> L3 cache mapping:\n"); |
||||
for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) { |
||||
fprintf(stderr, " - L3 %u mask = ", i); |
||||
for (int j = util_cpu_caps.max_cpus - 1; j >= 0; j -= 32) |
||||
fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]); |
||||
fprintf(stderr, "\n"); |
||||
} |
||||
} |
||||
|
||||
/* Restore the original affinity mask. */ |
||||
util_set_current_thread_affinity(saved_mask, NULL, |
||||
util_cpu_caps.num_cpu_mask_bits); |
||||
} else { |
||||
if (debug_get_option_dump_cpu()) |
||||
fprintf(stderr, "Cannot set thread affinity for any thread.\n"); |
||||
} |
||||
} |
||||
#endif |
||||
} |
||||
|
||||
static void |
||||
util_cpu_detect_once(void) |
||||
{ |
||||
int available_cpus = 0; |
||||
int total_cpus = 0; |
||||
|
||||
memset(&util_cpu_caps, 0, sizeof util_cpu_caps); |
||||
|
||||
/* Count the number of CPUs in system */ |
||||
#if defined(PIPE_OS_WINDOWS) |
||||
{ |
||||
SYSTEM_INFO system_info; |
||||
GetSystemInfo(&system_info); |
||||
available_cpus = MAX2(1, system_info.dwNumberOfProcessors); |
||||
} |
||||
#elif defined(PIPE_OS_UNIX) |
||||
# if defined(HAS_SCHED_GETAFFINITY) |
||||
{ |
||||
/* sched_setaffinity() can be used to further restrict the number of
|
||||
* CPUs on which the process can run. Use sched_getaffinity() to |
||||
* determine the true number of available CPUs. |
||||
* |
||||
* FIXME: The Linux manual page for sched_getaffinity describes how this |
||||
* simple implementation will fail with > 1024 CPUs, and we'll fall back |
||||
* to the _SC_NPROCESSORS_ONLN path. Support for > 1024 CPUs can be |
||||
* added to this path once someone has such a system for testing. |
||||
*/ |
||||
cpu_set_t affin; |
||||
if (sched_getaffinity(getpid(), sizeof(affin), &affin) == 0) |
||||
available_cpus = CPU_COUNT(&affin); |
||||
} |
||||
# endif |
||||
|
||||
/* Linux, FreeBSD, DragonFly, and Mac OS X should have
|
||||
* _SC_NOPROCESSORS_ONLN. NetBSD and OpenBSD should have HW_NCPUONLINE. |
||||
* This is what FFmpeg uses on those platforms. |
||||
*/ |
||||
# if defined(PIPE_OS_BSD) && defined(HW_NCPUONLINE) |
||||
if (available_cpus == 0) { |
||||
const int mib[] = { CTL_HW, HW_NCPUONLINE }; |
||||
int ncpu; |
||||
int len = sizeof(ncpu); |
||||
|
||||
sysctl(mib, 2, &ncpu, &len, NULL, 0); |
||||
available_cpus = ncpu; |
||||
} |
||||
# elif defined(_SC_NPROCESSORS_ONLN) |
||||
if (available_cpus == 0) { |
||||
available_cpus = sysconf(_SC_NPROCESSORS_ONLN); |
||||
if (available_cpus == ~0) |
||||
available_cpus = 1; |
||||
} |
||||
# elif defined(PIPE_OS_BSD) |
||||
if (available_cpus == 0) { |
||||
const int mib[] = { CTL_HW, HW_NCPU }; |
||||
int ncpu; |
||||
int len = sizeof(ncpu); |
||||
|
||||
sysctl(mib, 2, &ncpu, &len, NULL, 0); |
||||
available_cpus = ncpu; |
||||
} |
||||
# endif /* defined(PIPE_OS_BSD) */ |
||||
|
||||
/* Determine the maximum number of CPUs configured in the system. This is
|
||||
* used to properly set num_cpu_mask_bits below. On BSDs that don't have |
||||
* HW_NCPUONLINE, it was not clear whether HW_NCPU is the number of |
||||
* configured or the number of online CPUs. For that reason, prefer the |
||||
* _SC_NPROCESSORS_CONF path on all BSDs. |
||||
*/ |
||||
# if defined(_SC_NPROCESSORS_CONF) |
||||
total_cpus = sysconf(_SC_NPROCESSORS_CONF); |
||||
if (total_cpus == ~0) |
||||
total_cpus = 1; |
||||
# elif defined(PIPE_OS_BSD) |
||||
{ |
||||
const int mib[] = { CTL_HW, HW_NCPU }; |
||||
int ncpu; |
||||
int len = sizeof(ncpu); |
||||
|
||||
sysctl(mib, 2, &ncpu, &len, NULL, 0); |
||||
total_cpus = ncpu; |
||||
} |
||||
# endif /* defined(PIPE_OS_BSD) */ |
||||
#endif /* defined(PIPE_OS_UNIX) */ |
||||
|
||||
util_cpu_caps.nr_cpus = MAX2(1, available_cpus); |
||||
total_cpus = MAX2(total_cpus, util_cpu_caps.nr_cpus); |
||||
|
||||
util_cpu_caps.max_cpus = total_cpus; |
||||
util_cpu_caps.num_cpu_mask_bits = align(total_cpus, 32); |
||||
|
||||
/* Make the fallback cacheline size nonzero so that it can be
|
||||
* safely passed to align(). |
||||
*/ |
||||
util_cpu_caps.cacheline = sizeof(void *); |
||||
|
||||
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) |
||||
if (has_cpuid()) { |
||||
uint32_t regs[4]; |
||||
uint32_t regs2[4]; |
||||
|
||||
util_cpu_caps.cacheline = 32; |
||||
|
||||
/* Get max cpuid level */ |
||||
cpuid(0x00000000, regs); |
||||
|
||||
if (regs[0] >= 0x00000001) { |
||||
unsigned int cacheline; |
||||
|
||||
cpuid (0x00000001, regs2); |
||||
|
||||
util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf; |
||||
/* Add "extended family". */ |
||||
if (util_cpu_caps.x86_cpu_type == 0xf) |
||||
util_cpu_caps.x86_cpu_type += ((regs2[0] >> 20) & 0xff); |
||||
|
||||
switch (util_cpu_caps.x86_cpu_type) { |
||||
case 0x17: |
||||
util_cpu_caps.family = CPU_AMD_ZEN1_ZEN2; |
||||
break; |
||||
case 0x18: |
||||
util_cpu_caps.family = CPU_AMD_ZEN_HYGON; |
||||
break; |
||||
case 0x19: |
||||
util_cpu_caps.family = CPU_AMD_ZEN3; |
||||
break; |
||||
default: |
||||
if (util_cpu_caps.x86_cpu_type > 0x19) |
||||
util_cpu_caps.family = CPU_AMD_ZEN_NEXT; |
||||
} |
||||
|
||||
/* general feature flags */ |
||||
util_cpu_caps.has_tsc = (regs2[3] >> 4) & 1; /* 0x0000010 */ |
||||
util_cpu_caps.has_mmx = (regs2[3] >> 23) & 1; /* 0x0800000 */ |
||||
util_cpu_caps.has_sse = (regs2[3] >> 25) & 1; /* 0x2000000 */ |
||||
util_cpu_caps.has_sse2 = (regs2[3] >> 26) & 1; /* 0x4000000 */ |
||||
util_cpu_caps.has_sse3 = (regs2[2] >> 0) & 1; /* 0x0000001 */ |
||||
util_cpu_caps.has_ssse3 = (regs2[2] >> 9) & 1; /* 0x0000020 */ |
||||
util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1; |
||||
util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1; |
||||
util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1; |
||||
util_cpu_caps.has_avx = ((regs2[2] >> 28) & 1) && // AVX
|
||||
((regs2[2] >> 27) & 1) && // OSXSAVE
|
||||
((xgetbv() & 6) == 6); // XMM & YMM
|
||||
util_cpu_caps.has_f16c = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx; |
||||
util_cpu_caps.has_fma = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx; |
||||
util_cpu_caps.has_mmx2 = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */ |
||||
#if defined(PIPE_ARCH_X86_64) |
||||
util_cpu_caps.has_daz = 1; |
||||
#else |
||||
util_cpu_caps.has_daz = util_cpu_caps.has_sse3 || |
||||
(util_cpu_caps.has_sse2 && sse2_has_daz()); |
||||
#endif |
||||
|
||||
cacheline = ((regs2[1] >> 8) & 0xFF) * 8; |
||||
if (cacheline > 0) |
||||
util_cpu_caps.cacheline = cacheline; |
||||
} |
||||
if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) { |
||||
uint32_t regs7[4]; |
||||
cpuid_count(0x00000007, 0x00000000, regs7); |
||||
util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1; |
||||
} |
||||
|
||||
// check for avx512
|
||||
if (((regs2[2] >> 27) & 1) && // OSXSAVE
|
||||
(xgetbv() & (0x7 << 5)) && // OPMASK: upper-256 enabled by OS
|
||||
((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS
|
||||
uint32_t regs3[4]; |
||||
cpuid_count(0x00000007, 0x00000000, regs3); |
||||
util_cpu_caps.has_avx512f = (regs3[1] >> 16) & 1; |
||||
util_cpu_caps.has_avx512dq = (regs3[1] >> 17) & 1; |
||||
util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1; |
||||
util_cpu_caps.has_avx512pf = (regs3[1] >> 26) & 1; |
||||
util_cpu_caps.has_avx512er = (regs3[1] >> 27) & 1; |
||||
util_cpu_caps.has_avx512cd = (regs3[1] >> 28) & 1; |
||||
util_cpu_caps.has_avx512bw = (regs3[1] >> 30) & 1; |
||||
util_cpu_caps.has_avx512vl = (regs3[1] >> 31) & 1; |
||||
util_cpu_caps.has_avx512vbmi = (regs3[2] >> 1) & 1; |
||||
} |
||||
|
||||
if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) { |
||||
/* GenuineIntel */ |
||||
util_cpu_caps.has_intel = 1; |
||||
} |
||||
|
||||
cpuid(0x80000000, regs); |
||||
|
||||
if (regs[0] >= 0x80000001) { |
||||
|
||||
cpuid(0x80000001, regs2); |
||||
|
||||
util_cpu_caps.has_mmx |= (regs2[3] >> 23) & 1; |
||||
util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1; |
||||
util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1; |
||||
util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1; |
||||
|
||||
util_cpu_caps.has_xop = util_cpu_caps.has_avx && |
||||
((regs2[2] >> 11) & 1); |
||||
} |
||||
|
||||
if (regs[0] >= 0x80000006) { |
||||
/* should we really do this if the clflush size above worked? */ |
||||
unsigned int cacheline; |
||||
cpuid(0x80000006, regs2); |
||||
cacheline = regs2[2] & 0xFF; |
||||
if (cacheline > 0) |
||||
util_cpu_caps.cacheline = cacheline; |
||||
} |
||||
|
||||
if (!util_cpu_caps.has_sse) { |
||||
util_cpu_caps.has_sse2 = 0; |
||||
util_cpu_caps.has_sse3 = 0; |
||||
util_cpu_caps.has_ssse3 = 0; |
||||
util_cpu_caps.has_sse4_1 = 0; |
||||
} |
||||
} |
||||
#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ |
||||
|
||||
#if defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64) |
||||
check_os_arm_support(); |
||||
#endif |
||||
|
||||
#if defined(PIPE_ARCH_PPC) |
||||
check_os_altivec_support(); |
||||
#endif /* PIPE_ARCH_PPC */ |
||||
|
||||
#if defined(PIPE_ARCH_MIPS64) |
||||
check_os_mips64_support(); |
||||
#endif /* PIPE_ARCH_MIPS64 */ |
||||
|
||||
get_cpu_topology(); |
||||
|
||||
if (debug_get_option_dump_cpu()) { |
||||
printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus); |
||||
|
||||
printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type); |
||||
printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline); |
||||
|
||||
printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc); |
||||
printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx); |
||||
printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2); |
||||
printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse); |
||||
printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2); |
||||
printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3); |
||||
printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3); |
||||
printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1); |
||||
printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2); |
||||
printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx); |
||||
printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2); |
||||
printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c); |
||||
printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt); |
||||
printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow); |
||||
printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext); |
||||
printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop); |
||||
printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec); |
||||
printf("util_cpu_caps.has_vsx = %u\n", util_cpu_caps.has_vsx); |
||||
printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon); |
||||
printf("util_cpu_caps.has_msa = %u\n", util_cpu_caps.has_msa); |
||||
printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz); |
||||
printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f); |
||||
printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq); |
||||
printf("util_cpu_caps.has_avx512ifma = %u\n", util_cpu_caps.has_avx512ifma); |
||||
printf("util_cpu_caps.has_avx512pf = %u\n", util_cpu_caps.has_avx512pf); |
||||
printf("util_cpu_caps.has_avx512er = %u\n", util_cpu_caps.has_avx512er); |
||||
printf("util_cpu_caps.has_avx512cd = %u\n", util_cpu_caps.has_avx512cd); |
||||
printf("util_cpu_caps.has_avx512bw = %u\n", util_cpu_caps.has_avx512bw); |
||||
printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl); |
||||
printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi); |
||||
printf("util_cpu_caps.num_L3_caches = %u\n", util_cpu_caps.num_L3_caches); |
||||
printf("util_cpu_caps.num_cpu_mask_bits = %u\n", util_cpu_caps.num_cpu_mask_bits); |
||||
} |
||||
} |
||||
|
||||
static once_flag cpu_once_flag = ONCE_FLAG_INIT; |
||||
|
||||
void |
||||
util_cpu_detect(void) |
||||
{ |
||||
call_once(&cpu_once_flag, util_cpu_detect_once); |
||||
} |
@ -0,0 +1,311 @@ |
||||
/**************************************************************************
|
||||
*
|
||||
* Copyright 2008 VMware, Inc. |
||||
* All Rights Reserved. |
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a |
||||
* copy of this software and associated documentation files (the |
||||
* "Software"), to deal in the Software without restriction, including |
||||
* without limitation the rights to use, copy, modify, merge, publish, |
||||
* distribute, sub license, and/or sell copies of the Software, and to |
||||
* permit persons to whom the Software is furnished to do so, subject to |
||||
* the following conditions: |
||||
*
|
||||
* The above copyright notice and this permission notice (including the |
||||
* next paragraph) shall be included in all copies or substantial portions |
||||
* of the Software. |
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
||||
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. |
||||
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR |
||||
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
||||
*
|
||||
**************************************************************************/ |
||||
|
||||
|
||||
|
||||
#include "pipe/p_config.h" |
||||
#include "util/u_math.h" |
||||
#include "util/u_cpu_detect.h" |
||||
|
||||
#if defined(PIPE_ARCH_SSE) |
||||
#include <xmmintrin.h> |
||||
/* This is defined in pmmintrin.h, but it can only be included when -msse3 is
|
||||
* used, so just define it here to avoid further. */ |
||||
#ifndef _MM_DENORMALS_ZERO_MASK |
||||
#define _MM_DENORMALS_ZERO_MASK 0x0040 |
||||
#endif |
||||
#endif |
||||
|
||||
|
||||
/** log2(x), for x in [1.0, 2.0) */ |
||||
float log2_table[LOG2_TABLE_SIZE]; |
||||
|
||||
|
||||
static void
|
||||
init_log2_table(void) |
||||
{ |
||||
unsigned i; |
||||
for (i = 0; i < LOG2_TABLE_SIZE; i++) |
||||
log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE)); |
||||
} |
||||
|
||||
|
||||
/**
|
||||
* One time init for math utilities. |
||||
*/ |
||||
void |
||||
util_init_math(void) |
||||
{ |
||||
static bool initialized = false; |
||||
if (!initialized) { |
||||
init_log2_table(); |
||||
initialized = true; |
||||
} |
||||
} |
||||
|
||||
/**
|
||||
* Fetches the contents of the fpstate (mxcsr on x86) register. |
||||
* |
||||
* On platforms without support for it just returns 0. |
||||
*/ |
||||
unsigned |
||||
util_fpstate_get(void) |
||||
{ |
||||
unsigned mxcsr = 0; |
||||
|
||||
#if defined(PIPE_ARCH_SSE) |
||||
if (util_get_cpu_caps()->has_sse) { |
||||
mxcsr = _mm_getcsr(); |
||||
} |
||||
#endif |
||||
|
||||
return mxcsr; |
||||
} |
||||
|
||||
/**
|
||||
* Make sure that the fp treats the denormalized floating |
||||
* point numbers as zero. |
||||
* |
||||
* This is the behavior required by D3D10. OpenGL doesn't care. |
||||
*/ |
||||
unsigned |
||||
util_fpstate_set_denorms_to_zero(unsigned current_mxcsr) |
||||
{ |
||||
#if defined(PIPE_ARCH_SSE) |
||||
if (util_get_cpu_caps()->has_sse) { |
||||
/* Enable flush to zero mode */ |
||||
current_mxcsr |= _MM_FLUSH_ZERO_MASK; |
||||
if (util_get_cpu_caps()->has_daz) { |
||||
/* Enable denormals are zero mode */ |
||||
current_mxcsr |= _MM_DENORMALS_ZERO_MASK; |
||||
} |
||||
util_fpstate_set(current_mxcsr); |
||||
} |
||||
#endif |
||||
return current_mxcsr; |
||||
} |
||||
|
||||
/**
|
||||
* Set the state of the fpstate (mxcsr on x86) register. |
||||
* |
||||
* On platforms without support for it's a noop. |
||||
*/ |
||||
void |
||||
util_fpstate_set(unsigned mxcsr) |
||||
{ |
||||
#if defined(PIPE_ARCH_SSE) |
||||
if (util_get_cpu_caps()->has_sse) { |
||||
_mm_setcsr(mxcsr); |
||||
} |
||||
#endif |
||||
} |
||||
|
||||
/**
|
||||
* Compute inverse of 4x4 matrix. |
||||
* |
||||
* \return false if the source matrix is singular. |
||||
* |
||||
* \author |
||||
* Code contributed by Jacques Leroy jle@star.be |
||||
* |
||||
* Calculates the inverse matrix by performing the gaussian matrix reduction |
||||
* with partial pivoting followed by back/substitution with the loops manually |
||||
* unrolled. |
||||
*/ |
||||
bool |
||||
util_invert_mat4x4(float *out, const float *m) |
||||
{ |
||||
float wtmp[4][8]; |
||||
float m0, m1, m2, m3, s; |
||||
float *r0, *r1, *r2, *r3; |
||||
|
||||
#define MAT(m, r, c) (m)[(c)*4 + (r)] |
||||
#define SWAP_ROWS(a, b) \ |
||||
{ \
|
||||
float *_tmp = a; \
|
||||
(a) = (b); \
|
||||
(b) = _tmp; \
|
||||
} |
||||
|
||||
r0 = wtmp[0], r1 = wtmp[1], r2 = wtmp[2], r3 = wtmp[3]; |
||||
|
||||
r0[0] = MAT(m, 0, 0), r0[1] = MAT(m, 0, 1), r0[2] = MAT(m, 0, 2), r0[3] = MAT(m, 0, 3), |
||||
r0[4] = 1.0, r0[5] = r0[6] = r0[7] = 0.0, |
||||
|
||||
r1[0] = MAT(m, 1, 0), r1[1] = MAT(m, 1, 1), r1[2] = MAT(m, 1, 2), r1[3] = MAT(m, 1, 3), |
||||
r1[5] = 1.0, r1[4] = r1[6] = r1[7] = 0.0, |
||||
|
||||
r2[0] = MAT(m, 2, 0), r2[1] = MAT(m, 2, 1), r2[2] = MAT(m, 2, 2), r2[3] = MAT(m, 2, 3), |
||||
r2[6] = 1.0, r2[4] = r2[5] = r2[7] = 0.0, |
||||
|
||||
r3[0] = MAT(m, 3, 0), r3[1] = MAT(m, 3, 1), r3[2] = MAT(m, 3, 2), r3[3] = MAT(m, 3, 3), |
||||
r3[7] = 1.0, r3[4] = r3[5] = r3[6] = 0.0; |
||||
|
||||
/* choose pivot - or die */ |
||||
if (fabsf(r3[0]) > fabsf(r2[0])) |
||||
SWAP_ROWS(r3, r2); |
||||
if (fabsf(r2[0]) > fabsf(r1[0])) |
||||
SWAP_ROWS(r2, r1); |
||||
if (fabsf(r1[0]) > fabsf(r0[0])) |
||||
SWAP_ROWS(r1, r0); |
||||
if (0.0F == r0[0]) |
||||
return false; |
||||
|
||||
/* eliminate first variable */ |
||||
m1 = r1[0] / r0[0]; |
||||
m2 = r2[0] / r0[0]; |
||||
m3 = r3[0] / r0[0]; |
||||
s = r0[1]; |
||||
r1[1] -= m1 * s; |
||||
r2[1] -= m2 * s; |
||||
r3[1] -= m3 * s; |
||||
s = r0[2]; |
||||
r1[2] -= m1 * s; |
||||
r2[2] -= m2 * s; |
||||
r3[2] -= m3 * s; |
||||
s = r0[3]; |
||||
r1[3] -= m1 * s; |
||||
r2[3] -= m2 * s; |
||||
r3[3] -= m3 * s; |
||||
s = r0[4]; |
||||
if (s != 0.0F) { |
||||
r1[4] -= m1 * s; |
||||
r2[4] -= m2 * s; |
||||
r3[4] -= m3 * s; |
||||
} |
||||
s = r0[5]; |
||||
if (s != 0.0F) { |
||||
r1[5] -= m1 * s; |
||||
r2[5] -= m2 * s; |
||||
r3[5] -= m3 * s; |
||||
} |
||||
s = r0[6]; |
||||
if (s != 0.0F) { |
||||
r1[6] -= m1 * s; |
||||
r2[6] -= m2 * s; |
||||
r3[6] -= m3 * s; |
||||
} |
||||
s = r0[7]; |
||||
if (s != 0.0F) { |
||||
r1[7] -= m1 * s; |
||||
r2[7] -= m2 * s; |
||||
r3[7] -= m3 * s; |
||||
} |
||||
|
||||
/* choose pivot - or die */ |
||||
if (fabsf(r3[1]) > fabsf(r2[1])) |
||||
SWAP_ROWS(r3, r2); |
||||
if (fabsf(r2[1]) > fabsf(r1[1])) |
||||
SWAP_ROWS(r2, r1); |
||||
if (0.0F == r1[1]) |
||||
return false; |
||||
|
||||
/* eliminate second variable */ |
||||
m2 = r2[1] / r1[1]; |
||||
m3 = r3[1] / r1[1]; |
||||
r2[2] -= m2 * r1[2]; |
||||
r3[2] -= m3 * r1[2]; |
||||
r2[3] -= m2 * r1[3]; |
||||
r3[3] -= m3 * r1[3]; |
||||
s = r1[4]; |
||||
if (0.0F != s) { |
||||
r2[4] -= m2 * s; |
||||
r3[4] -= m3 * s; |
||||
} |
||||
s = r1[5]; |
||||
if (0.0F != s) { |
||||
r2[5] -= m2 * s; |
||||
r3[5] -= m3 * s; |
||||
} |
||||
s = r1[6]; |
||||
if (0.0F != s) { |
||||
r2[6] -= m2 * s; |
||||
r3[6] -= m3 * s; |
||||
} |
||||
s = r1[7]; |
||||
if (0.0F != s) { |
||||
r2[7] -= m2 * s; |
||||
r3[7] -= m3 * s; |
||||
} |
||||
|
||||
/* choose pivot - or die */ |
||||
if (fabsf(r3[2]) > fabsf(r2[2])) |
||||
SWAP_ROWS(r3, r2); |
||||
if (0.0F == r2[2]) |
||||
return false; |
||||
|
||||
/* eliminate third variable */ |
||||
m3 = r3[2] / r2[2]; |
||||
r3[3] -= m3 * r2[3], r3[4] -= m3 * r2[4], r3[5] -= m3 * r2[5], r3[6] -= m3 * r2[6], |
||||
r3[7] -= m3 * r2[7]; |
||||
|
||||
/* last check */ |
||||
if (0.0F == r3[3]) |
||||
return false; |
||||
|
||||
s = 1.0F / r3[3]; /* now back substitute row 3 */ |
||||
r3[4] *= s; |
||||
r3[5] *= s; |
||||
r3[6] *= s; |
||||
r3[7] *= s; |
||||
|
||||
m2 = r2[3]; /* now back substitute row 2 */ |
||||
s = 1.0F / r2[2]; |
||||
r2[4] = s * (r2[4] - r3[4] * m2), r2[5] = s * (r2[5] - r3[5] * m2), |
||||
r2[6] = s * (r2[6] - r3[6] * m2), r2[7] = s * (r2[7] - r3[7] * m2); |
||||
m1 = r1[3]; |
||||
r1[4] -= r3[4] * m1, r1[5] -= r3[5] * m1, r1[6] -= r3[6] * m1, r1[7] -= r3[7] * m1; |
||||
m0 = r0[3]; |
||||
r0[4] -= r3[4] * m0, r0[5] -= r3[5] * m0, r0[6] -= r3[6] * m0, r0[7] -= r3[7] * m0; |
||||
|
||||
m1 = r1[2]; /* now back substitute row 1 */ |
||||
s = 1.0F / r1[1]; |
||||
r1[4] = s * (r1[4] - r2[4] * m1), r1[5] = s * (r1[5] - r2[5] * m1), |
||||
r1[6] = s * (r1[6] - r2[6] * m1), r1[7] = s * (r1[7] - r2[7] * m1); |
||||
m0 = r0[2]; |
||||
r0[4] -= r2[4] * m0, r0[5] -= r2[5] * m0, r0[6] -= r2[6] * m0, r0[7] -= r2[7] * m0; |
||||
|
||||
m0 = r0[1]; /* now back substitute row 0 */ |
||||
s = 1.0F / r0[0]; |
||||
r0[4] = s * (r0[4] - r1[4] * m0), r0[5] = s * (r0[5] - r1[5] * m0), |
||||
r0[6] = s * (r0[6] - r1[6] * m0), r0[7] = s * (r0[7] - r1[7] * m0); |
||||
|
||||
MAT(out, 0, 0) = r0[4]; |
||||
MAT(out, 0, 1) = r0[5], MAT(out, 0, 2) = r0[6]; |
||||
MAT(out, 0, 3) = r0[7], MAT(out, 1, 0) = r1[4]; |
||||
MAT(out, 1, 1) = r1[5], MAT(out, 1, 2) = r1[6]; |
||||
MAT(out, 1, 3) = r1[7], MAT(out, 2, 0) = r2[4]; |
||||
MAT(out, 2, 1) = r2[5], MAT(out, 2, 2) = r2[6]; |
||||
MAT(out, 2, 3) = r2[7], MAT(out, 3, 0) = r3[4]; |
||||
MAT(out, 3, 1) = r3[5], MAT(out, 3, 2) = r3[6]; |
||||
MAT(out, 3, 3) = r3[7]; |
||||
|
||||
#undef MAT |
||||
#undef SWAP_ROWS |
||||
|
||||
return true; |
||||
} |
Loading…
Reference in new issue