18#if defined(_WIN32) || defined(__i386__)
19#define BT_USE_SSE_IN_API
24#if defined BT_USE_SIMD_VECTOR3
38#if defined BT_USE_SSE || defined _WIN32
// Base-2 logarithm of STACK_ARRAY_COUNT.
40#define LOG2_ARRAY_SIZE 6
// 1UL << LOG2_ARRAY_SIZE, i.e. 64 (as an unsigned long) with the value above.
// NOTE(review): presumably the element count of a stack scratch buffer; the
// use site is not visible here - confirm.
41#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
// Lookup table indexed by a 4-bit value: for i in 1..15, entry i is the
// position (0..3) of the lowest set bit of i. Entry 0 has no set bit and
// holds (unsigned char)-1 as a sentinel.
// NOTE(review): presumably indexed with a 4-lane compare mask; the lookup
// site is not visible here - confirm.
49 static const unsigned char indexTable[16] = {(
unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
192 for (; index + 4 <= count / 4; index += 4)
286 size_t localCount = (count & -4L) - 4 * index;
297 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
298 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
299 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
300 movaps %[t0], %[max] // vertices[0] \n\
301 movlhps %[t1], %[max] // x0y0x1y1 \n\
302 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
303 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
304 mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
305 movhlps %[t0], %[t1] // z0w0z1w1 \n\
306 movaps %[t3], %[t0] // vertices[2] \n\
307 movlhps %[t4], %[t0] // x2y2x3y3 \n\
308 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
309 movhlps %[t3], %[t4] // z2w2z3w3 \n\
310 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
311 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
312 movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
313 shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
314 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
315 addps %[t3], %[max] // x + y \n\
316 addps %[t1], %[max] // x + y + z \n\
317 movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
318 maxps %[t2], %[max] // record max, restore max \n\
319 add $16, %[byteIndex] // advance loop counter\n\
328 for (
unsigned int i = 0; i <
localCount / 4; i++, index++)
359 float4 v0, v1, v2, x, y, z;
// Lookup table indexed by a 4-bit value: for i in 1..15, entry i is the
// position (0..3) of the lowest set bit of i. Entry 0 has no set bit and
// holds (unsigned char)-1 as a sentinel.
// NOTE(review): presumably indexed with a 4-lane compare mask; the lookup
// site is not visible here - confirm.
442 static const unsigned char indexTable[16] = {(
unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
585 for (; index + 4 <= count / 4; index += 4)
679 size_t localCount = (count & -4L) - 4 * index;
690 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
691 movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
692 movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
693 movaps %[t0], %[min] // vertices[0] \n\
694 movlhps %[t1], %[min] // x0y0x1y1 \n\
695 movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
696 movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
697 mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
698 movhlps %[t0], %[t1] // z0w0z1w1 \n\
699 movaps %[t3], %[t0] // vertices[2] \n\
700 movlhps %[t4], %[t0] // x2y2x3y3 \n\
701 movhlps %[t3], %[t4] // z2w2z3w3 \n\
702 mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
703 shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
704 mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
705 movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
706 shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
707 shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
708 addps %[t3], %[min] // x + y \n\
709 addps %[t1], %[min] // x + y + z \n\
710 movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
711 minps %[t2], %[min] // record min, restore min \n\
712 add $16, %[byteIndex] // advance loop counter\n\
721 for (
unsigned int i = 0; i <
localCount / 4; i++, index++)
753 float4 v0, v1, v2, x, y, z;
831#elif defined BT_USE_NEON
833#define ARM_NEON_GCC_COMPATIBILITY 1
835#include <sys/types.h>
836#include <sys/sysctl.h>
// Inline-asm form: loads one float32x4_t from _ptr with vld1.f32 using the
// :128 address qualifier, and the "!" writeback stores the advanced address
// back into _ptr (declared as a "+r" read-write operand). The statement
// expression evaluates to the loaded vector, so _ptr must be a modifiable
// lvalue and is changed as a side effect.
889#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); _r; })
// Plain C form: reads the float32x4_t located at _ptr, then advances _ptr by
// 16 bytes (reassigning it as a const float*). The statement expression
// evaluates to the loaded vector, so _ptr must be a modifiable lvalue and is
// changed as a side effect.
892#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); _r; })
909 for (; i + 8 <= count; i += 8)
970 for (; i + 4 <= count; i += 4)
1094 unsigned long i = 0;
1095 for (; i + 8 <= count; i += 8)
1150 for (; i + 4 <= count; i += 4)
1280 unsigned long i = 0;
1292 for (; i + 8 <= count; i += 8)
1353 for (; i + 4 <= count; i += 4)
1476 unsigned long i = 0;
1477 for (; i + 8 <= count; i += 8)
1532 for (; i + 4 <= count; i += 4)
1661#error Unhandled __APPLE__ arch
const T & btMax(const T &a, const T &b)